From 36cb4116d15cfef2d42ec4a834efd4a958f261b5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 27 Mar 2021 15:15:09 -0500 Subject: [PATCH 001/226] Switch allocator mutexes to static initialization. Details: - Switched the small block allocator (sba), as defined in bli_sba.c and bli_apool.c, to static initialization of its internal mutex. Did a similar thing for the packing block allocator (pba), which appears as global_membrk in bli_membrk.c. - Commented out bli_membrk_init_mutex() and bli_membrk_finalize_mutex() to ensure they won't be used in the future. - In bli_thrcomm_pthreads.c and .h, removed old, commented-out cpp blocks guarded by BLIS_USE_PTHREAD_MUTEX. --- frame/base/bli_apool.c | 16 ++++++++++++---- frame/base/bli_membrk.c | 18 ++++++++++++------ frame/base/bli_membrk.h | 18 +++++++++--------- frame/base/bli_sba.c | 2 +- frame/thread/bli_thrcomm_pthreads.c | 14 -------------- frame/thread/bli_thrcomm_pthreads.h | 4 ---- 6 files changed, 34 insertions(+), 38 deletions(-) diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c index 2e40c396de..ffd50b12e0 100644 --- a/frame/base/bli_apool.c +++ b/frame/base/bli_apool.c @@ -39,12 +39,17 @@ void bli_apool_init apool_t* restrict apool ) { + // NOTE: The apool_t is only used in one place; it is the type used to + // define the sba. We've switched to static initialization of the mutex + // field to remove one more thing that could possibly go wrong during + // library initialization. + // Query the mutex from the apool_t. - bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); // Initialize the mutex. 
//*mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; - bli_pthread_mutex_init( mutex, NULL ); + //bli_pthread_mutex_init( mutex, NULL ); // We choose to start with: // - an empty pool @@ -212,11 +217,14 @@ void bli_apool_finalize apool_t* restrict apool ) { + // NOTE: Since the apool_t's mutex is now initialized statically, we no + // longer need to explicitly destroy it. + // Query the mutex from the apool_t. - bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); // Destroy the mutex. - bli_pthread_mutex_destroy( mutex ); + //bli_pthread_mutex_destroy( mutex ); // Query the underlying pool_t and mutex from the apool_t. pool_t* restrict pool = bli_apool_pool( apool ); diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c index e01c119b2b..811c68d332 100644 --- a/frame/base/bli_membrk.c +++ b/frame/base/bli_membrk.c @@ -36,7 +36,8 @@ #include "blis.h" -static membrk_t global_membrk; +// Statically initialize the mutex within the global membrk object. +static membrk_t global_membrk = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; // ----------------------------------------------------------------------------- @@ -62,7 +63,10 @@ void bli_membrk_init bli_membrk_set_malloc_fp( malloc_fp, membrk ); bli_membrk_set_free_fp( free_fp, membrk ); - bli_membrk_init_mutex( membrk ); + // The mutex field of membrk is initialized statically above. This + // keeps bli_membrk_init() simpler and removes the possibility of + // something going wrong during mutex initialization. 
+ #ifdef BLIS_ENABLE_PBA_POOLS bli_membrk_init_pools( cntx, membrk ); #endif @@ -75,13 +79,15 @@ void bli_membrk_finalize { membrk_t* restrict membrk = bli_membrk_query(); - bli_membrk_set_malloc_fp( NULL, membrk ); - bli_membrk_set_free_fp( NULL, membrk ); - #ifdef BLIS_ENABLE_PBA_POOLS bli_membrk_finalize_pools( membrk ); #endif - bli_membrk_finalize_mutex( membrk ); + + // The mutex field of membrk is initialized statically above, and + // therefore never destroyed. + + bli_membrk_set_malloc_fp( NULL, membrk ); + bli_membrk_set_free_fp( NULL, membrk ); } void bli_membrk_acquire_m diff --git a/frame/base/bli_membrk.h b/frame/base/bli_membrk.h index 6e2a2fefd5..b8a878abd3 100644 --- a/frame/base/bli_membrk.h +++ b/frame/base/bli_membrk.h @@ -39,15 +39,15 @@ // membrk init -BLIS_INLINE void bli_membrk_init_mutex( membrk_t* membrk ) -{ - bli_pthread_mutex_init( &(membrk->mutex), NULL ); -} - -BLIS_INLINE void bli_membrk_finalize_mutex( membrk_t* membrk ) -{ - bli_pthread_mutex_destroy( &(membrk->mutex) ); -} +//BLIS_INLINE void bli_membrk_init_mutex( membrk_t* membrk ) +//{ +// bli_pthread_mutex_init( &(membrk->mutex), NULL ); +//} + +//BLIS_INLINE void bli_membrk_finalize_mutex( membrk_t* membrk ) +//{ +// bli_pthread_mutex_destroy( &(membrk->mutex) ); +//} // membrk query diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 2e072504ce..3d748a3898 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -35,7 +35,7 @@ #include "blis.h" // The small block allocator: an apool_t of array_t of pool_t. 
-static apool_t sba; +static apool_t sba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; apool_t* bli_sba_query( void ) { diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 0f555158ec..d0896f94df 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -93,18 +93,10 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) comm->n_threads = n_threads; comm->barrier_sense = 0; comm->barrier_threads_arrived = 0; - -//#ifdef BLIS_USE_PTHREAD_MUTEX -// bli_pthread_mutex_init( &comm->mutex, NULL ); -//#endif } void bli_thrcomm_cleanup( thrcomm_t* comm ) { -//#ifdef BLIS_USE_PTHREAD_MUTEX -// if ( comm == NULL ) return; -// bli_pthread_mutex_destroy( &comm->mutex ); -//#endif } void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) @@ -114,13 +106,7 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) bool my_sense = comm->sense; dim_t my_threads_arrived; -#ifdef BLIS_USE_PTHREAD_MUTEX - bli_pthread_mutex_lock( &comm->mutex ); - my_threads_arrived = ++(comm->threads_arrived); - bli_pthread_mutex_unlock( &comm->mutex ); -#else my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1); -#endif if ( my_threads_arrived == comm->n_threads ) { diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h index 75d56c400a..2c2e885515 100644 --- a/frame/thread/bli_thrcomm_pthreads.h +++ b/frame/thread/bli_thrcomm_pthreads.h @@ -52,10 +52,6 @@ struct thrcomm_s void* sent_object; dim_t n_threads; -//#ifdef BLIS_USE_PTHREAD_MUTEX -// bli_pthread_mutex_t mutex; -//#endif - // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. From 3a6f41afb8197e831b6ce2f1ae7f63735685fa0a Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Sat, 27 Mar 2021 17:22:14 -0500 Subject: [PATCH 002/226] Renamed membrk files/vars/functions to pba. Details: - Renamed the files, variables, and functions relating to the packing block allocator from its legacy name (membrk) to its current name (pba). This more clearly contrasts the packing block allocator with the small block allocator (sba). - Fixed a typo in bli_pack_set_pack_b(), defined in bli_pack.c, that caused the function to erroneously change the value of the pack_a field of the global rntm_t instead of the pack_b field. (Apparently nobody has used this API yet.) - Comment updates. --- build/libblis-symbols.def | 22 +-- frame/1/other/packv/bli_packv_init.c | 12 +- frame/3/bli_l3_packm.c | 6 +- frame/3/bli_l3_sup_packm_a.c | 8 +- frame/3/bli_l3_sup_packm_b.c | 8 +- frame/base/bli_cntl.c | 4 +- frame/base/bli_memsys.c | 8 +- frame/base/bli_pack.c | 2 +- frame/base/{bli_membrk.c => bli_pba.c} | 157 +++++++++++---------- frame/base/{bli_membrk.h => bli_pba.h} | 99 +++++++------ frame/base/bli_rntm.h | 18 +-- frame/base/bli_sba.c | 3 +- frame/include/bli_type_defs.h | 6 +- frame/include/blis.h | 2 +- frame/thread/bli_l3_decor_openmp.c | 2 +- frame/thread/bli_l3_decor_pthreads.c | 2 +- frame/thread/bli_l3_decor_single.c | 2 +- frame/thread/bli_l3_sup_decor_openmp.c | 2 +- frame/thread/bli_l3_sup_decor_pthreads.c | 2 +- frame/thread/bli_l3_sup_decor_single.c | 2 +- kernels/zen/3/bli_gemm_small.c | 20 +-- ref_kernels/bli_cntx_ref.c | 4 - sandbox/ref99/old/packm/blx_l3_packm.c | 14 +- sandbox/ref99/old/thread/blx_gemm_thread.c | 2 +- 24 files changed, 210 insertions(+), 197 deletions(-) rename frame/base/{bli_membrk.c => bli_pba.c} (81%) rename frame/base/{bli_membrk.h => bli_pba.h} (61%) diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index e1bfce807e..97146a7861 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1297,17 +1297,17 @@ bli_malloc_user bli_mbool_create bli_mbool_free bli_mbool_init 
-bli_membrk_acquire_m -bli_membrk_compute_pool_block_sizes -bli_membrk_compute_pool_block_sizes_dt -bli_membrk_finalize -bli_membrk_finalize_pools -bli_membrk_init -bli_membrk_init_pools -bli_membrk_pool_size -bli_membrk_query -bli_membrk_release -bli_membrk_rntm_set_membrk +bli_pba_acquire_m +bli_pba_compute_pool_block_sizes +bli_pba_compute_pool_block_sizes_dt +bli_pba_finalize +bli_pba_finalize_pools +bli_pba_init +bli_pba_init_pools +bli_pba_pool_size +bli_pba_query +bli_pba_release +bli_pba_rntm_set_pba bli_memsys_finalize bli_memsys_init bli_mkherm diff --git a/frame/1/other/packv/bli_packv_init.c b/frame/1/other/packv/bli_packv_init.c index 31fbda27d5..ba424996f5 100644 --- a/frame/1/other/packv/bli_packv_init.c +++ b/frame/1/other/packv/bli_packv_init.c @@ -117,7 +117,7 @@ siz_t bli_packv_init_pack dim_t dim_a = bli_obj_vector_dim( a ); dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); - membrk_t* membrk = bli_cntx_membrk( cntx ); + pba_t* pba = bli_cntx_pba( cntx ); #if 0 mem_t* mem_p; @@ -156,9 +156,7 @@ siz_t bli_packv_init_pack { // If the mem_t object of p has not yet been allocated, then acquire // a memory block suitable for a vector. - bli_membrk_acquire_v( membrk, - size_p, - mem_p ); + bli_pba_acquire_v( pba, size_p, mem_p ); } else { @@ -166,11 +164,9 @@ siz_t bli_packv_init_pack // re-acquire the memory so there is sufficient space. if ( bli_mem_size( mem_p ) < size_p ) { - bli_membrk_release( mem_p ); + bli_pba_release( mem_p ); - bli_membrk_acquire_v( membrk, - size_p, - mem_p ); + bli_pba_acquire_v( pba, size_p, mem_p ); } } diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index d6efb593cc..48f55c3602 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -91,7 +91,7 @@ void bli_l3_packm // The chief thread acquires a block from the memory broker // and saves the associated mem_t entry to local_mem_s. 
- bli_membrk_acquire_m + bli_pba_acquire_m ( rntm, size_needed, @@ -130,12 +130,12 @@ void bli_l3_packm // The chief thread releases the existing block associated with // the mem_t entry in the control tree, and then re-acquires a // new block, saving the associated mem_t entry to local_mem_s. - bli_membrk_release + bli_pba_release ( rntm, cntl_mem_p ); - bli_membrk_acquire_m + bli_pba_acquire_m ( rntm, size_needed, diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c index 13c6905e74..d1b17bf139 100644 --- a/frame/3/bli_l3_sup_packm_a.c +++ b/frame/3/bli_l3_sup_packm_a.c @@ -86,7 +86,7 @@ void PASTEMAC(ch,opname) \ function before the other threads have a chance to copy from it. (A barrier would fix that race condition, but then again, I prefer to keep barriers to a minimum.) */ \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -130,12 +130,12 @@ void PASTEMAC(ch,opname) \ above for why the acquisition needs to be directly to the chief thread's passed-in mem_t and not a local (temporary) mem_t. */ \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ ); \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -194,7 +194,7 @@ void PASTEMAC(ch,opname) \ is allocated, which it should be. */ \ if ( bli_mem_is_alloc( mem ) ) \ { \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c index 024ad21794..84b0c55486 100644 --- a/frame/3/bli_l3_sup_packm_b.c +++ b/frame/3/bli_l3_sup_packm_b.c @@ -86,7 +86,7 @@ void PASTEMAC(ch,opname) \ function before the other threads have a chance to copy from it. (A barrier would fix that race condition, but then again, I prefer to keep barriers to a minimum.) 
*/ \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -130,12 +130,12 @@ void PASTEMAC(ch,opname) \ above for why the acquisition needs to be directly to the chief thread's passed-in mem_t and not a local (temporary) mem_t. */ \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ ); \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -194,7 +194,7 @@ void PASTEMAC(ch,opname) \ is allocated, which it should be. */ \ if ( bli_mem_is_alloc( mem ) ) \ { \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 69aa2683bf..f8846198f1 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -192,7 +192,7 @@ void bli_cntl_free_w_thrinfo printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" ); #endif - bli_membrk_release( rntm, cntl_pack_mem ); + bli_pba_release( rntm, cntl_pack_mem ); } // Free the current node. @@ -236,7 +236,7 @@ void bli_cntl_free_wo_thrinfo // allocated. if ( bli_mem_is_alloc( cntl_pack_mem ) ) { - bli_membrk_release( rntm, cntl_pack_mem ); + bli_pba_release( rntm, cntl_pack_mem ); } // Free the current node. diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index 317d3e76d1..ca3c46f998 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -39,7 +39,7 @@ void bli_memsys_init( void ) { // Query a native context so we have something to pass into - // bli_membrk_init_pools(). We use BLIS_DOUBLE for the datatype, + // bli_pba_init_pools(). We use BLIS_DOUBLE for the datatype, // but the dt argument is actually only used when initializing // contexts for induced methods. // NOTE: Instead of calling bli_gks_query_cntx(), we call @@ -47,7 +47,7 @@ void bli_memsys_init( void ) cntx_t* cntx_p = bli_gks_query_cntx_noinit(); // Initialize the packing block allocator and its data structures. 
- bli_membrk_init( cntx_p ); + bli_pba_init( cntx_p ); // Initialize the small block allocator and its data structures. bli_sba_init(); @@ -58,7 +58,7 @@ void bli_memsys_finalize( void ) // Finalize the small block allocator and its data structures. bli_sba_finalize(); - // Finalize the global membrk_t object and its data structures. - bli_membrk_finalize(); + // Finalize the packing block allocator and its data structures. + bli_pba_finalize(); } diff --git a/frame/base/bli_pack.c b/frame/base/bli_pack.c index 5f4cca575e..9a5b45b39c 100644 --- a/frame/base/bli_pack.c +++ b/frame/base/bli_pack.c @@ -101,7 +101,7 @@ void bli_pack_set_pack_b( bool pack_b ) // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); - bli_rntm_set_pack_a( pack_b, &global_rntm ); + bli_rntm_set_pack_b( pack_b, &global_rntm ); // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); diff --git a/frame/base/bli_membrk.c b/frame/base/bli_pba.c similarity index 81% rename from frame/base/bli_membrk.c rename to frame/base/bli_pba.c index 811c68d332..7c0b606485 100644 --- a/frame/base/bli_membrk.c +++ b/frame/base/bli_pba.c @@ -36,61 +36,61 @@ #include "blis.h" -// Statically initialize the mutex within the global membrk object. -static membrk_t global_membrk = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; +// Statically initialize the mutex within the packing block allocator object. 
+static pba_t pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; // ----------------------------------------------------------------------------- -membrk_t* bli_membrk_query( void ) +pba_t* bli_pba_query( void ) { - return &global_membrk; + return &pba; } -void bli_membrk_init +void bli_pba_init ( cntx_t* restrict cntx ) { - membrk_t* restrict membrk = bli_membrk_query(); + pba_t* restrict pba = bli_pba_query(); const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN; malloc_ft malloc_fp = BLIS_MALLOC_POOL; free_ft free_fp = BLIS_FREE_POOL; // These fields are used for general-purpose allocation (ie: buf_type - // equal to BLIS_BUFFER_FOR_GEN_USE) within bli_membrk_acquire_m(). - bli_membrk_set_align_size( align_size, membrk ); - bli_membrk_set_malloc_fp( malloc_fp, membrk ); - bli_membrk_set_free_fp( free_fp, membrk ); + // equal to BLIS_BUFFER_FOR_GEN_USE) within bli_pba_acquire_m(). + bli_pba_set_align_size( align_size, pba ); + bli_pba_set_malloc_fp( malloc_fp, pba ); + bli_pba_set_free_fp( free_fp, pba ); - // The mutex field of membrk is initialized statically above. This - // keeps bli_membrk_init() simpler and removes the possibility of + // The mutex field of pba is initialized statically above. This + // keeps bli_pba_init() simpler and removes the possibility of // something going wrong during mutex initialization. #ifdef BLIS_ENABLE_PBA_POOLS - bli_membrk_init_pools( cntx, membrk ); + bli_pba_init_pools( cntx, pba ); #endif } -void bli_membrk_finalize +void bli_pba_finalize ( void ) { - membrk_t* restrict membrk = bli_membrk_query(); + pba_t* restrict pba = bli_pba_query(); #ifdef BLIS_ENABLE_PBA_POOLS - bli_membrk_finalize_pools( membrk ); + bli_pba_finalize_pools( pba ); #endif - // The mutex field of membrk is initialized statically above, and + // The mutex field of pba is initialized statically above, and // therefore never destroyed. 
- bli_membrk_set_malloc_fp( NULL, membrk ); - bli_membrk_set_free_fp( NULL, membrk ); + bli_pba_set_malloc_fp( NULL, pba ); + bli_pba_set_free_fp( NULL, pba ); } -void bli_membrk_acquire_m +void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, @@ -104,24 +104,24 @@ void bli_membrk_acquire_m // If the internal memory pools for packing block allocator are disabled, // we spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the - // immediate usage of bli_membrk_malloc(). + // immediate usage of bli_pba_malloc(). #ifndef BLIS_ENABLE_PBA_POOLS buf_type = BLIS_BUFFER_FOR_GEN_USE; #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_membrk_acquire_m(): bli_fmalloc_align(): size %ld\n", + printf( "bli_pba_acquire_m(): bli_fmalloc_align(): size %ld\n", ( long )req_size ); #endif #endif // Query the memory broker from the runtime. - membrk_t* membrk = bli_rntm_membrk( rntm ); + pba_t* pba = bli_rntm_pba( rntm ); if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { - malloc_ft malloc_fp = bli_membrk_malloc_fp( membrk ); - siz_t align_size = bli_membrk_align_size( membrk ); + malloc_ft malloc_fp = bli_pba_malloc_fp( pba ); + siz_t align_size = bli_pba_align_size( pba ); // For general-use buffer requests, dynamically allocating memory // is assumed to be sufficient. @@ -131,7 +131,7 @@ void bli_membrk_acquire_m // - the address of the memory block, // - the buffer type (a packbuf_t value), // - the size of the requested region, - // - the membrk_t from which the mem_t entry was acquired. + // - the pba_t from which the mem_t entry was acquired. // NOTE: We initialize the pool field to NULL since this block did not // come from a memory pool. bli_mem_set_buffer( buf, mem ); @@ -148,13 +148,13 @@ void bli_membrk_acquire_m // Map the requested packed buffer type to a zero-based index, which // we then use to select the corresponding memory pool. 
pi = bli_packbuf_index( buf_type ); - pool = bli_membrk_pool( pi, membrk ); + pool = bli_pba_pool( pi, pba ); // Extract the address of the pblk_t struct within the mem_t. pblk = bli_mem_pblk( mem ); - // Acquire the mutex associated with the membrk object. - bli_membrk_lock( membrk ); + // Acquire the mutex associated with the pba object. + bli_pba_lock( pba ); // BEGIN CRITICAL SECTION { @@ -172,8 +172,8 @@ void bli_membrk_acquire_m } // END CRITICAL SECTION - // Release the mutex associated with the membrk object. - bli_membrk_unlock( membrk ); + // Release the mutex associated with the pba object. + bli_pba_unlock( pba ); // Query the block_size from the pblk_t. This will be at least // req_size, perhaps larger. @@ -184,7 +184,7 @@ void bli_membrk_acquire_m // - the address of the memory pool to which it belongs, // - the size of the contiguous memory block (NOT the size of the // requested region), - // - the membrk_t from which the mem_t entry was acquired. + // - the pba_t from which the mem_t entry was acquired. // The actual (aligned) address is already stored in the mem_t // struct's pblk_t field. bli_mem_set_buf_type( buf_type, mem ); @@ -194,7 +194,7 @@ void bli_membrk_acquire_m } -void bli_membrk_release +void bli_pba_release ( rntm_t* rntm, mem_t* mem @@ -205,21 +205,21 @@ void bli_membrk_release pblk_t* pblk; // Query the memory broker from the runtime. - membrk_t* membrk = bli_rntm_membrk( rntm ); + pba_t* pba = bli_rntm_pba( rntm ); // Extract the buffer type so we know what kind of memory was allocated. 
buf_type = bli_mem_buf_type( mem ); #ifndef BLIS_ENABLE_PBA_POOLS #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_membrk_release(): bli_ffree_align(): size %ld\n", + printf( "bli_pba_release(): bli_ffree_align(): size %ld\n", ( long )bli_mem_size( mem ) ); #endif #endif if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { - free_ft free_fp = bli_membrk_free_fp( membrk ); + free_ft free_fp = bli_pba_free_fp( pba ); void* buf = bli_mem_buffer( mem ); // For general-use buffers, we dynamically allocate memory, and so @@ -235,8 +235,8 @@ void bli_membrk_release // Extract the address of the pblk_t struct within the mem_t struct. pblk = bli_mem_pblk( mem ); - // Acquire the mutex associated with the membrk object. - bli_membrk_lock( membrk ); + // Acquire the mutex associated with the pba object. + bli_pba_lock( pba ); // BEGIN CRITICAL SECTION { @@ -247,15 +247,15 @@ void bli_membrk_release } // END CRITICAL SECTION - // Release the mutex associated with the membrk object. - bli_membrk_unlock( membrk ); + // Release the mutex associated with the pba object. + bli_pba_unlock( pba ); } // Clear the mem_t object so that it appears unallocated. This clears: // - the pblk_t struct's fields (ie: the buffer addresses) // - the pool field // - the size field - // - the membrk field + // - the pba field // NOTE: We do not clear the buf_type field since there is no // "uninitialized" value for packbuf_t. 
bli_mem_clear( mem ); @@ -263,35 +263,38 @@ void bli_membrk_release #if 0 -void bli_membrk_acquire_v +void bli_pba_acquire_v ( - membrk_t* membrk, - siz_t req_size, - mem_t* mem + pba_t* pba, + siz_t req_size, + mem_t* mem ) { - bli_membrk_acquire_m( membrk, - req_size, - BLIS_BUFFER_FOR_GEN_USE, - mem ); + bli_pba_acquire_m + ( + pba, + req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem + ); } #endif -void bli_membrk_rntm_set_membrk +void bli_pba_rntm_set_pba ( rntm_t* rntm ) { - membrk_t* membrk = bli_membrk_query(); + pba_t* pba = bli_pba_query(); - bli_rntm_set_membrk( membrk, rntm ); + bli_rntm_set_pba( pba, rntm ); } -siz_t bli_membrk_pool_size +siz_t bli_pba_pool_size ( - membrk_t* membrk, + pba_t* pba, packbuf_t buf_type ) { @@ -311,7 +314,7 @@ siz_t bli_membrk_pool_size // Acquire the pointer to the pool corresponding to the buf_type // provided. pool_index = bli_packbuf_index( buf_type ); - pool = bli_membrk_pool( pool_index, membrk ); + pool = bli_pba_pool( pool_index, pba ); // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. @@ -324,10 +327,10 @@ siz_t bli_membrk_pool_size // ----------------------------------------------------------------------------- -void bli_membrk_init_pools +void bli_pba_init_pools ( - cntx_t* cntx, - membrk_t* membrk + cntx_t* cntx, + pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. @@ -336,9 +339,9 @@ void bli_membrk_init_pools const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = bli_membrk_pool( index_a, membrk ); - pool_t* pool_b = bli_membrk_pool( index_b, membrk ); - pool_t* pool_c = bli_membrk_pool( index_c, membrk ); + pool_t* pool_a = bli_pba_pool( index_a, pba ); + pool_t* pool_b = bli_pba_pool( index_b, pba ); + pool_t* pool_c = bli_pba_pool( index_c, pba ); // Start with empty pools. 
const dim_t num_blocks_a = 0; @@ -370,10 +373,10 @@ void bli_membrk_init_pools free_ft free_fp = BLIS_FREE_POOL; // Determine the block size for each memory pool. - bli_membrk_compute_pool_block_sizes( &block_size_a, - &block_size_b, - &block_size_c, - cntx ); + bli_pba_compute_pool_block_sizes( &block_size_a, + &block_size_b, + &block_size_c, + cntx ); // Initialize the memory pools for A, B, and C. bli_pool_init( num_blocks_a, block_ptrs_len_a, block_size_a, align_size_a, @@ -384,9 +387,9 @@ void bli_membrk_init_pools offset_size_c, malloc_fp, free_fp, pool_c ); } -void bli_membrk_finalize_pools +void bli_pba_finalize_pools ( - membrk_t* membrk + pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. @@ -395,9 +398,9 @@ void bli_membrk_finalize_pools dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = bli_membrk_pool( index_a, membrk ); - pool_t* pool_b = bli_membrk_pool( index_b, membrk ); - pool_t* pool_c = bli_membrk_pool( index_c, membrk ); + pool_t* pool_a = bli_pba_pool( index_a, pba ); + pool_t* pool_b = bli_pba_pool( index_b, pba ); + pool_t* pool_c = bli_pba_pool( index_c, pba ); // Finalize the memory pools for A, B, and C. bli_pool_finalize( pool_a ); @@ -407,7 +410,7 @@ void bli_membrk_finalize_pools // ----------------------------------------------------------------------------- -void bli_membrk_compute_pool_block_sizes +void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, @@ -435,11 +438,11 @@ void bli_membrk_compute_pool_block_sizes // Avoid considering induced methods for real datatypes. 
if ( bli_is_real( dt ) && im != BLIS_NAT ) continue; - bli_membrk_compute_pool_block_sizes_dt( dt, - &bs_dt_a, - &bs_dt_b, - &bs_dt_c, - cntx ); + bli_pba_compute_pool_block_sizes_dt( dt, + &bs_dt_a, + &bs_dt_b, + &bs_dt_c, + cntx ); bs_cand_a = bli_max( bs_dt_a, bs_cand_a ); bs_cand_b = bli_max( bs_dt_b, bs_cand_b ); @@ -454,7 +457,7 @@ void bli_membrk_compute_pool_block_sizes // ----------------------------------------------------------------------------- -void bli_membrk_compute_pool_block_sizes_dt +void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, diff --git a/frame/base/bli_membrk.h b/frame/base/bli_pba.h similarity index 61% rename from frame/base/bli_membrk.h rename to frame/base/bli_pba.h index b8a878abd3..ce19991f55 100644 --- a/frame/base/bli_membrk.h +++ b/frame/base/bli_pba.h @@ -37,83 +37,100 @@ #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H -// membrk init +// Packing block allocator (formerly memory broker) -//BLIS_INLINE void bli_membrk_init_mutex( membrk_t* membrk ) +/* +typedef struct pba_s +{ + pool_t pools[3]; + bli_pthread_mutex_t mutex; + + // These fields are used for general-purpose allocation. 
+ siz_t align_size; + malloc_ft malloc_fp; + free_ft free_fp; + +} pba_t; +*/ + + +// pba init + +//BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ -// bli_pthread_mutex_init( &(membrk->mutex), NULL ); +// bli_pthread_mutex_init( &(pba->mutex), NULL ); //} -//BLIS_INLINE void bli_membrk_finalize_mutex( membrk_t* membrk ) +//BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ -// bli_pthread_mutex_destroy( &(membrk->mutex) ); +// bli_pthread_mutex_destroy( &(pba->mutex) ); //} -// membrk query +// pba query -BLIS_INLINE pool_t* bli_membrk_pool( dim_t pool_index, membrk_t* membrk ) +BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { - return &(membrk->pools[ pool_index ]); + return &(pba->pools[ pool_index ]); } -BLIS_INLINE siz_t bli_membrk_align_size( membrk_t* membrk ) +BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { - return membrk->align_size; + return pba->align_size; } -BLIS_INLINE malloc_ft bli_membrk_malloc_fp( membrk_t* membrk ) +BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { - return membrk->malloc_fp; + return pba->malloc_fp; } -BLIS_INLINE free_ft bli_membrk_free_fp( membrk_t* membrk ) +BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { - return membrk->free_fp; + return pba->free_fp; } -// membrk modification +// pba modification -BLIS_INLINE void bli_membrk_set_align_size( siz_t align_size, membrk_t* membrk ) +BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { - membrk->align_size = align_size; + pba->align_size = align_size; } -BLIS_INLINE void bli_membrk_set_malloc_fp( malloc_ft malloc_fp, membrk_t* membrk ) +BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { - membrk->malloc_fp = malloc_fp; + pba->malloc_fp = malloc_fp; } -BLIS_INLINE void bli_membrk_set_free_fp( free_ft free_fp, membrk_t* membrk ) +BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { - membrk->free_fp = free_fp; + pba->free_fp = free_fp; } -// membrk action +// pba action 
-BLIS_INLINE void bli_membrk_lock( membrk_t* membrk ) +BLIS_INLINE void bli_pba_lock( pba_t* pba ) { - bli_pthread_mutex_lock( &(membrk->mutex) ); + bli_pthread_mutex_lock( &(pba->mutex) ); } -BLIS_INLINE void bli_membrk_unlock( membrk_t* membrk ) +BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { - bli_pthread_mutex_unlock( &(membrk->mutex) ); + bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- -membrk_t* bli_membrk_query( void ); +pba_t* bli_pba_query( void ); -void bli_membrk_init +void bli_pba_init ( cntx_t* cntx ); -void bli_membrk_finalize +void bli_pba_finalize ( void ); -void bli_membrk_acquire_m +void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, @@ -121,43 +138,43 @@ void bli_membrk_acquire_m mem_t* mem ); -void bli_membrk_release +void bli_pba_release ( rntm_t* rntm, mem_t* mem ); -void bli_membrk_rntm_set_membrk +void bli_pba_rntm_set_pba ( rntm_t* rntm ); -siz_t bli_membrk_pool_size +siz_t bli_pba_pool_size ( - membrk_t* membrk, + pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- -void bli_membrk_init_pools +void bli_pba_init_pools ( - cntx_t* cntx, - membrk_t* membrk + cntx_t* cntx, + pba_t* pba ); -void bli_membrk_finalize_pools +void bli_pba_finalize_pools ( - membrk_t* membrk + pba_t* pba ); -void bli_membrk_compute_pool_block_sizes +void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); -void bli_membrk_compute_pool_block_sizes_dt +void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index ed80955f54..249a698051 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -52,7 +52,7 @@ typedef struct rntm_s bool l3_sup; pool_t* sba_pool; - membrk_t* membrk; + pba_t* pba; } rntm_t; */ @@ -124,9 +124,9 @@ BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) return rntm->sba_pool; } 
-BLIS_INLINE membrk_t* bli_rntm_membrk( rntm_t* rntm ) +BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { - return rntm->membrk; + return rntm->pba; } #if 0 @@ -205,9 +205,9 @@ BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) rntm->sba_pool = sba_pool; } -BLIS_INLINE void bli_rntm_set_membrk( membrk_t* membrk, rntm_t* rntm ) +BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { - rntm->membrk = membrk; + rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) @@ -222,9 +222,9 @@ BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } -BLIS_INLINE void bli_rntm_clear_membrk( rntm_t* rntm ) +BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { - bli_rntm_set_membrk( NULL, rntm ); + bli_rntm_set_pba( NULL, rntm ); } // @@ -313,7 +313,7 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ - .membrk = NULL, \ + .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) @@ -327,7 +327,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); - bli_rntm_clear_membrk( rntm ); + bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 3d748a3898..6b7919a11e 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -34,7 +34,8 @@ #include "blis.h" -// The small block allocator: an apool_t of array_t of pool_t. +// Statically initialize the mutex within the small block allocator. +// Note that the sba is an apool_t of array_t of pool_t. 
static apool_t sba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; apool_t* bli_sba_query( void ) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 7dcc5a1b7b..de7eee10a5 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1097,7 +1097,7 @@ typedef struct // -- packing block allocator: Locked set of pools type -- -typedef struct membrk_s +typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; @@ -1107,7 +1107,7 @@ typedef struct membrk_s malloc_ft malloc_fp; free_ft free_fp; -} membrk_t; +} pba_t; // -- Memory object type -- @@ -1477,7 +1477,7 @@ typedef struct rntm_s pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. - membrk_t* membrk; + pba_t* pba; } rntm_t; diff --git a/frame/include/blis.h b/frame/include/blis.h index e51f0a5c39..e4046c2eb1 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -109,7 +109,7 @@ extern "C" { #include "bli_rntm.h" #include "bli_gks.h" #include "bli_ind.h" -#include "bli_membrk.h" +#include "bli_pba.h" #include "bli_pool.h" #include "bli_array.h" #include "bli_apool.h" diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 0bf3ad8547..d7c2c2cfd7 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -92,7 +92,7 @@ void bli_l3_thread_decorator // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. 
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index a2c4530625..93ff5f9cd0 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -176,7 +176,7 @@ void bli_l3_thread_decorator // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index 12f27ad873..51474f0eee 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -78,7 +78,7 @@ void bli_l3_thread_decorator bli_sba_rntm_set_pool( 0, array, rntm ); // Set the packing block allocator field of the rntm. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index 4baacd1ea7..1db9514fd4 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -76,7 +76,7 @@ err_t bli_l3_sup_thread_decorator // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. 
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index 5b23bc86b1..9fa8730e67 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -141,7 +141,7 @@ err_t bli_l3_sup_thread_decorator // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c index 2275edf80b..a87af41032 100644 --- a/frame/thread/bli_l3_sup_decor_single.c +++ b/frame/thread/bli_l3_sup_decor_single.c @@ -69,7 +69,7 @@ err_t bli_l3_sup_thread_decorator bli_sba_rntm_set_pool( 0, array, rntm ); // Set the packing block allocator field of the rntm. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); #ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index 73fa567d24..b04ffea580 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -268,12 +268,12 @@ static err_t bli_sgemm_small bli_rntm_init_from_global( &rntm ); bli_rntm_set_num_threads_only( 1, &rntm ); - bli_membrk_rntm_set_membrk( &rntm ); + bli_pba_rntm_set_pba( &rntm ); // Get the current size of the buffer pool for A block packing. 
// We will use the same size to avoid pool re-initialization - siz_t buffer_size = bli_pool_block_size(bli_membrk_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK), - bli_rntm_membrk(&rntm))); + siz_t buffer_size = bli_pool_block_size(bli_pba_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK), + bli_rntm_pba(&rntm))); // Based on the available memory in the buffer we will decide if // we want to do packing or not. @@ -299,7 +299,7 @@ static err_t bli_sgemm_small #endif // Get the buffer from the pool, if there is no pool with // required size, it will be created. - bli_membrk_acquire_m(&rntm, + bli_pba_acquire_m(&rntm, buffer_size, BLIS_BITVAL_BUFFER_FOR_A_BLOCK, &local_mem_buf_A_s); @@ -1699,7 +1699,7 @@ static err_t bli_sgemm_small #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_sgemm_small(): releasing mem pool block\n" ); #endif - bli_membrk_release(&rntm, + bli_pba_release(&rntm, &local_mem_buf_A_s); } @@ -1833,13 +1833,13 @@ static err_t bli_dgemm_small bli_rntm_init_from_global( &rntm ); bli_rntm_set_num_threads_only( 1, &rntm ); - bli_membrk_rntm_set_membrk( &rntm ); + bli_pba_rntm_set_pba( &rntm ); // Get the current size of the buffer pool for A block packing. // We will use the same size to avoid pool re-initliazaton siz_t buffer_size = bli_pool_block_size( - bli_membrk_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK), - bli_rntm_membrk(&rntm))); + bli_pba_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK), + bli_rntm_pba(&rntm))); // // This kernel assumes that "A" will be unpackged if N <= 3. @@ -1863,7 +1863,7 @@ static err_t bli_dgemm_small printf( "bli_dgemm_small: Requesting mem pool block of size %lu\n", buffer_size); #endif // Get the buffer from the pool. 
- bli_membrk_acquire_m(&rntm, + bli_pba_acquire_m(&rntm, buffer_size, BLIS_BITVAL_BUFFER_FOR_A_BLOCK, &local_mem_buf_A_s); @@ -3309,7 +3309,7 @@ static err_t bli_dgemm_small #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_dgemm_small(): releasing mem pool block\n" ); #endif - bli_membrk_release(&rntm, + bli_pba_release(&rntm, &local_mem_buf_A_s); } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 3c298379da..29e5de95cc 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -593,10 +593,6 @@ void GENBARNAME(cntx_init) bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED, cntx ); - - //bli_cntx_set_anti_pref( FALSE, cntx ); - - //bli_cntx_set_membrk( bli_membrk_query(), cntx ); } // ----------------------------------------------------------------------------- diff --git a/sandbox/ref99/old/packm/blx_l3_packm.c b/sandbox/ref99/old/packm/blx_l3_packm.c index 4ec1ac108a..982e2d9631 100644 --- a/sandbox/ref99/old/packm/blx_l3_packm.c +++ b/sandbox/ref99/old/packm/blx_l3_packm.c @@ -45,7 +45,7 @@ void blx_l3_packm thrinfo_t* thread ) { - membrk_t* membrk; + pba_t* pba; packbuf_t pack_buf_type; mem_t* cntl_mem_p; siz_t size_needed; @@ -71,7 +71,7 @@ void blx_l3_packm if ( size_needed == 0 ) return; // Query the memory broker from the context. - membrk = bli_cntx_get_membrk( cntx ); + pba = bli_cntx_get_pba( cntx ); // Query the pack buffer type from the control tree node. pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); @@ -91,9 +91,9 @@ void blx_l3_packm { // The chief thread acquires a block from the memory broker // and saves the associated mem_t entry to local_mem_s. 
- bli_membrk_acquire_m + bli_pba_acquire_m ( - membrk, + pba, size_needed, pack_buf_type, &local_mem_s @@ -130,10 +130,10 @@ void blx_l3_packm // The chief thread releases the existing block associated with // the mem_t entry in the control tree, and then re-acquires a // new block, saving the associated mem_t entry to local_mem_s. - bli_membrk_release( cntl_mem_p ); - bli_membrk_acquire_m + bli_pba_release( cntl_mem_p ); + bli_pba_acquire_m ( - membrk, + pba, size_needed, pack_buf_type, &local_mem_s diff --git a/sandbox/ref99/old/thread/blx_gemm_thread.c b/sandbox/ref99/old/thread/blx_gemm_thread.c index b4a38f827b..b5657aa4f2 100644 --- a/sandbox/ref99/old/thread/blx_gemm_thread.c +++ b/sandbox/ref99/old/thread/blx_gemm_thread.c @@ -147,7 +147,7 @@ void blx_gemm_thread // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); From 04502492671456b94bcdee60b9de347b6763a32d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 28 Mar 2021 19:11:43 -0500 Subject: [PATCH 003/226] Always stay initialized after BLAS compat calls. Details: - Removed the option to finalize BLIS after every BLAS call, which also means that BLIS would initialize at the beginning of every BLAS call. This option never really made sense and wasn't even implemented properly to begin with. (Because bli_init_auto() and _finalize_auto() were implemented in terms of bli_init_once() and _finalize_once(), respectively, the application would have only been able to call one BLAS routine before BLIS would find itself in an unusable, permanently uninitialized state.)
Because this option was never meant for regular use, it never made it into configure as an actual configure-time option, and therefore this commit only removes parts of the code affected by the cpp macro guard BLIS_ENABLE_STAY_AUTO_INITIALIZED. --- frame/base/bli_info.c | 8 -------- frame/base/bli_init.c | 16 ++++------------ frame/include/bli_config_macro_defs.h | 10 ---------- 3 files changed, 4 insertions(+), 30 deletions(-) diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index bfd6f6fcc8..fa7901583f 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -69,14 +69,6 @@ gint_t bli_info_get_pool_addr_offset_size_a( void ) { return BLIS_POOL_ADDR_OF gint_t bli_info_get_pool_addr_offset_size_b( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_B; } gint_t bli_info_get_pool_addr_offset_size_c( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_C; } gint_t bli_info_get_pool_addr_offset_size_gen( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_GEN; } -gint_t bli_info_get_enable_stay_auto_init( void ) -{ -#ifdef BLIS_ENABLE_STAY_AUTO_INITIALIZED - return 1; -#else - return 0; -#endif -} gint_t bli_info_get_enable_blas( void ) { #ifdef BLIS_ENABLE_BLAS diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index a719b25f8b..1e28ace096 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -56,18 +56,10 @@ void bli_init_auto( void ) void bli_finalize_auto( void ) { -#ifdef BLIS_ENABLE_STAY_AUTO_INITIALIZED - - // If BLIS was configured to stay initialized after being automatically - // initialized, we honor the configuration request and do nothing. - // BLIS will remain initialized unless and until the user explicitly - // calls bli_finalize(). - -#else - - bli_finalize_once(); - -#endif + // The _auto() functions are used when initializing the BLAS compatibility + // layer. 
It would not make much sense to automatically initialize and + // finalize for every BLAS routine call; therefore, we remain initialized + // unless and until the application explicitly calls bli_finalize(). } // ----------------------------------------------------------------------------- diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index d00df2f0be..86f23df6e0 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -149,16 +149,6 @@ #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#ifdef BLIS_DISABLE_STAY_AUTO_INITIALIZED - #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED -#else - // Default behavior is enabled. - #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED // In case user explicitly enabled. - #define BLIS_ENABLE_STAY_AUTO_INITIALIZED -#endif - // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- From 09bd4f4f12311131938baa9f75d27e92b664d681 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 31 Mar 2021 17:09:36 -0500 Subject: [PATCH 004/226] Add err_t* "return" parameter to malloc functions. Details: - Added an err_t* parameter to memory allocation functions including bli_malloc_intl(), bli_calloc_intl(), bli_malloc_user(), bli_fmalloc_align(), and bli_fmalloc_noalign(). Since these functions already use the return value to return the allocated memory address, they can't communicate errors to the caller through the return value. This commit does not employ any error checking within these functions or their callers, but this sets up BLIS for a more comprehensive commit that moves in that direction. - Moved the typedefs for malloc_ft and free_ft from bli_malloc.h to bli_type_defs.h. This was done so that what remains of bli_malloc.h can be included after the definition of the err_t enum. 
(This ordering was needed because bli_malloc.h now contains function prototypes that use err_t.) - Defined bli_is_success() and bli_is_failure() static functions in bli_param_macro_defs.h. These functions provide easy checks for error codes and will be used more heavily in future commits. - Unfortunately, the additional err_t* argument discussed above breaks the API for bli_malloc_user(), which is an exported symbol in the shared library. However, it's quite possible that the only application that calls bli_malloc_user()--indeed, the reason it was marked for symbol exporting to begin with--is the BLIS testsuite. And if that's the case, this breakage won't affect anyone. Nonetheless, the "major" part of the so_version file has been updated accordingly to 4.0.0. --- frame/1m/unpackm/bli_unpackm_cntl.c | 3 +- frame/base/bli_apool.c | 16 +++-- frame/base/bli_array.c | 8 ++- frame/base/bli_blksz.c | 8 ++- frame/base/bli_cntx.c | 83 ++++++++++++++---------- frame/base/bli_func.c | 3 +- frame/base/bli_gks.c | 9 ++- frame/base/bli_malloc.c | 31 ++++++--- frame/base/bli_malloc.h | 14 ++-- frame/base/bli_mbool.c | 3 +- frame/base/bli_obj.c | 3 +- frame/base/bli_pba.c | 3 +- frame/base/bli_pool.c | 12 +++- frame/base/bli_sba.c | 5 +- frame/include/bli_param_macro_defs.h | 15 +++++ frame/include/bli_type_defs.h | 12 ++-- frame/include/blis.h | 1 + frame/thread/bli_l3_decor_openmp.c | 3 +- frame/thread/bli_l3_decor_pthreads.c | 6 +- frame/thread/bli_l3_sup_decor_pthreads.c | 6 +- frame/thread/bli_thrcomm_openmp.c | 8 ++- frame/thread/bli_thrinfo.c | 4 +- frame/thread/bli_thrinfo_sup.c | 4 +- kernels/zen/3/bli_trsm_small.c | 3 +- so_version | 2 +- testsuite/src/test_libblis.c | 8 ++- 26 files changed, 181 insertions(+), 92 deletions(-) diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index f2be05a54d..95d0545bec 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -45,13 +45,14 @@ cntl_t*
bli_unpackm_cntl_create_node { cntl_t* cntl; unpackm_params_t* params; + err_t r_val; // NOTE: If this function is ever called, figure out whether the // bli_malloc_intl() below needs to be changed to bli_sba_acquire(). bli_abort(); // Allocate an unpackm_params_t struct. - params = bli_malloc_intl( sizeof( unpackm_params_t ) ); + params = bli_malloc_intl( sizeof( unpackm_params_t ), &r_val ); // Initialize the unpackm_params_t struct. params->size = sizeof( unpackm_params_t ); diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c index ffd50b12e0..e2d8123511 100644 --- a/frame/base/bli_apool.c +++ b/frame/base/bli_apool.c @@ -39,6 +39,8 @@ void bli_apool_init apool_t* restrict apool ) { + err_t r_val; + // NOTE: The apool_t is only used in one place; it is the type used to // define the sba. We've switched to static initialization of the mutex // field to remove one more thing that could possibly go wrong during @@ -92,7 +94,7 @@ void bli_apool_init // Allocate the block_ptrs array. array_t** restrict block_ptrs = - bli_malloc_intl( block_ptrs_len * sizeof( array_t* ) ); + bli_malloc_intl( block_ptrs_len * sizeof( array_t* ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_init(): allocating %d array_t.\n", ( int )num_blocks ); @@ -141,6 +143,8 @@ void bli_apool_alloc_block array_t** restrict array_p ) { + err_t r_val; + // Since the apool_t is defined as a pool of array_t, we can hard-code // the block_size parameter. const siz_t block_size = sizeof( array_t ); @@ -154,7 +158,7 @@ void bli_apool_alloc_block // be recovered when it's time to free the block. array_t* restrict array = - bli_malloc_intl( block_size ); + bli_malloc_intl( block_size, &r_val ); // Initialize an array_t struct within the newly allocated memory region. bli_array_init( num_elem, sizeof( pool_t* ), array ); @@ -376,6 +380,8 @@ pool_t* bli_apool_array_elem array_t* restrict array ) { + err_t r_val; + // Query the array element corresponding to index. 
// NOTE: If we knew that the array_t contained elements of size // sizeof( void* ) or sizeof( whatever ), we could return the *value* @@ -425,7 +431,7 @@ pool_t* bli_apool_array_elem #endif // Allocate the pool_t. - pool = bli_malloc_intl( sizeof( pool_t ) ); + pool = bli_malloc_intl( sizeof( pool_t ), &r_val ); // Initialize the pool_t. bli_pool_init @@ -461,6 +467,8 @@ void bli_apool_grow apool_t* restrict apool ) { + err_t r_val; + // If the requested increase is zero, return early. if ( num_blocks_add == 0 ) return; @@ -501,7 +509,7 @@ void bli_apool_grow // Allocate a new block_ptrs array. array_t** restrict block_ptrs_new = - bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ) ); + bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ), &r_val ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); diff --git a/frame/base/bli_array.c b/frame/base/bli_array.c index 6232cffbbb..3844cd52f7 100644 --- a/frame/base/bli_array.c +++ b/frame/base/bli_array.c @@ -43,6 +43,8 @@ void bli_array_init array_t* restrict array ) { + err_t r_val; + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_init(): allocating array [%d * %d]: ", ( int )num_elem, ( int )elem_size ); @@ -52,7 +54,7 @@ void bli_array_init const size_t array_size = num_elem * elem_size; // Allocate the array buffer. - void* restrict buf = bli_malloc_intl( array_size ); + void* restrict buf = bli_malloc_intl( array_size, &r_val ); // Initialize the array elements to zero. THIS IS IMPORANT because // consumer threads will use the NULL-ness of the array elements to @@ -72,6 +74,8 @@ void bli_array_resize array_t* restrict array ) { + err_t r_val; + // Query the number of elements in the array. const siz_t num_elem_prev = bli_array_num_elem( array ); @@ -98,7 +102,7 @@ void bli_array_resize #endif // Allocate a new array buffer. 
- char* restrict buf_new = bli_malloc_intl( array_size_new ); + char* restrict buf_new = bli_malloc_intl( array_size_new, &r_val ); // Copy the previous array contents to the new array. memcpy( buf_new, buf_prev, array_size_prev ); diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index c6107ca809..524653d743 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -42,7 +42,9 @@ blksz_t* bli_blksz_create_ed dim_t b_z, dim_t be_z ) { - blksz_t* b = bli_malloc_intl( sizeof( blksz_t ) ); + err_t r_val; + + blksz_t* b = bli_malloc_intl( sizeof( blksz_t ), &r_val ); bli_blksz_init_ed ( @@ -62,7 +64,9 @@ blksz_t* bli_blksz_create dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ) { - blksz_t* b = bli_malloc_intl( sizeof( blksz_t ) ); + err_t r_val; + + blksz_t* b = bli_malloc_intl( sizeof( blksz_t ), &r_val ); bli_blksz_init ( diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index c4f2080038..82952cc28c 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -78,33 +78,34 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); + blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); // -- Begin variable argument section -- @@ -343,6 +344,7 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) va_list args; dim_t i; + err_t r_val; // Return early if called with BLIS_NAT. if ( method == BLIS_NAT ) return; @@ -352,17 +354,17 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... 
) #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_ind_blkszs(): " ); #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_ind_blkszs(): " ); #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_ind_blkszs(): " ); #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); // -- Begin variable argument section -- @@ -523,28 +525,29 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_nat_ukrs(): " ); #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) ); + l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_nat_ukrs(): " ); #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_nat_ukrs(): " ); #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ) ); + void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_nat_ukrs(): " ); #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ) ); + bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); // -- Begin variable argument section -- @@ -680,23 +683,24 @@ void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) ); + l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ) ); + void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- @@ -800,20 +804,21 @@ void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ) */ - va_list args; - dim_t i; + va_list args; + dim_t i; + err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_thresh(): " ); #endif - threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ) ); + threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_thresh(): " ); #endif - blksz_t** threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ) ); + blksz_t** threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val ); // -- Begin variable argument section -- @@ -907,18 +912,19 @@ void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif - opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ) ); + opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif - void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ) ); + void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); // -- Begin variable argument section -- @@ -1005,17 +1011,18 @@ void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); + blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); // -- Begin variable argument section -- @@ -1109,28 +1116,29 @@ void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif - stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ) ); + stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif - void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) ); + void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ) ); + bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); // -- Begin variable argument section -- @@ -1287,23 +1295,24 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif - l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ) ); + l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ) ); + void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- @@ -1405,23 +1414,24 @@ void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif - l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ) ); + l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ) ); + void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- @@ -1523,23 +1533,24 @@ void bli_cntx_set_packm_kers( dim_t n_kers, ... ) va_list args; dim_t i; + err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_packm_kers(): " ); #endif - l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) ); + l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_packm_kers(): " ); #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_packm_kers(): " ); #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ) ); + void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index cca336e153..477710ff00 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -44,8 +44,9 @@ func_t* bli_func_create ) { func_t* f; + err_t r_val; - f = ( func_t* ) bli_malloc_intl( sizeof(func_t) ); + f = ( func_t* )bli_malloc_intl( sizeof( func_t ), &r_val ); bli_func_init ( diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index c789ec2067..42dc20c0fc 100644 
--- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -327,6 +327,8 @@ void bli_gks_register_cntx void_fp ind_fp ) { + err_t r_val; + // This function is called by bli_gks_init() for each architecture that // will be supported by BLIS. It takes an architecture id and three // function pointers, one to a function that initializes a native context @@ -375,7 +377,7 @@ void bli_gks_register_cntx // needs to be allocated. Allocate the memory and initialize it to // zeros/NULL, storing the address of the alloacted memory at the element // for the current architecture id. - gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS ); + gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val ); // Alias the allocated array for readability. cntx_t** restrict gks_id = gks[ id ]; @@ -387,7 +389,7 @@ void bli_gks_register_cntx // Allocate memory for a single context and store the address at // the element in the gks[ id ] array that is reserved for native // execution. - gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ) ); + gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val ); // Alias the allocated context address for readability. cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; @@ -484,6 +486,7 @@ cntx_t* bli_gks_query_ind_cntx bli_init_once(); cntx_t* gks_id_ind; + err_t r_val; // Return the address of a context that will be suited for executing a // level-3 operation via the requested induced method (and datatype) for @@ -542,7 +545,7 @@ cntx_t* bli_gks_query_ind_cntx // If gks_id_ind is NULL, then we know we must allocate and then // initialize the context, storing its address back to // gks_id[ ind ]. 
- gks_id_ind = bli_calloc_intl( sizeof( cntx_t ) ); + gks_id_ind = bli_calloc_intl( sizeof( cntx_t ), &r_val ); gks_id[ ind ] = gks_id_ind; // Before we can call the induced method context initialization diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c index 4e9b10ce32..f1993f62e3 100644 --- a/frame/base/bli_malloc.c +++ b/frame/base/bli_malloc.c @@ -71,7 +71,7 @@ void bli_free_pool( void* p ) // ----------------------------------------------------------------------------- -void* bli_malloc_user( size_t size ) +void* bli_malloc_user( size_t size, err_t* r_val ) { const malloc_ft malloc_fp = BLIS_MALLOC_USER; const size_t align_size = BLIS_HEAP_ADDR_ALIGN_SIZE; @@ -82,7 +82,9 @@ void* bli_malloc_user( size_t size ) fflush( stdout ); #endif - return bli_fmalloc_align( malloc_fp, size, align_size ); + void* p = bli_fmalloc_align( malloc_fp, size, align_size, r_val ); + + return p; } void bli_free_user( void* p ) @@ -97,7 +99,7 @@ void bli_free_user( void* p ) // ----------------------------------------------------------------------------- -void* bli_malloc_intl( size_t size ) +void* bli_malloc_intl( size_t size, err_t* r_val ) { const malloc_ft malloc_fp = BLIS_MALLOC_INTL; @@ -106,18 +108,21 @@ void* bli_malloc_intl( size_t size ) fflush( stdout ); #endif - return bli_fmalloc_noalign( malloc_fp, size ); + void* p = bli_fmalloc_noalign( malloc_fp, size, r_val ); + + return p; } -void* bli_calloc_intl( size_t size ) +void* bli_calloc_intl( size_t size, err_t* r_val ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_calloc_intl(): " ); #endif - void* p = bli_malloc_intl( size ); + void* p = bli_malloc_intl( size, r_val ); - memset( p, 0, size ); + if ( bli_is_success( *r_val ) ) + memset( p, 0, size ); return p; } @@ -138,7 +143,8 @@ void* bli_fmalloc_align ( malloc_ft f, size_t size, - size_t align_size + size_t align_size, + err_t* r_val ) { const size_t ptr_size = sizeof( void* ); @@ -165,6 +171,9 @@ void* bli_fmalloc_align if ( 
bli_error_checking_is_enabled() ) bli_fmalloc_post_check( p_orig ); + // The pseudo-return value isn't used yet. + *r_val = BLIS_SUCCESS; + // Advance the pointer by one pointer element. p_byte = p_orig; p_byte += ptr_size; @@ -226,7 +235,8 @@ void bli_ffree_align void* bli_fmalloc_noalign ( malloc_ft f, - size_t size + size_t size, + err_t* r_val ) { void* p = f( size ); @@ -235,6 +245,9 @@ void* bli_fmalloc_noalign if ( bli_error_checking_is_enabled() ) bli_fmalloc_post_check( p ); + // The pseudo-return value isn't used yet. + *r_val = BLIS_SUCCESS; + return p; } diff --git a/frame/base/bli_malloc.h b/frame/base/bli_malloc.h index 2659a81fa1..488124045f 100644 --- a/frame/base/bli_malloc.h +++ b/frame/base/bli_malloc.h @@ -34,8 +34,8 @@ */ // Typedef function pointer types for malloc() and free() substitutes. -typedef void* (*malloc_ft) ( size_t size ); -typedef void (*free_ft) ( void* p ); +//typedef void* (*malloc_ft) ( size_t size ); +//typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- @@ -44,19 +44,19 @@ BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif -void* bli_malloc_intl( size_t size ); -void* bli_calloc_intl( size_t size ); +void* bli_malloc_intl( size_t size, err_t* r_val ); +void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); -BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size ); +BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- -void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size ); +void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); -void* bli_fmalloc_noalign( malloc_ft f, size_t size ); +void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* 
r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); diff --git a/frame/base/bli_mbool.c b/frame/base/bli_mbool.c index b120a88df5..d0b78dacd8 100644 --- a/frame/base/bli_mbool.c +++ b/frame/base/bli_mbool.c @@ -44,8 +44,9 @@ mbool_t* bli_mbool_create ) { mbool_t* b; + err_t r_val; - b = ( mbool_t* ) bli_malloc_intl( sizeof(mbool_t) ); + b = ( mbool_t* ) bli_malloc_intl( sizeof( mbool_t ), &r_val ); bli_mbool_init ( diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index f2b59e180d..43e5101b5f 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -147,6 +147,7 @@ void bli_obj_alloc_buffer siz_t elem_size; siz_t buffer_size; void* p; + err_t r_val; bli_init_once(); @@ -195,7 +196,7 @@ void bli_obj_alloc_buffer buffer_size = ( siz_t )n_elem * elem_size; // Allocate the buffer. - p = bli_malloc_user( buffer_size ); + p = bli_malloc_user( buffer_size, &r_val ); // Set individual fields. bli_obj_set_buffer( p, obj ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index 7c0b606485..a924bbefc8 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -101,6 +101,7 @@ void bli_pba_acquire_m pool_t* pool; pblk_t* pblk; dim_t pi; + err_t r_val; // If the internal memory pools for packing block allocator are disabled, // we spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the @@ -125,7 +126,7 @@ void bli_pba_acquire_m // For general-use buffer requests, dynamically allocating memory // is assumed to be sufficient. 
- void* buf = bli_fmalloc_align( malloc_fp, req_size, align_size ); + void* buf = bli_fmalloc_align( malloc_fp, req_size, align_size, &r_val ); // Initialize the mem_t object with: // - the address of the memory block, diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index ae5d6c5522..08cbbbf2e7 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -49,6 +49,8 @@ void bli_pool_init pool_t* restrict pool ) { + err_t r_val; + // Make sure that block_ptrs_len is at least num_blocks. block_ptrs_len = bli_max( block_ptrs_len, num_blocks ); @@ -62,7 +64,7 @@ void bli_pool_init // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. pblk_t* restrict block_ptrs = - bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ) ); + bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ), &r_val ); // Allocate and initialize each entry in the block_ptrs array. for ( dim_t i = 0; i < num_blocks; ++i ) @@ -343,6 +345,8 @@ void bli_pool_grow pool_t* restrict pool ) { + err_t r_val; + // If the requested increase is zero, return early. if ( num_blocks_add == 0 ) return; @@ -377,7 +381,7 @@ void bli_pool_grow // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. pblk_t* restrict block_ptrs_new = - bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ) ); + bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -503,6 +507,8 @@ void bli_pool_alloc_block pblk_t* restrict block ) { + err_t r_val; + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d, offset %d)\n", ( int )block_size, ( int )align_size, ( int )offset_size ); @@ -516,7 +522,7 @@ void bli_pool_alloc_block // that many bytes at the beginning of the allocated memory. 
void* restrict buf = - bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size ); + bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size, &r_val ); #if 0 // NOTE: This code is disabled because it is not needed, since diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 6b7919a11e..1da6723c79 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -62,11 +62,12 @@ void* bli_sba_acquire ) { void* block; + err_t r_val; #ifdef BLIS_ENABLE_SBA_POOLS if ( rntm == NULL ) { - block = bli_malloc_intl( req_size ); + block = bli_malloc_intl( req_size, &r_val ); } else { @@ -96,7 +97,7 @@ void* bli_sba_acquire } #else - block = bli_malloc_intl( req_size ); + block = bli_malloc_intl( req_size, &r_val ); #endif diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 91554b2269..2890274914 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -421,6 +421,21 @@ BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) } +// err_t-related + +BLIS_INLINE bool bli_is_success( err_t err ) +{ + return ( bool ) + ( err == BLIS_SUCCESS ); +} + +BLIS_INLINE bool bli_is_failure( err_t err ) +{ + return ( bool ) + ( err != BLIS_SUCCESS ); +} + + // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index de7eee10a5..b8b2d46450 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -198,16 +198,19 @@ typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; -// -- Void function pointer types -- +// -- Misc. function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). 
- //typedef void (*void_fp)( void ); typedef void* void_fp; +// Typedef function pointer types for malloc() and free() substitutes. +typedef void* (*malloc_ft)( size_t size ); +typedef void (*free_ft) ( void* p ); + // // -- BLIS info bit field offsets ---------------------------------------------- @@ -1036,10 +1039,9 @@ typedef enum // -- BLIS misc. structure types ----------------------------------------------- // -// These headers must be included here (or earlier) because definitions they -// provide are needed in the pool_t and related structs. +// This header must be included here (or earlier) because definitions it +// provides are needed in the pool_t and related structs. #include "bli_pthread.h" -#include "bli_malloc.h" // -- Pool block type -- diff --git a/frame/include/blis.h b/frame/include/blis.h index e4046c2eb1..e5fe8714c1 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -99,6 +99,7 @@ extern "C" { // -- Base operation prototypes -- #include "bli_init.h" +#include "bli_malloc.h" #include "bli_const.h" #include "bli_obj.h" #include "bli_obj_scalar.h" diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index d7c2c2cfd7..5b40d06143 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -73,7 +73,8 @@ void bli_l3_thread_decorator const dim_t n_threads = bli_rntm_num_threads( rntm ); #ifdef PRINT_THRINFO - thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); + err_t r_val; + thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ), &r_val ); #endif // NOTE: The sba was initialized in bli_init(). 
diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index 93ff5f9cd0..89b6ea1187 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -146,6 +146,8 @@ void bli_l3_thread_decorator cntl_t* cntl ) { + err_t r_val; + // This is part of a hack to support mixed domain in bli_gemm_front(). // Sometimes we need to specify a non-standard schema for A and B, and // we decided to transmit them via the schema field in the obj_t's @@ -187,12 +189,12 @@ void bli_l3_thread_decorator #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); // NOTE: We must iterate backwards so that the chief thread (thread id 0) // can spawn all other threads before proceeding with its own computation. diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index 9fa8730e67..dade71a035 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -122,6 +122,8 @@ err_t bli_l3_sup_thread_decorator rntm_t* rntm ) { + err_t r_val; + // Query the total number of threads from the context. 
const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -152,12 +154,12 @@ err_t bli_l3_sup_thread_decorator #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); // NOTE: We must iterate backwards so that the chief thread (thread id 0) // can spawn all other threads before proceeding with its own computation. diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 174b4a0c52..9bb35ea31a 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -111,17 +111,21 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { + err_t r_val; + if ( comm == NULL ) return; comm->sent_object = NULL; comm->n_threads = n_threads; - comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); + comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads, &r_val ); bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 ); } //Tree barrier used for Intel Xeon Phi barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ) { - barrier_t* me = bli_malloc_intl( sizeof(barrier_t) ); + err_t r_val; + + barrier_t* me = bli_malloc_intl( sizeof( barrier_t ), &r_val ); me->dad = NULL; me->signal = 0; diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index f03e6b943c..f9cd5ce74b 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -332,8 +332,10 @@ thrinfo_t* 
bli_thrinfo_create_for_cntl // pointers. if ( bli_thread_am_ochief( thread_par ) ) { + err_t r_val; + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) ); + new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); else new_comms = static_comms; } diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index e67e8b6426..ab28b7160c 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -197,8 +197,10 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl // pointers. if ( bli_thread_am_ochief( thread_par ) ) { + err_t r_val; + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) ); + new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); else new_comms = static_comms; } diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index c6ea0d12b1..c0b241aa85 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -10773,10 +10773,11 @@ static err_t bli_dtrsm_small_XAltB_unitDiag( k_iter = j / D_NR; //number of GEMM operations to be performed(in block of 4x4) dim_t iter; + err_t r_val; if((j+n_remainder) == n) { - f_temp = bli_malloc_user(4 * sizeof(double)); + f_temp = bli_malloc_user(4 * sizeof(double), &r_val); for(iter = 0; iter < m_remainder; iter++) f_temp[iter] = (b11 + cs_b * (n_remainder-1))[iter]; } diff --git a/so_version b/so_version index 67cc4d1c81..436b8f7fa7 100644 --- a/so_version +++ b/so_version @@ -1,2 +1,2 @@ -3 +4 0.0 diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index fca8c29c03..f771290f0e 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -121,6 +121,8 @@ void* libblis_test_thread_entry( void* tdata_void ) void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) { + err_t r_val; + // Query the total number of threads to simulate. 
size_t nt = ( size_t )params->n_app_threads; @@ -130,12 +132,12 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) #ifdef BLIS_ENABLE_MEM_TRACING printf( "libblis_test_thread_decorator(): " ); #endif - bli_pthread_t* pthread = bli_malloc_user( sizeof( bli_pthread_t ) * nt ); + bli_pthread_t* pthread = bli_malloc_user( sizeof( bli_pthread_t ) * nt, &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "libblis_test_thread_decorator(): " ); #endif - thread_data_t* tdata = bli_malloc_user( sizeof( thread_data_t ) * nt ); + thread_data_t* tdata = bli_malloc_user( sizeof( thread_data_t ) * nt, &r_val ); // Allocate a mutex for the threads to share. //bli_pthread_mutex_t* mutex = bli_malloc_user( sizeof( bli_pthread_mutex_t ) ); @@ -145,7 +147,7 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) #ifdef BLIS_ENABLE_MEM_TRACING printf( "libblis_test_thread_decorator(): " ); #endif - bli_pthread_barrier_t* barrier = bli_malloc_user( sizeof( bli_pthread_barrier_t ) ); + bli_pthread_barrier_t* barrier = bli_malloc_user( sizeof( bli_pthread_barrier_t ), &r_val ); // Initialize the mutex. //bli_pthread_mutex_init( mutex, NULL ); From 6548cebaf55a1f9bdb8417cc89dd0444d8f9c2e4 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 14 Apr 2021 13:00:42 -0500 Subject: [PATCH 005/226] Allow clang for ThunderX2 config Needed for compiling on e.g. Mac M1. AFAIK clang supports the same -mcpu flag for ThunderX2 as gcc. --- config/thunderx2/make_defs.mk | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config/thunderx2/make_defs.mk b/config/thunderx2/make_defs.mk index 1fd1721c52..b43fea87c5 100644 --- a/config/thunderx2/make_defs.mk +++ b/config/thunderx2/make_defs.mk @@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=thunderx2t99 else -$(error gcc is required for this configuration.) 
+ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -mcpu=thunderx2t99 +else +$(error gcc or clang is required for this configuration.) +endif endif # Flags specific to reference kernels. From 4534daffd13ed7a8983c681d3f5e9de17c9f0b96 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 27 Apr 2021 18:16:44 -0500 Subject: [PATCH 006/226] Minor API breakage in bli_pack API. Details: - Changed bli_pack_get_pack_a() and bli_pack_get_pack_b() so that instead of returning a bool, they set a bool that is passed in by address. This does break the public exported API, but I expect very few users actually use this function. (This change is being made in preparation for a much more extensive commit relating to error checking.) --- frame/base/bli_pack.c | 8 ++++---- frame/base/bli_pack.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/frame/base/bli_pack.c b/frame/base/bli_pack.c index 9a5b45b39c..c5ce9cc6c9 100644 --- a/frame/base/bli_pack.c +++ b/frame/base/bli_pack.c @@ -57,22 +57,22 @@ void bli_pack_finalize( void ) // ----------------------------------------------------------------------------- -dim_t bli_pack_get_pack_a( void ) +void bli_pack_get_pack_a( bool* pack_a ) { // We must ensure that global_rntm has been initialized. bli_init_once(); - return bli_rntm_pack_a( &global_rntm ); + *pack_a = bli_rntm_pack_a( &global_rntm ); } // ----------------------------------------------------------------------------- -dim_t bli_pack_get_pack_b( void ) +void bli_pack_get_pack_b( bool* pack_b ) { // We must ensure that global_rntm has been initialized. 
bli_init_once(); - return bli_rntm_pack_b( &global_rntm ); + *pack_b = bli_rntm_pack_b( &global_rntm ); } // ---------------------------------------------------------------------------- diff --git a/frame/base/bli_pack.h b/frame/base/bli_pack.h index b9a41f5b5f..c12740148c 100644 --- a/frame/base/bli_pack.h +++ b/frame/base/bli_pack.h @@ -38,10 +38,10 @@ void bli_pack_init( void ); void bli_pack_finalize( void ); -BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void ); -BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void ); -BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); -BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); +BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); +BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); +BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); +BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); From 6a89c7d8f9ac3f51b5b4d8ccb2630d908d951e6f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 1 May 2021 18:54:48 -0500 Subject: [PATCH 007/226] Defined setijv, getijv to set/get vector elements. Details: - Defined getijv, setijv operations to get and set elements of a vector, in bli_setgetijv.c and .h. - Renamed bli_setgetij.c and .h to bli_setgetijm.c and .h, respectively. - Added additional bounds checking to getijm and setijm to prevent actions with negative indices. - Added documentation to BLISObjectAPI.md and BLISTypedAPI.md for getijv and setijv. - Added documentation to BLISTypedAPI.md for getijm and setijm, which were inadvertently missing. - Added a new entry to the FAQ titled "Why does BLIS have vector (level-1v) and matrix (level-1m) variations of most level-1 operations?" - Comment updates. 
--- docs/BLISObjectAPI.md | 34 +++- docs/BLISTypedAPI.md | 57 +++++- docs/FAQ.md | 11 ++ .../base/{bli_setgetij.c => bli_setgetijm.c} | 36 +--- .../base/{bli_setgetij.h => bli_setgetijm.h} | 0 frame/base/bli_setgetijv.c | 168 ++++++++++++++++++ frame/base/bli_setgetijv.h | 78 ++++++++ frame/include/blis.h | 3 +- 8 files changed, 353 insertions(+), 34 deletions(-) rename frame/base/{bli_setgetij.c => bli_setgetijm.c} (87%) rename frame/base/{bli_setgetij.h => bli_setgetijm.h} (100%) create mode 100644 frame/base/bli_setgetijv.c create mode 100644 frame/base/bli_setgetijv.h diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md index e84703cdcc..a9ce9a24dc 100644 --- a/docs/BLISObjectAPI.md +++ b/docs/BLISObjectAPI.md @@ -53,7 +53,7 @@ This index provides a quick way to jump directly to the description for each ope * **[Level-3](BLISObjectAPI.md#level-3-operations)**: Operations with matrices that are multiplication-like: * [gemm](BLISObjectAPI.md#gemm), [hemm](BLISObjectAPI.md#hemm), [herk](BLISObjectAPI.md#herk), [her2k](BLISObjectAPI.md#her2k), [symm](BLISObjectAPI.md#symm), [syrk](BLISObjectAPI.md#syrk), [syr2k](BLISObjectAPI.md#syr2k), [trmm](BLISObjectAPI.md#trmm), [trmm3](BLISObjectAPI.md#trmm3), [trsm](BLISObjectAPI.md#trsm) * **[Utility](BLISObjectAPI.md#Utility-operations)**: Miscellaneous operations on matrices and vectors: - * [asumv](BLISObjectAPI.md#asumv), [norm1v](BLISObjectAPI.md#norm1v), [normfv](BLISObjectAPI.md#normfv), [normiv](BLISObjectAPI.md#normiv), [norm1m](BLISObjectAPI.md#norm1m), [normfm](BLISObjectAPI.md#normfm), [normim](BLISObjectAPI.md#normim), [mkherm](BLISObjectAPI.md#mkherm), [mksymm](BLISObjectAPI.md#mksymm), [mktrim](BLISObjectAPI.md#mktrim), [fprintv](BLISObjectAPI.md#fprintv), [fprintm](BLISObjectAPI.md#fprintm),[printv](BLISObjectAPI.md#printv), [printm](BLISObjectAPI.md#printm), [randv](BLISObjectAPI.md#randv), [randm](BLISObjectAPI.md#randm), [sumsqv](BLISObjectAPI.md#sumsqv), [getijm](BLISObjectAPI.md#getijm), 
[setijm](BLISObjectAPI.md#setijm) + * [asumv](BLISObjectAPI.md#asumv), [norm1v](BLISObjectAPI.md#norm1v), [normfv](BLISObjectAPI.md#normfv), [normiv](BLISObjectAPI.md#normiv), [norm1m](BLISObjectAPI.md#norm1m), [normfm](BLISObjectAPI.md#normfm), [normim](BLISObjectAPI.md#normim), [mkherm](BLISObjectAPI.md#mkherm), [mksymm](BLISObjectAPI.md#mksymm), [mktrim](BLISObjectAPI.md#mktrim), [fprintv](BLISObjectAPI.md#fprintv), [fprintm](BLISObjectAPI.md#fprintm),[printv](BLISObjectAPI.md#printv), [printm](BLISObjectAPI.md#printm), [randv](BLISObjectAPI.md#randv), [randm](BLISObjectAPI.md#randm), [sumsqv](BLISObjectAPI.md#sumsqv), [getijv](BLISObjectAPI.md#getijv), [getijm](BLISObjectAPI.md#getijm), [setijv](BLISObjectAPI.md#setijv), [setijm](BLISObjectAPI.md#setijm) @@ -2125,6 +2125,19 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec --- +#### getijv +```c +err_t bli_getijv + ( + dim_t i, + obj_t* x, + double* ar, + double* ai + ) +``` +Copy the real and imaginary values at the `i`th element of vector object `x` to `ar` and `ai`. If elements of `x` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) +If the element offset `i` is beyond the vector dimension of `x` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `x` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. + #### getijm ```c err_t bli_getijm @@ -2136,8 +2149,21 @@ err_t bli_getijm double* ai ) ``` -Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. f elements of `b` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.)
-If either the row offset `i` is beyond the _m_ dimension of `b`, or column offset `j` is beyond the _n_ dimension of `b`, the function does not perform any copy and returns `BLIS_FAILURE`. Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, `BLIS_FAILURE` is returned. +Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. If elements of `b` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) +If either the row offset `i` is beyond the _m_ dimension of `b` or less than zero, or column offset `j` is beyond the _n_ dimension of `b` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. + +#### setijv +```c +err_t bli_setijv + ( + double ar, + double ai, + dim_t i, + obj_t* x + ); +``` +Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. If elements of `x` are stored as real types, then only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) +If the element offset `i` is beyond the vector dimension of `x` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `x` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. #### setijm ```c @@ -2151,7 +2177,7 @@ err_t bli_setijm ); ``` Copy real and imaginary values `ar` and `ai` to the (`i`,`j`) element of object `b`. If elements of `b` are stored as real types, then only `ar` is copied and `ai` is ignored. (If `b` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) 
-If either the row offset `i` is beyond the _m_ dimension of `b`, or column offset `j` is beyond the _n_ dimension of `b`, the function does not perform any copy and returns `BLIS_FAILURE`. Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, `BLIS_FAILURE` is returned. +If either the row offset `i` is beyond the _m_ dimension of `b` or less than zero, or column offset `j` is beyond the _n_ dimension of `b` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md index 7d6e92edac..3dd77864ed 100644 --- a/docs/BLISTypedAPI.md +++ b/docs/BLISTypedAPI.md @@ -48,7 +48,7 @@ This index provides a quick way to jump directly to the description for each ope * **[Level-3](BLISTypedAPI.md#level-3-operations)**: Operations with matrices that are multiplication-like: * [gemm](BLISTypedAPI.md#gemm), [hemm](BLISTypedAPI.md#hemm), [herk](BLISTypedAPI.md#herk), [her2k](BLISTypedAPI.md#her2k), [symm](BLISTypedAPI.md#symm), [syrk](BLISTypedAPI.md#syrk), [syr2k](BLISTypedAPI.md#syr2k), [trmm](BLISTypedAPI.md#trmm), [trmm3](BLISTypedAPI.md#trmm3), [trsm](BLISTypedAPI.md#trsm) * **[Utility](BLISTypedAPI.md#Utility-operations)**: Miscellaneous operations on matrices and vectors: - * [asumv](BLISTypedAPI.md#asumv), [norm1v](BLISTypedAPI.md#norm1v), [normfv](BLISTypedAPI.md#normfv), [normiv](BLISTypedAPI.md#normiv), [norm1m](BLISTypedAPI.md#norm1m), [normfm](BLISTypedAPI.md#normfm), [normim](BLISTypedAPI.md#normim), [mkherm](BLISTypedAPI.md#mkherm), [mksymm](BLISTypedAPI.md#mksymm), [mktrim](BLISTypedAPI.md#mktrim), [fprintv](BLISTypedAPI.md#fprintv), [fprintm](BLISTypedAPI.md#fprintm),[printv](BLISTypedAPI.md#printv), [printm](BLISTypedAPI.md#printm), [randv](BLISTypedAPI.md#randv), [randm](BLISTypedAPI.md#randm), [sumsqv](BLISTypedAPI.md#sumsqv) + * [asumv](BLISTypedAPI.md#asumv), 
[norm1v](BLISTypedAPI.md#norm1v), [normfv](BLISTypedAPI.md#normfv), [normiv](BLISTypedAPI.md#normiv), [norm1m](BLISTypedAPI.md#norm1m), [normfm](BLISTypedAPI.md#normfm), [normim](BLISTypedAPI.md#normim), [mkherm](BLISTypedAPI.md#mkherm), [mksymm](BLISTypedAPI.md#mksymm), [mktrim](BLISTypedAPI.md#mktrim), [fprintv](BLISTypedAPI.md#fprintv), [fprintm](BLISTypedAPI.md#fprintm),[printv](BLISTypedAPI.md#printv), [printm](BLISTypedAPI.md#printm), [randv](BLISTypedAPI.md#randv), [randm](BLISTypedAPI.md#randm), [sumsqv](BLISTypedAPI.md#sumsqv), [getijv](BLISTypedAPI.md#getijv), [getijm](BLISTypedAPI.md#getijm), [setijv](BLISTypedAPI.md#setijv), [setijm](BLISTypedAPI.md#setijm) @@ -1695,6 +1695,61 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec --- +#### getijv +```c +err_t bli_?getijv + ( + dim_t i, + ctype* x, incx, + double* ar, + double* ai + ) +``` +Copy the real and imaginary values at the `i`th element of vector `x` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) +Note that the object-based analogue of [getijv](BLISObjectAPI.md#getijv) does bounds checking of the vector element offset `i` against the vector length while the typed functions specified above do not (since the vector length is not given). + +#### setijv +```c +err_t bli_?setijv + ( + double ar, + double ai, + dim_t i, + ctype* x, incx + ); +``` +Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) 
+Note that the object-based analogue of [setijv](BLISObjectAPI.md#setijv) does bounds checking of the vector element offset `i` against the vector length while the typed functions specified above do not (since the vector length is not given). + +#### getijm +```c +err_t bli_?getijm + ( + dim_t i, + dim_t j, + ctype* b, inc_t rs_b, inc_t cs_b, + double* ar, + double* ai + ) +``` +Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) +Note that the object-based analogue of [getijm](BLISObjectAPI.md#getijm) does bounds checking of the matrix element offsets (`i`,`j`) against the matrix dimensions while the typed functions specified above do not (since the matrix dimensions are not given). + +#### setijm +```c +err_t bli_?setijm + ( + double ar, + double ai, + dim_t i, + dim_t j, + ctype* b, inc_t rs_b, inc_t cs_b + ); +``` +Copy real and imaginary values `ar` and `ai` to the (`i`,`j`) element of object `b`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `b` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) +Note that the object-based analogue of [setijm](BLISObjectAPI.md#setijm) does bounds checking of the matrix element offsets (`i`,`j`) against the matrix dimensions while the typed functions specified above do not (since the matrix dimensions are not given). + + ## Level-3 microkernels diff --git a/docs/FAQ.md b/docs/FAQ.md index 423009ae36..592ce11c1e 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -17,6 +17,7 @@ project, as well as those we think a new user or developer might ask. 
If you do * [What is a macrokernel?](FAQ.md#what-is-a-macrokernel) * [What is a context?](FAQ.md#what-is-a-context) * [I am used to thinking in terms of column-major/row-major storage and leading dimensions. What is a "row stride" / "column stride"?](FAQ.md#im-used-to-thinking-in-terms-of-column-majorrow-major-storage-and-leading-dimensions-what-is-a-row-stride--column-stride) + * [Why does BLIS have vector (level-1v) and matrix (level-1m) variations of most level-1 operations?](FAQ.md#why-does-blis-have-vector-level-1v-and-matrix-level-1m-variations-of-most-level-1-operations) * [What does it mean when a matrix with general stride is column-tilted or row-tilted?](FAQ.md#what-does-it-mean-when-a-matrix-with-general-stride-is-column-tilted-or-row-tilted) * [I am not really interested in all of these newfangled features in BLIS. Can I just use BLIS as a BLAS library?](FAQ.md#im-not-really-interested-in-all-of-these-newfangled-features-in-blis-can-i-just-use-blis-as-a-blas-library) * [What about CBLAS?](FAQ.md#what-about-cblas) @@ -117,6 +118,16 @@ In generalized storage, we have a row stride and a column stride. The row stride BLIS also supports situations where both the row stride and column stride are non-unit. We call this situation "general stride". +### Why does BLIS have vector (level-1v) and matrix (level-1m) variations of most level-1 operations? + +At first glance, it might appear that an element-wise operation such as `copym` or `axpym` would be sufficiently general purpose to cover the cases where the operands are vectors. After all, an *m x 1* matrix can be viewed as a vector of length m and vice versa. But in BLIS, operations on vectors are treated slightly differently than operations on matrices. + +If an application wishes to perform an element-wise operation on two objects, and the application calls a level-1m operation, the dimensions of those objects must be conformal, or "match up" (after any transposition implied by the object properties). 
This includes situations where one of the dimensions is unit. + +However, if an application instead decides to perform an element-wise operation on two objects, and the application calls a level-1v operation, the dimension constraints are slightly relaxed. In this scenario, BLIS only checks that the vector *lengths* are equal. This allows for the vectors to have different orientations (row vs column) while still being considered conformal. So, you could perform a `copyv` operation to copy from an *m x 1* vector to a *1 x m* vector. A `copym` operation on such objects would not be allowed (unless it was executed with the source object containing an implicit transposition). + +Another way to think about level-1v operations is that they will work with any two matrix objects in situations where (a) the corresponding level-1m operation *would have* worked if the input had been transposed, and (b) all operands happen to be vectors (i.e., have one unit dimension). + ### What does it mean when a matrix with general stride is column-tilted or row-tilted? When a matrix is stored with general stride, both the row stride and column stride (let's call them `rs` and `cs`) are non-unit. When `rs` < `cs`, we call the general stride matrix "column-tilted" because it is "closer" to being column-stored (than row-stored). Similarly, when `rs` > `cs`, the matrix is "row-tilted" because it is closer to being row-stored. diff --git a/frame/base/bli_setgetij.c b/frame/base/bli_setgetijm.c similarity index 87% rename from frame/base/bli_setgetij.c rename to frame/base/bli_setgetijm.c index 744e24c27e..78ff58a29c 100644 --- a/frame/base/bli_setgetij.c +++ b/frame/base/bli_setgetijm.c @@ -59,9 +59,9 @@ err_t bli_setijm dim_t cs = bli_obj_col_stride( b ); num_t dt = bli_obj_dt( b ); - // Return error if i or j is beyond bounds of matrix/vector. - if ( m <= i ) return BLIS_FAILURE; - if ( n <= j ) return BLIS_FAILURE; + // Return error if i or j is beyond bounds of the matrix/vector. 
+ if ( i < 0 || m <= i ) return BLIS_FAILURE; + if ( j < 0 || n <= j ) return BLIS_FAILURE; // Don't modify scalar constants. if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; @@ -133,35 +133,15 @@ err_t bli_getijm dim_t cs = bli_obj_col_stride( b ); num_t dt = bli_obj_dt( b ); - // Return error if i or j is beyond bounds of matrix/vector. - if ( m <= i ) return BLIS_FAILURE; - if ( n <= j ) return BLIS_FAILURE; - - void* b_p; - -#if 0 - // Handle scalar constants separately. - if ( dt == BLIS_CONSTANT ) - { - if ( i == 0 && j == 0 ) - { - dt = BLIS_DCOMPLEX; - b_p = bli_obj_buffer_for_const( dt, b ) - } - else return BLIS_FAILURE; - } - else - { - // Query the pointer to the buffer at the adjusted offsets. - b_p = bli_obj_buffer_at_off( b ); - } -#else + // Return error if i or j is beyond bounds of the matrix/vector. + if ( i < 0 || m <= i ) return BLIS_FAILURE; + if ( j < 0 || n <= j ) return BLIS_FAILURE; + // Disallow access into scalar constants. if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; // Query the pointer to the buffer at the adjusted offsets. - b_p = bli_obj_buffer_at_off( b ); -#endif + void* b_p = bli_obj_buffer_at_off( b ); // Index into the function pointer array. getijm_fp f = ftypes_getijm[ dt ]; diff --git a/frame/base/bli_setgetij.h b/frame/base/bli_setgetijm.h similarity index 100% rename from frame/base/bli_setgetij.h rename to frame/base/bli_setgetijm.h diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c new file mode 100644 index 0000000000..610f6f271c --- /dev/null +++ b/frame/base/bli_setgetijv.c @@ -0,0 +1,168 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +typedef void (*setijv_fp) + ( + double ar, + double ai, + dim_t i, + void* restrict x, inc_t incx + ); +static setijv_fp GENARRAY(ftypes_setijv,setijv); + +err_t bli_setijv + ( + double ar, + double ai, + dim_t i, + obj_t* x + ) +{ + dim_t n = bli_obj_vector_dim( x ); + dim_t incx = bli_obj_vector_inc( x ); + num_t dt = bli_obj_dt( x ); + + // Return error if i is beyond bounds of the vector. 
+ if ( i < 0 || n <= i ) return BLIS_FAILURE; + + // Don't modify scalar constants. + if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; + + // Query the pointer to the buffer at the adjusted offsets. + void* x_p = bli_obj_buffer_at_off( x ); + + // Index into the function pointer array. + setijv_fp f = ftypes_setijv[ dt ]; + + // Invoke the type-specific function. + f + ( + ar, + ai, + i, + x_p, incx + ); + + return BLIS_SUCCESS; +} + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + double ar, \ + double ai, \ + dim_t i, \ + void* restrict x, inc_t incx \ + ) \ +{ \ + ctype* restrict x_cast = ( ctype* )x; \ +\ + ctype* restrict x_i = x_cast + (i )*incx; \ +\ + PASTEMAC2(z,ch,sets)( ar, ai, *x_i ); \ +} + +INSERT_GENTFUNC_BASIC0( setijv ) + +// ----------------------------------------------------------------------------- + +typedef void (*getijv_fp) + ( + dim_t i, + void* restrict x, inc_t incx, + double* ar, + double* ai + ); +static getijv_fp GENARRAY(ftypes_getijv,getijv); + +err_t bli_getijv + ( + dim_t i, + obj_t* x, + double* ar, + double* ai + ) +{ + dim_t n = bli_obj_vector_dim( x ); + dim_t incx = bli_obj_vector_inc( x ); + num_t dt = bli_obj_dt( x ); + + // Return error if i is beyond bounds of the vector. + if ( i < 0 || n <= i ) return BLIS_FAILURE; + + // Disallow access into scalar constants. + if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; + + // Query the pointer to the buffer at the adjusted offsets. + void* x_p = bli_obj_buffer_at_off( x ); + + // Index into the function pointer array. + getijv_fp f = ftypes_getijv[ dt ]; + + // Invoke the type-specific function. 
+ f + ( + i, + x_p, incx, + ar, + ai + ); + + return BLIS_SUCCESS; +} + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t i, \ + void* restrict x, inc_t incx, \ + double* ar, \ + double* ai \ + ) \ +{ \ + ctype* restrict x_cast = ( ctype* )x; \ +\ + ctype* restrict x_i = x_cast + (i )*incx; \ +\ + PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \ +} + +INSERT_GENTFUNC_BASIC0( getijv ) + diff --git a/frame/base/bli_setgetijv.h b/frame/base/bli_setgetijv.h new file mode 100644 index 0000000000..703fe41aae --- /dev/null +++ b/frame/base/bli_setgetijv.h @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +BLIS_EXPORT_BLIS err_t bli_setijv + ( + double ar, + double ai, + dim_t i, + obj_t* x + ); + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + double ar, \ + double ai, \ + dim_t i, \ + void* restrict x, inc_t incx \ + ); + +INSERT_GENTPROT_BASIC0( setijv ) + +// ----------------------------------------------------------------------------- + +BLIS_EXPORT_BLIS err_t bli_getijv + ( + dim_t i, + obj_t* x, + double* ar, + double* ai + ); + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + dim_t i, \ + void* restrict b, inc_t incx, \ + double* ar, \ + double* ai \ + ); + +INSERT_GENTPROT_BASIC0( getijv ) + diff --git a/frame/include/blis.h b/frame/include/blis.h index e5fe8714c1..61b7a0f82f 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -136,7 +136,8 @@ extern "C" { #include "bli_arch.h" #include "bli_cpuid.h" #include "bli_string.h" -#include "bli_setgetij.h" +#include "bli_setgetijm.h" +#include "bli_setgetijv.h" #include "bli_setri.h" #include "bli_castm.h" From 5d46dbee4a06ba5a422e19817836976f8574cb4f Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 12 May 2021 18:42:09 -0500 Subject: [PATCH 008/226] Replace bli_dlamch with something less archaic (#498) Details: - Added new implementations of bli_slamch() and bli_dlamch() that use constants from the standard C library in lieu of 
dynamically-computed values (via code inherited from netlib). The previous implementation is still available when the cpp macro BLIS_ENABLE_LEGACY_LAMCH is defined by the subconfiguration at compile-time. Thanks to Devin Matthews for providing this patch, and to Stefano Zampini for reporting the issue (#497) that prompted Devin to propose the patch. --- CREDITS | 1 + frame/base/noopt/bli_dlamch.c | 67 +++++++++++++++++++++++++++++++---- frame/base/noopt/bli_slamch.c | 67 +++++++++++++++++++++++++++++++---- 3 files changed, 123 insertions(+), 12 deletions(-) diff --git a/CREDITS b/CREDITS index c6d5d7151a..caad6f6dd7 100644 --- a/CREDITS +++ b/CREDITS @@ -104,6 +104,7 @@ but many others have contributed code and feedback, including Costas Yamin @cosstas Chenhan Yu @ChenhanYu (The University of Texas at Austin) Roman Yurchak @rth (Symerio) + Stefano Zampini @stefanozampini M. Zhou @cdluminate BLIS's development was partially funded by grants from industry diff --git a/frame/base/noopt/bli_dlamch.c b/frame/base/noopt/bli_dlamch.c index 53a6609653..b8be23b382 100644 --- a/frame/base/noopt/bli_dlamch.c +++ b/frame/base/noopt/bli_dlamch.c @@ -1,12 +1,14 @@ -/* dlamch.f -- translated by f2c (version 19991025). 
- You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ +#include "blis.h" + +#include <float.h> +#include <fenv.h> +#include <ctype.h> #ifdef __cplusplus extern "C" { #endif -#include "blis.h" + +#ifdef BLIS_ENABLE_LEGACY_LAMCH double bli_pow_di( bla_double* a, bla_integer* n ); @@ -1027,6 +1029,59 @@ bla_double bli_dlamc3(bla_double *a, bla_double *b) } /* bli_dlamc5_ */ -#ifdef __cplusplus +#else + +bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len) +{ +/* = 'E' or 'e', DLAMCH := eps */ +/* = 'S' or 's', DLAMCH := sfmin */ +/* = 'B' or 'b', DLAMCH := base */ +/* = 'P' or 'p', DLAMCH := eps*base */ +/* = 'N' or 'n', DLAMCH := t */ +/* = 'R' or 'r', DLAMCH := rnd */ +/* = 'M' or 'm', DLAMCH := emin */ +/* = 'U' or 'u', DLAMCH := rmin */ +/* = 'L' or 'l', DLAMCH := emax */ +/* = 'O' or 'o', DLAMCH := rmax */ + +/* where */ + +/* eps = relative machine precision */ +/* sfmin = safe minimum, such that 1/sfmin does not overflow */ +/* base = base of the machine */ +/* prec = eps*base */ +/* t = number of (base) digits in the mantissa */ +/* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */ +/* emin = minimum exponent before (gradual) underflow */ +/* rmin = underflow threshold - base**(emin-1) */ +/* emax = largest exponent before overflow */ +/* rmax = overflow threshold - (base**emax)*(1-eps) */ + + double safe_min = DBL_MIN; + double small = 1.0f / DBL_MAX; + + if ( small >= safe_min ) + safe_min = small * ( 1.0 + DBL_EPSILON ); + + switch ( toupper( *cmach ) ) + { + case 'E': return DBL_EPSILON; + case 'S': return safe_min; + case 'B': return FLT_RADIX; + case 'P': return FLT_RADIX*DBL_EPSILON; + case 'N': return DBL_MANT_DIG; + case 'R': return FLT_ROUNDS == FE_TONEAREST ? 
1.0 : 0.0; + case 'M': return DBL_MIN_EXP; + case 'U': return DBL_MIN; + case 'L': return DBL_MAX_EXP; + case 'O': return DBL_MAX; } + + return 0.0; +} + +#endif + +#ifdef __cplusplus +} #endif diff --git a/frame/base/noopt/bli_slamch.c b/frame/base/noopt/bli_slamch.c index 3f0b72cd8c..ec7cf85975 100644 --- a/frame/base/noopt/bli_slamch.c +++ b/frame/base/noopt/bli_slamch.c @@ -1,12 +1,14 @@ -/* slamch.f -- translated by f2c (version 19991025). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ +#include "blis.h" + +#include <float.h> +#include <fenv.h> +#include <ctype.h> #ifdef __cplusplus extern "C" { #endif -#include "blis.h" + +#ifdef BLIS_ENABLE_LEGACY_LAMCH double bli_pow_ri( bla_real* a, bla_integer* n ); @@ -1022,6 +1024,59 @@ bla_real bli_slamc3(bla_real *a, bla_real *b) } /* bli_slamc5_ */ -#ifdef __cplusplus +#else + +bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len) +{ +/* = 'E' or 'e', SLAMCH := eps */ +/* = 'S' or 's', SLAMCH := sfmin */ +/* = 'B' or 'b', SLAMCH := base */ +/* = 'P' or 'p', SLAMCH := eps*base */ +/* = 'N' or 'n', SLAMCH := t */ +/* = 'R' or 'r', SLAMCH := rnd */ +/* = 'M' or 'm', SLAMCH := emin */ +/* = 'U' or 'u', SLAMCH := rmin */ +/* = 'L' or 'l', SLAMCH := emax */ +/* = 'O' or 'o', SLAMCH := rmax */ + +/* where */ + +/* eps = relative machine precision */ +/* sfmin = safe minimum, such that 1/sfmin does not overflow */ +/* base = base of the machine */ +/* prec = eps*base */ +/* t = number of (base) digits in the mantissa */ +/* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */ +/* emin = minimum exponent before (gradual) underflow */ +/* rmin = underflow threshold - base**(emin-1) */ +/* emax = largest exponent before overflow */ +/* rmax = overflow threshold - (base**emax)*(1-eps) */ + + float safe_min = FLT_MIN; + float small = 1.0f / FLT_MAX; + + if ( small >= safe_min ) + safe_min = small * ( 1.0f + FLT_EPSILON ); + + switch ( toupper( *cmach ) ) + { + case 'E': return FLT_EPSILON; + case 
'S': return safe_min; + case 'B': return FLT_RADIX; + case 'P': return FLT_RADIX*FLT_EPSILON; + case 'N': return FLT_MANT_DIG; + case 'R': return FLT_ROUNDS == FE_TONEAREST ? 1.0f : 0.0f; + case 'M': return FLT_MIN_EXP; + case 'U': return FLT_MIN; + case 'L': return FLT_MAX_EXP; + case 'O': return FLT_MAX; } + + return 0.0f; +} + +#endif + +#ifdef __cplusplus +} #endif From f0e8634775094584e89f1b03811ee192f2aaf67f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 12 May 2021 18:45:32 -0500 Subject: [PATCH 009/226] Defined eqsc, eqv, eqm to test object equality. Details: - Defined eqsc, eqv, and eqm operations, which set a bool depending on whether the two scalars, two vectors, or two matrix operands are equal (element-wise). eqsc and eqv support implicit conjugation and eqm supports diagonal offset, diag, uplo, and trans parameters (in a manner consistent with other level-1m operations). These operations are currently housed under frame/util, at least for now, because they are not computational in nature. - Redefined bli_obj_equals() in terms of eqsc, eqv, and eqm. - Documented eqsc, eqv, and eqm in BLISObjectAPI.md and BLISTypedAPI.md. Also: - Documented getsc and setsc in both docs. - Reordered entry for setijv in BLISTypedAPI.md, and added separator bars to both docs. - Added missing "Observed object properties" clauses to various level-1v entries in BLISObjectAPI.md. - Defined bli_apply_trans() in bli_param_macro_defs.h. - Defined supporting _check() function, bli_l0_xxbsc_check(), in bli_l0_check.c for eqsc. - Programming style and whitespace updates to bli_l1m_unb_var1.c. - Whitespace updates to bli_l0_oapi.c, bli_l1m_oapi.c - Consolidated redundant macro redefinition for copym function pointer type in bli_l1m_ft.h. - Added macros to bli_oapi_ba.h, _ex.h, and bli_tapi_ba.h, _ex.h that allow oapi and tapi source files to forego defining certain expert functions. 
(Certain operations such as printv and printm do not need to have both basic expert interfaces. This also includes eqsc, eqv, and eqm.) --- docs/BLISObjectAPI.md | 92 +++++- docs/BLISTypedAPI.md | 114 ++++++- frame/0/bli_l0_check.c | 35 +++ frame/0/bli_l0_check.h | 8 +- frame/0/bli_l0_ft.h | 1 - frame/0/bli_l0_oapi.c | 42 +-- frame/0/bli_l0_oapi.h | 6 - frame/1m/bli_l1m_ft.h | 19 -- frame/1m/bli_l1m_oapi.c | 136 ++++---- frame/1m/bli_l1m_unb_var1.c | 374 +++++++++++----------- frame/base/bli_query.c | 13 + frame/include/bli_oapi_ba.h | 6 + frame/include/bli_oapi_ex.h | 6 + frame/include/bli_param_macro_defs.h | 6 + frame/include/bli_tapi_ba.h | 6 + frame/include/bli_tapi_ex.h | 6 + frame/util/bli_util_check.c | 84 +++-- frame/util/bli_util_check.h | 59 +++- frame/util/bli_util_fpa.c | 6 + frame/util/bli_util_fpa.h | 8 +- frame/util/bli_util_ft.h | 59 ++++ frame/util/bli_util_oapi.c | 455 ++++++++++++++++++--------- frame/util/bli_util_oapi.h | 100 ++++-- frame/util/bli_util_tapi.c | 220 +++++++++---- frame/util/bli_util_tapi.h | 116 +++++-- frame/util/bli_util_unb_var1.c | 314 +++++++++++++----- frame/util/bli_util_unb_var1.h | 100 ++++-- 27 files changed, 1657 insertions(+), 734 deletions(-) diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md index a9ce9a24dc..9a06e29a49 100644 --- a/docs/BLISObjectAPI.md +++ b/docs/BLISObjectAPI.md @@ -53,7 +53,7 @@ This index provides a quick way to jump directly to the description for each ope * **[Level-3](BLISObjectAPI.md#level-3-operations)**: Operations with matrices that are multiplication-like: * [gemm](BLISObjectAPI.md#gemm), [hemm](BLISObjectAPI.md#hemm), [herk](BLISObjectAPI.md#herk), [her2k](BLISObjectAPI.md#her2k), [symm](BLISObjectAPI.md#symm), [syrk](BLISObjectAPI.md#syrk), [syr2k](BLISObjectAPI.md#syr2k), [trmm](BLISObjectAPI.md#trmm), [trmm3](BLISObjectAPI.md#trmm3), [trsm](BLISObjectAPI.md#trsm) * **[Utility](BLISObjectAPI.md#Utility-operations)**: Miscellaneous operations on matrices and vectors: - * 
[asumv](BLISObjectAPI.md#asumv), [norm1v](BLISObjectAPI.md#norm1v), [normfv](BLISObjectAPI.md#normfv), [normiv](BLISObjectAPI.md#normiv), [norm1m](BLISObjectAPI.md#norm1m), [normfm](BLISObjectAPI.md#normfm), [normim](BLISObjectAPI.md#normim), [mkherm](BLISObjectAPI.md#mkherm), [mksymm](BLISObjectAPI.md#mksymm), [mktrim](BLISObjectAPI.md#mktrim), [fprintv](BLISObjectAPI.md#fprintv), [fprintm](BLISObjectAPI.md#fprintm),[printv](BLISObjectAPI.md#printv), [printm](BLISObjectAPI.md#printm), [randv](BLISObjectAPI.md#randv), [randm](BLISObjectAPI.md#randm), [sumsqv](BLISObjectAPI.md#sumsqv), [getijv](BLISObjectAPI.md#getijv), [getijm](BLISObjectAPI.md#getijm), [setijv](BLISObjectAPI.md#setijv), [setijm](BLISObjectAPI.md#setijm) + * [asumv](BLISObjectAPI.md#asumv), [norm1v](BLISObjectAPI.md#norm1v), [normfv](BLISObjectAPI.md#normfv), [normiv](BLISObjectAPI.md#normiv), [norm1m](BLISObjectAPI.md#norm1m), [normfm](BLISObjectAPI.md#normfm), [normim](BLISObjectAPI.md#normim), [mkherm](BLISObjectAPI.md#mkherm), [mksymm](BLISObjectAPI.md#mksymm), [mktrim](BLISObjectAPI.md#mktrim), [fprintv](BLISObjectAPI.md#fprintv), [fprintm](BLISObjectAPI.md#fprintm),[printv](BLISObjectAPI.md#printv), [printm](BLISObjectAPI.md#printm), [randv](BLISObjectAPI.md#randv), [randm](BLISObjectAPI.md#randm), [sumsqv](BLISObjectAPI.md#sumsqv), [getsc](BLISObjectAPI.md#getsc), [getijv](BLISObjectAPI.md#getijv), [getijm](BLISObjectAPI.md#getijm), [setsc](BLISObjectAPI.md#setsc), [setijv](BLISObjectAPI.md#setijv), [setijm](BLISObjectAPI.md#setijm), [eqsc](BLISObjectAPI.md#eqsc), [eqv](BLISObjectAPI.md#eqv), [eqm](BLISObjectAPI.md#eqm) @@ -790,6 +790,8 @@ Perform ``` where `x` and `y` are vectors of length _n_. +Observed object properties: `conj?(x)`. + --- #### dotv @@ -807,6 +809,8 @@ Perform ``` where `x` and `y` are vectors of length _n_, and `rho` is a scalar. +Observed object properties: `conj?(x)`, `conj?(y)`. 
+ --- #### dotxv @@ -826,6 +830,8 @@ Perform ``` where `x` and `y` are vectors of length _n_, and `alpha`, `beta`, and `rho` are scalars. +Observed object properties: `conj?(alpha)`, `conj?(beta)`, `conj?(x)`, `conj?(y)`. + --- #### invertv @@ -2125,6 +2131,19 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec --- +#### getsc +```c +void bli_getsc + ( + obj_t* chi, + double* zeta_r, + double* zeta_i + ) +``` +Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.) + +--- + #### getijv ```c err_t bli_getijv @@ -2138,6 +2157,8 @@ err_t bli_getijv Copy the real and imaginary values at the `i`th element of vector object `x` to `ar` and `ai`. If elements of `x` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) If either the element offset `i` is beyond the vector dimension of `x` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `x` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. +--- + #### getijm ```c err_t bli_getijm @@ -2152,6 +2173,21 @@ err_t bli_getijm Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. If elements of `b` are stored as real types, then only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) If either the row offset `i` is beyond the _m_ dimension of `b` or less than zero, or column offset `j` is beyond the _n_ dimension of `b` or less than zero, the function returns `BLIS_FAILURE` without taking any action. 
Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. +--- + +#### setsc +```c +void bli_setsc + ( + double* zeta_r, + double* zeta_i, + obj_t* chi + ); +``` +Copy real and imaginary values `zeta_r` and `zeta_i` to the scalar object `chi`. If `chi` is stored as a real type, then `zeta_i` is ignored. (If `chi` is stored in single precision, the contents are typecast/demoted during the copy.) + +--- + #### setijv ```c err_t bli_setijv @@ -2165,6 +2201,8 @@ err_t bli_setijv Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. If elements of `x` are stored as real types, then only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) If the element offset `i` is beyond the vector dimension of `x` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `x` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. +--- + #### setijm ```c err_t bli_setijm @@ -2179,6 +2217,58 @@ err_t bli_setijm Copy real and imaginary values `ar` and `ai` to the (`i`,`j`) element of object `b`. If elements of `b` are stored as real types, then only `ar` is copied and `ai` is ignored. (If `b` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) If either the row offset `i` is beyond the _m_ dimension of `b` or less than zero, or column offset `j` is beyond the _n_ dimension of `b` or less than zero, the function returns `BLIS_FAILURE` without taking any action. Similarly, if `b` is a global scalar constant such as `BLIS_ONE`, the function returns `BLIS_FAILURE`. 
+--- + +#### eqsc +```c +void bli_eqsc + ( + obj_t chi, + obj_t psi, + bool* is_eq + ); +``` +Perform an element-wise comparison between scalars `chi` and `psi` and store the boolean result in the `bool` pointed to by `is_eq`. +If exactly one of `conj(chi)` or `conj(psi)` (but not both) indicates a conjugation, then one of the scalars will be implicitly conjugated for purposes of the comparison. + +Observed object properties: `conj?(chi)`, `conj?(psi)`. + +--- + +#### eqv +```c +void bli_eqv + ( + obj_t x, + obj_t y, + bool* is_eq + ); +``` +Perform an element-wise comparison between vectors `x` and `y` and store the boolean result in the `bool` pointed to by `is_eq`. +If exactly one of `conj(x)` or `conj(y)` (but not both) indicates a conjugation, then one of the vectors will be implicitly conjugated for purposes of the comparison. + +Observed object properties: `conj?(x)`, `conj?(y)`. + +--- + +#### eqm +```c +void bli_eqm + ( + obj_t a, + obj_t b, + bool* is_eq + ); +``` +Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`. +Here, `A` is stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix with arbitrary diagonal offset and unit or non-unit diagonal. +If `diag(A)` indicates a unit diagonal, the diagonals of both matrices will be ignored for purposes of the comparison. +If `uplo(A)` indicates lower or upper storage, only that part of both matrices `A` and `B` will be referenced. +If exactly one of `trans(A)` or `trans(B)` (but not both) indicates a transposition, then one of the matrices will be transposed for purposes of the comparison. +Similarly, if exactly one of `trans(A)` or `trans(B)` (but not both) indicates a conjugation, then one of the matrices will be implicitly conjugated for purposes of the comparison. + +Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`, `trans?(A)`, `trans?(B)`. 
+ # Query function reference diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md index 3dd77864ed..0864289341 100644 --- a/docs/BLISTypedAPI.md +++ b/docs/BLISTypedAPI.md @@ -48,7 +48,7 @@ This index provides a quick way to jump directly to the description for each ope * **[Level-3](BLISTypedAPI.md#level-3-operations)**: Operations with matrices that are multiplication-like: * [gemm](BLISTypedAPI.md#gemm), [hemm](BLISTypedAPI.md#hemm), [herk](BLISTypedAPI.md#herk), [her2k](BLISTypedAPI.md#her2k), [symm](BLISTypedAPI.md#symm), [syrk](BLISTypedAPI.md#syrk), [syr2k](BLISTypedAPI.md#syr2k), [trmm](BLISTypedAPI.md#trmm), [trmm3](BLISTypedAPI.md#trmm3), [trsm](BLISTypedAPI.md#trsm) * **[Utility](BLISTypedAPI.md#Utility-operations)**: Miscellaneous operations on matrices and vectors: - * [asumv](BLISTypedAPI.md#asumv), [norm1v](BLISTypedAPI.md#norm1v), [normfv](BLISTypedAPI.md#normfv), [normiv](BLISTypedAPI.md#normiv), [norm1m](BLISTypedAPI.md#norm1m), [normfm](BLISTypedAPI.md#normfm), [normim](BLISTypedAPI.md#normim), [mkherm](BLISTypedAPI.md#mkherm), [mksymm](BLISTypedAPI.md#mksymm), [mktrim](BLISTypedAPI.md#mktrim), [fprintv](BLISTypedAPI.md#fprintv), [fprintm](BLISTypedAPI.md#fprintm),[printv](BLISTypedAPI.md#printv), [printm](BLISTypedAPI.md#printm), [randv](BLISTypedAPI.md#randv), [randm](BLISTypedAPI.md#randm), [sumsqv](BLISTypedAPI.md#sumsqv), [getijv](BLISTypedAPI.md#getijv), [getijm](BLISTypedAPI.md#getijm), [setijv](BLISTypedAPI.md#setijv), [setijm](BLISTypedAPI.md#setijm) + * [asumv](BLISTypedAPI.md#asumv), [norm1v](BLISTypedAPI.md#norm1v), [normfv](BLISTypedAPI.md#normfv), [normiv](BLISTypedAPI.md#normiv), [norm1m](BLISTypedAPI.md#norm1m), [normfm](BLISTypedAPI.md#normfm), [normim](BLISTypedAPI.md#normim), [mkherm](BLISTypedAPI.md#mkherm), [mksymm](BLISTypedAPI.md#mksymm), [mktrim](BLISTypedAPI.md#mktrim), [fprintv](BLISTypedAPI.md#fprintv), [fprintm](BLISTypedAPI.md#fprintm),[printv](BLISTypedAPI.md#printv), [printm](BLISTypedAPI.md#printm), 
[randv](BLISTypedAPI.md#randv), [randm](BLISTypedAPI.md#randm), [sumsqv](BLISTypedAPI.md#sumsqv), [getsc](BLISTypedAPI.md#getsc), [getijv](BLISTypedAPI.md#getijv), [getijm](BLISTypedAPI.md#getijm), [setsc](BLISTypedAPI.md#setsc), [setijv](BLISTypedAPI.md#setijv), [setijm](BLISTypedAPI.md#setijm), [eqsc](BLISTypedAPI.md#eqsc), [eqv](BLISTypedAPI.md#eqv), [eqm](BLISTypedAPI.md#eqm) @@ -1695,6 +1695,19 @@ where, on entry, `scale` and `sumsq` contain `scale_old` and `sumsq_old`, respec --- +#### getsc +```c +void bli_getsc + ( + ctype* chi, + double* zeta_r, + double* zeta_i + ) +``` +Copy the real and imaginary values from the scalar object `chi` to `zeta_r` and `zeta_i`. If `chi` is stored as a real type, then `zeta_i` is set to zero. (If `chi` is stored in single precision, the corresponding elements are typecast/promoted during the copy.) + +--- + #### getijv ```c err_t bli_?getijv @@ -1708,18 +1721,7 @@ err_t bli_?getijv Copy the real and imaginary values at the `i`th element of vector `x` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `x` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) Note that the object-based analogue of [getijv](BLISObjectAPI.md#getijv) does bounds checking of the vector element offset `i` against the vector length while the typed functions specified above do not (since the vector length is not given). -#### setijv -```c -err_t bli_?setijv - ( - double ar, - double ai, - dim_t i, - ctype* x, incx - ); -``` -Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) 
-Note that the object-based analogue of [setijv](BLISObjectAPI.md#setijv) does bounds checking of the vector element offset `i` against the vector length while the typed functions specified above do not (since the vector length is not given). +--- #### getijm ```c @@ -1735,6 +1737,36 @@ err_t bli_?getijm Copy the real and imaginary values at the (`i`,`j`) element of object `b` to `ar` and `ai`. For real domain invocations, only `ar` is overwritten and `ai` is left unchanged. (If `b` contains elements stored in single precision, the corresponding elements are typecast/promoted during the copy.) Note that the object-based analogue of [getijm](BLISObjectAPI.md#getijm) does bounds checking of the matrix element offsets (`i`,`j`) against the matrix dimensions while the typed functions specified above do not (since the matrix dimensions are not given). +--- + +#### setsc +```c +void bli_setsc + ( + double* zeta_r, + double* zeta_i, + ctype* chi + ); +``` +Copy real and imaginary values `zeta_r` and `zeta_i` to the scalar object `chi`. If `chi` is stored as a real type, then `zeta_i` is ignored. (If `chi` is stored in single precision, the contents are typecast/demoted during the copy.) + +--- + +#### setijv +```c +err_t bli_?setijv + ( + double ar, + double ai, + dim_t i, + ctype* x, incx + ); +``` +Copy real and imaginary values `ar` and `ai` to the `i`th element of vector object `x`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `x` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) +Note that the object-based analogue of [setijv](BLISObjectAPI.md#setijv) does bounds checking of the vector element offset `i` against the vector length while the typed functions specified above do not (since the vector length is not given). 
+ +--- + #### setijm ```c err_t bli_?setijm @@ -1749,6 +1781,62 @@ err_t bli_?setijm Copy real and imaginary values `ar` and `ai` to the (`i`,`j`) element of object `b`. For real domain invocations, only `ar` is copied and `ai` is ignored. (If `b` contains elements stored in single precision, the corresponding elements are typecast/demoted during the copy.) Note that the object-based analogue of [setijm](BLISObjectAPI.md#setijm) does bounds checking of the matrix element offsets (`i`,`j`) against the matrix dimensions while the typed functions specified above do not (since the matrix dimensions are not given). +--- + +#### eqsc +```c +void bli_?eqsc + ( + conj_t conjchi, + ctype* chi, + ctype* psi, + bool* is_eq + ); +``` +Perform an element-wise comparison between scalars `chi` and `psi` and store the boolean result in the `bool` pointed to by `is_eq`. +If `conjchi` indicates a conjugation, `chi` will be implicitly conjugated for purposes of the comparison. + +--- + +#### eqv +```c +void bli_?eqv + ( + conj_t conjx, + dim_t n, + ctype* x, inc_t incx, + ctype* y, inc_t incy, + bool* is_eq + ); +``` +Perform an element-wise comparison between length _n_ vectors `x` and `y` and store the boolean result in the `bool` pointed to by `is_eq`. +If `conjx` indicates a conjugation, `x` will be implicitly conjugated for purposes of the comparison. + +--- + +#### eqm +```c +void bli_?eqm + ( + doff_t diagoffa, + diag_t diaga, + uplo_t uploa, + trans_t transa, + dim_t m, + dim_t n, + ctype* a, inc_t rs_a, inc_t cs_a, + ctype* b, inc_t rs_b, inc_t cs_b, + bool* is_eq + ) +``` +Perform an element-wise comparison between matrices `A` and `B` and store the boolean result in the `bool` pointed to by `is_eq`. +Here, `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix with arbitrary diagonal offset and unit or non-unit diagonal. 
+If `diaga` indicates a unit diagonal, the diagonals of both matrices will be ignored for purposes of the comparison. +If `uploa` indicates lower or upper storage, only that part of matrix `A` will be referenced in the comparison. +If `transa` indicates a conjugation and/or transposition, then `A` will be conjugated and/or transposed for purposes of the comparison. + + + ## Level-3 microkernels diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index 65eeda1b7f..966f0c6aaa 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -87,6 +87,7 @@ void PASTEMAC(opname,_check) \ GENFRONT( absqsc ) GENFRONT( normfsc ) +// ----------------------------------------------------------------------------- void bli_getsc_check ( @@ -352,3 +353,37 @@ void bli_l0_xx2sc_check bli_check_error_code( e_val ); } +void bli_l0_xxbsc_check + ( + obj_t* chi, + obj_t* psi, + bool* is_eq + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_noninteger_object( chi ); + bli_check_error_code( e_val ); + + e_val = bli_check_noninteger_object( psi ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_scalar_object( chi ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( psi ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). 
+ + e_val = bli_check_object_buffer( chi ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( psi ); + bli_check_error_code( e_val ); +} + diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h index 262679aeb6..f495866c62 100644 --- a/frame/0/bli_l0_check.h +++ b/frame/0/bli_l0_check.h @@ -129,7 +129,6 @@ void PASTEMAC(opname,_check) \ GENTPROT( zipsc ) - // ----------------------------------------------------------------------------- void bli_l0_xsc_check @@ -148,3 +147,10 @@ void bli_l0_xx2sc_check obj_t* chi, obj_t* norm ); + +void bli_l0_xxbsc_check + ( + obj_t* chi, + obj_t* psi, + bool* is_eq + ); diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h index 47d47276aa..b90e35eb59 100644 --- a/frame/0/bli_l0_ft.h +++ b/frame/0/bli_l0_ft.h @@ -175,4 +175,3 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ INSERT_GENTDEFR( zipsc ) - diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index 2dc37efd1a..ac62530dbc 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -69,8 +69,8 @@ void PASTEMAC0(opname) \ \ f \ ( \ - buf_chi, \ - buf_absq \ + buf_chi, \ + buf_absq \ ); \ } @@ -105,9 +105,9 @@ void PASTEMAC0(opname) \ \ f \ ( \ - conjchi, \ - buf_chi, \ - buf_psi \ + conjchi, \ + buf_chi, \ + buf_psi \ ); \ } @@ -142,8 +142,8 @@ void PASTEMAC0(opname) \ \ f \ ( \ - conjchi, \ - buf_chi \ + conjchi, \ + buf_chi \ ); \ } @@ -175,8 +175,8 @@ void PASTEMAC0(opname) \ \ f \ ( \ - buf_chi, \ - buf_psi \ + buf_chi, \ + buf_psi \ ); \ } @@ -218,9 +218,9 @@ void PASTEMAC0(opname) \ \ f \ ( \ - buf_chi, \ - zeta_r, \ - zeta_i \ + buf_chi, \ + zeta_r, \ + zeta_i \ ); \ } @@ -252,9 +252,9 @@ void PASTEMAC0(opname) \ \ f \ ( \ - zeta_r, \ - zeta_i, \ - buf_chi \ + zeta_r, \ + zeta_i, \ + buf_chi \ ); \ } @@ -295,9 +295,9 @@ void PASTEMAC0(opname) \ \ f \ ( \ - buf_chi, \ - buf_zeta_r, \ - buf_zeta_i \ + buf_chi, \ + buf_zeta_r, \ + buf_zeta_i \ ); \ } @@ -332,9 +332,9 @@ void PASTEMAC0(opname) \ \ f \ ( \ - buf_zeta_i, \ - buf_zeta_r, 
\ - buf_chi \ + buf_zeta_i, \ + buf_zeta_r, \ + buf_chi \ ); \ } diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h index d0b05606f8..702bb40eaa 100644 --- a/frame/0/bli_l0_oapi.h +++ b/frame/0/bli_l0_oapi.h @@ -128,9 +128,3 @@ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ GENPROT( zipsc ) - - - - - - diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index 152915df4c..af6c384e53 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -57,25 +57,6 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( addm ) INSERT_GENTDEF( subm ) - -// copym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - INSERT_GENTDEF( copym ) // axpym diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c index 224a41bc9f..840b058d4a 100644 --- a/frame/1m/bli_l1m_oapi.c +++ b/frame/1m/bli_l1m_oapi.c @@ -78,17 +78,17 @@ void PASTEMAC(opname,EX_SUF) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ - ( \ - diagoffx, \ - diagx, \ - uplox, \ - transx, \ - m, \ - n, \ - buf_x, rs_x, cs_x, \ - buf_y, rs_y, cs_y, \ - cntx, \ - rntm \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ ); \ } @@ -146,18 +146,18 @@ void PASTEMAC(opname,EX_SUF) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ - ( \ - diagoffx, \ - diagx, \ - uplox, \ - transx, \ - m, \ - n, \ - buf_alpha, \ - buf_x, rs_x, cs_x, \ - buf_y, rs_y, cs_y, \ - cntx, \ - rntm \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + buf_alpha, \ + buf_x, rs_x, cs_x, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ ); \ } @@ -223,17 +223,17 @@ void PASTEMAC(opname,EX_SUF) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); 
\ \ f \ - ( \ - BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ - diagoffx, \ - diagx, \ - uplox, \ - m, \ - n, \ - buf_alpha, \ - buf_x, rs_x, cs_x, \ - cntx, \ - rntm \ + ( \ + BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ + diagoffx, \ + diagx, \ + uplox, \ + m, \ + n, \ + buf_alpha, \ + buf_x, rs_x, cs_x, \ + cntx, \ + rntm \ ); \ } @@ -285,17 +285,17 @@ void PASTEMAC(opname,EX_SUF) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ - ( \ - BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ - diagoffx, \ - diagx, \ - uplox, \ - m, \ - n, \ - buf_alpha, \ - buf_x, rs_x, cs_x, \ - cntx, \ - rntm \ + ( \ + BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ + diagoffx, \ + diagx, \ + uplox, \ + m, \ + n, \ + buf_alpha, \ + buf_x, rs_x, cs_x, \ + cntx, \ + rntm \ ); \ } @@ -354,18 +354,18 @@ void PASTEMAC(opname,EX_SUF) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ - ( \ - diagoffx, \ - diagx, \ - uplox, \ - transx, \ - m, \ - n, \ - buf_x, rs_x, cs_x, \ - buf_beta, \ - buf_y, rs_y, cs_y, \ - cntx, \ - rntm \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_beta, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ ); \ } @@ -420,17 +420,17 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - diagoffx, \ - diagx, \ - uplox, \ - transx, \ - m, \ - n, \ - buf_x, rs_x, cs_x, \ - buf_beta, \ - buf_y, rs_y, cs_y, \ - cntx, \ - rntm \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_beta, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ ); \ } diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index cb6098e3f0..f2ce3c8d7e 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -57,15 +57,12 @@ void PASTEMAC(ch,opname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ - ctype* y1; \ uplo_t uplox_eff; \ conj_t conjx; \ dim_t n_iter; \ - 
dim_t n_elem, n_elem_max; \ + dim_t n_elem_max; \ inc_t ldx, incx; \ inc_t ldy, incy; \ - dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Set various loop parameters. */ \ @@ -88,62 +85,65 @@ void PASTEMAC(ch,opname) \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ + ctype* x1 = x + (j )*ldx + (0 )*incx; \ + ctype* y1 = y + (j )*ldy + (0 )*incy; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ + const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ - x1 = x + (ij0+j )*ldx + (0 )*incx; \ - y1 = y + (ij0+j )*ldy + (0 )*incy; \ + ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ + ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ - n_elem = n_elem_max - i; \ + const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ + const dim_t n_elem = n_elem_max - offi; \ \ - x1 = x + (j )*ldx + (ij0+i )*incx; \ - y1 = y + (j )*ldy + (ij0+i )*incy; \ + ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ + ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } \ } \ } \ @@ -174,15 +174,12 @@ void PASTEMAC(ch,opname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ - ctype* y1; \ uplo_t uplox_eff; \ conj_t conjx; \ dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ + dim_t n_elem_max; \ inc_t ldx, incx; \ inc_t ldy, incy; \ - dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Set various loop parameters. */ \ @@ -205,65 +202,68 @@ void PASTEMAC(ch,opname) \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ + ctype* x1 = x + (j )*ldx + (0 )*incx; \ + ctype* y1 = y + (j )*ldy + (0 )*incy; \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - conjx, \ - n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + alpha, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ + const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ - x1 = x + (ij0+j )*ldx + (0 )*incx; \ - y1 = y + (ij0+j )*ldy + (0 )*incy; \ + ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ + ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + alpha, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ - n_elem = n_elem_max - i; \ + const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ + const dim_t n_elem = n_elem_max - offi; \ \ - x1 = x + (j )*ldx + (ij0+i )*incx; \ - y1 = y + (j )*ldy + (ij0+i )*incy; \ + ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ + ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + alpha, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } \ } \ } \ @@ -292,12 +292,10 @@ void PASTEMAC(ch,opname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ uplo_t uplox_eff; \ dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ + dim_t n_elem_max; \ inc_t ldx, incx; \ - dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Set various loop parameters. */ \ @@ -317,59 +315,62 @@ void PASTEMAC(ch,opname) \ /* Handle dense and upper/lower storage cases separately. 
*/ \ if ( bli_is_dense( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - x1 = x + (j )*ldx + (0 )*incx; \ + ctype* x1 = x + (j )*ldx + (0 )*incx; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjalpha, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + conjalpha, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ + const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ - x1 = x + (ij0+j )*ldx + (0 )*incx; \ + ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjalpha, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + conjalpha, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ - n_elem = n_elem_max - i; \ + const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ + const dim_t n_elem = n_elem_max - offi; \ \ - x1 = x + (j )*ldx + (ij0+i )*incx; \ + ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - conjalpha, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + conjalpha, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } \ } \ } \ @@ -399,15 +400,12 @@ void PASTEMAC(ch,opname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ - ctype* y1; \ uplo_t uplox_eff; \ conj_t conjx; \ dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ + dim_t n_elem_max; \ inc_t ldx, incx; \ inc_t ldy, incy; \ - dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Set various loop parameters. */ \ @@ -430,65 +428,68 @@ void PASTEMAC(ch,opname) \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ + ctype* x1 = x + (j )*ldx + (0 )*incx; \ + ctype* y1 = y + (j )*ldy + (0 )*incy; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ + const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ - x1 = x + (ij0+j )*ldx + (0 )*incx; \ - y1 = y + (ij0+j )*ldy + (0 )*incy; \ + ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ + ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ - n_elem = n_elem_max - i; \ + const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ + const dim_t n_elem = n_elem_max - offi; \ \ - x1 = x + (j )*ldx + (ij0+i )*incx; \ - y1 = y + (j )*ldy + (ij0+i )*incy; \ + ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ + ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ } \ } \ } \ @@ -515,15 +516,12 @@ void PASTEMAC2(chx,chy,opname) \ rntm_t* rntm \ ) \ { \ - ctype_x* restrict x1; \ - ctype_y* restrict y1; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - inc_t ldy, incy; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem_max; \ + inc_t ldx, incx; \ + inc_t ldy, incy; \ + dim_t ij0, n_shift; \ \ /* Set various loop parameters. 
*/ \ bli_set_dims_incs_uplo_2m \ @@ -542,35 +540,32 @@ void PASTEMAC2(chx,chy,opname) \ { \ if ( incx == 1 && incy == 1 ) \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ + ctype_x* restrict x1 = x + (j )*ldx + (0 )*incx; \ + ctype_y* restrict y1 = y + (j )*ldy + (0 )*incy; \ \ - ctype_x* restrict chi1 = x1; \ - ctype_y* restrict psi1 = y1; \ -\ - for ( i = 0; i < n_elem; ++i ) \ + for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - PASTEMAC2(chx,chy,adds)( chi1[i], psi1[i] ); \ + PASTEMAC2(chx,chy,adds)( x1[i], y1[i] ); \ } \ } \ } \ else \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ + ctype_x* restrict x1 = x + (j )*ldx + (0 )*incx; \ + ctype_y* restrict y1 = y + (j )*ldy + (0 )*incy; \ \ ctype_x* restrict chi1 = x1; \ ctype_y* restrict psi1 = y1; \ \ - for ( i = 0; i < n_elem; ++i ) \ + for ( dim_t i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \ \ @@ -584,35 +579,32 @@ void PASTEMAC2(chx,chy,opname) \ { \ if ( incx == 1 && incy == 1 ) \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ -\ - ctype_x* restrict chi1 = x1; \ - ctype_y* restrict psi1 = y1; \ + ctype_x* restrict x1 = x + (j )*ldx + (0 )*incx; \ + ctype_y* restrict y1 = y + (j )*ldy + (0 )*incy; \ \ - for ( i = 0; i < n_elem; ++i ) \ + for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - PASTEMAC3(chx,chy,chy,xpbys)( chi1[i], *beta, psi1[i] ); \ + PASTEMAC3(chx,chy,chy,xpbys)( x1[i], *beta, y1[i] ); \ } \ } \ } \ else \ { \ - n_elem = n_elem_max; \ + const dim_t n_elem = n_elem_max; \ \ - 
for ( j = 0; j < n_iter; ++j ) \ + for ( dim_t j = 0; j < n_iter; ++j ) \ { \ - x1 = x + (j )*ldx + (0 )*incx; \ - y1 = y + (j )*ldy + (0 )*incy; \ + ctype_x* restrict x1 = x + (j )*ldx + (0 )*incx; \ + ctype_y* restrict y1 = y + (j )*ldy + (0 )*incy; \ \ ctype_x* restrict chi1 = x1; \ ctype_y* restrict psi1 = y1; \ \ - for ( i = 0; i < n_elem; ++i ) \ + for ( dim_t i = 0; i < n_elem; ++i ) \ { \ PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \ \ diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c index b54ef40019..c62a30cccd 100644 --- a/frame/base/bli_query.c +++ b/frame/base/bli_query.c @@ -36,6 +36,7 @@ bool bli_obj_equals( obj_t* a, obj_t* b ) { +#if 0 bool r_val = FALSE; num_t dt_a; num_t dt_b; @@ -80,6 +81,18 @@ bool bli_obj_equals( obj_t* a, obj_t* b ) } return r_val; +#else + bool r_val; + + if ( bli_obj_is_1x1( a ) && bli_obj_is_1x1( b ) ) + bli_eqsc( a, b, &r_val ); + else if ( bli_obj_is_vector( a ) && bli_obj_is_vector( b ) ) + bli_eqv( a, b, &r_val ); + else + bli_eqm( a, b, &r_val ); + + return r_val; +#endif } bool bli_obj_imag_equals( obj_t* a, obj_t* b ) diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h index 3f0bfa35a8..a10f436303 100644 --- a/frame/include/bli_oapi_ba.h +++ b/frame/include/bli_oapi_ba.h @@ -35,6 +35,12 @@ // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. +// Define a macro so that the source code can determine which interface +// (basic or expert) we are compiling. +#undef BLIS_OAPI_EXPERT +#undef BLIS_OAPI_BASIC +#define BLIS_OAPI_BASIC + // Define the macro to remove the function name suffix (in function // definitions). 
#undef EX_SUF diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 7acaf36230..924963a7d0 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -35,6 +35,12 @@ // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. +// Define a macro so that the source code can determine which interface +// (basic or expert) we are compiling. +#undef BLIS_OAPI_BASIC +#undef BLIS_OAPI_EXPERT +#define BLIS_OAPI_EXPERT + // Define the macro to add a suffix to the object API function names // (in function definitions). #undef EX_SUF diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 2890274914..781a2554f3 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -261,6 +261,12 @@ BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) ( trans ^ BLIS_CONJ_BIT ); } +BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) +{ + return ( trans_t ) + ( trans ^ transapp ); +} + BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h index 26356afe82..c106670d5a 100644 --- a/frame/include/bli_tapi_ba.h +++ b/frame/include/bli_tapi_ba.h @@ -35,6 +35,12 @@ // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. +// Define a macro so that the source code can determine which interface +// (basic or expert) we are compiling. +#undef BLIS_TAPI_EXPERT +#undef BLIS_TAPI_BASIC +#define BLIS_TAPI_BASIC + // Define the macro to remove the function name suffix (in function // definitions). 
#undef EX_SUF diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index 0e1b09226c..04a3ed6451 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -35,6 +35,12 @@ // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. +// Define a macro so that the source code can determine which interface +// (basic or expert) we are compiling. +#undef BLIS_TAPI_BASIC +#undef BLIS_TAPI_EXPERT +#define BLIS_TAPI_EXPERT + // Define the macro to add a suffix to the typed API function names // (in function definitions). #undef EX_SUF diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index ae4ebb4612..3693ea39c1 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -108,18 +108,16 @@ GENFRONT( normim ) \ void PASTEMAC(opname,_check) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + obj_t* x \ ) \ { \ - bli_utilm_fprint_check( file, s1, x, format, s2 ); \ + bli_utilm_rand_check( x ); \ } -GENFRONT( fprintv ) -GENFRONT( fprintm ) +GENFRONT( randv ) +GENFRONT( randnv ) +GENFRONT( randm ) +GENFRONT( randnm ) #undef GENFRONT @@ -127,16 +125,32 @@ GENFRONT( fprintm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + obj_t* x, \ + obj_t* scale, \ + obj_t* sumsq \ ) \ { \ - bli_utilm_rand_check( x ); \ + bli_utilv_sumsqv_check( x, scale, sumsq ); \ } -GENFRONT( randv ) -GENFRONT( randnv ) -GENFRONT( randm ) -GENFRONT( randnm ) +GENFRONT( sumsqv ) + +// ----------------------------------------------------------------------------- + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* chi, \ + obj_t* psi, \ + bool* is_eq \ + ) \ +{ \ + bli_l0_xxbsc_check( chi, psi, is_eq ); \ +} + +GENFRONT( eqsc ) #undef GENFRONT @@ -145,15 +159,49 @@ GENFRONT( randnm ) void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + obj_t* y, \ + bool* is_eq \ ) \ 
{ \ - bli_utilv_sumsqv_check( x, scale, sumsq ); \ + bli_l1v_xy_check( x, y ); \ } -GENFRONT( sumsqv ) +GENFRONT( eqv ) + + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* y, \ + bool* is_eq \ + ) \ +{ \ + bli_l1m_xy_check( x, y ); \ +} + +GENFRONT( eqm ) + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + FILE* file, \ + char* s1, \ + obj_t* x, \ + char* format, \ + char* s2 \ + ) \ +{ \ + bli_utilm_fprint_check( file, s1, x, format, s2 ); \ +} + +GENFRONT( fprintv ) +GENFRONT( fprintm ) // ----------------------------------------------------------------------------- diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index a789211c96..866a2cd895 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -90,22 +90,6 @@ GENPROT( normfm ) GENPROT( normim ) -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ - ); - -GENPROT( fprintv ) -GENPROT( fprintm ) - - #undef GENPROT #define GENPROT( opname ) \ \ @@ -132,6 +116,49 @@ void PASTEMAC(opname,_check) \ GENPROT( sumsqv ) +// ----------------------------------------------------------------------------- + +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* chi, \ + obj_t* psi, \ + bool* is_eq \ + ); + +GENTPROT( eqsc ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* y, \ + bool* is_eq \ + ); + +GENPROT( eqv ) +GENPROT( eqm ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + FILE* file, \ + char* s1, \ + obj_t* x, \ + char* format, \ + char* s2 \ + ); + +GENPROT( fprintv ) +GENPROT( fprintm ) // ----------------------------------------------------------------------------- diff --git a/frame/util/bli_util_fpa.c 
b/frame/util/bli_util_fpa.c index e46163e89a..fba513fae1 100644 --- a/frame/util/bli_util_fpa.c +++ b/frame/util/bli_util_fpa.c @@ -66,6 +66,9 @@ GENFRONT( randm ) GENFRONT( randnm ) GENFRONT( sumsqv ) +// ----------------------------------------------------------------------------- + +// Operations with only basic interfaces. #undef GENFRONT #define GENFRONT( opname ) \ @@ -83,6 +86,9 @@ PASTEMAC(opname,_qfp)( num_t dt ) \ return PASTECH(opname,_fpa)[ dt ]; \ } +GENFRONT( eqsc ) +GENFRONT( eqv ) +GENFRONT( eqm ) GENFRONT( fprintv ) GENFRONT( fprintm ) //GENFRONT( printv ) diff --git a/frame/util/bli_util_fpa.h b/frame/util/bli_util_fpa.h index 3eb2c48682..9ed6a4cf71 100644 --- a/frame/util/bli_util_fpa.h +++ b/frame/util/bli_util_fpa.h @@ -52,16 +52,13 @@ GENPROT( normiv ) GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) -GENPROT( fprintv ) -GENPROT( fprintm ) -//GENPROT( printv ) -//GENPROT( printm ) GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) GENPROT( sumsqv ) +// ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ @@ -69,6 +66,9 @@ GENPROT( sumsqv ) PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); +GENPROT( eqsc ) +GENPROT( eqv ) +GENPROT( eqm ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index c4f4f73d0b..8b2dbf69f5 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -191,3 +191,62 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEFR( sumsqv ) +// ----------------------------------------------------------------------------- + +// Operations with only basic interfaces. 
+ +#ifdef BLIS_OAPI_BASIC + +// eqsc + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + conj_t conjchi, \ + ctype* chi, \ + ctype* psi, \ + bool* is_eq \ + ); + +INSERT_GENTDEF( eqsc ) + +// eqv + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy, \ + bool* is_eq \ + ); + +INSERT_GENTDEF( eqv ) + +// eqm + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ + ); + +INSERT_GENTDEF( eqm ) + +#endif // #ifdef BLIS_OAPI_BASIC + diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index 128b1f92e1..afd221a587 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -72,11 +72,11 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - n, \ - buf_x, incx, \ - buf_asum, \ - cntx, \ - rntm \ + n, \ + buf_x, incx, \ + buf_asum, \ + cntx, \ + rntm \ ); \ } @@ -114,11 +114,11 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - uploa, \ - m, \ - buf_a, rs_a, cs_a, \ - cntx, \ - rntm \ + uploa, \ + m, \ + buf_a, rs_a, cs_a, \ + cntx, \ + rntm \ ); \ } @@ -158,11 +158,11 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - n, \ - buf_x, incx, \ - buf_norm, \ - cntx, \ - rntm \ + n, \ + buf_x, incx, \ + buf_norm, \ + cntx, \ + rntm \ ); \ } @@ -207,15 +207,15 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - diagoffx, \ - diagx, \ - uplox, \ - m, \ - n, \ - buf_x, rs_x, cs_x, \ - buf_norm, \ - cntx, \ - rntm \ + diagoffx, \ + diagx, \ + uplox, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_norm, \ + cntx, \ + rntm \ ); \ } @@ -229,11 +229,7 @@ GENFRONT( normim ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - FILE* file, 
\ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -248,31 +244,24 @@ void PASTEMAC(opname,EX_SUF) \ inc_t incx = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ -\ - /* Handle constants up front. */ \ - if ( dt == BLIS_CONSTANT ) \ - { \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ - } \ + PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ - PASTECH(opname,_vft) f = \ - PASTEMAC(opname,_qfp)( dt ); \ + PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ + PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ - file, \ - s1, \ - n, \ - buf_x, incx, \ - format, \ - s2 \ + n, \ + buf_x, incx, \ + cntx, \ + rntm \ ); \ } -GENFRONT( fprintv ) +GENFRONT( randv ) +GENFRONT( randnv ) #undef GENFRONT @@ -280,11 +269,7 @@ GENFRONT( fprintv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -294,6 +279,8 @@ void PASTEMAC(opname,EX_SUF) \ \ num_t dt = bli_obj_dt( x ); \ \ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + uplo_t uplox = bli_obj_uplo( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ @@ -301,58 +288,37 @@ void PASTEMAC(opname,EX_SUF) \ inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ -\ - /* Handle constants up front. 
*/ \ - if ( dt == BLIS_CONSTANT ) \ - { \ - float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ - double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ - scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ - dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ - gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ -\ - fprintf( file, "%s\n", s1 ); \ - fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) ); \ - fprintf( file, " double: %9.2e\n", bli_dreal( *dp ) ); \ - fprintf( file, " scomplex: %9.2e + %9.2e\n", bli_creal( *cp ), \ - bli_cimag( *cp ) ); \ - fprintf( file, " dcomplex: %9.2e + %9.2e\n", bli_zreal( *zp ), \ - bli_zimag( *zp ) ); \ - fprintf( file, " int: %ld\n", ( long )(*ip) ); \ - fprintf( file, "\n" ); \ - return; \ - } \ + PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ - PASTECH(opname,_vft) f = \ - PASTEMAC(opname,_qfp)( dt ); \ + PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ + PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ - file, \ - s1, \ - m, \ - n, \ - buf_x, rs_x, cs_x, \ - format, \ - s2 \ + diagoffx, \ + uplox, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + cntx, \ + rntm \ ); \ } -GENFRONT( fprintm ) +GENFRONT( randm ) +GENFRONT( randnm ) #undef GENFRONT -#define GENFRONT( opname, varname ) \ +#define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ - char* s1, \ obj_t* x, \ - char* format, \ - char* s2 \ + obj_t* scale, \ + obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -360,155 +326,348 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - /* Suppress compiler warning about unused variables. */ \ - ( void )cntx; \ + num_t dt = bli_obj_dt( x ); \ \ - /* Invoke the typed function. 
*/ \ - PASTEMAC0(varname) \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ + void* buf_scale = bli_obj_buffer_at_off( scale ); \ + void* buf_sumsq = bli_obj_buffer_at_off( sumsq ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( x, scale, sumsq ); \ +\ + /* Query a type-specific function pointer, except one that uses + void* for function arguments instead of typed pointers. */ \ + PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ + PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ +\ + f \ ( \ - stdout, \ - s1, \ - x, \ - format, \ - s2 \ + n, \ + buf_x, incx, \ + buf_scale, \ + buf_sumsq, \ + cntx, \ + rntm \ ); \ } -GENFRONT( printv, fprintv ) -GENFRONT( printm, fprintm ) +GENFRONT( sumsqv ) + +// ----------------------------------------------------------------------------- +// Operations with only basic interfaces. + +#ifdef BLIS_OAPI_BASIC #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,EX_SUF) \ +void PASTEMAC0(opname) \ ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ + obj_t* chi, \ + obj_t* psi, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ \ - BLIS_OAPI_EX_DECLS \ + num_t dt_chi = bli_obj_dt( chi ); \ + num_t dt_psi = bli_obj_dt( psi ); \ + num_t dt; \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( chi, psi, is_eq ); \ +\ + /* Decide which datatype will be used to query the buffer from the + constant object (if there is one). */ \ + if ( bli_is_constant( dt_psi ) ) dt = dt_chi; \ + else dt = dt_psi; \ +\ + /* If chi and psi are both constants, then we compare only the dcomplex + fields. */ \ + if ( bli_is_constant( dt ) ) dt = BLIS_DCOMPLEX; \ +\ + void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ + void* buf_psi = bli_obj_buffer_for_1x1( dt, psi ); \ +\ + /* Integer objects are handled separately. 
*/ \ + if ( bli_is_int( dt ) ) \ + { \ + *is_eq = bli_ieqa( buf_chi, buf_psi ); \ + return; \ + } \ +\ + /* Query the conj status of each object and use the two to come up with a + single "net" conj_t value. */ \ + conj_t conjchi = bli_obj_conj_status( chi ); \ + conj_t conjpsi = bli_obj_conj_status( psi ); \ + conj_t conj = bli_apply_conj( conjchi, conjpsi ); \ +\ + /* Query a type-specific function pointer, except one that uses + void* for function arguments instead of typed pointers. */ \ + PASTECH(opname,_vft) f = \ + PASTEMAC(opname,_qfp)( dt ); \ +\ + f \ + ( \ + conj, \ + buf_chi, \ + buf_psi, \ + is_eq \ + ); \ +} + +GENFRONT( eqsc ) + + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* x, \ + obj_t* y, \ + bool* is_eq \ + ) \ +{ \ + bli_init_once(); \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ + inc_t inc_x = bli_obj_vector_inc( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x ); \ + PASTEMAC(opname,_check)( x, y, is_eq ); \ +\ + /* Query the conj status of each object and use the two to come up with a + single "net" conj_t value. */ \ + conj_t conjx = bli_obj_conj_status( x ); \ + conj_t conjy = bli_obj_conj_status( y ); \ + conj_t conj = bli_apply_conj( conjx, conjy ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ - PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ - PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ + PASTECH(opname,_vft) f = \ + PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ - n, \ - buf_x, incx, \ - cntx, \ - rntm \ + conj, \ + n, \ + buf_x, inc_x, \ + buf_y, inc_y, \ + is_eq \ ); \ } -GENFRONT( randv ) -GENFRONT( randnv ) +GENFRONT( eqv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,EX_SUF) \ +void PASTEMAC0(opname) \ ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ + obj_t* x, \ + obj_t* y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ + diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ - dim_t m = bli_obj_length( x ); \ - dim_t n = bli_obj_width( x ); \ + dim_t m = bli_obj_length( y ); \ + dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t rs_y = bli_obj_row_stride( y ); \ + inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x ); \ + PASTEMAC(opname,_check)( x, y, is_eq ); \ +\ + /* Query the combined trans and conj status of each object and use the two + to come up with a single "net" trans_t value. */ \ + trans_t transx = bli_obj_conjtrans_status( x ); \ + trans_t transy = bli_obj_conjtrans_status( y ); \ + trans_t trans = bli_apply_trans( transy, transx ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ - PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ - PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ + PASTECH(opname,_vft) f = \ + PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ - diagoffx, \ - uplox, \ - m, \ - n, \ - buf_x, rs_x, cs_x, \ - cntx, \ - rntm \ + diagoffx, \ + diagx, \ + uplox, \ + trans, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_y, rs_y, cs_y, \ + is_eq \ ); \ } -GENFRONT( randm ) -GENFRONT( randnm ) +GENFRONT( eqm ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,EX_SUF) \ +void PASTEMAC0(opname) \ ( \ + FILE* file, \ + char* s1, \ obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ - BLIS_OAPI_EX_PARAMS \ + char* format, \ + char* s2 \ ) \ { \ bli_init_once(); \ -\ - BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ - void* buf_scale = bli_obj_buffer_at_off( scale ); \ - void* buf_sumsq = bli_obj_buffer_at_off( sumsq ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, scale, sumsq ); \ + PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ +\ + /* Handle constants up front. */ \ + if ( dt == BLIS_CONSTANT ) \ + { \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ + } \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ - PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ - PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ + PASTECH(opname,_vft) f = \ + PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ - n, \ - buf_x, incx, \ - buf_scale, \ - buf_sumsq, \ - cntx, \ - rntm \ + file, \ + s1, \ + n, \ + buf_x, incx, \ + format, \ + s2 \ ); \ } -GENFRONT( sumsqv ) +GENFRONT( fprintv ) + + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + FILE* file, \ + char* s1, \ + obj_t* x, \ + char* format, \ + char* s2 \ + ) \ +{ \ + bli_init_once(); \ +\ + num_t dt = bli_obj_dt( x ); \ +\ + dim_t m = bli_obj_length( x ); \ + dim_t n = bli_obj_width( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ +\ + /* Handle constants up front. */ \ + if ( dt == BLIS_CONSTANT ) \ + { \ + float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ + double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ + scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ + dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ + gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ +\ + fprintf( file, "%s\n", s1 ); \ + fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) ); \ + fprintf( file, " double: %9.2e\n", bli_dreal( *dp ) ); \ + fprintf( file, " scomplex: %9.2e + %9.2e\n", bli_creal( *cp ), \ + bli_cimag( *cp ) ); \ + fprintf( file, " dcomplex: %9.2e + %9.2e\n", bli_zreal( *zp ), \ + bli_zimag( *zp ) ); \ + fprintf( file, " int: %ld\n", ( long )(*ip) ); \ + fprintf( file, "\n" ); \ + return; \ + } \ +\ + /* Query a type-specific function pointer, except one that uses + void* for function arguments instead of typed pointers. 
*/ \ + PASTECH(opname,_vft) f = \ + PASTEMAC(opname,_qfp)( dt ); \ +\ + f \ + ( \ + file, \ + s1, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + format, \ + s2 \ + ); \ +} + +GENFRONT( fprintm ) + + +#undef GENFRONT +#define GENFRONT( opname, varname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + char* s1, \ + obj_t* x, \ + char* format, \ + char* s2 \ + ) \ +{ \ + bli_init_once(); \ +\ + /* Invoke the typed function. */ \ + PASTEMAC0(varname) \ + ( \ + stdout, \ + s1, \ + x, \ + format, \ + s2 \ + ); \ +} + +GENFRONT( printv, fprintv ) +GENFRONT( printm, fprintm ) +#endif // #ifdef BLIS_OAPI_BASIC #endif diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 1acce16065..92ce6c95f7 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -99,16 +99,12 @@ GENPROT( normim ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); -GENPROT( fprintv ) -GENPROT( fprintm ) +GENPROT( randv ) +GENPROT( randnv ) #undef GENPROT @@ -116,15 +112,12 @@ GENPROT( fprintm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); -GENPROT( printv ) -GENPROT( printm ) +GENPROT( randm ) +GENPROT( randnm ) #undef GENPROT @@ -132,37 +125,92 @@ GENPROT( printm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + obj_t* x, \ + obj_t* scale, \ + obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); -GENPROT( randv ) -GENPROT( randnv ) +GENPROT( sumsqv ) + +// ----------------------------------------------------------------------------- + +// Operations with basic interfaces only. 
+#ifdef BLIS_OAPI_BASIC +/* #undef GENPROT #define GENPROT( opname ) \ \ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ +BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ + obj_t* chi, \ + obj_t* psi, \ + bool* is_eq \ ); -GENPROT( randm ) -GENPROT( randnm ) +GENPROT( eqsc ) #undef GENPROT #define GENPROT( opname ) \ \ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ +BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ - BLIS_OAPI_EX_PARAMS \ + obj_t* y, \ + bool* is_eq \ ); -GENPROT( sumsqv ) +GENPROT( eqv ) +*/ + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ + ( \ + obj_t* x, \ + obj_t* y, \ + bool* is_eq \ + ); + +GENPROT( eqsc ) +GENPROT( eqv ) +GENPROT( eqm ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ + ( \ + FILE* file, \ + char* s1, \ + obj_t* x, \ + char* format, \ + char* s2 \ + ); + +GENPROT( fprintv ) +GENPROT( fprintm ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ + ( \ + char* s1, \ + obj_t* x, \ + char* format, \ + char* s2 \ + ); + +GENPROT( printv ) +GENPROT( printm ) + +#endif // #ifdef BLIS_OAPI_BASIC diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index 489e016bba..ca0b3c279d 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -213,64 +213,6 @@ INSERT_GENTFUNCR_BASIC0( normfm ) INSERT_GENTFUNCR_BASIC0( normim ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ -\ -void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ - ) \ -{ \ - bli_init_once(); \ -\ - PASTEMAC(ch,varname) \ - ( \ - stdout, \ - s1, \ - n, \ - x, incx, \ - format, \ - s2 \ - ); \ -} - -INSERT_GENTFUNC_BASIC_I( printv, fprintv ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ -\ -void 
PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ) \ -{ \ - bli_init_once(); \ -\ - PASTEMAC(ch,varname) \ - ( \ - stdout, \ - s1, \ - m, \ - n, \ - x, rs_x, cs_x, \ - format, \ - s2 \ - ); \ -} - -INSERT_GENTFUNC_BASIC_I( printm, fprintm ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ @@ -430,6 +372,168 @@ void PASTEMAC2(ch,opname,EX_SUF) \ INSERT_GENTFUNCR_BASIC0( sumsqv ) +// ----------------------------------------------------------------------------- + +// Operations with only basic interfaces. + +#ifdef BLIS_TAPI_BASIC + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjchi, \ + ctype* chi, \ + ctype* psi, \ + bool* is_eq \ + ) \ +{ \ + bli_init_once(); \ +\ + ctype chi_conj; \ +\ + PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ +\ + *is_eq = PASTEMAC(ch,eq)( chi_conj, *psi ); \ +} + +INSERT_GENTFUNC_BASIC0( eqsc ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy, \ + bool* is_eq \ + ) \ +{ \ + bli_init_once(); \ +\ + /* If x is zero length, return with a result of TRUE. */ \ + if ( bli_zero_dim1( n ) ) { *is_eq = TRUE; return; } \ +\ + /* Obtain a valid context from the gks if necessary. 
*/ \ + /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ +\ + *is_eq = PASTEMAC2(ch,opname,_unb_var1) \ + ( \ + conjx, \ + n, \ + x, incx, \ + y, incy \ + ); \ +} + +INSERT_GENTFUNC_BASIC0( eqv ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ + ) \ +{ \ + bli_init_once(); \ +\ + /* If x has a zero dimension, return with a result of TRUE. See the + _unb_var() variant for why we return TRUE in this scenario. */ \ + if ( bli_zero_dim2( m, n ) ) { *is_eq = TRUE; return; } \ +\ + /* Obtain a valid context from the gks if necessary. */ \ + /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ +\ + /* Invoke the helper variant. */ \ + *is_eq = PASTEMAC2(ch,opname,_unb_var1) \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + x, rs_x, cs_x, \ + y, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0( eqm ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + char* s1, \ + dim_t n, \ + void* x, inc_t incx, \ + char* format, \ + char* s2 \ + ) \ +{ \ + bli_init_once(); \ +\ + PASTEMAC(ch,varname) \ + ( \ + stdout, \ + s1, \ + n, \ + x, incx, \ + format, \ + s2 \ + ); \ +} + +INSERT_GENTFUNC_BASIC_I( printv, fprintv ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + char* s1, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + char* format, \ + char* s2 \ + ) \ +{ \ + bli_init_once(); \ +\ + PASTEMAC(ch,varname) \ + ( \ + stdout, \ + s1, \ + m, \ + n, \ + x, rs_x, cs_x, \ + format, \ + s2 \ + ); \ +} + +INSERT_GENTFUNC_BASIC_I( printm, fprintm ) + +#endif // #ifdef BLIS_TAPI_BASIC + #endif diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index 
c35702cbc4..43fbbdb063 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -103,37 +103,6 @@ INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( printv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( printm ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -179,4 +148,89 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ INSERT_GENTPROTR_BASIC0( sumsqv ) +// ----------------------------------------------------------------------------- + +// Operations with basic interfaces only. 
+ +#ifdef BLIS_TAPI_BASIC + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + conj_t conjchi, \ + ctype* chi, \ + ctype* psi, \ + bool* is_eq \ + ); + +INSERT_GENTPROT_BASIC0( eqsc ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy, \ + bool* is_eq \ + ); + +INSERT_GENTPROT_BASIC0( eqv ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ + ); + +INSERT_GENTPROT_BASIC0( eqm ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + char* s1, \ + dim_t n, \ + void* x, inc_t incx, \ + char* format, \ + char* s2 \ + ); + +INSERT_GENTPROT_BASIC0_I( printv ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + char* s1, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + char* format, \ + char* s2 \ + ); + +INSERT_GENTPROT_BASIC0_I( printm ) + +#endif // #ifdef BLIS_TAPI_BASIC diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index e4042dd3b1..af550681aa 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -862,85 +862,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTFUNCR_BASIC( normim_unb_var1, norm1m_unb_var1 ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ - ) \ -{ \ - dim_t i; \ - ctype* chi1; \ - char default_spec[32] = PASTEMAC(ch,formatspec)(); \ -\ - if ( format == 
NULL ) format = default_spec; \ -\ - chi1 = x; \ -\ - fprintf( file, "%s\n", s1 ); \ -\ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC(ch,fprints)( file, format, *chi1 ); \ - fprintf( file, "\n" ); \ -\ - chi1 += incx; \ - } \ -\ - fprintf( file, "%s\n", s2 ); \ -} - -INSERT_GENTFUNC_BASIC0_I( fprintv ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ) \ -{ \ - dim_t i, j; \ - ctype* chi1; \ - char default_spec[32] = PASTEMAC(ch,formatspec)(); \ -\ - if ( format == NULL ) format = default_spec; \ -\ - fprintf( file, "%s\n", s1 ); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - for ( j = 0; j < n; ++j ) \ - { \ - chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ -\ - PASTEMAC(ch,fprints)( file, format, *chi1 ); \ - fprintf( file, " " ); \ - } \ -\ - fprintf( file, "\n" ); \ - } \ -\ - fprintf( file, "%s\n", s2 ); \ - fflush( file ); \ -} - -INSERT_GENTFUNC_BASIC0_I( fprintm ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, varname, randmac ) \ \ @@ -1215,3 +1136,238 @@ void PASTEMAC(ch,varname) \ INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 ) +// ----------------------------------------------------------------------------- + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +bool PASTEMAC(ch,opname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ + ) \ +{ \ + for ( dim_t i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ + ctype* psi1 = y + (i )*incy; \ +\ + ctype chi1c; \ +\ + if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *chi1, chi1c ); } \ + else { PASTEMAC(ch,copys)( *chi1, chi1c ); } \ +\ + if ( !PASTEMAC(ch,eq)( chi1c, *psi1 ) ) \ + return FALSE; \ + } \ +\ + return TRUE; \ +} + +INSERT_GENTFUNC_BASIC0( eqv_unb_var1 ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +bool PASTEMAC(ch,opname) \ + ( \ + doff_t diagoffx, 
\ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + uplo_t uplox_eff; \ + conj_t conjx; \ + dim_t n_iter; \ + dim_t n_elem_max; \ + inc_t ldx, incx; \ + inc_t ldy, incy; \ + dim_t ij0, n_shift; \ +\ + /* Set various loop parameters. */ \ + bli_set_dims_incs_uplo_2m \ + ( \ + diagoffx, diagx, transx, \ + uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ + &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ + &ij0, &n_shift \ + ); \ +\ + /* In the odd case where we are comparing against a complete unstored + matrix, we assert equality. Why? We assume the matrices are equal + unless we can find two corresponding elements that are unequal. So + if there are no elements, there is no inequality. Granted, this logic + is strange to think about no matter what, and thankfully it should + never be used under normal usage. */ \ + if ( bli_is_zeros( uplox_eff ) ) return TRUE; \ +\ + /* Extract the conjugation component from the transx parameter. */ \ + conjx = bli_extract_conj( transx ); \ +\ + /* Handle dense and upper/lower storage cases separately. 
*/ \ + if ( bli_is_dense( uplox_eff ) ) \ + { \ + for ( dim_t j = 0; j < n_iter; ++j ) \ + { \ + const dim_t n_elem = n_elem_max; \ +\ + ctype* x1 = x + (j )*ldx + (0 )*incx; \ + ctype* y1 = y + (j )*ldy + (0 )*incy; \ +\ + for ( dim_t i = 0; i < n_elem; ++i ) \ + { \ + ctype* x11 = x1 + (i )*incx; \ + ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ +\ + if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ + else { PASTEMAC(ch,copys)( *x11, x11c ); } \ +\ + if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \ + return FALSE; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_upper( uplox_eff ) ) \ + { \ + for ( dim_t j = 0; j < n_iter; ++j ) \ + { \ + const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ +\ + ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ + ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ +\ + for ( dim_t i = 0; i < n_elem; ++i ) \ + { \ + ctype* x11 = x1 + (i )*incx; \ + ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ +\ + if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ + else { PASTEMAC(ch,copys)( *x11, x11c ); } \ +\ + if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \ + return FALSE; \ + } \ + } \ + } \ + else if ( bli_is_lower( uplox_eff ) ) \ + { \ + for ( dim_t j = 0; j < n_iter; ++j ) \ + { \ + const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ + const dim_t n_elem = n_elem_max - offi; \ +\ + ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ + ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ +\ + for ( dim_t i = 0; i < n_elem; ++i ) \ + { \ + ctype* x11 = x1 + (i )*incx; \ + ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ +\ + if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ + else { PASTEMAC(ch,copys)( *x11, x11c ); } \ +\ + if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \ + return FALSE; \ + } \ + } \ + } \ + } \ +\ + return TRUE; \ +} + +INSERT_GENTFUNC_BASIC0( eqm_unb_var1 ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + FILE* file, \ + char* s1, \ + 
dim_t n, \ + ctype* x, inc_t incx, \ + char* format, \ + char* s2 \ + ) \ +{ \ + dim_t i; \ + ctype* chi1; \ + char default_spec[32] = PASTEMAC(ch,formatspec)(); \ +\ + if ( format == NULL ) format = default_spec; \ +\ + chi1 = x; \ +\ + fprintf( file, "%s\n", s1 ); \ +\ + for ( i = 0; i < n; ++i ) \ + { \ + PASTEMAC(ch,fprints)( file, format, *chi1 ); \ + fprintf( file, "\n" ); \ +\ + chi1 += incx; \ + } \ +\ + fprintf( file, "%s\n", s2 ); \ +} + +INSERT_GENTFUNC_BASIC0_I( fprintv ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + FILE* file, \ + char* s1, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + char* format, \ + char* s2 \ + ) \ +{ \ + dim_t i, j; \ + ctype* chi1; \ + char default_spec[32] = PASTEMAC(ch,formatspec)(); \ +\ + if ( format == NULL ) format = default_spec; \ +\ + fprintf( file, "%s\n", s1 ); \ +\ + for ( i = 0; i < m; ++i ) \ + { \ + for ( j = 0; j < n; ++j ) \ + { \ + chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ +\ + PASTEMAC(ch,fprints)( file, format, *chi1 ); \ + fprintf( file, " " ); \ + } \ +\ + fprintf( file, "\n" ); \ + } \ +\ + fprintf( file, "%s\n", s2 ); \ + fflush( file ); \ +} + +INSERT_GENTFUNC_BASIC0_I( fprintm ) + diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index 3fb517eec9..f878488568 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -107,39 +107,6 @@ INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( fprintv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ 
- char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( fprintm ) - - #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ @@ -188,3 +155,70 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) +// ----------------------------------------------------------------------------- + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +bool PASTEMAC(ch,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ + ); + +INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +bool PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + ); + +INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + FILE* file, \ + char* s1, \ + dim_t n, \ + ctype* x, inc_t incx, \ + char* format, \ + char* s2 \ + ); + +INSERT_GENTPROT_BASIC0_I( fprintv ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + FILE* file, \ + char* s1, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + char* format, \ + char* s2 \ + ); + +INSERT_GENTPROT_BASIC0_I( fprintm ) + + From 5aa63cd927b22a04e581b07d0b68ef391f4f9b1f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 12 May 2021 19:53:35 -0500 Subject: [PATCH 010/226] Fixed typo in cpp guard in bli_util_ft.h. Details: - Changed #ifdef BLIS_OAPI_BASIC to #ifdef BLIS_TAPI_BASIC in bli_util_ft.h. This typo was causing some types to be redefined when they weren't supposed to be. 
--- frame/util/bli_util_ft.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index 8b2dbf69f5..673f4782aa 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -195,7 +195,7 @@ INSERT_GENTDEFR( sumsqv ) // Operations with only basic interfaces. -#ifdef BLIS_OAPI_BASIC +#ifdef BLIS_TAPI_BASIC // eqsc From d4427a5b2f5cab5d2a64c58d87416628867c2b4a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 13 May 2021 13:55:11 -0500 Subject: [PATCH 011/226] Minor preprocessor/header cleanup. Details: - Added frame/include/bli_xapi_undef.h, which explicitly undefines all macros defined in bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and bli_tapi_ex.h. (This is for safety and good cpp coding practice, not because it fixes anything.) - Added #include "bli_xapi_undef.h" to bli_l1v.h, bli_l1d.h, bli_l1f.h, bli_l1m.h, bli_l2.h, bli_l3.h, and bli_util.h. - Comment updates to bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and bli_tapi_ex.h. - Moved frame/3/bli_l3_ft_ex.h to local 'old' directory after realizing that nothing in BLIS used those function pointer types. Also commented out the "#include bli_l3_ft_ex.h" directive in frame/3/bli_l3.h. 
--- frame/1/bli_l1v.h | 3 ++ frame/1d/bli_l1d.h | 3 ++ frame/1f/bli_l1f.h | 3 ++ frame/1m/bli_l1m.h | 3 ++ frame/2/bli_l2.h | 3 ++ frame/3/bli_l3.h | 5 ++- frame/3/{ => old}/bli_l3_ft_ex.h | 0 frame/include/bli_oapi_ba.h | 6 ++-- frame/include/bli_oapi_ex.h | 6 ++-- frame/include/bli_tapi_ba.h | 6 ++-- frame/include/bli_tapi_ex.h | 6 ++-- frame/include/bli_xapi_undef.h | 57 ++++++++++++++++++++++++++++++++ frame/util/bli_util.h | 3 ++ 13 files changed, 91 insertions(+), 13 deletions(-) rename frame/3/{ => old}/bli_l3_ft_ex.h (100%) create mode 100644 frame/include/bli_xapi_undef.h diff --git a/frame/1/bli_l1v.h b/frame/1/bli_l1v.h index c64ed99126..c32d9c3048 100644 --- a/frame/1/bli_l1v.h +++ b/frame/1/bli_l1v.h @@ -54,6 +54,9 @@ #include "bli_l1v_tapi.h" #include "bli_l1v_ft.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1v_fpa.h" diff --git a/frame/1d/bli_l1d.h b/frame/1d/bli_l1d.h index c0eeb133fe..e0f5d3f963 100644 --- a/frame/1d/bli_l1d.h +++ b/frame/1d/bli_l1d.h @@ -50,6 +50,9 @@ #include "bli_l1d_tapi.h" #include "bli_l1d_ft.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1d_fpa.h" diff --git a/frame/1f/bli_l1f.h b/frame/1f/bli_l1f.h index 370b3c9a7c..d44914eab2 100644 --- a/frame/1f/bli_l1f.h +++ b/frame/1f/bli_l1f.h @@ -53,6 +53,9 @@ #include "bli_l1f_tapi.h" #include "bli_l1f_ft.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Generate function pointer arrays for tapi functions (expert only). 
#include "bli_l1f_fpa.h" diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h index 1e782cc682..2a5b205bc2 100644 --- a/frame/1m/bli_l1m.h +++ b/frame/1m/bli_l1m.h @@ -56,6 +56,9 @@ #include "bli_l1m_tapi.h" #include "bli_l1m_ft.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1m_fpa.h" diff --git a/frame/2/bli_l2.h b/frame/2/bli_l2.h index 9415a0329b..c10dfeb192 100644 --- a/frame/2/bli_l2.h +++ b/frame/2/bli_l2.h @@ -53,6 +53,9 @@ #include "bli_l2_tapi.h" #include "bli_l2_ft.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Generate function pointer arrays for tapi functions (expert only). #include "bli_l2_fpa.h" diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index b64da054c9..e08e9b7f53 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -37,7 +37,7 @@ #include "bli_l3_check.h" // Define function types. -#include "bli_l3_ft_ex.h" +//#include "bli_l3_ft_ex.h" #include "bli_l3_ft_ukr.h" #include "bli_l3_oft.h" #include "bli_l3_oft_var.h" @@ -61,6 +61,9 @@ #include "bli_tapi_ba.h" #include "bli_l3_tapi.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Define function types for small/unpacked handlers/kernels. #include "bli_l3_sup_oft.h" #include "bli_l3_sup_ft_ker.h" diff --git a/frame/3/bli_l3_ft_ex.h b/frame/3/old/bli_l3_ft_ex.h similarity index 100% rename from frame/3/bli_l3_ft_ex.h rename to frame/3/old/bli_l3_ft_ex.h diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h index a10f436303..e661798ee8 100644 --- a/frame/include/bli_oapi_ba.h +++ b/frame/include/bli_oapi_ba.h @@ -35,13 +35,13 @@ // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
-// Define a macro so that the source code can determine which interface +// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC -// Define the macro to remove the function name suffix (in function +// Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF @@ -51,7 +51,7 @@ #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS -// Define the macro to declare local expert variables that are initialized +// Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 924963a7d0..f259eeab46 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -35,14 +35,14 @@ // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. -// Define a macro so that the source code can determine which interface +// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT -// Define the macro to add a suffix to the object API function names -// (in function definitions). +// Define the macro to add a suffix to the function names (in function +// definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h index c106670d5a..90b6fdf2be 100644 --- a/frame/include/bli_tapi_ba.h +++ b/frame/include/bli_tapi_ba.h @@ -35,13 +35,13 @@ // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. 
-// Define a macro so that the source code can determine which interface +// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC -// Define the macro to remove the function name suffix (in function +// Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF @@ -51,7 +51,7 @@ #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS -// Define the macro to declare local expert variables that are initialized +// Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index 04a3ed6451..f803574411 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -35,14 +35,14 @@ // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. -// Define a macro so that the source code can determine which interface +// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT -// Define the macro to add a suffix to the typed API function names -// (in function definitions). +// Define the macro to add a suffix to the function names (in function +// definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF diff --git a/frame/include/bli_xapi_undef.h b/frame/include/bli_xapi_undef.h new file mode 100644 index 0000000000..3d13051e51 --- /dev/null +++ b/frame/include/bli_xapi_undef.h @@ -0,0 +1,57 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// This file un-defines macros used to allow the _oapi.c and _tapi.c files to +// produce object and typed APIs that omit or contain expert parameters. + +// Un-define all macros that allow the source code to determine which interface +// (basic or expert) we are compiling. 
+#undef BLIS_OAPI_BASIC +#undef BLIS_OAPI_EXPERT +#undef BLIS_TAPI_BASIC +#undef BLIS_TAPI_EXPERT + +// Un-define the macro to omit or add the function name suffix (in function +// definitions). +#undef EX_SUF + +// Un-define the macro to omit or add expert arguments from function signatures +// and prototypes. +#undef BLIS_OAPI_EX_PARAMS +#undef BLIS_TAPI_EX_PARAMS + +// Un-define the macro to omit or add local expert variables. +#undef BLIS_OAPI_EX_DECLS +#undef BLIS_TAPI_EX_DECLS + diff --git a/frame/util/bli_util.h b/frame/util/bli_util.h index 6c34ebc676..8efefd6e8c 100644 --- a/frame/util/bli_util.h +++ b/frame/util/bli_util.h @@ -50,6 +50,9 @@ #include "bli_util_tapi.h" #include "bli_util_ft.h" +// Clean up temporary macro defs from bli_?api_[ba|ex].h. +#include "bli_xapi_undef.h" + // Generate function pointer arrays for tapi functions (expert only). #include "bli_util_fpa.h" From b683d01b9c4ea5f64c8031bda816beccfbf806a0 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 13 May 2021 15:23:22 -0500 Subject: [PATCH 012/226] Use extra #undef when including ba/ex API headers. Details: - Inserted a "#include bli_xapi_undef.h" after each usage of the basic and expert API macro setup headers: bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and bli_tapi_ex.h. This is functionally equivalent to the previous status quo, in which each header made minimal #undef prior to its own definitions and then a single instance of "#include bli_xapi_undef.h" cleaned up any remaining macro defs after all other headers were used. This commit will guarantee that macro defs from the setup of one header (say, bli_oapi_ex.h) don't "infect" the definitions made in a subsequent header. As with the previous commit, this change does not fix any issue but rather attempts to avoid creating orphaned macro definitions that are only needed within a very limited scope. - Removed minimal #undef from bli_?api_[ba|ex].h. - Removed old commented-out lines from bli_?api_[ba|ex].h. 
--- frame/1/bli_l1v.h | 5 +++-- frame/1d/bli_l1d.h | 5 +++-- frame/1f/bli_l1f.h | 5 +++-- frame/1m/bli_l1m.h | 5 +++-- frame/2/bli_l2.h | 5 +++-- frame/3/bli_l3.h | 5 +++-- frame/include/bli_oapi_ba.h | 5 ----- frame/include/bli_oapi_ex.h | 5 ----- frame/include/bli_tapi_ba.h | 5 ----- frame/include/bli_tapi_ex.h | 5 ----- frame/util/bli_util.h | 5 +++-- 11 files changed, 21 insertions(+), 34 deletions(-) diff --git a/frame/1/bli_l1v.h b/frame/1/bli_l1v.h index c32d9c3048..99ceb3a3fe 100644 --- a/frame/1/bli_l1v.h +++ b/frame/1/bli_l1v.h @@ -41,20 +41,21 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1v_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1v_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1v_tapi.h" #include "bli_l1v_ft.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1v_tapi.h" #include "bli_l1v_ft.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). diff --git a/frame/1d/bli_l1d.h b/frame/1d/bli_l1d.h index e0f5d3f963..aa42eeb44d 100644 --- a/frame/1d/bli_l1d.h +++ b/frame/1d/bli_l1d.h @@ -37,20 +37,21 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1d_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1d_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1d_tapi.h" #include "bli_l1d_ft.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1d_tapi.h" #include "bli_l1d_ft.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). 
diff --git a/frame/1f/bli_l1f.h b/frame/1f/bli_l1f.h index d44914eab2..43676ec4ef 100644 --- a/frame/1f/bli_l1f.h +++ b/frame/1f/bli_l1f.h @@ -40,20 +40,21 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1f_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1f_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1f_tapi.h" #include "bli_l1f_ft.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1f_tapi.h" #include "bli_l1f_ft.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h index 2a5b205bc2..925b9b376f 100644 --- a/frame/1m/bli_l1m.h +++ b/frame/1m/bli_l1m.h @@ -43,20 +43,21 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1m_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1m_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1m_tapi.h" #include "bli_l1m_ft.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1m_tapi.h" #include "bli_l1m_ft.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). diff --git a/frame/2/bli_l2.h b/frame/2/bli_l2.h index c10dfeb192..ef4517c98d 100644 --- a/frame/2/bli_l2.h +++ b/frame/2/bli_l2.h @@ -40,20 +40,21 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l2_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l2_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). 
#include "bli_tapi_ex.h" #include "bli_l2_tapi.h" #include "bli_l2_ft.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l2_tapi.h" #include "bli_l2_ft.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index e08e9b7f53..740733c3ed 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -50,18 +50,19 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l3_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l3_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l3_tapi.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l3_tapi.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Define function types for small/unpacked handlers/kernels. diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h index e661798ee8..dc17507d11 100644 --- a/frame/include/bli_oapi_ba.h +++ b/frame/include/bli_oapi_ba.h @@ -37,7 +37,6 @@ // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. -#undef BLIS_OAPI_EXPERT #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC @@ -58,7 +57,3 @@ #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index f259eeab46..0eb5eb2a1e 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -37,7 +37,6 @@ // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
-#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT @@ -56,7 +55,3 @@ #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h index 90b6fdf2be..0177985d9d 100644 --- a/frame/include/bli_tapi_ba.h +++ b/frame/include/bli_tapi_ba.h @@ -37,7 +37,6 @@ // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. -#undef BLIS_TAPI_EXPERT #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC @@ -58,7 +57,3 @@ #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index f803574411..c999b0ae9e 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -37,7 +37,6 @@ // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. -#undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT @@ -56,7 +55,3 @@ #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - diff --git a/frame/util/bli_util.h b/frame/util/bli_util.h index 8efefd6e8c..d7e623a43a 100644 --- a/frame/util/bli_util.h +++ b/frame/util/bli_util.h @@ -37,20 +37,21 @@ // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_util_oapi.h" +#include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_util_oapi.h" +#include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). 
#include "bli_tapi_ex.h" #include "bli_util_tapi.h" #include "bli_util_ft.h" +#include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_util_tapi.h" #include "bli_util_ft.h" - -// Clean up temporary macro defs from bli_?api_[ba|ex].h. #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). From 61584deddf9b3af6d11a811e6e04328d22390202 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Wed, 19 May 2021 23:52:29 +0900 Subject: [PATCH 013/226] Added 512b SVE-based a64fx subconfig + SVE kernels. Details: - Added 512-bit specific 'a64fx' subconfiguration that uses empirically tuned block size by Stepan Nassyr. This subconfig also sets the sector cache size and enables memory-tagging code in SVE gemm kernels. This subconfig utilizes (16, k) and (10, k) DPACKM kernels. - Added a vector-length agnostic 'armsve' subconfiguration that computes blocksizes according to the analytical model. This part is ported from Stepan Nassyr's repository. - Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE at size (2*VL, 10). These kernels use unindexed FMLA instructions because indexed FMLA takes 2 FMA units in many implementations. PS: There are indexed-FMLA kernels in Stepan Nassyr's repository. - Implemented 512-bit SVE dpackm kernels with in-register transpose support for sizes (16, k) and (10, k). - Extended 256-bit SVE dpackm kernels by Linaro Ltd. to 512-bit for size (12, k). This dpackm kernel is not currently used by any subconfiguration. - Implemented several experimental dgemmsup kernels which would improve performance in a few cases. However, those dgemmsup kernels generally underperform hence they are not currently used in any subconfig. - Note: This commit squashes several commits submitted by RuQing Xu via PR #424. 
--- config/a64fx/bli_a64fx_sector_cache.h | 117 ++++ config/a64fx/bli_cntx_init_a64fx.c | 151 +++++ config/a64fx/bli_family_a64fx.h | 46 ++ config/a64fx/make_defs.mk | 82 +++ config/armsve/bli_armsve_config_utils.c | 92 +++ config/armsve/bli_armsve_config_utils.h | 42 ++ config/armsve/bli_cntx_init_armsve.c | 157 ++++++ config/armsve/bli_family_armsve.h | 56 ++ config/armsve/make_defs.mk | 82 +++ config_registry | 2 + frame/base/bli_arch.c | 8 + frame/base/bli_cpuid.c | 44 +- frame/base/bli_cpuid.h | 5 +- frame/base/bli_gks.c | 10 + frame/include/bli_arch_config.h | 12 + frame/include/bli_type_defs.h | 4 +- .../armsve/1m/armsve512_asm_transpose_d8x2.h | 45 ++ .../armsve/1m/armsve512_asm_transpose_d8x8.h | 97 ++++ .../armsve/1m/bli_dpackm_armsve256_asm_8xk.c | 9 +- .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c | 365 ++++++++++++ .../armsve/1m/bli_dpackm_armsve512_asm_12xk.c | 359 ++++++++++++ .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c | 363 ++++++++++++ kernels/armsve/3/armsve_asm_2vx10.h | 191 +++++++ kernels/armsve/3/armsve_asm_macros.h | 123 ++++ kernels/armsve/3/armsve_asm_macros_double.h | 46 ++ kernels/armsve/3/armsve_asm_macros_half.h | 46 ++ kernels/armsve/3/armsve_asm_macros_single.h | 46 ++ .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 318 +++++++++++ .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 307 ++++++++++ .../3/bli_gemm_armsve_asm_sh2vx10_unindexed.c | 343 ++++++++++++ kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c | 450 +++++++++++++++ ...i_gemmsup_cv_armsve_asm_d2vx10_unindexed.c | 528 ++++++++++++++++++ ...i_gemmsup_rv_armsve_asm_d2vx10_unindexed.c | 412 ++++++++++++++ kernels/armsve/bli_kernels_armsve.h | 8 + 34 files changed, 4957 insertions(+), 9 deletions(-) create mode 100644 config/a64fx/bli_a64fx_sector_cache.h create mode 100644 config/a64fx/bli_cntx_init_a64fx.c create mode 100644 config/a64fx/bli_family_a64fx.h create mode 100644 config/a64fx/make_defs.mk create mode 100644 config/armsve/bli_armsve_config_utils.c create mode 100644 
config/armsve/bli_armsve_config_utils.h create mode 100644 config/armsve/bli_cntx_init_armsve.c create mode 100644 config/armsve/bli_family_armsve.h create mode 100644 config/armsve/make_defs.mk create mode 100644 kernels/armsve/1m/armsve512_asm_transpose_d8x2.h create mode 100644 kernels/armsve/1m/armsve512_asm_transpose_d8x8.h create mode 100644 kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c create mode 100644 kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c create mode 100644 kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c create mode 100644 kernels/armsve/3/armsve_asm_2vx10.h create mode 100644 kernels/armsve/3/armsve_asm_macros.h create mode 100644 kernels/armsve/3/armsve_asm_macros_double.h create mode 100644 kernels/armsve/3/armsve_asm_macros_half.h create mode 100644 kernels/armsve/3/armsve_asm_macros_single.h create mode 100644 kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c create mode 100644 kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c create mode 100644 kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c create mode 100644 kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c create mode 100644 kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c create mode 100644 kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c diff --git a/config/a64fx/bli_a64fx_sector_cache.h b/config/a64fx/bli_a64fx_sector_cache.h new file mode 100644 index 0000000000..a81d04caca --- /dev/null +++ b/config/a64fx/bli_a64fx_sector_cache.h @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
// A64FX: set up cache sizes
//
// Reference: A64FX (TM) specification Fujitsu HPC Extension
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
//
// Layout of the L1 sector cache control register:
//
//   63:15 |    14:12    |  11  |    10:08    |  07  |    06:04    |  03  |    02:00    |
//   RES0  | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
//
// Each 3-bit field sets the maximum number of sectors, from 0 to 7:
//   000 - 0
//   001 - 1
//   010 - 2
//   011 - 3
//   100 - 4
//   101 - 5
//   110 - 6
//   111 - 7
//
// For L1 we want to maximize the number of sectors for B.
//
// Configuration 1: 1 sector  for C        (sector 3)
//                  1 sector  for A        (sector 1)
//                  6 sectors for B        (sector 2)
//                  0 sectors for the rest (sector 0)
//
//   16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
//
// Configuration 2: 1 sector  for C        (sector 3)
//                  1 sector  for A        (sector 1)
//                  5 sectors for B        (sector 2)
//                  1 sector  for the rest (sector 0)
//
//   16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
//
// Accessing the control register:
//
//   MRS <Xt>, S3_3_C11_C8_2
//   MSR S3_3_C11_C8_2, <Xt>
//
// TODO: First tests showed no change in performance; a deeper investigation
// is necessary.

// Write config_bitfield to system register s3_3_c11_c8_2 with an MSR
// instruction. The macro expands to a complete brace-delimited block, so
// existing call sites that omit a trailing semicolon keep compiling.
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
{\
	uint64_t sector_cache_config = (config_bitfield);\
	__asm__ volatile(\
		"msr s3_3_c11_c8_2,%[sector_cache_config]"\
		:\
		: [sector_cache_config] "r" (sector_cache_config)\
		:\
	);\
}

// Same as above, but targets system register s3_3_c15_c8_2 (going by the
// macro name, the L2 counterpart of the register written above).
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
{\
	uint64_t sector_cache_config = (config_bitfield);\
	__asm__ volatile(\
		"msr s3_3_c15_c8_2,%[sector_cache_config]"\
		:\
		: [sector_cache_config] "r" (sector_cache_config)\
		:\
	);\
}


// Assembly text fragment (for pasting into an inline-asm string): loads tag
// into sparereg, shifts it left by 56 bits and ORs it into areg, i.e. it
// places tag in the top byte of areg. sparereg is overwritten.
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
" mov "#sparereg", "#tag" \n\t"\
" lsl "#sparereg", "#sparereg", 56 \n\t"\
" orr "#areg", "#areg", "#sparereg" \n\t"

// Read system register s3_3_c11_c8_2 into the variable output_uint64 with an
// MRS instruction. The argument is stringized to form the asm operand name,
// so it must be a plain identifier. The expansion already ends in ';'.
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
__asm__ volatile(\
	"mrs %["#output_uint64"],s3_3_c11_c8_2"\
	: [output_uint64] "=r" (output_uint64)\
	: \
	:\
	);

// Pack four values into one bitfield: each argument is masked to 3 bits and
// placed at bit offsets 0, 4, 8 and 12 respectively. Arguments are
// parenthesized so that expressions such as `a | b` are masked as a whole.
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
	(uint64_t)(((sec0) & 0x7LU) | (((sec1) & 0x7LU) << 4) | (((sec2) & 0x7LU) << 8) | (((sec3) & 0x7LU) << 12))

// Pack two values into one bitfield: each argument is masked to 5 bits and
// placed at bit offsets 0 and 8 respectively.
#define A64FX_SCC_L2(sec02,sec13)\
	(uint64_t)(((sec02) & 0x1FLU) | (((sec13) & 0x1FLU) << 8))
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_a64fx_sector_cache.h" + +void bli_cntx_init_a64fx( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_a64fx_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. 
+ bli_cntx_set_l3_nat_ukrs + ( + 2, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + cntx + ); + + // Set SVE-512 packing routine. + bli_cntx_set_packm_kers + ( + 3, + BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, + BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk, + BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 5, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); + +#if 0 + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); + + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); + + // Update the context with optimized small/unpacked gemm kernels. 
+ bli_cntx_set_l3_sup_kers + ( + 4, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + cntx + ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. + bli_cntx_set_l3_sup_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); +#endif + + // Set A64FX cache sector sizes for each PE/CMG + // SC Fugaku might disable users' setting cache sizes. +#if !defined(CACHE_SECTOR_SIZE_READONLY) +#pragma omp parallel + { + A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0)) + A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28)) + } +#endif + +} + diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h new file mode 100644 index 0000000000..5e3f29fd4b --- /dev/null +++ b/config/a64fx/bli_family_a64fx.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Family header for the a64fx configuration. The include guard below is
// intentionally left commented out, as in the original.

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H


// -- MEMORY ALLOCATION --------------------------------------------------------

// NOTE(review): presumably the alignment, in bytes, requested for
// SIMD-friendly allocations -- confirm against the framework's use of
// BLIS_SIMD_ALIGN_SIZE.
#define BLIS_SIMD_ALIGN_SIZE           256
// NOTE(review): presumably the number of SIMD registers assumed by the
// framework for this target -- confirm.
#define BLIS_SIMD_NUM_REGISTERS        32


//#endif
# Name of this configuration; it is handed to store-make-defs at the bottom
# of this file.
THIS_CONFIG    := a64fx
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends various general-purpose/configuration-agnostic
# flags to the variables below. Add configuration-specific flags here.
CPPROCFLAGS    := -D_GNU_SOURCE -D_A64FX
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless debugging has been switched off.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# Full optimization with SVE enabled, except for "noopt" debug builds.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O3 -ftree-vectorize -march=armv8-a+sve
else
COPTFLAGS      := -O0
endif

# Flags specific to optimized kernels.
CKOPTFLAGS     := $(COPTFLAGS)
CKVECFLAGS     :=

# Flags specific to reference kernels. gcc and clang additionally get the
# relaxed floating-point flags; any other vendor only inherits CKVECFLAGS.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy all of the variables set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
+ +*/ +#include "blis.h" + +dim_t bli_vl_bits_armsve(void) +{ \ + uint64_t vl = 0; + __asm__ ( + " mov x0, xzr \n\t" + " incb x0 \n\t" + " mov %[vl], x0 \n\t" + : [vl] "=r" (vl) + : + : "x0" + ); + return vl; +} + + +#define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \ +void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \ + dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \ +{ \ + dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \ + dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \ + dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \ + dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \ + dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \ + dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \ + dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \ + dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \ + dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \ +\ + dim_t vl_b = bli_vl_bits_armsve(); \ + dim_t vl = vl_b / S_Data; \ + dim_t m_r = 2 * vl; \ + dim_t n_r = 10; \ +\ + dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \ + / (n_r * S_Data); \ +\ + dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \ + dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \ + m_c -= m_c % m_r; \ +\ + dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \ + dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \ + n_c -= n_c % n_r; \ +\ + *m_r_ = m_r; \ + *n_r_ = n_r; \ + *k_c_ = k_c; \ + *m_c_ = m_c; \ + *n_c_ = n_c; \ +} + +EXPANDMAC_BLKSZ_ARMSVE( s, 4 ) +EXPANDMAC_BLKSZ_ARMSVE( d, 8 ) + diff --git a/config/armsve/bli_armsve_config_utils.h b/config/armsve/bli_armsve_config_utils.h new file mode 100644 index 0000000000..07aa9ba7d2 --- /dev/null +++ b/config/armsve/bli_armsve_config_utils.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + 
libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschungszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ +#include "blis.h" + +dim_t bli_vl_bits_armsve(void); + +void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); +void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); + diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c new file mode 100644 index 0000000000..434979f915 --- /dev/null +++ b/config/armsve/bli_cntx_init_armsve.c @@ -0,0 +1,157 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_armsve_config_utils.h" + +void bli_cntx_init_armsve( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; +#if 0 + blksz_t thresh[ BLIS_NUM_THRESH ]; +#endif + + // Set default kernel blocksizes and functions. + bli_cntx_init_armsve_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Block size. + dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s; + dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d; + bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s); + bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d); + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 2, + // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + cntx + ); + + // Set VL-specific packing routines if applicable. 
+ if (m_r_d==16) + bli_cntx_set_packm_kers + ( + 3, + BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, + BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk, + BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + cntx + ); + else if (m_r_d==8) + bli_cntx_set_packm_kers + ( + 1, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 5, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); + +#if 0 + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); + + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); + + // Update the context with optimized small/unpacked gemm kernels. 
+ bli_cntx_set_l3_sup_kers + ( + 4, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, + cntx + ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. + bli_cntx_set_l3_sup_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); +#endif +} + diff --git a/config/armsve/bli_family_armsve.h b/config/armsve/bli_family_armsve.h new file mode 100644 index 0000000000..b67ae7c606 --- /dev/null +++ b/config/armsve/bli_family_armsve.h @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
// Family header for the armsve configuration. The include guard below is
// intentionally left commented out, as in the original.

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H


// -- MEMORY ALLOCATION --------------------------------------------------------

// NOTE(review): presumably the alignment, in bytes, requested for
// SIMD-friendly allocations -- confirm against the framework's use of
// BLIS_SIMD_ALIGN_SIZE.
#define BLIS_SIMD_ALIGN_SIZE           256
// NOTE(review): presumably the number of SIMD registers assumed by the
// framework for this target -- confirm.
#define BLIS_SIMD_NUM_REGISTERS        32

// SVE-specific configs.
//
// Default cache parameters for the runtime blocksize computation in
// bli_armsve_config_utils.c, where each one is passed to bli_env_get_var()
// alongside the matching environment variable name (BLIS_SVE_N_L1, ...).
// NOTE(review): N/W/C presumably mean number of sets, associativity (ways)
// and cache line size in bytes for cache levels 1-3 -- confirm. Under that
// reading the products are 64 KiB (L1), 8 MiB (L2) and 32 MiB (L3).
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256

//#endif
# Name of this configuration; it is handed to store-make-defs at the bottom
# of this file.
THIS_CONFIG    := armsve
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends various general-purpose/configuration-agnostic
# flags to the variables below. Add configuration-specific flags here.
CPPROCFLAGS    := -D_GNU_SOURCE
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless debugging has been switched off.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# Full optimization with SVE enabled, except for "noopt" debug builds.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O3 -ftree-vectorize -march=armv8-a+sve
else
COPTFLAGS      := -O0
endif

# Flags specific to optimized kernels.
CKOPTFLAGS     := $(COPTFLAGS)
CKVECFLAGS     :=

# Flags specific to reference kernels. gcc and clang additionally get the
# relaxed floating-point flags; any other vendor only inherits CKVECFLAGS.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy all of the variables set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
"AMD": "INTEL" ); printf("family = %x\n", family ); printf( "model = %x\n", model ); - + printf( "features = %x\n", features ); #endif @@ -455,6 +455,14 @@ arch_t bli_cpuid_query_id( void ) { // Check for each ARMv8 configuration that is enabled, check for that // microarchitecture. We check from most recent to most dated. +#ifdef BLIS_CONFIG_ARMSVE + if ( bli_cpuid_is_armsve( model, part, features ) ) + return BLIS_ARCH_ARMSVE; +#endif +#ifdef BLIS_CONFIG_A64FX + if ( bli_cpuid_is_a64fx( model, part, features ) ) + return BLIS_ARCH_A64FX; +#endif #ifdef BLIS_CONFIG_THUNDERX2 if ( bli_cpuid_is_thunderx2( model, part, features ) ) return BLIS_ARCH_THUNDERX2; @@ -537,6 +545,36 @@ bool bli_cpuid_is_cortexa53 return TRUE; } +bool bli_cpuid_is_armsve + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_SVE; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + +bool bli_cpuid_is_a64fx + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_SVE; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + bool bli_cpuid_is_cortexa15 ( uint32_t family, @@ -1032,6 +1070,10 @@ uint32_t bli_cpuid_query strstr( feat_str, "asimd" ) != NULL ) *features |= FEATURE_NEON; + // Parse the feature string to check for SVE features. + if ( strstr( feat_str, "sve" ) != NULL ) + *features |= FEATURE_SVE; + //printf( "bli_cpuid_query(): features var: %u\n", *features ); // Parse the processor string to uncover the model. 
diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index d7b43d8697..d8e597aee2 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -72,6 +72,8 @@ bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); +bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); +bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); @@ -175,7 +177,8 @@ enum }; enum { - FEATURE_NEON = 0x1 + FEATURE_NEON = 0x01, + FEATURE_SVE = 0x02 }; #endif diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index c789ec2067..03c89a1009 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -144,6 +144,16 @@ void bli_gks_init( void ) bli_cntx_init_cortexa53_ref, bli_cntx_init_cortexa53_ind ); #endif +#ifdef BLIS_CONFIG_ARMSVE + bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve, + bli_cntx_init_armsve_ref, + bli_cntx_init_armsve_ind ); +#endif +#ifdef BLIS_CONFIG_A64FX + bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, + bli_cntx_init_a64fx_ref, + bli_cntx_init_a64fx_ind ); +#endif #ifdef BLIS_CONFIG_CORTEXA15 bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15, bli_cntx_init_cortexa15_ref, diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index beb06a4afa..dddb31ad80 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -83,6 +83,12 @@ CNTX_INIT_PROTS( bulldozer ) // -- ARM architectures -- +#ifdef BLIS_CONFIG_ARMSVE +CNTX_INIT_PROTS( armsve ) +#endif +#ifdef BLIS_CONFIG_A64FX +CNTX_INIT_PROTS( 
a64fx ) +#endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif @@ -183,6 +189,12 @@ CNTX_INIT_PROTS( generic ) // -- ARM architectures -- +#ifdef BLIS_FAMILY_ARMSVE +#include "bli_family_armsve.h" +#endif +#ifdef BLIS_FAMILY_A64FX +#include "bli_family_a64fx.h" +#endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 7dcc5a1b7b..bd9fe66e93 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1005,6 +1005,8 @@ typedef enum BLIS_ARCH_BULLDOZER, // ARM + BLIS_ARCH_ARMSVE, + BLIS_ARCH_A64FX, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, @@ -1029,7 +1031,7 @@ typedef enum // NOTE: This value must be updated to reflect the number of enum values // listed above for arch_t! -#define BLIS_NUM_ARCHS 22 +//#define BLIS_NUM_ARCHS 25 // diff --git a/kernels/armsve/1m/armsve512_asm_transpose_d8x2.h b/kernels/armsve/1m/armsve512_asm_transpose_d8x2.h new file mode 100644 index 0000000000..31dd5704ab --- /dev/null +++ b/kernels/armsve/1m/armsve512_asm_transpose_d8x2.h @@ -0,0 +1,45 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \ + "trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \ + "trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \ + "compact " #DST2".d, " #P2C", " #DST0".d \n\t" \ + "compact " #DST3".d, " #P2C", " #DST1".d \n\t" \ + "compact " #DST4".d, " #P4C", " #DST0".d \n\t" \ + "compact " #DST5".d, " #P4C", " #DST1".d \n\t" \ + "compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \ + "compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t" + diff --git a/kernels/armsve/1m/armsve512_asm_transpose_d8x8.h b/kernels/armsve/1m/armsve512_asm_transpose_d8x8.h new file mode 100644 index 0000000000..98426c9476 --- /dev/null +++ b/kernels/armsve/1m/armsve512_asm_transpose_d8x8.h @@ -0,0 +1,97 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \ + "ptrue " #PT".d \n\t" \ + "mov " #XTMP", #2 \n\t" \ + "whilelo " #P2C".d, xzr, " #XTMP" \n\t" \ + "mov " #XTMP", #4 \n\t" \ + "whilelo " #P4".d, xzr, " #XTMP" \n\t" \ + "mov " #XTMP", #6 \n\t" \ + "whilelo " #P6".d, xzr, " #XTMP" \n\t" \ + \ + "eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \ + "orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \ + \ + "not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \ + "not " #P4C".b, " #PT"/z, " #P4".b \n\t" \ + "not " #P6C".b, " #PT"/z, " #P6".b \n\t" \ + +#define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \ + "trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \ + "trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \ + "trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \ + "trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \ + "trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \ + "trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \ + "trn1 " #DST6".d, " #SRC6".d, " #SRC7".d \n\t" \ + "trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \ + \ + "compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \ + "compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \ + "ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \ + "ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \ + "compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \ + "compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \ + "ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \ + "ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \ + \ + "sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \ + "sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \ + "sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \ + "sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \ + "sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \ + "sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \ + "sel " #DST5".d, " 
#PTFTF", " #DST5".d, " #SRC7".d \n\t" \ + "sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \ + \ + "compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \ + "compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \ + "compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \ + "compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \ + "ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \ + "ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \ + "ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \ + "ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \ + \ + "sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \ + "sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \ + "sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \ + "sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \ + "sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \ + "sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \ + "sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \ + "sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t" + diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c index 82def6df7b..a9b3d0af8a 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c @@ -52,15 +52,12 @@ void bli_dpackm_armsve256_asm_8xk dim_t cdim_, dim_t n_, dim_t n_max_, - void* restrict kappa_, - void* restrict a_, inc_t inca_, inc_t lda_, - void* restrict p_, inc_t ldp_, + double* restrict kappa, + double* restrict a, inc_t inca_, inc_t lda_, + double* restrict p, inc_t ldp_, cntx_t* restrict cntx ) { - double* a = ( double* )a_; - double* p = ( double* )p_; - double* kappa = ( double* )kappa_; const int64_t cdim = cdim_; const int64_t mnr = 8; const int64_t n = n_; diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c new file mode 100644 index 0000000000..851363a9e0 --- /dev/null +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c @@ -0,0 
+1,365 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "armsve512_asm_transpose_d8x8.h" +#include "armsve512_asm_transpose_d8x2.h" + +// assumption: +// SVE vector length = 512 bits. 
+ +void bli_dpackm_armsve512_asm_10xk + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t n_, + dim_t n_max_, + double* restrict kappa, + double* restrict a, inc_t inca_, inc_t lda_, + double* restrict p, inc_t ldp_, + cntx_t* restrict cntx + ) +{ + const int64_t cdim = cdim_; + const int64_t mnr = 10; + const int64_t n = n_; + const int64_t n_max = n_max_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + const bool gs = inca != 1 && lda != 1; + const bool unitk = bli_deq1( *kappa ); + +#ifdef _A64FX + if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) ) + { + // A twisted way to infer whether A or B is being packed. + if ( schema == bli_cntx_schema_a_block(cntx) ) + p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p; + if ( schema == bli_cntx_schema_b_panel(cntx) ) + p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p; + } +#endif + + if ( cdim == mnr && !gs && unitk ) + { + uint64_t n_mker = n / 8; + uint64_t n_left = n % 8; + __asm__ volatile ( + "mov x0, %[a] \n\t" + "mov x1, %[p] \n\t" + "mov x2, %[ldp] \n\t" + "mov x3, %[lda] \n\t" + "mov x4, %[inca] \n\t" + "cmp x4, #1 \n\t" + // Skips by sizeof(double). + "mov x8, #8 \n\t" + "madd x2, x2, x8, xzr \n\t" + "madd x3, x3, x8, xzr \n\t" + "madd x4, x4, x8, xzr \n\t" + // Loop constants. + "mov x8, %[n_mker] \n\t" + "mov x9, %[n_left] \n\t" + "ptrue p0.d \n\t" + "b.ne .AROWSTOR \n\t" + // A stored in columns. + " .ACOLSTOR: \n\t" + // Prefetch distance. + "mov x17, #8 \n\t" + "madd x17, x17, x3, xzr \n\t" +#ifdef _A64FX + // Disable hardware prefetch for A. 
+ "mov x16, 0x6 \n\t" + "lsl x16, x16, #60 \n\t" + "orr x0, x0, x16 \n\t" +#endif + " .ACOLSTORMKER: \n\t" + "cmp x8, xzr \n\t" + "b.eq .ACOLSTORMKEREND \n\t" + "add x5, x0, x3 \n\t" + "add x6, x5, x3 \n\t" + "add x7, x6, x3 \n\t" + "ld1d z0.d, p0/z, [x0] \n\t" + "ldr q1, [x0, #64] \n\t" + "ld1d z2.d, p0/z, [x5] \n\t" + "ldr q3, [x5, #64] \n\t" + "ld1d z4.d, p0/z, [x6] \n\t" + "ldr q5, [x6, #64] \n\t" + "ld1d z6.d, p0/z, [x7] \n\t" + "ldr q7, [x7, #64] \n\t" + "add x18, x17, x0 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x5 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x6 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x7 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x0, x7, x3 \n\t" + "add x5, x0, x3 \n\t" + "add x6, x5, x3 \n\t" + "add x7, x6, x3 \n\t" + "ld1d z8.d, p0/z, [x0] \n\t" + "ldr q9, [x0, #64] \n\t" + "ld1d z10.d, p0/z, [x5] \n\t" + "ldr q11, [x5, #64] \n\t" + "ld1d z12.d, p0/z, [x6] \n\t" + "ldr q13, [x6, #64] \n\t" + "ld1d z14.d, p0/z, [x7] \n\t" + "ldr q15, [x7, #64] \n\t" + "add x18, x17, x0 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x5 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x6 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x7 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + // Plain storage + "add x10, x1, x2 \n\t" + "add x11, x10, x2 \n\t" + "add x12, x11, x2 \n\t" + "add x13, x12, x2 \n\t" + "add x14, x13, x2 \n\t" + "add x15, x14, x2 \n\t" + "add x16, x15, x2 \n\t" + "st1d z0.d, p0, [x1] \n\t" + "str q1, [x1, #64] \n\t" + "st1d z2.d, p0, [x10] \n\t" + "str q3, [x10, #64] \n\t" + "st1d z4.d, p0, [x11] \n\t" + "str q5, [x11, #64] \n\t" + "st1d z6.d, p0, [x12] \n\t" + "str q7, [x12, #64] \n\t" + "st1d z8.d, p0, [x13] \n\t" + "str q9, [x13, #64] \n\t" + "st1d z10.d, p0, [x14] \n\t" + "str q11, [x14, #64] \n\t" + "st1d z12.d, p0, [x15] \n\t" + "str q13, [x15, #64] \n\t" + "st1d z14.d, p0, [x16] \n\t" + "str q15, [x16, #64] \n\t" + "add x1, x16, x2 \n\t" + // Realign and store. 
+ // "ext z1.b, z1.b, z1.b, #16 \n\t" + // "ext z1.b, z1.b, z2.b, #48 \n\t" + // "ext z2.b, z2.b, z3.b, #16 \n\t" + // "ext z2.b, z2.b, z4.b, #32 \n\t" + // "ext z4.b, z4.b, z5.b, #16 \n\t" + // "ext z4.b, z4.b, z6.b, #16 \n\t" + // "ext z6.b, z6.b, z7.b, #16 \n\t" + // "ext z9.b, z9.b, z9.b, #16 \n\t" + // "ext z9.b, z9.b, z10.b, #48 \n\t" + // "ext z10.b, z10.b, z11.b, #16 \n\t" + // "ext z10.b, z10.b, z12.b, #32 \n\t" + // "ext z12.b, z12.b, z13.b, #16 \n\t" + // "ext z12.b, z12.b, z14.b, #16 \n\t" + // "ext z14.b, z14.b, z15.b, #16 \n\t" + // "st1d z0.d, p0, [x1] \n\t" + // "st1d z1.d, p0, [x1, #1, mul vl] \n\t" + // "st1d z2.d, p0, [x1, #2, mul vl] \n\t" + // "st1d z4.d, p0, [x1, #3, mul vl] \n\t" + // "st1d z6.d, p0, [x1, #4, mul vl] \n\t" + // "add x1, x1, #320 \n\t" + // "st1d z8.d, p0, [x1] \n\t" + // "st1d z9.d, p0, [x1, #1, mul vl] \n\t" + // "st1d z10.d, p0, [x1, #2, mul vl] \n\t" + // "st1d z12.d, p0, [x1, #3, mul vl] \n\t" + // "st1d z14.d, p0, [x1, #4, mul vl] \n\t" + // "add x1, x1, #320 \n\t" + "add x0, x7, x3 \n\t" + "sub x8, x8, #1 \n\t" + "b .ACOLSTORMKER \n\t" + " .ACOLSTORMKEREND: \n\t" + " .ACOLSTORLEFT: \n\t" + "cmp x9, xzr \n\t" + "b.eq .UNITKDONE \n\t" + "ld1d z0.d, p0/z, [x0] \n\t" + "ldr q1, [x0, #64] \n\t" + "st1d z0.d, p0, [x1] \n\t" + "str q1, [x1, #64] \n\t" + "add x0, x0, x3 \n\t" + "add x1, x1, x2 \n\t" + "sub x9, x9, #1 \n\t" + "b .ACOLSTORLEFT \n\t" + // A stored in rows. + " .AROWSTOR: \n\t" + // Prepare predicates for in-reg transpose. + SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) + " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful. 
+ "cmp x8, xzr \n\t" + "b.eq .AROWSTORMKEREND \n\t" + "add x10, x0, x4 \n\t" + "add x11, x10, x4 \n\t" + "add x12, x11, x4 \n\t" + "add x13, x12, x4 \n\t" + "add x14, x13, x4 \n\t" + "add x15, x14, x4 \n\t" + "add x16, x15, x4 \n\t" + "add x17, x16, x4 \n\t" + "add x18, x17, x4 \n\t" + "ld1d z0.d, p0/z, [x0] \n\t" + "ld1d z1.d, p0/z, [x10] \n\t" + "ld1d z2.d, p0/z, [x11] \n\t" + "ld1d z3.d, p0/z, [x12] \n\t" + "ld1d z4.d, p0/z, [x13] \n\t" + "ld1d z5.d, p0/z, [x14] \n\t" + "ld1d z6.d, p0/z, [x15] \n\t" + "ld1d z7.d, p0/z, [x16] \n\t" + "ld1d z22.d, p0/z, [x17] \n\t" + "ld1d z23.d, p0/z, [x18] \n\t" + // Transpose first 8 rows. + SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6) + // Transpose last 2 rows. + SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3) + // Plain storage. + "add x10, x1, x2 \n\t" + "add x11, x10, x2 \n\t" + "add x12, x11, x2 \n\t" + "add x13, x12, x2 \n\t" + "add x14, x13, x2 \n\t" + "add x15, x14, x2 \n\t" + "add x16, x15, x2 \n\t" + "st1d z8.d, p0, [x1] \n\t" + "str q16, [x1, #64] \n\t" + "st1d z9.d, p0, [x10] \n\t" + "str q17, [x10, #64] \n\t" + "st1d z10.d, p0, [x11] \n\t" + "str q18, [x11, #64] \n\t" + "st1d z11.d, p0, [x12] \n\t" + "str q19, [x12, #64] \n\t" + "st1d z12.d, p0, [x13] \n\t" + "str q20, [x13, #64] \n\t" + "st1d z13.d, p0, [x14] \n\t" + "str q21, [x14, #64] \n\t" + "st1d z14.d, p0, [x15] \n\t" + "str q22, [x15, #64] \n\t" + "st1d z15.d, p0, [x16] \n\t" + "str q23, [x16, #64] \n\t" + "add x1, x16, x2 \n\t" + "add x0, x0, #64 \n\t" + "sub x8, x8, #1 \n\t" + "b .AROWSTORMKER \n\t" + " .AROWSTORMKEREND: \n\t" + "mov x4, %[inca] \n\t" // Restore unshifted inca. + "index z30.d, xzr, x4 \n\t" // Generate index. + "lsl x4, x4, #3 \n\t" // Shift again. + "lsl x5, x4, #3 \n\t" // Virtual column vl. 
+ " .AROWSTORLEFT: \n\t" + "cmp x9, xzr \n\t" + "b.eq .UNITKDONE \n\t" + "add x6, x0, x5 \n\t" + "add x7, x6, x4 \n\t" + "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" + "ldr d1, [x6] \n\t" + "ldr d2, [x7] \n\t" + "trn1 v1.2d, v1.2d, v2.2d \n\t" + "st1d z0.d, p0, [x1] \n\t" + "str q1, [x1, #64] \n\t" + "add x1, x1, x2 \n\t" + "add x0, x0, #8 \n\t" + "sub x9, x9, #1 \n\t" + "b .AROWSTORLEFT \n\t" + " .UNITKDONE: \n\t" + "mov x0, #0 \n\t" + : + : [a] "r" (a), + [p] "r" (p), + [lda] "r" (lda), + [ldp] "r" (ldp), + [inca] "r" (inca), + [n_mker] "r" (n_mker), + [n_left] "r" (n_left) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14","x15", + "x16","x17","x18", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19","z20","z21","z22","z23", + // "z24","z25","z26","z27","z28","z29", + "z30","z31", + "p0", "p1", "p2", "p3", "p4", // "p5", + "p6", "p7", "p8" + ); + } + else // if ( cdim < mnr ) + { + bli_dscal2m_ex + ( + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + ( trans_t )conja, + cdim, + n, + kappa, + a, inca, lda, + p, 1, ldp, + cntx, + NULL + ); + + // if ( cdim < mnr ) + { + const dim_t i = cdim; + const dim_t m_edge = mnr - i; + const dim_t n_edge = n_max; + double* restrict p_edge = p + (i )*1; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } + } + + if ( n < n_max ) + { + const dim_t j = n; + const dim_t m_edge = mnr; + const dim_t n_edge = n_max - j; + double* restrict p_edge = p + (j )*ldp; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } +} diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c new file mode 100644 index 0000000000..9f943fcd66 --- /dev/null +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_12xk.c @@ -0,0 +1,359 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include <stdio.h> + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#else +#error "No Arm SVE intrinsics support in compiler" +#endif // __ARM_FEATURE_SVE + +// assumption: +// SVE vector length = 512 bits. +// TODO: +// 2-rows -> 3 vectors packing and use a predicate only when an odd number of rows is to be packed. +// prefetching is needed. 
+ +void bli_dpackm_armsve512_asm_12xk + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t n_, + dim_t n_max_, + double* restrict kappa, + double* restrict a, inc_t inca_, inc_t lda_, + double* restrict p, inc_t ldp_, + cntx_t* restrict cntx + ) +{ + const int64_t cdim = cdim_; + const int64_t mnr = 12; + const int64_t n = n_; + const int64_t n_max = n_max_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + double* restrict alpha1 = a; + double* restrict alpha1_8 = alpha1 + 8 * inca; + double* restrict alpha1_p4 = alpha1 + 4 * inca; + double* restrict alpha1_m4 = alpha1 - 4 * inca; + double* restrict pi1 = p; + const svbool_t all_active = svptrue_b64(); + const svbool_t first_half_active = svwhilelt_b64(0, 4); + const svbool_t last_half_active = svnot_z(all_active, first_half_active); + svfloat64_t z_a0; + svfloat64_t z_a8; + svfloat64_t z_a8_lh; + svfloat64_t z_a16; + svuint64_t z_index; + + // creating index for gather/scatter + // with each element as: 0, 1*inca, 2*inca, 3*inca + z_index = svindex_u64( 0, inca * sizeof( double ) ); + + if ( cdim == mnr ) + { + if ( bli_deq1( *kappa ) ) + { + if ( inca == 1 ) // continous memory. packA style + { + dim_t k = n; + // 2 pack into 3 case. + if ( ldp == mnr ) + { + for ( ; k > 1; k -= 2 ) + { + // load 12 continuous elments from *a + z_a0 = svld1_f64( all_active, alpha1 ); + z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 ); + + // forward address - 0 to 1 + alpha1 += lda; + alpha1_p4 = alpha1 + 4 * inca; + alpha1_m4 = alpha1 - 4 * inca; + + // load 12 continuous elments from *a, filling last half of z8. 
+ z_a8_lh = svld1_f64( last_half_active, alpha1_m4 ); + z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh ); + z_a16 = svld1_f64( all_active, alpha1_p4 ); + + // stored packed data into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a8 ); + svst1_vnum_f64( all_active, pi1, 2, z_a16 ); + + // forward address - 1 to 0 + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += 2 * ldp; + } + } + // line-by-line packing case. + for ( ; k != 0; --k ) + { + // load 12 continuous elments from *a + z_a0 = svld1_f64( all_active, alpha1 ); + z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( first_half_active, pi1, 1, z_a8 ); + + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store. packB style + { + dim_t k = n; + if ( ldp == mnr ) + { + for ( ; k > 1; k -= 2 ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index ); + + // forward address - 0 to 1 + alpha1 += lda; + alpha1_p4 = alpha1 + 4 * inca; + alpha1_m4 = alpha1 - 4 * inca; + + // gather load from *a, filling last half of z8. 
+ z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index ); + z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh ); + z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index ); + + // stored packed data into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a8 ); + svst1_vnum_f64( all_active, pi1, 2, z_a16 ); + + // forward address - 1 to 0 + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += 2 * ldp; + } + } + for ( ; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index ); + + // scatter store into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( first_half_active, pi1, 1, z_a8 ); + + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += ldp; + } + } + } + else // *kappa != 1.0 + { + // load kappa into vector + svfloat64_t z_kappa; + + z_kappa = svdup_f64( *kappa ); + + if ( inca == 1 ) // continous memory. packA style + { + dim_t k = n; + if ( ldp == mnr ) + { + for ( ; k > 1; k -= 2 ) + { + // load 12 continuous elments from *a + z_a0 = svld1_f64( all_active, alpha1 ); + z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 ); + + // forward address - 0 to 1 + alpha1 += lda; + alpha1_p4 = alpha1 + 4 * inca; + alpha1_m4 = alpha1 - 4 * inca; + + // load 12 continuous elments from *a, filling last half of z8. 
+ z_a8_lh = svld1_f64( last_half_active, alpha1_m4 ); + z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh ); + z_a16 = svld1_f64( all_active, alpha1_p4 ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 ); + z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 ); + + // stored packed data into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a8 ); + svst1_vnum_f64( all_active, pi1, 2, z_a16 ); + + // forward address - 1 to 0 + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += 2 * ldp; + } + } + for ( ; k != 0; --k ) + { + // load 12 continuous elments from *a + z_a0 = svld1_f64( all_active, alpha1 ); + z_a8 = svld1_vnum_f64( first_half_active, alpha1, 1 ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( first_half_active, pi1, 1, z_a8 ); + + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store. packB style + { + dim_t k = n; + if ( ldp == mnr ) + { + for ( ; k > 1; k -= 2 ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index ); + + // forward address - 0 to 1 + alpha1 += lda; + alpha1_p4 = alpha1 + 4 * inca; + alpha1_m4 = alpha1 - 4 * inca; + + // gather load from *a, filling last half of z8. 
+ z_a8_lh = svld1_gather_u64offset_f64( last_half_active, alpha1_m4, z_index ); + z_a8 = svadd_f64_z( all_active, z_a8, z_a8_lh ); + z_a16 = svld1_gather_u64offset_f64( all_active, alpha1_p4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 ); + z_a16 = svmul_lane_f64( z_a16, z_kappa, 0 ); + + // stored packed data into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a8 ); + svst1_vnum_f64( all_active, pi1, 2, z_a16 ); + + // forward address - 1 to 0 + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += 2 * ldp; + } + } + for ( ; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a8 = svld1_gather_u64offset_f64( first_half_active, alpha1_8, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a8 = svmul_lane_f64( z_a8, z_kappa, 0 ); + + // scatter store into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( first_half_active, pi1, 1, z_a8 ); + + alpha1 += lda; + alpha1_8 = alpha1 + 8 * inca; + pi1 += ldp; + } + } + } // end of if ( *kappa == 1.0 ) + } + else // if ( cdim < mnr ) + { + bli_dscal2m_ex + ( + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + ( trans_t )conja, + cdim, + n, + kappa, + a, inca, lda, + p, 1, ldp, + cntx, + NULL + ); + + // if ( cdim < mnr ) + { + const dim_t i = cdim; + const dim_t m_edge = mnr - i; + const dim_t n_edge = n_max; + double* restrict p_edge = p + (i )*1; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } + } + + if ( n < n_max ) + { + const dim_t j = n; + const dim_t m_edge = mnr; + const dim_t n_edge = n_max - j; + double* restrict p_edge = p + (j )*ldp; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } +} diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c new file mode 100644 index 0000000000..38fb0b9125 --- /dev/null 
+++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c @@ -0,0 +1,363 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "armsve512_asm_transpose_d8x8.h" + +// assumption: +// SVE vector length = 512 bits. 
+ +void bli_dpackm_armsve512_asm_16xk + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t n_, + dim_t n_max_, + double* restrict kappa, + double* restrict a, inc_t inca_, inc_t lda_, + double* restrict p, inc_t ldp_, + cntx_t* restrict cntx + ) +{ + const int64_t cdim = cdim_; + const int64_t mnr = 16; + const int64_t n = n_; + const int64_t n_max = n_max_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + const bool gs = inca != 1 && lda != 1; + const bool unitk = bli_deq1( *kappa ); + +#ifdef _A64FX + if ( bli_cntx_schema_a_block(cntx) != bli_cntx_schema_b_panel(cntx) ) + { + // A twisted way to infer whether A or B is being packed. + if ( schema == bli_cntx_schema_a_block(cntx) ) + p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p; + if ( schema == bli_cntx_schema_b_panel(cntx) ) + p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p; + } +#endif + + if ( cdim == mnr && !gs && unitk ) + { + uint64_t n_mker = n / 8; + uint64_t n_left = n % 8; + __asm__ volatile ( + "mov x0, %[a] \n\t" + "mov x1, %[p] \n\t" + "mov x2, %[ldp] \n\t" + "mov x3, %[lda] \n\t" + "mov x4, %[inca] \n\t" + "cmp x4, #1 \n\t" + // Skips by sizeof(double). + "mov x8, #8 \n\t" + "madd x2, x2, x8, xzr \n\t" + "madd x3, x3, x8, xzr \n\t" + "madd x4, x4, x8, xzr \n\t" + + // "mov x8, 0x8 \n\t" // Control#0 for A address. + // "mov x8, 0x24 \n\t" // Higher 6bit for Control#0: + // "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong + // "orr x8, x8, x3 \n\t" // Stride. + // "msr S3_3_C11_C6_0, x8 \n\t" // Write system register. + + // Loop constants. + "mov x8, %[n_mker] \n\t" + "mov x9, %[n_left] \n\t" + "ptrue p0.d \n\t" + "b.ne .AROWSTOR \n\t" + // A stored in columns. + " .ACOLSTOR: \n\t" + // Prefetch distance. + "mov x17, #8 \n\t" + "madd x17, x17, x3, xzr \n\t" +#ifdef _A64FX + "mov x16, 0x6 \n\t" // Disable hardware prefetch for A. 
+ "lsl x16, x16, #60 \n\t" + "orr x0, x0, x16 \n\t" +#endif + // "add x5, x0, x3 \n\t" + // "add x6, x5, x3 \n\t" + // "add x7, x6, x3 \n\t" + // "prfm PLDL1STRM, [x0] \n\t" + // "prfm PLDL1STRM, [x5] \n\t" + // "prfm PLDL1STRM, [x6] \n\t" + // "prfm PLDL1STRM, [x7] \n\t" + // "add x18, x7, x3 \n\t" + // "add x5, x18, x3 \n\t" + // "add x6, x5, x3 \n\t" + // "add x7, x6, x3 \n\t" + // "prfm PLDL1STRM, [x18] \n\t" + // "prfm PLDL1STRM, [x5] \n\t" + // "prfm PLDL1STRM, [x6] \n\t" + // "prfm PLDL1STRM, [x7] \n\t" + " .ACOLSTORMKER: \n\t" + "cmp x8, xzr \n\t" + "b.eq .ACOLSTORMKEREND \n\t" + "add x5, x0, x3 \n\t" + "add x6, x5, x3 \n\t" + "add x7, x6, x3 \n\t" + "add x10, x1, x2 \n\t" + "add x11, x10, x2 \n\t" + "add x12, x11, x2 \n\t" + "add x13, x12, x2 \n\t" + "add x14, x13, x2 \n\t" + "add x15, x14, x2 \n\t" + "add x16, x15, x2 \n\t" + "ld1d z0.d, p0/z, [x0] \n\t" + "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t" + "ld1d z2.d, p0/z, [x5] \n\t" + "ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t" + "ld1d z4.d, p0/z, [x6] \n\t" + "ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t" + "ld1d z6.d, p0/z, [x7] \n\t" + "ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t" + "add x18, x17, x0 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x5 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x6 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x7 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x0, x7, x3 \n\t" + "add x5, x0, x3 \n\t" + "add x6, x5, x3 \n\t" + "add x7, x6, x3 \n\t" + "ld1d z8.d, p0/z, [x0] \n\t" + "ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t" + "ld1d z10.d, p0/z, [x5] \n\t" + "ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t" + "ld1d z12.d, p0/z, [x6] \n\t" + "ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t" + "ld1d z14.d, p0/z, [x7] \n\t" + "ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t" + "add x18, x17, x0 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x5 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x6 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + "add x18, x17, x7 \n\t" + "prfm PLDL1STRM, [x18] \n\t" + 
"st1d z0.d, p0, [x1] \n\t" + "st1d z1.d, p0, [x1, #1, mul vl] \n\t" + "st1d z2.d, p0, [x10] \n\t" + "st1d z3.d, p0, [x10, #1, mul vl] \n\t" + "st1d z4.d, p0, [x11] \n\t" + "st1d z5.d, p0, [x11, #1, mul vl] \n\t" + "st1d z6.d, p0, [x12] \n\t" + "st1d z7.d, p0, [x12, #1, mul vl] \n\t" + "st1d z8.d, p0, [x13] \n\t" + "st1d z9.d, p0, [x13, #1, mul vl] \n\t" + "st1d z10.d, p0, [x14] \n\t" + "st1d z11.d, p0, [x14, #1, mul vl] \n\t" + "st1d z12.d, p0, [x15] \n\t" + "st1d z13.d, p0, [x15, #1, mul vl] \n\t" + "st1d z14.d, p0, [x16] \n\t" + "st1d z15.d, p0, [x16, #1, mul vl] \n\t" + "add x0, x7, x3 \n\t" + "add x1, x16, x2 \n\t" + "sub x8, x8, #1 \n\t" + "b .ACOLSTORMKER \n\t" + " .ACOLSTORMKEREND: \n\t" + " .ACOLSTORLEFT: \n\t" + "cmp x9, xzr \n\t" + "b.eq .UNITKDONE \n\t" + "ld1d z0.d, p0/z, [x0] \n\t" + "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t" + "st1d z0.d, p0, [x1] \n\t" + "st1d z1.d, p0, [x1, #1, mul vl] \n\t" + "add x0, x0, x3 \n\t" + "add x1, x1, x2 \n\t" + "sub x9, x9, #1 \n\t" + "b .ACOLSTORLEFT \n\t" + // A stored in rows. + " .AROWSTOR: \n\t" + // Prepare predicates for in-reg transpose. + SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) + " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful. 
+ "cmp x8, xzr \n\t" + "b.eq .AROWSTORMKEREND \n\t" + "add x10, x0, x4 \n\t" + "add x11, x10, x4 \n\t" + "add x12, x11, x4 \n\t" + "add x13, x12, x4 \n\t" + "add x14, x13, x4 \n\t" + "add x15, x14, x4 \n\t" + "add x16, x15, x4 \n\t" + "ld1d z0.d, p0/z, [x0] \n\t" + "ld1d z1.d, p0/z, [x10] \n\t" + "ld1d z2.d, p0/z, [x11] \n\t" + "ld1d z3.d, p0/z, [x12] \n\t" + "ld1d z4.d, p0/z, [x13] \n\t" + "ld1d z5.d, p0/z, [x14] \n\t" + "ld1d z6.d, p0/z, [x15] \n\t" + "ld1d z7.d, p0/z, [x16] \n\t" + "add x5, x16, x4 \n\t" + "add x10, x5, x4 \n\t" + "add x11, x10, x4 \n\t" + "add x12, x11, x4 \n\t" + "add x13, x12, x4 \n\t" + "add x14, x13, x4 \n\t" + "add x15, x14, x4 \n\t" + "add x16, x15, x4 \n\t" + "ld1d z16.d, p0/z, [x5] \n\t" + "ld1d z17.d, p0/z, [x10] \n\t" + "ld1d z18.d, p0/z, [x11] \n\t" + "ld1d z19.d, p0/z, [x12] \n\t" + "ld1d z20.d, p0/z, [x13] \n\t" + "ld1d z21.d, p0/z, [x14] \n\t" + "ld1d z22.d, p0/z, [x15] \n\t" + "ld1d z23.d, p0/z, [x16] \n\t" + // Transpose first 8 rows. + SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6) + // Transpose last 8 rows. 
+ SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6) + "add x10, x1, x2 \n\t" + "add x11, x10, x2 \n\t" + "add x12, x11, x2 \n\t" + "add x13, x12, x2 \n\t" + "add x14, x13, x2 \n\t" + "add x15, x14, x2 \n\t" + "add x16, x15, x2 \n\t" + "st1d z8.d, p0, [x1] \n\t" + "st1d z24.d, p0, [x1, #1, mul vl] \n\t" + "st1d z9.d, p0, [x10] \n\t" + "st1d z25.d, p0, [x10, #1, mul vl] \n\t" + "st1d z10.d, p0, [x11] \n\t" + "st1d z26.d, p0, [x11, #1, mul vl] \n\t" + "st1d z11.d, p0, [x12] \n\t" + "st1d z27.d, p0, [x12, #1, mul vl] \n\t" + "st1d z12.d, p0, [x13] \n\t" + "st1d z28.d, p0, [x13, #1, mul vl] \n\t" + "st1d z13.d, p0, [x14] \n\t" + "st1d z29.d, p0, [x14, #1, mul vl] \n\t" + "st1d z14.d, p0, [x15] \n\t" + "st1d z30.d, p0, [x15, #1, mul vl] \n\t" + "st1d z15.d, p0, [x16] \n\t" + "st1d z31.d, p0, [x16, #1, mul vl] \n\t" + "add x0, x0, #64 \n\t" + "add x1, x16, x2 \n\t" + "sub x8, x8, #1 \n\t" + "b .AROWSTORMKER \n\t" + " .AROWSTORMKEREND: \n\t" + "mov x4, %[inca] \n\t" // Restore unshifted inca. + "index z30.d, xzr, x4 \n\t" // Generate index. + "lsl x4, x4, #3 \n\t" // Shift again. + "lsl x5, x4, #3 \n\t" // Virtual column vl. 
+ " .AROWSTORLEFT: \n\t" + "cmp x9, xzr \n\t" + "b.eq .UNITKDONE \n\t" + "add x6, x0, x5 \n\t" + "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" + "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t" + "st1d z0.d, p0, [x1] \n\t" + "st1d z1.d, p0, [x1, #1, mul vl] \n\t" + "add x1, x1, x2 \n\t" + "add x0, x0, #8 \n\t" + "sub x9, x9, #1 \n\t" + "b .AROWSTORLEFT \n\t" + " .UNITKDONE: \n\t" + "mov x0, #0 \n\t" + : + : [a] "r" (a), + [p] "r" (p), + [lda] "r" (lda), + [ldp] "r" (ldp), + [inca] "r" (inca), + [n_mker] "r" (n_mker), + [n_left] "r" (n_left) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14","x15", + "x16","x17","x18", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10","z11","z12","z13","z14","z15", + // "z16","z17","z18","z19","z20","z21","z22","z23", + // "z24","z25","z26","z27","z28","z29","z30","z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7" + ); + } + else // if ( cdim < mnr ) + { + bli_dscal2m_ex + ( + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + ( trans_t )conja, + cdim, + n, + kappa, + a, inca, lda, + p, 1, ldp, + cntx, + NULL + ); + + // if ( cdim < mnr ) + { + const dim_t i = cdim; + const dim_t m_edge = mnr - i; + const dim_t n_edge = n_max; + double* restrict p_edge = p + (i )*1; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } + } + + if ( n < n_max ) + { + const dim_t j = n; + const dim_t m_edge = mnr; + const dim_t n_edge = n_max - j; + double* restrict p_edge = p + (j )*ldp; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } +} diff --git a/kernels/armsve/3/armsve_asm_2vx10.h b/kernels/armsve/3/armsve_asm_2vx10.h new file mode 100644 index 0000000000..8e37585cba --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx10.h @@ -0,0 +1,191 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+
+*/
+// One k-iteration of the 2VL x 10 microkernel with contiguous B.
+// Each of the ten C column pairs (CnFH/CnLH) receives an FMLA pair, and the
+// B register just consumed is immediately reloaded by a broadcast load from
+// BADDR + NSHIFT*SZ. BV0/BV1 each serve two columns (C0/C8 and C1/C9).
+// BADDR is advanced by BRSBIT after the first two columns.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \
+  GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \
+" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
+  GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \
+  GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \
+  GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \
+  GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \
+  GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \
+  GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \
+  \
+  GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \
+  GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7)
+
+// Second through fourth microkernels are the first one with the B vector
+// arguments rotated by two positions per step.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT)
+// NOTE:
+// The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit
+// (something akin to a loop invariant):
+// - BV[0-7] holds B[0:7, 4*k_cur]
+// - B's address stops at B[0, 4*k_cur+1]
+
+// Final step of the K=4 microkernel sequence: same column/B-register pairing
+// as PLAIN_C_4, but only the first two columns reload their B register;
+// the remaining eight use plain FMLA pairs with no further loads from B.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \
+  GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \
+  GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \
+" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
+  GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
+  GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
+  GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
+  GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
+  GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
+  GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
+  GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
+  GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
+
+// K=4 MKer loop with B memory scattered.
+// Same structure as the PLAIN_C_* family, but each B element is loaded from
+// BELMADDR, which is stepped by BCSBIT after every load and reset to BADDR
+// once BADDR has been advanced by BRSBIT.
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \
+" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
+" mov "#BELMADDR", "#BADDR" \n\t" \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
+  \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT)
+
+#define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \
+  GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \
+" add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \
+" mov "#BELMADDR", "#BADDR" \n\t" \
+  GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \
+  GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \
+  GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \
+  GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \
+  GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \
+  GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \
+  GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \
+  GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7)
+
+
+// Zero all twenty accumulator registers (five groups of four).
+#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \
+  CLEAR_COL4(Z00,Z01,Z02,Z03) \
+  CLEAR_COL4(Z04,Z05,Z06,Z07) \
+  CLEAR_COL4(Z08,Z09,Z10,Z11) \
+  CLEAR_COL4(Z12,Z13,Z14,Z15) \
+  CLEAR_COL4(Z16,Z17,Z18,Z19)
+
+// Multiply all twenty accumulator registers by ZFACTOR.
+#define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \
+  SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \
+  SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \
+  SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \
+  SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
+  SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
+
+// The *_UKER macros below apply a per-column operation to five columns
+// (first-half / last-half register pairs) at a time. "_C" variants use
+// contiguous loads/stores, "_G" variants use gather/scatter with ZIDX.
+// The FMAD_LOAD variants interleave each column's FMAD with a load of the
+// next data into the C registers that FMAD has just consumed.
+#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
+  GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE)
+
+#define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
+
+#define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS)
+
+#define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \
+  GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
+  GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS)
+
+#define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
+
+#define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
+
+#define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \
+  GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP)
+
diff --git a/kernels/armsve/3/armsve_asm_macros.h b/kernels/armsve/3/armsve_asm_macros.h
new file mode 100644
index 0000000000..5e8eb3c623
--- 
/dev/null +++ b/kernels/armsve/3/armsve_asm_macros.h @@ -0,0 +1,123 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+
+*/
+// Precision-independent building blocks for the SVE GEMM kernels.
+// Every macro expands to adjacent string literals forming inline-assembly
+// text. The including file must define DT, LD1, ST1, LD1R, PRFG and SZ
+// (plus OFFS for the gather/scatter macros) before including this header.
+
+// Zero two / four vector registers.
+#define CLEAR_COL2(Z0,Z1) \
+" dup "#Z0"."DT", #0 \n\t" \
+" dup "#Z1"."DT", #0 \n\t"
+
+#define CLEAR_COL4(Z0,Z1,Z2,Z3) \
+  CLEAR_COL2(Z0,Z1) \
+  CLEAR_COL2(Z2,Z3)
+
+// Multiply two / four vector registers by ZFACTOR in place.
+// (The last line deliberately has no trailing line continuation, so the
+// macro cannot absorb whatever follows it.)
+#define SCALE_COL2(Z0,Z1,ZFACTOR) \
+" fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \
+" fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t"
+
+#define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \
+  SCALE_COL2(Z0,Z1,ZFACTOR) \
+  SCALE_COL2(Z2,Z3,ZFACTOR)
+
+// Prefetch or not.
+// The suffix (noprfm / prfm) is token-pasted by callers to select either
+// an empty expansion or a prfm instruction.
+#define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT)
+#define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \
+" prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t"
+
+// Two predicated FMLAs: accumulate ACOLFH*BV into CCOLFH and ACOLLH*BV
+// into CCOLLH.
+#define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
+" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \
+" fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */
+
+// GEMM_FMLA2 followed by a broadcast reload of BV from
+// [BADDR + NSHIFT*SZ bytes].
+#define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \
+  GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
+" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t"
+
+// GEMM_FMLA2 followed by a broadcast reload of BV from [BELMADDR], after
+// which BELMADDR is advanced by BCSBIT.
+#define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \
+  GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \
+" "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \
+" add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */
+
+// Load two consecutive vectors (first / last half of a column) from AADDR.
+#define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
+" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \
+" "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t"
+
+// Gather-load the two halves of a column using index vector ZIDX; the
+// second half starts AVSKIP bytes after AADDR. ATEMP is overwritten.
+#define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
+" "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
+" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
+" "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t"
+
+// Prefetch or not.
+// Gather-prefetch counterpart of GEMM_ACOL_GATHER_LOAD; ATEMP is
+// overwritten by the prfm variant.
+#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
+#define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \
+" "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \
+" add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \
+" "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t"
+
+// Advance AADDR by ACS, load the column found there, and (depending on
+// PREFMODE) prefetch the location A4KS bytes past the old AADDR.
+#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \
+" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
+" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
+  GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \
+  PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0)
+
+// Gather variant: optional L1 prefetch at AADDR + A4KS and L2 prefetch at
+// AADDR + APS, then advance AADDR by ACS and gather-load the column.
+// ATEMP is passed as both address and scratch to the prefetch macros; this
+// works because they read the address before overwriting the scratch.
+#define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \
+" add "#ATEMP", "#AADDR", "#A4KS" \n\t" \
+  GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
+" add "#ATEMP", "#AADDR", "#APS" \n\t" \
+  GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \
+" add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \
+  GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP)
+
+// Load / store one C column (two vectors) and advance CADDR by CCS.
+#define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
+  GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \
+" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */
+
+#define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \
+" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \
+" "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \
+" add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. */
+
+// Predicated fmad on both halves: Z := Z * ZSCALE + C.
+#define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \
+" fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \
+" fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t"
+
+// Gather-load / scatter-store one C column and advance CADDR by CCS.
+// CTEMP is overwritten.
+#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+  GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \
+" add "#CADDR", "#CADDR", "#CCS" \n\t"
+
+#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \
+" "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \
+" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \
+" "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \
+" add "#CADDR", "#CADDR", "#CCS" \n\t"
+
+
diff --git a/kernels/armsve/3/armsve_asm_macros_double.h b/kernels/armsve/3/armsve_asm_macros_double.h
new file mode 100644
index 0000000000..f93d3f3821
--- /dev/null
+++ b/kernels/armsve/3/armsve_asm_macros_double.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+// Specify to use double precision.
+// These string fragments are spliced into the assembly templates of
+// armsve_asm_macros.h, so they must be defined before it is included.
+#define DT "d" // element suffix appended to vector register names
+#define LD1 "ld1d" // vector load mnemonic
+#define ST1 "st1d" // vector store mnemonic
+#define LD1R "ld1rd" // load-and-broadcast mnemonic
+#define PRFG "prfd" // prefetch mnemonic used by the gather-prefetch macros
+#define SZ "8" // element size in bytes; scales the LD1R immediate offsets
+#define OFFS "lsl #3" // index modifier used in gather/scatter addressing
+// Include macros.
+#include "armsve_asm_macros.h"
+
diff --git a/kernels/armsve/3/armsve_asm_macros_half.h b/kernels/armsve/3/armsve_asm_macros_half.h
new file mode 100644
index 0000000000..9a46763ef2
--- /dev/null
+++ b/kernels/armsve/3/armsve_asm_macros_half.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+// Specify to use half precision.
+// These string fragments are spliced into the assembly templates of
+// armsve_asm_macros.h, so they must be defined before it is included.
+#define DT "h" // element suffix appended to vector register names
+#define LD1 "ld1h" // vector load mnemonic
+#define ST1 "st1h" // vector store mnemonic
+#define LD1R "ld1rh" // load-and-broadcast mnemonic
+#define PRFG "prfh" // prefetch mnemonic used by the gather-prefetch macros
+#define SZ "2" // element size in bytes; scales the LD1R immediate offsets
+// OFFS is left undefined here, so the gather/scatter macros (which splice
+// OFFS into their address operands) cannot be expanded in half precision.
+// #define OFFS UNSUPPORTED
+// Include macros.
+#include "armsve_asm_macros.h"
+
diff --git a/kernels/armsve/3/armsve_asm_macros_single.h b/kernels/armsve/3/armsve_asm_macros_single.h
new file mode 100644
index 0000000000..2203de3453
--- /dev/null
+++ b/kernels/armsve/3/armsve_asm_macros_single.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +// Specify to use single precision. +#define DT "s" +#define LD1 "ld1w" +#define ST1 "st1w" +#define LD1R "ld1rw" +#define PRFG "prfw" +#define SZ "4" +#define OFFS "uxtw #2" +// Include macros. +#include "armsve_asm_macros.h" + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c new file mode 100644 index 0000000000..5824d2d550 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -0,0 +1,318 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschungszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_double.h" + +// 2vx10 microkernels.
+#include "armsve_asm_2vx10.h" + +void bli_dgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + __asm__ volatile ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #2 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x8, 0x3 \n\t" // Tag C address. +" lsl x8, x8, #56 \n\t" +" orr x5, x5, x8 \n\t" +" mov x8, 0x2 \n\t" // Tag B address. +" lsl x8, x8, #56 \n\t" +" orr x1, x1, x8 \n\t" +" mov x8, 0x1 \n\t" // Tag A address. +" lsl x8, x8, #56 \n\t" +" orr x0, x0, x8 \n\t" +#endif +" \n\t" +" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). +" madd x2, x8, x2, xzr \n\t" // cs_a +" madd x3, x8, x3, xzr \n\t" // rs_b +" madd x7, x8, x7, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. +" ldr x8, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp x4, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" + +" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. 
+" ld1rd z21.d, p0/z, [x1, 8] \n\t" +" ld1rd z22.d, p0/z, [x1, 16] \n\t" +" ld1rd z23.d, p0/z, [x1, 24] \n\t" +" ld1rd z24.d, p0/z, [x1, 32] \n\t" +" ld1rd z25.d, p0/z, [x1, 40] \n\t" +" ld1rd z26.d, p0/z, [x1, 48] \n\t" +" ld1rd z27.d, p0/z, [x1, 56] \n\t" +" \n\t" +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp x6, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, x5 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp x4, #0 \n\t" // If no 4-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. 
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" subs x4, x4, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" add x0, x0, x2 \n\t" // Forward A to fill the blank. +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp x8, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +" ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. +" ld1rd z21.d, p0/z, [x1, 8] \n\t" +" ld1rd z22.d, p0/z, [x1, 16] \n\t" +" ld1rd z23.d, p0/z, [x1, 24] \n\t" +" ld1rd z24.d, p0/z, [x1, 32] \n\t" +" ld1rd z25.d, p0/z, [x1, 40] \n\t" +" ld1rd z26.d, p0/z, [x1, 48] \n\t" +" ld1rd z27.d, p0/z, [x1, 56] \n\t" +" ld1rd z28.d, p0/z, [x1, 64] \n\t" +" ld1rd z29.d, p0/z, [x1, 72] \n\t" +GEMM_FMLA2(z0,z1,p0,z30,z31,z20) +GEMM_FMLA2(z2,z3,p0,z30,z31,z21) +GEMM_FMLA2(z4,z5,p0,z30,z31,z22) +GEMM_FMLA2(z6,z7,p0,z30,z31,z23) +GEMM_FMLA2(z8,z9,p0,z30,z31,z24) +GEMM_FMLA2(z10,z11,p0,z30,z31,z25) +GEMM_FMLA2(z12,z13,p0,z30,z31,z26) +GEMM_FMLA2(z14,z15,p0,z30,z31,z27) +GEMM_FMLA2(z16,z17,p0,z30,z31,z28) +GEMM_FMLA2(z18,z19,p0,z30,z31,z29) +" add x0, x0, x2 \n\t" // Forward A. +" add x1, x1, x3 \n\t" // Forward B. +" sub x8, x8, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. 
+" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ldr x4, [x4] \n\t" // Load alpha & beta (value). +" ldr x8, [x8] \n\t" +" dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors. +" dup z31.d, x8 \n\t" +" fmov d28, #1.0 \n\t" // Prepare FP 1.0. +" fmov x16, d28 \n\t" +" \n\t" +" PREFETCH_ABNEXT: \n\t" +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +#ifdef _A64FX +" mov x8, 0x2 \n\t" // Tag B address. +" lsl x8, x8, #56 \n\t" +" orr x1, x1, x8 \n\t" +" mov x8, 0x1 \n\t" // Tag A address. +" lsl x8, x8, #56 \n\t" +" orr x0, x0, x8 \n\t" +#endif +" prfm PLDL1STRM, [x0] \n\t" +" prfm PLDL1STRM, [x0, 256*1] \n\t" +// " prfm PLDL2KEEP, [x0, 256*2] \n\t" +// " prfm PLDL2KEEP, [x0, 256*3] \n\t" +// " prfm PLDL2KEEP, [x0, 256*4] \n\t" +// " prfm PLDL2KEEP, [x0, 256*5] \n\t" +// " prfm PLDL2KEEP, [x0, 256*6] \n\t" +// " prfm PLDL2KEEP, [x0, 256*7] \n\t" +// " prfm PLDL2KEEP, [x0, 256*8] \n\t" +// " prfm PLDL2KEEP, [x0, 256*9] \n\t" +// " prfm PLDL2KEEP, [x0, 256*10] \n\t" +// " prfm PLDL2KEEP, [x0, 256*11] \n\t" +// " prfm PLDL2KEEP, [x0, 256*12] \n\t" +// " prfm PLDL2KEEP, [x0, 256*13] \n\t" +// " prfm PLDL2KEEP, [x0, 256*14] \n\t" +// " prfm PLDL2KEEP, [x0, 256*15] \n\t" +" prfm PLDL1STRM, [x1] \n\t" +" prfm PLDL1STRM, [x1, 256*1] \n\t" +// " prfm PLDL2KEEP, [x1, 256*2] \n\t" +// " prfm PLDL2KEEP, [x1, 256*3] \n\t" +// " prfm PLDL2KEEP, [x1, 256*4] \n\t" +// " prfm PLDL2KEEP, [x1, 256*5] \n\t" +// " prfm PLDL2KEEP, [x1, 256*6] \n\t" +// " prfm PLDL2KEEP, [x1, 256*7] \n\t" +// " prfm PLDL2KEEP, [x1, 256*8] \n\t" +// " prfm PLDL2KEEP, [x1, 256*9] \n\t" +" \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x6, #1 \n\t" // Preload first half of C for contiguous case. 
+" b.ne WRITE_MEM \n\t" +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +" cmp x16, x4 \n\t" +" b.eq UNIT_ALPHA \n\t" +" \n\t" +SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) +" \n\t" +" UNIT_ALPHA: \n\t" +" cmp x6, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +" \n\t" // Here used scratch: Z[20-29]. +// First half of C is already loaded in this case. +GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) +" \n\t" +GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +" \n\t" // Here used scratch: Z[20-30] - Z30 as index. +" mov x8, xzr \n\t" +" incb x8 \n\t" +" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. +" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. +GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) +GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16) +" \n\t" +GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_ERROR: \n\t" +" mov x0, #1 \n\t" // Return error. +" END_EXEC: \n\t" +" mov x0, #0 \n\t" // Return normal. 
+: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next) +: "x0","x1","x2","x3","x4","x5","x6","x7","x8", + "x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c new file mode 100644 index 0000000000..8659e8b7ee --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -0,0 +1,307 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + Copyright (C) 2019, Forschungszentrum Juelich + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Single-precision composite instructions. +#include "armsve_asm_macros_single.h" + +// 2vx10 microkernels. +#include "armsve_asm_2vx10.h" + +void bli_sgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + __asm__ volatile ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incw x2, ALL, MUL #2 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x8, 0x3 \n\t" // Tag C address. +" lsl x8, x8, #56 \n\t" +" orr x5, x5, x8 \n\t" +" mov x8, 0x2 \n\t" // Tag B address. +" lsl x8, x8, #56 \n\t" +" orr x1, x1, x8 \n\t" +" mov x8, 0x1 \n\t" // Tag A address. +" lsl x8, x8, #56 \n\t" +" orr x0, x0, x8 \n\t" +#endif +" \n\t" +" mov x8, #4 \n\t" // Multiply some address skips by sizeof(float). 
+" madd x2, x8, x2, xzr \n\t" // cs_a +" madd x3, x8, x3, xzr \n\t" // rs_b +" madd x7, x8, x7, xzr \n\t" // cs_c +" ptrue p0.s \n\t" +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. +" ldr x8, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp x4, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" + +" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. +" ld1rw z21.s, p0/z, [x1, 4] \n\t" +" ld1rw z22.s, p0/z, [x1, 8] \n\t" +" ld1rw z23.s, p0/z, [x1, 12] \n\t" +" ld1rw z24.s, p0/z, [x1, 16] \n\t" +" ld1rw z25.s, p0/z, [x1, 20] \n\t" +" ld1rw z26.s, p0/z, [x1, 24] \n\t" +" ld1rw z27.s, p0/z, [x1, 28] \n\t" +" \n\t" +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp x6, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, x5 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp x4, #0 \n\t" // If no 4-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. 
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" subs x4, x4, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" add x0, x0, x2 \n\t" // Forward A to fill the blank. +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp x8, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +" ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. 
+" ld1rw z21.s, p0/z, [x1, 4] \n\t" +" ld1rw z22.s, p0/z, [x1, 8] \n\t" +" ld1rw z23.s, p0/z, [x1, 12] \n\t" +" ld1rw z24.s, p0/z, [x1, 16] \n\t" +" ld1rw z25.s, p0/z, [x1, 20] \n\t" +" ld1rw z26.s, p0/z, [x1, 24] \n\t" +" ld1rw z27.s, p0/z, [x1, 28] \n\t" +" ld1rw z28.s, p0/z, [x1, 32] \n\t" +" ld1rw z29.s, p0/z, [x1, 36] \n\t" +GEMM_FMLA2(z0,z1,p0,z30,z31,z20) +GEMM_FMLA2(z2,z3,p0,z30,z31,z21) +GEMM_FMLA2(z4,z5,p0,z30,z31,z22) +GEMM_FMLA2(z6,z7,p0,z30,z31,z23) +GEMM_FMLA2(z8,z9,p0,z30,z31,z24) +GEMM_FMLA2(z10,z11,p0,z30,z31,z25) +GEMM_FMLA2(z12,z13,p0,z30,z31,z26) +GEMM_FMLA2(z14,z15,p0,z30,z31,z27) +GEMM_FMLA2(z16,z17,p0,z30,z31,z28) +GEMM_FMLA2(z18,z19,p0,z30,z31,z29) +" add x0, x0, x2 \n\t" // Forward A. +" add x1, x1, x3 \n\t" // Forward B. +" sub x8, x8, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ldr w4, [x4] \n\t" // Load alpha & beta (value). +" ldr w8, [x8] \n\t" +" dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors. 
+" dup z31.s, w8 \n\t" +" \n\t" +" PREFETCH_ABNEXT: \n\t" +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +" prfm PLDL2KEEP, [x0] \n\t" +" prfm PLDL2KEEP, [x0, 256*1] \n\t" +" prfm PLDL2KEEP, [x0, 256*2] \n\t" +" prfm PLDL2KEEP, [x0, 256*3] \n\t" +" prfm PLDL2KEEP, [x0, 256*4] \n\t" +" prfm PLDL2KEEP, [x0, 256*5] \n\t" +" prfm PLDL2KEEP, [x0, 256*6] \n\t" +" prfm PLDL2KEEP, [x0, 256*7] \n\t" +" prfm PLDL2KEEP, [x0, 256*8] \n\t" +" prfm PLDL2KEEP, [x0, 256*9] \n\t" +" prfm PLDL2KEEP, [x0, 256*10] \n\t" +" prfm PLDL2KEEP, [x0, 256*11] \n\t" +" prfm PLDL2KEEP, [x0, 256*12] \n\t" +" prfm PLDL2KEEP, [x0, 256*13] \n\t" +" prfm PLDL2KEEP, [x0, 256*14] \n\t" +" prfm PLDL2KEEP, [x0, 256*15] \n\t" +" prfm PLDL2KEEP, [x1] \n\t" +" prfm PLDL2KEEP, [x1, 256*1] \n\t" +" prfm PLDL2KEEP, [x1, 256*2] \n\t" +" prfm PLDL2KEEP, [x1, 256*3] \n\t" +" prfm PLDL2KEEP, [x1, 256*4] \n\t" +" prfm PLDL2KEEP, [x1, 256*5] \n\t" +" prfm PLDL2KEEP, [x1, 256*6] \n\t" +" prfm PLDL2KEEP, [x1, 256*7] \n\t" +" prfm PLDL2KEEP, [x1, 256*8] \n\t" +" prfm PLDL2KEEP, [x1, 256*9] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +" fmov s28, #1.0 \n\t" +" fmov w16, s28 \n\t" +" cmp w16, w4 \n\t" +" b.eq UNIT_ALPHA \n\t" +" \n\t" +SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x6, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +" \n\t" // Here used scratch: Z[20-29]. 
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) +GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7) +" \n\t" +GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +" \n\t" // Here used scratch: Z[20-30] - Z30 as index. +" mov x8, xzr \n\t" +" incb x8 \n\t" +" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. +" index z30.s, wzr, w6 \n\t" // The skip passed to index is in elements; it is not multiplied by sizeof(float). +GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) +GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16) +" \n\t" +GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_ERROR: \n\t" +" mov x0, #1 \n\t" // Return error. +" END_EXEC: \n\t" +" mov x0, #0 \n\t" // Return normal.
+: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next) +: "x0","x1","x2","x3","x4","x5","x6","x7","x8", + "x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c new file mode 100644 index 0000000000..817153bfe9 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_sh2vx10_unindexed.c @@ -0,0 +1,343 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + Copyright (C) 2019, Forschungszentrum Juelich + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Half-precision composite instructions. +#include "armsve_asm_macros_half.h" + +// 2vx10 microkernels. +#include "armsve_asm_2vx10.h" + +// Gather-load / scatter-store instruction for half-precision +// needs being defined separately. +#undef GEMM_CCOL_GATHER_LOAD_FWD +#undef GEMM_CCOL_SCATTER_STORE_FWD + +#define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \ +" add x28, "#CADDR", "#CRS2" \n\t" \ +" ld1h z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \ +" ld1h "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \ +" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \ +" fadd "#ZFH".h, "#ZFH".h, z31.h \n\t" \ +" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \ +" add x28, "#CTEMP", "#CRS2" \n\t" \ +" ld1h z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \ +" ld1h "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \ +" revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \ +" fadd "#ZLH".h, "#ZLH".h, z31.h \n\t" \ +" add "#CADDR", "#CADDR", "#CCS" \n\t" + +#define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \ +" add x28, "#CADDR", "#CRS2" \n\t" \ +" st1h "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \ +" revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \ +" st1h "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \ +" add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \ +" add x28, "#CTEMP", "#CRS2" \n\t" \ +" st1h "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \ +" revh "#ZLH".s, 
"#PT"/m, "#ZLH".s \n\t" \ +" st1h "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \ +" add "#CADDR", "#CADDR", "#CCS" \n\t" + + +void bli_shgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + void* restrict alpha, + void* restrict a, + void* restrict b, + void* restrict beta, + void* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + __asm__ volatile ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" inch x2, ALL, MUL #2 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x8, 0x3 \n\t" // Tag C address. +" lsl x8, x8, #56 \n\t" +" orr x5, x5, x8 \n\t" +" mov x8, 0x2 \n\t" // Tag B address. +" lsl x8, x8, #56 \n\t" +" orr x1, x1, x8 \n\t" +" mov x8, 0x1 \n\t" // Tag A address. +" lsl x8, x8, #56 \n\t" +" orr x0, x0, x8 \n\t" +#endif +" \n\t" +" mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t). +" madd x2, x8, x2, xzr \n\t" // cs_a +" madd x3, x8, x3, xzr \n\t" // rs_b +" madd x7, x8, x7, xzr \n\t" // cs_c +" ptrue p0.b \n\t" +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. +" ldr x8, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp x4, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" + +" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row. 
+" ld1rh z21.h, p0/z, [x1, 2] \n\t" +" ld1rh z22.h, p0/z, [x1, 4] \n\t" +" ld1rh z23.h, p0/z, [x1, 6] \n\t" +" ld1rh z24.h, p0/z, [x1, 8] \n\t" +" ld1rh z25.h, p0/z, [x1, 10] \n\t" +" ld1rh z26.h, p0/z, [x1, 12] \n\t" +" ld1rh z27.h, p0/z, [x1, 14] \n\t" +" \n\t" +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp x6, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, x5 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" add x16, x16, x7 \n\t" +" prfm PLDL1STRM, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp x4, #0 \n\t" // If no 4-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. 
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" \n\t" +" subs x4, x4, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +" add x0, x0, x2 \n\t" // Forward A's address to the next column. +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) +" add x0, x0, x2 \n\t" // Forward A to fill the blank. +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp x8, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +" ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row. +" ld1rh z21.h, p0/z, [x1, 2] \n\t" +" ld1rh z22.h, p0/z, [x1, 4] \n\t" +" ld1rh z23.h, p0/z, [x1, 6] \n\t" +" ld1rh z24.h, p0/z, [x1, 8] \n\t" +" ld1rh z25.h, p0/z, [x1, 10] \n\t" +" ld1rh z26.h, p0/z, [x1, 12] \n\t" +" ld1rh z27.h, p0/z, [x1, 14] \n\t" +" ld1rh z28.h, p0/z, [x1, 16] \n\t" +" ld1rh z29.h, p0/z, [x1, 18] \n\t" +GEMM_FMLA2(z0,z1,p0,z30,z31,z20) +GEMM_FMLA2(z2,z3,p0,z30,z31,z21) +GEMM_FMLA2(z4,z5,p0,z30,z31,z22) +GEMM_FMLA2(z6,z7,p0,z30,z31,z23) +GEMM_FMLA2(z8,z9,p0,z30,z31,z24) +GEMM_FMLA2(z10,z11,p0,z30,z31,z25) +GEMM_FMLA2(z12,z13,p0,z30,z31,z26) +GEMM_FMLA2(z14,z15,p0,z30,z31,z27) +GEMM_FMLA2(z16,z17,p0,z30,z31,z28) +GEMM_FMLA2(z18,z19,p0,z30,z31,z29) +" add x0, x0, x2 \n\t" // Forward A. +" add x1, x1, x3 \n\t" // Forward B. +" sub x8, x8, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. 
+" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors. +" ld1rh z31.h, p0/z, [x8] \n\t" +" fmov w4, h28 \n\t" // Copy alpha & beta to GP registers. +" fmov w8, h29 \n\t" +" \n\t" +" PREFETCH_ABNEXT: \n\t" +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +" prfm PLDL2KEEP, [x0] \n\t" +" prfm PLDL2KEEP, [x0, 256*1] \n\t" +" prfm PLDL2KEEP, [x0, 256*2] \n\t" +" prfm PLDL2KEEP, [x0, 256*3] \n\t" +" prfm PLDL2KEEP, [x0, 256*4] \n\t" +" prfm PLDL2KEEP, [x0, 256*5] \n\t" +" prfm PLDL2KEEP, [x0, 256*6] \n\t" +" prfm PLDL2KEEP, [x0, 256*7] \n\t" +" prfm PLDL2KEEP, [x0, 256*8] \n\t" +" prfm PLDL2KEEP, [x0, 256*9] \n\t" +" prfm PLDL2KEEP, [x0, 256*10] \n\t" +" prfm PLDL2KEEP, [x0, 256*11] \n\t" +" prfm PLDL2KEEP, [x0, 256*12] \n\t" +" prfm PLDL2KEEP, [x0, 256*13] \n\t" +" prfm PLDL2KEEP, [x0, 256*14] \n\t" +" prfm PLDL2KEEP, [x0, 256*15] \n\t" +" prfm PLDL2KEEP, [x1] \n\t" +" prfm PLDL2KEEP, [x1, 256*1] \n\t" +" prfm PLDL2KEEP, [x1, 256*2] \n\t" +" prfm PLDL2KEEP, [x1, 256*3] \n\t" +" prfm PLDL2KEEP, [x1, 256*4] \n\t" +" prfm PLDL2KEEP, [x1, 256*5] \n\t" +" prfm PLDL2KEEP, [x1, 256*6] \n\t" +" prfm PLDL2KEEP, [x1, 256*7] \n\t" +" prfm PLDL2KEEP, [x1, 256*8] \n\t" +" prfm PLDL2KEEP, [x1, 256*9] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +" fmov h28, #1.0 \n\t" +" fmov w16, h28 \n\t" +" cmp w16, w4 \n\t" +" b.eq UNIT_ALPHA \n\t" +" \n\t" +SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x6, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +" \n\t" // Here used scratch: Z[20-29]. 
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) +GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7) +" \n\t" +GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +" \n\t" // Here used scratch: Z[20-30] - Z30 as index. +" mov x10, xzr \n\t" +" incb x10 \n\t" +" madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip. +" mov x28, #2 \n\t" +" madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case. +" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. +GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16) +" dup z31.h, w8 \n\t" // Restore beta destroyed by loading. +GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) +GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16) +" \n\t" +" dup z31.h, w8 \n\t" // Restore beta destroyed by loading. +GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16) +GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) +GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_ERROR: \n\t" +" mov x0, #1 \n\t" // Return error. +" END_EXEC: \n\t" +" mov x0, #0 \n\t" // Return normal. 
+: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next) +: "x0","x1","x2","x3","x4","x5","x6","x7","x8", + "x9","x16","x10","x28", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c b/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c new file mode 100644 index 0000000000..ff3a35e7a6 --- /dev/null +++ b/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c @@ -0,0 +1,450 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Separate instantiation of the reference gemmsup kernels for ArmSVE (suffix _ref2).
+// Temporary workaround: to be removed once upstream has switched to a better way
+// of exposing the gemmsup interface.
+
+// Row variant: the outer loop walks the rows of c, the inner loop its columns.
+// -- Row storage case ---------------------------------------------------------
+// Each (i,j) element is a k-long dot product, then combined with c via alpha/beta.
+
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+\
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       conj_t conja, \
+       conj_t conjb, \
+       dim_t m, \
+       dim_t n, \
+       dim_t k, \
+       ctype* restrict alpha, \
+       ctype* restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype* restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype* restrict beta, \
+       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict data, \
+       cntx_t* restrict cntx \
+     ) \
+{ \
+    /* NOTE: This microkernel can actually handle arbitrarily large values of
+       m, n, and k. The data and cntx arguments are accepted but not read here. */ \
+\
+    if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
+    { \
+        /* Case: neither a nor b conjugated. Traverse c by rows. */ \
+        for ( dim_t i = 0; i < m; ++i ) \
+        { \
+            ctype* restrict ci = &c[ i*rs_c ]; \
+            ctype* restrict ai = &a[ i*rs_a ]; \
+\
+            for ( dim_t j = 0; j < n; ++j ) \
+            { \
+                ctype* restrict cij = &ci[ j*cs_c ]; \
+                ctype* restrict bj = &b [ j*cs_b ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+                } \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+    else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
+    { \
+        /* Case: a as is, b conjugated. Traverse c by rows. */ \
+        for ( dim_t i = 0; i < m; ++i ) \
+        { \
+            ctype* restrict ci = &c[ i*rs_c ]; \
+            ctype* restrict ai = &a[ i*rs_a ]; \
+\
+            for ( dim_t j = 0; j < n; ++j ) \
+            { \
+                ctype* restrict cij = &ci[ j*cs_c ]; \
+                ctype* restrict bj = &b [ j*cs_b ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
+                } \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+    else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
+    { \
+        /* Case: a conjugated, b as is. Traverse c by rows. */ \
+        for ( dim_t i = 0; i < m; ++i ) \
+        { \
+            ctype* restrict ci = &c[ i*rs_c ]; \
+            ctype* restrict ai = &a[ i*rs_a ]; \
+\
+            for ( dim_t j = 0; j < n; ++j ) \
+            { \
+                ctype* restrict cij = &ci[ j*cs_c ]; \
+                ctype* restrict bj = &b [ j*cs_b ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
+                } \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+    else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
+    { \
+        /* Case: both a and b conjugated. Traverse c by rows. */ \
+        for ( dim_t i = 0; i < m; ++i ) \
+        { \
+            ctype* restrict ci = &c[ i*rs_c ]; \
+            ctype* restrict ai = &a[ i*rs_a ]; \
+\
+            for ( dim_t j = 0; j < n; ++j ) \
+            { \
+                ctype* restrict cij = &ci[ j*cs_c ]; \
+                ctype* restrict bj = &b [ j*cs_b ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+                } \
+\
+                /* Conjugate the result to simulate conj(a^T) * conj(b). */ \
+                PASTEMAC(ch,conjs)( ab ); \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+}
+
+INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 )
+
+// Column variant: the outer loop walks the columns of c, the inner loop its rows.
+// -- Column storage case ------------------------------------------------------
+// Same arithmetic as the row variant; only the traversal order of c differs.
+
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname, arch, suf ) \
+\
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       conj_t conja, \
+       conj_t conjb, \
+       dim_t m, \
+       dim_t n, \
+       dim_t k, \
+       ctype* restrict alpha, \
+       ctype* restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype* restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype* restrict beta, \
+       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict data, \
+       cntx_t* restrict cntx \
+     ) \
+{ \
+    /* NOTE: This microkernel can actually handle arbitrarily large values of
+       m, n, and k. The data and cntx arguments are accepted but not read here. */ \
+\
+    if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
+    { \
+        /* Case: neither a nor b conjugated. Traverse c by columns. */ \
+        for ( dim_t j = 0; j < n; ++j ) \
+        { \
+            ctype* restrict cj = &c[ j*cs_c ]; \
+            ctype* restrict bj = &b[ j*cs_b ]; \
+\
+            for ( dim_t i = 0; i < m; ++i ) \
+            { \
+                ctype* restrict cij = &cj[ i*rs_c ]; \
+                ctype* restrict ai = &a [ i*rs_a ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+                } \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+    else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
+    { \
+        /* Case: a as is, b conjugated. Traverse c by columns. */ \
+        for ( dim_t j = 0; j < n; ++j ) \
+        { \
+            ctype* restrict cj = &c[ j*cs_c ]; \
+            ctype* restrict bj = &b[ j*cs_b ]; \
+\
+            for ( dim_t i = 0; i < m; ++i ) \
+            { \
+                ctype* restrict cij = &cj[ i*rs_c ]; \
+                ctype* restrict ai = &a [ i*rs_a ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
+                } \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+    else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
+    { \
+        /* Case: a conjugated, b as is. Traverse c by columns. */ \
+        for ( dim_t j = 0; j < n; ++j ) \
+        { \
+            ctype* restrict cj = &c[ j*cs_c ]; \
+            ctype* restrict bj = &b[ j*cs_b ]; \
+\
+            for ( dim_t i = 0; i < m; ++i ) \
+            { \
+                ctype* restrict cij = &cj[ i*rs_c ]; \
+                ctype* restrict ai = &a [ i*rs_a ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
+                } \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+    else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
+    { \
+        /* Case: both a and b conjugated. Traverse c by columns. */ \
+        for ( dim_t j = 0; j < n; ++j ) \
+        { \
+            ctype* restrict cj = &c[ j*cs_c ]; \
+            ctype* restrict bj = &b[ j*cs_b ]; \
+\
+            for ( dim_t i = 0; i < m; ++i ) \
+            { \
+                ctype* restrict cij = &cj[ i*rs_c ]; \
+                ctype* restrict ai = &a [ i*rs_a ]; \
+                ctype ab; \
+\
+                PASTEMAC(ch,set0s)( ab ); \
+\
+                /* Perform a dot product to update the (i,j) element of c. */ \
+                for ( dim_t l = 0; l < k; ++l ) \
+                { \
+                    ctype* restrict aij = &ai[ l*cs_a ]; \
+                    ctype* restrict bij = &bj[ l*rs_b ]; \
+\
+                    PASTEMAC(ch,dots)( *aij, *bij, ab ); \
+                } \
+\
+                /* Conjugate the result to simulate conj(a^T) * conj(b). */ \
+                PASTEMAC(ch,conjs)( ab ); \
+\
+                /* If beta is one, add ab into c. If beta is zero, overwrite c
+                   with the result in ab. Otherwise, scale by beta and accumulate
+                   ab to c. */ \
+                if ( PASTEMAC(ch,eq1)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
+                } \
+                else if ( PASTEMAC(ch,eq0)( *beta ) ) \
+                { \
+                    PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
+                } \
+                else \
+                { \
+                    PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
+                } \
+            } \
+        } \
+    } \
+}
+
+INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 )
+
diff --git a/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c
new file mode 100644
index 0000000000..3341b63d00
--- /dev/null
+++ b/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c
@@ -0,0 +1,528 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+#include "blis.h"
+#include <assert.h>
+
+// Double-precision composite instructions.
+#include "../armsve_asm_macros_double.h"
+
+// 2vx10 microkernels.
+#include "../armsve_asm_2vx10.h"
+
+// Prototype reference kernel.
+GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 )
+
+void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  // Double-precision gemmsup kernel for Arm SVE with A stored in columns.
+  // C is processed in tiles of (2 SVE vectors of rows) x (10 columns), with
+  // k unrolled by 4 plus a remainder loop. Columns left over after the last
+  // full group of 10 are handed to the reference kernel; a partial last
+  // row-tile is covered by the predicates p1/p2 built from m_curr.
+  // conja/conjb are only forwarded to the reference kernel (real data).
+  // c*c requires A to be stored in columns.
+  assert( rs_a0 == 1 );
+
+  dim_t n0_mker = n0 / 10;
+  dim_t n0_left = n0 % 10;
+
+  if ( n0_left )
+  {
+    // A[:, ::]
+    // B[::, n0_mker*10:n0]
+    // C[: , n0_mker*10:n0]
+    double *ai = a;
+    double *bi = b + n0_mker * 10 * cs_b0;
+    double *ci = c + n0_mker * 10 * cs_c0;
+    bli_dgemmsup_c_armsve_ref2
+    (
+      conja, conjb,
+      m0, n0_left, k0,
+      alpha,
+      ai, rs_a0, cs_a0,
+      bi, rs_b0, cs_b0,
+      beta,
+      ci, rs_c0, cs_c0,
+      data,
+      cntx
+    );
+  }
+  // Return if it's a pure edge case.
+  if ( !n0_mker )
+    return;
+
+  // Determine VL.
+  uint64_t vlen2; // Number of doubles held by two SVE vectors.
+  __asm__ (
+    " mov x0, xzr \n\t"
+    " incd x0, ALL, MUL #2 \n\t"
+    " mov %[vlen2], x0 \n\t"
+    : [vlen2] "=r" (vlen2)
+    :
+    : "x0"
+  );
+
+  uint64_t rs_c = rs_c0;
+  uint64_t cs_c = cs_c0;
+  // uint64_t rs_a = 1;
+  uint64_t cs_a = cs_a0;
+  uint64_t rs_b = rs_b0;
+  uint64_t cs_b = cs_b0;
+
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t n_mker = n0_mker;
+
+  dim_t m0_mker = m0 / vlen2;
+  dim_t m0_left = m0 % vlen2;
+  if ( m0_left )
+  {
+    // Edge case on A side can be handled with one more (predicated) loop.
+    m0_mker++;
+  } else
+    m0_left = vlen2;
+  // uint64_t ps_a = bli_auxinfo_ps_a( data );
+  uint64_t ps_b = bli_auxinfo_ps_b( data );
+
+  for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker )
+  {
+    uint64_t m_curr = vlen2;
+    if ( im0_mker == m0_mker - 1 )
+    {
+      // Last m-loop. Maybe unnecessary.
+      m_curr = m0_left;
+    }
+    double *ai = a + im0_mker * vlen2 * rs_a0;
+    double *bi = b;
+    double *ci = c + im0_mker * vlen2 * rs_c0;
+
+    void* a_next = bli_auxinfo_next_a( data );
+    void* b_next = bli_auxinfo_next_b( data );
+
+    __asm__ volatile (
+" ldr x0, %[bi] \n\t"
+" ldr x1, %[rs_b] \n\t" // Row-skip of B.
+" ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]).
+" ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B.
+" ldr x4, %[cs_a] \n\t" // Column-Skip of A.
+" \n\t" // Element skip of A[:, l] is guaranteed to be 1.
+" ldr x5, %[ci] \n\t"
+" ldr x6, %[rs_c] \n\t" // Row-skip of C.
+" ldr x7, %[cs_c] \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov x16, 0x1 \n\t" // Tag C address.
+" lsl x16, x16, #56 \n\t"
+" orr x5, x5, x16 \n\t"
+" mov x16, 0x2 \n\t" // Tag B address.
+" lsl x16, x16, #56 \n\t"
+" orr x0, x0, x16 \n\t"
+#endif
+" \n\t"
+" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
+" madd x1, x8, x1, xzr \n\t" // rs_b
+" madd x2, x8, x2, xzr \n\t" // cs_b
+" madd x3, x8, x3, xzr \n\t" // ps_b
+" madd x4, x8, x4, xzr \n\t" // cs_a
+" madd x7, x8, x7, xzr \n\t" // cs_c
+" mov x8, #4 \n\t"
+" madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A.
+" \n\t"
+#ifdef _A64FX
+" mov x16, 0x20 \n\t" // Higher 6bit for Control#2:
+" lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong
+" orr x16, x16, x4 \n\t" // Stride.
+" msr S3_3_C11_C6_2, x16 \n\t" // Write system register.
+#endif
+" \n\t"
+" ldr x8, %[m_curr] \n\t" // Size of first dimension.
+" mov x9, xzr \n\t"
+" incd x9 \n\t"
+" ptrue p0.d \n\t"
+" whilelo p1.d, xzr, x8 \n\t"
+" whilelo p2.d, x9, x8 \n\t"
+" \n\t"
+" ldr x8, %[n_mker] \n\t" // Number of N-loops.
+" \n\t"
+" ldr x20, %[ai] \n\t" // Parameters to be reloaded
+" ldr x21, %[k_mker] \n\t" // within each millikernel loop.
+" ldr x22, %[k_left] \n\t"
+" ldr x23, %[alpha] \n\t"
+" ldr x24, %[beta] \n\t"
+" ldr x25, %[a_next] \n\t"
+" ldr x26, %[b_next] \n\t"
+" ldr x23, [x23] \n\t" // Directly load alpha and beta.
+" ldr x24, [x24] \n\t"
+" \n\t"
+" MILLIKER_MLOOP: \n\t"
+" \n\t"
+" mov x11, x0 \n\t" // B's address.
+// " ldr x10, %[ai] \n\t" // A's address.
+" mov x10, x20 \n\t"
+// " ldr x12, %[k_mker] \n\t"
+" mov x12, x21 \n\t"
+// " ldr x13, %[k_left] \n\t"
+" mov x13, x22 \n\t"
+#ifdef _A64FX
+" mov x16, 0x3 \n\t" // Tag A address.
+" lsl x16, x16, #56 \n\t"
+" orr x10, x10, x16 \n\t"
+" mov x16, 0xa \n\t" // Control#2 for A address.
+" lsl x16, x16, #60 \n\t"
+" orr x10, x10, x16 \n\t"
+#endif
+" \n\t"
+" cmp x12, #0 \n\t" // Don't preload if no microkernel there.
+" b.eq END_CCOL_PRFM \n\t"
+" \n\t"
+" mov x14, x11 \n\t"
+" ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row.
+" add x14, x14, x2 \n\t"
+" ld1rd z21.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z22.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z23.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z24.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z25.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z26.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z27.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left.
+" add x14, x14, x2 \n\t"
+" prfm PLDL1KEEP, [x14] \n\t"
+" sub x14, x14, x2 \n\t" // Restore x14 to load edge.
+" \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10)
+" add x16, x10, x4 \n\t"
+" prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A.
+" add x16, x16, x4 \n\t" // Step to the following column of A (was x10 + x4 again).
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x4 \n\t" // Step to the following column of A (was x10 + x4 again).
+" prfm PLDL1STRM, [x16] \n\t"
+" \n\t"
+" CCOL_PRFM: \n\t"
+" cmp x6, #1 \n\t"
+" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
+" mov x16, x5 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" add x16, x16, x7 \n\t"
+" prfm PLDL1STRM, [x16] \n\t"
+" END_CCOL_PRFM: \n\t"
+" \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+" \n\t"
+" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
+" b.eq K_LEFT_LOOP \n\t"
+" \n\t"
+" K_MKER_LOOP: \n\t"
+" \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" \n\t"
+" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
+" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
+" \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" b K_MKER_LOOP \n\t"
+" \n\t"
+" FIN_MKER_LOOP: \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2)
+" add x10, x10, x4 \n\t" // Forward A to fill the blank.
+" \n\t"
+" K_LEFT_LOOP: \n\t"
+" cmp x13, #0 \n\t" // End of execution.
+" b.eq WRITE_MEM_PREP \n\t"
+" \n\t"
+GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10)
+" mov x14, x11 \n\t"
+" ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B.
+" add x14, x14, x2 \n\t"
+" ld1rd z21.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z22.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z23.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z24.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z25.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z26.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z27.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z28.d, p0/z, [x14] \n\t"
+" add x14, x14, x2 \n\t"
+" ld1rd z29.d, p0/z, [x14] \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add x10, x10, x4 \n\t" // Forward A.
+" add x11, x11, x1 \n\t" // Forward B.
+" sub x13, x13, #1 \n\t"
+" b K_LEFT_LOOP \n\t" // Next column / row.
+" \n\t"
+" WRITE_MEM_PREP: \n\t"
+" \n\t"
+// " ldr x10, %[ai] \n\t"
+" mov x10, x20 \n\t"
+" add x11, x0, x3 \n\t"
+" dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors.
+" dup z31.d, x24 \n\t"
+" \n\t"
+" cmp x8, #1 \n\t"
+" b.eq PREFETCH_ABNEXT \n\t"
+" prfm PLDL1STRM, [x10] \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" add x11, x11, x2 \n\t"
+" prfm PLDL1KEEP, [x11] \n\t"
+" b WRITE_MEM \n\t"
+" \n\t"
+" PREFETCH_ABNEXT: \n\t"
+// " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
+" mov x1, x25 \n\t"
+// " ldr x2, %[b_next] \n\t"
+" mov x2, x26 \n\t"
+" prfm PLDL2KEEP, [x1] \n\t"
+" prfm PLDL2KEEP, [x1, 256*1] \n\t"
+" prfm PLDL2KEEP, [x1, 256*2] \n\t"
+" prfm PLDL2KEEP, [x1, 256*3] \n\t"
+" prfm PLDL2KEEP, [x1, 256*4] \n\t"
+" prfm PLDL2KEEP, [x1, 256*5] \n\t"
+" prfm PLDL2KEEP, [x1, 256*6] \n\t"
+" prfm PLDL2KEEP, [x1, 256*7] \n\t"
+" prfm PLDL2KEEP, [x1, 256*8] \n\t"
+" prfm PLDL2KEEP, [x1, 256*9] \n\t"
+" prfm PLDL2KEEP, [x1, 256*10] \n\t"
+" prfm PLDL2KEEP, [x1, 256*11] \n\t"
+" prfm PLDL2KEEP, [x1, 256*12] \n\t"
+" prfm PLDL2KEEP, [x1, 256*13] \n\t"
+" prfm PLDL2KEEP, [x1, 256*14] \n\t"
+" prfm PLDL2KEEP, [x1, 256*15] \n\t"
+" prfm PLDL2KEEP, [x2] \n\t"
+" prfm PLDL2KEEP, [x2, 256*1] \n\t"
+" prfm PLDL2KEEP, [x2, 256*2] \n\t"
+" prfm PLDL2KEEP, [x2, 256*3] \n\t"
+" prfm PLDL2KEEP, [x2, 256*4] \n\t"
+" prfm PLDL2KEEP, [x2, 256*5] \n\t"
+" prfm PLDL2KEEP, [x2, 256*6] \n\t"
+" prfm PLDL2KEEP, [x2, 256*7] \n\t"
+" prfm PLDL2KEEP, [x2, 256*8] \n\t"
+" prfm PLDL2KEEP, [x2, 256*9] \n\t"
+" \n\t"
+" WRITE_MEM: \n\t"
+" \n\t"
+" fmov d28, #1.0 \n\t"
+" fmov x16, d28 \n\t"
+" cmp x16, x23 \n\t"
+" b.eq UNIT_ALPHA \n\t"
+" \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+" \n\t"
+" UNIT_ALPHA: \n\t"
+" mov x9, x5 \n\t" // C address for loading.
+" \n\t" // C address for storing is x5 itself.
+" cmp x6, #1 \n\t"
+" b.ne WRITE_MEM_G \n\t"
+" \n\t"
+" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
+" \n\t" // Here used scratch: Z[20-29].
+" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
+" incb x13 \n\t"
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
+" \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7)
+" b END_WRITE_MEM \n\t"
+" \n\t"
+" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
+" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov x12, xzr \n\t"
+" incb x12 \n\t"
+" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
+" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
+" \n\t"
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16)
+" \n\t"
+" END_WRITE_MEM: \n\t"
+" subs x8, x8, #1 \n\t"
+" b.eq END_EXEC \n\t"
+" \n\t" // Address of C already forwarded to next column.
+" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel.
+" b MILLIKER_MLOOP \n\t"
+" \n\t"
+" END_ERROR: \n\t"
+" mov x0, #1 \n\t" // Return error.
+" END_EXEC: \n\t"
+" mov x0, #0 \n\t" // Return normal.
+:
+: [bi]     "m" (bi),
+  [rs_b]   "m" (rs_b),
+  [cs_b]   "m" (cs_b),
+  [ps_b]   "m" (ps_b),
+  [cs_a]   "m" (cs_a),
+  [ci]     "m" (ci),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [m_curr] "m" (m_curr),
+  [n_mker] "m" (n_mker),
+  [ai]     "m" (ai),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x10","x11","x12","x13","x14","x15","x16","x17",
+  "x20","x21","x22","x23","x24","x25","x26",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31","p0","p1","p2","cc","memory" // Predicates and flags are written; C is stored through pointers.
+    );
+  }
+}
+// Row-stored entry point: runs the column-stored kernel on the transposed problem (A/B, m/n and row/column strides swapped).
+void bli_dgemmsup_rv_armsve_10x2v_unindexed
+     (
+       conj_t              conjat,
+       conj_t              conjbt,
+       dim_t               m0t,
+       dim_t               n0t,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict at, inc_t rs_at0, inc_t cs_at0,
+       double*    restrict bt, inc_t rs_bt0, inc_t cs_bt0,
+       double*    restrict beta,
+       double*    restrict ct, inc_t rs_ct0, inc_t cs_ct0,
+       auxinfo_t* restrict datat,
+       cntx_t*    restrict cntx
+     )
+{
+  auxinfo_t data; // Local copy of datat with the A-side and B-side fields exchanged.
+  bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data );
+  bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data );
+  bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data );
+  bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data );
+  bli_dgemmsup_cv_armsve_2vx10_unindexed
+  (
+    conjbt, conjat,
+    n0t, m0t, k0,
+    alpha,
+    bt, cs_bt0, rs_bt0,
+    at, cs_at0, rs_at0,
+    beta,
+    ct, cs_ct0, rs_ct0,
+    &data,
+    cntx
+  );
+}
+
diff --git a/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c
new file mode 100644
index 0000000000..6bcea73f5d
--- /dev/null
+++
b/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c @@ -0,0 +1,412 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" +#include <assert.h> + +// Double-precision composite instructions. +#include "../armsve_asm_macros_double.h" + +// 2vx10 microkernels. +#include "../armsve_asm_2vx10.h" + +// Prototype reference kernel.
+GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 )
+
+// Double-precision gemmsup kernel for Arm SVE, row-stored-B ("rv") variant.
+//
+// Work split, as implemented below:
+//  - n: the n0 columns of C are taken 10 at a time by the inline-assembly
+//       loop; the trailing n0 % 10 columns are delegated to
+//       bli_dgemmsup_r_armsve_ref2().
+//  - m: rows are taken two SVE vectors at a time (vlen2 doubles, queried at
+//       run time); a final partial row block is covered by the p1/p2
+//       predicates set up with whilelo from m_left.
+//  - k: unrolled by 4 (k_mker), followed by a one-step remainder loop
+//       (k_left).
+//
+// Requirements visible here: cs_b0 must be 1 (asserted). C takes the
+// WRITE_MEM_C path when rs_c0 == 1 and the WRITE_MEM_G path (index vector
+// built from rs_c0) otherwise. A is read through an index vector built from
+// rs_a0, and its base advances by ps_a (from the auxinfo) per row block.
+//
+// NOTE(review): the update is presumably C := beta*C + alpha*A*B; the exact
+// arithmetic lives in the macros of armsve_asm_2vx10.h, which are not visible
+// here - confirm there. conja/conjb are only forwarded to the reference
+// kernel.
+//
+// NOTE(review): optimize(0) presumably keeps every "m" operand in a stack
+// slot and keeps the non-local asm labels from being duplicated - confirm
+// before removing it.
+void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+  // r*r requires B to be stored in rows.
+  assert(cs_b0 == 1);
+
+  dim_t n0_mker = n0 / 10;
+  dim_t n0_left = n0 % 10;
+
+  if ( n0_left )
+  {
+    // A[:, ::]
+    // B[::, n0_mker*10:n0]
+    // C[: , n0_mker*10:n0]
+    double *ai = a;
+    double *bi = b + n0_mker * 10 * cs_b0;
+    double *ci = c + n0_mker * 10 * cs_c0;
+    bli_dgemmsup_r_armsve_ref2
+    (
+      conja, conjb,
+      m0, n0_left, k0,
+      alpha,
+      ai, rs_a0, cs_a0,
+      bi, rs_b0, cs_b0,
+      beta,
+      ci, rs_c0, cs_c0,
+      data,
+      cntx
+    );
+  }
+  // Return if it's a pure edge case. An empty row range must also stop here:
+  // with m_mker == 0 the assembly loop below would run a full unpredicated
+  // tile and its down-counter would never hit the exit test.
+  if ( !n0_mker || !m0 )
+    return;
+
+  // Determine VL: vlen2 = number of doubles held by two SVE vectors.
+  uint64_t vlen2;
+  __asm__ (
+  " mov x0, xzr \n\t"
+  " incd x0, ALL, MUL #2 \n\t"
+  " mov %[vlen2], x0 \n\t"
+  : [vlen2] "=r" (vlen2)
+  :
+  : "x0"
+  );
+
+  uint64_t rs_c = rs_c0;
+  uint64_t cs_c = cs_c0;
+  uint64_t rs_a = rs_a0;
+  uint64_t cs_a = cs_a0;
+  uint64_t rs_b = rs_b0;
+  // uint64_t cs_b = 1;
+
+  uint64_t k_mker = k0 / 4;
+  uint64_t k_left = k0 % 4;
+  uint64_t m_mker = m0 / vlen2;
+  uint64_t m_left = m0 % vlen2;
+  if ( m_left )
+  {
+    // Edge case on A side can be handled with one more (predicated) loop.
+    m_mker++;
+  } else
+    m_left = vlen2;
+  uint64_t ps_a = bli_auxinfo_ps_a( data );
+  // uint64_t ps_b = bli_auxinfo_ps_b( data );
+
+  for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker )
+  {
+    double *ai = a;
+    double *bi = b + in0_mker * 10 * cs_b0;
+    double *ci = c + in0_mker * 10 * cs_c0;
+
+    void* a_next = bli_auxinfo_next_a( data );
+    void* b_next = bli_auxinfo_next_b( data );
+
+    __asm__ volatile (
+" ldr x0, %[ai] \n\t"
+" ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]).
+" ldr x2, %[cs_a] \n\t" // Column-skip of A.
+" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A.
+" ldr x4, %[rs_b] \n\t" // Row-Skip of B.
+" \n\t" // Element skip of B[l, :] is guaranteed to be 1.
+" ldr x5, %[ci] \n\t"
+" ldr x6, %[rs_c] \n\t" // Row-skip of C.
+" ldr x7, %[cs_c] \n\t" // Column-skip of C.
+#ifdef _A64FX
+" mov x16, 0x1 \n\t" // Tag C address.
+" lsl x16, x16, #56 \n\t"
+" orr x5, x5, x16 \n\t"
+" mov x16, 0x2 \n\t" // Tag A address.
+" lsl x16, x16, #56 \n\t"
+" orr x0, x0, x16 \n\t"
+#endif
+" \n\t"
+" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
+" madd x2, x8, x2, xzr \n\t" // cs_a
+" madd x3, x8, x3, xzr \n\t" // ps_a
+" madd x4, x8, x4, xzr \n\t" // rs_b
+" madd x7, x8, x7, xzr \n\t" // cs_c
+" mov x8, xzr \n\t"
+" incb x8 \n\t"
+" madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip.
+" mov x8, #4 \n\t"
+" madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A.
+// " mov x8, #4 \n\t"
+// " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B.
+" \n\t"
+" ldr x8, %[m_mker] \n\t" // Number of M-loops.
+" ptrue p0.d \n\t"
+" ptrue p1.d \n\t"
+" ptrue p2.d \n\t"
+" \n\t"
+" MILLIKER_MLOOP: \n\t"
+" \n\t"
+" cmp x8, #1 \n\t"
+" b.ne UKER_BEGIN \n\t"
+" \n\t"
+" ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop.
+" mov x11, xzr \n\t"
+" incd x11 \n\t"
+" whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2.
+" whilelo p2.d, x11, x10 \n\t"
+" \n\t"
+" UKER_BEGIN: \n\t"
+" mov x10, x0 \n\t" // A's address.
+" ldr x11, %[bi] \n\t" // B's address.
+" ldr x12, %[k_mker] \n\t"
+" ldr x13, %[k_left] \n\t"
+#ifdef _A64FX
+" mov x16, 0x3 \n\t" // Tag B address.
+" lsl x16, x16, #56 \n\t"
+" orr x11, x11, x16 \n\t"
+#endif
+" \n\t"
+" mov x16, x11 \n\t" // Prefetch first kernel of B.
+" prfm PLDL1KEEP, [x16] \n\t"
+" add x16, x16, x4 \n\t"
+" prfm PLDL1KEEP, [x16] \n\t"
+" add x16, x16, x4 \n\t"
+" prfm PLDL1KEEP, [x16] \n\t"
+" add x16, x16, x4 \n\t"
+" prfm PLDL1KEEP, [x16] \n\t"
+" \n\t"
+" ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row.
+" ld1rd z21.d, p0/z, [x11, #8] \n\t"
+" ld1rd z22.d, p0/z, [x11, #16] \n\t"
+" ld1rd z23.d, p0/z, [x11, #24] \n\t"
+" ld1rd z24.d, p0/z, [x11, #32] \n\t"
+" ld1rd z25.d, p0/z, [x11, #40] \n\t"
+" ld1rd z26.d, p0/z, [x11, #48] \n\t"
+" ld1rd z27.d, p0/z, [x11, #56] \n\t"
+" \n\t"
+" index z29.d, xzr, x1 \n\t" // First A column.
+" \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16)
+" \n\t"
+CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19)
+" \n\t"
+" cmp x12, #0 \n\t" // If no 4-microkernel can be applied
+" b.eq K_LEFT_LOOP \n\t"
+" \n\t"
+" K_MKER_LOOP: \n\t" // Unroll the 4-loop.
+" \n\t"
+" index z31.d, xzr, x1 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" \n\t"
+" index z29.d, xzr, x1 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" \n\t"
+" index z31.d, xzr, x1 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" \n\t"
+" subs x12, x12, #1 \n\t" // Decrease counter before final replica.
+" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
+" \n\t"
+" index z29.d, xzr, x1 \n\t"
+GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm)
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" b K_MKER_LOOP \n\t"
+" \n\t"
+" FIN_MKER_LOOP: \n\t"
+GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4)
+" add x10, x10, x2 \n\t" // Forward A to fill the blank.
+" \n\t"
+" K_LEFT_LOOP: \n\t"
+" cmp x13, #0 \n\t"
+" b.eq WRITE_MEM_PREP \n\t"
+" \n\t"
+" index z31.d, xzr, x1 \n\t"
+GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16)
+" ld1rd z20.d, p0/z, [x11] \n\t"
+" ld1rd z21.d, p0/z, [x11, #8] \n\t"
+" ld1rd z22.d, p0/z, [x11, #16] \n\t"
+" ld1rd z23.d, p0/z, [x11, #24] \n\t"
+" ld1rd z24.d, p0/z, [x11, #32] \n\t"
+" ld1rd z25.d, p0/z, [x11, #40] \n\t"
+" ld1rd z26.d, p0/z, [x11, #48] \n\t"
+" ld1rd z27.d, p0/z, [x11, #56] \n\t"
+" ld1rd z28.d, p0/z, [x11, #64] \n\t"
+" ld1rd z29.d, p0/z, [x11, #72] \n\t"
+GEMM_FMLA2(z0,z1,p0,z30,z31,z20)
+GEMM_FMLA2(z2,z3,p0,z30,z31,z21)
+GEMM_FMLA2(z4,z5,p0,z30,z31,z22)
+GEMM_FMLA2(z6,z7,p0,z30,z31,z23)
+GEMM_FMLA2(z8,z9,p0,z30,z31,z24)
+GEMM_FMLA2(z10,z11,p0,z30,z31,z25)
+GEMM_FMLA2(z12,z13,p0,z30,z31,z26)
+GEMM_FMLA2(z14,z15,p0,z30,z31,z27)
+GEMM_FMLA2(z16,z17,p0,z30,z31,z28)
+GEMM_FMLA2(z18,z19,p0,z30,z31,z29)
+" add x10, x10, x2 \n\t" // Forward A.
+" add x11, x11, x4 \n\t" // Forward B.
+" sub x13, x13, #1 \n\t"
+" b K_LEFT_LOOP \n\t" // Next column / row.
+" \n\t"
+" WRITE_MEM_PREP: \n\t"
+" \n\t"
+" ldr x11, %[bi] \n\t"
+" ldr x12, %[alpha] \n\t" // Load alpha & beta.
+" ldr x13, %[beta] \n\t"
+" ld1rd z30.d, p0/z, [x12] \n\t"
+" ld1rd z31.d, p0/z, [x13] \n\t"
+" ldr x12, [x12] \n\t"
+" \n\t"
+" cmp x8, #1 \n\t"
+" b.eq PREFETCH_ABNEXT \n\t"
+" prfm PLDL2STRM, [x11] \n\t"
+" b WRITE_MEM \n\t"
+" \n\t"
+" PREFETCH_ABNEXT: \n\t"
+" ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed.
+" ldr x2, %[b_next] \n\t"
+" prfm PLDL2KEEP, [x1] \n\t"
+" prfm PLDL2KEEP, [x1, 256*1] \n\t"
+" prfm PLDL2KEEP, [x1, 256*2] \n\t"
+" prfm PLDL2KEEP, [x1, 256*3] \n\t"
+" prfm PLDL2KEEP, [x1, 256*4] \n\t"
+" prfm PLDL2KEEP, [x1, 256*5] \n\t"
+" prfm PLDL2KEEP, [x1, 256*6] \n\t"
+" prfm PLDL2KEEP, [x1, 256*7] \n\t"
+" prfm PLDL2KEEP, [x1, 256*8] \n\t"
+" prfm PLDL2KEEP, [x1, 256*9] \n\t"
+" prfm PLDL2KEEP, [x1, 256*10] \n\t"
+" prfm PLDL2KEEP, [x1, 256*11] \n\t"
+" prfm PLDL2KEEP, [x1, 256*12] \n\t"
+" prfm PLDL2KEEP, [x1, 256*13] \n\t"
+" prfm PLDL2KEEP, [x1, 256*14] \n\t"
+" prfm PLDL2KEEP, [x1, 256*15] \n\t"
+" prfm PLDL2KEEP, [x2] \n\t"
+" prfm PLDL2KEEP, [x2, 256*1] \n\t"
+" prfm PLDL2KEEP, [x2, 256*2] \n\t"
+" prfm PLDL2KEEP, [x2, 256*3] \n\t"
+" prfm PLDL2KEEP, [x2, 256*4] \n\t"
+" prfm PLDL2KEEP, [x2, 256*5] \n\t"
+" prfm PLDL2KEEP, [x2, 256*6] \n\t"
+" prfm PLDL2KEEP, [x2, 256*7] \n\t"
+" prfm PLDL2KEEP, [x2, 256*8] \n\t"
+" prfm PLDL2KEEP, [x2, 256*9] \n\t"
+" \n\t"
+" WRITE_MEM: \n\t"
+" \n\t"
+" fmov d28, #1.0 \n\t"
+" fmov x16, d28 \n\t"
+" cmp x16, x12 \n\t"
+" b.eq UNIT_ALPHA \n\t"
+" \n\t"
+SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30)
+" \n\t"
+" UNIT_ALPHA: \n\t"
+" mov x9, x5 \n\t" // C address for loading.
+" mov x10, x5 \n\t" // C address for storing.
+" cmp x6, #1 \n\t"
+" b.ne WRITE_MEM_G \n\t"
+" \n\t"
+" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
+" \n\t" // Here used scratch: Z[20-29].
+" mov x13, xzr \n\t" // C-column's physical 1-vector skip.
+" incb x13 \n\t"
+GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7)
+" \n\t"
+GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7)
+" b END_WRITE_MEM \n\t"
+" \n\t"
+" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
+" \n\t" // Here used scratch: Z[20-30] - Z30 as index.
+" mov x12, xzr \n\t"
+" incb x12 \n\t"
+" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip.
+" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
+GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16)
+GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
+GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16)
+" \n\t"
+GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16)
+GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
+GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16)
+" \n\t"
+" END_WRITE_MEM: \n\t"
+" subs x8, x8, #1 \n\t"
+" b.eq END_EXEC \n\t"
+" \n\t"
+" add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel.
+" add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel.
+" add x5, x5, x13 \n\t"
+" b MILLIKER_MLOOP \n\t"
+" \n\t"
+" END_ERROR: \n\t"
+" mov x0, #1 \n\t" // Return error.
+" END_EXEC: \n\t"
+" mov x0, #0 \n\t" // Return normal.
+:
+: [ai]     "m" (ai),
+  [rs_a]   "m" (rs_a),
+  [cs_a]   "m" (cs_a),
+  [ps_a]   "m" (ps_a),
+  [rs_b]   "m" (rs_b),
+  [ci]     "m" (ci),
+  [rs_c]   "m" (rs_c),
+  [cs_c]   "m" (cs_c),
+  [m_mker] "m" (m_mker),
+  [m_left] "m" (m_left),
+  [bi]     "m" (bi),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha]  "m" (alpha),
+  [beta]   "m" (beta),
+  [a_next] "m" (a_next),
+  [b_next] "m" (b_next)
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8",
+  "x9","x10","x11","x12","x13","x14","x15","x16",//"x17",
+  "z0","z1","z2","z3","z4","z5","z6","z7",
+  "z8","z9","z10","z11","z12","z13","z14","z15",
+  "z16","z17","z18","z19",
+  "z20","z21","z22","z23",
+  "z24","z25","z26","z27",
+  "z28","z29","z30","z31",
+  // The block stores to C through pointers that are not operands, changes
+  // the condition flags (cmp/subs) and overwrites predicates p0-p2.
+  "p0","p1","p2","cc","memory"
+  );
+  }
+}
+
diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index a5934312a0..3ccd79b68e 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -33,5 +33,13 @@ */ GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) +GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) +GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed ) +GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk ) +PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk ) +PACKM_KER_PROT( double, d, packm_armsve512_asm_12xk ) +PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk ) From 91d3636031021af3712d14c9fcb1eb34b6fe2a31 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 15 May 2021 17:05:16 +0900 Subject: [PATCH 014/226] Travis Support Arm SVE - Updated distro to 20.04 focal aarch64-gcc-10. This is minimal version required by aarch64-gcc-10. SVE intrinsics would not compile without GCC >=10. - x86 toolchains use official repo instead of ubuntu-toolchain-r/test. 20.04 focal is not supported by that PPA at the moment.
- Add extra configuration-time options to .travis.yml. - Add Arm SVE entry to .travis.yml. --- .travis.yml | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index bbae9a7d9f..b0d10749f8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,80 +1,83 @@ language: c sudo: required -dist: trusty +dist: focal matrix: include: # full testsuite (all tests except for mixed datatype) - os: linux compiler: gcc - env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \ + PACKAGES="gcc-8 binutils" # mixed-datatype testsuite (gemm_nn only) - os: linux compiler: gcc - env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto" \ + PACKAGES="gcc-8 binutils" # salt testsuite (fast set of operations+parameters) - os: linux compiler: gcc - env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto" \ + PACKAGES="gcc-8 binutils" # test x86_64 ukrs with SDE - os: linux compiler: gcc - env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" + env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" \ + PACKAGES="gcc-8 binutils" # openmp build - os: linux compiler: gcc - env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" + env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" \ + PACKAGES="gcc-8 binutils" # pthreads build - os: linux compiler: gcc - env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" + env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" \ + PACKAGES="gcc-8 binutils" # out-of-tree build - os: linux compiler: gcc - env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" + env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" \ + PACKAGES="gcc-8 binutils" # clang build - os: linux compiler: clang - env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" \ + PACKAGES="clang-8 binutils" # macOS with system compiler (clang) - os: osx compiler: clang - env: OOT=0 
TEST=1 SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \ + PACKAGES="clang-8 binutils" # cortexa15 build and fast testsuite (qemu) - os: linux compiler: arm-linux-gnueabihf-gcc env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \ - PACKAGES="gcc-arm-linux-gnueabihf qemu-system-arm qemu-user" \ + PACKAGES="gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/" # cortexa57 build and fast testsuite (qemu) - os: linux compiler: aarch64-linux-gnu-gcc env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \ - PACKAGES="gcc-aarch64-linux-gnu qemu-system-arm qemu-user" \ + PACKAGES="gcc-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/" + # armsve build and fast testsuite (qemu) + - os: linux + compiler: aarch64-linux-gnu-gcc-10 + env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \ + PACKAGES="gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ + TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/" install: -- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi -- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi -- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi -- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi -- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi +- if [ "$CC" = "clang" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="clang-8"; fi +- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8" ; fi - if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-6 - - binutils-2.26 - - clang script: - export DIST_PATH=. 
-- pwd - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi +- $DIST_PATH/configure -t $THR CC=$CC CFLAGS=$CFLAGS $CONF +- $CC --version - pwd -- $DIST_PATH/configure -t $THR CC=$CC $CONF -- pwd +- env - ls -l -- $CC --version -- make -j 2 +- make -j 2 V=1 - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi - if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi From bd156a210d347a073a6939cc4adab3d9256c2e2b Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sun, 16 May 2021 02:56:14 +0900 Subject: [PATCH 015/226] Adjust TravisCI - ArmSVE don't test gemmt (seems Qemu-only problem); - Clang use TravisCI-provided version instead of fixing to clang-8 due to that clang-8 seems conflicting with TravisCI's clang-7. --- .travis.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index b0d10749f8..3a1a6f50bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,13 +41,14 @@ matrix: # clang build - os: linux compiler: clang - env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" \ - PACKAGES="clang-8 binutils" + env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" + # There seems to be some difficulty installing 2 Clang toolchains of different versions. + # Use the TravisCI default. 
+ # PACKAGES="clang-8 binutils" # macOS with system compiler (clang) - os: osx compiler: clang - env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \ - PACKAGES="clang-8 binutils" + env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" # cortexa15 build and fast testsuite (qemu) - os: linux compiler: arm-linux-gnueabihf-gcc @@ -67,9 +68,8 @@ matrix: PACKAGES="gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/" install: -- if [ "$CC" = "clang" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="clang-8"; fi -- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8" ; fi -- if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi +- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi +- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi script: - export DIST_PATH=. - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi @@ -79,5 +79,7 @@ script: - env - ls -l - make -j 2 V=1 +# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx). 
+- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi - if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi From 932dfe6abb9617223bd26a249e53447169033f8c Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Thu, 20 May 2021 02:07:31 +0900 Subject: [PATCH 016/226] Travis CI Revert Unnecessary Extras from 91d3636 - Removed `V=1` in make line - Removed `CFLAGS` in configure line - Restored `pwd` surrounding OOT line --- .travis.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3a1a6f50bd..34e7aa74b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -72,13 +72,14 @@ install: - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi script: - export DIST_PATH=. +- pwd - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi -- $DIST_PATH/configure -t $THR CC=$CC CFLAGS=$CFLAGS $CONF -- $CC --version - pwd -- env +- $DIST_PATH/configure -t $THR CC=$CC $CONF +- pwd - ls -l -- make -j 2 V=1 +- $CC --version +- make -j 2 # Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx). - if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi From 859fb77a320a3ace71d25a8885c23639b097a1b6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 23 May 2021 18:15:23 -0500 Subject: [PATCH 017/226] Remove `rm-dupls` function in common.mk. AMD requested removal due to unclear licensing terms; original code was from stackoverflow. The function is unused but could easily be replaced by new implementation.
--- common.mk | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common.mk b/common.mk index 113571382c..4a5c5b8d5e 100644 --- a/common.mk +++ b/common.mk @@ -202,12 +202,6 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),))) files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f)))) -# Define a function that removes duplicate words from a list. -# NOTE: This function was obtained via [1]; thanks bobbogo for this -# concise definition. -# [1] https://stackoverflow.com/questions/16144115/makefile-remove-duplicate-words-without-sorting -rm-dupls = $(if $1,$(firstword $1) $(call rm-dupls,$(filter-out $(firstword $1),$1))) - # # --- Include makefile configuration file -------------------------------------- From 5feb04e233e1e6f81c727578ad9eae1367a2562f Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 23 May 2021 18:46:56 -0500 Subject: [PATCH 018/226] Add explicit compiler check for Windows. Check the C compiler for a predefined macro `_WIN32` to indicate (cross-)compilation for Windows. Fixes #463. --- configure | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configure b/configure index 67d87a1b01..92a6c229c8 100755 --- a/configure +++ b/configure @@ -2373,6 +2373,11 @@ main() fi echo "${script_name}: using '${found_cc}' C compiler." + + # Also check the compiler to see if we are (cross-)compiling for Windows + if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then + is_win=yes + fi # -- Find a C++ compiler --------------------------------------------------- From 82af05f54c34526a60fd2ec46656f13e1ac8f719 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 25 May 2021 15:25:08 -0500 Subject: [PATCH 019/226] Updated Fugaku (a64fx) performance results. 
Details: - Updated the performance graphs (pdfs and pngs) for the Fugaku/a64fx entry within Performance.md, and also updated the experiment details accordingly. Thanks to RuQing Xu for re-running the BLIS and SSL2 experiments reflected in this commit. - In Performance.md, added an English translation of the project name under which the Fugaku results were gathered, courtesy of RuQing Xu. --- docs/Performance.md | 32 +++++++----------- .../large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf | Bin 0 -> 23848 bytes .../large/l3_perf_a64fx_jc1ic1jr12_nt12.png | Bin 0 -> 256360 bytes .../large/l3_perf_a64fx_jc1ic2jr6_nt12.pdf | Bin 23854 -> 0 bytes .../large/l3_perf_a64fx_jc1ic2jr6_nt12.png | Bin 254420 -> 0 bytes .../large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf | Bin 24191 -> 24234 bytes .../large/l3_perf_a64fx_jc1ic4jr12_nt48.png | Bin 264502 -> 265681 bytes docs/graphs/large/l3_perf_a64fx_nt1.pdf | Bin 29879 -> 29872 bytes docs/graphs/large/l3_perf_a64fx_nt1.png | Bin 255183 -> 255532 bytes test/3/octave/runthese.m | 6 ++-- 10 files changed, 16 insertions(+), 22 deletions(-) create mode 100644 docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf create mode 100644 docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png delete mode 100644 docs/graphs/large/l3_perf_a64fx_jc1ic2jr6_nt12.pdf delete mode 100644 docs/graphs/large/l3_perf_a64fx_jc1ic2jr6_nt12.png diff --git a/docs/Performance.md b/docs/Performance.md index 0a296c12a7..be287716d2 100644 --- a/docs/Performance.md +++ b/docs/Performance.md @@ -534,7 +534,7 @@ The `runthese.m` file will contain example invocations of the function. 
### A64fx experiment details * Location: RIKEN Center of Computational Science in Kobe, Japan - * These test results were gathered on the Fugaku supercomputer under project "量子物質の創発と機能のための基礎科学 ―「富岳」と最先端実験の密連携による革新的強相関電子科学" (hp200132) + * These test results were gathered on the Fugaku supercomputer under project "量子物質の創発と機能のための基礎科学 ―「富岳」と最先端実験の密連携による革新的強相関電子科学" (hp200132) (Basic Science for Emergence and Functionality in Quantum Matter: Innovative Strongly-Correlated Electron Science by Integration of "Fugaku" and Frontier Experiments) * Processor model: Fujitsu A64fx * Core topology: one socket, 4 NUMA groups per socket, 13 cores per group (one reserved for the OS), 48 cores total * SMT status: Unknown @@ -546,23 +546,17 @@ The `runthese.m` file will contain example invocations of the function. * multicore: 70.4 GFLOPS/core (double-precision), 140.8 GFLOPS/core (single-precision) * Operating system: RHEL 8.3 * Page size: 256 bytes -* Compiler: gcc 9.3.0 -* Results gathered: 2 April 2021 +* Compiler: gcc 10.1.0 +* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021 * Implementations tested: - * BLIS 757cb1c (post-0.8.1) - * configured with `./configure -t openmp --sve-vector-size=vla CFLAGS="-D_A64FX -DPREFETCH256 -DSVE_NO_NAT_COMPLEX_KERNELS" arm64_sve` (single- and multithreaded) - * sub-configuration exercised: `arm64_sve` - * Single-threaded (1 core) execution requested via: - * `export BLIS_SVE_KC_D=2048 BLIS_SVE_MC_D=128 BLIS_SVE_NC_D=26880 BLIS_SVE_KERNEL_IDX_D=14` (double precision) - * `export BLIS_SVE_KC_S=2048 BLIS_SVE_MC_S=256 BLIS_SVE_NC_S=23040 BLIS_SVE_KERNEL_IDX_S=2` (single precision) - * Multithreaded (12 core) execution requested via: - * `export BLIS_JC_NT=1 BLIS_IC_NT=2 BLIS_JR_NT=6` - * `export BLIS_SVE_KC_D=2400 BLIS_SVE_MC_D=64 BLIS_SVE_NC_D=26880 BLIS_SVE_KERNEL_IDX_D=14` (double precision) - * `export BLIS_SVE_KC_S=2400 BLIS_SVE_MC_S=128 BLIS_SVE_NC_S=23040 BLIS_SVE_KERNEL_IDX_S=2` (single precision) - * Multithreaded (48 
core) execution requested via: - * `export BLIS_JC_NT=1 BLIS_IC_NT=4 BLIS_JR_NT=12` - * `export BLIS_SVE_KC_D=2048 BLIS_SVE_MC_D=128 BLIS_SVE_NC_D=26880 BLIS_SVE_KERNEL_IDX_D=14` (double precision) - * `export BLIS_SVE_KC_S=2048 BLIS_SVE_MC_S=256 BLIS_SVE_NC_S=23040 BLIS_SVE_KERNEL_IDX_S=2` (single precision) + * BLIS 61584de (post-0.8.1) + * configured with: + * `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded) + * `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded) + * sub-configuration exercised: `a64fx` + * Single-threaded (1 core) execution requested via no change in environment variables + * Multithreaded (12 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=12` + * Multithreaded (48 core) execution requested via `export BLIS_JC_NT=1 BLIS_IC_NT=4 BLIS_JR_NT=12` * Eigen 3.3.9 * Obtained via the [Eigen GitLab homepage](https://gitlab.com/libeigen/eigen) * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas` @@ -593,7 +587,7 @@ The `runthese.m` file will contain example invocations of the function. #### pdf * [A64fx single-threaded](graphs/large/l3_perf_a64fx_nt1.pdf) -* [A64fx multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic2jr6_nt12.pdf) +* [A64fx multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf) * [A64fx multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf) #### png (inline) @@ -601,7 +595,7 @@ The `runthese.m` file will contain example invocations of the function. 
* **A64fx single-threaded** ![single-threaded](graphs/large/l3_perf_a64fx_nt1.png) * **A64fx multithreaded (12 cores)** -![multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic2jr6_nt12.png) +![multithreaded (12 cores)](graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png) * **A64fx multithreaded (48 cores)** ![multithreaded (48 cores)](graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png) diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e273d1d098d5a1a8517e6939b6aa6d3645ed0451 GIT binary patch literal 23848 zcmZ^qWmH^Ev#22iNC*<#3GVJraCZiGcN<)TySux)ySux)Gk6FVB;3jOe($;W{J1}6 zbyrt)@7`-oO?5q8L#7}iM#n_Y0Y^4+5t9!`MkZ zBC0AWEXrqZZ0hQ0;RGHrmNBq3a|YYUn7Ei&^D%=*L~RUBj0LT&#q4aIfP7Al&VSA1 z?VT)aEIds9mH$~tO;SWfLP1_qPDPoIiG_>uN0b=A22Li9gbZTV;Fl9IF|so@;pc|~Iyssc*uc4E9(75_*zmYyt@ZQ}r$1uE z<_(PyW8yB=LT*qBi>-xUe72R8>af;Brv>!Y`g%_>>EY2l@VgW56F^dHOQgB88&$Vu z2ON@82>f|WIcVy{dlGox>G^$Ma_j&867qhx@bfmrk$|D7is)Tu+NJY#VCU_w#4dS8 zNq2>kVMYD&o|d8KkBYcnjmquLd@rTLzvS8f%b@Ol~sp#Y>#`m>F;H^U7&)v7|&a$;}E1Ib7#f=4yh3FNJ?aKKT zoJ(hyha)l$J&UC7QEIQM_4V`iQMJU{8XNTPRch}nao;Mh7)8flNvfa-uzBWNQ_jVN zv~Bl1wRhaopY)h*J&NDqC=X3l{g_%ddR2ET&c95^T=fi^ylrTd)||VSGZ<*vUf?P& zt+c%FwD^xk56W-(l2)XvoJU*tsoTo6`ue>Y%`PCw@l;Zo6_k4S)bNQ&w)6J*+ zs{`j(Tk_7!Zx*|%{i?mFH^`c5RbLyc&WE*&rnd~e!^@fFmznqH6pv1Vb1Z@PBZeb4 zz87_Y_nhA=zV8uU3tD|1LA(>^4r^@h2fpzwcdoZNdcGF2?-g?|^&HN5RdFo<-9HgIfbURX(%Uc~dU8`p9qY^TctnKTh8lr3bf4 zlC^7~95vNEtES*^8a1Oe6(*|c&a8SGjqyr_S7;BK%>6R+Cff2UgZ<|ZEeH4GUCJsX z`!tpvI`<8w=^V+Fl&h4xk?1&ID*m{{0Qm80e2Uu=mth~t$XfB~(j2j&$QH`|=VO9q zWOZ|1jVvWozNBf+>)!9bK>y4Y`IARh(&(UUbNFR;f+s{9VLRIkQPUtS_0WSpakLAG z4&=yZNy1Sj9(CTk(Wf|ZWr^`y{1JuEGTV_)S~@s|wvC{vt$Yi#R5&H-w^_QqOm_b} zEX2=L7r$m+Hv|22M!6bVK7aUEqBz~R22Iwkc53wDa|?Fi8D*qS!zK+ESOCA87bc~J zhjY^?c1<~12!EI+eOqd6pJ!>)_|WN6<;3fv;?JcCmXz)ESKHfd1{?BnHi&u0@ukur zuXzudglUEIbqO?Yh9=B59Nr3-RW%}48npuUs|?qh2` 
z9JfQ!uJ~5`uCnW(?UGmhoyozX1VZ$^ngcyoMeMDZe-5QW{?Ap**=@p^-MWU3x8rq* z_c&E|fd&A5Upc{JA=|5?kk_3%MVcwl_0$t-EzGa0u&TX4btMrbCp#?b0^2PXpT!$} zhVx|!pSW4BJfXyEElkKIr1O_peFBHcl6*O${2S>~*^o2p7E!TtgdEOt`TNv6JEZ1< zhZWM#BU%hXY9pI9r5_a3)e+E~BOW?Q(5=S~$~#>58iCIuxet)-Pd888l1Mi5CVGWd z^0Y^DIi&oH*C{FHM#@KhdWHN{ZNV+k<;-Yq6Y*6BJQqf*Re9ZWT2`xtQ`9ZXwcn9qzt*jmDtYsWChiA5&%Tw|*Ybchz_|bFT^P@STzT>pTQGT z9IU<^?d`Cx6-}2vHrsbveO3!))#^X@+CgbocadK0XxSnf-`;5nYF*XR3N2BvwJ}I3 zx85{fD?!ra4VUB3@2}csV~F`C^F#A#V<7^{R#6D%{1)30+frRn33);6?POu> zL*wDDi@dZ4*=1P{`usu@_>)$pBx;9>fmwTpuOzW4zhDI#0xh`CnRK z1-|KNSrN|y>;~v(bQA8@Zk%HQ&*KJDccKI7AeyRqOpTx#zDn$FH+2>-n~IwuRS-=G z+X5znRdG$Dl%8=?x1&0X_gt;%vsA^MsOrl?w87NP&_KG@R|9YavwR6Rn7Xt=vaO=Fc?8VyjX1hl%jRgty>7%gWWI&0E;aCJj=D1m`HrX|yLh z!C%ukp;r7EX2cIBD0aTu04Pq?0BkDjbWj92&_=`ZYJ!>O^7EQR;SuVp#@wHoq=_pO zy+@JQ-+JA`RnaH>i|857n@VMyuIio650#v9Yu5?&8Dy(Pg$oMKuiXt*#9&DmzqixZ zaF-EfkpY#85}bxU?G};UmKYQSvj15`R&6p72VLipv_9E5ONf^iz}&vxKYOOoFroIW z`w;nv&#Vy#`s^p2f7-3}i;i|}a{74FC{Yk2Tn){BwD>Es+S@{Xb>UL@EY}dS@+E6t zpx#5Fa5ih5w(TZQF=nyfqibG4 zpAo^KBuf5)%)6wX7jd|O=cwIUscC6JniM8192Yh#Uzioy-g;+1;|`v)_)Om2d*aOc z1#X5~Z-bV9v9(cTKZYP6CMDT;%U4YHeUB)rO4_`po}TGb36t=t5;yNj1tLljJyK+u zEQV>HbgO#me8W%#iGfrNy0;1nc`#w)LPmcjlO}e*b<=iG7=@mCnq$^7i-bjfG#-Uj z!&phRfqW>?`CBfM4Lyi>{gatF(7@Utvrv|6xguw_IJzZSZe2SxmQ7X4*#+4jaW{~6 z$aJq@?N(cdxbU3Ry_()|?kmw@TP|airBH6gjBQ`(7bjky??c8KPG=vMj96C@ z{pdTFR3&n6wZPPBiWh~YKhN|%RaF-ksF9qC*2^W4r5`{I?DZKsZ z$%)CF6mKynII4n^x4_*_q)b&8>+>jdl#p|6Sm{Tk-)5A)kw`N?%hK`Lpy8nC@xO;# zAk%W{JW7h7*(geQWIk;>!yOMvfx8r?4yWb>$}O77UYO7OE|yj9UZ1xY$1u)LbRB`- z1}7tC*q2mrh<|iXK8!~y~N=CzsF9KyTooz zXHu_A*2Ei8u)`OW+zpkH>{*rM1v1pH=eUT-@0}-e$NP`6qCF0&Zm5aKZ(tTh7uLuK zOX_Welb*~iJTw(hpjZ~>71l72rb>J*(U9#K`Fc*`e6rDzep6TM(Z7+_vUUIdod1drO553b6J$#+ziZPmdFFxD2gGLfh zIIFKYF%qZ}&C9t!({zN|F_FC+O8K|B<(%joq+uCmMROd2H>J6Rhs)_Y!(qGOZ_`>7laJd(>gLb>(l379=x9`nQ+$Xo+QJIk#UK={ROx&e7#S-(8F|N72t3$5ZF& zdf}VdF)e{a2@r45$KFK_8bdUSdUb6fL{%knfd_!qE~9xhmcyH%bDhfb@5B=Z33tjL zsI%a59D(ahY4e95UiM*OKS`7KL!-01A$_A*h=7$v0( 
zO;f&z>5v&0jSzd{kq@Z*D;9f4*7G+8+_0<4r@QxS z`(x2pqus0xcid4S+b#gBeqivLFrp z8&$L3d(s7uN~PM)@49HkgK@J3W?2=#6I+*Nf5j6XF1D@M$}xOASu)LE8i{Xl>@{2y zVPzu@5xva$R4b7}!7sT?X1XfBOjgYAgh=G%zIQOMYkqp$PM>j(sG;kY&0UII-+9xy z-e%%%Ad=4>U-W>{*n?Z&$*3lvk-Tziv8zzj>4gg?sYozVYY~E0BvSvdS44>RNi z4h5-XrkJngvv5dsRXvn0Vr5eTCrf~Gd1fqXiGpBrT`4KE4EiBtdE3g}?L2?zfIh*` zBujuLj#D{i^b}}diNfY=RXHXs6je}19&=vHNGckptmWYrdfU3MO*Ag-zu|yN7LU(5 zC;(VRxX8f>F-IVw;txTMFyss&Y`yKDsI8xYx)hN&6srVUt}jp?>js)4#$BJ|lHg z@^)UG#K_w9YriS1)V0lnqQNh>$rFDXt4uc;4rxl#@cPr%*=zVPYp*Mx5Z(R>h)&uR z$cr5T&Q?tT0sPfL%SW}Cn#+E8AgIM>YhfkOAT$FU%WJ8;6;~7glQIIH)(w=O_G_vB z4dnyHccKQ$|HN((&HpQbBRSa2VYLw&S#>X>s=4MYeQJ-@4vD!V}l8Ufk`P2weDBkvKqcwf7kY-&oUY z&a)XD6Ht%OTaiKSuXzi)x0MlMhBj?^AQx+guq}d)l{B!;+FT`MCGNOhv&pC58C~{4 zL`$7jRspDu%w%w}BjY4x-n17`>vC6Hr)^_gv|@^#gOy2}d~7^WIxkwCt>Nn={hODG zPoFG>yxqLF5EkTE5w32o0jwLJ6-i+`d}bX?klMQCFBw(rK%D=%_`_LNzA34?`^cM^ zU@p(S4SNb0S6w)TD7mtD%g+GnI)k{klwQzgeNo(cICjqo2_oTE;&U-4YB9CQk$|6X z*yV@w(ZVWhOTpUcxfpj)8vyDq?R`I+sZHaa^35K(D+4DjDZAh8JoENfBGTvGanmuZ zbe4)FSGl7PHTqEK02#mpoC$g~S9=S#b=n~%p)?az*Hg-jsZQ%@*$+XO#tHZ)n&=+x zVg;syixbaK3?F77yYiW3IPuG!sD8dw=BjWmKT#bF36 zSgP}^tOL}+wH5=_MZ3$5h2OSb;pIvAJX$KKx`Lu~xukf-?>Mcrr$FLaC3-QuT^3~Y zK9dGM%mCMj`qPrp{KS&7F>R`GnJq^}-E{d?iNxC;I^**EMMmp-N@oJCU)w^7r|ezL zQu&K03n>w|LX=t(368Z#AtB@ph4fbM{A^S)&-dh##;#Th3GpI@`BCncC~uw!a7qcG zR@Q8Cp||SWS~7~nWYrCn5hwP+lwazdzf{!g-$t2~aRF&3!WEv1C+iV$^QA2Im!F5$ zD{r%+hxP|2^M~8YT7P4a&IFU*XhV~9E~sn=9NE7#czDN)U^4S;HH4|Wx*3*Qsca<*;-GV|BwMD6 z6>VYd4YN|as__ZVv$eC1+>k~|Np?u8oG{G@p#T!rB=c{bRg}LD=@hLRVp&dWU1B=p z_50^nXNck<`}#J%YYzg>{A&E(#6OQn>LzEUy?Kgj-wWN|OeLgj@@lL-B=-2;If5KP zg9nc!)#n??D+!pYvDv;kr5?B8jPwpIpXGVt;pFk`(n@SlvsZ`t6!DI-ZRe6bh;bjC z8o4XUH?)@V+(;RlyGkzs`_#l7!IGg!{0Vby8Wz1ByX9{h0`Y9er9Q>W4&Sv{l8f~zGn z97U*RpR`L+!!3S7hSD*elas9zgkQ97k`_Mei~?=oB!)K$-8gzIrzD>8LKL@q1<6pI zL08C3eqgHc7{lde=)eGZq5tAxB#2fYQk`2g0P#x{c3xmjPUNmo>n#|tE7N~TvsOUhMP6Ag^4Xi8$(Zl{L)FCY34A4D2szY17J0zHAf) za?0PGjANW|W?p|?=RNY%G{#uR+b_(noMM5ol}SIqViyzqsj@2v(_K=4vPmk%E@GMg 
zHk*4ZkEHt2YIA?lDnG1>n6z%TMkbMbZ(EQuQMUd5PjU?U z0(kG-DoEM%DgrzcSGzjHUc* ze&=jZyz3I9`(GH$A5p+vH~duU%fCq-^D}DnS}RWdezwH>rPL1A;-n>@(d=i&n~4wJ z_xqExsg@9&8x6YO4>AKtY$rejf#+w&&~i3U1K}6D6IVeMZyy2?vJqbY(q}s;Mf5_x zya}={^RDBbq?CIt4HYP>GPs6Pm|cGE?*v#s{1heePvGHuzklRR6=jQkFB;gBgG?PI zm~Y;ug19veG5P%wT0einc^y7d5yF(O!1o#rVrmQ|vztm6672_>dWwgOFv10ani(RA z_hSw6PpD7W+3Q70h#KOD)YPelKT!b?$5dvvGP|+yKjg7t_48|#uzHzbRgpn-T(J6v ziQCU*(~zlHBQ4pDK#1ctFI7fP2-Gb0Uk7*)vLy9$_9qZBm1u2P!A99tgJ8`$S>(G3 z-j6VHwd+162*<1az8D9HLA60P#1FHmYo3l@dk1fmCQg-MKBSJy8{sVtdL%FG;tWHk zwzl+hmVf?CAK--bNgq-1WJnvY4z#TD@sqx#>r5^#*dEFvyZK`0JA{&>h7%s)k1&N2 zW*l+beo1`r{?y0hjq$wGTtbZ?$fG{$B2P(bPdM zuV9*r#6h2ng=2NwA34-%f%4gKG6XlYI66TjEKzv*F!n1CCtkJWN1uQ(JQPuFg7FaY zqi-3LOg6}0sZbhjDQHla&ihBku$fO7m8tk)oN{M=Fr&F8BT5> zQ;&1gxZf{wdA{@;2Sdm8_UT0mRs7Yjap$PeGiH zCtY1&s5vePG)0CO+GN{O3G^~jc(Y=cy{|Kb_&Q9PytGV1H9s8_6FQH4go9l|PV&=k z#`ug(Dwa6e95-%zWFF5U%IVLF=Kc^@+NKiC{4l?}mi1jpt zx;rDiM8}Z6(I9v&J&s5T{HR%PK`$jHXmo#QhX)M)rImtUS{d*^v=WQedydik-;C#f zDZiN@1_Uz@z)Zz({k=~ss=}r{~hU<^CV_FDGXq4p-Iy+;^j=<$jCO2q#c+EQ~QF-m~8i z;qUN*n9`?cq-Z+@@c{h&#=2ULr!;WRy zxaCrUJ(Q8@vdkfd;6k;>vRcH8Ey!CLx+@PW${L1r6q**nIW@V%B{@7L%yQ7kj4cq3 z*^-KkC1GFqutRug-Uvw3D;jVr*(;BXWvSEEDhUaQWnn7PhIcdx%;ej|49s+Y*yqKg zI6Q?YpR)zegGMAn5Q^Shu1z$eIGkij*7!&y?jK~ZwN3~&$ylZm9F0?NeDdZd61Ps? 
zNkVmWx|z!Zh>fPH8&sdc2}Lh@O=*S`9!hk2XnKDO%uEYyAUvEzrnvz>wJv=tI|$YB zP&-=ii_r9-S&1uhARdB&Jk4AmQ`hIHBL7&DZ_PEf4nonHp6btvUmeN$?Pak)2}L6+ zd4?f79+sNAHfcfH*H&f!VI8$3JP>2*(<)-|@XDJf5XLT>$8(WO+81P31@)fe$)w^Q z)F++iE%+HXHpI(~;C0)sIaBNP3KA!+8nxcZ7S5N(;Of~DdI~~q&7LZ3zlh@CLvT+@ z;^41)8D7jXt>6CKzVb1c-j3pNkDhcZ#V#f1R@|5k1dw) z1RHnq_5D|rB>z8Ba7jBUT}29~-wUT~2%qr8JquB{iU>pw^@RG=ki$U+bIYG>BS(?7|W70y#{wQZmNQ{aWZ(&6>qTQs{_ z$}q956H>dk3hG+ox7Fg~xw4|H@J`mLKPaT!z|Ww~SNwD6jNKkuHLoV742n>e?_ZHK`uz7kfH@t^7Crv5MQJcwyi1x> zdEQ@4xM^BY@5j_Q5>J~`d1^1p9vtahP~ZIPWRb=5m?hbPHCe+b;45%%6V{||&ilXY zG5H_%Sav50USi#XcJG|#Gj(vob624Dw=#|$0?S^rro#>8&pkCSWj2#~T=~Q*H<`vQQwqz)WM&wb zA6K23%HoG=H9W#(IDN5Xb{hZ|59wftN5 zDin*gQM}mF;HX;loC7z2h_w8zW*BA3fcb@1C*Q-is$D2wGoQ^-LZ#-ZoK(?WJ%ysX zL*%}h;@5HT@jQ#YoRl@s0N82AeaONPD*;%va}LAfNwKF`OD^_sL5QT*Y{AnA@O!7< zH7K13{VLg&O1C(HCxgR~gllpKm(6ziRg`<%YDU+#!mIYHciWcVTB@?~AK6-$_$TPs zh?Ej97XI)S81Sh6>&ap#&f+=`iIjnw0_~SIG4l-j6j+#nNIQn;(S_>M<5C*zN~Z= zOhb7#Z8*Gp&^V`*ndRfcDA(|On!0c?R8`-68dkrpsv&LGniuTGcW@e^-e~zbaeA;_ zs$iE>N<4eLJ9s9vPvxuZCJ$IK=+RSiAG>gwZIePjQ+h#VjWOhzMK{cj|oQ|nd7WkX! zkY^tz#EhS+(MUaQ!rh3iNxI-STfh+yn$;eXWZsHk5h^yC@<=KBn8OmQOt1XCQM$0K zoVGXGJju9R*)Ji_$&s(FbjySqiEY_DPzFgc-$XZ9CXUCQ@y600Z});a1%%9IK7Lm! 
zCnldpEJRQ_Uoh#(iP4L0dEiX-c%>XTM2FR}HpsNGzs96zq&vq|ALviPP_QkRXis?j zO8tONb}=!&nv(MR$iGPp|n3Y1x9KRG`tR2rvD(do5q2P zn?b=;_jUHYFq{4b#Oi+35dC9|jX;X#uo#CB)hiIjk2Q);o6Uv4Q+vTQ?&-fY4m`E| zX0R{if2Nj$r|SNlTEg&Ax3BBD>!a>Y4e<^Eb>|2*n7!%l021WypQmTkLe;kFO9j5# zd!u`h0YGVIO7+v6AMy*-vlITi;ffWmlp=npsPi=FIsg9^dHR3lv zwId~pJ1`=U@9aH)6Acr@kPo>T#%&uj8B+H1AAKcpB!Bzm%0#?t)W7uotT2nWF*U8f zO!Q-CL?h>{vt6ZVuN(|XHH`mz#BU(ffGws4vH|+R$>C$Ds1^MXGOkrI2ATiO(`I9+ zJ5P`L%KqSot7*-_&YT)}4!UsJd0dsqXG`uf2YvaH;ionxl0AlxR6i|MVyQ6H9v@=FU~E-(`KllFM*oiH$4Khh!%MdVikpV7^9aJ)!#1U z5?{_2# zK!J6+9{URF`!~>EY=Q&5#dg5PY8&TFjS9+_NDhVQYEJ3q8p@4sm$8R?qdMSf-o86NdbvdjVyfku_xglANR27O^vt~VscIYvag1@DUk1R(DJNEH2kPNi4ZdGoA=t-E~0;_W3U%=;HeIxG3DkFFDq^m1W zu3n?7fj*{2uhG$zLGD0JMy#DVLHibDvJqD&@1Q{NcVw|)>>F5cGN|gXdqUJ1b7tf( zF?u~6zCKOmtQ2ct_7U%CH-ZS*nPXDqOtmA!540WoD)=)Nr34ygNZ2Ma^9T*8iOD}v znV0+f4SeP~mP`EyBR41O_KnCNCh#%QzgYw9I0p&}zS+VPnz=c7oh)#E6 zh8=~rQsXin!#7!ehXD+qnTM6Yr&8k@)(6A^ZRB#K5CQCgOX86U`zP)rXA)zrf+slp z?L+EW!Ge$xk=jWJ10RwA7UEfjTQy;88?U^VYC@S>sIrkOY{E&{T*~ll8w*DVqI-7e23^YNk-H(8oxxeA z+rvd&>`Bz|;XI`dpn*sD{`6f{13~GQ_P^lV)YOMTm zWA^C`*GDC24?{&i1i>~Td3a;(P3vs9Lc@#AFq<3-jtmM)Ma%$A`CV?ESn@ek96w2{i3RI~(=!BlqvyLivTYADa&MVNjqWxW-iDGY?_B#`?BY=`1kWV(FM!8vGXH zhY*Y>E_w3(L?lmaE0R;?z`Zejh5Dr04^@F}F)Ansq*COMT9+3QS`tkL(3WFEo8k=? 
zn?klT+d^l8lO|0l`}LZARZ*KrT-Hl{;OfbEIv3b6F(qko zj`*_@N0PP~^r*w`Y;)3<9COk~#W@MX>w$pwYL%~5q)F;DWizQA)uxVCTE^Y*@-7bFp58!xe`wKWmxC*S1uL#giTtU}}g@~ZXi`Nc~qY7pl z0mXh9FFJ_nK84F(o~7^EOL2Dc(68Ow|M}6?)1O&d$xGVE_ZxkG9r7C`)M~QLXt1kK zLKLyo|7Kj#99-l7xgL<>v4e|8q2>HL~ksC24&Aw)F$N?>3sE@ikSYOSTtlf19Pb4G#a5hN?u_p&KqXrr4x{s=~n?|AeY` z!}og`x#qKHxT)b$Rs4pl=bzAPxz5cFtZ>wqm+x&&W>T;gJ%pzhvvr2p4Afq zl}FM+``tKS8j-${ehS)8{Z*{c%W6szWGS{c1~+8XpQ*NHkHCyZ`Yl5->XKO4q3;DTElELpK|HKsVV~q5nNMr~2ch`oR8G?`Q39!E>O*{7o2BmP4$a?MGty%{gR z`Gu^$m84lp4_my5L5}~k9#0&ZR~+CuT!LCh@mbPsoHg0L(PmzLSP>MOAm6_rXZ%^s zxG4z3LFH+m)A9=|s>bF@ijoRE>thFeGry8WH=l~{O!;U_$+Nef4gbn@K@EfAN%*qO zjDJ;)--*+y7dHP)sVV}`w#5o%sjBQ_P#sEvW|P%#7-GSuc=~{vCd6{UN>`;G`(inX zUn8oU9FOh#AoPlql-E3YtE$dQ<(c4g(^>2ExORoEoEgK%)pDb{LdD}qt0ksdALf;E zBV*N?l{!`YjnIrRqbd&`-VoMW)vlpQb^WTsk)z3*UQK5iR+Zstiy-4n#u3eX`V(f> zETgKz6Gi;&6ZMK{NL_q>waO%y0M=~yO2D*y38g2S z7!~{k5#IZKm5OL@SOkj`a2x0;yw0^O1!ZxdO|>-cZ^zf9T*X>4B6kSZNrqWw(hO)i zui`k))Tw$8n!5Qlok@7CDDx~yA>IaF=4v`uX3kFQG{d5PX%*n=>qK8??WPDKPTEP0 z&QqwpxF0GJsyp=DM1wQV-donrL!j9_CJbY*ET$u!3kLaB#EEQ#9R^6E*pRUIehy>r zTxyocM|g61(-z7{@Ki^SH(x+hB?FHm0WWZD2KL*k=7IrGsv)gnxSx=#H5be?mR8sT z#c*A&)1z3|O$QnB=GcvFRLvWw7JABO%`-?W#mpNG8jlU{v=<12=YR+SEKzKh*o2cI zjkL(~=E}_;L{aQA(b)D=Z0ofTGZMDdb1?fJW)GRgrL`is^Mf*t)r!CY9Gg@j@P1(~ z%Ca`ts1;2cJ~6Sb$3n4FTQowpl*tM%X3pKwGCkF2r1cf#CR5FE+eXhwlORNEe2euCHs-8%~_MTty*y+^W$A-r(@ zzv%>qd{IGY$$Z2cpU&XU!4tk@(3j}K{)uKd@xOidn3|-^XOXHqNkh{!LkxfT&|gb1#c!Svm%nogj68D zyTke^%ycek8jDJOgJl|dCA14@9RHMoUm3PnkOml;F&K$vf>1{GUozD zs@rh9KF*dy%~utyLpH`gft*;ApsL#Acj5I+zSBgC_gJR`#2J)3f&?4Ra@{Ok>CIK~ z&pSJqP7_U+=3dx3Gy5vJQ|H&i98v+|dc~T4V3E-(ndq3YBAPF~(Fz&d-d&6{Rv_xv zuW?S6L$zBe2!v=j%fzfEIDl7%U30k@gfM#tJSHjYr>sl=kNv0V zk}x1b&Q6nh{w|VbU^dE-Ovtvjl#|YK1^CNXSg9kDX(b`fGSw~uXf!hkTiydwY2`1x zJe1AB$?&0`cyxJP-z`!SiyKh}i_Hn8Jt-Hg5_;B()v>O1pU##}&og2w@#?vrW+JM? 
z9}At`h~UyLumCfKi>FrDt+NbaB65K`YJ7SeSs&=!ww*NbWP6-FFmPE`xre@w8fW$A z5HSmvcyr^DI)_6hC6oaj)uIz6vJ3~BBKBhO!`vv8PYyk*ce~pZ9D-c)3^kp$13exH zDj2j&4?^#*zEU$HB<#g<`#aKMoVl+NK$j`2Xhz2XvyPixz+J^^lNBmswp4_&h%!V| z;c-P4$3>m8UfswsDSS38{xuF>uf)C2Zqb_VI+IS=lg(}}lPkSIr)b)| z)v<}vo&++q5U%gh-ry9uo*9e^1-wj}1HdsXa?>*2)F zywh?TKXDi%>$<3*+j2fI3v7_%)yYcH7*RhuMRaP%S{!6|qqstwU%6ttdiVH%kA{~7 z=L0P#hTdx8{OIRA0(yDVpwNTIgQ^MlA!lI)Am{?p8kP^M3m@23IArb>i zy2OB}+`%-fh$JrWdI*~~E86=THdo1(oY^_HREd%E@?IgHG=k7{1j}h+VFAz%#?T>E zZiF~=nl(#z`XjGjF-e?Z$AVMgH4aU6Rxt@#AFN}#Tn*i_FSV?=&qfk>4K?>a1CUPP z^B{FKhxH&VsgeM`W5!n71XsH6cEj_aC$~4D$Rd)K14yS{C+>6|EkFTd79zy+%}z2( z;OZ7!WYL0iT`=l#V$njKQ~%m`Wh3oMf|Z=$d5}aQA9hi}mwh`*e2~GAlBx;FT<^DG zKR~UWi{ylrx%r-LdLA@7+YlqF!QC|?;a2P95rdum1Ek)LiCWEVe$t#gz3$iWbj7dt zU7KxpTgpf=o6OCVXq?IWyc&&cc-59tg08M8fJll}%;kkhYAd2Cj*{o$wGnAVv@GWh zuW%gkFqs8UHVMTBntRe@?VKAb#H{^-Fdk{y#7>;n1Ww0LWN^=wd@_35#bg*TqMb=| z8k*Ua>?EOV8WROnN^`r9wgQ?oYZ$IIQ7bpJ*o})2WF4$)r=V3#hc#@ zZ?kXH8myhs4!b6sL8oM(lhNg-kh1VIO;SgkXOWX3x-JKY}lS#?LCP&XIr6J^9CB2q%-8s_ni= zC=Z8EwUMn;9lPLqPEu2tLqD*Ny^dz7?I#dtZ0Z+*G_<x(u_CP=+t|^=`W+B(J-90 z)K)*}K%bcR^t6=NO>l}^qj$$XYGbDzLSInX^dqkY=J-Fn87~47YfEgERQ03l}az+2Xy;g_g9ue z_(MG=mO==={n&3%7--fi&PKtBfNF>*th(>dHDtBNoJjSao$nTd{`m>m1loS;3bPE< zkfY7z@?Pq%`-A@bZ5;1(VbIWwC>uzFVbB05wSrKrx?ICfFJ|xpe0@UphJt+ib06!z zuRLwsbJpcjizcQKp?5Iur1H-Lqxljt)fJND4HLhSV5pGpVb`pNK`ZSWLsEbxmo!`o zj8j338|=F89*5mbvR}W13zbC@8xH!zLAnNjcbH4;W}q)PG^}U3-|HW)W^VO{ubz9t zzMcg2<&Y975K3J^nzs6jzM$y&CN{Q`bEQTeBqD8TXgF&-9C+Gf-9s_h1lf=S21A9m zu{{)tuxt|+PIpnK$gf(X3~^bR9rjrro5daAg~gK(<&MMiu-FJl*OrO7KE z7oJgnAc}u|P5&T;vJsYGko&0&0tkho*__tNT{~n%w_2L&b7IzcJ>gYj?x?fIO+4*K z=l|2nmj^<*zWrB1##;8IhA?QxEM_bdql{#mXzU?|84NRK%z{Wn2q$|eq^wyIQaMOO zSwkw66k3U>Q0dV7jFz+f&hPzx|9GE&=6%wDuj^7b4rMD|#B}+oD^*@a%bEI*^WlPTar)n*N+gSBlSKG&NRhBUM(;V~tziZ82 z!%h(`(KHdacY<#k?9#g$9;BHzJQ(&!zQrv|E*52;Gj60Um#|ZP^iodu&YP{O znvCJnJN5 z8>xC4Yg91X9-yIfPM+>_!!s_{cQ?|nIe-q4;M3V7OZV}|iJzwXv_X36c6@BxdOEF5 
zupQvXK0H8x3~tU|EabnQc|~*wKkK+f@c4)OCT;OIc0xYRw?JoOEk3=vrF7$qobkOz{tN!NGz>blGf$j+)& zl^_ahH6{HHHpYv0g9^qT)c6Xl*!`w;N9t+Vis-^ewYTE(pJNtv@6EmwiQm^n84YMy zO>wt_;@cME(nyV;&aBpXppsb1!>y9aRuR?C&Bb|B;#7;P%x9-9Ck^tbE`Acx$tw>* zp`>landsPd&Dn<;bIXN=t`VG!x8kZkl$8rF^s=w*JLVb@nsK$T%18^jG7&uM;N(t}CRUKi9yNERz|s#D%+< zjn8G9nX75Fh6`Fdxr8CWGKq5&Kmk$B@UPz`HP@D$Yfs#92F}-#c;{fHx(0`F?>3XRTre>L8>+T&cbBUZ zeK$1cJ+QR%>Z?)w%BRXC;3dTIPmA9d^f%{W-tE@!KlljJYaX?Mv}_AZQO{O~fVyxEyPyJ+WC&eI~aj zNkQ{XtHT{p>6uI?d0jhBqCc-)QC7fyV)L*`-mo(e|M)d>UFt+^k&DziWQV9U1y$zW zA!uA<5r`++C?0)Ye<4B5F${KGAq_F) zm?Ri*(Nm?X+9^pP4OeG-o=>-^uoC-eTa(fYX}!ZeNAQD|?CPr@WBuaEk$|2T1*2sm zdUiD>)hKbR(4gZY?D3R4lR*K3V$$=-00V?#JFS;5efziS*30X}9v(U)Q1ZMqor=dM z_|~bFicYDQ)||=JQIXZAEgAXZu^v+k@Iq1h+fg~E?!}g?c|D&2oU%RQ5dS>3E;&qY zwugzgmt`opnFZpdN9l57J-{cGbLxzT^dr8|-=4>|_tXspUbh>WA5ZsxU|T5MGiQFG z_=|ywrj#QBv`cErnO$wX*-XB3msEm1|MLJ$bNH-uMFlZ$@pozZ>^#dJ&;+_cd~9rQTRrX1H!22(~fS5>CxmS<_ zN2gshiE9%_#miK}7`+>JjcC?fF?o4D2lxK7$ktcO(1uZEzSbAQI7U-*)QYLttK-sc z^WkZL%)jcNy#Kn0;wO_wxOWM_skT7aRH8QL)^Oj__Zq(u zOZVIhlFdQatk-J%S`RNzisa2-y%G-8_!ZlrMugq3i{wd+^eqLt)2{B51^Np-7>O95 zf7tOuh=|xyWBewge>;gwHa&sg`t>8#K(EKBGz1Ys>3`mO`F(c@I{T%^;YY?R#}J}r z8K_)|S7;TowoC73bP=&|hdf$E11ZC3Nl8J`z9@;|H9O~Zmb2PYEUHe6$`RK$EDPW@ z70aH5hV!~aht3Q|DoK-76_z_a%w8JgElDF(rrPen^hC;XYXXcs%=X=cFFr>U6;MBl zKzEJQ_YKg+@M)JzjwH?@iZ;ll9n9YrOCCEIt0X=1{F2)rkxJ&l9Bd{Cp^}$L7r_r6 zKHw#n#yKh0F|Q{gbfY$lV4Rlpq&Df``IK&q=m7;4mH2^h>rAlH)a|nh5GCocL}z;L z36%hYFAy6u{)=yJsTZ+e?k)=IMFqP%WMu{ z($zikF?K^!sUFoSpe3&%A>LEbXv1=`b}3FzdJyU ztP^L3E#f%dZy~bXaVCNe8+JsiluKb0uWgD~*5_`G>`~P`ec;VKsdCHJs}+i%+otSc z&-h76_H2#`+-g$pPLFDN<8fV6&}7#L$K7mU+G&H1S;4SA+LmRxFo;pAyl1(y*?8;2 zF`>ma;Zuj-CLFH!=RC}QC30#yV9YDkH0xb{XG<;S!;o8{|BJi^Q!eFqL*rrf>=zW+ z;Kijwz+X`661&+JQX%W_yO>GI+n8Pvj`x(9_B`y|dd#)B+9LVg$%c%n&57E?k~r7m z@RLfmy?kdk;!8Srqvkz2rSuI>*@vnK!R!*6`Z3PM{2E!6gHe0=WHgqt8s2x)+M~`# z@jp7hwX@ia>qelYW_D(nrFt@Z=&nU(!S_F9DDOKNfvl+n`~<@U zM~ryE9qt}2<@Jo^1+GSA!lhe|cS0u;Ja5cr=uDiX-Re(TxV3qI=mgazW*6^_ZVEV~ 
zTljfK_rKg-AQ#Ct>kV2SFSh^eSpjP}xiBcZFVv-ybh^1&LR_F}CRkG9pZ_HN5gWPV zXgC_|3g&;CR-PPl0`GS|E`?b+C%+n*f;j2-DP=xC>SD>lPL;P)sj$)C98mWQitYwitGyW{?vPG{(wR68)z3%g)rcLNFh6)0;y zJ^3p0VW`5L&OpMJkA#Yu4a0Vk+Gk@ewO!LqLXwbTuIZ*B0>Hbx=^5<@ll(R1tl$ow z^(9eblv7PF>eUn{Z^{jm@vq?K{)6k;qU7kIB4IrG{g?+~pMHgi|KR7l5_V$#X^}5# z$DtksenQz2cf*j1>h?jwO7la3+{vPn4KhzjLQAH8n!C0JqaVt$%$&N&K{ zn~$?cs>34BY9d~pi}5|LW_9baburq$_hYUVHBC1?zRi?c-f`w@tc?53fxH^UjFJSg zPt1UH;#tki%_4Wu5BU`G&hBVK+ZRm--zl@NgfjbMse?iuF13@{Y8uq_e%e;H`(CdC zv@SQ7+~?UPp!x(^C$|``)@=Idrs6&CC&grZTV?8o3Hdb})E3Gbdn+4p;XS1Tuk{Uf z?p`b#xYD(}t-{GBcX?xaxyTS?()zyD8%bJ4pB^Uf(%@Fr#UrG-vD?c{o;kA*USkyV zbC=sQ3MEC|@<-~L&sAFQ!O^}rzL7NNpCyM^nFX31uWA9u?ztejyHmO$_izn=f;GKn z&*hP$LBe%Svg(IzDxfxYVp`6_>Lp)il;TX0UqU%U>ht#!MiG%thJu$}Jr}hzRi#w0(x<|KN|vnCHJ8=DO!$9`E?F7_zvn|^ z8ep&e#-iFg9E)8q<(vw8JdwY9QsUqWO*pts=3R`jQ7~Ir;-UDX`obJS$m8SgA(Qh1 z2K1cycAW`oT=+zMH2z~9{gxz9lc%^40}&Iw`~8$|azrUVtbspPt&INsT!45M&42ZgVD#~@ zug4Id?Vl*TA#hR7(^&Y9E#kEgZk^LlI!mn15L4eR(cMDCI{P9cql@=|S!OBG#TMzl zT}~IpGBro)Z)d&Q>Qp{MpH_s%c=r!K#5$Fqch)_b8eQCd#{b(kXz>@?P!6-&Zttz! 
z9Cb&F{x>}mr*^42Vxplwt{O3XdYqa0cDU8Hka$)x2pp96JCTyqlcpWLOI7t;myc^1 zD*Q&x-r{aoT&ALp#_vcUhM>Jg81mY94Rv)$K5;@((>|qxvR}@&d|igm3dF~P+MN-& zK0`w_kRn0L(@uK-W#tB` zRtR{mH_zU7=z-y0iW$2bf{2z-j2YchFTTHv}< zCaF=43v0U3AVG27i+NhmbXb1Lp*rPlgE{eBe_*=Gt7z91t*)ZxQk8zEm3meAbQ=xx zyysA;xm7_{x~hH++u4WUY0sG_pynIf4r~xHKiqax z&Hjb7d7DPJuw3*N@we*q_x*bs0xs z998XJ>p&(^lTYdsKc>+FdyQkcHv;ZD0uan01e=0|Wou<KcQ3c; zKFdUIyz2U6FfA(stU@M|beO)eu`fnLdVDvj;2LPUKuB_@mxEwAf-BNueoDEp zRwpNwv6vk3<^GOu*Ke+VasNi_?*Dc<`9~<;FVr8;p)^ueD>HUX9g{Iror=O2L z4D*)8G#yPl78S8sCqg>lo7I&{@(rvx&E%D1W-!xaxWl2fp-@F;j7hL?(Fh0pO zlg~cBwIr^tPSq0U47_bn8JVr?JF2Cwf$gfa>R@8;b77(Ei(f8yxsT%I}oSn=DHD}tBeqML({JPX_Xay5wdIl-Nq zG4pc%kC4)LqgJ|x;X7Y9Wf=TnJ$LHOmy&zBVvWx4A0b4RSMHyYHhej!czt{F*H!Ln zT>h$&yYgltC4l?`-x6r?Ffn;Zn7mIE5{1wK{UDF z2%vinq7wZNl-uEg;SY^I$rOAb|A43!c93u$_P?;0Acj4>lNXNhPzvE+C$lM35U8 z(7**WBL5MigXbhj9Dt@?1pm0cJpa-n~57RvFzMF;&k5v+b9l_F7k zC>-ub&J2t~m?STM3J2^-@u3D7LgpH8LcmnAA;b|wKoFRCiZ9hHltr-(Jz(b*>hGmb zh8W>M21X*8yjK>Srfqm zZ5|m80k5%e{0$*a1RF4i!Jzq5Ibf8Qz7`UO)P{n+StL3on8ES~BeirOfJzqG8%qR| z{#_w(X9)4-aF|#)JUBR5D_C2L!SaD4_4W1P2oxNJf&m;b_K^S%DFhb4-U--SbA;;j zODuq`wdRwS7lRI`I>C`z2>4G!Ccxvcs7xNVEtqGRXTT2@P|M6V^!5k3Fe>Vpt;D_4=01uYH zAXB}Mm;l(bI1~bfgds366vhsT!Xovt+S#9Sm;}QA9vV8GH`aK3SQCa9mkulp<|c*^E|*HiQaruUx)dF6m^TLS5{dNIfgykk zrcXlX>FFRy7=5zmf14w+*DpVqTo!F@G?KmG6dHwRk`1H@2{=Ob!g@1UbP@-c4@@Qv znBKf01_uTU1ZMzra;O{{1-CZV_tH4u8#F3!)M04=7P28E1V*NKlejbv1m{CTG1>0G zLP@5u0MKD?st=b1gq9ab8Ws1wv8Dw6r(&Sx#mh8<^Lys;v_oJ}7$gd*zc%9_2m}OR17_xb;IQLmUj1Kl`qK=9rT(}1<*JT% z{r}7`Km_XxuqfUjV7EnpctidB1=hh}v@u|B@OK&tsfz)&B;ExM z_({`7=>mcII}NF$5A19IpkZ`?ZS-$61Olb=|L8+v(7>MgH$D_n4+DG?_$Lho?7jb> z=_CJQgTo>LxZErNj2jHM1yI0&X9L5D3P4cHtKi?bF`lr~x)4M3vrvo;0& E5Ad9(TmS$7 literal 0 HcmV?d00001 diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png new file mode 100644 index 0000000000000000000000000000000000000000..1316647d653197cc94211f010d616439c04b7879 GIT binary patch literal 256360 
zcmY&=2Rzs9+qQP8lu<^ckX4bDkx`-~p&!YL?5ymHjF#+5R(2ASoe&vCHd&z|dn+R; zWW2}q|3AFv}k)KpYd+s`N|P^hTZrBhL@ zo!`6>U-8m#qs2egTbwv|f{H5V;npRi4fyYUCQ6iZR8(%S$-`Hq;_Ot@kD*3;sB`G2J$`*Si9Vh%#^*{fssuN7Yzie_o zb53#7&<18&4n{4$5=}gYjGLymGsWooK6^(yb4weuea^1-X8Uf~+EY;}`DJLGeU*4* zTh~7Zr}dm)=vaL&7e~L;`8boJ$Gyc>@JvZ6wE%UXnahNAYfq0?tDt^J!(->R$y|$Q zm7m7#0jBh6!}*^x&$&-U)F%ZD{4@48d})37oRei-fc&-m17Tk-R#&|&4KLYUK~H@l z=ZyZ_N!J5RxBOk?m1e(iH?=lAb(>v#<4or9wd?bb20yz{r=>FZvfScQ4(;cTnojQ8 zm~ZB94|dkHBs*8OxE$*GQ1Rp~i~HMqI!=~iTSwST*8TIIRN{X%ap&ga;Ks_?)!Px$m9c z#z>k|H(n>KS9AH`pS-ZmVl7qu%IGe=@g+6^?`co}+6SVe9nYQC1shS4tM>-+o88+y z>%CWH^W_4zCq7h~)T0%BfgG9DOhY5X-{K5rnDuB*xqUdoF2HS5(7&s-WX*Zgf^p1PCI(N8TlzRE&T~hV9w(Iujl^uTz zDdovL9K3hjLZ44g4<%Ea+7r8}uNo=V>1*!xG)v{WSHLEEzD@fp=PrJZs?Q0RZ!N^M zt-DN}z-TG=xZlH4pPpw~!s*e*q=fgfd^@<+7SH@SZuItOQMA0{#AU571=`k0`+j^_ z6V39|qw>zJ16^xYnZ9i`7c1Hk-E%KeQQxH3_C#9cM4*ws)4lSl`^(A624R|RBg~t} zLLNl!7tH_u_P~(3^%2wD3QEUHUv%QK&#PU395hTmSEu&b@b7tI`Mmt`o&LOk3TGZ< z=zl0tl$)UW zl9muNCPW(6d!Wsn(%^P2T#9r6dxXxHDcbx9esiQBR^1eC2 z@Hah8Am)N~Mn=b1sWg`k8QnbH(`@ZutukEncwFohGj4M1(=(mgCI9&E!+(#Xo=N!M za+7PSNG$w2oVn!a^l>U?Rz}JyPdCW-|NWyRd9ol~?YP4K`)AP7mnQB1`%8ri&f1Fq z`77O*|4j8X-QT{aZc+5VKVz`)O9Vz?&`xV_sp>tdCQN_zI zH2GWG+RmIgbAmg>$i(DsLc)G3tGuYdz-^C1LR#9|DyphBnwXeSQ&ZQ}*H^rIN9Ct* zLvt4c!=vcvfZ$+SLS%YwqlX5qG&IhfJ$us7kbTddJ&jBU zmsFo8Cv$`;i}m=GJJ?;h;u{yYmkK{tsQ&=J8UGTdoW%S$jQ;|^qrSKNiK|xyxVgDU z$Ngtp|MQY|N1s*b($v({^p<;X{ZLuiH#w=Acq&MAV-Y2FaJ)UOtIR9@Yo4`6D&@ed z+*}7|XN8LwBktT;d-Z!!<=jN4!?kP4mTk#Qd-kk*^yty82M=g0EiLum*stN?;b9TK zn(i7KAFr088k?P+y$>(%ry!4)iV(Q;%(&8D)asKj7GdqWb?5oRoX1)>(9?&!D!s&E zx9|4t+lO>s-8*)D^0Kk9{J%dxBc)x1larIX-Zr{9%6o{ynRt zgNs_yA#BpMbMEw2_-JJ4ictv|mNpDGMlY=VoBEP#u>;?g_FQ`M{q1$9=zQE<>chNbN8-Jsk@}p$VapB zPpM={TM|#twqG>N@%VR5HD0zcM$&1tC2`aBscy5m@%BRo#ZR!7BGw&;P6aZ(AFK`e zl4s56vxD8d_VEEkmiMV3maw^>bgo-#K7KU+J=!w=?_aLloE1VZQp|eyVcpkHFQjP& zvWWR3H~gyDA!k3;EmOL(XzlLqKH6P!j8Q=UscxRtjT<+fYVv=1ZAmXFDapjb;`n#+ zlIOw@+SW2Sd6|OKUBWu+b%4O#EYG8!?X5{ 
zjrj!xYz<-&GyY!WB5K*X9;bbxBSX@me?8WVidDj%%FD~k+1Z)ug-$NF%*{U^Iy%B1 zupQ%ws9*S1FX-Upl;|?ifj9{c3)^tK#BFZ0BV$i~e!lZ)^X;W2_frp8&kWUtS$@ku zynFXK0sW#LKDR zxq_9rUfXdNTYSkBe&Qd9^La?tR8NeF6${hnKw-d;POXLX^gtLu&HEH@Rh+5Ua| zZb>=~Q#HS~)Zo9E8JL+VfV^G)`STO3(zC?GlUQ2>+36(8n&ww#XJcYx*VsEaysxYC zM<9{!Bmd$3qHGlv4DS2U^Srt-W89HtsD^xg1*?WLN98`(esO7OwrL5ABzmR!*0rBs zTX3MMimv^vdnmV}gm@4V5<)t+?pX8r+c!o(j$P(dN?dg3tYmKG=86bM{z&~+XDS@j zC&)peL*WuKZ(vGRjjh`uw{$#&``nf9MPhiz4VTOBZ6R48XpeaR$jE8L$;|KH>z^Ww zaro}uySE-E&%xgQ>501=qdYoOH8zQeh-3`>>?w=Kp?w@4o*em{me!R`+U>Wmq1SR4 zJ&)?XV5QNnLXo>0w{sjl8jiIKj*6;NlUx2^5${ozXWjXz)^z=*P4qi=zC)^)xIW3+ z*Vp$vB_-FSn#RV)hDpkq7m4;qS7AtKD8t5WOz%?SJ?qBWQqCwT?ZfAl-2B_$+xrfm zbqc%ewX(p&%*^b*IF_=F>Bw>S`vIZ6*zi5#SF^E~SG$YPpty{E$>l+e_9G4H7diU@ z3V5z8SRfCdR#Dkz@iFH8_wTVt1SG8R0P{F=Z=A>I5YEUwv+p;t?+a7@l5e|bQvUJe1T>oFJ-FdDNhe{#5T;yt(a@t3*@;Q|#t}b1zd$CTJ6Q8wdAf~{_MiNWN36KN{iHyI z2zDBOYvbl%E$el=lp)`z&3W- z7`9{Ao>EeoWZXni6U?T2-*fTsh&T=^Au@MtroWGp#VUQ{5q6;U>4|kee*6&q7hY?s zSXG*m!jqy>+tJZ`Q0{KH zx*FNoG~nT$2%Ax(Qa?AV52uQe^NO1psHSISZFu7_0FY)f+MKX{{rcty}0xyPKCMhC{+}=un`L zNtHUl0BZL~oDr(eIVPLP?>%+uR3NM5V?^5YSV}w{o)&wMou6;M_n3F)!FiJn8-6eeYy|Xp$auGCoiuOxuB~{KjSY$>CvM{tKPrA?YX>|gEY{%`1!Lo z09=5=V+Dme|LTYAB37Sx%^RaCvyCd4giSa1xX*5Xd+p~lwRl;)(gRw~Q%6i|cFHdQ z)(AhOQ!ZGxR36AIlAV*2ppl}A2&7|UVHAO?KSMxaB0%4!r%b^b7$FRv62>h)S`W*NjH*&Z>gV_5t< z)r7ymEMguw??KDwK*K8W?$(-hqn%moMMXstl9EQ3FWHEizQJ-fnaYf%#JwW8~JuUA1A-R6%(1EYK#`=IdEXZrAwE9f9e(;6%-6MBcuUb z)0zM7Icl@-$dN|?4<=@2mH4Y-11qQ;y*0s{x?_`5C4cwg+>>-PUSha@$BrE&r4_kM zY~HekZ{pj&U%OjNJ#LbRU^H}ppH;S^jGC1f_hmKteBvoZ-TMFvyZA=zot@X;qonfi z=;~j(l*iqNyk>82PxY?3dAcw}65#0$PKAC zhY!_OI5H49^C3ZoSGy!0uu96eC7;WS8@cdOv&F5fw6xUxS@ow+nhhK`*Usn}+()f$ zd47ie=+!Qzo>C8#r(S@e*H>EZAQFr$Eze{bl+6FgZJPe`hx_p1?Wvmn=mHUqq@p3d zT977Kj@l^QxFMmVqoWclm6tAfPC?=B?b~ajkJ=g|i8jO@V?2BIEU7*?A}yVrBPc@s z5r&>dlU;>RqM}+e^u+Jeu)TCGD<}~E^XHHARQIL&cb1>C_p7R^vYf1{tn^0oSZ`y~ z(9n=wnm*a}{d;gN!ou{*wrGU=5r=+-#?pp{hWXLNhjCxqC<=oYUZ^LmEn8VUh=gK_ 
zU-J3$=jksN@j^J+=%28Y&FQ(>*-s72Jaq?3u>smTIxR?uhX+a&19k&iS~8Del{xXY zR0syy`QCMoj*dzyDu@2FlfMvBt*IJ3jq3+-rkTXAvL>E>=sEr;JUm=+o1LG+ukxz- zdYgkB6bky0S1uD;G;C5Iuo;n3F8q##F+HCD&Lt{N+o8D?ja=`Y--P@udF=<&*4>Bp z9XQ~<>!21^X%m&uy5R#m**b))nwlOW(c7bkpin5IpHk@$>*YU}_of~k9u~g*j*3O} z%DbQ63M@O)nSs)tK7C4s9a7;oSzerrk@bkeL2P+(o)h`yUP*}z6$0TT8f%U8pHow6 z164s{)z@5H%+Rn&NJx;zC`0h$yW8trSC{{aWdUk8e2hKj;ONLdp~wAeV=IEBxTNHa zn%Ze};Xp3ubOW(tfL8zB*93Fd1Be9db>kbmt~WwRb(}IX zY@iQNb**41szx7Ev7&d2-JrV3djol2Rq

}=NlC+bD%=&{zq1`r%U^#_^kez!^*dV`R$`0x zar_cSgC(0Q`26o;teLEo4_NquwkZ)S&SzI?(yadAd(`STqq2RVM#G5pEP zM^ytxP*7B)Wn?@J%pv-%W(a}q7-v;R1$;sJ7;ppxD)RFyAF_2*RPf1T9l(;>~%{4bSmwGX4FFH!pHD+X%phk@Z`3)IvI_uW0L*{0q$H)D%va*KrKrA79HpNW4cQ&6-Yaj2-oBjRUIn$s-87m(DQg;tMeFZ8*9z`uo z`M}?)o)IL{H*emkmw7HfbUv{^Mxs++?`fe-zgv9oI4j^)Cjz&q^6bv zl&(AV*1m5o8VrC>MCgp_SAP5|&^lflE|5g4X5R4RylIJY66s6=*^Y%9dM<3hGlRb*Tn8RcLQ0Bm?_O1~ za!f+T3Oa@le3fW%@FR~HtzB7JLDea#aZ944TAsw27u&Xd`s7Q8QdFp8$WV``o`z;7;j$ z|5l)nQ`qJTpB*XaO`qj(QZhe-K^bX^9|34l6B}GpUS6(UaFtCt>2bMh!t6*BVdBCC zF8N19Fp^#nbmF_79z%xLbbR6{x7l8+o)7lQ)U~HwjF+lvw-N*CKuAQ8v&V1bPkqhX zTB!dKkjU51Z%cjn;Q{m(^lg6>>UJR}NUMVf33>9Qrfql<&;0N2)CibY&2;VaxzfPV zV8Z}66!DR7u21d7-W>oj(DKrb9vK4_yaIgY}Ecvw+K>ShchjDDci&b;*2D&wu&96vbd*H-P+ zsFyxIo6wF7{~l{I^N3eac-`FIo`h4^oY(0K3&-x;oydy`CCZmCQ z{`~n)R#yH0{NRZUEfXs%WwZdIViv0gUVY^o)~;QPra!l%2ql=XJ78-;!)m5!ZAU=2 zXwmXre9>;j%F6oUd}5VG=fN=LYm;4PK#tpLtRe4^mg}7*|tZIc4l1BOi~U*PlA*)A6Z=wHXqy| z9xIxX3cmhthDL!vqfn+M$x5zPMU!6d+uOs|tlMZm)2|3ruW#yMZq5xj_!N2cc~X)C zvU6imbF(_~oLD|UR3ds59Mj3j>a%KUBWPyqF1c!0%1tTXZC$aY@G^iu{YbgQ-Dy&P!jSwHf-9|20kMwgYsGOnS|Z}0f7ggdrDoJW(phv zBmUTFfXn#!@uRLzp5WP3&$Yma*vQKk7S;cz%9`*(0s4e0@+r0O!%8yY1|z!1vJ& z0RlZUseX{PEP<^f{0(AR$o1DCh28{VPRO-;-rho9EAHR~UU!U)j9B|Mfge;?S3j<* z`cdR>`pmr{7gqk7PuC04cE%Faqc^bS^cNH%Y8&4SL>N@(H{SK6w&&zO;*3 zRV^*SC|v~RT*>g-rFQ`FikD~we+Pcz>`z?-@g&LJ$^orQwsm!N{a6)E?1-kGwwZ;+ zEJ*F??weD6Lqiu#OY{X!a|B=c!Vk<57#L_VDFQ|U=N`RnS^>Z9ye;cM(!w_e6mbC7 zvClaJ(F56WfC|t7a=|5>o8o{t1rVMizoC*@P)bS)=f5TS+#aBWg^m}$&CJao2M5n| zw2Nt_Ytx=SUEP`aIGK|ALL-GHMJ-;S{YxpxGOa9wb4ZRk(1rN;w(KRvnLwi@wuX69dsrOey7sZ&<>_e`(HWz z`SFEPQX0ae<~ZEI3^70fNZ#)=KT@oybw?l)Zs~=*uK!(GM7y@0o}Ad)#iE*~rjfrr zUTFLYONaHq7kC5(AA(J{2)})N#-!`ryIV*m|5?_T(UspMs?N?9@EtsureM=aanXLb zS4J7_Jn+f-eL_OPK-Z*Nai5F2mzv6lZ*s`Vu>OP_#>R}#%PcG`r7q`Luo?y2x=UP1a}~)cDLw$6=H})}QdNTbZ;s2$|8M)qb?n%l zTVSo46Ho6zTF85r3ZbvY^wkK<7Tg2AdHnw z7>G2mC=2}of`pWV28HvA_`^pu(^_WFc4k{+&>M=oT>MRdu@mA#U6 zp$%s2?mJ~s#DOUhq{P1klb@cReg&=YKIx{vKfY9e44IjmGpl~U 
zHaa?5>{7(fVMmkIZsc-S~YNeDsj?`hZ;s~@2l1*WAP1muCD|1@3jw8HbvAZWqj?Yw#O zpT$)ZBcl^|J2C&eh=~J;R0LHb@|;!%3n4<%l#eQO?>%(*@XYM&!ThUPz_+^j{)L5- zxfV?wpZRfWR3e2p?AY<}RVgcNFb7aM2=KecbJAArFOx3Z*UB|N2L-R1JP9gpDk2ri z(JtFv!NMm`FCW_VL>OryY<~VmdPYWk+N7?!`a5)4mY-g<=<#%O{OXbPSQN!U8&dWA z17Zct#K4aqOI^B=AiVxeey>Ln(eFq(O!;3UN{3S?d-x{nf9efmLESe{WSB$R8TtEieiZbCv<2 zB2Wwq-R53usZIQnx3M{dZ=(m4E;u@e*Fu9WBxdk@GlGyuM&?pT(l7qd`YL%8T)WsK z=%GVU35)aY9OB?WVNT-C=H{Si`R4nhw@yq_>Ond|iL^Xmhcof&wI~M`0Tt zJW$W&W@t;E$V=2L&$G54*!bZC8Z}lK83%(?|FwxI4BB#nlR}NKIvk?J1?tyoybj2} zz^;cI8;=0n_2smqqcCwcpdSPtS|@K=k7P_VA7nYe$bo?Y&T9gEoC-d3hZQpJ2X6BX z^8R7-LXVNFY2o=)_XZ7(8_)E-cQZ2|vD+(ZwLXCHt|oO+(N6Xq!jU^0yK*=2U$`^* z{XGKeP_O0B{TKNAN(4`;@BGody}R3fGR9seL-%%GUaxeT=FYt|1$J9UhJS2h_uL#I zVD>ES-=DeMUH`KduAQWvgZM$Zq~CN5)EfYfkRaTqw|Ip4Pl=$+7tTefT=t_H+b8<@}YO4*^bY#jxGUHTI4+V*?@{qYvD z*eAuZZF_>%h3q`<7}gi+ee5Yhq6{nZC+(?u&e6q`^CUjk(UCXJv(&iKV_>hoH79S^ zN`}_@ZsOe1AT}klo!QAzRfjiSB{J^gidYwSxRB>69%GUPSMiU%37D7#G*( zkusNqH*Oqv6#h`TrKOwa>xfLd(eD4`eexFnB@3`wf#-QIFS{KPzT>$ZH<>N- z@CC1go2LD?=%euD{M>9g2T{g<+AB}jUiu;1UCz_@20VZHJ*q=;eBP=W#r6F6;v)6b zmqPj?mzxe2>fH2yM6RVj5?-1Y8XVd(mvLBk^Mj#O4S9z7yF2dvWj(fWq-ksLMK!g) zVk%(~qcIATu!6l-{Ix?j^8RuE$CHHrM_D9I4~W@|Zg$?(yO8x~L^|al2Lpq)p?l(L zO7pu(4UQNb=`B<-v;=37sB>PwR^Seqfi@U6mvNKFHG?KiUw{Y3BMS?Q-z!rmQroRs zTU(Wsl#VA+`bI}z@Y!z!0|7x2vhcmgNM^9^QGZKqO-aC*T}hE8&HNp2Gw z<9H-+bg2M7M26|`^9>b;IRH!ZY25G9m9!7l)rneZV6)4A_~~eX=7KAzwzk%;B!lCZ zd7>si5`KAY?HT&RhxCD@Rp8eWj@$+l#kM%5IW#v!lZP27hK62r zn<~3`v%0df1r2`V+|baFk-2$QD4#}iTieYs-Za2UFssHZOADxa-oU!x>yM+KAN`tl z5Pa6N?^<=mT0V22N0;CQ0lvF9dvYfO!-qQ?wt`1FkwihWzWvn0z31SFpnBuJAS}21 zn?5)=7+_rF#`I-W$iK=WTtkt*n891fe7<(${*L{{1Sz(ctkpyt2~lYR%Jhl#Ci z5S|#YaR~%95gvz!tKd5`ct%|avY%yF&#FHJIHH|`%)HgR&F*vdZ9FMiC6#lAs_q)6 zPm@3-nG-b-PBF4FNVqv*1BhYjW!a0J0ZM+;x?Q)RV$!h7Hh^(DqpaN8V1xJ;avD}a zksEdK9OhJwpaQIc@r#R_?!p75HrX6b;BjH)sYZC{yfWPqA!tavbyWH# zZeiG1B50AIMeuq6Kwd#-mzh4L;is;?Z!_&-Z7M_0Eo{eDax-erS8nI1;8JB8-fsNdm9^nX5?H@=%#_Mv>u+c8%y&J!7iXnE53Y* 
z1n4kV(eBN!3xZHEf-^M{Ei2vlX1nPVXs2sT&^35kw^8udvC zEQnQS=5n3lmm$9fl@o)bORn0ll}6bXxhSdX6_LUFB@!be??g>KJ)t%g#KLy?br!Rn z!Hbg~Y{fK{-@%1?E{^fQK!e5`-s^Z2Fe)2cThh%#Q>c8C%}BEoowon3gAhoPk>`K^ z_@WW=XXX&eS;wvm;K6mpS`Spt&kVqHNa9zqBhU6*ILLL&@yJl+1CE9I15Y~Oggbfi zB-M|fKT)4a;DDEZIcWDrDHQ^y1f(CxyhE^e0^~S=yM+@r2x>O8&jZ}t_fb@!h@M0$ zN(JN3p49&Q;`qb_Dpg}axfZSKO=1^B+?XR6i3!O1 z5*~GaFhnhTp-f2k3A5(Bp5sKvVK66Pq2cy{P63)cAY4%It`?E~L5iPg{P03wU<4A& zNsi!V2=k5Ckh2h1rzc8w@oVzFDnnvMqyGNh7W#Q&GIeku-kG)(HF^dH&B4h1zZP9H z!xGhbmCl^$!vccYPzTWXH9IRLCgx$uSzB4DXi@{|OI}`{$fq!~eSp4>{0VJ8`HHTZ z+77tC`uqF+zuN&v-Td>z2>fzZrR0L(3r+cl&58l*&nGR+uAcE)78kQF&$qp!Jei?) zr+jkq`w{a7xoasY9(L}G+75IYh0%~Q%X=CaCN7I1^ zC;&_GU#g@H+CThEAHakVqdB;97y^j2URo-P#d=d%n2?qAP(I-@zyKHkldD%V!^SVL zz_GG!rzgO-$O0@JM8kxG31)d#=*B@J);2bQuuRR&%v4T_fC_@CViwIIq)MH-`0lr4 zi~%VeGy*InlvD<=lK5?Oe(M24fwhr#z&bgQQHprECb-g44a;Py@cD;dHNANw0T8u$ z+qOY`oiO%k7X=}E+=pE5vG}_W-T??iztDHSdi4sn-gjWpG<-q;kn{yoAc{5F+*>Jv za5M}qd;NNzWo#ED5Rz9ZC2lUHKC{|jGr(QMMgLg=Gn<%|r3NGO2dU$<+mV$$CTDY= z6CMR^!}@!*PavESr#*DgBVuCUZ!Xnq!95E?5>gEi)N@Lba^=ft|6)B1SP=UPh?cW+ zZ`3-T4G#@DU2C1YlE_1=-5wIYuNOmP=7+y53RY z**ur$E(Y0NfD8a;o_Ipxs3W?Ut?fC`YP1$@FuA}Jdp^fl@%gHqB>5*nd}u$J#jF@0 z&O`6Q`^`_jol^K79I=m=ml(QHWbdxNjyKM@0|k6#eqaauRVt{IadB}Cks>}Yl(wM^ z!*_J+ywNIt3J?$}_!Zy;WSMF#d%*XmS*!P8+pi$x3Z1mJQun@Fvw181;-jNi5AjaP zZasLg&+fa_ezqH}Tud^$FL(|I)+DD?*p;0~(`qiT`{GXviNR@h_?)AY6KoR4F(iXf zJHPHiT043x`aOHjLb^p6fiS!V)M^U<{iD#`(C_S|qm#$Cfa4LKBCyIfR z=h?82!gJ@%+qauRKpvf#FoD#5>eNQzNEMN5yQnt+RYI?YTk3^w9)Ax^d$g6KJUC`8 z4VwQJ7co5n%bl*tr`lRS?1!bnK?Z7L6O({Xsb9Z+;|!_&4D)T|ix9Ahj-$={vD{?e z;cslF`+b8hPj|UXXcAsMm>th9ODWWev%62PfzT@$V#26;f2ymHo}Hb&Dv&wRPeLZB zzv@PUZ@_4}zcdUn2kXM)4|$8Aw2fE?De(3L)s8 zw2s25*xra>%_imC3lLFNRdpX#tR(w&F3!8ObPRl3ur@sb{03hh_2mX6 zQQZtkn^vYTKn#oVZ5;}`w?5(F=-o|s`hO19%ZL8%JE^3?!0{YzhwB1|554=U>znC0 zyIn$x+9k94;mLT}JO0!DyW|^Dlq-*wWvxchKY_y*ttMz&q+jer!qca>kTPI8Abb-x zjMz7UD&W+f?rNfN1mnH@Qc^}l3J?^o<_c$0HPKC__}vNH5E&V1^Yt|>@T$$lu1>=r 
zW7GWnVOvL?#SEEK8A%bAqXD6^8#ZjvFY}CrTX_xer_}YyKuYQ-l}Cd^Lka4!E9AHb9I zKr0W}S@3N`G$#OPXTetj{tb?dpcFp8Fsr+F%NE+0 zj#EMFPs53xu*ot>`3~J=z*oznBKez9`L^Cl2ajhNWV?z;dv5zXxusv_=eLNAT*yZi zn2v&y*$Z6ap2YiaLS9WROR2=rGWX`+tHdXc;(Qj93$LFUqI26PCB=eD0r?hfE2xMF zE9H^5cy$gKPBBMh%vS&WIT!JVk`4q~5yZPF7w)Vy?ImJDpGB|wI3}h6T`wwmi$bi6 ziwh?2<#S1wuEIMN+LNt}kF!jN+Tc7=~b9#d6KCa%Geky$jx zj&_(8WPrb@{FPZfSxI(}-@#g5TJzOfaMZxx7mwA$x9gHR?xQ8f$?zPP0vRF&pl-99 zg4~53@EKY{u#67CDo7HR=*@%wQ67Wh#`&#AZAQxnS2!7UvC8YLgKD_6FdHM`K(aMx zl;db0iH~dF^H6rULl8qUxi0#|=i3x_fS;cl{wAd=qs?gS!T@6%)CbtfkVn^Wy$qxfn5{qCZOT@VQUeb`1f0-f53aUSio+X zw@2s9vh&(KZH?DDO8)5LQ!oHIH5#PbO1cfr*sXh>`P{n)<37ocpvAyUI1Sb9BEk$h zlChpLugIit;TNf`uTFeQJ&DAMiuozs$=Ufi+Cle!KMOF8F)%s#5p5$eQxgNW$GDvs zK{sdu-#0h!gfN4-ulLA8VgnabHGe@I!}!K=fJ5zE^B|@<&y9S1e5_K=zTib@+1SoG zJHIxMg_#4k&q35I5Gh-L>0uGcE+|Mr3qjNg4#5vlzuve^>;|CVGfsmgDIUln$8*_D zK>y9Vn^PsHm6V9+Fan3$!3hmUT2}$@wYRBYx9Z?scXy`_JavQ_KBW8F7w`Z>I1R83 zkdOh&H}hCjgv4tB_(ZHuzEwUCAMQZDK=gE-E>#NIdtB5=17 zf0m>Z=0^q8T(l7;#6et(QB$O8c&8}femXKQ1$cWa>UcQ|D(27-z74YvhaR_Os`E;~ zy$(OLVAw{+-MdeLVnqKf@M;pt1|q<~)Kn8-0CC=8j)Zsv!KFpadAM<>rZy*?t(8bq zJIO&iZkVFxL(941R~*iTrjz)}0RIUFz6DpyIO;3$kmH~rX%ejFzC70nN}TT^oxHc7<}1V$h%ePoQ65MijuFbRS> zx)7)yP%H#P24Kc}ScUx4k3kzFu0Kox!f#28;jsE4v(oXHDRgg!s=Rr}4j#yof~E8F z*0EShSnCpi1JKtaf%o)cR0vO0pO^7WotJnxEnA*do-M=p-^H{<1&}sS;bKhl+*9g^F)||A!Y$~bmdvj#U_RcTThHr5}pixTRi;^VPWCVWnOY5hd`jh zi#+>mzwkuJL_MvYWxx(oIJ(9hY80kSc8H>In32W~Zjms2zTAbUDbkNZ*Y|iHTq;q0 zspUO4@xDe$NlbuPKAs%4Wr3UPDf&<L`ri-pvEA+m{?2cO+x2!do{5lDvkCxBz1Go>ITSY_No@f?s+ z;d-P3a~Y?#XDh9}i^~uc*-wtWIQ4~~!!!y6-DSxvYXGAl>i%hXrn}!n>IYxFmL#6;KqsfT=c_mdSgD=D-%#>fP}P=Bfuuq z*43@;#hxQWt-E>iCb}jxaqnR(!UAH&tLwrJ5?Y9uDaf2S?7iqXq?qTNTJ6bM<<3;utbnu5wLh@?u3$q(G<(*jf#(4{J&APjtPQ_pmo>MIS zQo8Gg9B~pqqCFtgAp)GKI5S`;tt%!4r7vTe#^zi8xrz6C*xA+0&4)FTX5dH$5{D`G zF-%xUR45~Fii(Wk2|)k@f)VcEgj#q|u!0OK|IsD+T&TlF z6junoqD3DY)rn6cqv~0QV&LaI3_t^$M+!t9Doo||4h@mHG)&aL1J4BgAfk1aPr3jg zVg{Ati0xNV#L5d03m}h@aej#-6w@3;Z$R11%eQQQxeFa`N2us7A53QKTfcd-r>qI} 
z4zwbfwKAI%DiN-fN(aRS$1Dsd?;MlX>uEtkRxY9EDQ{1c|1j&grF0vR7sN9wFIa7m*>N1l5ebVlFl}IpP!23f?#gOb14WnHY;hah)+`hQ*hk;sE0pm33U z;$G6c0G|QE;d80SF;K>6pRX)8`(br(L&a4H`RFh~lK(7}QSUh_EVBQA;(S?7&b}j7 zS2iy!l&{YHzSpUvb4OLhXph>ubjwV3KUOwY~-lz*CQT_ z&a#W6oS3Q#ijQYUcycFA%NyK*JB|#Y;rSl2%MHUMWDK?u8#WDKY;0$z2JnV$DK#fU z{rDJ#dNa9fT%3P-*>FqKoOR6Q<9b4sZbR8O%TgB8Bxw+m$9DII~774sGyv z(OzBoX9qI`_G17TO7T}G#Dk@e66t5oWP2P-H&|YL7#0#zf%Lb4%0V1opgI$s_-vZ8%2wd2%4-F0J>1t)4W@hPE;!Zckm6xqvQ#TC1v%0kzEjKoNwsoor*$*r$ za+xqf9bgHpXo%>RQN@(@PPQ)OigVJu!hn3<7t&w z`i=@x1RzE`!rVwoJaXFV&yC?iNG&KYZM@a8+h;r5>yua?KiPcgE%n%ztY~r{NmP_S z4i)NLL!8VWJ3BkHe2SoXFuJCr(?bK-At6{1>_i9!=8|yz_1ydtE^YyuVmTnUv-P^+ z>7vXB4?gbUnGhg`iC>FY8B&sP>4JjEX!VB=ggt{5Q2WD5V?%=}&@liz%*%Eld1`R- z;F0F$;n{=|jrmG&^mNEJ@SizCzeQ_;238D07VpGaPwQaiB#cE84HO)AIkZG5c0F6w zv_^)8a1`&Npp$7-IHQsCLDUc%GmHpmhGG4Chv=K0ndyVnGmLSF8^f~8%gbas47AK( zZj)>UdSaN#h#*Wr3no%wrY4@EE%e-^8A8$nE046Wor<(2;M{@C2_)RVe}B~S#Kc4! zh7d^q2jSw&D>L0{yVmDhkcI^(9u6n*b{OSlXTO69a(dbx+XL4VL@4G>n>O)lrO&I* z^>w`-D{Go8=0%*7jR#Ea@9VpbI}Wh$#5;**1s)QVMs8@PsGk^9d~J0arr38VI>1pV z>gz#OA@{&-LlO!mpc)z*iOU>3d{pSQ|CGJp)VG4*8M)5P|=|S&}>ivas7oNuAqSZ1HTqSL&ssBLwBtp5D^j?SqmSm=y}QDizi`| zhNvs6A=Zh>LLlQ90$GrEF%LnmJHW4JxiH8>F6cp;B;%#HGJ>=g+-h+IDi9=ZQWa=4 zKMfgm^s+4+|1J~6--%0HF3CfS&bMKP2?tP?Bc%2P zVh0gj{qZ9W&<~+)5QPhi6Io*4FwqhW7Gj{c_ZB7f!?$nIATh1;qD92SDls9Ca`*W0 z<2Q~&{v9*%fuBqH7$&meKAr_C&BhfkXm?Sn$)5c9(Xff#{T&$YxJ4tmg%#34<9dv9 z{P)01LiEC9$Yo58;JD>&jKnMefG{p7*#U}X9Buya0|#^N^_wbcg0n1EK{)-ua1LGy zd%2m0#vBMP%0LAK`XIOm3ye5DJ5G`0!UIX<*NW{rChFlvMq~S5>z) zcp0ElJjP{5P%<%beDmdz=4d6LA=o-0fCdqn^Xk=)Am+$Qky|~G<8YY*-Yigs`y{Aw z+zf;rMM@#l*AUHoUk(oWViiqr*AYH2FFYkV*%9Oo4n->V14$*S>F~jW0eBLqaO6t} zFyqKON`*r}@946xYzYR-h*r1|qyjvj7*n@WYzNs%232@@H)Ao#Aa|D*?ZOn1_bfiL zg6dRRU0FT@uQ+xL?Yl;qrw3-ebjmzs$T$}yVtnwvhlBg2WfZiyhY|8hvR?{ za3uh2+Wrt9@f|a)7s~u>7zBZNuZ9Ggu};5)>3P?BO#0sSlsK%WGRbGlq)DElYk1&xW$OxwwwN zd_ra?^YY9Pl!SK1!xHlcXjpXq)tC|i&+Co0TFCf=@2k?7RTV2Mey~ayw2g($3{XE1 zlm@aIRu8D>E+*x%zV-VvAdNx0g1OZZ*B{;A#!Tsm;P~Z*vq-L%!f2o_%>>Zf6Il-_ 
zc!l6#4;M7Vdpk|f%^ko134jmWW2A3}`H5U#0IEKGZphd73VfwoaNLli;l0+WH!V!{5h1?cHSuh;>Nr0Nx9b_nAdD}p#b4IudZ;NdTscC9rP#$kwi8_?oB|aFTD9|JS0tvfu543*083yuK~kWoaEB1z<5I@|JLdSLV^c(E=m*dnSJ2>a zo_>IvSRTTQM24)0H4f8usAhvCl7Z|%2Mf8++XIMQ8n1ewyIuK!&1b6GTHJ~uzbE=) zPka>K${rJ*n4PHdrRBVW;t41@ghn;RLujGOJ6J3DV()>QCMCRpdVN3*xGL$n65Bh- zu80yNu-q8?A*NuWID#{*2IWxbxhw_xvl3mK-0D&Ygw|e6eTRf-==5Y<b5JXeuzN^UgcCyQw+eDVTMVBN zaHYljI~x#$rT-?69kuy<9GR2AQn(S(dDGI<)1y{ZK>@grE*rLTGY?A+4l0GyrwLC9 z#~JZ8LJP_jfEbE%V6`YQf_nsTCkl#mBS=E@6U4NJa@>-1mJyJ+8V0^INE8Z{GmfP8 zKwm)r6GiTiXlTIQa_}|n!kdA{YHn%SkI@|%P{_40(B{A$A3n-SF3!U<;W*vZ7ufU; zAhpXZ7|N~>V6g)NAMojQ)eW#lAP@%PL_k3&q9#I;7^cHO3!>mhfg zpeuWYphemKQx))6x)#~#Zk?!)5c*ih zKcj)e*)Do*MV#EIh8_#}8z|xm_$nx#glofz#I=D$`2zApBfS@QvXlH><*HzCUeXC^0w3 zN$w3{)xaPhWI!;@<=?+Ar|$m^>V5`k z`NYHSgiyKXvkmj3_fku27@7y@Avb$$928eWgm(AS6yCXU>40&9fUO`{NYzeCM&kG zBKT}@A$Zh{k?9GlsqPNde<0bq0H|IR`GNH>@874w-6B>LN0{&o9fc$J)*Lb+e};JE zTvoP@^$mY69WG5U5~>3wm7hz^acYvo2T6CSP^+M64vC6V#Igk_jXOi8!`y2-=x+#~ zteZ>$#oLV1651X6llj`%=;#GlxA2QzEUBxh_2Z5rd*s3e9WDt;UdrRk^r~F2(6_*h zB7}UzrS3y_Tvm?iW`oZbmQT91?0Hap)ynX?c$^G%wlS+WZbI`h{dBGM7KeYB-THgA zzMlT2N{RJed0ovX8)#`iwll4oeNG%R0|rdM$v5|@j85>X^<9Gwki^-Exsx`^*Z$ z(Jshj#OnZAqmaR2kgX5ZS7CfVayjr*(OH7p>oT&!*GBJ!t^Ht`}2W%iG|J+~m3`oDh>X9a#N%=qNa=B}F(42_C^i;YCwr^bn3oil>H=7NTo!~<4;QgFKe_WZXs@Vl$VE<>v-e+K z?iCiTS{W0`E7-Vsb5iw31GAb$*~z+XA2Zs|JnkR$_3QWg!yZ#)8~J$2^qZu24_gTc zTNJ^$sk=wKlk&vNaX;o(Yy*G=017y?YbNs_QwbtRcla8C*X^A2JS#JE>BFI%l$4oo z!xadI19dW} z0*_7**;on3oY`*I#lJSFMhphL(52z1x(~dTiG}42#00qMERl3!Bxk>lh^2971D&~x zI~wAj7T_cSzYD+tqVyw}1X=dvVgNK;M4{Kq0$9cdMn~WwYc?m|7)_HcpQ#^&-fx3k ztS?Ug#KgYHfN-$YK+SPQD)pP};PoIXjyRtT4j?(>m zSZYT_MIz7DU{o}I@?5Q|cZ9`!eD412-rpX4M(6y5>96m`(sJ&j%-^cE9=Rmr;r-={ z&&I{6cau+TX9gR63Qz(TAJ&^4&2v4pOM0JVmH^AQ3hME3pK!rYs^Px=jm5=sPp{Q& zsxenrVdyDw=GDy+l1w}IlB(MvgG=7ts);d9FIC6sY5&heOSKTy)-Q$BUb}byRpa}) zdKjiTySCk^pF}G^Lt2XpPf0i$Bp@CnFC?4L9;bB%obZp2H!V}WC?KDza|MDk!2R>@ zmq?lf&S{e>&lvPVQM0_2hnSo1tEINVWetT0dXm!3P9EP78}k 
zScAo2KHcf3_RyNO1T{yCV&y>u2re7m;=;nh;jaP^8?m?@N0CqThG+}4iGp>=kmL{~ zC7EYf^~OAUWSvim$Bm`}kX93$xP%0xkkI)MPJkq6d`m-{g`aG>od?NI18DRJfP{lB zAt9lYXASr27a&(CqB?~dp%4K`yG#1|15nyv8{XkdL7Vjh$6J>(AresCTcUviGDj+W zf_$P^MIJ$iWC~H)fdvM-P11Y85xj5)9uR2?86t1eRVFM2}&rvY!QL#iIK z-Rh7V2%x?V;a|i65AH{e<3mFOH98*DPR|UlBQZ~1{WiE1sP7R(xK2*B7FF}A3)_O_ z{XnK=5yy$Os$9fE<1eUajK5Gd*!OI48Lka4U>9JCJ0|#dbbXSR#NPF>;w*7hAt677 zsf>y?oHzjwB-~^xgM&4nU-&RRj^L43X^CU#baSi8Gp}9Je8Om}>Ehhi%<)KyiIK_I z!QMXcx?wv18A<+q3;$=AWpE(Bx4IlbcI;6tsyaHpjHQ{ zi(v`BQD4U%GO^A?tGvO-+7^6cS+O2I3>3*5aJloMOTky*>c$OEq7$2X;iXvsK|;NC zE9`{bxKxqRMJ@_3)l?H{AGRURTRu>e6>mYf_mC?oJSG$rE&7kwYk^MDwjvX|kpBReWCfD7gW7(x& zWTbpVT$FiK3*n`2AF{T2tSnQAWlFiMeyP?Kc)G~G^Hi$pLE8v=-$-Q&zpw8{6Bav> zz1COIa#z`YX|8r^7YC)y@Zs2o%ivW2au4AmAi>&H$ZCrXi7k0uQzIJ@>AmrwsOVEzF-iZ5T}vorz#@dYgfk== z7axfpK)VIOz+3E_46G9W$dLOLE9AG}P?$l1m=7J0&V=I#0&oEM07aoiX-UpHoEB{$ zUU9bpW|vLU%7fr8D4UUmg}1oSZ7UBi?;$Y5z#!DoQ8p?p)ODXkxDGlxB2@y0Mwmux z>-1;77C6ozlpTaB8;ZOF=z?(y7rHk`OJMydq2D6;WJo#$2RBkSK}y8?6x(45(9a~% z_t5d6PPgeu#XSbGwi5g(f#1{Qxsu?-5P7P}HT{!$XUPe;S@3Y40l|yCiYShq1m?gR zhj`cGMQdH{%>dt#^*A00ZuI|ojkw8BA*W$w0tdbTGa-sRD8sg%y0@hU5`5_D&fujm zR!@G^O1e7T|8ULlnQm>WnosYXSz|gbf1BHMjP-T?*DlCnR_%PXBd^%+&7C`pMN;yP`vLPcC9k!fsbU%=FY+6EqqyHk0079WB3Erza zKrle`F#q-g8^Vo*O)Kh&o$kQ{yjouzENjCnSToW14-J1ym^2?ee3+1^MmE|g7vR!J z03!kD-xqx>phz$b$mbn_-4eFN4+$rB!}*D$knNaz;lD3ccTgh%3v58>V0}FmU?{vB zH+X*#hxV<~BN{wDiM)lQ5y&YIlu^KXW^f&y0Ys}4pnXS#`hPhG?Xg0Fg?bQN!)xh9 zlAv%8vHy@qFa>dhydY5@2mpC6NZV3rM}())`^n2wz-nOh^Eezu;0zc*B!Qwkg)#sU zbI{D`W;uE*z|CXxG4Zy(VJNO|k|PU<5&w=E+L?W)A0_(d;VKJ9motTvx&R6rz{B;} zziOTO{(($BJNA6GzkJ!}yX$-o{pp5!`rqv|EO$>=)&4&1;*vD*+kc9lo&7z8iiks_ zAQEYKBcX!`0&!!PN(t|awWU2DGY#>#<>rb&z{0Mb0V0%S38R1}PjpY=1XPgiXO0{P zS>L%ENil*;!owAPThrO7hs{IV*6`C0vc@_`$8w z;Wi@u3yjIL^pLA^_Ik~8;NhUjJM%+KiqqVDa%sZn(!=cRyx*%{KcxFc7HpS<;;w-6 z%6-Op9-v8!cfT*Koao1QN=`$cY-P`Neb}wlD!bG*JtYXR1v z5T%x?19XX4LC{$N^s&9khr<5X=NH^K0wGZ2u(BFvJX3u0J8mnSIqygi#zf1+ zFBNaEmk8mMYHFLj1$G}1u0UlDfDotvw%j{}B$K$IG8ZfS7{CNpIKXfWx}f9%&|_*F 
z@B=+L3BXf`GBz?gd}G`M8|4tBDLCO9^$`nj2|y(N0K8W-8lBH72?P$jJ|2Ai8QhXs z@}k!-0k?vh75^2n>_GOqH!5IR3A!mo9K8WTxyZvuZ*hRUYZ!Qpt+#N0mcZGSjL%E# zw?BXCOsl?CQCX8r>Ud5abIsjd&hX2RI-eW>l0Hq36V_Uao15j*0@l2DBa{Nw4s;+v zcn%k=t(C!Y6Zjrrlo3NKOtknSDQ?+XeD?zbFXD^ABSgF`C;zQ>;V3~*-}d675>8KC z;1^-mhT0SdaQ4CzENN`m0(etDBz*E z5}2ji`%_!nblC=uHHVyeq|f` zE`EeIRZ3AOEM#t}!{F-$6%CEa*{-j{+|jghDhZxklKFe?ANun7RrLv&QGVn?;KdO4 zdH@_!^h}3&>fy-n(&4Z~|e7(eRCr@A1yD1Dpz!0M}x`eM;hW z0YO4?;30$oEQLCpxbbj8l2CZiXKp!(yQsdLCH|ZEV6TfmVU9T`dO zLfR?{PNa~|G{c{!ZO@NnlYW;suT5iO#eA#1`zJ(}kNL;M6rWoHkWUHXG7#PUy|oh` zL!UfRN60X+`c&K&vDD9P>qy|FlLJ%q`zF-{G?WDWQ?{b z#cMI*f&KJMi1JJE10M4;EN)nfu-i`o51X2vuKoITF?Dq7fddC(j(J=!vpKa-XBsC6va|3| zOQUo$@7H_(zeTwXPGXq^08C{I|8(>XNRLdw|{}(P=vga7O)utnd{HeS{rTuC5x_ zzxrOFM5TY+bq~cpa3_QF(NHU+QilX)4_dA5+fA#4B+b^wH*ROxD;e&qk>%^VN7&-h zK6wL)%ghvaDk^=M&0uh+obP4ZX;+fz=Dg6}CT_Z~=%qGyO=xoy%dXC0=10+z7ibh> zLNTTSwIvYak58+=Ls$uiSaq?xGXq2DbOY)H!j0pCh3ySI;~m6eK#>U8B+^ytj*WSP zf4#Aaxp4^k!hno0E?!0S3V>CJ;M?fuO?WQPBU}cCWE;_I@rvei*0QpxzYy#+G2!#lf z8M${!P7E}Qi3;-aJ+6S^aW*d_DglK6njuX*9^wSf)}MjTu)v{P9MXS91eN6Bh~uW> zGQwej_NW&1!2*ylU^!Dq{+vR^g=z;S6=c89voy~Q9N5dveFp&af2qZmXCaqBfQIj_ zTes3igP1SC6bKO;MC(Ql9w>%Mv^vpR)YbK$;QzjV-@d~)mtC`uo`(h%(n}~;Tnu~P zN^Cgbrza4JKT8UV&rk}&;`8ab9xZgl5Qg-y|JZ_qI9*J9yXwKNY$ngQ2uN??_?RzQ1^=eB}4sb6JIs z1PYrNL^0DGJm$&wspaNUw$^~~bSF6(p}cqq9RMKCcMvYa%xC>{oIJ+=Y~hfekohQR zqoD>g;ZC%KSOTp5KRl*4Su~+mfX=ofTcHNP1VJ^(eS-c8XVOJS$A|Ex5=T8`cEkx! 
zj27tBx(t~3r<<{;HHni3W{S{QZ5(KLBhMNrh_l9Q&A5aTZ-zJlaVwG701%#{;Vw`r zoJSuDUzuUS8ew2p9c^qHaH0gToI`2O4%V12|J&EEFYk-rsxRRGnJ12b4D`*2H%zj5 zDvfbNK&6Og1}8m`;`8?Q%WvoEj(RO76em?x$(8Ha+3l4sk^JQDE)PS(Kr8p}2r}VI z61HnEOfGn-v`zMUsp+w#O&zaJ-dFwlnQGx3$5AQ)YrRYwn<@d8-@PK~zoWVoKS(&z zFr3%c{rThvqFVt+-vdZPjBUuRwO$kLu)@ZbpaYcySqVDv{Xhgr4h4Dxq!8G2<#3?% zAc!E4DWbH4nAk8OUs;L=(*h6}{N#I7H6`dO&W{LD-WFb1N7K?`{W_&ksl6xd+Vehdxib2%JP z4G&;h<6nzVHUA;Ur}$Ctj!x=@(YZesmX@ue(vLs9mbdLKz1{IC!)JJ@@3z+bLZisX z@)G(3MMb{X|MY$0mt85IH09-?3aDy-h5=Bn;uDqoy|r%s|jU$%1+r z9Q2KjHZV+xx_@7p8ajish~I!*g=lp#g~HnZArw>>fCIcKIhk~BV3th!LEIa`>`?BJ zxSEs)wwEp`qSFpM?KCuu%A*ng!tLUnUEmolTWA}mSj0Q$pp<~52p1^n<*Z=kUQ@g3s8?Dxvby$X zoUw>I9t7!=-$eEB>&01%p}|(0LDj3aUE5`PW$*O0a+f+&(pY(3y+%1aySCL=OgDW` z;Y0pZQc4c*3Ph2>%C7@1 z8eW|KqS@oy-8=xJ{{(VC01H6Q&@;(n8$z_Mn55+d#~7j{{2>5?Ck-d18S^Lpiy&&t z5Q~Q0+mXH!AoCa!A@c3fd10IZnU(?JzjYfeZdDS~j15k(U05_SUoU4dtuia0QJ8KG#WDRA7Pm2?qC^!5z>N+cl;AP=*@j3T zAcG~48HmHpsMx*%H?Zqu-+DC~h0^!;Ydgi%V?`OPY7_z#{uek*4873ELQ@E2Zt7Pn zHyp=x=r$kW^g)YL*V7Y^x&lRO-TnkmozApXeI|fV5NME4X9#GBj9ra~2ZsruE`Ns% z({)%Scnd^ihC7P~!ikBhrC=}`?I@Jc&OJg03-}o*=p90mKxGCV$-_R-?&ADCf&^mf zKm(?J$YPJYX$UU4g=daa1v&jhhfIPUa;p{tpaLRNG@OzoDU^U@NcSRu6%Hhvdb*uy zrWC$%HCRh)A7*jGC_x4ZOafy1d%3xeIM?b0wO*SyVZaZFXA)n8Jo6{OipUT^oScMA z!;J`)0_gTeob>Tu!hhLdkvorm;zx42Z1_I)b+Q?gu-Cz0=noJSQR&V2ZJKywBS^5p ziMAf(vcL<)pU~1gsH$E|)U9nx&GV4(oagwrkff?Hq-86xf}sb>p$=^n#r6#H{_z%{ z57vBPakKh@QA|*FnImEp`JqI312rdk+W?~#AP>Qn4Tk?V2rlr21E`qr`lG){VHgVx zklA|g_Hs>&612b4Ljus(1r%EAP?x>1?GQkBO|l$F{4yXHN(ys-l4K2Gv38k^RYXM+ zD0-5|fSp6?5u%$ybO3-h;(r4Xjwy6MQES2u{vu0r{=b1fM1KsWnR@0?WWm4CBN7S} zFKSUjN$5JrE`{!n4D`#6EtA-R$_6b7iR8jn0|&FlgPckQ5_=`7MaW~o(tvVa0n0-NNd}351zY`a^kY7PZxi^>Jo|{QCQXAa12*ZaS0y3Cpz)Qv zdbU@qkYK${5LlA(2I3j0?u6IDBNq-W=s`r6fSPLt)<}|J$oMBTN+P>Lfm9O3EOcY= z6CvpjqeLMzS5oqW5(GkHr*|bx*Tk{Cx=W5%uiouR<(d$Jr3C9*eP=qz5C}NIU!#4~ zF2C*&R9J?bFLaCK=YQbPhcSm$b&Aog1pXRXfk={hvs(QLL1#9}!eE+=xY)2#pd%t_ zZWy#7i;sPUfh2;eE) zar3&yI2kFx=LBUFQ~=9I^@BdY8h$Epw=AQg0D#cbppkJbS_vsky(yvbm-m6g9~2xi 
z&iR<35#AF@by*f!OZZ%^pDCb0#oD`qtH60UZ~HWCk(dk_n3i)UvjdJMmGCw**FR`n zq0qw7Alf^%zY7@(C}VLHvzw)1JPRmq{C0>NiTwEd`APk#jS>3`cXgu8C5JHht%cFd zT+v2s?|W$G$V4#^;~*n2XM*J2BN={%wkr>(N({-Vf$B>;g^Pr~S78V{JJG7;@ajMLcmSy+>=TqAqvPc@8N!L+03lf%#G6PW!EsO$izu0k zgA4TOnkEBMiHO_$?%li0<(g$C8c>}y8?fYb2kODCEG5lQ~fb%#^c|mO~#sQby!R24_1|p(xSwF1(=uCr$)V znC=AK>IHCL*zgeB#J;o#PYK~V4sOBg)sSJ}*G6@_k7J3iE{=&{o8AL6h$#v5Vp~m* zfF}oBjEDu**-~6EL+~4V&VRpA2sJ2#nuLLSxJFxnE*&t@R;)%SP(iDqD*{LQUk-pT zG?LM6#r#{bJjfsrcXuLP2jf+X;|oGk($hl{^9cJ4`Yym zi%$R&E9IU~BrC{yfAYo#;Xoy|+TOIC{}^yJ_)2gf*d(7!^_5FP3MB*70&s?sk-kWt zUpnX3!=|aBL2w!HeNq3)Hc?PG!AB#}ndoJJM1o=bI*1sfpVvLt&m==xg`=4;%LDQ# zEEM2S$7aeAwI07@%z$eP^a;`>Qr&kqN-tzHfm@TaCce(HP>Wum_b#F zRshmF9P*G$Y8slo7bL|F)|2Y&qyL5%s{p{nD`Mw9m#tr&PlPXSh>!*wmg;^$L>2ab zTo;5tBK#*>`haFz{t#3a02ZK=;_vAHA6*2wIWO)VD48q{e{uPfitH1JZBXF}&raCw z1^SIsOlxccg>me}9~^3B7-@k!gJ27w77MIfxj?bwX>jk^vyOhT_2DilMyT)Ld{;xE zzy?h>Y!Vd21_qt+4)}_;3!5mz7JRnQnt_b&bN)W~1hqGSyM=`PmH|&GK&l)Nk%V>` zk!J_?&*>5QDjAgp&n9~2K(fA3~9K@Vum|SzF+|A3=w3B&??aXSa-IR}6*DyE>wS!@B|H z2+=mcZ-52OBE7U)acCS(EGJpP^?t_dElOR%IF2 zxaZs#E7BEUh9h1zNeB{d`L%6dsB(NJvk`=mR?l22GYN-@#sV@+6$sxYy1s^@hnhq{(R%{=h8ags%+|z&1~xzUY*;+X-&txZu?22A1dn>Z$-|HgLpCE z$?rzJ{9;qW9N_QM`_LED$OZ&=LH4|Y2R8?^SuKL|(6+0elZO@)Tj^8V`!n7v z%mGhuX2&7~>Hhm9E)jy;V*9iIpn-T42cR!t5>ypP+eVaQZc!z4K77rAG936YK;z!- zLBVGMsV;=QT2LB5+V}t(X*gBf%wL>>4*`1&T|ia(??pypr2$x}zI#;8*b>Y$shfZP zlmqOI9udJ$#fWQdd0xTL?b{i;~9-*xxOCP5$ ziQUJSf%Xt%U6HNZLQH>p;g}Ge48(Ehtu3zxBe!RTl0lC zuqh6MIW3IZ!N!I^DH?hatpNH){DQVmmW>vNi6R*I9f}HI3Z!4b1ueRSp0)<={P6X< zd~Y{d+jc9MiWt#l!rO_)fjkHtjONzX^-xzK|B3-68a_Ip71Ar>F?ucxMvDyQ!L*JqoL{{0659Tm|_6jlaP1Vtw2%|C>qq*t;uCk=nr5%Ksd@^ z6j~@!2cW&h`#p{$tsfj)8p;zQKFKxlbHu0>)NrDT8TEkx$zW=7XE0VkmMi#MxN=C_ z0jLz)ii{(`V^_<1ENc@6M*=#P9mhRO(>E*!YdsQhqYzrR?$$aLd0E{T5Im4fV<@iB zfnYiKg1<#cL0ml8I?&4!co`3wH_9O+>BrhyyrSkwN*Sq^FFF6dfBE|L_>smmdJLI@ z-Y=B8qV_-iEFdDF%pct^AC;2&fh0qIYQEfSzZ{R?m4a49am)*!0l1AYkN_x4&`E*P zzK^Ft)DSo-FsSewZd)9G3AjvW5W+@cIq@$fod>888&gwTy8%@bijcE76=8dUvtvC( 
z#Aqv46GTK$?7q3iaj9i>Z9^(!XO1s2kDmPpWCQ9Y<|&ZzWs;{3Oq)QfyF=8Pp7cbx z*hr~7_uCK!0BAiqu23hq5Vi}QO$-gr1waY-Fx5RCI|Avq+9=KQt1K~19fb?&r2#Xw zL)8uCmLxJ-P*FWYnMuSxXs{6z;)5wdWWfNMakG6gSpEPtl&EL~^p&Bg3;_T@y-3(2 z^n)zOW&mA2nZRxOQ@*B=gu-F`f$?j7R74_hNqqtXh{B`5>4zMU`nUMSWIid5RIDBm zI8hlEiiCm~ksH63twO+xQZhQg9fg0%x+ zFV*VE*_HWoGMHtei1qgcY4eaC6VKKECAvX`ToI^g*X`!DTobVee}uw zh^-0%OK2NGcoVr1*|*4<^G`l1Eqw%FI1Xrt+dW5K^5gxHNrj|@Ko$_9Scs1UvEhH$ zdTA~}l}R25fj)7HAGZIT$LPPVVNh;Nw~69{HJ`;L8mU^%y^HUj(NeKYOu*j-)cUv+r(C@P( z86+?`mEm5&83nU(J4ReWIzc46KprmO%7wIjrd4ToWCNSX1W^qUS0`kAgr5eLyw3cB zgpABC)Ha8X75pb@#UJ~LiU;@!ZOrMnD7Jy`X1wJT*ws%2Fo?K z05y`u1bQ3_HF!azpl;LwA@XUxOXgV@I#>Vs{hJM^%Ox)rFR2Y1 zfB5GM)HmFgy|b$KadBYoHVN{A5(Uw+dxp{iD-J?LM3AfS9g_oWT@c*hK7=5$sQzUe z7n(PiJ+W`%oLBm}b#7w#P$ED+yoQZm`_BR$0I+SV^X~nH2hmpOU(Uj+jN0@PkPBR; z*57wSlno%GaZ$9_4NEPsNDu!@rpSnP!HclPX|N*1N%5`QfmK*{fO;`tt*WWJkkHYn?baH+e~hG@wtb63t6e>qxx z;$D{G)3o*TUhy(2m#$llf9QGQ(Ofy~*?dL@W#7DRH)W(6kZY6=Yc`JPssX6Rh&#kC ztbP5eiJnPQW{Lu<3w6*?>IE{FJ|44<*%))~Fnm6dBo8(jn>6|d752q{{ ze_I*w8lQ(-NmA!|vT??GBX}w``J{?&@N7E1Mn5{n3^>R&^X# z0q{DcZ0WYEvr-ALRGN|*coSb5_TNqv508v|1Ki@hU2f%D;AyVElKc?Xw*TnSugkH3 zaL)ie_pr6paPo%ksJb1H25tv~c6A0S+Xtv6h>7okB6l@1+97)e+(5O}W!wW4!OWuApxTcfZM{F05e>=44&Mw9*eeN{|hV{9X*PnE;OM3pxqc>#y7H$9i zB@22a209^nU9NrM{^-`HF}773=K|itNC@ePiJy3G= zB=K5DS!6x}<45A1$i8vG9YLF-MEqQa6E!#<9XdsKDO0TAlqK{iAy zaRa2sf&92}KNUrXENs9i42R2x<11w>Z_d=pyLzH)v`)-SG2g z+AFSaYWcma-G4Z~u{zVmv_J28hb^&2;+r>h|8+m6m=cPL$H4UuTvi`oPqSUD`W`{c zdcNkR*1okU~<4jvl5 zgp~?+?`vyOf`$R_pYWfiDp7-)4e8kzVFK#fT3{d$N!Ve?j}yDb+)&ipz5L>1{1l#Z zurVRvJJIS3-gVy@q=@fHbfh46fGYG9W$Um&Q_6r}X_er0-R$OO${S=N4nFFs3|-<1 z07U*_39lX3B^f)2Y(!iq3ac|-@)weSpIzXrO1~b3BuoYq*&r@tpL+ z=Dy#C{^gN!&-?P=-zGlN0-Y%ROkM<>{QP<0^sj0F|KW6|^Ad`e~%|Pr(0Mx_R zMspX@Zo6V=hm z7}dAiiCoJCWSH02*CW6>l6Ztko(bp;wD}8AkQ`zULI!&QQe%4DhB>f14cbfQg82^| z7(}_83T%aCpBHwTI^_D)Lc7ayNkH`@DVKe?lrKb%aqfi;6{3x8P+OyLTmZvGrj#Ql 
zXfuRmBvlZ}cVxB-@B-4|eZSHBrnU8jQFj>_7tBC91pptB?@u(|q4HlqoGPT_7YWdBG5a31E6%iFh28I&|3>YJ-3b>2!Ks!#R+2L33K+p*CgPqkSjJ2o- z2I>V9nqq4TZ6t0O6&al^*o09K4v&WphfJ#^(@Ri>lG(h-+1!Kr4bx=rKx0MbO~6>-hLahaxP@aU zWG)A<3pq)0Ujfb9wX%Uu&*2DonMc386pDmib>QW zswq%?LvYTYM;;0aWIcCd9u+dw7h!ek5jHPR@u<*wABq`}aaC)~T$rpEiw{!N5~!*D zIq~O)PXkXN>xcLQcD0qOM=KWg{adPI(p+A7={;IAGV8REf7M{P@*FNoG8Ywl2iGqN zM#a7(0pMi*4IoGP%nr!>50ioti3J4sNAa9}P|7tIqX%E1njt6y9@!;7y)Yi~%?Co2 zF0+S{ynpQJ*0EU(Vf*x#m-@0?SvDgcDQX*xwG%^SZV3Y%krd!&L6}A}x$3ze%E$rQ z&#joIA@mt>!n7%>gh}EI!)vI0_s)-e#{T{~yozRYf{&2;3=Id?Lfi7FeC0_k4Q*`( zqLG3coeYC!W@SAB=Lv~l!-)1KV2sol$%(R=p!BHx3AcdKdf)FMj}Yf2g`|)_G2BXh zWy9y+(UrNXD%a2U3aU8Aj}|)|ypgkO(U{4kKaV{~5qsO1BqQO-ANgJ`0iH`Xj0XyY z*T9z_VKxbPPD_MZm9TKqv$B$yUC3NB+=?e~fs={K5Yjrsori+!7ydLz7X;F_gB8HU z6Q_4{b|!IgTVyWWO|_?9TrfY#h9sKzVhTQ#KJWX0S*GEU3{A%1mrWw=svi|3o`eRj z2Wzyr_)!?{5RU%y%J0vI6oYpejyRAZd1$?aOe_A~rW-uOwo2z+rdR#_{H+LPLu$ts zPPb{nLk#Y75!~X<3DE{e*ZyrZIX4)oB_d*IpLlbn+b+R$TlJc--@@@lzAc<4?Kgfe z;+YoUromY_HYlujpzKlg5zm*)feBc~C!3`w=vi2g6F^5tz6T)dXp;jl#I5@J`;p!l#8(ss6Vn^N;ak2nWuzG!;LF~I{%>l-pf@A+IEb4!t zXMaDU@K~a**4LRow$dYaB2SeuPKIB)qA{u=)J#mlH}p&x56a#*SXIPE0iD*XrlzKf zu%n2|#T#irHA;FJ0H`msW6E(0gEhyXFEUyR+!dKYjgIFokP|>NkdwxodPbUX$kRb_ zmH)VcT0lWTfi38*-2>+*8seMQm)e>{Mn!R{==gdv*R1&Rs9-N3naiiL(yJ(b2_7mL zC}N2u4OMeHBF@pYM5_hV2# zr6SgEo2TUGss4lBRm)Ut5*qqj=chsq6lJe#z)Z%`^2^$A@bdSgdp(K7tn=ZqqdwYKVz;c_{(!h?>L5g%={{lJ`pg;O#+2wEg@1@_L_i8@f#5nUbnaPX;?CUPc{hKzrzI@#GncDi}I+mM1XPn$i!o%`u;niJ(;t35rEl5hq-}tq3sy8^Dr(q!>TDXBO z^Wj4o<5Fve(+R@4#j1>$Lup)GPnWL4FCCU!xv8cmzs^vj;8LFJi*NV5gWnA@&zE=$^>s6UM`chb99 zT3VLK)NRR6Z$Kb1R){4Tpsm^bti)J9d|d>`D(SdK zq|dlFmM&e>OS^N#3JsveJFT=k!sje*C;zf`{k2!}`spY`1HOTeCaAp2i%s;;9oJY} zbq@ZOtbiAlpHl86UGa~r(4K$o+k>6zoPv@0rxv}}Hjj)cZvcZbV^= zqekyfmv(T?`SZBZhz<>EP$G1(8y`a%RSWA1ut`bKy?TsqMbhh$QwQ3}t|*}k`x>MKe$rev>nd00#6%a5W(|VWUi}2Lky+BsWoj=Ql6K&48 z=@sr$2{XH~Tq=}Ycq7-|;OZau6`3=dnkbEArm2<}1uK?v?2R$$!2X@*Y@|@mr;3H% zz;!p|AekHK$m~*Qy>wZTQmsm+a86&&5?Yzok*g&|ukA2c2tph{vxlSuL9kpxN+8fz 
zP)=wz0;J+zS&Kqxbi;tU=gO7Dmi9)qod#B1ji;RD>wOXhAiqupZ9qm6Y4O2eun#%_ zaB?VkTWyOTo!*C%0V)=9t*%`U=XAV{I~yRMo91xNv!^FjsCo7`76f@OuYY)KpY-!I z&BkWa_Ggo$nbTcM4f_ZERz}*V4XX0!hdz|)ug_J$cs%Nx%i8gkp3^FB#+E;S@vWVH zJlDH)?AV~m=i3|Y7KM8`X?$y!cZnqM!02}X=sl`q$m0dFon01upkeEj@Kd`@}xSU7zHDhKV%4mi(BeX{exZvr`X!u)c~!1Kut zOGq#~a6uHHBG|5(dhUn!J*{gvNYxIw8ao9i@doJkNun4zn-x=PAl$))NK+i3m_mG1 zV&DHRG2%{d3VP^Q(&OVTT)J%+pVIY}*=Uoc!nod>;SO=1&l;M{y?8hN`aXPZG?)1jKG##DfHiCV1{m>-mGEq$C7WdrAK$hlt(9YjHCI{Go3iL>&m<{mNzd z{oYDyYG42eXtIZA$mP*qf_e$B9 zT(gt1H*j*n)kr)9C>Vwu{+D6XhocO&o7A<@2)ABBT;=C;_34MZWa56eqKn429R^f$ z9@G!K@_B$1cHXqYl0&&cd_~L7Aj#pP|o89Ca^S;d`TrF%D#1v z-!-tHG;c;!Ut5GZB!1L-bXcKSZS_xt_%lW_aI)|=R(Cmu^F9Md{9DO7>ow&rr#5K<|=;MIt5!j=oNENiWGH7O^=0G5* z3~*F3`w6!moQ*bE=m71&9}?dPnR$Tx0H}=1`e|jHDPCT`mOOQ0R1V^hzW)9*DYcwF zOVQv42yhMJlL(9;3?UQ8OM2XCaXaFX5xF>F(9xh_1EZkU&2Xr1Xn=&VOyu1*I|8C=4>_-MIpgE+Ry&QMd`A&v|z%zp!pO0g+6q`1YC|{(9 zdA3_aA}kBu>R<2L>N_;`d~ZXRB5-~}9a)RkkW9A3qW};Q24;< z&|O0wSC5t^v%&#vQB7~}?iTNNv(&c7qD30RbDAF4-KF+^D7~-{6%na7@xTT+85!4U z<~7TL{tRV=FK9HZ=7-6;0wX<@(j*)L_A|8MJeNIk?_x?Y#Ce3;LqiUJ^gaZs zqznye3~w2~RjA7}>a(S9LQ@QBD?leY|5mboOCuanSQsNI)B$>8*xv=zFNl`L1>-vW zNA(;F8^!*b>09x2Z^ z-_HpT>%`YdT}h5RHPvF2rspfCt*xcvWQVSUe69wIpZG>fY3)eddkRA8&)NU#+NSZur_$pIT%Yb|_JP+W`hW23Vvd!w+Yy8O zV5dZLTy!sZ=boEO4rG92)xyxu|M>)p)49)4?>ZsuH?@hfVId_&sW-OiOi7OYZr*+G zTQA-SH*gnmZenUdZZ6SK;r^e3su5YYvQAVUSs^#*rT}ii}Oiy)}GT&QGa5KiS{pbm^?harW57-X*K#&oN@p{O=G;Cn# z!!LM-pNprA&_**}%Nxi8%XGM=7iJOj_n5~3ee`bch~3ZL1veDjLW6$fJ%_a1M2~mo zQq&Rm!Z-=kppDN@Sa18`!2oqO=svgKFW&J93L+n(%j9_-&O+R9QD_r^wop+~y?VLn z(k84ZsD;+!2qzfz8;$t9J()hAKmIOX83 z-v~d9ZOoq2P!M5y!|Bql3p~^TfrD409&9n(DVrJ=X_lrQzTc+E$j$g_VO;Zc&1Nas zsr5XJAB)+&Mb{4h?rRwG<-XoiO}&6AdYHBuP+Kd95SzW=hUq7*37Oy>K_D|^EqY{nn`F~fO0b7Xl0SX?aS zN3zfm@cIELC&6bYjGQzjGq-I5dMCmBI`cGcFCwHzkDne&y{D$Bb;ROOb$gnn<>IYd zYjjhE#Zmhm8x=a37*Css6=ZT08F`)B%WG!VYs4smQ$4>!a!{(h*J3Zf-FUlZk2HJ9VF|7A5YLsOxY6laT8Z4p z{G})O_-;8?g)Y5%?C9f%aksC0DD&BLZ1{H_7oy!qN!d`=I4@Ua?Jy7=zK2Zn)5*x1 
zrLzk&kp3jHoc%32E>1|_fO1|8bsW^`yEr%mXG`~mn+Od3(w20kU;G<7qwPn>lC0qK z^$jow?7I{+ocNsNO@Mp0duh0#h6D2>P6Bqr*`Tj3F<6zBb=s8odl{3d_c^-$N(S%c zy_!A8mxTWQF>cRh)!iIMap;iq)i3GV`A!yYed^`c>32|@dF~B7U2L~Gw^G0TK-GL% zZoyo5_}+c3G=kC9*(0c!8G81V=7L{%boyEO^|L{WQl1Jap;ntUpM<&a(a;WVzM83@ zUnfBSX&(mJlG>Kc)}epK@QoGV$uv$@a%Iu11gXBYUN+{_N)20E8`>$gM((Hw2%@oM zkPR`*>}qUQvEg5y@YbrX%32-!bFSThJ6-s_OPEq{L`vi8F`E;%5fyEtf4pX>x7B1{ zF?%>fN!e$o>T`J8%qJzBKlDQM-@0(dB(_?Q3zP^*8#lj^a$__x88GaBdCgMq6hkvt zdJ}{%M3SkUrsLE*2{tIXeZ+M%BkgkL;X8ecpx~P(4j2zi+j=d@c~;m9D9XReGMw%m z)MHut8%pWY);{>PFO0`;WpSbA)$OhTYMOB~Vbh2@&;9CQV~ML>1|aZEud(QI&KK1^ zZ;P6|e}$IMj3N=#0|d$d12ZTHln?wt38xq6mD+Q??+PAoSSVg1fOzpZ+c$mfadxtKbpFD4P9!9bFkCiG5muij$*X+^!Id*kg{+;HD z2Cnd9?w_tUatcA7fRYaT1?T4FMa4=}aQ@Z&KRoh3mKLTgbriX0>th6%Z^+R0Z(T@^ zMMYZI?f$TV!m7ER*JjhUdua*Ve^mK+JVYWss65oROm8du;`c-QP0T_#R`p_@FW&9!*1M2VMZMJc%Wj@JdlkDa&Mfe0m?;94+_-Z(uXK(1Y#&fq?0-3&CDVU{lCu z`+{rFdoDBo6T2q9W(M5E#wMdIC+nnE2iD0!3_*zZ7?+9_cL->;hEVbzYAeKDYoJzi z1uiZimU|Q_kRTm;|6bb{F2o)Zy??EzKq+TyXMXCVL)*U{g&xELt6j)2p8{|+zBvF9 zh`!dRv?}9-W|MI}2#K8DS2}QFVB-*S^XTk37F~8cYMfz2_zRbs&3_*po0~W&pf#~X ztjP`gOp}MWE^zT@W0r@837TlN$a3@$Z4w_83>uzhLW#)MymK$}!0EAn6mwWv+MBHS`5Q%a_ zG%0~DkXK8z=8?|L`T2+8N=0~A!&bAjuEsZ+vN!(~Tkq|*p-3EKN+i>T^D~v*aBUS^mrPfOG3H zkN0$#$){{r#tC|lm`*^{v{xKJ2S*%21PsN*VIqNrrj~ZQQ2NM_KCOz%LC1KZ(%Fxv z}k=x3ZO0qnuK&*XI7W4P|!*W#J5$mGcow6m@yF zYkDWc-pn}!c0CRU$c}yr+;w)I7k)3UJTf2$Z3TYJ&mf5&GU2^&)v9&YgZYlEu-V_1 zSwR(_xLr~|?OVB|#8h=pWggRgN(qC08wUUZQ$yB^%~;4&+zY$;30Fd2#}97 zY$J0sT%HhX4NbCQ7n2==Hs5hc=3c41+~GMU`aLlvkSfy-D$5pyU@O3}kP~C=|KI_c zM>!;83vm$^^mzKiHCE_-5J6;G@csO{Y<all?_;YUO8z)C*^JnN8);m^Fzj-tH#VVh^uX_06#*Kr224@eRSkaZt zERi!b+_ARQyJr0*pO=(rc_J9WjdSOMhu-RE^r843P)};LH&xkf+4OJKqby-yxUu{6 zzA}qekBNq|=YiW1zAE;;&11Jp*#B^GQ>Iksa$UAB>id|m=OQ8_^;}zl%%b+qig=4z zF?q-kH+3ILr8yF{4(&FC{4OiX2<#y8R=@|2t{=+D;NcV8L*4@-hF6p z9zsJK5fSli@*18Gv@?32pS8XUV9C3*6|yPy$F)aV?#py|a6kly{!&?4xpeGy-*qIy z_d(5Z0orI>9j_|Rh4CO$U2s4O+O%y?G?d`s*!pnC&Yg{DbT2P0*&bvgLQX<-quuFn 
z-%X*{g6bGaXIXPIiV6x-W4YDSZE7Otm|igM-i>UnY$>@9h{MEH(ECw8l*b=+Kyq^O z<+82hGj7_Xzf{!4lbV_zPEt|ly7QAKKExcn%J*-vN76D?f!4Zvt|#{X0nh;8gF)f~ z%EgSp27DIR!AOG~f0+k?f$+}P*=OR3Ao%Ti-5bF>H7C!W{j8oaE9MU=fg`}Z-yt^_ zGfX&;v8xg8{zuHvGgE`ZocsF$_sV!!&&ibt7B;*5X0U;s;p6Fw4kb>vyV&UT+qYkD zWcA|C%F4n}hU}1G?bwv0*O{DyEpR;ZK!Sk%)8W?O!!DPk)rV;}ett?t&wW816Pptr zq8kwB%Tf_zm%P4VM$3s4)sKe`E1;0T)2Hc9P9BIgr)8}@qv`KAx#9yz0$mGq?Rs{* zMG_+Cr>4xYaY_oGTO5wT!jtd}3Jj!$ownrKfx(|%umzKTwd49WiViZl2@9>my&0tp znIQo{1hbEJ;4*%hI{*Eb6h@Z9QMuAW^=OPkkej>zHoeJVcFU=1mthRxwNy%0+9qYt z(LS@vSwOz%{5<^8sb8DT!|} zvO{-w{V}U}_$2UCTX49Pc`oFCQGR&;K1l07FEf?EiNKRyVo?fs){IGRJL`VGwZ#Q$ z+0-R!q_W{@!E4(wnimht49XX1zVR1sV`#FyCD6M+_WzaQ$?vYKxWAQse zaI!cI0}ZzXNrnTKWY7RVsc&jh$nA20@enxT&F}L)=?`JMgDm5@3B#%epMcM#(-)~H zl2a2QQ+rSK)ibaiT!(a=5&h1a5JEzYK&E}*tU<|(C~f)NE+hgUj5$5@;wEGRxaSyf zQjwv=va&Vjo|GGLbYe|Goj_*qtb<31@NUQ~d#3Ru0M8L$o+yIhkulBPfBg7ru-pZm zf@~?L4V*jox9kI-?7yrh8wpjcFSIj6J`cg2eAvLQQJ|)n^?V;u6K>~8qBlxrxPy1n z(U$~3%!cbvPV$m<`f|u>-^L$VGy;Ck=DDa+k;VQY$sYG&|BH8=@GRa!X*V;5k>QXc zl-Q^D(jmm)7)pbG4qY7EsV>WFXwV_0&OVd4dSQ`6me&V?3F4DtWhLP^8fn~n_a^Do z-GEw@%nDQQ4JdzmKBU8Iej?hSRQQ~anRg_|-KRWGP9Ys0R?V>4%j#w8kK9qgl>&Z` zXfHueoTPIp$;<1*sds7a_ZJ*!fa~wn!0x64ufr)E5UKZ1=}U}tHkiLU=3Z|;w7Tj& zQBZi-w9@vaZ&CELZY#<|Vn>F4e`=bYm6dB0!d`FyNrSa`T1*dE`MTQ8j7 zvCiWYbt&4jCH9LALgsz(il1E>cUW!I#|8K!RL2Af706k5FXk|=@i!Pi7EDW}YkK0m zCTMUF^+O7Z8AFeWinSfYev!ZMnBm{6GC$@+{tbC+0R}KBnoJQzCkgbA(kAVvm^q>D7LE)-`yFORmz8HGwxa+;s`Rd^sy7?!=N+H8S-IA*b&Cpuf&ua>E#@t#_~qiDpNR~L3} z*)qDJT?BxpsL-et`=c3z_V7mkA%U6ut1D{USzzxF*RhkH-(Ix!{DALY#v66730YXu z@s+i;=lCB_Vru93{C+x+6D5Afdc;EM+>7LdDL^3KrQj1ctWYvw%;IlHt(CXBdihSR z%qfHvRo%RErE;fffTu-#tS1My4EM(F4BCmc+k!S1Yx3vn>Kjmgbgwecf>Q@u;jWI8 zdL$Gj78Dd1kdDivT#jPdZh8XGS=D5bNU!K@WZdA;wI4K~QR(^~|7v)-%ak#L=k)jz z7GpA{8FqmyaW_|OK5mehm^cT!#{z9DOjRHg=@f0EC8Qpj&wah_$U+x{BMb-^EXeJ7 zT@6y&?aTT`n&A(ANQCPBuA^|k&P6$jdo}>2>}OOeHfHL72XS2+lw~wgv@*pepd0CE zH)F=ZpWk^Xu)A+jHpUN_mH~4B)Kr=ID|hyll9u;^?E^hOrB7V1;RcwK_{Z0#43Qxe 
zy2ScQn-u1&79Ci&-s*FP`scfS&%UmF_xSqE-`%@UIa~ycc9{E*s!S3pSWWl#sfvt8 zwKldVxCnrD-jlh}W<(go_e>Yh2fFknDp4speSPz;Sr2Ypp`b?q%HHJUbwMTOO%(^h zym}Xxsy+y~@o9^71rkb4KN&*0c}X%h!8EG^Xpj z8S`ttc+S;t5B=p3v2Bi$VqUkwgTIIB4`y9(>}=IC4SJc8WZN72UXn%1+hYeO`3H2> zi#17*T&InmLvD=Gl?u*vArWLxU!=_zB>mE%BicK7Ye8Q!nnvSzHJJ|}c z`|2UHeP-0Iij9klx!ERRci-DRYe%iVR8YcI3i0RsJ9zTvBJ=zfJ2=gGm}rg`4M4uF z+;@A(*irY*b8k=PG9oBANYh#;aued$E>=B0eQcT({k{x(3!2_~#+T^H69RjkI`8Bg z`}ybVT}8!R^GusJX&KtIwoR?~=x(MhyY!yAe^^~)=WWZJrsu>(t5JV)nTe_d(?oVo zj!>lO>GcpY%eVvD*}cQRwMXTQdhF@yeU%6QuAXs@up3;sfLbYEZg&zRhBS(*_Tu5j zdbiHbC^mQ8%noCX%szZ9J-vWQahRXROinh_h%}IhbXfLATf9g69(Ni~eAOvC?Ut6! zIS|DkQGI-dc5PFgHYH^@j(KL)gr%lv44%>KY-0K~QLNQw(8MX4oZ5{~ zc0X>?@%}44>kAhK?cLd2~h^l@$*&6+(t+n5^t84%1e^On7XVq!-Yh|h1twn3@hag+cvSk5U`O{up zX@^en;i_l)GuK$fo>FpF+GiRc<6fkY@@L4C6A6>FXB}#KWQtALfZgzah^VBLhk3{9 zAoH9ZHhyVHJ5hVn`TGk{8pDuDav*`T_rnK^RAlkXg$y7@G>?L%9Z2i~ta<|f%-VCx zIv`;4mRPZIdJs7K-P_*CN+4l4j}VV_n_9U9xwuh*36 zGkOg5DK1{s@8^y-rHNUqo{v7gugk*|n)x9=OWR$VHTwC5su3rrZSr)#{Oj_v7bR!A z*1XSj^SN~CQDi_$VEzf84_a?aGc&5I0!NpVGhZ~y&2CGhvnSt|_sqNZWKqe5)fJ6S zPsw~3oVua(&6Ak!l`T{bJhZaVTs}Rjq`L0=oruYny6p`VZ|@GM&U*6xYUPa@v&p+V z_x$_de(u>dF4HD^&dOy-+F(;}m8j(%U z+ZuIF4OblT%UosrRdqst#ZZG(Lu}g^b?>{kB*dea?fLD1Q->%9PHwuEwwC;v34w6hUIe7y6E-NdG2^XS! 
z>^!tlkmbx#8j(Th18_gcteMG!BI-v(R8@LD@IzEMy7po2y>8*VYWRE#KGwsA;2fXHVRbBOfllUtRT6+jMQOI3%vt*7}S5 ze_gpaL*>n|6~=3qwP{lt^lXF&DVB>0EbZ^FUKeb2t9P%GM}?!dlw7@AJ$bZANnW!q zxg!2KYzc+)XYqi3LlAsb%k4!?e^XKe{SEh z<%5D&_Zv^+R{iy;p6q6~w12H)?V!?DjPdnU?xCuhzN5{GcmACNHzka}*0EV56d>Ht zCujanqhu1jBBCJy+p5jp6ZwVK(#7vUa8ys-VeaQ`dvnhID<5i80N1Ug$LV?ipNcEG zk@L1o<6LK)D`<9n@(yihLmY$lC&xgbOl53<3O4{v$acS==3_P&ZPM=9v+2plGh$41 zl$GY>>driJcBRw1+==#K7BQudo_0UBzk6V>-lcW>H+;JjQ@?9h&s{2R>+k%q>1}%_ zraZb_la%1PHqEWc*`ljVSmU<4 zXnWUi@qokyeuXv}yEpr))m?e$*M3^gx-p+Mrdw)_>~EJ8c+BwDErK9^rOdkc`DJs{ zz5pU%f#Ry66Qr4w!96U)cyqP8H*RbJ9;v$e_xD~2EuM4DD|DH=-?a3c^NESZan{o< zE&KhFQ50Xl=S7~%tLPUpDcQrsG!O$8v+6pT1|mv`)_AeZAVdsrefzdRP{rWA(>r@)Da@sm8^DexTz>ts~Q%>KYM@B5|x;>s2l97L=fzW+q>7`{~Uk5&H*}KNZ zZ(RP2oxAq=bsIU-s9pb_P4{#TXkK^Quhjqi{X_HTXN^}27`=f7Jmt!|;3L_$cFi9Z zw!KOF!RGd1Uq64Z3{*y6{7ElzX7ml~9|^8UM|!5&W+|HmrkwDs8?fZLPVFl;$d@-G zruwv*6_L`StfXWPvOJ>i8qRm#mpO*=g>H%N+j>KX{jcYK|L*j|ZGGK<4~0F;rY0{{ zJ~=LT(?T31q#XdKX)wc3wDIT8$>2O(Q^@Hp5MJA_FK$T8hCGV}r2`05#Q5I2V@KN! 
zYqmu1P*RDZOa=O(B{OjUx~MuU%ExD{&!Nc2i-a5DCQF|ERENun6(4x;AX<(SRQm+- zgiyOR04}3x!{C}h>=a(Y9IiP#hg-Q7nZts1UfHF7T(f6S(LOH)k_NDDbsvy%KI;0{JGwu+s;;(aM*B|Q(GrEu z9>m**ysj%vI2BV6nD;abi|Z(Wt^=!Y<1-=vkrLy(wMk%rbxbMXo%DECoBR$9x@&ZJ z?ATRJ5R|T3AAhpj>USDto0h#$`dE;zmvE>47>!*@n^U@!=x1q9EOC0b`q3%B5BHRu z2T5)bKs$me4b#>jxpNpIPfThHF9@Yfs8txab<09wxcw1!j3_iD52(~mzA+KBDCQrz7}_%SkMu$uG1QO zlvEll25M@}^UsWqAEndMQkzWytguxQ0tFg-@!|*m_Yfx864xnl8OX>01Ui#c!BM0F z##%z)XKIDP04xqb<{?M6WsOM)wlQ{cDt!epTBALCc4_X@ME!j8RVxGhBJ5M=fz-Wz zyDjbH$-~_X&X;eox%09}ObO5Jf^F8R&PJNb_wFD0x$fn8>IBVcLIISZ z6O>^+BJ<)?z4`Eg{E*A1$JTdgfN2&L9sf-VDzO7nC`cX&Ds}EGQJv9~5pJZ<&{Tej ztDU+rh9kg{d!dGR4|{&aQ}p>>|A*%6GmZlDm|Rk?m545F)U5qAKBEyCl5rGT&D*u^ zylAI3fH10sf~a}DE-nZ4n}5f@tIS5j`*0K$FMYcVpQlY>wDWdEL|}N=+DsjmPUCEZ z>aU^IC%XtT1&N9owORG((X;(}^U`UV1T9~jt#wZS9*QKHWeo%yX>~%Xd!}A^>7OJfBMkgAD7pXLzDt5O z_!7;8T33Qk2}P_YC(%MArd$@dcegRBQo50bcBK@v>)0Lh=c9eneX*cl$N8foB74ld z!(K7UxOgCQZ)Ur;%VBT=BFsQJ(kI81jpx6!z#c1)dpytAj&FKEuzYST4$A)vxQ zEhO!rCp^t11nR~T);ho+mge1?RXUzT2B7CeKTj*XSskZ+#U$rjSX2j5S2dd2) z*5|?&gwZv+wdHv$m@2Q+r%&fwQx=3l4kj;*`PEp0Lb*eNY~wY%jymdZYUUj^-SPY9 z&`YPi3&xUgFIna}IXNklPv?Ymi3JKQxUkga(=MzGq$NYUSkG6f2+~8e**Vj|?d-25 z@qTb(0+X(L<>5Q!^hq+Z&%5O9!Px@*TV<^K-MaDL@=O5g$C#_xJ);Rp!d>h2Va4T3 zmx@>U-u+h{0SW2#YjZ|M2DXG(^8PvYB=j%BB4l40COk@yZo`NF-j>1m(lUqz8Hp27 z5Tx&;w`KOc^oaKdwzYer(yUPf*l+<)Xbr6@E{*r=*znFCXDT2W&r15IFvq|_OzFPg zVgT`Mpl-_xsRi9TS2K$|XICUa>;e@wx- zJ?Wr+J^!EUIe9r|!zHAVy-FRS6r}}HBdDN~(z5l1xcSAD`RvRaPnGWwiTpF78~2GH z@7>#~zR7>#yN6gLM_*YT;4`d#{drsNytG%zOFgO(c>L`TM4?0o#yyqnw_xb}`kv2q z`CTR8M%NkRLi{xJ!~Hhhb%WlY6331Vpr^?Eb9%N#pNmhBh+SEBHB&1uo#)VJ_>&90 zNp*;eF;k8f6$h4UkLSw9>n6G_o$B8um+hFiI7deuWln%DpJkSHi9`;o9Ap?kJ?58wS?tRblNnQ^O}K&=sNlrp9em54}&Lv0TK zA%A7iN3rjSWan~T>sGBcCneSNZti~O%qqVTNw)Eh1y_=1O+V|gA?$=l!sGLcr&DJ- zmlUF&z-HkGBO*y#NEAPIir*wWP+KLIN4(3OveHt?!6anF(`IQOU2nu)ubw_V+rZq; zCR(xT{YLEBPh3NP269Jg$A zLoHM3;aJya0Opyr0@>LZm{!VM3ej^gAzbz@m!oPG8Q}}}{35BheH@*TRsfx`WCK*x 
zh}N20s$NFSKU>Wr3_1;jo$1_d)460S%Hm@mzSL%R0`t%BSVWz8SKo_j+nKWB8$DjuJdb#chnL zyh8^!yPf|31owSBu%(jHlEic8+C5P? zWuL3qJ*{x8UeG)~-e$(x@OH_I-!xHNx7#{K@)7yN{i!c8y|`?8pc`ZsI$$xL+Sp4| z(OF;}T6w?v&sw-Ag9VcWxzz9T#|2DA+|+;aM8;OJK_8~n^h z-3N2I!Jm>GIxNx`$F)1YG1umF4tn{?qYYhOchQYe_k9zr8SFR5SY0)k6QLlHi=H{ZX! z`C5^;7g9z-A=ypKesmri*k$E=<5-Js;v-UtId0nzjXe+^L2)pKgExGObvqUu+-|u0 z*Ez-4eLxDt13ZNYEG`s;cxiFfZQI>=aKvrEQrqoEj zAz7H2UWT-1`>zXTwp{QZ9qh_#0F-Q8faKUPp{&_(x6TO zLE5`srT(gVRTC4V#2aSx9KWdr%-6Gm0+FR)RhM`lsyU8>muHS2JN`b(dg7pg0}r6J zfpu7YpV9Q+TD5O~2hpdH-H=XM_chNE(#UM3NMOdlmox<&QImo@}+sb+7*V6>}y#oGq#+p+a1>Zyy}l za6gUy;jjqy-rno*a%1&mY6q|cC;s}|Cwqr>?bWLptOzKT?vlNG5sG?csQVVpnRaTl zxe-1rqAfljyO4zrtx2g~tmu3BS0)#i3K?0wM}`=FpoK)1*!XE@3dg0&ClNr@kyusVP8d?Yr4kfQh<120Zj zQkqk2vi@~bl69OEgRfl~}6RrKPuc`~F(xp&q5tftUki zKnB^VpOHo~^KixUvi#(SV`j`~HZgdFeNJ$%_16~RX@DmF^Uwp|2!UaVNFv3fMU~4U zyd0yi3${U=p_nS}IgPSc`-Mm-am1g&%{u8^_iPxH!H(+yAL#iB;~Hu%ZFf?t&||sL zRH7Bg?-KlI=*riMSVZzvd+YT%Fn00UPy1aIlb7`lKHbW-%6n+~?c3W}a{U>H;_~hH zbk*;OF^lC>6H<1&)AGW#ck29}rP_VEN?efr@CrN0MahAic5rocZ&$ ze|c}%k6H%f;j$t7!YrbiG;P{dORHqhg8=t_?S|SdjGwjR%BLF+8j-sD%}OVa?N8Ch z6vc|!N6PL57!)L(yRzVxDQR4m=vo@?XGL|;N1kvP2yqW+m2L7ezjTG}Q?rWwTeV`z zuFCHyl8b$INmfXN-SZ- z?`H-12WtA(+Y3q>*QUv*qs$?oe9Xd?W$AduTA8!~$5$yv!~Ui;acfujGyL?86JyCK zd=b=sDDoWIJqeCMJ%V0m@vF4qGWie{gXX>35hZ`ze|7L1bFgEZ4ceP2i}O~yKn@jW zYXgR!8mk<9=s}>8Vy%H~+yR;vNc)G7UGsG8 zJpu0>$dGSARB;L?ae;e#{9-1(9(xEiyDs zYn0(CPor2hwZnP%waqXEMwdTrq8PNtzD45}9e3;xMhXqU@I^+yF&l^we>t%rNR zIbQueekv`rTJe_>Ad>KyTj`~e*r;&xtZPkf zI(RrV@vpBq>-UkygJ*dx)hTkTNorlxc#%RzOh!+|C7bK6sUGjSqUycN?0OB3H)?Bq zJGa`f@dJn7T^fbgdueQ9HT={U=b>$l)iuiA|7g9r{aOEe5gt&DzFWdfh`sou> zc4fsHy9RDom!9vvI^p#u8^3p}TXqZ`o9eEnIo@uC1K|A4r*R8=_BuRjOCz%}=Yxwj zb>n2<;)fv`?5TEP$>xPe8R)as|6tLd*LeG55;C7SA+>uCce-Yh(ZXW%K+?NojR2t8%H~6ILqzN;sNR z`o5p#-)EJZKWH47HT;0~*#^!jbp=&U1B};RXb^9I>{jkE>%WVxKFoF9s!$N~aqT7d z5qDz}Y&3@MDqXt5WbLvS3286izhCgV5iE3N((*#Akc$gNoQJ#X!4m%WaY2F*?XeXiGZTcs~w7IqkZdD-MaV>Kcz zzqnW;zX*Y(lMnl(r{ElRg9sHQhX%02w;9944|e)dskW(0;fvys 
z@fM4}x_ZXe*`*geAG~t@n3lnLW0X^T@*4fFsJ;4Qp^5v+iWHsiIvI<;?7e!|!BsEU zyW{VQ;e&=OTT{4e+utt*cEh$^u-v7ZJ#g&ma?*u5dQtB1C2IPcgWNy^9ij$8kH5w4f>l=L-7S8dic5nN znm6pztCz%*vWJ;y*+OUP(Mj;7qLB9nNqC+Ontf3>E1-tAWRVmp|-AW){>ZEb?;72Iz0HxfxXi# z6`x<&yi564&OBKW9BjW}09Kc0`+yuYM)}rNv**RGNzO-xbB6+RiNN3t0d$D<9}v$t zm{rqT-N|om^IKEf-gUfKe^l=`D6=KN5NV41L0nFE!SP81uj`%afF>n4xcieYib-^` zH`%-B-WQV6$IRpXU|yev$)gRl8}VS!tHWhszC&`7T;e*OP^@o!!3uXBpzyF8tlA&A z&7Q$qKxCwrI>p+xQ`%j6ao|ZnLd}6C6@#*snWatqW z=^kKG0HI-DZnmLaZ9@vz#Eik~`t0a!_x^K%OYg0Z0*_m-BEXiFRZhINd;~lf6!x#q z##KFEUbcPDrqDA^4Ukr!N*Jec@c?KVp2>A|jEr;9hRC>q_m@llMK+q{E`995I0K(u zSlVWJ%9lnDhgCO&b&+7@zG>gRebw9}PxYO?qQ$fr6K}#L&YU}U__=jFu&` zs+u1IZ@Opz@mZ4{25QviApfb~;O`Xs7R{QGdlz`%z}g$NaOMzTix=-1usr_6v~CS* z*SG6V&^3t+fQ8PN$wOLGv+`PDhD#gbWa#)0cAJg(8`kROWWOI(BaQ~ny%lb{XLb49 z;|FFJES-MDBkXeIgELXTjNpZx>(1&ClQx#jvHMvmfF^Ket-EybxZ#579{@$GXLFZ| z+E<`u?xy#DlIzip|+Jnc7&Fp9E^vc%$% ziPo<8Cwe9XJj)loO!?u_ZNrh^;PIf@QBhHbFQ{<=!cpH^xml!U&eP;^Q#dn>ZF)}d z(o?aq&4SG4Tc^UMUC+vTzpXehP^%T@1@H$HOP@TtO^_K2tXIM={{30t`r_gE;Rc8T zDuI+R8?qoYZApLIxC`FjcjITDqYvq_>P7ATgnK$VI(AWPx!qT^I zfeh2(={DMAL|Lg+)LabE4Ka&&4Xb-PGO`J7<5zbaf)iT4?3*`z3Po?s$56y1EZf?f z?{E8k+Cd_0-;~ew-BZ=5;D4T)TA1LG!ir6f3t!(I^onDEMDJ^F76d(WY42mQ^2Wpb zj0|4@Coa2n*z?sD4N zD3)`9ON`5jUC;l6kAC^P6-D+}2|77KPZ)k+V0S;I1nJ!vVg3KfiI?lBs? 
zfr-reQ%9T%4=1KocZupKziBa@pYjt#D+%*P;@gMk$$J^V1O7^iwES)W5k_3x;Dlm| zp?}NB$@+$YWMlrw8|x6>D1)>N%cBLSpJsFbG`2 zy-PT7@~J~GzDnRo8mJIh9|Sv~j6lfka(S_LnJ)Cu83`MapW{zJ~d)s`qCBjW3U}iPO2C1{>a%RaThCAL~2|Iz5$RZ z!-0%z0)LftSORZTV2a(l$3$7aE-Etb?a}!-nKHa6SLT!Qg9tEsyU7+oHhMZ}O~q@O znNcQJ5MC1})E3DqmN=CEK9y@w*8tM!bno669y_d$dkdr;)SoGD-4^P%=NWY!HtaBh zTGU4zM3ISq!CoQA`9u5$)jBgI_E?XxFk_AJLoer3=4d4m=CFdy{~|8))b(|Pc;6)5 zFs7af=J$v98`$*IQ5(g0Gyx?Gf5K;x98Kr^v-Zs7zExgG1H-+-6U~eze0O#dYKP#ssC#OgkazzYfc0-P(d8%9T{-u8;-_HNyw!{Q6c!-oxv|FLYF zkIza(YmmbLmv1Yg?>psOotArNrn)hLukT=Af?dE-c?~Oq+xUWQf(SpXA{>#YCWXvB zpfYm9w_X5u4NLltpWJKbzm$|=LZV<9+V$^EslW2|7%_|}FyU5s zd`?tyDdEOLh*E_vk1F3gZAF(XzdY@U2R<|F;dOCwy*n*#Fh@d=T(%`%H+)j0(W%QQ zECt-f*HHJ#{M(q^Y-u{kv*93+z(C~7okwo!%;-t`goVril>**s(`2L+3u#g?pizCI zqPK-UL2FZSm}$OC3cl~)%n!me0T~2 z848L0_8K(QsgY*sYls!0=#uFpZyz4p%g52^q%7tm5W{OPU#j_y8;uTTeNC>-N>|s> z9B$^ad_WfeH^EV$SEH^d3Q3wnnnHVP36cXN-bGzUQ=zGKT=wGI|5H926u~ z@EbH08QIx^6qjNtwY1!y-^xRR@wm$spke`S8=QRH#0%J0MzcarTDkYeCW#qTI9Jd^ zEUCb{5>VW6;F3FBH8NKwk#cL%0I{eYTApSKw8Sz@TDzomp%Db&(W4J=@vP@{-FI3> z=_?u`bcHB6oB*h~oI}w!YJwOt+5G-VaxzBQZ#SF7>Z0*Kb_wNV6vxcJyGUH4O=9&)&UMxpQwr|9*&!c;G z?>_!A&MRUq^~)(=0>8Sa?)UfCSX23Owr#3=lBKn^PM5(ejNfNR>{?C5rkg4QT})w5 z0)Yy;lY48%8JJDe*6se$;da^OtFt*p!VwdgK2KgA+^o@-)!d~~L?&nBQ`RyK3`adO zk?Yr=A=f?Rud^5ZVZ(JO0L?I0WRF+((3*UB@HDf@lY_Fev%|45nsN82Ub|KAxEazo?GBYt=6|Eqd2pjld`GEBiOH8h^7_` z7MvP@X{Y?ix|ac*+2Py&wopHS214iKtE&#A z`0kUC*C+y(E!^EN&(m)o4*EZlrZSjc^#`ED>ds8U z36QMW93>I*F68m!uIN4-d7*qGOLKFL-ZK)7-4RxX1E|&_Fi9*%`tt9}l8=Dm zGn@6Pn_BbKPJZqFg9m%~sSUTay|^MQJl8qqYp5(J6rU67kT%?ZbDz`#Z4U?z3*sj) zb>aC~*xIrpvbDABQA)4JH7R~n9i(+sE}R#rqg*v4BA($9DP$hWbz)%9Sb zXCo(wIDdXX)WwT4Lb-S3rvRl)=FaZ-JcWJv?~h*n&U`VUmkbTTNtvx3MNV=y`cdoB zyz}cS?H5z%Wy8^EZ%Nni@%5buA!m4O1N>n&El)3beJmb6?y>NfsTl%7(?56J$Rf)SBy>OQk*+&#`0bL^Eji>7#<_oa{i(OY*^` z(w4#P2%XqP3X-b_-zn1?o;WADjWfV90C3Npc^A!{=))*p2{+qAp|UR^U|Np)>7tQn zUAiJcC)V2Kx-AY*nh+;A82HQA=pSK|GBE=Se#I(-GWkLn<{WY``J6Zy8;IFfffap< 
z>;L?c{MLL6!jpE>@ljvJR#jwWX3ErTYFt*%X!L6JF7t;j0$Bh0L(|AB#fHbixir_j6%zoi8bhO#Z67ip{=hETii}2 zj_}Gb!C1LZW)X2qf~4)@TTu?nqz8a!$|7ZS{`JF0)1DD{p?n@|+g2=eYuz}s)4m3tTe&fM| zJxDP)hi11ZPPFa^T$;kIo>KQd1_iCnv(vZ)I{HOgdX8W4{Mj?}88apz3gJ{EtyYHK zRsZ2~X-$Pa<@6hB4bUJktwvtFOJHj)P0bT#2~jL6N}X$kpI8z!Ba9s$*r^iCt@E(; zEw`j!;|8SWK1~X>0#ng;uuekX6#f47i52t+>w(&y1+tqV7E}s_|1x zlk)7}ge*LINoSqa>73a)8H5+o=K%!CEFo&`dtGnx7%{CrEA;I<`|`ZCO7&GcYnz&!&gm*)_u2?*2|UoK zEoR}k_K3xE%hso-0l){KkA7qW%_BP>3%prECJPM7XK038&maT7tS6dhS?JWu{(*re zbgH~-0Vk3D{CkabX2sj%0JnE5RYf;`?>Uz*0FRG}8T%Us5(k8TZ#@a&ij zo%AE3E+Ip14Z_fJrF{nc&MoQY#T}N-pYMZ19TbGhl!*WM@*5ZiS=gc7-T!oW{SqyU zv)jw&Mp@3M?y|2^A{Vhy*Wexi?XB&!J&fiITM z$71u{oTGm{ryP};>dmGIB%o{gZrn0ExdwkYE^&0+OqmX5VpZ^IRlN{aA>$&iQNT29 z(nSB^1VaXBa?PG)B(luN;JNaI!?6~4Eonc~&hAhbYYuyhC@qY8Cm(OOj)~LQC^@He zpd3SnzXO^sg9r(Qte5#;W->yA3?6D8FbX;;CO0vMUhV5$9~G)eg-Z!YaYLGGBXOg# zM~4(wbs@OyHOA*1{OwR8@kG`LxFvHs;LRPeI=rncL_I)Rj7cVlGfwH19dDYNo&u0g z$2HB;h$a#AQcz*fdEUG*WAm5hdUj50w!Wtb)Ii+Q4a`Lg| zJ7xAfXtwL8MOGJH+`YSt?p5F%$teX?V1!$5j##;)5a0%L^PtNkw|9X=Q(V0Gq~MIH z`45TSM&m}EHJ&<#=PGGM!UbUCZg6(S{gFRS+|m}oGq5&}Oj&pTpgj=~0OHvff}iVB zFA4n4(c2o)ja-Y&cOZyRoX-QtH`-sOO(aO(dw$O)&X;Vi(t(7BaG=6Iy@g8A=-02+ zktx3EbG}c#avo>}C~(}#dXb6F%v4_VNzY2aPK|`~3_*F$yzd;!&FXm@C}rh>p>#C> z{E&n%rg~s6uyM&aRnfKc(~e`}tgBJ+NaV%2xgoR{oFwZau3ODkvIh*5#{uH@8GV;z zap_a@0*rho3SvEwl@M~=RKGA2h(({S&pPj;>XZ7C0|Zn1koA6xbCXdEZ#z_M@S*Y` zDHuW4P?6Om?iP&;>Upta;wY9`r2w)gVbm6kw^{L*kluGRu@owE=yjPP8}@h+qlYf( z-LMKXw|wqeNQj7}t%RGlkrtXV(hZmUOhPqr1k-1#dLg^|%*kdm!{f@niq?g8G^E4< zn1W!?JM{Ox1>gCq|FP5TnyDjxXz`>)x+NAO0EAE^bK?J3+w|R$Yvvi|d1$3Fa;2in zb(Eu9u`8*n@1fOl<4I6}iO(O9dBdr_S1w)ZNAF14@#7U~!s?#C(m6?fpsJlPd2;FB zKb3!xLKLH-XM|mlu$5m@TkEE`(BP6ol9kN@6RZKH=RtsmR-5Gsw-CXTQO(o&9=}GHqlgpPJ{d zm4pJ^EB3F*OOTk{1={^h1L!aSr8)T?htELpe^_4*CC9=4C3-e@kYHuetrY~a?|muhPV z{Ur+c@WP{R+ax@Wg16tMU+V=5$JmZRckW& zLf~9c7?(J2!#QB7WJ6isV$)ws<`O$!%HVp7Fegxx83xxx`8n-7?o2dNDQo^Mzy$e% zkBH~xXq}^?eLg8E?vd}VUAuTW1A;3!v{1R<8@7!%QzSrQ%B1<-W?RMQIVS=7X2Tvx 
z6$V5iCL|dXOdY_^%r4whRd|!m_1{CrP)0AwBo2;d$Hj|H`C_C`pZF1Cy)5{^$Y5o% z;+gA~@Vddo~4ghKKr9*`fbez!kA$|O;m9B0mN zF)VLlpv5_Z6-zrEiub+a7&nU3k5~-yaQDfhKyNJWP+zfLqfBQssq!4R<5 zKjs`);1Y!bi1r_9qsnDVM&53v--*Mj82MDKVEtg^XgA8F#n8Y$wcC7s$pQDK6k$iD zBbYz@x8W-F<*@%rxl7wvs?lOIAzPg7a&9G~O}_@Sbs8M!ETXoZW^1c-XYK|X z9Yzt6CIn^Y=94C3c0a$#K?zT|rmFe-HcjmG=+U(tnFi3cb{3zbf69eDlf9#h$p!B` zV&g&TDP-l2Pt|)=zJY;@f8@(VQ^scSi(h~GwCC#8zlKhi<^P)O54z9A{o}*Ahw*2u zvN&C$%w4OFdHna|94Uw-tdo@XA*4a^hE% zum1kliSzE}?b~M=aztlqU26DTF^JsXg?vR(6;J|#0bQp`l?iTm+42z*r#Ra>E!a!~ilY#eBR z030dPq<7?fysz~#xVI@)@WgWdXCSFs<6Nw~>x>}R+QxZ9W zvSo%+#K&*d)fjzV6n-MU<3OL;W+>^)pT>3K)DZ}r<%34}A>gZmQQy>m7im|TKfcj9 zUy*uz?(D2IS0N4Kuoo1QCIa{(@zY4wSOI`W4saNnpvORZA|bDVauYv{a@CaH~ zIg%Ot-cMPPZNp{(ezRK*Fg7y4f@)t%Aod|A9lGWsI?piAzkjSY4F{py$ZfJ< zb3J|zas)}7rKFk9^e0Y9E_}{1L8Hr#ZcqV^p%FMZbX_0Le+n;rXnJiskYFcjq+4_J zMbCwypX=}JcFo9A8Fa-XsV=SxRhg)u;H6+XW|SrI^<`y^?>Nqkv5u2K)N2@Fc#?p4 zGM`xp6B3o#Vr+8a*kQ#bFF`;cY6(a&4yiVr(H2oRDHWi+tj)t?m7qE}KAlRlg5Ht^ zVJ&rA4q`-;24uX&&K6MLdI$rC1Brhr_|n$;OIhX^J@o2vfjh#{$yLDyF1?G;g@b=T2p z1pbBksqgj8m5%HZK0TReZ_3JQkGU=3upna(7G`%$jEj#4JBGWAwFER8U1gr>8A#-k z@Srn)R(3p0Jwbs29JT)bx`csJ0i-V)(3)^H77mO2>Z!g|?0ng+)-gf@271|L4`i@`51)o9e3b_5fpH_ki$b<+Otc z;TAB+d;Vu#;F$F!Kk<~|F56k#MkOS zk-@(Lf~3~P|4*gCacib^|Ns8whS%BkB+JB>FJPE;7E51{h$l|=q}zdThNcbp_g6$K z7AHP6L0&wA+-63|ojRo1(z1hagMuXg_jm1?i1XQGL{uNf3ndh!T@YK;qu_qcbx2CmJ~6VNJBOF%7r6++XJfpk zLze_AFsNbQo;eBTNX3O#0nuL((Lv}kAn7{c-OZv_xdoz24Oh(llAe<@{p-q65hVN% zUQwigj}+h95w7rq$0|_~pbeMYKa72BH^I~u6%|{yYDI}#kD6di)c&#{;$=b}+*b?^1SOQ@OV<2nOOQK3$uAcQBFAq z#$PuhPMiA62|p8-G68?3KgDTFYsq|Rv3nHL#sJ}RUu&YI0@1 z{nf^583!ls5J^=*!(;qlXcrhO>NqU5UnD9-LQ4RfRv_w;vF|h@%;q|Q{)VKqV<04f zB%M-Ba{u4HdpG8~k+!MiaSVF7ehUaxHV8#%T66ElTd80gJc7KV`c=BKSiOGwZy6+g z9a9(X%{L%g3&B=o{*mE}*q>?~0s1r~rQ&@?UOeg0KtEJ5qJ6N9YgB|jTJ+gG`$Lop z=fF@|kk-O5P#sbxa8pVA4R*=TkH@P^8VE~rCLTRFUysSJ2Br_+${NS>DC!mA{z0Rr zj{JR-?q9)Z;ZHs-^_VOz`V8Vkng?o0B*NMc&(guKM2yJxU?5m$%(n`FD+d`u1l@TG z{D3@w{UQ*C`<5wWxA&?~UA_9eTVMnvB2Pe>d=HitrxZ}8I{A>urAe*o278W!>52}$ 
zJjpo%=pA)`-;?tMbY>o5k_t3il1>GpxgFF~#wif$2k9e>m`PX>_?Vn2xNs3}-G)V% zd`J3i@z#P`9K5Pr`ry4@Ldtdipp5Y5-N%G+!Rek&&Y;{Z>@ zofZ`Ep|$|FaQY&Z~kxWzovLGOS3n9iO@bEeV z2An^&{7{jaw9tZwLw348yN5zyM#}nr3cwI>dlEO!Su-AnLljb6S`r1sLiK^>0VUfm zx7OZ~1J$oUrxJ&uQ1tFy6XXA>af!=<5?1O8%APzZnK;;vP_pNyl~h*tt-9SX=r(!- zvc6>o`GSmiQ)VaXLi3alf!QUAk1YO(B-JZk0mdAA+lPs=%dk;6y=beL+n~#0 z%BJ^9|F8K$%`$XaVmr4eiS`u*!pMi*CMr!D9TKeZXKkg_FJKS8tVYQ{cFv6!TwfRq z#H5nBPV+y3EM$uaGszti{!qYLvK1iVW{?xkIun$Wg9dq4*xt*(5V;EA`!!B~@aw+M zz37!tW5CcZua|z3#@ow3Vco|9!)Tj?_;&00GHA?A;;dbN*R4T+A_Mm%zH3X15k-vM zlJ@fPA_cZoPT6^H|DVn-F3Mgzc9?FyHCr>lJi;ufzSDWy!7y`$cd2Qbnh)9320A)A z?o(TO0kLhGk{31I7i32>`0mWJ^O~}-$~cw*vRgw`que7|sAZ>tOOA7OyN7jAjHsTe z*mkALuKoXj?;H#XDbJhgrKGB>5v;uSHL$}u#~Y0}7rqNRhO${yYZL|ixyMX55S#{r z0l@1NO>^BFs;hT<_f3To1)!97!I8W#IC$YZJB=!2BUx?bROVt6U<$!Lu2)4bz|Y5# z)4DNzqCJp1H9m#HnH9w*lHN+o4e=*t8=0ex3ALS^q-fpbsHUPbqJ9Fnm3&5Ck3e9& zx9{&`5`@8^F_C-ae}ImzP5eEejGE$anwovLpt62Rk~0VZ+P2qd+#G3By);JdWyP^B z`Xe0?odGofxBFavljN!bIR<VB_MBI*{l6?%m|2i+ z!>aw%AklsADGocAH0}J9oheJuY0y&Yncw=#2EkI(U`5%3E?|jL$8&iIye{AfB~rgt zB@;x;$FpEu>C>_mySDFN!<7Jy{`8@%7jNsmY9?C;Foa84gyX<*UpVeoA(CKgO6GLP zfW@sjqL{}r;>=oOj*<1SpZx|7T=4c@h@f?>j81g}x2G&%^SZvi*_3}C&!h4;i`ZC| z*`2N2sHIZx(oQ!RKm9fJ!j;wK*Bh&E^un^d!RgAn$|*cIg~*xbbSQ}tNHl2ErVS*E z%;{rPtb!nGlAE;t6i^DUVf&dg{ZLBI0mhiCXB-f0M*Ub@`Kk=-LIk=8ua;`V&|hk` zZi+YjPU$K*Sp7kR8ec0cJS_Hh5DLwJ2^_-WujQPcp0?4A<@1mlfKwhuuHmIw4^Yih zDm5Yr1a2W!o=QmfLR-bt-Jy%aTbnxK2(fR7l z7kjRDm_B{F$NoL-|ManFErFY4x7Rl+!1dqvTYv6&%Q#!B%0fU zt@IkmLKi~E(4G-hI0p5slI0;N<>A2HB2lRGulwcpSHDHF6`%JtHSAp6H0Wx~9USe( zl+5=(w(ZM`w5wB2Z(BC*s@4radonnZzJIz>d2pDU0VIG(ZTWQF0wxG4NY;IqtAFbB z7;z;7wHc4m1)z+wZf2^x;lu;IB<_we2K}jjF0ZRpX;Pf7=lw#gVlKp@z) zw{vpZPsH5KEi zJeqmLjInPMZqXCocxR_f|gp~9oM59^(jxzemJ7(Lso zOToB(9@M~3v|CS+wX}~z+7h|RI$ET~M(T%`Ng^)?2pXZkue0?-#jJ@uUh8mTH zm$twuk}3crFs!DGxO11MvBjJ|bYLwc3ZIJ8JkHioa4{yKg!TOrWMVnvt^!&miFF3+ zW5Fw}-$B5Pk44lxG+kc)ifPCeJl0R*Fvj}Uhcllb45lm}ANdKuTlui4t<%Gk8-`!bnB+T;9is1nFc>-brw;3v1f4bYt52&GG1ZKW_O{&!0)<7b6XR*k(n 
zaH3Xg2oK5wgaYI5>8ZNSCp$q?U*8{T>23fJxdRahjAGD*!q-M$`Kk1d${r&|#w`Y7 z{!eFio00((wlgjAwzL!*m3*4^1?}BiE+6uvcqa#Z`%azOwalW<`9bYZf8jnq7uy06 zEANI|QUt-HzTcNNc8q;H3G;`h!V+y#&uQ28I~Dh+iQ4y@tH?T+JF20k!!;0yMv#*h2 zB?5L;4!MH|4{|-k*tDgt;3|34?mDs|4wvGJ3Ofz&QDaKR>a!7eO@%3L`&d!T_C%lW z6v=R4E{h0?O>phAu4Pj|l`+EICUb-(Q!(ES^Bu_hGc2QMKrv1jP=1IL7VW{EJKK4e z?@HGpzXyzl~8kA93h>b(EokNbO(05OoJo%QBH2c zEO*Am^)?7;hUp^QNM*@~-->v!g7jki2<2M6ZtBWRIJcbDo0) zy;y9}m&5y`ue;M+U>LD0*#Q4$1VG#!VTO>40BF-F6~OPz=FBy<)g1jB^BUH z3N>+GP}LqBvPL|z;Ai3}sJ}*Q%$O`)lFv9~e*gOR^{W;ID(_1=bDm>}!F(m5XXuqg zLc{s`h$d{Bb*p|&B6Npyh8S%8zT$TUe?i>){6Oed(KafO)~m?OR1hEuzhb?g=D^>U zW}~S&_12A4lQl!S2ojs0AHogu8Fb*y`}bz>+tep>@f-;Jh#q2>QK)8u;@h`x8$SJl zRch0Z(suH#=giT-xTB%3Z|;!)jb>{BLAHfg*0o?&jBepDrew#(OP5p+zCjcxCMV#5 z_C#*dDtJ>6-_;c@7Dd=$1l7>l?cKlgk;17M8!C0t!M;aP4*j@=*Uo{IP9?#q<^+Vs zb|o>sGsvZgVISRS=hmR#!Ao{nJ~=@^s|Co%8Zltv01;<+b^+4U-^X&6zgVtzw3V(ae$P^F^%61 z%IiTvlczc6r2Xlo&xV3Yb5gBY8d$?^jOsnXCN3Z@_^Yi?<%@x!~f#A?= zN%xZwE2pqGSkG0%k8v{Y-n;j`f+q+Y;&}en=fet~uY@&Alj4&4X#Q3=SJ#N>=wDCE zhZd)K2J25fwzD7&lN0y4EY)7Uj@o9~!`*Q#jRh@77ioFYo+00}Oe#Ix-HUo3-X6JV`r^kfMuxTsOk*1cady&Rd& zHe>9JM(pamD_35f9e8tR#VlO|>u*nXe)hs1Nzi=5rfJ2T1OmVSEFW|758FN!EnR5j zRB&q1bWkn$pX_Xv^r~~eTW&j`AUO&JKOeIA`}4d3yIqobBp`Ee0)8(uQAVbG9;?%q zFKKCMS=0ZWbLv|?*i#p$_gumC-;|Oe_Ug5AEZ$zry?RoR(TT<9jsWkoa*u~XVPz+Jk9MFxi z(>FN4u--+gO1bT}4CQV-DtP1CEmON954yWwIo@L7$h}lD49;~(DJ|=9owm}O{UkHD)_=6P z&OLBy+Vos%8fvmk`vU#+7|-;y9_9m$1r`;v4+XMH!7cR<-SAitw>mB#Ym7qUI-fa#)^I z_ZE@;W30%yR>!&NJ^ zs9dhZmho4Moe+O&IBHR19C@UM#33jYe?!ZJ5r+enFT|p*v2U)eozxc95rv=nvGy*&^y)q{gKyQ=mr82p20F$!jiLvkjG9u5u^Xr z8hmk>&sQCahrT+;>4zm~mkSsf2RbF5ZJz>t?IiKAi?;+$Le;i_def9XhfjomRm2M% z9c1MUsYhJUC= z!;wM7%7iL%hYG92;VZA7>)!Fwnl%v)!L|<5$~}g2u&-UxxfbNg1nU<(&KZDWJ0%*I z^d&Tj7GNyOx8#f$>JtUuXaYWDhs-#;Xrtz-P5FbwRSO9nS2UdaUeIIo<<=Cvyp(IV zZrv)nqSnTzCv`0cDp$N2PeZ&;^eLEv9Cl7Mchm#6gT$N?@wOxSoi&KBMeYUGDTD*N zg69-rxSLPgqwUy~j~XBq3mzwzE_T>$Qg-X*3Wdw8W%vl->+B*k)iyp_nhrb7wIgU6 zO;FRdFg9-!@w#H6OC-t=!C=i=Pc<(~19k8{kvG`M-shiVuR_x=rvG`c981gp`^#~x 
zMZNX7t=>PGq)J+ zJ*@V3zhQrEuvXC1vpK8p?Q42#&zvQFJ{*6L^UtN6KW7&!hn_;J@V~$EkB1~Aho-A^ z+;Twm*n^zFTIcQC53HEyeWtO6W6XhW0WWelZ(0|wvMHpE(f;$xm96JmE8S0O9TCxg zvHGy#+2uFz3)rNk9`MrqtLY%OoHGk;l8RsDMMm}Vp8xvh`aHD`Tf*AyeA=w_#e9|7ft$T({-I$q+CF%eEDqNqCoZ0k^SG`Y{d2^dsfDR*53)ZQ zSorI=a_GUcL-JirA1|A>`^A^e{TnP<5;!k*rTOFIykL|pebL`H73*KL}Iobrq21#Nr`6il{1^~;QhqE9pcytB)m_x_D;?O(oUZ2G@-o$IRLv_`A! zY|~~MEm{{}a=5VT=;ZLFO9H=8tavFc8++*9TUZ1aR0QZj2tKtyj|VFyfs?4-lF{-4 zf$(ADs6Jp%+WS7H_}{{O0ti7cKSASw>4aNE)j#^AJ(`$n+>ONHVAfhkxWJo7-mKn+ zXJ?#@n#$=W=`(;xyZ7vA6mTMSdC5r8jA3*^1sOLtH*|R*y`-=j8tmg+;r2Y0EJ}Vv*F$M?3nUAL-qUH$8C&^0^RA&^DS2XO{!{; z&68_e6yiUfJl|vzV`5b{zAvBZb+*`}{D4_A=d7#d(=MtWbLbbXK2WW<>6VVBneQ`J z=M0Uswwczract(E7vgO+YPRBYF5$apg9n< z{oi|I(2CE+#)1@q58^YCaUW$UGAtRufDBoZFu2#bWCTa0)okyZ58q~PW21t#7fGxP zB8BS^U5rA!b(0)SqE^bpl`I9EyC z6J$z;-H46{V3SXl-)VfJ@3bklwvts$72(D-Ul=3SvaolMA>HsCGt6ZE?R_DjKBNh} zplPe%I>WD9RP^JR`PgwAk}}&Wsb|z@9_UO;srML==X8IU!{H7i7VolI`eR}E^(N)0 znnx!ZEE{EL_&9A!<-=B*eY|Hl{TXs}#g9+3zkan%P8sW3^+w<8on5==@#`=Beq|OK zy9@42JYg8L9sj%tE9G~hclZVwBLW#XAR)mK>^#r(>lAN$9r(`O%S*Yp+WS9e{@%)MDvBeaUTCPy$Zu_7yrxY=udgq%8;jj4uz5S_HfUmjEnwig ziV}pE&OZ=hkUfH8;ucd^uMtgvr*~*orA4uc^@A%l`~ceZG3@$?8|C>f1En&#u;Nw! 
zL4yz%b;9U9nZ?2_&FxP25=+%*R&q3I;k=Qo@`AD;B9Pt2qc^L#D8yKajfv`tSPRoksLE1~Z6>C=o&e2rgUq-~sk8P#dS zs3bWe=K(z3+-tavjr;O-)VkB(ticT!Q4qdOe}{2>-IlFe=aX;2X3^4X{>XiR;1OU) z^Oh|;rQs#nD0oE(n!3WMS&)4`GazFen5FOWBUVpF(~d*h;^@AGdAj-;O~DNj>4*a% zKVSE&Cvci8KV8CGa660qmfp;%raj)u8&I^d?^gXtg{`bWXDsp(x>Qj6vx^=;`ezb2 z6oL{_&UVu{lb$=!2#ukN;`WXoYS|`2LyNR^R!b4Af5r*n{>4QO7C_x;mtfP7f{;%O zP9cKWvr<+~5^f%{f%RjdA34{>)x+!dt4u=Swnr2iCUg z{Ig2ybYW;jo54$TYfG*tR{oYXS6}x7-c@Wq+&X44Ce-!)so=U;x91UM3E=-Wxh#TV!r~W3B z576ohVX1A9=%dQ#6v{)MO&=`JdGqGL#h&i&!hOyn*y8a#>>?_RABc*I8X5dzu|P5i1a3-Pu-!U=1Bo+}bpgkVTtmy)ct7}pqy?OE%qV~oVq+X)w~G6QH+~ww z3$;0iU-tyZj1hR6MF-HL-r4rmcLh4wzL_XKZW?JcSrpWA$?u5QAzqMnyR!I&QOi(vxohVr6UW zMt{WxEzlHZjrE+^SQVIzXv~I|GY)qE$a+IyuOoB^s8UWpepQ_19}AjTYCr4k>IlP7 zkQh2P>#GN*ccvY|@gs6c&QMCVc=iQCwx;XkxuW0W7H)vdl~TYRMFP2mh(yqh04rUc z<}tm|Gvb!sNOm?o`~0Ai!dtHqI}4hbTTE}$ zdC3Hmb?!@jd`#6!V9kD5`#t&jTUo21-YKbytDgXqiC$iV0i^G1v>=6|loNyXzZD>m zes3GhJmybs4 zLw5Sp5mK=7j&f$~GxE9H?b}?onb-S=TSOiSh4O_gIi@V6=Y1mC-r>U+ppPR*jwIde z?BAA+BCDe6b4e`yR-^td`QZZ(CBSf`0!pFhG^V$b6e|Rxm!XjOfnCuIb5bB96@rHQ zKAH8$KM>Ii(DS3h<*4~bq7oBr3eQxOCc;D}+Qc=oO*~VyYR}%iH(-nq_vCO&2ANKt z(-j1d>^%{plCY&kFm1OV3qVC><|R@iaaQsAMA{VkoYzTSW|(n|X`FZb!xa&CvsEm| zsFZC53lq$@d!%)BY5@wku)6Up+IQ%n=KCom{fSk)ZJCEpqUH56&-V`vYsr z>g)vy2##$0eNgUo0b-C@Y^Zg#DCp~vRvT~hEOxd1Lf#-$E4?yiW6Dd<`agdEiDKrm z!iB7azEbJoWn)0-%o`bK0c+TOO4 zVmG$yHRtxegFkSCO2F*w+mL=N#l+-pOpY5#S zDct5nuKz+XI_;v@!u9XY@isv!gIU|AhQBp1ODMhHg?tsYzh`yYV#nB@ZVIfsjpP?t9; zzeg?HI&=DTwMxsZ0xA+Q=fc^L5_A-W%)in*R#TR_Q9hd0SsA`+Dt&UnY-+5wt`Us_ z7n+pz{`GZ+U}dj?`Wjv4#N=dEA93nGYFnTdhCV_rs;gW4sH=u5r&tCY+K5_rJHU-i zoT&nZ$rwZ$3DApx9qdRiH2Rf%P#C8M z(i$MyHXdrrWvMo>uW=}361TNp^?Mb)XLd=B`_TaHs__BhU{T)TttQy4=eSM}KVi%7 z2Y(q?H^bbz8EcmhFGupHDe9#~yh`*gL5G&bNWLFM*J!k8{8MN>j$^wMIWzX{GuUw* z!lhr;5;dRGcx6O`fHEqhXhqCSxjmi1q2>5S?OqAWx#IWtr3mU|A4vuc-Gsn=HHEY< zh=fPu-l%@2Z+~LbtiFt?Y>8v#Q%RNxOd(*PE(ocg-Yxw^pB+&oVksEMcHC0543_Q9 z{jm%5+G&8=*myDA__&Oh6Z1Lv*7lf~F-eX;HiiA(+Im#yyzM>8y+0c_pA=pfvzMw? 
zGVFmxs_avV8$;G7vycIOo>ORIT**jYT`${?;zNQvL}SAVBZ4^vM<~d-bEONNN5q^E zUmWfzG^eM`RS=#0xx@by=GvT^HC*I>4%#pRpnzJO#Bd!u$Zq_XuiL>GI%{jCA&Val ze(XK2-TsMroO@EMN~;lPBMT$5{?BT8?CfsnCT~6K<3w=)I1_7zm{=1Wn;ZyCOW`|q zT;Wuy*7C!|M#U#Q6uqHjh~k|p^vXQieEs(8F3=2tbdV(RjA5Jr0*05I&OBGFL}Kvb z*Y`rX&Y}_tIGC0N_?XD?u=$k{;1$O|npAZ&stFt{oNXh|hfFI+I0GFCS7N%p;1eQA z>SdO*!@GGPU79y<-o)skJ4ff{@}wxH%tA4v@L22WsuppS*f$(q_+OzeVYenPQMKV- z5xo*SOqMKqCngWB<3{1_2N`XnLqMDoCKn&Z7u$@X{Aze}Ig7h_`~LIKKN97FM4Ouk z()U=@R8-2SwkThZR%I!+Y2#?(e5;^9yJydhNK(}EXM&ef@Vr&NX>=kP1*j$pW?UiF znj5&8Ng|>B&L%Sjy2_n15>-=(ZDJY4DyGqAML*t#l$B>M&l){#x5~?BX@lB#x^i-6 z<&P56*do8*fd!c@JDHwe*74xMxz#4h`+AlcJ{ zK4(1IaczrJx|@5q%b5C+XrC)8PFhbpA9z_2gsCnrQydcV*eYKNGk;~_59+>FjOXt0J;t>f`15`@kZ=k zpi)0c@oO=nB5I3uyfy|7o&)vpcs3=e*fIfD@Cj%Zj{n4x61*VN{&HS?MT$EJwUI_D z>2EXsCp%(cx;{>JW|-F7Et<4Guh^8nJ>2Pdg&!w|{pi)tTQJg!_Axz#&rA6fXQQiR zclRT{S8!%mZ0JnTTMx0zNp15k=hhYE7NViSURPCo6ySmeas@mNH=a~QHSeB7jdGF@ zN$H$CAZ(M7GNpVTqEyauHgPi(%EL-0yIj6Dpwa;_(3;n)>i^V9wI6krPEy>)GV7W= zQ`!-fvx&%VxVwAXC0WGVrjTzb8dB-#@M%oLzk;SM(%OPm-F~3Urjh>BkKU7sAj0U7 zZ};_T=lZA#JUtPu(z=L<8_>FCOA+s*9D~n!e13UX;0k1YU+5bl#>j^;ta9l7b>r9P zO2_T=+1iLc_nT_<<^I9dtp2ql|K|=PRRs6A`1eQnSb=1~05H6D;uGUMpU0M`{t50|EKwuj*07a8Z}A{zkZkN$@S*$DbV{| zX&A^+fr=`G7ULG(es$bAkKtz)XqEmb`F$_pPS(ER-#TpRv@MQRWU=uv8o4i&b(M(9 z-euUaNm)yDAR-K)66`*7s5($Mms2jByeJYeKNm&4qoFBb7q{ZJmsJJ#GHSr#LKXBW zqO_H7Cq8Dwk~Q<*Xx%|mMig}cz>72mkm95JMoH2KoVG~oK>N66zfsQt#5{1w9KVLS ztF(gB18`b+0_@zpbqnt(c1tZ8pZ@_BuSfLe-yzTn6;jLSdpU6HnED8 z!fp`(y`-{%?7{Z)L#Q&!p_!d2`DBa1-oD%BgLj-3_s#RPLFkZV0daC55t+caF=R0P zBrg^h2}r=Q=NGcUQ>2Dvmm91MI!m!$%L|o(yYyp3%jZ%QcR=0UfGXMz)bUf#;Z!fNGY)%GmWX^YF=)J!e?+eFWRfK;&zaW5Z6e9KKVJ2w9W zjR~F|8dU+E2rbZxv=-?pvJdVMzJ5z~vn)D8r{X%0`x`7;JOnau1VJ8Z z7+NK#`C)Ox{`w1sr^Fqc#MJY)3eJ{q4;-9QsA*9C2tmk2|LApyJXes56q*yiPfpYI zhE}DE5r&?n9S@Z*YYYHZ0$tX~Ci}}R{qUn<$DCv^8DAW$=jf}{z-K$I07 zH(FD@wWgxu{cR%yYTryfss5=X_;Xfp>6Jk!bFQ~&-@gG{9oCQxqe7kZw_XQqn>)^= z1m$@&Yt{J$4fR3X7^3+?n##G09$^S8M0w*S}b&+k{NYXv%v*=ydsSG`6ypuQ46 
zELVE(>c!q21px8#5ydCE$*uz85i)M-SFaitL8W@Zb1hqyq_; zjnt3r>&=U>IQ3kzI`vQA`47@$Pg)u+UqQR zxzJTD$n?uq#l!TBTxq$~#N+BlIBIZCTF0m6+r{OjU{m8dvc_+&*t&HuFf8677*Mr= zD?gf5)g9%L2+$_%rKlL9`PVk|zufrZ9i|yo;H}Wp$vBLdY2M=?^+$Z-IL2IgkOQUZ zrK7+88LC-6-9@4rc)l&at=10RR2xey01Bc7#p%C1yS_>@J^>ge`ahJfMQGr`2bw7= z%t!tacFSSAb$qV+RG&S$S$z)JU#Y1!&3t#|{ZTumNd4^JpUswiS_-sEA+q|DzI)q{ z11Bm<{-Sn9>}&?hjXrk(+SLPmbwN)Ai{(&}q2wGL_$oxXZlQm?Tc9?;c(~X=J<1E- z2Gyc#2@}~QaJ#r?%u3!lY=1eU{muuW4-2i=ZUUS*8TK)|whwrPR7|Xlaa1bRk=G{0 z7+n`c5OJb8|55etzlIG{%@6jQp>LO9!kL2@_<8Td6CFE-n16ljv#+Gsq^!CESXq>G zNLy&LC`ZKyA`a|Pqq0mb58-WLy(pJ0)UZJ33afsQ$DUw!EYH5U??Ok7>E$a+!Boom z42P%%fnw@Hevh%V6y>E5Dp64yEK1!olMqet+CZ)Vg!$@uPoG|#w&5sD18|Y_>#Pu2 zK7{7QF1powaYWC7(I=)gzWiSM_@?w4?z27Uow-c=4;T=~^$z-6SoFyDolcMDk!!<3 zr`hWsm_I?yY0a0vs2ppZ?f7TA<1qvl|W?pJ+*Fk3r78V)DX_k(({uwDX?_$KN#;y~W05QOEW?IP8b9P*mwS(9vnN`| zk~}XkN~yJZsSi>w$?K+L+c(k@_V{v#v)(Mz`t6u_II3R%0&*o`963fNwP`6w%s=KwBrLDcZ zW8ulz&>XR?*#^ASikYgiM8+TZM^EU`pyU1cRzp>qvG6wCKzX4 zNb_9DCX+|Ut%G5G;_}iRRyx#tVCcV}4{yfi)wh|BCc+sMNSqLygKOSewxk;nbqm-F z3PDk7O6ohOW!9OJ?dIBTBBw?e?x{`w#0!&9L)b#DRYC()6Pmt*xu8Gb_z}xE-+M3R zqGURvd{Z)-C-5!w7^ey(icE{`ziw-6;bA+R%rQeo*E%&!#%F@>KIr4*Sd}S78gV@o zwowaux98rMNPpliKxl)?nDISV$2IXuw43p6=@N6tE|Q|gh0jtQ7t;O+VKJ2Ys}R6X z4C&}+ZeBc?Iu$pRl)V1!SZRUy*LXUr0-LwkkTSZgzwixA+ z;a~82+m?HhUE&NuS7priJOK28XwakX}BHOd~30g4)BVXE4UP9)%hiEQ;{&HgyWay&kXs45Mo(n1idphxq zN?5u*D4+Cw$tcP=E$KqOw96O(h~T<`29Rx#FB4dYd+4wgEbPA+HbHp6`dm@{Zfb^` zec1z3)MK<&fPK?k{z?r-?WhEroBsRLC8?je!R4H{w$9;5VTDP(PLoRC(nD8w6Z>6t zPZDNq0oBFN&x%)5oTt&IjZcPm+|j9|zasZ`d z=FUC1JeYwSQo>@g=P$lTfw|@q_hcKZ#DwZfRw_XnjHAj0gHMEeh;Gxw+*&NooTb?B zvnKqX-wrHGYzO~ZCE8*{7}(|0F*|+MyfRP!3Xrgn7o5ydR$2oh92&>lj`j5&(R!7| z6kSzrG{O0Ej$-X#VoBoD79r{F(PePe@FPXL#sdVdHuEp}@!^RXpdBXa=(yk+I&^7b zRd$^pquS#*u*bVc?}J^{Xz=Qu`?nL(GDc@--Wu0r5Ju6mlph4eN-VY*U2&&DTQj+3 zOx7;seN?Pz?1q8SoB5RAnBAc|efkXT2qCCQ2Y-7%amVhQs?EIex6g)_&s(@KF=_M3 z2aE8RQs-y1O%2U|f6H_6RN5(o4XNNHz-42FF|*Ra+YEisiql>8sdA9QXJ#I8v>v~F 
z4utC8mH;*usP-pKK(%WQ!WXe$WUE~cS7|@Ua>4X({SouZd5!3OH^qAF}$;!AwF~O9v%zan*!X1A_k&0p7px9eqOJDt~Hs*&_y+1_`L1zlfB}6LHhZ8@} zk>Mo)fna+w+J{Ar*;zR9(FA&k-~N(IaGY}a(1<3g>c8;nWJSo!vKMw zPOlAnXJ-H46-N)4*Hp+mPSv@$wjJrqqkHk_r zm&plpCuKzig3@v`i7Zy~Hoz~qU-^E`AmRD8BEF%|6YnouiE-#NmQ4QMLiDM$uT)Fb z&$v15P3*rAYLgS#TncUwPEk56x}V_uCN63i&J`$O5>h2}i(hf=!2>k_93tDKx#1u5 zby>$vOKx%#>gbdoD?5nApB=n`pZxB8+Qo~Fpq;Nl0f{#a>6Wze(%}PV>>57ZqU_d- zpkqxq=z;l;HVWGgMoPg=j088AK-i2SqSF+|a+ZI+X`T@SV;czPK(GV|wp_Q=iQ^_X zPq11S9NNL7eq`rSt-7vh@as-bP8HfdiDRIc>8F2OTVYdJ*cIvq8fA5*$s?d1h*=PO zwqOTHW28J|id3TN8(;D>`jlK)u|be`cn!1GV-Ig8&j>UhWNrjnoAk@8d3&@qh&HS^ zbsCC)e0uVnhJb)qZrr$0bj8116k34FI%KgTIPXvXFz3W{tKkw0!CN{=Da@PL^y5zl zHk8mM@-3+jOh3nh*eBY7>7&BU1$-U0;^oy&amg5T2P`i(;&8ZvVn@lSYoo@Eg?UAd z6m)DX0IHd6aRk}~p;E8=hA`f5t0xvmOUv2OL1|rZ|Pm)TObkWou~CweJFs!;&mL{8Vh@9_y~S0 zs4It}@LOC}C?8lpP#U5&2HUy&*$H!N#uyvBJu$7pStOg3BA*Qc+1Zsth!R5cckNq% zZy~}Z`9eWKEzvtfRW3dnXG{ZPQsQP1MNg1_u7TG12aqoJUDV8& zi<*bP>I#&QsJH3zgJ!%YhkKB3)q>1l6`iOTBuEA@Gto{6U3y$<5bcdL?oC6?bJ(dM zKqh^oP+M(5T33FHhcjs2U6vPcLfuYXcErc2DHd|s1wL= z($!h#=He1g_zr)cXqVU)DGI#TfMw5ziM9j$fhp(|0M70n!x3Y!>(4w+;kJ|ef=Mm4 z&*PM47|hCQ#SEC*)$ddj9v~<#x@M`pKHW%s(Q(uUUhnRsl?uCCd&)efA<=#d79@-| zisISfjN8B?60IE240z~kh;>Oj_w@9vGaH#zZia|EC2w#ys>0uj)=VG9sxn!rXiB}Gq#}I~|+S-jMaL}Q18}^kMf5V4&VEfknp^@S% zMr_}1`(XtUy0-zwzI^+(I_}RM*XIAmP9SVD-r~byEzsa-QR_V)ayMefpJJz*(WW zdi?&0WBA_m^{dO%lj#<@P6XQLjFtf4%5UF#dpM!fYA{>mC;xAw-xl13WvjZR{#$vm8cS1gnuX z)cVT%&{=)+Kp;>KMJ_70WT)`3d|vNg1vVrzP+-spaHykh39H>qLLLCq>RG^OkOPi; z^>;#Ai{5}sm(*^-aHW)8U*ewm`pK|XLQ?D=?g_FeA_$bFBE4g2{9He8!UzL z3Yqcu$AxN!c}SP27SzA(uu$T%-f;Z*d*vHD``O*T{q*S`y(ve8rTvfd2Iw2zwc@8P zml=65zp{4RC>PFADhs;7!?uZz$=B<%EDvQfVDRw@{N+1*dxQ;=4qo!;zN|uPr(ji$>?rmG^?c5XeNY2 zSRsyBaQ11h8T&w+L;Jk1Pfg>9r!Q8EeBjW4HkLCBQHS1x;n=osQ%Y4Qxen~Wzjvj~ z!H5H^psah8EJFb#-!QTKQZJ>PBqc5$fygC>1*%~M@-QCC#_>H5(MEG%;JB3rlyRe% zNjyMl5e^JR%^~qUAR5dr;TR4M4$kv5UPV(c)A6X02UQvPER&ENYH#c6eS9E`0(~TS 
zz?-xC#g-+#n$-Ka6q=EV&B$+*tDB=Yep2knCQX~#;*s*?Ebl48w&4r{8lJvvjLws9jTNJ<5UJ2CBxOTLS)7@!|+ra^r?F;u4c-S4p4%ZM%PB`VKrc zsAg%Z5Lz;AavZRHT<}D660{!&pzbzmhFW#No*N{lCA>3!YdzQYrt+J}HO~&%I#RWs z%cUm#y5V|I8&Oa+5cvi@C1>0`b`gGnRBWawz$Vd;z9D~?KRJ(5NhmlJgA#FsTE;1K z?i@%GD36wY;~a;}BjFlUQf=uw%{~!^iF&9fao9X|0%ibh`mLBAKX^Lq5)n^xfbVGb z+xrgF?$>6E(TE9ZHsA6tcMQ4nwZ7 z!q~~CTxz7_qWIzsDo>pu3cX;5E7$MGl7DCFGuVENKFcH3qDbGEKw3-Ze?9HLC=X3PO4K(to6LPyA%WucFkhnxBY7uP#L|>qDlK}_F z+)PolzA4g@`Ot8Pa$A8MHf<^@v^sF?^pjV`IJdpRD zqNwg){VD#GAGMntxY@+Ba;hnKg;k%sQU?xj(5-l#T-0@|5IMm)I&wZX~?yXW@$JgPOA` z-$IVXy>!_qQ}Y;&sB}wvl61;EbK0?l&-;yew0@<3{r{K}`?kTR7)r-N1AGPG5a#dB zUo{=MWv_5Y$&3WbGT98El#-^EaYCZV@rULhuvy5=3ZhVwmUxMpjmLV>`N~E_QoqwK z8Ei@)A=HGdAq@MJmKGd$3mI^kIcGcZ1|s%iPxk^D=YY*fjl5>z*jk>ct}1SNkwN?Q zOI?E%{{ptDxREEiRrSU!yL!o3FKTsZ;#|7FirQk?`1cQGV~4Prv7FRaoT*`F_k?XK z+u`JWAR#jt6)c?)3Zgl0oYh~@hwy>EM575sU-elNXs+P_?){CKnRCeLQ8w|s}9!n`ZPP|?v*n@D1h33gqyojrV+fDb(s(e= zU98Gq0XaR%?j&60_l%0&VL<33jdlnpHW96)n!ep-vR*h47*cc{Qk-&y1n9}w)dpUw z?9uDW>(~uZ8eX4TSof&t3IS0 zq;ML;x)o9%%_jv>&TZK?vc}5J-ojjS0|HcvFKXU!&gB$te9>uzvAO%?gE*gkYfEymjvpr@OTK^V)6SnmjtZKQrLD$$`pD*!K z;R?*1|4h!g64dF*F}24}I`|IOF1!xa0+)v#_;QgM?`sE*R0`i)@_Yr&q5I!W@)tM2 zA#ND1Ye~E@YdyuhEb;n}6Pg$($>iLWn?p}vy^mf_@*WYdIKCNHxbF&N6Xp70BuOsi)4+IKRW z(Q)5j%^Z?2&RXP63ER0)Vlo)Jo2f?XRK7pt4I5qpav1~t8oAU}^1u@93h0faNBWI` z&UjFYIp)vL317OZU3#wO#U zN~>9&xzbs0Q;^&^jIx{msBeO;!lH+3QeL^ZW;^+lB}>elx35ytkUcNe?Dw}yJ)Ql$ zv#o7lRz>(|{YS`1!>8Fgdk%pM@ja#?~5%9P6n5Mjs{NgH}qTDpGk-kFIVE#*{( zS3_HFSoN`uPfM5znc+>014$udKgh%7h@ce4SN1kcEpjD_$mpR#T07}62ha>op01{Ml{Ss+B`L^i zmV)h}M?mp`X_8ossa`KPsY2n%&`1t)?%SG=^y)AFm*a;(oR{eM#;7pkB18y8CisfV z;D!I)gwip8eA3FSlOt_TBbH}hi>GDQCLcj0XsJ9@&s>tuMvt$0T(Xx z`n;F$EDETfz@<)au65j*ZZs{+QFnvU(mlx^Z+E%y_XL~v`}Z|b_%gYn?a>Xd@`C)j z-o9mR9WX9x&I^;$v-`UcPo3`)z(ucgJ-ge5)cRk|wrU2g{_*0~wm4@qt4YDQCy&iK zySSO9Wq;3}F7uU~uXXkB)A_T-owsw|+gnWtT2(yY%_C5>VV)H&L|udI5b^Ey|E7}5 zy}=?A{*oa}O)E;Cwe5+Pnj?MyLyR{RcZzt_n1c%6;dF*h9G$q|v~N^9baH4KS&r8T zx%rP~8W(^5dzcKb5xoR=SqE;pxXi0D9AYw=*z4__;H@ 
z0RKt|D9~#xHZn@St(}LP{wLzRLR$%E+LmSq9gZ0LShP}=kSm}u^7WwJCs-aF$ez@M z!f1nU07A~A84(nmoN53T!99?fcz;+SwG=%yF_U+eb4kph7nMSj6ej#+IjEEZ7!sx2 zycsb7$s=!Rp~;q(IYZmFO`CwZuPb`$t}HIjSr=*De~7^eyOHE*yMODs=8Hnv>X5TD zw0?Mfc3g0`NllgB^TIJl+wFW%vUbC#f3&imdfINU{Cws2>)uhTf8<=>(ch!YKW0{^ zPH*lvpEX;nWc&MfO0TYEAGF9>(YSHx$MbQPElbk}Mju#k!zFk1Qcn+)IPW$6w6P%X z8aK9KpRM#y9Ma}!O~eI^y)uPcL5?};A)rRGAx6hj>Qrk!-d@I017@_TcxptHDON2R z?!;zWci=KA*=T7M;mZ&c{X#D|4gUZSN%!86R;Z7tY7b%ImTKC<_Xe8?i=Q|F6=G&< zGeZSB8C*^z?C1Z%en!u-U$j92Bte43_^Ru#aciHL| zUB@iaUsBT7yM1hMl@~CazPULUL49uiUqLoH29Q#EDxEbZ*_LX{?`p<;Aa~X4;Kp9~502(r1=p zR6pf{BUxE06~7lep8U^621@!nnP>P0xIMgYccFEsp_fTG_)+2f>7Naan+4W8U2L+U z*_FpBI0Az54%x0hJES&i+O8`p^L873URBkI;}KT0;VO%DIJ)@DY z(jvfJ9(!Cg(bMzWz_^{8cJ|@oq9+%(Kb@>nI@jB9kzB#NKxQ7KItNS2NIvm(ppAZH zpXsTA0GZ!7ElwlaDJGd;?gROjY^1QAEWSW*2M7{!B-Kcpk ze;h?24cP|U#6HQ!1_oP@h4Qy%bT^{9pox+iiCJHg5i6;CEImdwyD+yv=2xJSO(l#8 z@#QyL*IRD3)2Pd2DjNYy#z1-9S!)3O?msJ?DL^Di8WOCOdO?TLyH*Wp^(*m|{p$_q z+cx?v1uM8X4M4MXpqE^F9TO7NH?nmtcc)`TFd+L?FQHgzi&Z2KiUKYptmue2D9heK8*v3u(^u3taDkAPYGtXuGA z)JbouC;nMPez(jqtzsz88ddgs%>8b zgRcIY`EVjBLMIw~$~_>%_~xD-flqq&QdW9c5NZ0!%`_mnu=;(n=TVjUzmsj1-CqW( zwE8`)AS&1~!lRXZSrK2<98Qv}Js@k|H##NYsw znD%(ywcOklbZ!H?4xM4tor~oFu$t?+{+q0vQ&V3&Jel(>$tjphTR>n~^Sfg%?Q9B} zO!XlZD5;KDOz-IO_tJcI z8hx$4ohU0Slc;B;+@V=n{&7Eus^bhISqbgKFwW5Ep%X)Enbs;J%_SF}@p102uChZ)gor+>Q*5;8{8$C0S{y8iU>=@~SH z6pJmEQ|2VIVqq#P64Qr@rVtgi=$F_JXqS*9^R3l;A2v$7c$jDcLTGFDe$$#CaE``_u=$Z4Ucm2j>xcmncRwix<=7(gmz!ylrh{;=>sa?<~?9Xsf$B!@2RA-v%dU z_ikeP#;b5`dV;Rj3+Hm{Vcs@Bhu7Eiz8Pas_TqKdL4S4HWw7CM=8h?+_BDf;jIY^$ z?dnxAgx&#n;Y>an`v{UF;lVrYQDjla46_6$gQ^%;J%j#+1$j9)w>(l;sk*1oeD54T zORJ^>b;CRu86>rHcUpJAAH4&1A5}0tNlkc3?OA%p#cS%S zAIY*HWO*0uHafB#0zO1hg6Q>br^8{^JMO=2l4WndOU1oo8!e+e4N65y;yVihmzd!v z=X*H~yZb77JSEiMkQtb6nE0U}Q({DXX%eB0!fer^W5uub6oO8GFL0kWQ^{!qm<7wGJNfc9N$;h{Lco5BJ~mpDKXt!m#&SAJ2>r< z`qnPLC#~OZ^=FF8UPFyPn%DlBt@!cQjou%e>pq-z`+elFZT5`?i+cxlAGJ9x?ZpFql-f0f}k^QV81Qoz@xiZRz--O0EV{>t~$zL7=*vA=xtX4<=)H6+A=>dSym_SYMY 
zKVpGk>rxETXu&fRQ9&hx%cfCot58&lo2^^F)*02ETHsn<-s_-FhGB|2y_6kqKRBL! zL%C6-KoeTBCJOgToOHk1Klk{`ls(gnzB8WsFZ}19i_Is0AG1|)WznLCs^?;j^d28y z_B?z?^jpY>m5H~%zz@UQ+z`}@0o)7g>u+r#5|XwFtIH|F`|Dnerp(c=GV%t5de&C0 zBJ3sbs4$~6NzRKF%{ca2-Dd=xskL#v2mAoLcOb`$Nw}mei6j~au8bH!q2jh{DwIN% z^VNQP(au5*Ttoa7`*UVHJ)gVx6h>VOQZ_Xy{`jo2G}!cSH50GFSFaIcWXa17xo~vh zyf53ihh@sGoQe!fYGkUIOhORFv@4;&M0~4mmvF(UVPosi_0!ph{JQJ94uVKx?`fPi z?%&^qjtHxqlwCY%i6w_0hqbZaDP6Iv|B|t&ldDp5zI`39bR2pbK-Y~dCBTyHyLacKBocrcn$jO@6#={vyPpdVc>1%>8W5*>2m_&()dS)_o6b7(KTx;O%`mJ zVt&CY`rurPxG^6XPrRjR{CG}pLTl4UR*gp1NC(4Qb`$U|IjE>nxE*-!6aUKzB|qZi z)oZG(4r=9}lp-yCm$wL+*sHd->}#)P$_Wol-(9fo@ga7P{^NKLk3iMxGlR7jb?;_U z88Bjaox|g&57Y`G+Zi2V@mU{_jI@b5G3?wjHKWt9YiuuAeV@6!`KypEY5~lDQN4S9 z@x#ER%7H3w#`* zVcHM(a9OYN9ij>XM_xEO2?-~HWeVTBH;i~SB;T}acTc-bGAmgYF(&^}20%9KuQnPe ziDf_$ClfenV`&yeI73~42D`(^HN&|0EIJm&*fn%I)TW{~Kq0_6W5v9;gwsjebhMTXnRnD!f%-Z4>DKb96#V#Y^qSI~w>_ zZ+EXez)Z0Cs3aT|LlChX@%Ljx@;5>a@*3mld?cETZku7d*EubTlQ2x^U$4uVB8eA* zybvv*w+n7BM!S~iRk_j^+??rEI0sblTj77XoX*cxoZiwlL2r;WH zJj|_w(~aV8VyH8{5`1x`g7cp%t(DK6PO=;8y>WcBw#$YsUHhHjq`8vrGvDjOG~dPH ziHQs23LXJ1NMPx5Bl2B_s(V z6L(tP7!&$KR61$5zr0wla&pm91DBGZpzNx*0ZX<|D#;8;z387c=M0cUoZhA&g(c7I zw_0v-?9=u`bUfp&-$@HfwTXN~q}C;)Y>(a(gFMnf5kj?__my!f%7GfF#|Nyf_2y}c zsF}#9HI>=T`suvk1~WXRiyfH{{C`6qEIzAt0NJ0UKcH4;^Ohj>!D_vgG@oT}aQfig zW6P|*R+Ynt*H2aHd&5d+l9T%}yWk$#&z{Y6x|#dz*(oDZP%)+`UuM>PwI|#TXfcR!8q>=D$AZ7 z3@V!V_~{I*){ho6>3uLdy7I^1{BqN{&k4HO?}n|I>l`##-&(Q7?*5~bZAP{lHR{4@ zjXyul-n59Ebb9`XQ%;4Z`UV*X$8DoL-+KQ1`H_g_k zDRIxZojL#@eZyD3d-Y0YHOukY?Qo9?F-Fsugt*NCfR^+JA}Ubl!q!d4CoOVDlqZOT z)j3W@0_|Xb`gD-{>|VXHL!!DaRPWmFaEG*uBZt4hL8$QL=!_v+$}8^cD*256_#yC! 
z=5hPxsoS!$++!}ASXt#~U)6aXbgC{|4S!@PUVS_OOm|^SZf1x(I+z%}&XV;MWU2+( zCth9zZ4&GdYLPY|ro2(P=`hR52xr7;BgzkjUcKU4A4^;~y71_R4ReNi-$|{UKi{-- z$@U&O*5CEZc7J%*%;A*c5z`eBzJ1AzqJDaqHI#8Nc)^eSkII3_IPvXxmn;`X>q?}% zH(xij@YdSg{99^)O`^vuKD%{Wy-C>S7TN7)ByLYGJTcvL8e;SM;l+y|v^Cn3Ro}PC zUu9!50|OOv3N)N*9oE!1yQJo=wu+^C&1v-K#puzCbreTyWM*!2I`nnGz_wevN7Ne5 zZxCb}95xOoT&e0qdIeJN!NEn+!bjYW?&#=Ix9Bz=0D)IX1|%mz5%t-SDtB|oHb~?q zck^g^0I34KmCPLRb8BLD;K8DQVIJHQA?10Q%_&-%URCSeB>R%z>g-8v+G^__W+Ke& zdjqvb+bmKG?G;rMexv^Q_2_MhdOv@;wo`SvFr@PwhE^&#cIwb!0Pq=_V2lcVCnK$X zEB?(`k;D`*!u@ub;ZyJ}_6dDtgAude^Ls!%_z5v$a9 zmOAv%njPQvrC#pWpZ&T(#y*c7qEXY~Ny@xt$k~2Y5Pmy6ZQ>GJg_wX@w9y7mP&0sw$7T>r2Ko7Qxk=AIx%@$x0<>g>Fv-Y z_es8$_ng|_N&Rm(Y#h^f-67wV&cD3X4my;3{2KQ;`@1&f2omJWFffJ8e|OOBJkh+& z$2&T;;F|{`JBp10%!-TOJo{1*v9FI&Gp`YGPglRbO)PHx*|5SSzsd&=rl%23s5Ge! z{`GsgmiFtZ)1qP}@P1Qn$8iyq8K6hkX%vPrP6Lqp%Afj&9#k`^sALKOB3}egQW>j< zfAJcTy~*S4_Bp!C?^#-Hcvs@oKizOaxBRX>d*vU{X|o%--q@M@1D?b zQm4&!w%J#cWkeN0UgK$IByi`M)2t-a=@_nWn18)ZlV@WP?UZsh0e3c; zZl`);X0si=lwF4#E)B3h8F+bdT5b7BbEO~~z4EDXQ+sFEb+5RZzMXg=4O9V-^p7rl zZ$J7vPkg~&*2_-WROMeEYg4J|w%F(Q((I9H?R9l8b?6Y#?6gj}RbcS7yspYm6bL*% zccAWfklpvAOY8I^cGl(BrpmxlCRj-T9t2oNq+8nkwxA)vz|zCM=~^XQ+a`xz|EUXz z<46j6ZJxKazyDfirvP*9O-p0fDtW$EiXn#XkMe~ZH&yRiIUcx)buEUigydkujFY%F zRX{x@Ar-EmVO(2pMAj@MM)-6{Jk$Kn9n((fmF}hO855;{Ge06{-$}c~1J5fHRy@yF z9Qo5kv9x6?b(f>TVb!lIl=P;$mZtmyTZ7F;R22t8+6vA{Gi#d zPcgdBQDJta;PpO4o8^1?K@k?fA z&d}b+dv$7j@fx1-?RPaZ(jpvbb{=O zom^3zQE&RP<*h+U?hHgRSw0TsH?qK2mC3yuG+5VQtodj=rJZ?w$ALeJf#EXBO?p*W+HhDKDc?_bv5vznl$^iO$(l0P7{KY*ptsu;I_ zh74?*ZXXvHaPV1N;}1`J=qkS)(f6x3{*h2RoSNl~n52P356CEy`g@{-Ktl&61HNAd z9A}sCBD0TJ*731U zlv9V)6Sip{rQh34Pf5w26nhPf;b45pwJkx*{R%bX4)5KR-`f(|9py&Tmdz$kQ1e)O z@Q`!9idN?v*T?Gmp7A|vb3f`=(h6AN_H=OeTc+MT>0iJta}3KO z_84R1F6$Un@^4CkZI(e-7S;d9)_Z`3{I+l7O3^UWvJxr{4JnaOQ5vGXr$mFKr70q# zq^&_qL%XzxChgKtN?K@7O46YJd4GTJdmQh3{NLv|p5u9*@1uOi{kiY!IyTMGOBZ0jG2G?^1JhkGrVT zs9;Jav}<@HpNQ24dHJe=sn4E2RLk#3d8wyAG1JSRHXW^C*yCH2r!_ 
zdQ8sK$DtY{1cTGME^nVQ?B-(>nVVzKEed{L{33Gmp^G&8%h96=cbQx}^h>qWQAtSh zIdw~r+~>gwjs4L@&UUQgBA#ddY17l($O7U)q;7?`SPs2T>F)m1nmKUA%}qa9T|Vrb z_}=&g+R}m9ex@t0=7#%Y_7x0Imi2H=wm(#;j^QfR0V&bH&K(`1q=*@ZhEkucoH_Bf=#9-B%s| zZY6<8Co)eV0ddJM`FQD0ujyyay8R9xOn>Ujl<&7&tDBBHgznS%jT>XREB1NNRJTPO z``h9j8p%Jj&tb}Z>?F?|FNVT<0USYgD)LJiDRD$MLlSI3GBQzpxkpVFgax^2noW#7 zt>@2&KD}mJ&rq2oZX#W;InAG8@T%^GynU|nl1I&^ScTb8qo|s9XJOWGmc(5(HpaOu zXsRcFyXaKNslW+Y3MSf@&d@eYEX_+(B&29ipM4y;_xhJld%V60?mG7LoWRzY+c)hL ziiL^-0+g@Re^jGM7UTmn%Mz4(SROs`_2tj{d+Tqlx1}B2%ZVHoYl|3QUB6EEfPbVs zIy@Y{VI*u*&d7K;puXbUtmrl)m84IvK5j2D-8d7%=d*O-0H-Xt0SJ)5CIG)XNUB;N zeXxSbv#Kh_iC+D#J8>d|>oE=x`H9grmty~!zT9MGlUse_`&So-(j^YA4ack*ZBM(qd$~|~{N-dl zx{s-HrWf_9S5N4tPts~$k35QX({0|aK8{-beN|T0Q!L5aV873#!;K^J*#zHs{W>Tl z#0!rE050`N7P=pZZbB3^4rDi^O50|$w@3&K3ZK-Y*|C-8Zb5%L$N@*Hs(N=GOIy#z zrg`NW{d5rEqX{#(mmn<(t3<{Qhy$Uf@#p}ExgV%e6gFU5lKq^pErGu6gvt~m z!xjUImXAC3xb3Q1-RbG4_d0Iw_q1H%C69w1P6s$IVS~dbeUdn9C8(0d7qcykj->4v z`JtK12E!0qWFiE@$Xzh6DjzIBWQPAJ)CBb) zQfjE-Zi9&};BLlY#m~!R$_r`}(ILm)46WlM77O?d?Tx&+HKc243#NiCor5O+{|-{) zG)%`dBint&@ig+;R8{;c64&4Z5Y14(bH{%XDb`aMnHRP?cPk(ex3cL z(tmE;nf>U$O{%cq&349*ilG%>^pvmo?cJgK@o|3gYvR30xMLuJU~&P)@hRe&1u-tT zc7RAw-j=3)L!SDWNhq(pyyvU%FmCl$_g|xt;XLQO^KS65&LDvej5i)1mX>dI+xZ-) z88$8cU8apZu#c&cl9I~#+OoZNq4JpF521Z-#^>bZHjlJy{oZA?w||T=akSVTj|R+m z5X?a-4eBm0+{c=6vA2TRy35HFE zz}P?ETy5S3h>wJl!vQD`|0GSnC=lRR4s5ELqK2NiDPYio`a0}PkNOO!e+2J?x94Q( zioT}n(t19Y-p{DM>$vjRt>u2#OGeuyx$ZkoKM?w+^x{DlD^#-@k~*BYRS#`P<0jmU^tQWM|65vc&_LJlJqz;pqVIls!O>h_rF(-vO0kWqE7UfTBK%yN>ArQFh-^ul7VeyoG+aD!Kz1tVST5`_6}A} z17?#5N^(BPn{n+Lz1>^+9VUi)x^yZOOGx!w4?miwPwBPXTGqQuZl4?kKNh)!@EnIW zRKlTI)AV~SU+y{(iz>d92tRwq$s($C8Ksy{rQ%?CbrtI5qi{+iIslj*+;VV$z>D~4 zz;za!J~CAY(Fw8_J!6%0c(=fJ=KaLlz-72|V8vjWLmF1-vcQXZFuZcs0Y3lVX@wa+ zu0PhzMElBE?xp;@;(4*ww55>oOz&oFZ5pm zy8|P>($Vk=!PoCSzGCXQfLNljEe zZH?fJ>l3=L`m#rRDF*)FbhYrON|5Pw3|M@B-i{qup?u&~*%U6#{XIsiDyO1d_3<-j zZIi!~g0Uewce-ca4e`ZYs;ZHtq3zE(U$?TOIzN=K?JU|}c^S&KwA6mM#EB$M28W4` 
zwsIqA4`2yo5Ut{7r6MgueX5u3|soy0Ec@GRJ#1VW&9i?J)fD}1Uaal3&^w;=MC07QAEiTrvVc% zPzf{#)7muUXD`7;#sj245kfo*pbFgzfa>oG`5atbgFu8uA9Dhl0lZ$!ABhzST2;`| zmu)U1K07Y%AP!4@M}6$jgfS0gmC0Y_^w9@YN!KH}(p=Ut&=$=0^SAY$4#<{5EqUnT z%R$IbF#C#CZ;3Rb`Hnq?*g6wgAT~(MHe~TS?m1riA96RkOSP#ZaO-ZtOaq5p^9~|Y z4S?Kj-i!HSjT8R{(-c?g^x)!w84_M4F>4nA08JIQr!5NwZce{-g% zy`=AC@`1PlgG|Hd-3Geqf67yZFrCN1#0wVUsOf(}vykoW;|gX+i|v!Kegz7Pb%UJ*iq0M2QU1{! z!@*JENnN#;s=htfXPZg7jg;_rfEdkFN(wKFU6CuVS- zU~h|_cncTKM3gfvr)IjfE2|1kANgxB?&Tla)0xi4n`%ajDztP#*wW^SGEb|i>gaPtrID5}%2g%7stGU_M8q#m< zB@osUXDigFXZ{mO6ZZR2o>b_zVYEWtF1FVs>@l173GCNZxMD^-H|D!r`ti-Ot?c`D z?1Rl38n0)c0?fe?kde?-yX6l6OG1-i+ zO!^`tjTD-w_E(IzPM(xvf9lO3%`MrTX>DK<;1qV_ON^e_OW9{b3Y{rQhBjj@@mbKr z!aO{>TTeqE)b!dltvcS6|M)a6QmJTw$ybiTj2MCbAFAt-w$7hCCp_31G?KTh84ZLe zaALmzjk$F2#*XbgyHMWZM1>9)r56i)FNyIQwgHGb?w)ot{6AFI(lyUVWs=oNWDjga zmM%np9Dm?&y-UEvcjqZxZhMu286Ukm8ACx-p`fCG=0~g^Q19X+6(zMW3TC*JNuJP1 z?c`?kW)SSa*+md(BByu~m|aKzBP*>PP5{_*t1v0n$-Y91(M9KfIhQ^SynYAqRbhzv z5qpvQdtG@~P?e;jN^PX-x^muZpI``kL0@gc#`1h}n7Y*UU8$?k9k;J_k^$NGq0moO z2bUa*F*5|65S|H~eh3*G=eJL=hP`;7VvxqB+tbnt37UM28min6UK!bq(0KXj=C9ry z09;4JsF;X{LFs5)1R*}XT}cyPs(7T$z0aT57KSjI5t%pgU{YW>lj2`fLq|E-S0XJH zz|7BnXd}jKfz89VU%#C?MPIw@>O@m}CoEUQ{h3x&Ufz#oY9}tQPN=kCScy&j&5A{y zZm@z`?5lM|+X3%$3|f_rV=Kor3!6B0Cvg3UBPKR7%+rAVfqeS9!S2|4+glWeazyHm zuJ`i9^}R*XVzZpKe zK6!t^^YOVRRr?r?PD`Hjnb^(=fu4!>pK&kp%N$eb6Y>`O_rD3;cHm@oZBFg1AKzA~ zZQ1Xg%MG0aTxU)#zINi_Sz+5 z7f4KCW`E3a{$`jr-JjVr^{S#=vce{_-74N9e;fD$Emo=tofhBX&-z2rVJzMm*-O2- zLgB^>mg5@-!>`5DSWNfGKP#4(XQ6zy=Qw+5ulut4?Q6NV(5E`su7OF%7xA_0K|M$KTG>iQ zuJpgQ^X<~%%9_-oUx$X?<2s06Ey}#$(9j%MGn3UzLj%oh6yZbb>UygtUjg$A)o6!n zv7e_WCH`)6K-s2e4pj0#OHZF>G7G_I1>CPOv=O9N!5atO1%)+iY6?ws61eE-=_w{o z74XUJJ~H68-u1T}S3t{|t%7gYU6R;8okn-tf8;)vyGv;l0-A3@Gy&L?nwlC4&(GIw z*^p5SZ8vz!xH010bLY-K{?x393xS5u{n?9j`fokTJ$}ZL@$G`9gBjwd!|giCe)m`v z$ogvr$qTo2n#YIgaNUnRn;@YaeTUa#gx%{P8|?eM;|F|Yd8Bpa>q91Qv`H7==@A*P zG-w{WnErfqV+yP2#N4*8Vp8d4p6DBJvP7f3*a4G7KtQSPa!On~h+G5KtFWgqxcZ!x 
ziAe@F9fTbV%9kVvk|_H?w10-c5g783sE@Uk@-VMCW)scV*e!Ut?#n zbwqV8B=jS{em|$V6M#Zu`a;~&@wZ?~_oDU|Om;!TK=~4dPstAuEXhuStJ!I=pC*R> z;kD=W7@V?M_bD@UHBdmuh=C^KF%D?#=$BzFil1W(st0BJ5HX^LiW&s?A830?_6xo` zQk`bLol}T>ty3IK8bE|Uzo0qFUnoy>OCTXT;ZDa}W7k7l2#r<0+q@FOlWlBmFT=or zB)<{r42aaNs1kR%7v2m8AsL*HtOz#_;*eClv)>Yr!tlz8isui2=4Z0u?{n(+POBXIs5f@FcFhO6EH#TO6E9NzT zhB%C(JpL`YME^oTLC{4Q2CyVwrxNo8s}Tm&$8f~FlyGK1{f=z{V32SlKFc4JXm%lk zZ7cWiz&Wov8juXe?@jKF?l**3_{doC{-%vdeYmS-8?>xljOPVgoAY3eg=R;6{je22bUny$|*<>Dz2^NEWanoUu! z8md>_@?Jdnrd4`YzImG`mJ#tRK;_U3P|QamfoKYGnWc;UjqZzmT5&qh;f;bD6die) zi$FHau*CM#UWyBd3cB8Le#GAqXSmp2FIDzJ=r@^BpmwqD|58~Q+pjte{~d!ay0K(6 z|8UwJhrA;-q^95G=fiA@yw=7B_^)8IkN~E-xt;lbWd+)QiV3$ziZ&~kQK}O}v@Ncd zsqx13>+pr;{Hzp=mO zqwH*sYMUR%7G1a64%F4(6YKYP%igfHY;Iv@rqg4f%%Epyda7H?-{nsC=*EWlt<85S z91IF9L8*$O&ziHn_`aLnmS$bc^++%YCL^= zf6&=mw~ipzKufEu#`?LWvLJdgFz2669XQXCz;Mj^0lywc^zuyK?vfBnsXk>LoqE_( z!Mn!NR7}QZGcTXuP*7-n{N@d%hPEpH^&B~;4jp0@9+GZVOi;C*3}{R^ez0ISj(vPM zFOJcEs1MNpK2yq)v8`2)y>PhAlCylulalA}SO^3l{{i6Gzf+@SlR5)E9dG6K@$1DK zv_;H~=?kXoR<k#4vLdhFpwHOl@#f#LBW3zkLO;HVUgw#!u0A0-GwL?&xOAZl{?Dtg4d6 zln}K#2e_cXO6UKm@rJ<1YhPTO9FmvNR(HaF5ru~rHeEPD`u=6Lrt_7hOBtKyuivks zU{L!^_4K0D{auFyuWCMLdz1O z_sYwI6&kCg8F%9Vk}JaFbN2e-ojU1Fv)&gs=z0Qvh-?5nLbqPs%>LQs)a%$km-zGSfMJ`(lH3TT#wXD(^QX zIsuu?E*O@{VrPfI0?<|U%SU-B68rbx`}XZJn7Oo@HeFYB&dAf%F40`pEd5zCEDrs~ zW%Lh3{7AMthtRyCA13496T1`@ES!sSOIN=8wQt;HJaS>2`mK&`p6ANUeNn4;NqGlj z2bI|lwL59L?gl7l;EIXf$8f$EBgg9k;TD@WtS1%Di<@P{^);^Tu|mSPY8IoRXD4Ss zJk0tZ)4YeM$eEa!U>r5ybhM?TXUgjP8M9yN>0W}52)KzNXkwUxjw;j^fFq2Y6y)W} zIGem_NG$tZCiyRL?ZBu{6bkiPC_B8p>A(>BaL`Y}R&DWuynHN{2~ub;a9MO!ec274 z$FO^}6U=dKb^rdZ5HA)?rQgFb2u&2AifR;h+}zy8IAp=EB1kkkTMXmGBK1}-qlJTW zrd7MjK&Pd4!Hl$}#Ld|Wg?FFQvo3r6ddADBn&zD?oZQs*)3eIyS5o|#jI4qbOL}@` z?fF0-O3HX~B`$8BfVA4Gs(3qr>MMc0>*?qdn(wN;jz4C0O|c=)P_Q#P>WK z+~m%{_F}8hUzsH(xwmY7AGRN-Tw6U@e(D$QsHo#F6_O_G?d0dkh_*>=$?

lutau$}XnG5RsjeFtwhVI&if;cD>tAFOIHUt3#bR4yi*u zzov59ttUd9zHgQ<+O#=j;@8gr$L0>p?s6ktn|)#z>0UX`?0LJA5ubbd-*NkCm!cJA zDGIO-rw98TLOC93@c3=nqH3hC&(TG3rhH;2ZOS*dUgyz}FFmjQhVEp!h{C9Os{}js z^5=p}%ghE=rtjKSd#I;crCe%Q^giWXx2WpT9f=d(=!ulJ)6tee&XtYW*D^)JO=(H&KfBz-%Usu-prHs`jR5^gGh0C zpFKEW^`GFOkUj#+n&$OeB_x5sq9!1|RYZdgI^YX5yo8d=fWa^#Lda+ul18gvPC`pd zfypcGiy8%NLadtz+W_?n)-TTSM9BQFPUcT?9X#8!-~G3ag+{CK-GMvn#l+;D-_D*M zAH6(NbY8deXn6eANgu&aeT~Aq+>p8yQ7C z2Dk^=xUWwRMG1|1yA9skNL);yB!=u0#}W*9?q?xw1FZ^HA|j-o>++>d2=nLv)eCwN zdfwUjM49N$0(vTk3kZoL({&CGD)^e9iT#6tB_?@bwvcFeum+GIaTz9iJzd*s&! ztrnyfu5G)lZ{Xo7MmqWNm!}Pi|NXKUU&f{s+?x7zg7>e!cN=&3(#<})Yairn zRYHE`-#X~_YrDkH2UnCJ51)>({!Dp$fS;!@KVP+Eg6{Th)+5rohZ~zihVrYV!;Vca z8)Ptgi~F_RdWR_IEv1GnQ1looX%Z0uYAr-Vv4F_A0lD5dxgq99es&HXthkD{AAmj_ zMHzg8@w)uJ6kE6Qym;SR)c#COJ-9BNHz^zjrQ3lefk}(t6p}mvxZ}%Po;(1VIDpVg zRx~ttV-WZTXdIb^3`VqPiL7&WZom5d)S}#*6CCtAcED+?De@$-yumD45rRH!ja>yv z$*S_HSLvuZIOX?coZlD1_}-h&X~}H-;L6gdXxH|={OJt!S6B|5>12BTJkYP}#>^*e zKCis&>_Cj*R5Z;Bs89*~z_|=owTGb{6OO0HaMk!YcVKrl+7#FjVs1(B0tk`qY&gK7 z)-AAZMt@-0_vQziwUd2_iuOydoE*ELV|#aq2u8Wh$s^IF$u#5b+m|>h;j45U7$x8m z?dXEfb`B0$GLgWIdb6q`A|rhtJtAy=4lV_i)>_|Gy?g5HNmZ~hCLRZl8ehERd;n6JbP*N49jTZpV1F}c+y4L@ zI}+7{f}%{k$OD`1-i(7fj`Flqy|nWqmG`$Q_;ByL!_6NLM$mw<^dV-Fv8ft4JFo^ls2T8nt)#hvi)2-3~_GS(XmGtf&}vTt)b2X@r-*zQAAey!L121{=-?)(iZp>96Tf zFcID5%j#QWS^LZ7l#Gl_mqRJ6qw#L;4#g4oJYotL zkBPa9b^hNbq;nXYz?yKTcrs%{>1_~r){VQvzD0xp zn(XPVfHHT%OEE{->c1iIg6Kdj+^_%r`Mz}q{j??h>Hh)}9zLXOS6HdIXY)^Vn}{hd zA0LT8f3$Ux8s8%EuCjU zzvteQ@sYhPr-Op)^a2R!Dl`D~rMpxJfU>7@wuAQ0nt`|@-<8sB>aPn6!|*%BoS1!l=KX*)WkZ<$ zZ*~3>q&JjuN2~-wG+=O-!JCQ|P9z(UA?t7z>7cSS`SHWV;uaSNRn5}?d%ySZO^Iy~ zm^XybhUH@EasS)p`V$^atnx=N(VVmE$uBJ{ZfN9u{g1`a-rpFazVE6Pkvc90RSxgyK*VKfD z%wFx@nG1>Z6l`{OockKPwd)@oJ0LCm!oks+jhXJ}&vxs{^y~7!;YywZUde{?Oi)rA z!8DX7;Fbi;IK&R8qA~}8IWr~AGgu8tfkvVUz8H$cSmmjTvsb1o0-bjT=7gE5YSI-2h2T*cN+@0 z^8n2=1iN)Cl7m0yUW%r#ZD@#RYAE)Hb?sNom$qgqYc>ODeux_kg~s_s%4;bmO|%K9 z@|I9yKJoD}MU?@NFT8PjY6>L;#77tl9AVuAu(-?R(XS!-?pv)7S?m=MQ|7gn5+jAV 
z*LT#naq;mFP}W7jH!Q3XAU%LMR$p209ATfGr)zK{FDD;D~CJ}bEQbt1ej1eJ;)^iS?E8m@;lp15^h44MtdYb^}MUjYEmDp4{ zEdD^UtHJT!tIxCdy^NJ4rBtLEU=j=u&SIzyOJ3=`x^j#|V6u&N4{o)B>tOs^+uoji z@xc;+1H3;=rUjMPL5vz93JnX%U5Y;9CMJPrEbY{ksl{4Q*b~*h^RKq^FvRON%T@pb z7L6(5%D0|5a|Q(%m__M%X|_BXMMipgmJZdnkr%yLOU(mvb9moIQi(MHWF^c!csK~@ z#Ti_{eKg-YfCWdC6!51odH1l>^~+OPd7Zz1OzX3MU*5V6$Nwzv$nr+b@cyf_ZG&F_lxcT`LPy_%~Fl6o?3(+!6 zW@i148XV#zWp(xU=!a4wPa^&T+$G37X315dd%A*Fha872hEB1LrS0Qmy#>_APr$cb z>k!`xW#s}a*mdCX4Z1U=YVo={(7J$_dE`9xOh@t#63+{Y+9#E))TBB3T<{8p6?nL% z*6T^qTOXyPqw6lRJ476PzE*j4HlU0nBO8)+T?8nRC~`onMKsZ$L@#H5$7uNNr7qQd zdT{a3FvTg`b3I-?Z_d*7|5={AtfgS%fC@+!g);`Zq4m5-MkPZ9(jpLw3mJZ(RtyCL z8j4mAwM87eD4_!TMZ@Zwq6LRjBaeKq+7KYAtoJ`CFlZ3AaIQj;(_{t`AXex@xPg(P zEC=>+D8O|vodRwawRxM;^pHGY6=CYr`>yw2fD;ArJjA}ywIwmapIsw3j640}mOb1j zT#V9Q&VD|4qUM-w3R`7fa!Az#wECDGr^*m?qiiPT7&-;)!8tg(8ync%en?43NH_tq zC7x*z>Vo+A6X?Y!-PU^EEZIq(C6C~ckZ^ZB%%ss;kql#RZ|`m}Kl6lvuoEj=pms~h zFJ33#<~EsLV0~QbEn>jh+O`+<2JSr>6Lr*`h6=xeP zZb%utiv1E^BRJHXKqpBFK`s$7b;eV7o_bRo239wF(NrliOL3a$3o3A~{<$@YS7)7@@k| zdut=+#e*0WVSYH^zm}}O29Ux!p40=}JpcOR8`JZOS3f*(oUZ6%(ad~Uc|SqK zzv_X3O%9HLIe6WZe}8f!gCf6Ezgdy65|flPmS`3-X*O$UaSt}=jo+CQ{>KO(sIX(F z+#GjEd-e1eBLUE8R9~5kiPc|?!W zuuVu=(x#$0F~H-@Kh~;u3Dt`CV#*42bcR*u=I4{sa^HW*;5;ZR=WKuD`d~E+r=6n@7tb5o zh@ z1UGdyZ)Urr-R@7V6t(-`Y6339!s5y6*M~5vgOAu!ww=I&$qnu;BP(C09iDr3s6%<3 z8Gca;Q!)%O4#On;_=){f-2)&D z%wvX~JDqjH>p7;+DTT4u?>mqlG^okRGk$#U$=|?Zcb9NdO8t#fmH+n{Dx>g{t*q?) z?ds|B&qG3Bm9z;CkX2en?lPb&JE#q|tA-uh%;tP9<-IZdLp;LDuk%5()7P0GmJDEhudFThBnH4Q$# z)pYVLZ&fdJ;DjP}HBcJV0+|t@n&#t%@pZh2$YS3i8IBhs$;;JUM`^hkRgDvoh7f;? 
zUWG4c@a4-rIJYo>tDSSpK9AXPP>@<4w?8HsATN>*c*Cr}m!VSmr|~fnPHjCspSy~l zSD(~VeaYv!<#_C8huRi0%Cb^WRBWc5lG?SFkRarmKU5-aeAoXZRl%{(pO*?t3Wna) zzSm|Jx%qoKXjM~CspX2uE)-m5U{?`+7t&u0S^>;!=g7`!`TCYFJw{&V;ONGMmxXgER11xuhnRm%^mU$*SG>PC5X#VK#BUrgX_IO*@^rn9j=Oox z-84$6xHk@|!HMASh6CjyA@)CTY_^wstW9`v?Qn9E)PI?qgN$Zz_ZF$Rz za0mqDpHM7uaLIB7?GJ4fq@T>tOvRNIHn~k+2Hi>f#rdj zy%;7mqOuR)GUl~QCiw1V3JF2R!xNfHPuX{<#9>m3Qvj`%@V-V6*P(pF5Dy=yJo*#R zfs-}*VDR)s>%IT}1E2F^exlJJ2~dPo^U_ihgB`H4F^fdo0fO?jeW+gOACrjiNWYP= zC;^_-#Q{N{)XQQD%F&pR`~+r%{M0&2W<5gQf^{G+c^f0+dxP738hqG(85l19_~x^B zcS(uo7ytNp4$Q@zh62tv)jpLyh?1EQ1^_Zas5OS=lx`NH$RqIZ?z+c*DX=Z#W#;wE|_hmrLAmi^ucFTJEdmN11=dP#H!lbaVRj+!NtZY(VO|Egr`smyCCs^2gys*$IJ$9 z9{qsE!GMGKa}XhCB%fN$ArtqHF%UP;y%0X|5RAKpV$UAD|X*}TM4Q&LV}I^ z-#Qanh?sl%$Jd-egFgK49A|p#;gwF(Kya|VoqKgjCwyp?x9~=C=N6I`hnn~J&9Wsf z@^9>*oXF-tHUT;byisC=4x6MSc*T5NyZe!$K+k-5kfgJ%tZ4DU1Z^Jx1!DaJUol^_ zdq+^tlf>JzTTj^=+(@&bjj3Z$zxMCo@#BSWqB94NaiGH>{Cltragu1IU&l;PU0wYL zW_E+a!z3<-e5q(*5`bZMx%)LJ>{U^Dl54r3ZF*bFW;Y-K(l&G>^*v8yJ!a$}vTWbB ztpXMsk3lf+@9!sC8_bz@oGI@S%))?0+0gLfm;C9aMI#HZGR4rEJCvRy9{_p~q#Oji zqk3D_-{2esO|y!mu;Uy?Q*DY6N)j~zhEvzFVSW7uEKwN+IjQ@Efkvoms9nc!^Lg1= zZ*Uo2DqmYUgX$S5?jT0i-%$wXT6N7loK5bm3c6(Y=PznolC%aN055d=iW^&^LIU6P z^eChTGBd*?#HQucwjCe4{ghheI(i&AxwswzQV9Sr4NHfF2Rl#abvX@(XgR?GNe8_$ z>0oe)-TIY90vsF0-+y~GffEF;09!txNivBbJF$1($a~` zk2G4jNz4a$UIeQJdXy5f_k`^ORNG%iMjq-596Ee>6Iw~k(DYMXulHPem7Q&tXMBxP z>MYU2VkT07Gd8ta5F>TiQ9p&c0C}eO@%Vuwa)&6ua_8olA z4F8ljL(5xZGwz}tIvc*3vc!Z0^!hh{Tjb^DlEFR@6r(&P2!p|VgyEIfnZI7;CEo4r z3~x)loSUb;F?4JN>;pU?rw z12_f@v+mlc?STUgUR5r;L8pO>4sbZup!$xO#os@E5SA{O3o(Ut0$t!YL`3M`IFyHv zl+bT;ax!(Vfm(@Q6LX;r5CVA6zxVx*{aKY9mfCk(ZM zyVMYaKnR4qAi?464|w@f-_#x7C1A{516@-H>AVY1vncRv2xCgH&yvhW=BF6*3e?)CF5Wl16Gkf9~NsXTQDyu@2kd!$l1v4}){XG^F%iSQvZm7Mh6tTJXL?()l?JZ>EcK5=X%siF zJl*Xpn^QG0IA|m9-aaxu*2W@SkAwJCrzP!=g^-KN+#F_LsMBS46y-4LT;jSvX;056 za;l-B0sOQBA|k&MCyF?q&jZWO)a`#ILr;`AkqR=!A9jbtZVIaBaMZqO3at-xT`(fk z{6b-wqHp!o6M733JYV!_f7i8bwF;w zSJE#}vyEDH%X%YUIbbxk)SL?v$H9FSkw#UXPl=I3%k= 
zvRC31=Sh?8@|SEjM+8^x+P?iB60M1mJZS}iULicL4O`x1Mx2L7R!$=al6&8j9NeZa z3*IsicA|JBdM09Whp~saFBzC9Z z*a?XI4Lz0Rv|>h^q;2-owz!Wf+>}-h>5|G?IHbOR`{wyqX1hXqdV08ob2jd5%a#%R zWM*C`a;sxLGBGeRc6RqM_SV4G;g9g*tBMLqlGzQ;0jWmY+S>90H3YcCST<}fHs0cf zxrNC1__^~k_u}M+ISNnjQZn&<{Fwc~x49%IG-mM1*q*2B8xo?C{wl^YFGgK+a5y3L zNcuC$R%IN%LYLn^wyed-0B-|1<ODub;9~rjRK^v-{ACsia zJvLB&hBScYA`rCPK6|g_Vw6Z=Wa-1OFbdy3rOF!@G=7vRf(JGPXyQCtu^Ttsn=70{ zRdwn3b?+KRKPXr<`0vj(-K7ZiFdeo%fnhpyJ#bxtKS{}cv%GAEJIl0&ZYs;CaEKP| zyHi)owL`$ggif&VPQ(@U1AKg`mi1$qp`MgkIjQd}D>F0|?n<_{0;w{59=G7aEr66) z;ljm!y@QsCgTLc>7d|)pKToVO&wYnH=Kof6!0r$Y$vGofuTbUb${5FFv2PPmvR2nj zt}?#+Y;QuL!a6I^t_Sa}FKmjS{`N&6iCU@Q(ZbcAbDs4S|(H=&!?DoTQ6_- z=W&}`Iwbnpw~z5&jG<8Y;GmTG02`Xus~4g_+!ri;FdudV?R7`QRk6z`6COlI4{Tn# z07PvsBEtav$W(Aj<;W&@wyV-hpS{`3sKjpGfG>;Tc&$FRkzI_ zObjn7CEWC5^X)ra$*+);^MYSMpln-H{kkNIoiV38Je!x+^w!BrdX)9ft=_VVX`pMS z6pCC6h(5zou|?TXJ@ltr5oN1^>Z9Q$NBv?Eu=(~z%r7Pt?;HIRUp}Vaw`BWQX0)ND zEB*Ld=FeAlhg*zHi<53F&kq;#NzlJr+V@%JM*e9dLybpom|I>=>`C>BDji;$WEx#d z7q@o(H+#kX-+*HDsnN;Oo}9uzt3ETYm3jS7)J@nt5Ea;PpQ6-IkWzp)0tgSUr{}R9 z+k}eWyukeHgaRkkhO3uv+}QTXm;e(T3EAU)yLlTMxc|+ZWMdOr*QmSU&<>gub4NO> zi&wYYULK(JIKU&h``+)0s~l!i6viyuwp>|%=zpVJ zK6J5A?%B}LvRJ>lZoS(*)6vP9O+)MUwk*~%&``d;n=?B5W@)wP;D_js)W>OGQog)b zH{X&VmbFVhb`uqyf3Vcn!NH|t&o(}%xV?_=Zq@vv9rd$y?sr8;Tc+=nl~#_fWG$}R z?WA6%*d%WB=-+D5xLm{ADYxj=yN(s7*?+sa`cTZy|5ZE4^X^N_+TY6+uCK*+LHwgpQ?n{O&PIDYdIJy$5>oryu0xsDbLso@5soXguC-_jKPr!^yEKFx~GOZhi8^ zW;9YfA1?T9Y;4_Sb99gY2z&01ulL;(RR2IVIX*j^0Xs3YM}m!8HFls%jB<5t|9xGr zaO;?NeD{g<8YGK4^Ihg23NGFGx7aYh=6Kxb(e%=!(&);s)+NW{$XTpd zJ0!1ZvGe%`P41LRiEJ@qpQchi2H6vSpr*Jlbo7VW*3oWlG7et2pZAhH??df z)|EsBil$vRw=hbIcbkX=3K#-@awF1?C2eFdOD1{>{87T(MSu`k zB4@wkcP$$97i<;2_|mlht6-t)dm7z`kHj29sS1uO@s6niFjY&ufD+LYa4P^dRua_& zxr?(w*OxC}j;=<2)zZQqEem0k7aAoJF2rKwMo|I7?YsCkGA8@_`pA@MSIKV5N+xhY zNQy61Q9>9X^*fL51-4HdcUmnoTSU$rI{(nuSM|?t+K9s#=fIX{CxTJS;7vi=bOQU} zZ^)Q*@mSm2>kt}kGrdhj2OWJS!6i^SEq(J5#CmZ9TLI}1lf`#duAVxj>S0_cReP6W zea4q%N#r_niFzUi*XQDEQ~1Z>JS8|kJi6aQwVj!H)V7F5_Mmw?BBr*Z!6&Rpbl=VZ 
zumL?|67sb;LyMd7z_sX@`pZJBBnty4{AiTxOf1fur);MpU`7g#bogXSTyAb)$t;vo zo{5b}jsW^40RrY#ooY0>a39d`TG+X>{y?N}jC>9Rd?@mpvm16?l8{nSX~P0upSE@| z$((6>5SNLC1)emaM9K4{0rjbV_inw1NB!-3kg0%2K<`-jB5mnY9LQ?ml+{?vMh1a1 zcJXz|lDD-A8K1Yt-gh?ZtF7JW;juOuMHR6FD;qGBsN>Ozmq@uh=D1@QSmF2#&X0YX zmj2g)8XMe#3_X+5Q8t!-msIC4KPE;-=omhCHz(Oyj~>|)n1)~tujuG~>p-p}O3#9{ zxem2Uw`%v^*6%7(iX1u495LCez8il#+Te;$nsWd1k0bUh{oAd3vadc^E$$p}vUgBcp%a_n(l5(iV3Xm!q zQ&X_wP|@j!o>@$nL@P@g4)|}pd7=86M5iNo^KJhupm9PTK|ilFv6GpRQGNc+?_a;p zz5b{IH14}7_Ga*XpiB4i@o99}_}hE}9H-qF`OY7>ZWkL{b6n1v5oY|peZ!WFjEo5B zHp_P;v_@zu2p`W0L$v*%#z9v4eW=6jm66zxwmJ7MTkH181(Ji8p!&JHFGKduozrB% z!IcURo509OgYTkXae4?Q-kTvcwB^d%kZzA;uG;K*||EzZRuVJ4Ly& z4fkVfgfV*ngAffErq!amP5O<62H|dDX$d__!o{ES^9SKO-%qWIc@Zpi!BNM9>2>p3 zXzHvf#26D047nUZ!OW|shNeQSc#@WmrV@t_r%5ee5n=@+q7r5H+qy1weA%qNq8rcT z6cp6Lkm0Rer#_=5vMRw*t7a{y1)T>bW7nqyj99c*o}nXwF#0%bvobQeb~OL^6tllp z`fQuOT@d{7`1mHDKVfyxJ2EphbphKCyj?0ll>vFJ#@afvbx*C+*$+y1pkz{x{QxW> zd5hcNK;j6kzaqRgU@l@PdLRX~dVCwcefIYTi$T)E?X;PLmt_EUWm}sHl`ccCu)2Fm zD)tNHSD-@I`Ywvy_3f{A#sB%4!tFZMcv+BfmcDr*uuY#u>_95Op9qZM^6(GL6GeFT zeS%2}luJ#CrtrB0V-tRn?cGW{XeKZr?g@yDyzwl|1ynA;~d>)+PheJ>^%66%W}8$&{S_sR^SD-V}_$EXUS0026KYxiIs9z1g<8Uczm))9lp z60j2(6x$PtA>eZ0YFO%fB%DY(nuy~^Do^qK0s>@_1R;(AR+fOGp`-YSZyi4^Y4h=a zQe;R{JGNNp5Ws+cmXmYz>w&G&G2!TjwJ;AR&QMrs0CmxAKS(=ZevIgM{rx2k4Ws_Y zOyKnCvNtcd3Y7xF1@ikWDT(Nt;K6_iiTC{fqS|AlyFQgjOC_dcVZj5AIG95Icx2>b zMx*SN92WE0|M~M8^udpC1)%kV68@uIQScTBZpZ$+6;l-4*9XcXaqf_S)9C1E=U#fM z0&!5CNK6yNG=ypk{A;W$Gg@egNo1GOMG~`3XpP|R;r~N z%)}K$d%yB7XbG{(#wI3h>{9y#$};r@$}gAThf&$%!%s*s%X19Bz=bu!6*QiG+GzDc zM#nX&kEk^;PXo%;`Ao~#YxmA~EPhM~2$GI9&#K^JDSr{#Uy@=V8@HXF-lwrgklj!{@HOTCBks+k zdfwmu-^_$vB14%e3PoiOnW9JoMX8hq)F!lPkR-BAnUhK((nuMiC=^12Oc|0fWe7=T z(Rp0<{kzXu=b!WE*=ybF{;s{L&*%Mqy{_STJ)h5Oc>S$Gj$w1^OkFZO`Wt-O8#ur= zE~(XpTem*$j)-bJ^h%g(SaQ+V4&ig&t(d9t(%{A3!0eA-_?L$bhdOMun5EtvJ{3cZ z4>dK<$gP9A>1t^?qXrSZKCm{_St-M9w2P(TjN z*pOEL{#hSgR?HuWjSYVFYCfBxok_L|T0foxk6*@Q11?}J$r;5-FIMds;N!el72<%b 
zr2v>$Vkf;`w_>shvp`KPtrjeHg48?h{rUPp$m0#PTIWLuGsP+!xL&FV5<8n%3C5jS zvW&&TbPEe320yIaT)d{Yt~`y)8SsI18`f}MvjzB_6L?(z_?$A!okuzl@?rUBZ} z4eWFjP+0V{Ns&okW!WLL7DFb@>W{z*x}GqwVD#wyeSaWQq<%eL4_U$=hP*u&H^|Cr zRU<1amGi%U^b?r;!Gnk^T~}=8hC92th2%IrMZ+W0K7^p)P%`mo(Ym#`y1_m}q_gVR z=MLm<{sW}=LJU?((|lDm1WdKxSUb}N?iT;$sle+q?|W;>$#Z?)C% zV{_&P3Z&|R$Q>obp>F48J6+w@gK_)46{`p*`ApK>cUyC2t}=`bfl&Y?UKllWD-v&j zV{-F6{x}Iy!OUloN?B)uJlsk-W=vxhXgXZ6JY)WM$MDHkwZV$8BfY;fmwb@1q!*RZ zr*{v-DS~epuG81o7cMZk90LHE|KSbp|DdV$$~34XHb?DE5J|G3QYg8ES!ej)i~}%N z5He?7T>vzzP8hRth&*n2-wv~ofoJv5{AjPXlrsdO&Z7hNz{K5xHuDw)B_q3M-}wW= z0F+s!p56}r{8k=5yk!PbaDJ7d#2*(IaS<@bLa(inj&bo+Dy57(9O}WnJBuob$uHU~zh5 z9!Y!`=5Gf3?nFQ=MH&nU1pxv3J7(?SAxT!?Y;49fj*dt*IowQ{MLyuxjJ7Hua2@7y zRc;NSVcHk)u?26*5MsQHso5{X4O;N<)Bw_z*iWE;6M8-$*Y@Q6HrQ40`tY045_b-4 zJ1(U;UxfA6mfR|GozFVoeiJ671MCyy8!Y=pVd+wM=F>1>EuqkD<)=2_$S?35aDfsA zt~-XQm;zl1k9Z7~{zbloId_m9;=&o`Jkl8KVfi?ue0K)k%n{`T;^#rdysu-qnKBF8 z_#SLMcco< zm-D+DEE~J3i#s%?^zGNL*N72k9DNR86oYk!Tu{UBFLnx}8V)woWU;Y@T;vnhR09Wm7grJWSPY);aZ;bC`ocW??ZVkty;%c z>pYHcM1aZ}6g@nQHKF@J1NcTU&K#c>1H+NXw`ILE1|_+MvByfb%p%KkbM)OiX4PoP z0L*&X{t836Zz4z;KYJ#8zSnZ*mQDnxyUD%(L9GsQi!-)2oX2vb+VVGQBSF7an*sqTqS_LwNgeJ7Z%vN(5uKU5}c>U!cH(TTZT=fzcw8gN=ANDAAcP@4GH!aqhG%OvbzdB~NW`OWmi;9q5^; z1q|Vh#}k8i>}wfKgO@a#yzwHW&`f?t)w0uvnSr4+tbx*m<^DXQQj;hg%iu9|6W$@U z%QFH;qu$=vusX*4Xag124)2{$^EhkIg!DocT;i%qwZnBXVKm|kR}b%+!_=N4kCmOy zOJ7VTMCqbYEX+MKRkNHum=(kxdMxa-XUb7!Nw0^bpG1bewM7+MjBj5-~Zpu5*&t~J3QdqK;wfq1ME}p)pA&BkzIJR zM!eBekR>$a*Rc0#&J>1|P;ubjuHx~n0GM=4PS58z6G>!!%wt}xQ5xNS#E8g|?=vp; z|B8?P9nXDKCY#AKURgCwMG*h$#%jtfh}Asi}S;7{kjpc6#I)sVfy*x2|~P^JK8O0&D_yPm+4*AvtygPcQ=g z)KVDHNb?j-hT#L8?5BF1i6d?J(&v|2(DOkPXa$?2@+VCj6QQx(*@M5jG#T6$W*54} z0&<>D#0UP%YA{J#+a~LAsl@t6L_0fn1aI23>H8y8@4ZKiC^cLs!bn%wLk|M(KY8*b zYs#D_hMaeI=Q4yDGqf2LQm`OGF~%GYHYK_h7*`~6G7Fu`a7=a62X$suHdp}>6*Ha# zH|Hmrx$@zFauJZHd6#wsqF7Bt#9S}^C{6KxYthNEmJTQxzy30mgFrae(;@STU%^w zPY!cZ7$j6Qi4$z=TFp)idK(Ze@yYAl*`jYdGk+?z zjzD$tN#+0C{n|Zcjlg_^NRRT-xN@Sg6V^NpDFmGGYjF_w+I8rFR|CjnYu;i>zWVbO 
zcu5ww)XfT_XzV^(4)4OA+iv>$zZ~z~BaFagp zoKVM*L)3p3Yh^T~1eN+LI(lV9#AHfnT4r#7^78UV7l@f!0|tmi1zcq;<_OS1pUyG4 zq&jM9^zj&%VX*EZDklL(S>0KcJ%?hrS@%JMwiE4y8^}7Yv0%wd0M@j$HfT*h%#M;P zwBexVy|BwBDS#p*he8>0A~O>14nWUgFDN9Jq6*#)YY}G7{;X=yg|`N>pYUa9qt{VY zjFTDj_tnuE5?=lAw3zA{sqva~Dyvt)dSC4AT@H4_kCF2cs!F|m$75Nx?u`czA9^Ej zM{@T7%RW|#xg+A(tUh#0P>=!)b-|-^7*lgx=kgy6;UV)?7gL;(y<05mOw9%MPq#%V zCN03QKwFBfud}D;6z&{(RBzSk6NsZiond29bAz{#DewUFf3PB| zs%naxn+92C9=``T{v)-e1O!paOF2b11?D0J&VpOLPptBs9!@fPwfxO)go~Q5!PYIx?17sr^sSug_2!>K~B`}7S$^~*m18w6cQpL)Kbav z$l4Wr5xWn=!4B9ZkHUax+ZM|}q);>#o;$p!n>7zfZ-b!66wJ5bl+MJyDWU7RoaUpy_og%!w+~__HN|kvX?xeR#yC_r7@JA!MyfN8G`~BP z-zg`X7#zszUs-F#9nHbVjC!Jy66Xs3eE5C{l|Ass>pqx-RQ0>75nfQF*=iH#B*~yEzM6!40pyZZ`hLSsyRS_5c=M*`M2syb6-) zKM0jhKlX3Q5hfi7mzFAV(4@OryR@elM9#lDy2YC<{6tvAxzGo%P`-(GBIVv(D91Ci zkLItfG6jkqGP$w!w&h(!bt~oVsfFr*fn5m#*p=)=@wOVuDFBqolquzj@iAwTRyp#p z8ZVnd*DrqK)LlJs*`bpIb6;kib4q*kL^Hu;W-X3Q+M~uR;3Rw`!b&ABj%wgc@Vu@- zrRHeK(~(VsE)Hf37OC9ng6G2pE)T`35~_YG@j>L?quSwivuDp4TsdKUU^Tsin2K<~ z#TrbAKjJpU?@;N|r3ljTYTh8Y%|MJK6`2toId=509w65T=YF?!9|=L#y5eckav>H*VaU^hZ)^G8;3f zU*zUC5yMjfu>(?d}qZD_&6+?n? 
z$@6ge271KzEk`vT9^L`*9lYc{1v`OU6%|#!uk#v95|LO1Diwf!KnfXhS(I5i)4Ak` zF^p%nMMs-qjmZy`#HubK#eK}{A1|mb0)D)XcTZN}iZVMg;d%llfPl7PMd;*~8zJ20 zay2$@-mE!tXp1-hl0x}skDi+%AtWB0g6bgZiZA_B`XKwyk3N0+bO@P0v#4z`F^hD} zO-*wjK0NClNxGN*<-@JakfzpH4O~L*4)1;3-em_nu@epqvM5BdQ#G9I|Bt}&H zC}c@TBV{Bvk%UY+*p6@H@h8B)PH1z}Bc0ZG4gFGEh@sH^Nx)PEVr5D-&MBjpxw3cv zN_u!qs+tglM)-jrtQjFRUgQ6Vl92% zlPCK)YrG+boYc3+DG$h}!qKC1`t$Rz(*8vdW2p7l(&&R#g6Ti9|4O&oymjl#1qI2+ z>Rp5Fc0@%LFv`6|4uU;Ip@)3e2()+cKmYUw#scgYo5h>+Ct$hHudnU+9%o%pwaTCd zw8k(;Z&4=tFFehoV;i+ANvm?n9=u3R<6;4$q;`j{N==4-jW{g6!<=a1rgS5$4c?Rb zh+ZEfMwt8ZfDpxC2AJ^0(WmOJ*@WijnusARr@8{JI-LhVDFlOfSiV69>*(FY6pNos zNh;$>?jg#C{rvHgynJHalI6<}(?EPKWyvefY*P^G41ThL{tqo<L^-ky}8UbLmX~^u^WKFi=yDxaWVe^I!gI4uJ*hZBp`zU|`&q(+*hmA2kwB(L^-LlJ&shkY9CHshG{_w~g#UNksqSP&cEqYD=U0tU;36$jxwQ%2|KY}|(z9^gFvP!+;XhS2rcU83N+^v^%XF%58)8djI?Ln+mgJB1o4YZi^ z(Wub~$K=qf)31s68PM|ez0_{(U+BT?@o`8n-8iS?0t-uC-8~Xz+fnij7>^9urF)VH zLGQ913pRiM5&HsvXer?lAs>xHw5H^jCMDH_ScCFPefiNLIUrSea65I@=4y34_ zAHO~d{`TXC0II@Lka`!17$9S~qkjuHI^wL&2)b~%Sz8sA4`06yjrUUn&|zQfQ@+Px zkQ1?}YS5_ZouBz}bQZwwl?3VDl<{0LnSfzXB#JyJZ2?HtMf*&-%6bSX$XSxaa_cSf zRLBdG8?f2R;XnFOslhJJ;U#e|YiVF1lo#?6Ew#hoUQT^ceYOdkuiA=dz@A}CdeZqZ zzg-K+#ckoaORnHA#c?hLK;&MxWif)i&dcqSx;nE6xANpLW_HgqU&dU#c=4d)!*`VA z81ncb@Oky%L?5Q!;%)>y$4l`gu6{@GdOyogpO7sT4;=t!y8~@AZ=6F+m2jB|AnFWI z19uQqA-w{zHT=L6uVwOr>!cUWZ60;HutYw5`O?%0Xf6N)F_`XFc6RwSdz+;`5pqU@%RwH zF%av*c`<%@oqZ~xyUqjJlnRe6ag!&@3RnrmuU=gR-xN%57?6k<5Rj}pe@i%r+X0Hj zZmtLtvc5gvBmg*Y%MSF}bZYeUFSr&j@kEjxbmVPb^(G`lf(aoXFk8APsYOKMfvm?~ z-thBjjL0N23N&y<7!GBX2!4^4nK#w8O&jO-_RB_e;dM9@OvsM6Q|F8!-xVER!zW=D z7Q^(H@#w`aK4oZH=4ES^H`UT828@%;N?4Nt1VCWKuc_vE3iE1cYs7bfXw+6&`OoT_ zof`;OyLB8)>2pv%Ar;B{`=h6U+5>_C=M(=HAACj1ri`eGv(S01XWQdk;0Evt z#PICpOA_j+!r*Dv-+7BX!uPqf`&(47Yvf-OG0r>~)IWKQS;UYV!};1OVDP+A(@izz>Pk3mzQT0Q`Xl z1Kd6fR?mp&>xMRJx#kJfQ4Az-u3nGoXULG56=^U3*F*KSErPIj|JNZstcDcfJ(tTea@*Su<6ae(Vfeo?z*`fQJPD=Jtk%vC;@%` zl_yWSF%6y;FHRKl{OIPXBynF6X0S=I+01eOubK7wih%!n;;n38vNresG4c-9++gwl 
z9EsoP-9|C_f9K=M%_DpNe@w)ihgkogX*v^hGnE5rivK%KU%lU?XFHq!|9*L_FW~4C zueZi!?>WPgk%W9sVMfPB*U^3FH9XN|UQkC2M+A-Phn<%8QY;Jx9>J(=f8!cg}wP@aaKS-QdgxJ^^(}Qshd+F=91yce# zkU^uAB?49s8^-Tf`?-dfXiv|oyhV`OyUP(4iXkP0^h&oscWTx2u=zfr z3z3vUAC1acxH-qk=-|!$7Jb*Ibf14}&mSg%e|A@R0eOqAFINHse6l`LRSE3$0a_RfE?pj*-46h zrZdfOVAK%d2}CRcZw=Y%1UH=`omFhJ{ZKrLL|WnQ-Me*|V^U^|01>pc@O=I6DU2mK zU!?BG-A9g`-8UuJj>cKa9I&l<^XS~Br!Pec2E@TT1^EJeDNL@VhP`#m3N;C+ARtdK zgkKR;ffbmD)BS;J^2eyiDZ(us9k(EAgyq9BB~WKFqY?ks)^@?7N#I~&$YEzE-pKYk zh8m5RjW4-hGT(q7S^nykudN@y7M2RX7C1LK^e5H*NxLK8qZ8D=cG&6`J?-B$YsRPh zC)?Ch>+()90B&4_&yErGFnnK_$*2mfEP-~jBgyOw3@W}@3nLmpI_X0+ zL<+R^2QB(4tA7ku7kW)y^{RT|J5urQZqZhA&y_6njO59eGJvOd;MCT#9Yb!S@j)3M zioGk1$%y4|uTn#I)N^eP@P(|X=94grS;8yBSqc=YkP|FHMPw|)KOjaomfMm`#4gA) zHG3T#E|)QCWA))_MhOEO|9I}tRfZw5O658}1P`dc8G*=uq*i7>n%3`MU*}NqNOAt{ z(`|Ex^c6hKUI0nVYTjpmb;NRMD@_UUNT1Llx_pH4&#CnR3X5qoiu8Vg+~NUt$ilA8 z*l;z~Kk|f2H0q@_bYw!SV~_(1T#}z;<1Pqpad98$tTN}RV||zr^G3#M`%9;lq-T6` zY&g~-4eLVzZe=ttriN0u^2^MD8Wb2Ay3z>;kV+*A3Oj}P+d7B`IS8O!Xi_#Qv&I7N zh~^lqmQBvWDmYjFdj*2BB5sRxjM(JGQVGe1S5m_7>iXZpdJ#}Y0^&~d(bov5laLqj zr1YS6@dvwssJa*K8B0azGDdH#z7x+LZ`KXXy82>1`?ED}e=(TR5?vZhroV;a&Y(6z zMv%M)K7PHzt%k({SE_-^uq=5WMa;)}$y7*`!`tJ6 zPG7n~u)n*ry^i6K2N*0Gc3O2esH@b-vj2o4O{N;a_{K+U^l>dR+WKyugMvbkg2{xj zGt_?WTlu9E;~-g1Agzd8BI-5qfR6Ef-lDyZ{Z-%;k;4gz;lP0d!f=+f3{r{BzKnc3 zVfDt%8l?LIr&}3}Q7DX{;h6`$MY8&z@`d;E>HVW906ZaFf~s+-Gn73ncJahUcrlrlxg9FROz z2GhbE*jp8*W@0EIwD zqOJhf97x7VQgxgjMtLqC%aF#puBux?mWZG&XHbi(Md2yQnxMk$yE_hDP~{@7{F1fB zY>JX?)_CAic5I`1+y??(oU$KYwH_n!c8Vc>0UcCT2L;yolC?M`%J0>KM@(YB9fej6 z_M3~r%t>3=HTguqMS?0ybxC_rRHJ8f{QeK#eR4!RGv64Zp2S)P_E-7P0{4D^st>wA z$@M5Jqb(wZ1j~c4zMrpd*7Yd|UfNz1pwFK^y}%z?IOIUP=5J)!assA`TvG8h?c2Y9 zZ<=3{;@;?J>J4TjsLhuizKvqjh zA++kZA6Nnjoe$SE%(Uds`Dl07cmj+*^gUc9q@t>K-oLG zl@x{ibrYUU-CMlD)K4zCm~k+U6~bH7)@rVSnT><<4&lVdpC#~apkeR+-Ic?s&XK^b zq2|-4-mC!J)_w^O5tx%oUO+w?Vovnl%61bwf(9=(a&0c>}X$$#hhQ#E%+^fCh`Sc=F&ERLs} zk8(Szj}H=~z!S5L0|M&Y+~;!TM)O9QMYhSb`m=u`_V#NOLOcEH*ebJu2$gBP>NVX* 
zLHnC40|In@R-VbsY+&^^bkmj8^XKc`?wvhzrlHl_*3LRw2}<0$(5+j|2Z#Lg;_CKS zpMR)b*-QTX&vVC_xtWAbW~Z`wY1qfy6N!#vcd%TzEn$k$X$pU;+oB#a_j>RK!u7LL6DlCPqIzD2bdstjXG&?w`$NRqFKq&^Nwr1lJ@sV zu+r9T?*WA^eTnYQYe$E@XYT=BGvpIN-)n0GnL_bYTRn5>Gq~d2#72?4C)*YTD;&3{ z!yPRiYQ$8Vce9f$`X+_Go07TXse}4h`{wV0^Ddw%`10!{`VDHu=xIKeJ>@cO>c^`J zz;Q%6DaspxBXCB&1}u7gywa$~>yJ68ul;oQDkZNz)q-RqyhsZe;6^}Oqc7nY2?+`M ziys6!aTTO$rOc0**?k_SKIC@6#Kg_gHrqeluh|!}csW)-2Il=qF@)cX0o9BMJRDJyj2p1mYQS3Uv66DSAeaOkhAjZZ6skmA+>L%2QQQ;?sfke=>k4KXy_D=c7- zJAIMZ7cdB8yBCTgtG@e!H8+7p@KH(1xBouMNOK&g^#P&7I&~Ste+<#(wJ{izzX^n4 z@@I7JV0}R-tH2_X4l(@~`;NT8cERrJ@D(7g!LU6MRzM3Qu*cx<*Anw*2HR@4k zFYx4#)JWdMFvsu~Kah9`T~3NckX3igngm0X%N_bCeYVQ6i=~ZeJobZ^$V!gV8oWk? z>^b_|(?$aD(i78*De*{#PP(|WpkiiL`;5QcChLUHZ2M7n$oS--1Dj7pi#qaYE+g1FXL@BzS~yTLj;w z#6Sc;4qS#?J)C_>-Bi*eb4~m;Tr8KkvecumH^WK*Z*dUVsRp*bX_sGzrpn!BFhgMjW z;8s%HQ$z1+p42R1$3OatEXUwY^1)SerL3%#gjhlgO#k62LI-4L1j04LzzxR*BFBAK)nO+}#G66K+^Otkj6r31{_ha{bC$63>&x2*k_r}#Mwok@ zMpLFeEP)FA_2b#|=Sv@!`l0AzB9xerfK;n1rP_(0Ui0k{$zT@Dq*j0ajs5*%cr$y5 zaV&HtZd~d=>nPk~!4`Y=WQLWvI@maVj(&n&7u~gfH{E{yc;;P%;K~F`niiU8zlT?~ z`SE;Y-Xt;6y;G`3VRDHF900F^ujdl6S{E`YfX@fj4yhEmVD};!L5HDmD?_(`f6)uP zc=?j5$^`#4cu(`+zBd>KszO8o4%>j6`GZ``|7ckND|8cs&uE41-fcBw##(?PZe^}r zP1D4oeFox)#-K$yjH9pi+H>b=pl2O-HAyd*en8MGMpg8i11T4oDysb)K3I*g%Y;b= zS58j*Eo`Ec`Ne`pkxw0tl{pRkK-~C1cK|7@+2jR%V3}NH+|2l2+|rmLA3S!}sv0L2 zUIn%lSv^8-U%a>mM#q7aAp<*g>I52)Y~0p{PbMWS1si3Xf(USg07$ys79M^@+Fq8? 
zyZFReqRE3b#0o;+@rv=38~~={Y>)Ke$Pmz^hvS(z)a(To64Pw%tj)V`)abcX#6gUB zAWb6xd$2dDYB_6+_Vh|{%<5q|xH7ffgPAuz0OE)T8Ju{U_^ju{g;WO?aHMk1oLRH3 zf@+IEg~D8VFTU{5vN<3?&giLOcF(L`B3m4|N)`0x2N(-O#jK%iIw7Y2z#MsjHVs$*S*Bm`%I_PZFu)J>=<>h0#!y#lIL(Y^E{M*BMDF|Qz`tY+&$o+OG!E*0s2C)I`@4jg z(=P@srno@^^`B5n++z<=>7l!DIr`CN#>MfTfIb9^NlD>7HQB9Lx{PJl%pS+Yo-I>;x*z z{LNn!SD30Qb|&rTdeX*1cF)Oszr)BWk6AIO4n5hTFFUh#30$u0Xg@$;&*F7!D#t`!8L@N)0)d`*~9ec%Y^bRFGDj zSYaCekvboKdjMh6?0JQL)nkGKFoQROyIUX?J{{%E zy_3D%$X4{;3)6P>2J`_60~xG3`53Y&Ojt5XcFI(60$liLNIFu35f36V`-5B%t0Rge zA>o0T;qhYwK^0PV+1s8g-+cD}XN1EE^>Us|>&llddJ&L?DD?~K9;LJdBNN~enyf$! z;bC~d4q^rQpQRL;W0|Ib-WbH+ZK!oC*7)F{_@)^uVsrWM)R6}f0mMe~PGoeaS>v}X z0n$aM)$BoHKMJ7?a0Mas5D>$|UcPuCTN)4+T`NdwB|`$J7c2-4I(eb3nwo#f3 z%Iwoe^P23R9k#A>%hnUe7n@lJzM;%mX6FSi#N-dimoc5M?RQ{G+q26D%vt$al@z}q z{W#tZEZxvKk@Ecg`zAP$epMjY6hsR_E~T8+OU0@Z%o)2uPXm-R@_rD4)!$wbzvnrXj zkKarV0?U_Ub|-P}tvzH=xJ2X08Gj`20Ceh?LfC30LqC$>`i&bs(w3mq_{QZFB0#s% zdw~}`DvS>vD-GU7CTY4RXYbZeraYr#b1y3jU7k_A?i3bPiL{lI66xT-b)8zd0yt1& zhmg-6Gp=bj=iN=FF8kr+YH0 zVCYlwb5RB=UX+iN4hFCOqE_iz|8-TpxIb{BA>=m1Mx732%kY~&;_Tn~@h(NW2*N=J zF@K1QA*w>2r~A;nBPl7^+L~oQiTBu?I_p@hUHA1rAmVMnt4pdW$j#RZ3mYGDh>xs0 z^kV6E>zPXj^zScp4wxI`4_FvP4RTa{bUO3hFay}GzIeSeRd!qR#S55Sq{(zE^1C7!FD1-1ka#xrpdv5RRj2WF$8X zsuv8g!Hz4^GxP4=-7WNo4jt|}+5FRb?0R-IK|yM30v&W)|0tQtWS(k-NqMtv z4Oeg$YQTr}-16JuD$P)m{oG6`T>k3xuDsm4#6Njyx4|8HH0xa-S*iI#VepTDyMwJ= zVh=P~19ij_LsgnnDL)eO)dn*^ql6dzBy4wbFOwo(3@FbW{q?J-YI<749qK_d^7oMT zZ93exXgj;4qu2bL?mSI~N$6unGDY;e{0rhK7F%rqmtdW_kEUJdHf2hIXJtd{zU_)k zw>67=GE(VQ=nlJa)r@qBSMMNcuj!h)=}Z{I32+qmIdYAQ9aj=y-_ zEVRe@-QnThXg1)a`hgZUSTOSeqaav*=e|8}Ni>Af53C5T!1lshdv{sSk8#QB@$&GQ zAMXdh3a>4hIi~{jUa$Tfoen5c0lX5#Qw*BYxO;*DF;ZttBro;s`!mTsqO_FD26A;6 z8x&{l#5gbmA(iZ9A|cEJcoNnPUt1`AjFDj3JtJ!`e0!L+!IvT0Fl5)Sk3lmz+bjDr zIKmd%W&C4B0uSH0;f@bpE-vHgXw=TWZkV%77M1hi8TL`dphdqXXG&Oe7if9elQ{lA zqFkFEvldUeL5~6{!DhgSF96mP;L{7%yx7Mlp4qUFi{q>|@CL;TgD57eC5}buW|%Fo z2q~cfGdDM{t)~90bZVl2s6}=Vh=&V^foN;#WvF*Tf9-7?T$OwM`q5IIrD0EE$bj5H 
z;VsB4hR`Id(g38_0}%5w{IzJ?4dXtgH%^Uo*i zqC$*RE?w68TY&%0z8T4^Qvt7Le4{@-=$1Rw@iETB#$FFX(F;HZ3jiJq(AhUG-FW?vNNK4!%aa=VjF+)ABii`}H1q*x zZl9T=y+!}QgV--t8*@crM&3exhxuA&t2iqY+Qa`e2y;G{F;NvDn1irAZ_%m!8(^|r z=dyGh^7ky4p%r=_y^=}v%|nfEzwKGya#0M>6Z0Hunqm@t@9-=haFoNMyu;I$96L1i z?x#^BeEw2V={?cFpxsjMoJU<|Xg6=cxH%zZ?F-lY=Nzomva@^iZ?&as$h2suD}R}+ zes^Q+k4KNYmY)7MQg7CwLl;gi82C?LUq6$3e!fP2!-tJrb7a8YQ7zBcHS{{CmHzDX zlS3<>`0SgpYTB08P1kHcZ0y}SNkciv)fUa+t@}mF&EKfv$Kr9;s^M7g@mIBj50Be9 zV^$N^2kqLGUl-7KK-jL#w1Fm>BUjp$`V|)c`_1?D{hJ4$K5vM3-r3z(wob|4m#X`z zTItal6Gbhfu?<+>>`?u-)uKhLX4h=XqP4TjO*Y0ycbR3IGhWHW^zc+otHZf_f7`&o7qF}2x#gIT$r)LFVW;`Hg56f5<^=T6y4PidefH=~d#hRd z0_~EjqJ4Dwmj0hlwfgB@KOT6z(OJkh82?B#s7|ALvdRxGI4 zH^s8A$B@+QZ>3+}X>_|YYQ@u?=11*kI&|M@vh3#%(=k4eG$Q&xeBf|4`hUOje!pJr z?lv_#xIIwkafsncyN@kHH?3*bHK@a!U3G@K?w{sgEA)9Xdi{p&TPj{&ER4Wjd&iS` zE1g{$wbeAZUc0oNPW=D=D20r_)=bIy9DCd3@JS7g4qFza5A^Dq^&z<3tE;X%cHP6s z*7>5-2PdsJ@56dNe|UfL@4cJaoBWPrFXOUND@^aT_`e^3_hCn$SqxO*K8tyIrRzn_ zog2!^?4@|X3L0*bq?y4GP#{#w2n0sXf?xG-qkZhHE9x&i`RX|2s>P6vEt}Zy4{rN4 zrL$W^9Q!fC;)bLc6rH?NXr-V2eY*Fq-IX?-hc8jG@9);)_}6nkfB*J9SJt!LMB6uh z=N+?ld0!t@?ETlv(JxPo8DMwk`?^|f)xE=em8MOYIQI788_D5)15dhbYF^!CM7tr2 z+Bx11RcCeZ$>o`CfB%l%b0aS__W1j&+hf&^PI%OPz<^LbJ5DzRf6vqo6Kw?&1mrC9 z@1{i5jND)1A!9uljBp~$;i8P&xN3JsGcdAD--+L5$l% ze}48KHFQxKTD_Xp+pT+dHG}XTt~&<}*y*VSlVLZ(7=tP!ypqFa>fn|6Z$a>B>G#P8 zeg9tPnzl;1ar%*?AYJ_{i>nsPAoL63 zDL_-x_)<&f8~ev*eaLOO?2K;JpC@*u+os@&^NLB2xogxqwMfm@)V_CWeuI61is#P7 zCcS$UQ0)dC(db3ew1}RiMU(He<|C8bL@8zlv2#>)br=k-bu}~9nGn*crg0o-wsaocv6X%1(h%x>z_-RhFk#rvogjUEvOL#!ls&iHZ9zfk@%FSI@G+5&c(fp!WHjn1!@zoF))3OFwG7bXP3ln(S4 zb!5e$Z`YMzFGH}uyTEqh^P3l)1}^^fW0?T&j1e{$$K&Bd?$l&PI(_=OVq z!HVn>WQJTu1%n0;#?f8~1M5tmtyD>$klr_iVUj5Mu{7FsY%&a-BbD<8$&gFoJ%jm@}me)BBVo;zD?eN&nqTd|3|ffQ9N09X|(NwFAeCHV?db?7}&_C#WuF~Eu|^= z4bsBeVymFAq~b z*c+1;Z&a48WpE5|aSdoZbNM?p_`?~xP!pj!-lXW&oVrIa%h$K!`rBu0V{RfZj;V@J z6sU>%!+4zg;G9>Fay>qS8{0j&^<9M z5>~Yw6u5|D1R=94TEaiKLM$5KQ13|TY1wbarrbzYqX6|Wuyi{$&(&47%fZ#~IO#f? 
zkp$)cgI9xin64afu87dX8Qe_T(m8RbVzN!kel|hzeGEKpme2Y3uh=)$^|3~wyV{#4 z_1|&aZn)`ym#Ac(vSB&lW9^)?%R`xx>OIqs133}z0^%dX9HHm);*>Gc#`liF0YG$N zQIXiiJV5l!_fOjGXL_eIGztkZ)OvDH#59Gnp2eyl`Y_*zulj8W&@3uu09Bqi+)92kPq|WVJ=BE+cj^^6R8*{GIH06A>Mcs}n;wdDgsc!!P$;(pN4}+1kA% zHtjDpgY~i(8Jt$fKEtW(J)5oRif83j$^PJoh)$^Dz((c1EM400!ks(q$Q2C!9Ugvd zp*(6TL3`v#-?}ANueT4`{^Qg4_;*va^g8W{DEYFvo5S^(Q3c@WY1#Lh9`kuh3>(q)6SlLJGn~WpwH5n{g=Zt|h%pXgxiWpZmmWOmgc}ZLUGAO8JVBcc0`rQBLb-My!OqoBt8!6a-_n2#J&iOc{4eYSz->5J zuOFVSb|`xHB<5byCFZ4slqhb9a+fX+?)zZP;$oFsTwI(n;OnZeN;1|492nB?IqJXn z1ZkxY;TcKI8kKmn|BNxIy9aO z&D`6&f`a%U|J+A zE<(mR_n!u9Ght?GSJ;E9F+=tBcS6^YeSEy&qzdB&20i=Va|-(Ad$pKRvD1z2+g-Gm z_y8io^XAw$r6*^$%b@UI{2dux?}>~AgNJUauN?rQjOI=JlK#VqFBBGTrZPd4052U} z+{wdZBZtm{xd7dZI(k`%I zU`z2g+#CMIr;F10-fc^I>b82@jal zatJRW42UQVyJ99ezbs>r0{>f-Rz7>T@>LjP#gWjNv+}b0{o31}rV%(%$Kj0Ztnqy7 zXg6!t4H{*m7e9Dw@=3S+4=IEJ+2*F5f)Fcp8dapagu+uUo1t&oj4b2as}Q=NLnPlE z>ZC0#U!Qa*%kWAj^qE6({y>Y#?XTrN}J9l!9)IAIfPccHu ztA}CA+ON^#+C$r~WJEd~ZhN_`EGu`OJwg_~l$yc(AQt`%C^Oq|W->Z`%=r)9#xDN% zR>inr(wQV^kYCX293;t%h)y`=G$ce`rWAUw)kh16pDZZ2b5o_Rg|I~Ib>6;xvTNPN zM}NV@0`;(4n6JHloaw{Uo1RblTy?RsDEIb=#ZG?JNtce`vLp}_?zOkrBp`(-f*`a$ zoEl6kVMj8{lC6|T`a|)?6GA$wf1xmdH*vRtG0-vIN^>3$npp#bjHyN6zAJovWo0B5 zGn>avG&?~ymEk|k88~VFncB%=2dHxeDM?@2MQH@T1v@zfic#ipf|kKlkjbOL!qPjw zWuW@}+`)^8t)0+AQq#&)X#KggsahmL3pxF4p&73)Z zYUb?`T|0K1rPIxR+;hZjGTTkpTaL`A-^`;gIIZF*hGWFh+-lE>#TipI4+3=WIIeBk zl%bJm;IHo+GpAqqqzMz!K#@fIh8Mx+ysIm6xF9&GO|`Po$bVX1evxtq^S!gFwKMvL zEJOw%<1;E6>f_p%#aD1KCAKrJo+G3?DlP3#R&`h`6yx#t9j2&2JoOM)H^;P4KHyI2 ziZ2b zK4L=wo@)42;n7>hdcp|B%u~t@4yOKl3@yeX?YLJ(dWMHIL)$u{di8`7_|gT!yhbTm zu-0p~ot?mDlo*?-Umy!{FPmX3dG_pC%k6vh=WTiM?xzxs0_1XHr))Ty6HpU!Gdaa3 znbd|vpwm`Jo4I9Womrb)8gpjtC5Hp|s)kcv%9t9;lt7}K9JR0fCT3ju*R+wGrjJ_I zV#8TTzA|1rhDV+-Ey30bPA)qWmJ(}|j8DOlcZpLBV`uc;5RGLFI?0>J4qXU z2%i&}dXUGRgr%3m7EQu;gE6Z+R%X(dRP5Rf@yPpkogbC@5+5IyFJQpBsQ+WjogG{4 zx7qYKsbzjrY6K!Hg;qsA&*xF%|`~KBnftZAVjgn6T{a&EzD) 
zyI|bJdQm+i?^cbo@+fLux;D!)cbWxqxTyROov=ohF?b7hql)5*D6groV$2okqQ%Lw?c2W2a@Mt$erRxR7F->tWAWmB`L(PanDb257k7|@!FNp8HMTso7bhsxS^^t*}e%jQH?C^dp3KdBU0Ps1qT=_3FexULt=%IYN z&NW;BjS1Be!}y!2bIzB>{Ae_mo}Bos$!mZr+C98LpvuzHNO~rtcE>&LrgVCAW?4CR znmDXkalIU7EMyEZ5YW`roEkNxHO$taX8BQb)Cr6+&T< z(jowQNo+J;iJb4rWvZkJayGlmjKmGw{RxCnS@>(qIE_!ax#OT<`mCe&nUI{@daQV( zqT+)?HG5`zpx=RH%(}sxjABOPu$FJ&E)Ul|hl40ODI`ha)PPaOoiCp>x>{<_m_=-r z2Bs5ZESOx%2lP0c;t)#Gn(^w_ag3c|26xoki{%ZcjrgML^EY!uaSg!$?FF3a3P$Z5 zT)2Rq?RqDx#96g7_jsHYP}NDfBQD#{oSsYANSxB0 zG%mI?YA1$ZYS{@arw=m?A@Fj*+_o&EJtLlDPM|;cQgc$h0pD$5#09B9~K0q!Ss zsURlZfY5XtvnJ!WDb@;%x;WI~i3T>-2nLna)u#|Ws`osM$14eI6>6pqOmK*PF`t%vHjJ+&yLPfXCiqckdrx*~vK*3)rRclfsAr6B?dZtMr%Ip|X`yl!F# zVnfMkOw~P)?7#VlZHR5)3j53qo9(%S=p{yIsi|I_LcZFF7#JpqqyOu;(wOC8$Dzjv z)tNm&P=e%1a__~C65+)Iy#X3V?TmymkH|5(eXqM~M0!Na2Vtuuv8*R>7D^K)4OEVg zEU%(HmW|EggF_D4Fv6JO5*wTPcDOsI1A$r-CLJz4VN>eA=~BMPR37K}dtXj!yy*yJ z_LQj*H!Aj2yww)kC7`+mB}pY+Mz*uh@@*4lI+4KRxuR5M*=DXBCIy=l$9)|a($xmP z8`;3g#6fh|Xd0ODtf3tn#N`{QbK~pAsHoZL?RD8{xu%(Oe!Fi~x|YAVifQRhqW`*k zq><9rf6Z-6JkqhR*nu0D-H{=ue(d23Q_~OW%0jIxOdMquBZbN#a+OPwm8~$wr!Ol)sQLBfgh`6V0Z=|v;SW)#Zq2pGHt-esXIlx7w zF`i+TvzotqxT)?=kixg0UaTb^U2orRw%(xC%?8J%fv{l=0%G-Fw}lpxp?PlxD_|Nh zcmQOulZ9qB83}g5-h(I8?n>EjVse0qWZvRLUe!%T^QGseZfLgmmg7QkQLg+Bkb&Vv z4k1?jCZX#HScA?G3NJ);Cqk^jf?JKj;OH-q z@2M)*^Nw`N{lc~-C=y=aT%H(orN+j<>+xPeBgnoA{7I;snN{uW{?F%8QEH~Dok9;Q zR(G_W;*Uc`T=2FUnkWTU@;lE2gC0Ew&R5>`#-p-woXNub7HeB7b#r{EpEZ<%K$JXS z6se~cFKn^lreMxJuBrVwPN9GrEQrJ%w+6UZWu`va&K^SAw zN1i8=-%M0JeKfX_8v*Bld4k{%#G^MGUP&g2h@||dJ0G>pEg_8!< z9^BQ$m;b+Jd=F4p>A80QeV=3~#o#Pd3Tg$XrSrj%2V}Zy#yNkx0f*wpI}CMIZQow1 zX*7ziC(mReqN%DXK;oB2`h)!Ljvz^HHumEfKvCeGo3YI})`l*jsbG2VlA(Ctpb67@ zBF0bSo*xkLGbqN+D^2fcm2EUC>uSo=%q=6Eqj|A(y(DeF`H!E!Hfmd9$Hfo zyHTu+xse&kg~fFEc9L~JLPiK{G8USf#BPLsiha65pFZ(=Jv?cW=uv=r5tIB66gPG1 zR9-OTX)!Ea5pmiYBiy=5wTS`mq6Vx z*QJ~UeSkkqF*^<(js0$PV>6!<_cJ2BA{;!8ikzV!;MhN2ZL=Dc|Xboi}KzSp2 zbeb^4j+B)q_yu6{!J~o9GyUNzwGESY5xVKMFsjKV4~+W{IF$n>_X#%`qHHe=yf4r8 
zx8e>^1wKQU&kFHN32E=qv%=mG8+p23{lUg~t2@1(d7@Cdz=@=d7Ib z_T0*+Z(A*Mi1`g^UC1w9v&X7VsOq%fS%L%hs;nt&31iEx-CbVz;c3wuxjOtvH;fRZ z4;RfL7;h~mEBkN1MvqoOr6`UyIKZ7arTa35YNXXrug4<9-C^}{8??WxLODi*oZmO4 zgO0rujReyXF&GdgEtr#d2Z6katBk?l_qSmNqYr5bSpEW?gTuq{UNs8RXY$ZUz{WJ5 zcokcJ@5nbBCYhitas2IeG%>BD&O5vo^J5At z(FDyAKAbSL^7EfV|G2>Z7H|e?Vk8Uw&~%zCoU`LnFH-_4lm?M<;&MOGt~EOoMLU!c z_)DamFCiHX2rocvQKmA1b5jXEZz7Z2i;=L{$gnfI8+ses^deMnX$bFQZ}QiKKOW=Z*1La~JgO2Wqq&WWx2 z9B>3tM$jm{gja(zhM-HeTuyOc^YtCZsc-=Xpg5H?*)KI~9Ng54Z~U8wuayE4cVtY9 z!7vG_li@zTEZ+C>TAzJ~D0djga$LU~oqGZlq+?U)`7m?|Wuyfku$NZKG9;^=b>^Ha;}QSu&@xLA?ZVC@NQyE?vhY_Ufd%&0n;xVN|$+jt~s{Wi@9$!|{NOeY({_>%_G_pKGpoes!<< zX9S2S?_~v#KwDhnNI75(@5@F zGZ>6DaDIx4K%9zIw+L>)6|M1BDcQt;1vUxw9y7iRDEgr6?x)zGKhwt~0i=rxDQU1$ z5(8EZjV7VS(<@#s0-VA!lI2n7_vWf~>$Vk0=%yB^IZ|F{{zK3<7m;or%zF0X#RTvP z2!lx<0%bgGZ(nD$yv_E6uE?jTD9(F5NFNunj5y5nODx-@*b+&ZwzgOb@ePePceHel zsaORs1QoGfF3s77o6~1U_Om#1{-nXTXh$2O2~gH8O3vBWU%s_XJGs{8NYZbmC8w#e zB`MLriptQ7q562|oMf4(JTEV25Rh7P?qr(vAKHq|WErXcAX;M5 zp+RI1T0e~ki#K0fVW!+5iVwNLz*>3l2sB!46Rq3M=$*XFCh*tS5rIRGZ)AxMe+EM% z6GhL#lHGtKfS_cSv~RQ-zm_VALPi9n;F{L)ij8*rzx6Ctc2Mi$H`4E?qpK?58Nn10 z?HVFMM~Y^dDAK2t-SgAFGbB4J>)Y%2eX{F|-`@lO4-5(feq=>TqV~^QQG52x==p%~Wm{7);1RGBDt-FOPD+LpbhIq=+o9F_pnxORMryKV zP89($zGC@$`S8*n_2h5T*5AKh`@>KoLF94Wg zLj;Y8ci3mppf*sj8KF>%0H8DO>1XJbsQSvLc2LJl!IraoeB~!mCT~Ya2P~&iJ+`&w zO?}Hpm=#)CSOhVr(A>%NhFHR?rcRtxT0xm3v{U3C)D#)Xd$&dUK8OK#ph6Sw|Kl%@ zX-U zC8S`$dI=R-K2u%k3rTJ?cldX0#Xk&+G)Lgk*NoHtyFJT0=?*yQ-Ou*&rAuq6Y<5OQ z+8NtOQ0Zqo(Lb->Rz5jw31-d+ttY@yVfc+m{UZiJbD1Br7KFg|TtL+w2Kzt#GkJ2* zUtRXcj)} ze_@s4p^}c$t)QC^pa!LoU9Sv*4eSSNsGtO6V*Dmt5r`6!<2fouhTRAwEU!#F72kUO zhqK%^b}Wz_Ol()Kl~~3>#b%=8>1abg6|y+;Q+ff}~vfG-M`@K4RsBU4lrfQ99sVQj3=WcHpwolz=OIIGzCCdEAmE zH7l*V5727Ua@sB;7H7V1w5UGNpNsXL*pSd9iZ+DF1=sW#wTIZW z@ta`Xz(h$s6<6AFlOz`r@Ce+AgCML)BY` z3^r(YuI~Th>%GId?%%)hcH$}{BuP)?w3Eet@%JojFLF z?O^3l6Wj7_R8VUm2s;E;00z@%@PNWqB_t~P2}%^S<5S?XkhAa<>Fx+#c$Z#@bl{x~ z404e8UN6w@AMo3pe2?qCWiMVm5=H^0FI|_XS+OGmU*six7uI470H(mxgsA>S({Qyd 
zhZ}mle#CF)ST_42XcQSbB%=_YjHF-Tn|gp(Pz1Z3D7Q01hyw}&LWtBEL$HB9_%E9 z9zYDK(g2?2M~J|gZQCZk>s>ao7k(j4ov4CcR-TE8wk~P<-1O&_YO)7-Ntv0dr`p`q zK6!_{9|m6!Syl8lU45*z!UynKmBjTt6C++%Q&V4B)}!Zk#y zj8uE@OmLf!*fB8m=x}g*d_`&<5T(<=9SKSWr!+$~5AMIgo9~31b2$#4l-L7;10S|y zN{r|B&WNW^JA^xMSh_T~jp;ed!x%`;5C|tMrn-b@YUU5{bP|LA1zg)$593B<%*|ao zENb897WqmI7%s+8vL_GdT)52LeQL1N9k7|a1oEB;V{wAP3j$s=iMjXfeHt9zU@T2e z#IR#=&74)G}B%YWnlg=1mY0BO+hXk07d9-dHW>g3y&J8vKC(JCIM) z^6uLKwzdnH3H)%>8WEX;NdyWkvTcN*e8j_#dq^)Y#r3HL7;12Ig84sD>2MQvX=zz> z7KxF3F~rehoZI6{r_>RlCYWMK)@_ljEvShgR!h?_c-*S3m(zAOKECzqd$(qP=iV*X ze)6n(%<$YC-$Ch3QRnveEYlH+qtt5`W^FNQUR%I@x`a(~)j+|iS_b9QkN#;c=O-HvVq+N|#0 zUiHK)3_kD9*c~pf^eiY4`WqrIl++v)eu(nX(9JpN^WJK*KyZk69w{%J5Nx7~01zcn zXgL3*y39?ldma&?c{PFP_Wzc=N^?CZsM1kB4DZ4Vz`G_>L$GmTv&>$$vmdJ#VNreBNOfBYHR=QBjzfSEQ|HvsOw0% z{{8#6lXLE`p~Nov1Yr(=^Au%sm#HA}T>GAS$d|qD`PO2WkG+9qFT|qNwtZ@BX1MjX)>hCda_e#a$TP@_Pw2m0 zS5UyNn|q|`=Ovo15hHin_UoPW4OUrh2gVBSrlkpEfVfq%KYYx5D~xup&n}Q$Uwv6NC(Iu6aPY7SKjq z34WZ^7$UzB{&*}y_xP-|pKWp-_AX+n%m6w}k~8EDFk#h8ZM3loHeqI>h!poxII(kQ%v1OImHRKRY^}irwH$RC&qy zUq}ry7_iAB4xK2qtfATESncSP4PY*yp%6nX1*#7q^+o{C7_Xn(pZf~Qk2qj45+jZw z8CV8Mmfu(H`;z*=R_c4tpU($7tLiE%`>>FK;ubLp8m^wN)G zc_#PY=%)m5%oWOhCEvPTk2n3t5B8U?tG6d6X%#K(vV>Y&+c1~0`^JPEHYvB;zUN-y zlC*P20CRFxP2mk53%L;0e#}L!Ska0Xge093BFpHGq|5Y&iITQX>c?_|om`KX$Lz<|@H^*q?mtE{pX|C@FD@IAUeybNyX$SEwi?rydj)5yO)tI53zH z%%c8-+#_lEm~myHJsug+Gfh59N4MY7Yk4MIyHIK+=i1mSL+TT^n6--+`5rxz&h{wr zCHimk`saT-t!whEZ3k(J7E`oq7H1hYR=aau({P1HcE7!W6nSdIguC9*gqG81~1?CQJme73=qg?$K z|9Rt(+vh48W0pCH>MnuvC)+|Xib@8}p%W{Aa~s>q*O7#oL1V1* z0d+)tETC-qP!cfi{L5pZ9te*g#QcMEO+wgEuKUn=shyzo;VHXBqc}W0#QvbMd=uTy znIqSdw#sr$<{Xi%2_G=Ad>4t$4|g%tFi9?*eo$JuuPoEJ)Q>VYx&f&edlRkoXbqpf z;EO$5S}NPsxO|(o0(zpaiGeghjo)OIBJOYhG!TEemQK`13(tn~)40zeS=K~l&iA}6 z8GbM04;sIioME8s-|MS;k=gT972VW;-|f-FmyG=3SA=PG*XFuP-&h*27cBWFA-3zS zo$d15hZ;^(=Zuf2rOI^zQ*m)qx zro8yH&aJxXPU}t0C{xYMUeP?Otu1Hw`0ekcZ9rn3}2ot-Z49M4rY<(OJ`?eMXT2Yw0+ITNvTfH84z3ZV&o 
zXRRc*4$&*X^*|<9$kGt-D8mw-1LJHBHnfb^L8;*#mmA+)>P&Wo;X3<;INoeCFZu~rmCOjd;{T{ z?H7v$$kx3lPdwApts@kQ(MC;yO+wRu1=KYlv(Ih@Cf+|HfihzbBoUVYImzls+>Qu9 z*@5SQmJQd_=>(a}NH~N{FI3UAHeq3I0RIntAuurrhEdw)5}2Mp>okyAi6rv?1hNcT z|AYi_;crLkE9BmA`IuN*THYo$WV@FU5jqiWq94KKl1>giWv<(ER*IYAlO{qBcd4EA z$<$z#;$vK_2$1tAUcPPo>4+%9+T0}V5QXdP!_3^ueQTAmy88dP&iTF+d`}~o*WTY& z(=Wp$BI4<`bgBGdN=;44KE)P0q@H2U#nG!6(9UStV+2Wvnd$7=4wyi&@md~fAAq*d z^aKDA1IhHUre@@I+1%EDm_9&H^8ia805!cHt#OG?ve4;26&-z&UgvWiL%LB~y88U` z)TgR`R&8yGy1)GfS(!q;AMXCMy4)C2FKzSCvuwpXrts|t*Kr@w*&0E)a{aZ{%X?SH zTP{ss%{t%WJu`VKdV053YBT0SqQ#LY%jHj%puk!im5j(puHTv|H*T*LKV+I8Jw|)> zYetu}uTkSGI6y=ziR-A(cJ+n#~`-B2u@;(Kuf@nga9d=iZXn$WSK1}9$W*&D*&q=x-@A} zJy&OV(h|^I_xAUDgPjF)$_6?BSQGdIB}h_u5;8gAlSC{sYLpkAAV#QO%mPPdc>;WW zFJr)hMYR#617HOuC@aRrRGR8lLikIJ@~-_TZB{ZC7;GSJK0}U zv|Qxt8814ZSkz@<;nL4=-1cOjF*iN?x$Zah<>N}>od$b0QBlPt?bv6yx>;=xS`?Ba zyJMdeaGNkLjzr7ov;T9YzTmS*1&){9qL1F^{X2Q(__R(s)V|3By z5hFkMmL1FH#h>eUsqVLKqp(5Ny)ZO6QlPHL1ej*P`;TM5p3`<&>JjV;bJ%OizPGd{ zHSO52)=w|idwNE3Sj+W@;7$`0wxonXhjSWJKHxi<)MjE0BdP*OqVSAOfp1`ZhwjiI zv8s+CSQEIK%Bp$(^;qj~yNn*{53f~kt9wqBFwtVv-IV;d*m+G!|3%e_(M|)V{51pL z{o=Yz!2(8%f4ve1q?hO^gXM4LA3srctLtO%0rvRSF0ulbkjg0FNP{P zg4*{mfRZD%he@e*S?UiVt58V^%wj*k)U9rB&*r$)<(+SE6x6) zo$Q{NWRv4y4S%-BXBx^W8&nL*r^t?eJ~mWo)KA7Y%UWZk)z zQTs9lS2tsh^BandZSNSO+_W@WQfr-+&)FEcyypZNLpo#tu7{0FH$ly}P_nip4WGpq zoJ`^g1x`dFK!BF~1p*4Vi0lIyxxFW+4dpDG#528lL&SAp&q0H_j#-Q#yB#0m;EZml!U@!wohfJ>9x`njL)58(9GB)O80ZU`{G%z*~KNl@nkJtf3cmM z52t=o;oZea!Q2aeczQSisQ=@rC>4+)XjcfxfMj{3sN6$FG5^cE*g!CWXYdX3CU->4 zk7IZL2;@Ehi^OoI%90ge^|@f$EXW5A=Aj+<$0n1aC`2&H(wSN#%Qe0|N&D zN|8@z+wN* znS|%ZHVLxdo?zryJw+LcLcW+uQY+|YWq2xx%f6u@7&@<1=Fq%R!#jUd4xU-bF&F_ zKMeAU4&XcRsYyyK6r)$t9%yKWsA>Zy1kclhibQrPVq#fOTVfzLNq#k~1ZVttu4vrCChy#F;&`bf&2V_aML{h^C-)+^@*{u4a9w!+-Cx~1Cg5n?eFnu7uL8QR#`R)f5z3sVH4-^x6a1lQ)^}R55 z!`@y2XdzN@A#XxsNj6v!RT~s*L8XaSyYbW^s{ok~Z6Y-QD%5b`kT{8OZ=l-6b+27E z1Vx8$qR?C?LBMwOnb2U}^Fy&`Z;CUCQc5jsdhtTz!^Pt#_5ezy1!mhH%TZe^>u`fN 
z^~6B+*(Ry&%ts){!hj=-mkvvG3B5P?lnf1^fkg63Es-36Cxj|96nsIOic0R&&k0-F z_e(W`u>p?*16wm%{BFi{m(}c2D$-LvC%5Til;p(n)aB>rw;r?c<>@%#NYU%~QtC34 z2Qq)hkF~0!iN`R7HxUpO*{KHZN43P=O-)_>b6s6O*yInNKa(|qP&JP!)&qwpTX=qv zq6;Nn`rqk?_HNo_XCx2+XSlxo+mU`n(hUgi4Vn}chwRbAdsv3wX?@lpYN1K zF|{&LHmt6twEL*?vIYaVZ2ZxL+>^|wP_PYkXjHT_s^f@3tE>!Rrlb*kfmABp$)fW1fn%@TbyP?VTbJzUWHLh z5q-%d2z`;{p%G_t6mQ8zLuYTI@rMv9=;ceYmm3{L1>o$arY5~=9-zRf6Gh;m(U5lC zcsRF`Z62r~H=l8Apkljrv2Ne@@G*5RFY|##2Zoa6><8vAoBw{?s-9JRZlip}pMA!L zJ!7h=&2TsG%+)oB>vhC>Q4|eg&nHMGkf=BsF9 z4z~aBX<{?#O7YV|=L_W?F`J=l_XACcftp0w%J*mEq|S-n2k!(?Irp1%jNTx4P4rS- z6Z}hcc{FekzP&vM$K&F(1^mle6|rW<#vonSi${*V-U>m}Z@l9Gy)ff|cH9AC`yiHb zlrCRL*V5~DR#H@i@FxbQZc{I3Dk`Mv{XNEkF9M+s!q~3^0{bM|NPHEKpFY*@Z9a@d zOnjyB==2GJ2@wi(?C=5Suhd3HO`;uyMDP&=2?_Myk#CBZq5{WapYP@U`$HN^WrA%I z8Yp-Z(3n)m$3L8iT)yozCf3~yQ26%F(Zw0L{?FZIpVlHHt(H=FkaEq2xpVMi1~;%b zR57%P&>0f)qpb~DE{=LNg@zd=y@l-9Vgqp(Bz%_BQd)BFfC9??Zah)x9l4H&J*&Y^|N-N!#K%FL3P#yY81G*Oyf&G7C>! zG4-MOm1o_(wyH9?YfjHI#m>aIYR7i(nGw0DKVI9|i&hQy`QUNK!e0XCFjyQMd?Hds zZwAhR?8il$F%}j~*GzCx;Kp+X`Z=}AroxDph^>v?gXk3;26xS%8CGWIYLdVTZ9(1h zV`l)o!}U(Kx}nn9EESw^uEJR-Y3mJbI*PZ86hdC}!A_4#(xlz5<&$y-5>Sx-P^?%^ z%S9t24^+V*Ry`1%0%#ma423|=IgWKWx7Yt1+I`E;E=I*dzsdlXLK3|_gkL%~$NWaE zl8fhFMhRo^&SN%|Y_(lR^jmy%#o1O|85uClb^5e!1u?<(r!XG>e|Gg&|K|=f) z&y6lUpp4n+2iUWKN6yGbMnG&0WPcWr;i29UFF16ql&OYVIghYPo!ssH&IY%XuEZw^ zy>rdi7E~IIa-F4Mb6Z``-fW6h7+ay|cgF4m1d9g6r)lj3q8f*O>4mh%`-+ODkr}d_ z5u)n<6s02QGVqFLArc0f1IPAyt@ww}o;@S?NXMAm#OsYAc~&knnlZ;$^}?Uj7=Ku& zovX;?-07m^HPV^!{?HSjiLUXIU4LulC-Yz5F`$K9kA!y|#)l7uKmhR* zh>O6Y(qu)y{XhAE%8-eHLEL)mzHojUswgOeY)BLURlxyi4oVfBNs@B+eH3Vt5Q2-3 zr@yBd(sWRi6&n;`n*r7Vz!pnfh+a&7^AGI%L6;yaTP}I=NNy%gXfDrbaQR^C2VmlS zpRtE`^6FzKgHeJaX19Xvqy-^3Jv0D3h+emw^um)fVvP@sx8$20K_q_ zL0w2dGBjY|ongI_J|h2#_o8gfVGj@59zA`&qC{wFPT%iX6!}?5HExM3w^om#>Wa)M z_nBtbrQ0r8fegPrt{n)g$Vy|h+G`(}xAVmM>vX~*3PC*`1H&g+20ugojy`uQcDIlf z5AYJ=!iTY93yuQ>=p|}neYp5Z(i!B!u<&5W$GiiDeH`3;q?yH6jOPFT`9z3LU*c2Z z42%u+ppL%@?=OMU5y?jfoiB2X`KVEShiwcy2@oiQ%ogh-MdWm$eZ_I}#BWD=gJYGF 
z|JhtM#uqe(uFdF=9jvSmlwsE%YC{2Vl3m~lY=g+I%H1wiaZra%m6 z$WM9nFS1CI%JC-7Ri!0EUI!5maaO>J!O5LUy&r|@5I_suYQovRs0lcgq~rmdL*@^# zQkcQ*5V7tWrkV1*Ltk~Jr9wXb6raupXJ^}4zT3-L7Yq>!0%_GRgl@!x`5-nE$h?9s z%M?SbIH)rS0UMIl4-uy>P!kjfxRU_&vd*)(dY+Ost9wZ;y-LxZ z=Oc3_QN_=FC|_Ocl#@DyK;ot|hVUj~{~Q5okbXsSy0Bu7%#;#Ha-z<8uyj6zYTFpX z-x0;I87Mxoje`)uA?&{FggXfmXfw>CMAHa4JVUWf!_2IRQ$FOL`1)P`$z-po0W969c6*(Z-|ETNr_{vy>eVa4iv4R2 zDsGwMcakg^{>(Eu>{2&)q$wT(GWHb1$PR>UIwkO%bTTiUiY(gVlY%q@Z_r^k zP#O?Q0a9fnUZ&uzUynUKH-dg3TenfLi1GDjp8QA8J6gPyqG5SKLv5P$NB_<}(0LPm z%SLtm+dRnBV4m88=v%VY3@?$faxpGv{bH9;^ebQSDbOm8zw56pJ1ORkrJ&i6s6!q5 z31!F5*7g^PqrnyfNc1{xebR(Ld&>@%st@huGe7W&q0zao(HCAq_0N`u8Q_y2KK}W84JjHWr z`5__iQ;o&37h}@Us#l^<+bw?cs}I3eSH+C=C4By4TzK$sp4CQnK=%UTo|>;?IUVDa@0*#Fg#ko1$m%1A=t_3tjaD2fqVC&b57`5^4y-H_(+zv5M zbTN4XE9b_Fq+e)kFoDwHGhk-3($L18!W{5BINk5UYw8i)&@f0{g82}FG)#MR1T44dM`z)E92(rdRD<=%uC|OKtYO-|_nO z8jFefA^IOrnX0Pp%ypHlJ)WeaYbJ;ckWb>NRGZI`jf{_%?HdT#yHuY?LO`ur1dtyz z!aS}jfJgv7qaBrxw#A|0_EShNd(d5XOs>kt0Fc5BD4$5x#jQhD6K5{Jl>dpPYa~BZ zE0xHV{~ zSD0CIC9UCi2dGZ6@=(!ybR!>}-+J~e){a1i6nTQm#ml zQ*1JT;tlr~0m&SWK0ejc@T;0ZwSP|Qe7Jg|Y!+tI$tA$+V|Pr9 z?I`o|LSoV~A~GnZid`6@+~4JyAHrM!76ev?4|shMi9=Frp;{%R8ho|YUj6xqlf%bE zL>a!YU+6hVbnA^9Q*G}qJXL$-u2JRT)h^vK=;iR9}lj(T`4-=U$(3}PRw2s-d&ByXr z-7(`@Ce;^XU9oyD;%T~@8(;ZIJ;8NOR*Rt&!8m$AGM|o`K~9eSPyVTm!CFTgQU(Pk zzMSay6aW5Y*%$Jxjfj=OmJ{#|>QJ8&sT8zu@B7W+{D2D!lRMz0Q!qtADd>8oq>wtD z8bHtFS1nmYBI5Gw17!c&q1#XooVkjg*af2cBZMzaWf04COSLn_w{oY+8WS*CUKPiruUI8ce2 zCw6IUeVb6qjD8o~hVinbxcD<*Xu|Czn(4aL?j&n9b?^LFu>-^eO@NN^aY6oHbky5n z+X7S4@|5;f&gKB+Et0O3R=>wfPA|=VO^d)ulXwc-4q{}8$`(jlFQ8Y%MR?&;*yF9h zw!@_=#*(0bMK%j_9yRmIrVUjJsq4L$=JfPjxWM;!UT(#Yv=eV`GM{yR>RT)U`+d z{2_@X@X_$xnt(j8dU+L#W}9`qo4+&i44;33q%T}s@ca8_WjymLFFSelDi>PGq+mwu z3#k%YS#?$I`-RAF4-eJBU+p%hcqF(dqfrk_kq1d`ONcAGfpqQ%VvP=pXhfQl)va88 z=>m@j2bXfqx0+;L+kJERpx2B|NpttOk&Ne-qw4@m8Jn5O>m{FgT7J6XqbDTe?B=12B_5hsBc zupBm2l3>U82Wi-G$3IM@o2smQfX!c^ZsDu}LVyJGOC)v>z6jiw@5kjvyKv!Sp|u}# zfmQxRkJV(}fdVchd{5zpu%eEF=~U!6|SJ)>B?ZFxmH9z7{B 
zX~D{>Sf3`l8yy#%mMmiSBtIGHbp(W_^e$yGrek4I#@vW#FC_BM@&A zxHTYW_?t7b0XCIoR82k{8pS?wltG+-xbZytsLH}7J^ou2X7MAN1D`1#-5JycPvDD2 z;RHO2uA=pAiMyOYZM*HvuqNcwY!7{ zjajnptnEoq`;0R^kHSq%@4$y%H*x93R-+j+fl&*rFr(+k56~;j!_9+Y3P6hl3S`0o zk8z$XNCWe@nz+=^o@TD1{f^#cI_dXwKGjr| zzQ5S}OW-W14xj-11x+-WUExhb$P}R(@CG8H;0b!%rrD5jU*s{6Y+jHA5yxy;QQWzA z-34@V@Z%coFVNFiUlxH-AK=f#p>xvKHd9rL{6b!>G*B62UpD!`*qvi}&0R5p>+;2C z3x}`2xN59kvMMJml26CfN@8mtVo5d>AT^jT5^OK|2$LFuS?g&>hd$O1k%hjnAa5+! zohKh2(hhuh*rR`g#ZMpQ-c#Usj;T_5In$9!6zzI8{=56i%tb=9!}^a+St_BL-FdmW z2>@jXxeR=)xzqdm`s=%?p5w>HB(;EGYq1P;&8seAL{+<2h3w_L&&Ug$?* zj(-JL3zC;H%UfvafR23uPY zs?m#XCmIJg*KM4Gl}sO4e#wGXzXBaR$~vJ9FwPKU9bW;T(R-334-rW%lzn86GLB!- zcoy~p5?}yn>(v=biJotd&;M0|+=Ab-s926e%IC~hFumZ+d@x^<-BH9&jMXJ%?87(s z=KmQd5VdMe<|EKGL{x`xXoZF3Kp|9g@^+At4Id>w{Z7&wVdlY6=teyv3K1~wboBJH z_{lSh{!#)7g#)>3+ypKlQRa;WsmC#MTK=} zMo4l6VNU?p-&ac}9UKX5!-PT3DmE$2|-U) zbHIWmxB*4`9_l4_0po|A0@oEB7FI!mE<@Lx;T8eghhRM41{q02n4!Obs1}KZ{{SYy z{{cA)9R(c|Qx%>HPkQFNf|VzC3>3}GI00NWv)3vqgxDkH5*@xL*hTvS7TB)97M({S z1mwCEPB3WZVBvwygr9JIZ*5}~>AyEMX#g-!FEdja-FwVhP#KLsC64P=WG*JFx#~p$J3nQYWn#?kYjGL^QEqFo$eDDU^P?94Q zm-ZL?wrLQUjdKGw_7hMuJbGl>g4R5Mq-(U#O;1aoJ)`8;T{d}o>~I72_l>ubWW14agqPREg6IIfXP{FWS$^;aMT2ko2kjCG-D?j zylXg2lO(|cCGs-foE&(5S7lE7SlwfLUGac5ZBJgdZ=JPUZ9GC#F-gU|L$S zl^F7&WAY`RuVkMwtV$n2M-LGW)PSPm*EKZgaoYnxB@y)qD8bR&gaEV+zwmL06GdL< z)924VTd3U<@2^{TXWd!ZQySZuX&4wB+HVZs6bIxD)HwYp$CpcYF#mte;*?#$i@?aI zeG*!1rsKBzft*83XkuZZV=_*OnUictPffK7@W0qhlIVhhY`YffO~>-U@uP0M!ht|E z9?>ufU$;hX6Ql>gJU{`U%Q}x;7vnBiNgAk|4lL|F!ZR50YLfm6u1Fun1et$Vcl%Rg z7Z`%IkS(C!?F>UF!U$jz>&^Rca=xpuTmjS{(RY5A_Luz$+==kAXg&fcr}jrxBJd;< zEfiUnO{O)RSA?9MOl%AtDnXh7usC$S8cydCRsohG!hBf^aim`%q~a-#DwoxMug`Qx zH!p-|?K=YkW$@Rc5owthdEGcf*3tplCgTBwePqc58Vt;4riOixsp6+=vUC95jg$kV z##CgGj8U9%VfgXS>ZE&O;D9pG>ZUS2Itp(?EYxM~y`!U$pw{RV*mGFswsEC~yzaeq zfP@(!Nr9V>FYU2dMWr9*#{{zA2*RP|C|_g6CRTvb7(x_3f784R>>FA*WP*|1(?mK5 zY|~#p-1Xo6dmu-f7v1=|?i@G+ka4)=3R zz;)a;D{|}a!?0Pkm0OQ0-Z}K|%(Ce$I@|TkshY!bFZ|-j+(YTeRohYo4hR?eR`lou 
z!9s_Dq^=3;z>-E0L$H=;f5(nrhYdbds+T$7f|ikoL7&3eroF(VN$fEuk=(!RuE>fa zVdqGKB<5JimUmM*F7JXvs&bAfJ1mPVU11)?^0FR8W~R*u9rMv^1V)O^-14?QZdf7{ z>wPzZ?*Kmv)S$K2(&`Q30!RTMQ5esUm5hj?{s7y;@=3m_6>e_8rCe?t;8^WoK*zYj z$)M%(0hJf_uOLJ=x6HSKUAm1Q3$cDI$c6(216xS4P+D=UOajgbzML%sme**5o>O^l z+1Bgq0Ewdvk{h7vbB=HS+#!gfO`^(**0f5~51Pia;H)5Gf;T^1nG>5nG!#vaA|w=? z8Kyes^nW>1fpCmwO1RY!+$TTc|8Hzv`nLf4?OQT4xkikEAd+>nI{gCMeAO z%KW$)13M^c0E*E`NmYZ~{9Ih9RMCp|OBg~B5;Ql1iYR-SI#>5dR9DvN>DiI?d^3Bw zgTqn?9Ok;hO;3p;gH&x2YlcJ=oV!c4o*QVlSU}5D=HW-#Bn*;*sEzmT<=4Cr$_4+N z7%_-r5cE&m5KDO*KQFp^(<`CV)6BQ1f$^C#T7B^I#i_0$S9LX47|kK zSd>b4D*(}7ctS*ximC$XI;^&_&?JD>WZAp7<}(ktXw6msE+4gs_#?1@5 zsRq0GlQFy8u|4n-Js5&+E)K_LXMbpX^Jk+Qx)dDA@nnWTFIm%xsi_0(?4|R2tLy4G zI61eVQNeKf2{kCd*qCS^q@+5Z9_2!w zkVnwb(i(R#LQp0j_Yq$bRNm;=&z#z38H;KY&_OaaA`;Bscnrb*m~%Qc<1cDHenu%+ zyh(bM>z_eJMxo2HGfZD$F+gX?XIS(UZ`K5GX3!o>%*BR2?dqKJHRxOLGi6@?DNgXq zoanVZ2q!@SA&e7ZqC`{YoI1qqj)zR;`DuQw)2p0^%FBQtZ6$g);zm6G)W0Jp$L$6GM(Tp0R2a% zD57Zsd4mBNj29Ftd&%M#I5x2y<-(y@J^Xvf7yD2!qw!jRra0!1XP+fI+JF}k{CfrR z20lTN5cpqiI$6nzlhge_n(%q9pWF*#4f<0iX?F>-K^ns8o~il)M2kJzw1u*M3_S!w zxjwk3_YG&xB<<%}ivst`hA=3?5<%XH`mBu}Zlhk`pmBo#72fa~A~F?oTReg} zpcmjYD&kk|QC{d!z`10@XMnE~iA_ccrAc^Uh}e1_oE+H$jeLH<3&I!&OuqcVcS!C+ z5*r8zpf*af@SE=|-R)~5qt*x6*gk4EsK~^M4}ro$qkIKD1^Uu`u(^;R5!f8i1EE|H zR|7Uj2O^*W@4I{gJHvI7az9>>a8H-NC^=p4bmgO5x2+t*&{^NZrGX%3ynRbt0s^=I zj}`&B#`{B#2&P{=F8Oq=M$p8H`6p9%fC7M?aocObaDjD)Zdtv}WrAQ5k> z)g*8d_Gu)4Awaf+E0xGBkxtME>WwGz46smemt}2j&w#LkAFUQF*|d{YD(qP29n8-= z_wKDD5!9{s_~>#^6C*xe8|@NzNo2I|MXy2@IYGC)bsZW-GVYU=Nf@Pw(x|80M-#6# ze{yo{_>y4B!=s_ntJ*UrnA8Xp3ZoZEQbV06ruWe~!+q!!dJhp$t3*~nhIlk|dIBml z0O-ip79>o*`?gV6TU&^1ul@TM=qSoy=wuH}j{YiA3j~=bTE)+n+G^~JnPIUlBbaF8 zA>_(+&fs9B1^f#U7_z6;_4SqUG5j}w#J|EB!<-015fQIFjtWF~LO?ORpo$)O@teW~ z7^0f)ZnRn&J1aqfS%>Tcf@A_gte-GlGg$)+A&F1;gK;N=YqpZpDMmUbz9CLD*SLjH zD#>0HT=$PR(fORfjPe<6g`1vE0(8cNWycm0#;{JkKDG#WY-$GMVchSCdt6tB+z z#FiR*W3yML%_*A145#)|)5o^H7Z9l1Xx;YBfn(4hAYQ9&Sl4wCCBkt-2mTm)=SWiM=(HDc@TwL8EdN?`r*wcgr 
z(+J;^P>!QoUdO^>n=PRLd(5**sfi0=I{Bh+$s}g+G8=^&!VlgRnTDZDD=u&CPJ3-o z;`ikkVl5`)Kt^jx9+;t?9s!AW6%!g2V8t9YQ;#{@k@JeeooSu7&k`(#BTl- zIy&kDX!=mFHF#u=G9J`~%SBb?SnN+Yc~;^fT3wTOikM%3yJLR{u?!PeN3u8-9i7A7 zKi81|^a9?VY~zt*Hat!Pd(E}dTFH`kRMH^I`)4IW^hp(6|+; zQCcbb4rYlvST3-e;-{xXE*%0-ccJh%B8wZp>I}Zyc%idCTf@#hpZ?RqZL;j-(VKT~ zLzbbS{?)oCDpK3_&+G_uLe0FJ#05$6*n!sUe?X36byfrjY;1i0jB-noia=V0yQE3& zZ@SLlZ`t$Lbbd}e3K2j00vR{l)^Qw6@daW<2&_UaZ6(D^Wm}#~6qqsFf^*smB)SK9 znHjX`0FCl1ZloPNn6lBzDl>c7FLy@TP4m_m{pLvtNrno=*W@cGS=-UjFtIS8dE-4g zEDU!au)xjSpJgwc0T3=8KWk0H%EVfICF-a(&@&HrYAUL-ni?g_?N6Mp;sKEDE4Lid zKJH(PNJuy}^}D&TURfsg9mbzD!{UDzYW=UxCalb>%Od-3Oi93t%3vn+(7~luD5yrx zSmZPB!O2Wxp%0z<%y^zp3+I&`~$M^s*6_$*uNP85d7o4QHus6GUw5k@~ z4{E7Jg>$Fw23h3j4z_9f-VaWa!p-GU{9gINqm(esvGUcGXpPR7#%Q4k?ScX!hI)lM zTKi0tikdp`Sy3ci^U2-bx#*;k%;Jwh=Hk)^JOQwvgh@w>VAjD31`Le~NzcQ@3>9M- zMvzX2c|SGoRsPVQe!jj~J`$S{l5|Lw3dNNy3jgjb7vtjxQ4aUYU}3J2~&x2*mCw)5_cAk6*$v%QVX znm4v?T8_&^LhutpY31#EQ4_@^XbCw=@MtOK{`4 z7bavFL9PR@1nfZpd*nQNEE?n8Q|UBMgRF-{3pseasJ4XRe#Bx7AQl^pc3?yyEdV$Q zv@q`vSO0Pc`$1M6U~zy-we;g8N1$Z3d^J=hB$h$4WzaT+}Boho`&tAY9 zX<;rxhg4)Nc{-Gqu0#6#Ux=U_l9gde#VBOJF}}y=-RXj$U$8OYG92ex*F_CoKc8OS zKkDj@!tE+i5@(@4CO;mSI1mqZSl76NRl=!-GF5(Kcel{ZqIadjTMto3KzMkITi_l#x_jtJA=U_@|Ng+gL%(6)qM_IK zVL}fAIS=1?TLtRY|H(LYvx_oSF(KahM)wh>qsmSfJ?{;FkqmZvOTxgxW5^2#>U|^m z?_{Sru6leHvYZVVLN^5OBneJ&AX=mJwZR%aLnLWVmww$@5GH}EDFb2MJ}UoPSb`}>HlRHLcv;xf7gR0 z@7y~5u;On~Q>(ReWaLIfPES+-i_VF;lRe#>AV!Hj4*X3cW^Li6*M`N4xMcms(h1U0 zR@NJ38tTg>z{vl+Ex%-`K7@HB@!azY)HSD>=8GJrGi-_QMCk69UwV109GnDkpFCbK zh+I4TiA0xOoy+#Wy*Hr(qL5eS;xg!qFlPi_X2I0ZDC z?DYe33WD}cKPQPU#aa4n-rWcId3q$)aMqTrc$fkqUj22Th9h1*e$9 zH$eVI<*h0p7-G;$6wK9^K$?nJWaX5_)phn$oY=$7a(q8~%t?wAkJWAU5(iua&Oe0| z^Wp8fnA65nX4<*KG}t7NHo0I3x&$ze2>g`x*^0;#NHAqUzlJvs>|BR+V1p$T^qp83 z-moY8)J8OVp9>d-i0A;rEKE}AlUbk^6hUO;O!1hnQ$Byb0w?qB&(2O(2=XaS8K5o3 zb0z_>O%k)CI~y26xi=L6F(9d(Kpjy46Fk;dv4fOA48P&MOKKio5?2X$mW0zzEXE+k z6EVSH)5hX!sl;MWpJoIt3JW@Dpr?ARqp zez9Byf=#|%Uo;~zpJZcCEKEKM_>E&J3NK+YiBtQD^ju8OJj-9us$McP(=wL5D?1b2 
zxbi0IPaB6zbB&2@3bG|c6Q-xGo@!x!jipnhOVQcATTJlZ7WEv*ZkLYK&+Q&pKVea-)%JA~j1lLD?JXg4sMkHP;5@HQ>jNt9nedb+0u(5#kLDOL7 zFOl>R2~Pz(uUX5v7SkRWb6+JHmyk@>zLegFVU}Ao{!!ywI}?0N|6KjCP-ac-wZ#!Z z_!of^;=+4?n;a#VSYk0n=C5=_QgR~U49^YNueQ#qxv@3+P#hDm9Lb!5>ajbpj{{}` zLG#@i*U2UVj0E6Y25|Gpum6n+8q%CD$1GsYuP<=Dj}VAD_=_etiyV%~nUOs`>M%5=?=Kl@+hf zx76K{JHCBeJ=?c%qO-yFUA4Ilt-cLvd92<(gbI0hqCO^Yu4T$MG#zR^6J`tu(6)N$PAk*(iTskyy+9?f3ea>6Hw% z*Qt|zr|t?V>_AXG@F2uE4=t8##Wq1okPcHN&rP}x z;5&nkzWFWNu(4b!1Lf~fUKzXT2^zzuoFi-(wru)fmEkiCm31k#^h2TMwVpqGAzwwU z_H%A|zo^JnSoiFshjoL!SkbDu#Om9Ls?M|L^YgwpM1NeFbIbS#$s`0OHVysHq=+CK zX|mx2ZfM|BU0kVX8W0`9%aPjQ8yhS0;sv{%!>GH_KyXkJ1$&g-EMK)Vii~<$~#G*bfz5X|HTT?=WkHexwBAt83VGl4zzl_v`P3V3AZ)g`f z@`K(cEcEj7ZDjA0jIVJQ}kx z2n>-%p@@DHgdq%3&}qmLM%bbV+@}jRrCu*o<1S*hDdxz{3{KJoH?RIG=^O35M?{Ahq#NiCQXd<(0w9=QT6$np)| z8<-IfQ+U(W(0#!aYZ6SB-OzVz+PW2Te~6nOWF|-%3oF(4y#@EdgcF0TC8qZBsnMh)eI?$VYghYrJ;; zmpTnNX`Sp2b(o``{wEh=J8~ohlbRM&)Pxa)`{A0LJ3A9+L%C1P?hg(*sL1#1DU1~g zj@(={WBo-Ev`Wo6$wHuA`Fc*DJNL0=*R^Zq0q>ktU(|d&rNAt_TUO2|+w0LInUt(N z#>)E-JazfM=((t+O+a;}EqGICy~On}(wA z+edfSu?@w)DlNFt)4Dk0J=9ey^LG8^%jH?Ct7{sxZMjMslbN{;m2IsjuN;(>zr1H} z0DL3NOK(EtX+zYWUAb;z;t8$?(idY`i+ zB3VuBz)|JAncozIT<<{StCJlixQQ>}{sPiTf<$2TNGUIS1}W|xXdsptn<#hhW^X#W z|0LhQ{xFlRTeI}T<>P+$eW;CC<4fGBe|Wg|QP8HdpUxY)@4jsE z$?Dg2-3V8wrdfj&=2hNQjHw~Pn(*hH!-WxTB4eFrR_qh&!xNi-rzR{7z^U zA)&&&R4`!s3)~^v42$J`7roCPy0#@WlLM@g2#CeYm_I6n4A7nYpXL(}q~zr4qN4AX zcE4?HWS<+|=+pluUY75{@86eqht+h?+f7Cu71>$S=1p~(J138^rl-FKpun2fU&Qe# zY@WZRm|0jjzc6^fPT7)2If~ElGy`%jth*xG%po`FXV>9Wi6Y`Q5}ggYAP!0)dGA?6 zAOG6&(pc{%UcK0g1%xG3%N`Ivew@TlU?t)?m;=D77#6(VIwy11hbExQDbD^SxUow> zGKksBNrErzu#ckZZ_NaR)=!^;Tw1`~AwM_Q6nq=@H}yq*i3r+nIPv3$4*JZHI&}JX zcOEKV6Hz*TMyNDI!Pd0%u(vo49V%ue{9pLs6#$c0Io1 z<1?Waru$k#(mPo%L(r(| zD`=7IHrHhzcmB|y;8d+`pJrxSybY!t3t|KWN^kJk&B2)*;&%o7h6#BqU2-~kIXF0E zW$7CW-e|C?#Qt8$7|7Akavsw9+u?RW1ABF;Xs53iVB90NjzGy-CXg#vth;!8bbIph zG@&zsjTHnV@eo8s8ICbQS0dhG_@VO5pgYiH=-=3DYIeRFln?N-5SP$rxoUnWgkN1< z^)U9tcs7_B_fnJEh~(svg$(~a)FCpAyWJx 
zt66QGTvz`IY$IL(TS=l(@rtoxA$%}pl2lK~n=(k*2-FETkJ_sg6Etq9FVwhrqCq(o z{jKK3Qn}2$0izw%)ZM1%RFvaKJBzD|ijDuYwnc@(x7TIfyUwPB+7B zgJjbNFz$fgiIEGcWWF292EY2wqVtT3IQs+(tY*LKu~#U{^svPqxw2Lw_Up$FwJZ*w z2Z5-d$SSyMo5~rTt8nd)w8PFnqsEP#F)-`!l-^WY$Tvv#SZj;Ql{Fyhy0!XFw+Cl5 z)2Lb2%x*oo=3*01t1Gijl5P5t!cLmIR6Pu*=;cCqhc!2SqI$?BSAL&D;S8wNS4r(}tA*q=ZJD*io}r0RP;p+f!hqOR|SbA4@*r|0P<@#2oa{T*2N zr6=gBX=q5AxmF|^tu3~c;0!)w+Q7%8K?OcV@i-Qgp~XXKaYPRHQzK2zCuBDpC}2Sn zRgOtUDD^azq?X3u13iv}#K!65MA=Lpi7>LTo}WKGuZZG9e%J6$WvbHN_?#SMj(J6Rfeog^hys*5V&_iGq~PX`!K^__ovl3Rp5EKtIqdEJ7BWVhs<7 zGyF#fo=ciMA9W4SKX}TdFYhnpXt$|5CMhj#Q_}48VRjj5o0-M()_W7j;a@KYsMDVE zXDC-8wK?PNyUX70tiuaai!5%9i6alDe4XP;SH2D1Jjlri*p)3Thn7_@IGBdc-b7kW zPHr<1!h?oJKlYQW#W`jyl`)fto)Y#_08z<#0QqRZDm3RMYoxfkEV7-Y!wms}2<#{S zA8mgg*Yn=(f54eT$xKCwWZE)|R3su}hLRGILMda3q=Yg@lsS^2L4%=0ghJ+uQaeL3 zD-9@7s{6U@>--*%`=9&2`#i4eoU?86{eFh^Uh6e2p+oU8`{f{@=UkCImj8ALl{kI9 zx}&*=<5A-S#{>2q?4;e%jK7CgA06vE_Nm6k#%d*^I7Xp|k4+ZMF%6iwAFv&We;afG z1VF(n@IaqgZ{l+zW3TsARGIkS#?k!;4vGI{v1tI3srWqpuVv?n)e)sUmQi z3Sul?4hM~WQ_dcDc5;f;OtzdGqhi@a$!Af>#sT(D#eLfE43CUP+~}iihW{YZIxwl) zC2CG7t(};^-M`jEzw52I2EW0c;9ygV89`-8lP=};Y`@FYgc=HQ3gP{(pkPIgOHk3+ zaYb#?ZKpC26Gt^maf`II1zktgg0*5ZM!naoe+6lqanDy0tO6e7 zG3$C1A0N#90=Y0`f9M};A=3x>kgO&Mn4}|$6SC(tu>JE77(_v&tNcO(F1OZbas6D% z@I{WMC&I$c-bpPJHvLsu2i$LvqlG9ALfo{wJlNL@2^^Cg5Kv}Zn033{GQiYYp zCN~cU*026~3vR{8vtUIJv)1^8&?Y2z(bMbjcL%CH0tDGPhcW=~pR)wX%+XDw;gXGj zUyt_vDf-8Z`cIzV`{7||TuVO{|BH(%oEvZWwq?WXppKzG7h=8a z#CktgNp{TZ&0KZu+g){;h*?=cAASqf4pT9P5wW3%PYZK;X?f)}80C`OxCTp>+%;NN zU>2I~^jmfrq4efbnFS)vvS${MqviYel|p&YGVMY^#yyh|E?ERoS-y7c*s9Flb#t-@ zFoc5keCJ1Ox3fr$b_$q)YYeRG4LFK~$${3sEMu%|D>4kK-)EeWwv)F`Aqp4oKQcc* zKImTgFOS?0ehg7;<1xs{ZdMG2LLHGRcY!tbq9=QozBsS#_QS(gR9w~QYaBjVmTH1h zAxiWourdhw{zb29cNwq2m1Ryx*V34fi4!0v|5(NOfb}L-?Zj_BeTHG;%CS@Ch8T@l za%01SlA~p1K?e@V7F>lg*7Az>A4$b+ELjY=uWCZKll8SNP&}%N5 zY@}W)4$79h8tcy8TvJ`KEpUN7q2s@5Jv$B_e?0TDjb2$<%Ugx}wAqg&OFyx0f<#F2 zZ`t?nZ=pd>Cnn}*Sc)Wr^Z9*z&fIs9_FcMmeH&s_iW~KkQ#+4`3kUm_ogi3B!Vl$c 
zqM3Qb!wH=B9ulyAd6+eGk>s+A*$~5nXe%iV`OiK%I2Ojn=Txp(5^oJwfx2bfv5Db5 zvJCYGFJnoQtag`WGi`Jxz2Px^12gkU$a456Sx64WIcR>$=A^j`x*$A+4i^xB6d$&$ z&L^`6=wdp{?YKiMV_86Z41fdbCkx#0F%Qw-;LyF3e#2Rv|5?#gvH#CCeGEcTXnVl2 zNOlXNBJ2SKaZjW;`>SZ4fj&g|SPu)*3FA!|@!oOG*Hoz9zyI&;+n2gX;+Ba-I5TaemXiI zH#UiKir?UoS6!O=^2(df;hIzG%AfULxbWq)_4qz_Rg zSE1%&PoU%j;;siy0_B@A75d6*&_1PnL^D!wwiIlK|vBai(~2hjTQy1mZxrRPT^+hmbXZj zB|Mg;>|)QM2|?Z`;Rr0HoXtwK6*)hKfc>O-*Z$Ca8JpC*1Nt2n0nE{rftImeJ-ATz zk^@+woyz(A>&urD5Bg|Tk8TL;(y=L_kRhj8sXM4PVg!3$~fXI&9bmpWZ zCt`Gebt#%fV-RDlY3=>~sKc!hJ=0OGdQCS&!_Gvl2JUrr=OZ{EDb(#Sq!htsSs=I#vhRv<5Y`wq(se?5ZvL>f+QRUZv+hph^ zAnj&*z%t3zRU5n&bAm}&lk(IuHk`PQU;T3`{H-|gZ{8e*!?~$SPR7YCdLKX9xhU6J z?Y8i!kG+9IWFk|o{;`8Xl2QPh&97 z29vl*%c$W5ds3+j=G9eo-~xc&VHAptx1iHmW;4YsP@JL&M+RRl`LY168u@$M4x5e~ zX_ahuD}2JliNoE$9@P5b?b0;Swe123UWqoO);f@g;gf9Zf<%0=fJSH=mc6iyICoB5 z;b>3QpKr}i?)-XPQJwem#e-!l4paH5yqeJ-e;5E)0Kh`dloSoST-S~t7pAN14GQ|ziN++Ou*bjMYf`^{=@@z9;^|z)v`|blf-X#8_ zr`xqlm-eHOzlbCSG?_Q%mPo1c)o&oK2XibV>_&)S;>0JTIcuSr%nY~y=qo+ z&#UULuHI(E6vu>Hr`)6@zd$EXJPKu{%7?ojf#e{@df@v_LrS7j}T^OXYZrO z?rciyv6r3#5_z{uv}Q2K76RHq$y6Oq38~UO0lLf+*$^xux4j=%2 z1wZI77=TS2E&isK->r7+vt(Wx5FV97p=FzWTjSk^V+2;3ICUUJk!5k?aluD7J{PDh zp#iwU_iArLK8=cP^Bmo?7cR7B>7RA<(693>E&b5c(;Hnjj7drwLF<^EKxP6PgFI(7 zW!Gs<`JQ7VDz2Z6wl;nkT(vf0<(1FVyt0sF!gkJxIv&Bk1E6)(?>dxEf~x%J3uZb; zEsoDia*cvgc^M}QuReaf)v$4)=btS-c1(}Bou6M9vhV4uKBS7swq}MoS&FN4~Xhg7@y7_CvqL?b`=WWIrFLo!w?< zD2E~J%zS(MR8G_h2*`}O_JJjwns2|WEfE!c?cls>WJx;uOf3S@Fr(Xh{& zwYSrHC{(muyJ&M>TV#NYrXKWPlHE=BWoWD+a!(a;n{%b%e`J- zi*yLP%E0n9*4Sfnmeb4*s;XwlGh~1FIg{BApxR=`jX8uCj);*G@BZZ7FJAV(v~2wu_}(IWgxHMAt*KaZcX@7sPZ#eVT}S?&SE%SSua!TY$RTsEU-GZ| zcx*TWnBIgpRwrE#Qu0>+2#j`X?XdWql^7=5HhKPM$C7W0~se`U2WwJ zzel-4f?z)_qplafTy|{AjX5b#PiG_>vY^GtCc$HFua^Pb0EnPvd6r*D?c&9gPm9su zB3^Y8!*DQX+4j4^R?G{H} z`H`t@^W%!f==%aYRZw~Fh6GmyCF zm;KLvnF44QKJP|yX5e7OsY5wOflt^|MIh5;w#W8Df4(I1d$O= z&Ub}ao#Nu+UX%(;&gadi^s)0#=^yC~jmkBy^m z^!mvkK1~eIUXhsCP_?>k!-8Pa$?5K|$J!&G`t8>w_|Ba@$5qH&ExufqHKV`1--d|8 
z)=#2!6f3hWd-d)uR;BXtyB;1Lt0y%q_(-Dux>;eJ>XuZxettgrcv%SwZb_!XKCfDR zY4bSqV@g(~kccI=Di2$29=GMW|n-8+T*V6ugX-kc3eP*3-b6)Q5_xQ*f7S&D~KYr`^9o-F1Y9txYq}Qu77&ESA zp*|}|CY-)nbh`ZNs29`U=w&)aIIzI2JTOSDGv^m;pElV=(+r`lZ6Em~%;>C-9uZye_X79Vq2T47B z%QB$gkCTegW#6Imi(6fCZnEZ(_tch!mku8Ec>n#3lDgy9K*f|zXQ$r^HL-JwTtH^h zUo2q%Ws{bX2A1`ra2u6g`_V#Y!SsD)2Uo5p-tWM?_HJ`LmYN0L##KV1vyN_yDaW41 z7W9a`R+h*k+-XGcBvHOSe4pZ5Ha`4Z)}kgc;ag>80X~=Q?mL#x(}?o1pad!1O0j?Z z;K2l?wH-8^QOFb3NT!x^i%vdzuN79z?x4}5L%+FW0{@U!*S+0fHz7cr!GrmdjFn%Htp`b%gfYvdDzpZ(;188jHSAs6=An) z)R?ihDKFP6+`6@3xK5~Rp!PO;6VQkIPoA8ionS#&!d;i3RR8Y1d$(X>_)Ku^`KjYP zl(Me)xDhiV^Amz$PD)e4bLhQde*4}Tc~Yh1TxOF+W}^&G6c+66Ty^ta+x|&El@B(U z@_EypNbh>*Vyl<&pX{CVV7ZQ1#< zX4RX2zgo}k9b4Ql>E~SMa^t85RipYh^6I}Rz-W9QQ_Y1B6b?AQsHao=E=hG&alY}I zUoD2@JjrhAb;5bO(Y;HFuE(bh`DHt^_{V7FpI)_jH)6Bw@-uU4qi?-DzJF7%T&ujS zHz(pHsgrK{08W0%+@WTPgN`O}Z_=86fMx?)47qn=|K5V_7gyO2=wuas zk=A-GO#x}0Su$tS+oAe^xqwn)z@WQaF}R+e0$l! z@EMz?Nr)yh0rF<#?4amoY`4Lp2No5am7((@xt<)(4n{aOmx{-Py#ICWO~pHL0U1{rV=pren_C;rhde z7rcG@ea!Q3D6p7M|Nc(+`%3RU(piF(Drn3U%!@{L?DbJvlR)g!>78{lI^z2mhE6ihAvaz+#|MtVxQlR<=IMmTB+LEj_=P1hVAr*AW2rYF7AiFnHWn#9&>m{($D+uoXnbgr zBf|HT&s1-<%iNCjgAMt{E(<)ZzJv>!iy!1Q^(*FC&OSc6N}#U^!~qAe$)LShK! 
z2HeU$Ya2ZW19#a)7Ww>L2Ils#64MnI45aV6W!hw`%TIC_t*k@&y#4xhug9zU8+3Jw z+T8#Who$j$UuVR8oSWAh-=3VQApM*$yS$pIOmca?C2#tXN*yp5G{nbj3f?-pkT>uC zB4Gk-j-c6`N&yz>h3~A=oPWG#Cb)}E9H+)Zug;(LF8+P^)yHY4Mu*ywU!wPCg<+w# zS$&VCPtHC7Y;`U^$`l@`ja+#}T@%lHt_O^>SvJ8;Ope6Z&fX2PEIS|2ZVCsnZJXJ# zsZLt^?z>i>xWkf#IuJ%bFU<}ocjw=TEP5UG#@(eIk09}aR(&(PI%PT9hWCyCy~f6^ zbL^7#ZuUvG!`V_H#x9L0_hwu5Yr7atJp1fbrO8hHc5!(xUWi2WuO^WlFD6K~nG-Ir zPY=tmYO}dFb~BJv^>v{z0T@X_DGry{!^)(>6XuTXuvtxv_NPwWE?Y%UCi7{8h8}O- z`STGY{=L?yz#Z*2i1)eYmh_ufTx`aMnj2UeMy0%a%r_;R#K|Ry!${i2hn3j#j0SGJ zT=qF)M!;|V)t^T>I_l7>`)qjn6S!KD6{zl+4GT76V`xsa12hrNDD|f7we5(dj_k@TOzOI`dC| zKV9JNI$D;#6Ji<;WDR z*p!TNf+<77+ST-2mwuI>$|@!iZRXNi68FZnq8@LBM)-TAJqqeGOI*K#`pum?7u(Ek zuEdUn?PXR%RN-IaTI~HV(Hq|3rOVlVO;b{5eyVTj!)-&h%!>5(tBf|MO~;}FqK3QJ z$nxLvhV{yK9V2*)$D8M8!Q6diR*_CoFpFD<=cd@3U9#AG_2iVs=X|>;{@;J@#{9qk zSJ5p{(W;Sg%Z7jdvsvGaaq9nmOlAa zn#s6P*rUA-Q;lG6U>Hd1vvrc45jS=1%2A{M=$n|7#G|TNCFv{ts?vv*D^^5U>Y`0y z#705f1BQ@g^G+Y8qtqrz3cn)qHUdy%gKY;uu7LYWIU%wWIM1~Ddeb4m_N{}9Aoj_wjX`Yx02B0Nm^db9 zbhWCV4(MbZL=XW`pf{%!9~2gpPWr32@-C46yDU7Fs{Y~jwl;iw_@T(`AVSWua7k(` zaZ~`$&&_9PBtTzWAoUd4v8;OLDwbJOA_yCJM}LH1)2MZ&PC+d%#cmo#KArN5-o1Mh zwI);$v_!$jkCxFb4g>|ACw2W`R|(|+66Fjr5ou*Ma%I^fDLO67|_bb#s)k|DV7dc)Hrex z_=G~HhniF$VT;QF1kR{5+2P4;6A1Iw%r(OTKb>OjMk0~1x z#nr!m%ZyoN++T^+;)lu0hiaD$Cd}?_6!`1{3$r|4zU3blSJEFpj~jCv#O(pxQ?I^# zKgc~~E}|*MN~xJAA44`=3SkMaGHlgHB?LmJaRSMj3=lxnEE3)YC@YT|j(fWG5p15H z$QLCSYbeb@H|$k_H!v(pMV@lSNmJVgoIrT`J-FBn_e>~a^g)_i7RS<$eQ z#H0LKeYjASygO&ko!d!+egS9^6QCb$hJplbknIr0zu`LzH-X*(gaLi9q&};vDIuvo zd+O9?`XB!dQ#i0BvyV+qKqCUugKtWaF2y??+IbboA#rLGEG^sf zTK*yE68pSbj~-|J*Xi;Mh_6ksLRuITn>+dWn&M{s{h@=z`T*Rq33VZ+N5Jei^nKDE zT-R$_IVDhA4lF(wh*Tr$xq`4zy=I&tL7L;c8Cf7GIx-;46bKuZEeB>4Ms^(i4JV}1 z`?Uz?K={sMO1i-#MtTtu(QC)H&oWXXfJXRb9uQve-2ZK%O$ZGW#BiOUOE z1+Z2Y1(JZ9fdrf;q453nFYY%C3bcCn>a`cfyT>lgk#n6gPj0ycZ{rlJ+*yBS6>^JH zoP-f%En6qAzcLuZRxd+C+Xs&x?H5J=l3Tuwrmt9Wem?ZO1#sTUP<3oT}HL@pwkor()!V=b6Nl5I9(E z*QHgTrT~Act)rj+{lb*yQ_Qbs43$BM*?ZneOPk1r)IT;Q@5-vPgSmKLm<$6=!}u{z zPrRKHhv}pHW 
zX1-qi`jv8l~ZGEU#Bz0u48vV?nuM*4bxfe4vwqi9Lyki!+}dG4m;3`IO|35brd7{`}Cv zNIAb;uYA1^MfmA!*9^WOTG=PJIO5W!nXp8AAj!0mhEIqL58oq?E|{?yC&v2EpDq!q zvjnBcQxtsNqb>W;IhhGJ^QT7FCSj6%UY@?LO9rbgp(~iq+jd&=!v6Jq7QOX+^LRK_ zS!#ng!^#EdS)a24h+gr(m2g5pprj*tZ41SbG&0oz3mVdRApe!b|J^KxW6A;tPszyv zHoa;`2p4MvfnSL2FwFWPs0o}7bP3Wz8mMk)dm)tRd5Ohi^V~KHJjZ1?FfS(~+HVCZ z(VUh3XV3P)+9Y@=V*+Ot-g#QCKnVwjTm~5h=6uU&hv8qXdRDKOn-Nq&NfMu=(Y*N% zr&xrtO?_%>s_k7}ck}l`|9zwtpq8}O(Adq~%sZNrmDLtpi2nWoH$p*@R1jmH#l6wn-1S8bo#Tg12m7@qkm*S2Hf-@9ifQUjdf>R?TFO{EC z@c#W~Y>Ir)uJ~}ce&Xr{$v}s!1Xo3O3->gNqY}pVZmnB0J<@>0woM6%Nw5veT+S%X zm@o+l1*;)vNg1qE8Bb(`Dt&UpCsEM`3>hLGaM70W4X4kZO=x2en4Q0yTWo9w!NmbO zlCyf`JbM5DFqQT!*X3@>)*Q}{E2v}8-Q32R%Xk*zSu&g_hrw1$UtbiPCL_|2!Emgp z)*5M?p-mBfi0`q9aT#qYiJoX9>oMDr!e(M~+G+DB+v|F-wkuj`4uoIEmi`3%%hTkh z6S?3H4&7-3{9L%*e677?Kr&x*1p8?&r98L2>Z1U*S;Zi0`7U-H@k@ts$e)`hZ(#_3E_^5KmeI9sq@!^`@8$Q-^s#aG?ie z?xuMcvtmHN)c93T*mZs4#Le&<6AM0yY>AkQFC4PU&37QPnE_!Balf<={^>!3+Au!R z@}R;gyNsX+dFJB9-M=Yu8?x@6q3f!Z0gl4r$`5r5$>` zoX+DbQaN7Kh=f;2L-_U_eUXMs2`|y<)vIYn&U}nYspeMPN@O7-S^{_Cv3XlmIzeK^UBvr%c)+1|PRzHi4i;72|mD0aq3@;@X&+LCa_bo4l z$XrVkr)sp`BR*mK999J(}%W5HtR^%Qu^{8^7FG`We*Slt6{I(D>LR66MddWgT6qNDt&`IP za6OKYMO;`er|QSjwH_WBS5_^ed47llKo+kvx>83Js?LUx(cUtS=v#{0f?@qI^`{P> z#Yx$G-R5zdcp|9T5`x1^DS0ED{jh+5l)PS`mwx(7DT&h~24~F=ny`h8o~(Af#xS28 zaVpopGFba;Nd7`aoXXdS5@+ejMua)2)xAUxdCZz7;EE#}Cq3lBf6q$c(1)5o6CmMDvj+)au8LruhCvw$gIK=*qK2U75?E-Y-y zr+54LTHC;bL5`$~EtGvCrN>j1htt*b5QV|(+}!1sOYNX}hGVmi{&~iYF6(P(Ku>C4 zDtnY@PbCB$&_dD@X&34C2d@5XE=kbf1biB-)hQVlEg7OyHaRe^icy5rJ*E$IU=rR= z%~P%{Iubwe;lqaj9)mAUu3I}bc2|*Yc6b8AD}9P7LmJ^)dXGjfdv$dPk~3I{oD9oD ztP96%%a>T)r>otNuBc4Oz<I`L{ZB@L&ds546?-JYq<@`}}z@<3a(-T}}|Ch;tL-L)dy3 z6&0n`c@52PsNT4+Omh-e{Qc8R`mW(z;XEXqu|AcXn%$B59-CbC7q%zZ2XGTZ!B$RN zf_!*nFQ6=+yY5SO-qW7_`i%?J=1x=mO~W#=M)ES=eERf4o_Q!Z2ERjNUBZxpTQhpL z721j$i@|&Ikt6EV$3Lt)LK$q&|9sxddtq|++1>*O7KNvxpQO_8rV}E_5jK6=wE6e? 
z|B2(^Y=uUJ=Z4>ad42$k#09z{xN$0aU|A+k>%xh~r%0hQ6f}~uOYNQ&m5Q8z|3^Kc zPupe>9zIl_I*x_XG(T-r95jq6lM?+VLTQ$hp_r6q?8>R#qYC3JqbSBR=FXjhLO{fP zf2EK|oZL7zRM@%WA2L86%o>13^dT!(rqCLL3$(;M%wRIyyVig%Z-`R0SIL!6xxBsP z8sPWKy6?GNq!d@M2twPeh>5h{nsKYm2@>8I{GKn%A}9d$vIh#)*dlJ zU672c`;(Wl z|CwsS*g_|wE3%|kadhJeS(=znu-pMeS(2~7QC$Q|VR^fgWj97A5lo75nkQ9Kr-9bV zFi%TMYl89%4hV7K(E>xMpHPcD7x24dm}__ieUkUGvGlik>rDoWQO7&N|56uH6F7U( z0V{jwV(f{wY;JdOB!}E~5L!-Dc*(r}nmW3=yO~>w9z{eiKR3H|2hjIU{rbiFht!|v zQO3k9ftiFiNzO7on!uU7nY=?1NiJGEwMt$-`Elm{CZk7h-MaO}$&>YO?)iG!De++{ z&*m*MEFNK5iE?@NWi{7_?K9H0DHV`(z@L}B<}~P()a=4Cx_p8xB*LGu(9Xr>F+iMB ztR{Q_n&xk-Yg@HNyheRoHr~pfoDBMOrH3}e9m)R6gPK9~WGF0|CK-YDLlIj<^#SC9 zQ&A1}TW5vtMmFF<1^T&wuE?uJ8$U@WYN&D;NH={maBkt(>t-Iq09u7?hQEbE*$%+W z3^;}=rsWoCkmB0r>p- zdbLU4%hVN~H4(4@3rScvO@JBxIZ5F}q$&I*(u6jwLSg;zZvYyBVM(GzD8$m;Mno`7 z0`&u4O2NWJp_6UXOxU>u$-vu=88ar1(}y{s>+|_u4H?cwoei^ik4q^4wIKfyXF~=y zJjYlnZ43<}0FDW@nKq>U36K;fDA}NbGVV7}A~U4dXXjVL_4VHt7CxW@lX8O%PTDeX zCzhm;VXw#)UC22tf0T?{0gsUYO0G7$Q0WcYv}z?jH?UpFt?*d4?kPSEc{{m)W|Jnp zXH}$Noje8GwDtwyeCd<0^xqT=J6pGJU%(`QSUZD!O9kLT0I+7Ep`i@+--}{zFw>d| zhSII2Z~`D4S5|)5KL6NT|Lf0fEz-a6ag}rMx_6LjAh(S?TJq)OUeX2&LrojPJoeuc z#gp)l?NwRXxPmJ!)ZiqYLE6)4`1DxJ^a+4~YIt$UGaUqg{G>FFDLM&BYvwo{M#3-} zh%@k#D)DErj5OrPB@M!g4boys<4?;?2Xhc}39pE(U8Yd8pvF%QW8xD9WT6{RNtIp~ zdPW*OZtVgZM#Lq#{Ragf-Hn)k7&gu@dT)zFs> zG}^Po0{O3ynK$ZcM*w(BUO(i+-x+!7QeMOcboq?Q4dmVtTb-)*KxVFem1k@Q>>U#v zV9jBfrZV!+C_V|Yp*|-xFUYXRmEE>=>rH}ixCBy?J33kr=EJvP2i_fina?0ZW@_9f zM9SY@t;uJDiXX@d4Qv=x0oTa%?6Vb=Z+ZT4Mso3~K>g>ZG~_gg`UA%m&WW)d(|Q;I zS+Z%K2ZPraoos*Y-_p9tvKcIajmr&Zqb45kXw_2WBA~WEl_#E2beqUVEnbubx3{)J zUJfj>;oIA>$$!321oWWtI*J*QHSzsb2pK<2H@%lJik+E|(5LY%a4=#Y;ZdWF^P{>8 zuK?ZV{GpdSFCa9PAtV6`dJb}lyXL7m9Ui__X1ez6O_wa`#a$DxJ;F{+m_YVV5cxag zpN+C8Wkl&UdymNDL|%eqn(`?rY9f4P5;V%vTA5Nag5_)c&9ZF>=3vykv^DJ6PL#2`OslRqv>ymlQb)d9j?UPkn z83A2pi^Bujhm(rVTXy#dZylb>d;XE*h2}4V#9Z-$n&^~sFd!qYQ}wgqND0E(@M@1m zt#MNAve?Mzr{vPG%Q*l94XQ^Bc3m+X9hziHk@p~=PRi}=+ozrgcO#{dZ?l$OM`_h? 
z`YUO7&oA8 z-)YZM(QHTqEWuOZ;fgp|#q-B01>wo*icrd$%DZ%Nlmwttrs~mL0g(E{w?I}YSCN5v z$Bu`}?XE$>k@(Eyhk;A)CAzF1%Bw~@Ip?0kHT&t4`rS)8U=-F@;MPI0KtSLKJ)f-= z?=f?t`h^SP!{r_JqYWm6FSIVRxymdcAGY=}yS@GV*~6qWscqUIH2@#Vf6urRy4)&%X|)d>!Z*ET{`G9{Yu8S$8(mCN zl4tZToh!-FtPW?tQ@G_+{O|W3JQ#RRwJyDAH6UA4l@QY~t?u3L#D#nrSv1;sj%frj&&f~|$etVTc3(Kc=wQKjEE`k4D zsQqpHy#L7*8XAZ18*{Vw{}VA}nf-tN&Q@{F{-229|F^!OzUBW(9~y>EXa4FOdJd1I>FtYK*CA~(Z(aoE~r1od96GE0Is_{(*IgZS`3|3NH zX;MV4!AnXFe1B}sYva(C*`^--g?AMIgW?tdbb5YAH^un3G%vzy3;oR4RBaT3V;w`C z4CF-Msa2P6?qS7=L1Q4t3h268(lFmLb8dO-0CmW<&>hx z`E#vFZvb&+p^~hk?Fh2Ru_L?5#gc>&;AG}vqL0NwAf*Xq22SY*fV{ua1t|zBDofx< z57hIaMjs!f_$bpe&WeFoSG&9M=U<=N7B*SZWE>>%pH!pX5kyBCsVC zrh-uDFcA;f6$`i<1|73gZOC?x;$wWPdXfZ)7n&_fWTpM!w3W%7$?Xfs&4kmfcbPUA zMT^SsR929cpG>Z@3W~BP6pu_pxwBe2^+sj@(d6`k`XRe^PApsc2^j^gCP0`8D4l4E z=~Z~p5i5X}y+bfULNlBwRn$OL57XBd zQbH^t{ERQyljvgK@sVVqJ#7HDTcRA#-xylZYR!iS?LfVvTvjb#9&TC6{y(c58w`nX z5U2x>6YrCR#ee!FxDy?xsN-Z80Qy)KNzrz55($6?7$}Qli&x=;lZ*uh0MI+N>Zq0) zgJ?uK=4QV5SW+T<7(%QwCr|pKnBp=J7YnH)iKhViYwKz%dCx%zh5`#wP);`CjU}I!tTHX4V zxwHxp9=zfbHv)kC?A+`@_=ez1?;^_`;{2S=Aqq&JWRW7ZfGzWxh)ZNsA5b-I!)>}w zAQA=vK}pxL9Ey0=PrR0m-{jZT>LIH->DsOSbA6C($tzE;E*TO0&+O5(#|k1TMjHim z)1c}~GzSz%6B8k=tYlN~R2_78`8hC~g#!<1#g%{mK3b=`Zue@sI^-UtC^xmgk$;Zwk#X@Ln(nEqcfT<`fjVDhO#AcO>@_D^f$*YL0+G zc37aM5eXTjjv4-MBO@bTv&Ajs>p}u`3J{U z89(wVIX)(Kiya-^Q_b&39&gvI5zsNnNZErvu^s2ehKJwG%DQB|r?=wWaxVOjl;-_7 zrUhlBg3G@7f!h8N;o)yUg5PiHFc+A}SHNK!e_329Tce2v5LGCUe*sgcggs$1(_|y3 zk`H*0950j%$$b_1XkI0)U1rVIKnTm>vd^|48Z~|2k zMd&l*z-=^AKtRWlMf2Bxy(1l>fcs3h@8MI!m>%!es%=ioU{vxm4GX<+AyDuUo&;`* zXu5cM-=X7|=%1FOOvjD8h~9y=LMe^LNBj^#UXlqw=^Kl;#c*A@SV@_z_M%ZARd;he z3UGldnR*0R7nBz~gVIfI&)jahx>NY74L+2VJi#8!SjrSb(UcdT(`RhNOtE5eGW<^W zuC@1=%JOh5t8al#@HbopD%$Yp+c*|T81t~8ZyQcYXPI_G1%?mHb5(i@rU&@hQAZqlKFTNZ_^y}R6n+@kaI2m26-iR}~Nef>Btr9?rn0u+=A zg7Mv(H#cb`l+!c~Jo=LoJ8P9K-56l(u?IATF0vbdA;<0ch4))a zFww)BB)1|{S{cc*Z~^4;Q%LCj%?hCa3tEv&*g*&+YRH;<8E1Y5-Crn=BB8vX&dlR) z^ZFVrA9zTK#+bKdHVYO3(*#>a2mSK9A!mgA{lM&YjDFN-k!OO7GYk6)oCEx!_EV)r 
zi@S&o_?akDM1JD>DldcKb3jD!{t7dlbymB^vmu$%A$S1>C-|Fw}Faa%kT`*qxcxS5xCDTzZ8N5l*I%DpbqfB1>m%Mt<1 zTfy2QMVMW#?s2c5hqMlCT)Z#qsG`-bdjzu3DBDS(g0sccmk1>8ag7Au1b;olsU>;W zfb~D`WKw@pv&rC^zc?}fT)jFJgimAVjg)9So};Oz@^hA! zfg_W(2ZCi@#U_-OtM>hSX&hDzh~yA2Ln1g3*%*Cqsnr4UM3t(c;sb%~#%X$OW@tvY z%37Z|-ihFT^IX5q@O;13CwnXJE&}vxAD-Q31``hOFCSi9j3#dpC7;IfS^w3E^evKt z9&hb(FvnVEm+aj+*b+@f_fTLWNu5zw^vSp$(&=EzkJ<)z9HJXW$=o4F zDwyQixHxJ36qqnef7POBECY6twN2<8kWguhzn@=ew@M!~Fa;ipsuc+Y=(kWHNzj)9 zAtWP`Z*XJR(|=xC`fqE7IhgB%Br8A@4}RozRh1W&I`E71QMtQ7@9&ss+FhTK(qIPc z38%8O9PA$uNm!ua(51O8Gb$&b<~Tp+P}%q$Y#U^~m?U8RxAwt8fzeqhH5ZX4a7oX$ zdVVvQU?u&?5qFSu-W;U>wmpM2lg4$mi%tcKfnb~e=hrvUQghtTm^t${=U`-oE`aIYg$9M8vU>Xy@#?FDMk9^Xr7$0t;nl;10StVA^tac0&M!O^?{=wa$0> z48S4|Ihle`sTMoMb~f-R+gME5OrJ=kSf7QbQ|yoE4g zxbkGQ`iAH=fEPZ$Yyp_CUnh339|fuY^#2l6 z#mPi;zEWE7{e^xSry_e)S*WaDt$(cG*P4s5zp7i8q-~0<0_U zP8u~=Ni8l+Kt@Az${VvMC}=Dh^~z11=;lK;ENwlb60jaH&CYN3tfJyd;i}pz%d&f3 zJd*8Y?9no^FmEKDAl?@MG@x@8E||Vs;bvSSU)*Gn+wAW@18Edlcgw7bYL2_?iM$aN z`8qH1YTuZGb@6I(**`&fRc_z(&H-_(F*Pa%dW`lg5&|vLV%- z%PGNmut!{Wo>dWjK<_U*ynbv;+3rcYU+#)$IdV|ama%iH`{2QC=~syi1QEYK*?&^g z(YE`K9(@EM^jg zYyOUXZ4x9S9&S%P>2pe(NK2cRoF1rFLi2_)Q__3Tf8S&(6OtEcE$D*@&6RgeMF#@M zr^T=S>}!62o3;x?(MdLi3W+7kkPyQoM7Rr-4(cNHjM5l#en(%$A;g-wr@D1|Lm$Wr zHwijt?BZ0%=DAyDKexh}K(d&knvdSa-q?KeG>hKtfS({$Wbd;SFfm2*-opZ^a5b(* zom%N@HwoC~1H*l6LNvH8*NFuq;1`l;fiN!<8VaUqG?U0O{c?+zP9`MWNWwB^d%xpE zLdt4ZB)F1J!$He=E79{ZuA{J%9uh^!KvbIc-m`qoyRjQGV(3n7vhTTe1J?gq&U%z7 z5NlEiX_XezdPoZd=WxOFzE#6pxyTu#Y}2}%2{|OIIuY%j$JZg`P~vhenvj3yB_Cx< zei~}h1blOs?Rw+HFo^g349kBnEosBIr<>f&i%Kc{d+0wqEB0n-4LDk!p`N$zFt25a z*hI6RH-IQ6l1~9zidvP6$$_~{_7sU|=Fg9*LK1Fm)eRCM`TUN41{oO6-MgQgGq!bCGmp$Gk%? 
zAhA>UR#f@8v`dc(nNtc|AQDZckI8f9yzJY+@-Os#wd2r#Ub0ggzhz|h75~RipSA|$ zNllmB1{^&qM!yt*_j!vp(pNCru`Pu`lN7lA6#wfEH?7I!T)Fc51NjFe&9s!u|N9z%@jyr&KXzz7x*gg%{HpOE29*Y9 zS~uazgYK7;&!llG%@%iB$(zgOC=&?1|EurAxCT&sgfcExtgq4JKb7QD+HLy|A2x0$ z_aFFE#YX@NtLSCS`iijEiCj9&(J6FP@_sZPP#}uNFz7ES; zKtMo&8#lpZ)Ve+*T;f1xm@u$3`yeK>%a^^sp1FkewTV~K31-GTwYTvRR3E-}t4^Jk zCTbEf1-NCuc>N*L`p|PR&aA7|bopEsMOgd<{{|8I$ z?danE6wRv}_)mupZQ7VXn=2J7Bg_R?+q!Mrl@BWzqi^$qG*wg#|9SJ%w8~-6zvW%i zL*c)&qR>Lrv;2{g-@{CIlwfxN@Rh^dFJv0o9}tRm0&#cnf)|7hYOLikz5xKx1EK_{4JQg8NPb z7rOBHmB>hu_Cr$TCAd^)+(^npK!}62885+I-3JXETwR}VJ2k34fu=1d{gC?MdP$)l zF_t6i{_H~r)`UX+cgNpHKZnb(laMHv4R#}if)wP`!|6V&xF+BNL*<*264y)qO;skk zWUug%RG)-h@BNGEsKhCScIf3Sn>%;!HYVx{^m76e)h>4<73=@mrOzOF;g9H<1TVLj zZUtH(jP}pZrEX~yv?!+`$JS-TU?wHxU#AJiG97WeN%4wv@8Z`kBc^jcXI$m0DFY_l zw63^)0aXJ7X8wx05fwdok25)#(sZmE)x=Gwb++&DZetOjJG$H#g`{=7(^%s@UllVH z6nqrS#u(2u-k7=WRRCoaoMu%sJ%8Nu)3-S8O?F|+@Vx0Zt8y7lLENXu zzF(WL-#M3^qaa4W!OE-}MVTkLg(hTuyna6~?=Z!A#m7ggVMm>zSHR7qR}4J;BHn`M zh0w+t`*!D2-pv*dIq&6IiU~tbrCX=4mu94OjfUa{cnJTwh3}8MfjP)0`M^II>+_D@ z?6GB%j^|Q8y?N@+dOV;#|W0YbLJ@j%0id7&Llj&!0?QjXkboSnv^?c5BdvcyW zQH{zOl=4b5U%yVyoAC>Lmk*;EHGj_#a`ZJG`c#-jyaUPA0n(~F=l9lk_omB^o9M5NuFb)k=ool) z2f5?XjsYOwci_M*7uNu&Sr`c<`5hCs`1|9<(7j;SJYauh!gd_aTIr@iyit2lBk`vd z@Y5VM^t8l$JZ=J96F8K1&z_(CS~aAbf!#SuXEf?Y7KyhHI*)s%n3B3+zfT1e3_(sM zejy3s+9tgk0EAVxZvhU)jLAh&ul8)cIZ2>+P^o42mi~)Q8LXZE(}NS9hNm7vPjJkD zqfB-H(ifd(41ji*JsOcmLcFlE#!n(Hh$1mK4 zhXy({14E~Os`fj3#MeBBrj!9>b?S_Xoae-f-4R?A=^x7|1}^N3=jvXpU4jDAhm^>fZI7*g z?*fUkYuB#xJG2y4cHuKYjeJ+vuZuxjY@nv9ZyFU$)5+_d@-D{7*3!6tg1%~8v#E{E ze784Cd}Wdp>Gj}YMV+_T&RNsy-MLfclG9^EpWeQYCdFNBuGs|3|9)y1uK9qtl+ke)d=GydI4z=!Zi^?!o;C6E z{CV@{#yI;XGc89SH>&b(Q2)f5(!$cdy)v8lvihv}vbni}mR41lfg{un4BnL9sHooE zsi}8OqjbY2FD`G2AEKn$;@F6y%Gj0PwSWCKuBlr7tt)B47aff)o@NeOWcNy+voht;Avbj(Town#{m$JD|<$KJ5h3yS`|gnVAvRWOd;wl`dXh zB^Pmay{N67ttITQgL{+u_27_O}{&)|cGv`g!xxzbl z;+Co^sWo(UHMea14jV5#_@md^J|D*Jm2aLIC-F(8t3gcs&Qj05h1mrYX5gJ+* 
zlAdAiWvB0wS66z`VTX}NC!;wdv)prZNzU&$u;I1q6}KWLG#as{(Em;gjdLoF=^gMW#(N-76o4u=K!4M<)yJIbNyRnH;MW`5rRS9 z&ek;C3x7@3*=J!B_p2(0Zbhli&`H~@Zk};Hc32j_C~6QDIb*N3*B-Zm`m2kZ&7WVT zP@Fw$erDI2^-o%Kw)^3j@uj?U$Ih(E(N7MBsu&n+9Kf%Crfv^ysG-RYF^#_l+ksDLtL88iuvq6IH!^Rv63?tzg9%NOW9ZKUSSoe4ZnUO zy`$1reO-fl{$uO zxLW1uFwo1~tLD!$Pptz*55E+I9^bUi%W+;tlc;??iawoq@S)1?noiE$=g}1=8&x{_ zl)PGIWN>vT&;o7o?8~m50V!|dje^`2F(S#i zfz{A72+e&B^)T~=BoCl51sNVq8U^ZahM?t%&sw%I4mZJU92!jq!en3rRh+G%(ZZI| z0*S4g8odbU3$x{)y^Et1>yLt{U<9&b?!|E;)CiCWQYZdmlC#qxfMmNc7nxZ#5Hwgl zDL19f1ot}s@Zn~Gf**+vjL#*#_*)oA@pv&AK`j4U?IW{-1TbJo>%+s zGR?OkajPn1`k_gh`zuPKOMjo6H*aL+_pDxC3eBsooQ#R7T6;XzKlR$$>`mF6pC+3c zR~6jS@7%D_t7}T_+gH8Ls;!AqiND@q_wFjYqzT3wlV7S78oi7;R$G1Lke_me+0ny# zH-5A%>>V6jR+3aTw_rn!#w69!aYQii`9$#n!Gn}Vsre&zRu9Cn%K(0;|NQzO@&a`3 zvhp71F$QKI?jEtjUG&rBl8cC4(&uS9P=PLjF5&?HhSxG~QMv}RhQDRh5ckYRp{Ckq z3(yz*-y_yZ#hF$@1keoAe~R)tT7bvOS`a>?z)N6Y*uQ*f z3y^xG(7;axoFQ(O2u{J~rz`MOpN9^JCACE@{|t0ibSpT$g$t3;Q}g=OE* zH$E-Xh`Px56lC`wzyTD<3C7;Ly2)K$PP4RZKeDF04Yn%-;KusooCM#ft=d7C%%djp zN22e;#fB_Z=q`~gobT;*y-U`7`V4_kg)j#o4?Oj#p5)@SBJ;Lp3&H9Oh26@FgSam#S6*_NZ z{F#l1wlo^-I>)nK*tVvp&eqkf)zCkcZt$lk(tpuE(7oqWwUk&g(E`AAvmnht(lep? 
zQ*b6D%@7yN^`VY0Zr!RkZd}@k=ieyI5lxq?3Wi z9)6@rx3HlkeZahdC2n!%E7utFzGPB8(L4@wa-2i2WOF#wp>SrR!_zYcEO2gYk zf>%zBp4yz)fgzVH#F+!v%=St4IBe;~`7PWnY{sr|Nj!qE2QCy1d;WJ8+XXR_ zk{{ppomc4SW9W!TpZ8~ZNu}G->caVGLO;MlIY1*5`<=ljM2K-0rPX674S@)?Ocu5y zfrXht@`Pbj*H>iE0$%h%%~2kFe8%i{?Osb#dSB~HLy7H&gyDrJ$ zQ0ZX*Y~pbl}@->M#69`dE;(xohwc#`&RoEookoVG}h(J4FcAN&=mxajE#Iq;dq7E>f0@|QfU>07!dFBhL7)gkrSlu&nyz($Je z%V(aF?Zsd!@&|E=u+g1&1-f`yuAZUcF({k@EgmJgrF(1P^ApL2{5jMNezKh%>=j>#>^v1$0=Ge~ zlV2rr0iy4n4~8`p*CI3&rTN$I-z&f)$L%w4`b2 zwXk%>^&F@0Ti@nOw1|6Z^j8I+{6&)E9#hQw74-BE13ihgH319S!#Br=r$ z4^`(K*W>=if5+yCick?{7ZN2JNJX+!p@__s%8HbE(3G8>mC#aXQAx{|Qc0OvsgRXq z{+`!4-{0f+`2FELj&rI{_kF+b_xrkD>yoqpP+HUjBSu_NFrBn&Y;^H5JVxy5R}td4 z)&8K?(BQc68$Lnw-j5%n&y+|)b}Dd1fM`Z)?55hL#B@#EI&l{|PlRZ|0KOPrP7|O4 z0-%0M~$*-H)tK~ev3CUHs$PYZu83O%+SFjCABqV#Wg zwqL8aXgjEWWk5YDlarhu8!n$+hj2l>&cYB2(GIaDjvrz(v_o5h1DjtYE6a){D2R8_ zp|ZtqK_h_(KEyhlQT-oR#5oCoGR@s;+Kd?({RneD;~H&yXL6GzUS7l(V&b>p2Ow=8 zm|({_$Z|MQXX`yrgd7ZL<@Y?+X0WNLDLWxXP|>;IgTnWmgB6Z@OcsoM_R4}S4D|6& zwQDh-UvNgh6lU3HLR86SwP%m*J5x|fexgug*-5AfUm`MqXviNv#1yq3j}DEt4EqszwvJAtFUuY9^jA*WS8<`m_C()xed8we zCK=!v=pv?_Gi7d2!(f`~K8jItM{zKU(ofV()b+$TbF4XwO>yz?YN~?(X1j^*jH37@3dUDd|HtT(i=Uuw(qjFJ7)Wj>D#|s; zV*PR-!8rng5{&j+bf}0w?|)a!?%?FT5l;VenOhuh^j%gb)@4bAL8^{OCYzKtxr^Tr zLrYOQ{-Azj@xWoNku=D}4k`-2@{??)XcpGiOCY?5cjEEXUY0;zoGvn<`T45 z20EdYV2PlF&mU0Ji=rQ{)W*qa8$wk%xQnCgsKOdCn+?@@M{>LhF0W!MmevBe2F|b( zkJ%30GnN3TKUt43lW53INn|n!c`oBw)K}>YM2Kuil9fJ_I zfhrO!bcDH}0b=cjc%1^ZNj~U;f4OR`QU^HQ2D+E&Ba^U1EF2Vr+m(?lbc;GLAItf%?UD!*H>R5uEDZ(2J@cp`2 zBnU1)({Tp8ElUh$(~%5TzyLS*hIN&NimENTtZjVcNb-?=RqxOKS(V+id5g%)3o66M z?dr~^hTr%~NwQ&|KHulrBs&D2K(K=(7dO|MLd`Ue7w$KOsK_<}u&IP)aN&p%4KUqZ zJZ{GOAS;@^Vhupo?ZZRc;@n~e0P?U1yiz|ahX2VdEj>UuljxABsogsIoVAG*Qz8eA)k2@2c}2*uI6x)1yPZ7w%aeyA|_mEXG_ZEAvDNQL;} zv14x;51UOwz(Q&UL?$A!>eRV2YM&k$*`b7K$`x+eAgd1M7_|4z!176x2jgg~{wWzD zxP%Ioob4={V8qxGPr!UAL|=Ktoh>5Kg7PlCQgH^{AXs`H!dPVG8LyQ4)2+yG3QPq5 zz*?h4aelC*c)mQM3k)ocKgR?b5vw*A}7-4jVV3M=>gGyG4zh#a2t 
z>xkY6TdBI=)RtM{&uH?b$YoiV6Eq7+v1my@=RnYJ^dGN<_&}Zj&`yeLU)^sD^eFi-Uah6l1@h3o1eCNdHCAxcBNfWUKAg?{QBLyJL;Zk&IzB24ItFl;2XkF z`~kV^ixr3TK2w;maUQ1K!wnSfYQ`0a>YumhZ}<}y!n#}Y`XR=kG*WMf#vqrwI2L4P z+JyxQu*NZt$*nKAxlLreNr3zu4xy)dix4?Bfs09fHw+Ews#SHa8!KZ4R}kE2P5m+o zQJJuEe8yGs&4h_@_dNeCVO-B}s;PvO(kOCt#f!QcahMD_37KhY zmeLCzUknu+8#R0K!NJZkB5C=C=2@|imR~uDPXnoP#{sh&aH|K!WJcL)BqMiS?=miV$#3iOeOP-xYyI)ET0^fuUzHwbT3!a6|A zR_vqWnae!aG8>vGRb(UQmXwcVTcS}e{%6d^86HJWSx9vLbwQKNL!{+a`_})(5SOjI z`69zv!_@TDA*PJLfl#85B#XcKhrob>~k@kBU zeLTphsChVek-5VOg1(Br9;$6FqZ-%|XD=$-i$Fqft8j-@&~i z`JQk(JT0vgv^(A3tZh;*C{3d4!HKv24Iywt_%^J*b22 zM0O7ZCl*4?tIV`n48_>i&c=ITD#ch@Snt4AJgUnj`?jOEUCG=u=~noTE^FNg? zoOQgg(pw-wfMFIVhjaFTqo>#Z`7NLtg}ab}jG9WUeT^dE8h?fYHB)9N-I!T5By7TH zv{ceYN?sCamel^TVX$I&3y)(2!L%br^8z-kORH+OVYy}pUjr3cH?t{nJIRP1zL2SD z_5N`AUfsHPw}{ErY5vcL039S7K;dZl*!B)Mwl?k1*sSJ=<1h+4>0?nX0?YUmmLDY7 z_eH<9+l!i9y!d@}ba4C3oWQI7@BGq1S;{U(t^pT$RF958Y)>fFiHvMPB!_{~Hew%S z>gT$3JC3yO(c}9|Yv}M70u^Cz1KGDdbb29KRuK{NV|6tfJwqu7#g6r1K9bAC-}NK! 
z;I`SlXO9KwCuc4dveRt7(XYX63IMiLX`w~01BAO4P3?5lWAtBaa?f78n9?u$OFBn9 zGkzN64}Zg%bf#d^oVf!H4AN_A14-)M>>sYZbUuv0f0o?;^a&JkN7j8`#EVCSj)3E& z{9nJr!R-0=`QIbMc%7m_ci8WKy6>BM=9cNqbLR#u+;(}YSN*3$y0hkvBdnZaPS6j) z!^)z_wkWMq+|QqYafTun6dwrCED6Y^ z*4pu_o60Kq^lylhSJQ4vMkU9o#6@>}eOK(^#t zw8IDdD3ceC2D|wDh`oFEz=+SP@GrIt+~%j$?bz9*$!#=D0>B3msOsxKc#1dQvFAEb zW3y1D2;~Krii92v`BfVL@f6JgDPd90kJQGu@bIwT?B4-^hqg>SpfsV6^cyAUT}(~A z;QjHrgSuZ|+d4}9U7QS9owjo_apVbuhQ^w%43BztZtkmX+0$duc`-MSdQWfeRq72F zrl+FK?y9X#_kFjSjT2{db|-t(QpCNqU;HmKg>FVm5*uCYli=7YxvIFMx zH_Br&@<`@pe_d5OqTE?qSGR&fjRLUvTg`ib7zyEE-}=ns5TZypH?q2+~hp}ba^gA4zrXQeMKSbJU1Jy8#9doBlcIP_-w{GpOr&o}# zD%MM)#cmK(F~z-jQP`mSZ-VjjCc+KNDP@pP~@F;}Dr-lzWoC2_BhC zy*oXA_Fu%;v4KgjG)wEi(B%sfgP*;-e<7_ipdWPIjO)C^1c@#UOSv+vB^3l;(o;f3 z-MLILolwdoc|5+!#xQdA>Y?OU!B`y6>w7&6T{VyI03aWo+vOKEGfo$tbALD(aC{US zHki&y+84~8f%wUvOzE6fx^UQ=OAB!b!BmrO$y66I#g{%k;D59`dFajr9T#id6cZ$+ z4nZ4jDG@Sp1PGi&nIeiml$LWlI4Zg>mhJ`twu55?2WHy$s%TP+By1mPlO@y&#w=lD z#f!V$;>3+!VI(mVf)NnVoP0O;{ZUZEA>3FxA6!TW`3#Qyfbj4a?&=R$4f++^KQa9B z_bMGAkLXYd7oN+{A^l2Xc<7oq*Zq>L?M?@DeH7SzjA)3cjAVME6a3~6qbF=_DBIY@ zH%X>Fjh?OVJ5q+8i$@Wenkb&j+@L%p?N?frq}4t?4Dw|<)dI>GhRJRw4`k`L56<-E zGE4}>s?C?+PDbsj=KRnn`Ha%&W5MVycBugz`V`os5lvEm&L!cSW}-uWT@Yf%VeuNh z>D&;Ij|SgD!cJto)4cf9zezch(g0Anqff;6@#AUk#RG`;dF%8G3#2-`{O7mtzdfeM zV~1Wo0Zhac>{u-H^g@-L{hv7zaV%v3)<3ob-Lwx4Ir>aE(`FZ)ne6p_Q4ElSNbQ)} z2u{haA2Rdg=uzNcpjPlGkK^y)1EV73+!_rR3qY4uTpWrs2U*Ch51dTuJ^CE#`Y0IM z6qGLdj(eP%8BT>@YGfYUN9?9T|JPpY{yV9RGH?9A3H{QaZA&yUOwRnAahDo-y*l zeZMb>1=Jt8H03OQ7e zEkRf{B&k#;v(_$;iva(KGRo&o5`b%S+5fYI+|J9#!tDbxu8dlR~{qSzS6USJLRPN4_ z+nIC9HBP&8LfO-){u7J}?A;QrBcoayZB25B2s+_X+%w5yWcTY&e20IoyVh@FZh(c> z#GKU`POEn=&s(0ib^`zKnpx+yw!Z<#io$ur`Q{XVTZ9upMc=NaWrS`e3s=U8OJ?1) zX#?iuH2;lD682*hAWMkp3Q=tE42F^eMVZ*`Us@-*U;qBEIkYQ4v#*twsw$c;EhGu- zX~z0Zad&|)p;r7rcaXi#e_Q@&5E2Ub&gQLJ%|oe2$B}J%w!lTQ`Z3?=p9ug6JtEeq z4jrbRzS}48>K#@S0Ht^#n{G__euDTRaD2a3s>`N0_8{vNQ2U` zd#0D!by!~@Z<&CuU?v~orI!8twscWL^M7Cr(AHI#@}pH)ywaXl_-2! 
zg?nMi1#tuUQ=d_V{^7J{Vy)9+6;!c9up}Y*@lb zQOf&etg@^rd-(VT&H! zh|L)y8#0|+IXGd9BJcCD=!vo;zpH#XCFCOs&E3seCm)eqv5?Jx)mg+wDFT^VLlU{;+{ZSW?$Vb^MX zOj$sh=tN+@Rm*kfbg~|67XFu;+nMq8rzV{4hTH@B&91|T#}V-%iuXO8C6weabA{fu zcVE0163k`E*WyIU6yQwvIa)cMxDTY*G-Sy2)rBXb+z_)X_9D&U!@H1vYMlg_W*J(S z@}CuvF8lt06B#*=dl9+~Z|FRKz8L#~X4u{~aM-m~R1{Y^lp)4<#b^;Gv}n~zKdl;P zMn|0FVlGBUAa$v@@?O2lB?`He>C7QzZcnxvubjrKBH7;s)wPSCq#a-h_nc=$8;zX``VTg?#%&(BVLzL!{N6gY%NI8=H zbE{Ka+_Iw-@u3SA)HH?Gj=)dlS1Mn=o9%~g;x28*UlKY@IVd$Y)xJURzXA`+EFEkM zKIy-=HMzOf^f%ixDSi%d>zhGsaY&3oOePi|T0dw7wN_z=j+az>`fjI*K=A$GSjgBw zb+(MIFn0J%2s@Dh-W2K2_&gCOHnpp#MWK3gS4=}BCB-Yv^#{662}pn!C$DEN(liO7 zAUTR{ndUaGWcm5z|{R$4%)=q0&yl% zcfv7XS?(gKTCu6iR26f-R^{YzJO`8|5;~NMu+zmE+nD%3ypW!gf=4+>}@fl&QR|0wx$t{*Q9O^B3Jo<^v`Am^7TX zVC0AfMaK$9#ooAi6DM0sYF*M1CAALEWr6$MTl;N)DgZujK3mVhzIzxM4F@|NUoN9G zaFI`&WpBS=(vz5$j*bs~2QFDMeCLrP+rq-aoCo)2{wjlPp1*vV1EM6^7MLAKQ5oox z-<6D>EOck_=g$jkCjMifP=09wr6fWa;(`u?Lms8bIFdty!4b3j&I4koTKg4sLkLf+ z=tC;m?Wh-oMMykvht@}Ji`wcH`aeIn>E6Ta(BLJnTj_hYR*8*Cbolmmkkv&0b8~b5 zv{lX-Tr9T(CJf73DG96S@SwRR)*sIw9Z%ZNPq#R&T7WLnk_M-yrcQ47Ry#DlUpKAI zqk{aKeftr-zVIGe>*g&sdOUts=6!fVGwX;JKp1#u?pN=R^HiZZ%;W6;#ta-dtZPVU zLPkxwVb9jyG(G4LV>JQqUE6)lOt+Xm{eb2(XY%!_gQ%Y*U|2C}0{LcwhsxXzM@Q`9 zDGa#~;84OX)eT#tHf4G8vR&ihT#e{RCXnb%IhqgA1FAWwIZU5kPWhguX0NCe3x6u% zqM{uF`4bKw#~Aq2Z@*@vZEf38cS#mf{5V!kNi_Y( zAu+Y{pN8()6E6((z=0pAbV|L zKy*Yi7p>kB`@e8v77O$bK|+fo zKGs8=4;f-x`S!R#iS|D$JF~E@$HGR{|E9%FC+d<40T^Bn1slc9EUAfl zJvklBSA^vQ58TeT!BU{wZ~%QPmA7@0HR5T=mQf{TJ5qLv&=%!}5`Js!q?q;LDn4EB zww;#IkDEmbHmIUB1ClN)nd!jba^h|``4phJ#FIg(*ZyjTvm2W+g!60w7_Z+m)8_iy zMc9A^&H6y{hP?6So1apqQ^ShsTR7XwznlnDz-`Y48bRFv$M7%Ek1ZOMm|$gN^ytFZ zx9V)PhkFlGlgGlXe!F>!P0psT6Y{HTWhKp-WAko!4i{hs-QjP`AGto>(9O-m^LMXsOm=QpvMr;c0hJSQ=Ud3onosg&f&*pOPmvsGwV=GmpOB?=59A7Ec68! z*%~Bi$@-g-oT{4o04l}&CAWCgoi%5?7fyYkO@A#j=}>SPDwFsjL3>k6AROw@%wZTLvxI5Lvl?}Gy@)U|7lbLUQ3e_SziOufx}Z(ama< zyLWdR8l%vhd8=%F)39Xltg|0Je7NI$EH!Sunw&YCH^-A7W&V1eu$1xfXQmvLSUDt? 
z&(3}unpl;6iI6*)97N5u$orTu+2C(_Oy@#&X?w%jofbP55<~i zI&^F)s!qDB9$x8cNKN-6Y(#-Kn@ScS5QI0EV~~cGkWOLjDOP3TF$j$__0_0qw^Jq6 zoS03^xBQ6g5H_yoT*QgBXGi$Nu@>%*)lHu_;6bfVy4mJT03TW`8M!Hh5=-;?jT_GN z*EVhLH?_759GPou0{;RGQGk+wo23$qCd7O_@JdFMHMV zoE8MV;O9_o-8wVD)y^W?zMnW$MgyisM~so|bpEZLP&OCP=CF*>wLT9nG)wZ>GG@2M z`1czd<(~~(VljW<&ekSNjtzO*vYAuQA9s((&i>+(vdr@Bg@ks$f2*IG*rDY8L(lOy z?!*o^9@}%b9$O#uXfBe5B`a2(gmh!HGug&%BtV+a+f|AtAx~7I258N5QgoW4aX`2~_E%%1r&G&n@t5NGc@ZbqQ&MJ53$i@^8zi zt6^p1VHc#|;dLS!G4)WM$eB&JjSREOR}jc<*S_ zB<>mq29AQ|Jn*J=<{nQ8i*~3{gsvid4Ud7LkNtK`5Q5p_EtRIlFwqWh&w>w?JbM6% zI*S7n$LG7uzM(vMlTT{bj2W%l_PpZgY36d~+wEHK)q&HkhG&=zc6YdYd2PiKueI%b ze@yW9?hQi~)q8tdA07>qnIvNX5!;79n>%6K{?3U>?FVHV{&;iTd`bOztBTk&d(T~H zJLvY&H4W|U<2}Nxye@}@JV{U8AA5O;*^rCrzUI~U=R7uV^J>X+3;UVB>pg-|qY8rz zusnE$L5h8P8CH|JC%)T}^3c6mv!J44W`&+H1%LRTru?6O{*j@d04Z_pnm^>JQ49%n zjJzL(e5;wA)Ql5tTC60qQ`Bv01}zB`FmJtvf}4sGsan0?U7W=xCMGmfDX%W4r=^j} z81;F#>C%zyGL45kDy^tP2*{B$in}w(^rsnHt;MxuvW$)pZfojeR7p}UscQSye-IG7cUecOC zBH|jM*kxOnJ`mx%>P;_zV&sIm$fgMPlMauEdRw->=FJCuF`tRs&E2hdCXLgpnEQ>X zHNv+KE1PY-fHRr%HP%M)F4zn&*W5O>+X||kOZscJ(_eOSze>V6r98(4r>v&NzIVHs zW3ggZ%DP<_08}aZ+O>2kZrO8!N$`aW24-fTSATQ<+aX~?wtsnC#3J=2utigsGgFVnyZyRMC%+#9FzK&yS;vE?b~t2jZT+xTmcJIhUbap3I`5% z3J4;8Xp5Ojr@l^2hYrm;ljgK$#nWMvr!-l%t<%!uj)sw*mA}TJt&yw1CwYu-C!L2i zaSi`_E5FGUpDy6Jc*9-PpW(0CLbTm5)0PoVOyPwCY&ba+9hR28Q0Rmer4M z)Ym87>Xky94i(*Zo0)C2%2a5LZe-`xR&%Z{f9Ftsy=V94wJ-9*oJ$9$3>#E>zx~ya zwk5p{0g63`g|8W;;XR^v&t{DSx&;1cgjNOaN3_3i1Aa-t;FzbG*~Zbg^yPIW5pq$k zDX8}0WfA4@SNgUFTX%)7de!ph@6`A)x9?as^*eg>o1xz1#!(K*es0El_J#)5*w4J< z`}m`W_PwmbhYyqK(+JRJ;Pq7hEnD=Qvo<#^JTW59e|IlS?W+bey<9C2%V4vod-RozAtWXu-7?J zdGVswh3rQ^uU(#qpNI`Y(g;%fkS^7j-_gK)X)~9#>_Gae~j*Fq6+;RBu;x`8lG30_4_7;sqM<(=z z8r&W>Y?5u`(-T^Zo*b^!@W9uv&r4;&&Mww=xj*i~n&jXc2M=ZyP7HS2u}kaliqY2` zjqk4gk(IVOOLbjESk?CEMN{;Q9S@Bg9{Ald)-T;R^I_`w+0!ihEz9+NdOK#>trLW6 zN^~h&YBWzHnKXbrTw~ogXBce?%${0W4bjg*T#6?5>C+4wKlux+HjD}_yqF30?0FPg zk>t>?)2&1Ehj5d}WRo-xobPA>-_W9)F+_kc9h+THOHPC|M*omCw)|-U*G#I 
zSsHn5m{6<01rq;=|8PVQV|9RH1k2zsAl7%Y93*>NbCf1{E z5fIL1(xeffZ>q*CWA=>HDR?IF&obX$CiqtEjy9W+x88MM$ps&SoLsX%zqRaQPV|h7 zSKQWlljmguFv^#_f2v!RcD8tTeeKkp>ylpjVFw1gtI-*zxgSiP(cjf%YOHdf>!~YS zgq9Cmo72~G{P>%3mz4)UJvqj(X^ZHrq6DAxId^X7EdO~^MY0KgB%db@vHQYWC$k7; zgPZGrYCgI67pq_T+^l z7e%IWq2NP)?3#EfZ*9!m#XHNFyb2o9rdhM&KW8e%4_!0ui;9LpRYs|ej?PZ)_3p1G zPFMZ;E!-+RW5t*UCV#gNIQA>aJ!$_zS2u$^wbfI>7I+x@NT8J6DVQSwlM;SeraET)ZK2it~rrLwsya_TRdh6YNgn!w+|*Evvj#njT0^ z^=^OhQqc837njE`KfPp>=c|~1hP*%I8T7YX>_L(p znJd>(2sIJyAyGh#dU1wR_<>_P6u-SQbpCL2-t*NCGx|1Mp>pA92i>46$M)^p zADgBTYVE=pi(e%^6&`vnJFfdgMg}ZY=~1Apu=&%n@_Ck{|Gwr{ptW??`^P%>DsJ5> zPp!#$I$YoOL8H?XgP&ap3$uFsn4VjtU$Kt;N*_Gf!LU^7E##OOcZ#N0Y&)U#;Jjr* z?DwaCEEH!%t7y%gXrh@nF+%w(4*TSZ?`msRyq=o+T*>$oR72PI+}P3K;6zmMu~!|t zwYdfED9|rV8t%bt8Y;K89GJ%{rb;Z1_e9kw%pl%+38n$Dy}nP&+6unQ0}=a zWff34uk7!x#P$%>)X{fH?MR1%Rv&Gmb8jvD;-dFr^@YG!6Nf3aCc*y1;P9~>KJGZ{ z?XhQXOaI#EmLCTk(H~>cS*2xp&Nel>rD2wx{dZ4!wdec}MZF_U{@$##^0x^G_4*5H z$s>;|PQIP|~MQ+%bI(3 z(K)x{LPD-@)9I&@SNrPL9KTyRjW7SS>6opl85(vkWe5n^6W_S+iOH*i0&j#>9=Fmr zY28V&t~699tLgQ-9rC7*YI^%%kEGR~#_UvjRnunlwg9!4$IWxsRzKIeJk~Mk%8Tvh z3^qXH;&m_MG8GE+RYURb6Fge|a;TF-%VuqY7G@Pih7M|S#Jl0T-_wW0=;>)~unn8C z=$^#`ZO_YnI{g^7_V-x5!FmP7Mr~uR9@#si|F!JwMj&2ITXh@(wJIeN>Mi72D*Q)+ zyvAaB7p1r6C�Qhc;qNyGKGeXrDq+EG#UDRDWLF(6@vUib?V(ja-yT)WXSWfVF#P94;GNYV z_xN-D8brpZ*+kLlfSR}M+<81V3Ms?YVf|Wk`&5(h{NiBOJyvsVBRytTEg8f)2k1be z%?()vIK~N&-4+}?Hmr7Ls}@RktAhfkN7xjV_-zS1m>G0&;}KKK@iP}Jcy;^Y39WPK z?ece@=^q)HrL?xZ$>e3>Hi0pXn)%t@ol1&OA&gp)!?@!`uC{p$y7STF!y z=1VF(igs1i`cO%KgLf5$Fte{00>#v&DW9Iu>)wjx%PlP~EMKUpp|v~EKYFBQ%RjSx z#+)p-w|JJiszd8p{;jnd4q5YUeN)H9F#(6|PbW<5pYgfbxxf}_-=mfs{`E5_IQ-J1 zcdDC=dmw?p8mH&+Xb2?=_@{(4upm$ji5ZMOe#e5ks6eNtI<&`T}j4!RE zXoTXUbEmL*ea_gfQ`%j2eVOqNEi@=fE2N=wv{*eM8JN*Ho{M~c-9GZ@az)k7!-hSt zpH|pl9j^(;n9Ls(Kd4MKn^2Y5JJe9^?!eNJtLMYRy|>jonOW-mIHQYCZPaSF!#_Hw zJKd#7WWLpD37VTvbs)JfWPf3DK|JJQd#Qtu>CgIJTRUpDJL_4uzw=O=J>S=S3T(ec zixARv&o~<42h*=(H9>wR(5`>F6X zzzK`r95SVFR?jm#-L~p}uZck$ejPK*S~`A0`=?&Iy3WHl`v;dT@%8NwJln@Ye| 
zwl5_Epk1^MFi&V{hCi#UltJsu0*y?DMIekCu^eRB@9QjfR*66tyC*xmx^Iet1|a}+;NxJ2NEvjG%{7o|Dx%_y zi>n#`!`AxI5z;cmZiir^hrbsCHYGIy(T*)VoZ9S`o9%V(xkW$J{o=q|A!#2-hSRXq zEA)I;S)+S6q$R0$t`*_i)qLmGX|VZ{^B)E-s^01TqenB)L<>3eNmMNkSh8N8iwGmJUbK~cKdU>PM z&}EzbUC#{Mbx^xiwy`Bq2E5?Ru@>oE?>nuniuTiie`dwlI*<)B4C}Onq%SItQUO4j zk0KiHM|)U!Bh%j$VK?}}WZ{gA=vl1LBy7T&>e2_c>!!b{)k^H4H#uOlzGr`}H7ZTV z+}f*U`qd#d^3t7mI#$-o6QVLQhTO0JY%uj|+EawbqFtdpw{;-1E`!l>S!nFV?SZdI zMv<6&xYB4>euu2TQfNScE8)(N9{Z7M&#oh^voJuOPtuGg|MTU;hY^HjB0U=8R^A#} zHR9KexChjH&NiZF1_IL%*Qj|LQwyT6WSk7LozS9#AzK{!twXf`p}+3LpX=5RuXqSm zSzjHOA?g|GDvtqq+5wj`H#8_{X3*Q}_JBl>ehd7%M-jLzf=CJJ$t$tY7lARDEcaz7 zj#hz1#hM7jS9CFUnc?41o8IkOb3a|*O}{$uPFHRNbj+Q2C7>8@{zzA1dAyAv}RGdMj- zOyibIqz6h`bn4e={E5Ze>ug`?fS8M=KZgQc>;V{5)$fN5TdPL-LJR}Nhzy?KjAgj) zMwV%qQSsSrtip|~#$90ZXEF(i(O)BPCqubWFp*w;FCK{SFfN+%>gr%-RDtgy%UOmg z$7rpPw#!MDMzZ>#YnWZgsDQaYKWy4jpON%zc%LyvyP7s_xibWqLN{(tyMp%U{L8u! z?4~)SL6&n!NcuUmGz$?UNrJdE*c2YEn>QNHr(s7{J93Do;uTLtbMWA=msd{1Qz!x% zd2q(Y>J5lAfUpqzBBMt^(j_wWv`xG8!9$FVKU3KjMw|$6JW)m^#{AtYgc@K9?k$SA z7-M8#s1KokjxmIeS?Q?fkP{s<8xo^#81(GPql;M`Gs@IzleCYP;6%7VeSRHsN$qgw z?)!p`vH`cUeSW7Yy6<}3K59ENj8Otw#@ZKty>!NtI*SRYGJu#Q9Y%{lqRKekx~WKl>*0)CrcMx1M)>*H|mX$PC@INvVDB`G%;OUZ$>> zF}(0n&;IR5l?~xvrH@x<;bI z%=r%AACDvxo}5cKI!>`xGR?txCa*1@raxpSxBw4G{oZ&FMnZ%Wi9BUWq3fNm2mr}S z9CUE7qQb9T%6j2Z_QB2S$`Tb-dgS;kEB}tJxP3>Z&F9e?N(IYxbS8h8HKXr>xz*ay z)BAULP7-~K^44zE2V3mWZ_M(?zd^73<@@({*$!;`uv7mI$vWF7ps-r*wf#UB$;q^i z&@wPcyty{2*p=zs>We!+3uE*SC-_T_KRJTIO9sKCVBKgIkdVp&Bm=83nuHiWIWwig z=J=$dArBtr)4-WW$Ikq7*;q^_@HZP-yP!2r|J_K07`w=xdy!2g5dG-1L+q$n`=`Z5 zBhsC|Fu% z?b~CXvp)VIWi-|u)uYw8ar2;(z5w#k+nsL8C*IYs0O3r2W2>pXgF-Jx^hA zsaC`9;iH2W{{@+eqEEbe)RA3vC!TRx^-lZiQr%u1(jJSeelR{YTO@~Q5 z>V@7|QnIk?26ypr@71n-6{( z?-Xl~R#suX&czw)933qY*CACv{`tUrW_Vg{k#e;ogNK~m7j<7e*KgJ@#@ zO?i?LXUNUWT~*G;@_Jij$elYE228QBnU?0Q$4L4J{!nBj<=yP_a#=xm!j#j0&Ub$F z=vnDP%bKr=Zk8>VE{ariYuTn*y3+*gJf#`yha;$YTiR{$alN_kXN*g_%!Vnc>xoIZ zZeOMXy;(*V&slwE)=(&DAlXay-6nc+tVz`|aajFk$Qse5xaj+xN=feY(%qx|f2Q(D 
zk-Z-024{7R?ZXU@7=(Ptrwh*XTixB&A6*zQBy!>V)wc%owxaeF?Eo8|7Vy-CatVnUV2l{Ev$zeqUDX{OaiZ+hX+JdV5Qp zHg#FwF8!qLOyxn=cA_N!v10BC1+N!BpBXD4S0($S*yVrhxJ1cYp4Gi7O_TVflip#EIIvqbB0WCyfrw!O8#1w}9q1D64VpsHMt(s)1$SM&H z*V?+7BRYPB?Uq?oG^iw7jfDV3FN_wJt%Pi+(9H)t{D@VTt@?OdbsZnC;yoPVh5S&X z8IHESmfmH$0Cg?t4d{OE>kOx%u2V=S`D}qT@7t zVjVK)ZqGDt(?TV~R{@PFJOJ%ftFqfzstDcPsCj45d{c3;r_{wh6h-PnENaMpeM+a_0Y0=$xaaw)FB4rBv> zZ#76bt~Pw#noYWaHriVT{yabbL}+Px@1B@0v0p@%T=C(SAw(C$hUh*vV0R$MooI2< zF0&0zK``1oQKx@@#;7!2zy6Qo`>8#5Blw>G<-XzQ?Llw6 zoBaNV+pO)@5&JE}OjPs2HXS>*l8t_n?YxO2vtlhzsA+TG5DMyrnja&A?P6Md(I%y7SDQElGb z_&ehY&{IbrIR4r?yx;@ksE+`G zgc|wrNT~{WW0*^34!Dl`kpUy;@~$$+yHD@lXWCg@a_rtK_z2sLO-I47LC1Vs07~9LKQW)nE%oS0o zE17G*V(|~4!fHHJy&D{pxV&*KP}brIc8g!MM>%;R#GHxG`1N}z50`Q3%5~#EMyV{A z5l;(X6$g@xQW$3^Lnl3?nlIrNEtf-RnBd-9p6u6 z43BYWl9Z%j^7DiDgrh58*H_fc@{JhPI+`<@7OyYtwn(a2#mtbqj-Lx7uyj>OC}r%u zqi4F8c|yUpYo~X8FUOXmhcP=~9EzO3pp^k_ZVtL@l8w4doN;@NN@^Q7vCD{?v&E(i z(;~4bE5qMR?c>}D{qY;`JNk3CyqN-{1U(V~mGYBUQ+gIu(;{cXa~9SzR_@fK-tQfZ z#!lF#m=X}8GRL}g>jCqF_D75x+qP+8Tt<+?)~cj+Up^@huMHg6)1vkJk7Lq(PSXv9 z+FaH?-<*pGAn!(M2!K8=q#eYSm=J;VtH;$R9cRkvG|~dlU8EuUDYkw{WD6Ob_BVZo zGJ_tn(XV0!5-Ej#hsEFruw;(mIo&u$RdwXN$YUcKw@FXi{C!dGfWG}KjXrE{5jvcs zFhigf23hQ25-(FcU%y^ZuBlZ?RCC4#N181UiBUHfgqAK2JNdau`%swAxrb-Rb1}yaU~A+%K(EY1mMu_0w+K z1J*4(gt}Az==vw|j;*fGJCPTyx&5Ql6$HIFCsi98;cAv4+Z^O0!V3sAyp2$A#oarP zZ2iS-k#{)I-@h52A7kA>M0S zmu8{X^Ac@~-VHSlzOPzxO7V3?#qu}#n(cL+#syc6YNtM+<(4;v{U#gZhPPStS`V`i z`@|>k>htvExKj~&OjJtA*8cC5==ZdLzlg54umNa^EkCm;lk9*)n@|J>+K*o|eP6Mx)~etui5c1xs;?nbPH9z5BMi zTdHCD>{p#-{D%Vze>=|F*?xwzAPzF6NT}VFy`uIghwMFl#3$-r4JHY^U^1vAbxw4V zK}1BnPh!fSxK--(tjx|kjsp-1kox>z*BJw+QNeiR-}P{G)EgQ7qVp!SA(;)}kUry!jFPnL&_McuxAJ=;&&N z6Z5>xXwt}0@j8-3y)*Cm>2XS~my@&ZC=B}*`-J*5I6VSHq%F*n1Jsjf~LyLC_Hpm!wU<82T$&7 zWYs4@Ss6?YgYG5_b%V>wW(4|$IO(?AsI%G7w^sSD+^ELW*;xD$C^*&j>)2Iu^0hZB z{*=~Vy!hes-4lHyKMgjbSF^Gj($CX2 zZC%^H>AgbBoCs}EKVUAcn%UiKX+@h&ey0SyA2WDc z&MzkNYQj@f8(%<|&ivOF3JR}4nN9F@CcjJVqd9r%o_*%F?Gik7&(E;7o?X)4``JeI 
zWSn^2O-(z~{&N=4h|VGA1S^)ZR-blb9=(`xxv%w}p5qk%I%u}3Yu|CVEgB*Q6N3PZ z$Q2SE5DkYFMV(j5?;YIn=FO>Cbq@FFOv~1^!TA1L7*DGe@0@PC|C*=~{Ogwv{Rj+}$ny<}T%s?K)GRg{`uv z3g5KJrCf7&;IR88EL1f~bfX_oW*mOOS zAGNhnZm06tXDu?mmRnAkja@O;s%Q^pyaWgLkaoX{T$!b_^6iG?r%n5qJB@6}Xvr(o zX4#NdC`?Dpna_Z})!l+r{5rb(qW$Ol|8QVz34LAR!Ftszx|?0g9|Kci)b>*?DM;3a1`t zxlveaZ~|bSfuCXuD?XJY1&R<0yTtGi)588lFrRmbP4L5H zf5*g8V5`K^4_NtjTIH#|DLRt5M?4lP)s0SZ*N8^Drs2C48WDw%1VrODJI!52mHmN` zJffJlJ9MeT#Y4E}Wrv5Z`tkDuC;eVq6&dBIX=OD7fv{o6mNZGI$Nwh_Lre+F9Z%9W z-C7p}zofwY%OqYdNpy;qyiELYphlGIO$ps_G`T*xI^ywlp#v<()Ln?Fv@OIhV3oQrRv-{Doq*k zNZCAxV&>?uwcg~Nl1ohF+hN$z5c$J%-xRCdzP)|ii=yf+7mPeLR#Z2gdo|$7chtP4 zh(>T;lOSYqaB_N^l<-M}^1VVv5QfC)(tXxD;iwz2vU(wb$cZPOb9CjkT0vbw)ECH#jsoJt6SQ zglYR<##k3^6_@>=pUdfXWTWIxn^IPzeAy48E3+|pQ*&iXw8Z=ojLja#63KaB9=8h* zf&}6JM@jxu^#1Lz*VNYHfD*?IF2(cIrkt0u^}&_{PYA=zt(QA+*W`06Q44EnJwLnk z^ht{q!{3y)ZQWUUrefltGs= zmKtEH!IGcfmO?^FnxJ6HB-5<peWak>>pb*@_+$!MB1?_#o2g;=+m7u3kp8?>Yv+i)&C?I_Z0u2u__$(Jms`L5v^)U_@RKI_wZy2XqtoRn0s5vw8z>?BCvz|i>qrgwrva|Pzg zH*f9tVM%$8{R+E#F8YKkwMciG6`iy|xvq_9#pxEY{SmS{X3``@uzXDRtraiSBJzRF zWB7a~sT-giFW8-Gjq)B$E01Zd{`Kj3MDIKgs6M@~^Y@$??HbeR93;7{S19>xa&ZC= z9Xh1x&JfD2lzHc9eI#UC0mI3_3 zzPdbkf<2ct$OB?lN$fd)&k@jY*xRIjmozkdFcFs@DL24lunq76T51=V3TGgS4^sA8 z5`#r}d&3PN2wY3#(!tZbZshs>({N|Yp|dwk^71Tcy=KkknNtdSeM4oir*UTYs9X?} z)o+gbw>r=r6c0vH_3aL{QySjS{@Qdum~q15{2et4_hW{SdPP<7iF7B+dmA83(KpOv z3GBD}_4U1utA6&uqet(O!69iO;E`CS73z zyvhS+9lqdP%f5YkD;;vXmRrjIKKbw9gRb4Wh2LBXaCmM}VNWXfKHAE21{#L56#(A# z>HKXNcLvY!ADQV>G#E)6dUq+S4qGb5-+cG& zam6jeHoC2kQ(W&^s=V%}l2*6e8i@!&#_=j_)sxJJ4^y4uc3Zr|Q7@lW+2FQm~q}(YK3^j*i4- zpr?2rWmJi?v7N-J!S6nQ_wEj7?-+ag`{R>~t80_fWyIqgtBI+H$QAueBt-%KC)q<9 zwK3^h96fk8ra;exqTv9r%n-QE8$`T^PRt?P2WFBU-I?s z`iu_kTgC!4?pgY!Mt3VSPQ9hQklr6pvilyl?vssqC}NysCm1#KGBGG*Mb75nnsIqe zebSEbN z>$=e4gg)%kt|f4S=YEBo$n;3DWwryu00?Rk)2Jj0kaW=r4h4%A8g8QY{$=xHMy>6U z=umL9(&yDXHY0+ZZjcvzazU2zDh1q&R46#2VHc5-jK!2J*=ooOvUtb-|AY$GR)YPt zJ9&AR=dVbQX?6LJQ_)#Y*I1Q=9^IqNIVIutIHhFn2T!LteJEY}YeW=`Zb-1Pm#;v( 
zsW8&8Yo9)W#4PI(NGSVKc%sV=mj^T4q6s0{1o+B*IG{Jz&um9@5|MX5KQ?RJIH0u0 zfmSv)$F$m?oD?wZbCV`pBtu$)zbWCZg6=C9lU@v;#}MHbM1_z%(2s3jaTJy$M*(Yy0<`B$XnWl9@7;GNlrQN-ASXC`v+;sK^i@O+w0$ znG{Kq3K2=El)0oyqDUwW%aAGbey+9t$A0&I_ulI`o^?EyhWq~A*KnTS^E+q_*9;ox zy!w&!3ulDH+v{A}pRdk@Ov)17n443fL zp+3j)AF`o>r7zrjkZI9?rRFcR_5E_N8j95ZsHjn?>Je!Pv>jt{hG(RKdy>(}$q zZBdjlf-o~P8@;?cb-SP?oZ0+)B_(I&*tJXl`09B}*>CCE&=_lBdsr_|JBAG4hGE)FrBVt!wmgB3>3`E{da6wBac2~C5Q z8{l}9NgQr@oao#8tcs-AW1v^x&_A=^usPY?=2$n4KLIiiGFyk3a(RALNb!deWpI{R zt3M%pF*VMpT~F7+`4|eev%t3L-Mc&r49v570YBn3or1L!(??kC(R4%z97Op~^DANv zMCKdXk&^yrtVUSvNK&|MZ4xFtQOui`yt(wIp?b37 zbIg&b&fyaG=dNIj&msj;#4lhuFc{Y)3X`W~sGgD2B4&_{6*@c4iHVBUI{O$4;LCF6 z>FFDq#S|B?i7jkQ>B!8M5fyHbkLiL$dK7jQteMh#HaB{xsm1u^2Xy(nAiv_8=DNDU zf0N2S-hcSQc;&mvqC1VOzspt&pseI{5%n>;Ud-V{kOIxqGU#IxqCDrPDZwvC3oHoU z!-mD#yoo4L8`SRhPt$c{&+oj;m|L_~!ug=#taeK4ewA3M7v<%n0gjo&88H!&cWvLp z?L1=~yhX%+0*qovlKAzffGeOhi^2)VQ@2zVnAU7CykWBQ@`Azd527yqmj;PI405Ln zeJUHnYOq_2>E7RKo^wu9WYJ-ll$Y7X>_c!SWc1hoIQ@@|9;B6~kf&gp2AU?akc8W^mR9Hg5uv-5qS6i(&i1q_g!j52 z5DCj$U?o|$>xgTrt)RqEeUB))WKrCSCnl!V_#cSe9*%YS)L9(j%%~5rH1JAzCdLE$-HuttFTJNY=;px)3Amp+%QbCx zwuqPmWIZKHYd~vz;P*R{$y8dJ9`r8ZWL`WK+T8yk*>3XvbHSoR)HW@B73i?rYGBV`1~N-lBID{-{(SGf`(TWnSS$Qr&58u&^O;V61 z!;zhZGmLzdHVD_lf0G`3Xw9%>e_3uUB{Jm%F&FezdGm0k?}{ZU^C^-(-in;LYNV>_ z2j$Tct30L|z719=OzZ=QCj4`R0s4p@3u8*59tgWRC?}fN0d1FLr0#%bbk+UXAkuBgGelhOs&UMG+Cf2j?Ae&lv zbW&AO1C^p&vUm>FaI|l5mtKf(?&(T%A*a#&TJG7kR#qHRul65o-EmSh`4oUJTT6_ zoO$WHmXn7o2!vJW*I(AdHr_< z=6XE+IQ7ia&+nt6uB@pFFE2kBaTGp`@$A{s9MLFG=#$R{nw;!5pi8e&J3>N+*9uCi zAo;@qTE_1-{WwqKsb`u-zY0wFSPk=ap5C@YovmLg7cw3?6sLJvtDdTAeE^p=^D>PI z7vDb#m08i=W%GzZj@qcSMW!Gp+tAUY3(Ir%MWku!_d9PL`{T!vrA}wweXXv4{=>NUovgRxQltJhW0W~tWo8T ze4K2dZ3|O9obKa6LI*g`1XVLBNSQu#kYitB6%e30adL}51N@n0NQdVd_?w>LgbDil zX8`p_;EcmDJ9h$0eZku5gmA&!yz|+vPyP}TPI2ZYQ>~Shd}iD9Y^ds@r8QV46Uv8> zrtyf+I$sa;H+`Rcp`=arG1!~c!i5`+w^_)B<{3c`5cK>*I+m73t_9qPLk>iY?kq;l zyby6$VLE1!I9as`|5=0Hy?e>Zy5@8Kk5Jz=RL916_7lh8&i&M7HO;jxo}3CgH|x%W 
z3lVts3Lpf*g{N_XVQA}dZZFK4D95l@nZNq*;ls{XyLqq0&hAs2a5cm$?96rRh=?>h z!85(!x+^S5dhiqVL0)C3tER?UiA#Y#2XI-~H8T~Z${X5ylG69?D^~bwE%83LXj~VP z%#cTRdG~ntl-Qe(zSbp~f4^dK=1lV3>C2bP&N{vtB*7R@s86}w=9rC3P|}?{B&0>) zWtbS^$+O_bNsHK`1x?X*y)#mS8BW|pnaYQerY)P~K2(LCvEV+m`K?ashlq@F#CUk)_hmI>(rxaAf z&=Qmq)Z26VwFilXA75wrE8&hZFnsc<2Za1g!%mOOFYmUtl0t;agp1gKfL|}Q58ILh z<7esSll8ml0-*O{=&wt0402V#2P51id*g`1mv`vYGZ{h=C%v$M5}-a8QsVj8IuY%RGp?1(mKtl)oXi9Gu_dW}Y15|A9arE_2x%h_nY$&EY=k_OB9bbB% zQ35!vhnJ_K!Oegj4Sch9|1HVkDWgJ7hKM1{^;f>Tx!QpR%xr9WiopXfZ4mx8qL=ae zwGL$>zRC3O<0pMBG#0pnk4|v;!i85qj1D(ge@a-+mLE2W@t-#&q=c>%O}oR?x?V85xh2bp3s(F9V#zQcz}{DuP=W{Yf6aBc@3dZUX&?n-leppY1Hh&vtggxmbinQ^b}tbQRWJ0661yZq?M( zX!s4)>YTT?L;TCLxi3Mw1BFi&R!zcf55DPyjsFa!(l-Qje#^_dlbaieg{H6#7L^uc z2PUT--3(@&wMh_v5>38v8=z+@KWrS+bw9IFP-BsIiKD{Qq%t83yNyun2V6J;ZUZ&T$6pH;`WKOX!cC?xtJJf5)|%_$)R)T z=ItuAF4U!7kjNm}HGNph&CTcrD~#QM6v#C|W#gg&*xW`U;T&J8s+=iav(XB6J?@1}2|(?%$`O>%?7P zL9<7fNZAd+mGheyJ>A=;xm&T@gcU0`#I~=}6nUC_a@5G^+bB>m!N&AR33F2fUj%f3 z*%UH;PO05$$%4v* zO`)J5Q>9|EC100GK`m03VbG|5Pk8DweE9Gw2_;3B`lY4A^->3Q+1IzUwR?s=>h`Xn zm>*hxL9wR_&7r}P_Zi3loF-4$-qYO6f1fCfoEaEI-kzt60sx}r8pTQT;;p&%Rk|my z^{&qDVD4GB)hOvYw9D~@`S?%(D``IDR zW)`GVuI`pvu{42#lniHVGh^e_rXw zB^QMsX7Cts6>`B;W2*`sZy{_4xu9%M2aK86S=6lan`kr+Kuv+Ny(=tCfrgsfSaBG( zM0WyW*Xw-mh7Ok|-wW=bug^k_%MMedI$ig8ktd=yI3UrnBJAn&OR_zqVLT*(S`xi8 z*&qV=C@d_rQ#WDCl}XTH((Itgs`Vk+MwZt+Yvs~>n zgyG$HseAMQM#J!th}--tF-!7i4cF{2eR|rKpEAsOGNv}0#(3=`_=7q_Mo#WR$<}(^ z0%^xwA?n672mhQS1f>GwB`mM59BsCuBnR308OkP69X<084RtZB1j+hkXb>vB12QOlcRM>?C=)>JMGnz`A7w{PEm{L-A+Pw)jlcy89mKh|n{cCwlnTD_-bXxZAp z=O8ixVCn`k0um-qOb_R8J$GSqC1<6O5jHj1*BaU+?&#QGgyI@Pe6(+VuW6Y{QQB2W z$@^i${(7sVL${E{GM*I=jKGJ{+%*<4W*i=zoD?D=zB{3W5826V^Q|5|p3WkL(p_<*Lr)?P2NZ>O1FyFKKr0*56?x+c8LJ|*tl42b-3w}BHx0O0q15-O$^rY z$i3oHW16;3*0*IxT$FUL!**rmZOz+~*9@5Sz3XlfdKN4N)qE@rYJT&y@*QUlD#aloBjmh4cm~hRZ1OdVoM*I+- z9uZ(SgA!ZAz)oXTkCU@Q$RKAZMwGq9&E92e8z zkP)-sus}=Ekcx~t#5BPw=jE_}dBX5U7}x{m!5k8_DLt=cVSA%(KJMJRIy*b)(7oE~ 
z+720;`?Nk%Yw8GTO>mdldNteHS{-6{3YTXg+(a17Sf_oA0*NaX9NO%u+>b&4$T`50*YtI#-GG4*O-qTL`vd#S=>+Wx)lB|3(@XkHK>c-t?`Z~YFT zp(Y|;1ia&`nRL8Dz4d$ZurXu0lJ~@t(Ss!o=k(^8nn)52-p!Ouv^CEFe+mKcJSz)co4{E)I zZe{QoI4&@IU;=D3ep|>GuIOSR#~iA|WlRpaLo$l40nmtUipxUK3jf^9X6*3MTS^Tl8By|-JcS2aF9>cmvXqg;1__NdLj<~qxhv$dQ-A!(YR=sK z%aEauu}}6*g?$aH?mg}iS!ZZyl5o#UA zj?Hb$$e31=ZlQKAc7{dZuW5HvZQ`dyD|VI0u-`NFOxIS;k=;MqUhnzr`Lt;fk!pX} ztQ`8yZj27O@vkauFFte0{aLqeshheHL}HnO1#hpvDBsyQ)nw3DU$6e(z73Xluo-%( z%TBNPCLbO2-#YyMZsi-w^XI&2v1wT_URE6wCvi| zvr=$wk|HLzbr1jZ?s4>djSt1L7$GKw|JcaXZQt$98V4;t`F{GQ^X>V|2@KQ!wf+A{ zNQAJ+`3VLmc=-HKLATj(_IS$1F%6ZQj_SUeJJ$PbO2OhcTj9hjSB!nR3pk1+n6Mj^KN$sy&Uho&<+e0;sZ!QNyXn^zI)rXm#(}P ze~o}f<^V%9G&*hG+&b}?+AMSPI}8rrU2xYwxpCgZ<=KPgso!ney*ucZ z{YUaKXM$|8gg#NetNQMKwV3BUv+UN$J{+do>yT!*vM2c+Pi?W%i*qmCxp=JN-Pt|L z-drij^?Ld8b$ndl7r(!^%KM+5-bGnQ*z>GmHM~@YdjKt9zs_*R;#d@{hTAdRCuLaw(}j!fV|V-1_#ryQl6?78mtB{Bg7(i&|T1?*;?9PkyHNYVK zRq3Zs|2)=RmUr28-s_Raj^~WZlnJra3h6o|5OT_>6WP0a4H`IQmHWz}earUtyYuXT zh4(gp|LmtPGVHp4+vvOF`0?QtUOs=;^z0ru&etH(c52X&u@{UUDwI*yr$#G@1O*zV=%ze=jE6 zLs#o#;Z*)#QEa6Ku5@o~cmM5>lA@;VQEzUb@p^`tbtjzD;_A{FS|-_i)DE?qw|J!Q zfV{k2WlI(w+1bwGKL35@Xy^(fvPgwl0>E*IZ%WCpc zMF7tJ4;KP+8_#=9ngg$k`3rq0s2-#G6xer^fF9-Nj7H#%u_^e}!4`90U*FnM3#a;f z&Yg2&yK_{Jz0S*GJdLJ4%H6a{w&Q2?!I?(oa&ey7Qb)AoFH0qyJn1>Ihg{dO7k2)> zzFKqZr8wt;2Sp)suiP5p{;{pdN3Cgd(X%U`KAD?DpKohE9BASvQf17|?|VdEZFXL# zZz*fJc%=Qb`1m)e#g}7$EJ+ALPwbQTF==<%;j^ahTeR#0V`AiK7~D>}jTrPaVuVIO zKu5&YlMZS9IS!Mt_hvJt@5uk3K{moRYcq&122OIosRu5ngIsx>Zv*A!Gl;Hk=+K&O7fvph-^m>2DMF zN4Qi2w-iE_w2kYbYig~ow|?fWnA^WBZsG50qBPt7)XvnK>AT-X|7o2jX*qwtj8e;t z#eFG$fPq&wM4Opa&i7tB?Y_VLfUql9uBXJ_JcZu< z>^gIkH*+la9f{XlYpN=fx=Pl$K+b#ziqy|i@7@hQ<=deA=wzpH&e=~+QtR!%Qc+mw zl2tj$`~pAXYr`we%ZA??e(t`l5W6ih*I-)Mqz9;VX}|6T%69uumNdX3wx~LJhs!^W zI3GYEenjUm0>+GhXg@~gPR$#-IG zb}Qes_1k-HmV0UGOrtmUF-FI)u6@7k+UksZsPvo~2ANj|pa7i(H4(f+v;htAc$u&Q zBHc`nYr7gT0E8Sh7quIOJeAMuYnyeg0xUxF%&e^kZQ8U_Z{mzu@6=Ru>F;S&$5(95 z;CjESuebB-D=Z;jwSFluvK=(W^6Zam9vUZ}aB!UPUu>VDQhfhF(4?GzF1wyD2n{w~ 
zbX4(a!J|i`ca?^)$Wxrd3-wa3wDzI8jE}mO+t0G*;L_7p7m{LQYZVN>(Zh^A+Hfdi zc=J%^!(G;{otd=w#y0IAYE2qNMa%CkpLXx;*Y%;MA{2y|T&_EKO~O^{bsZ;q85N)C zZz%7SpZzMsNZU0_;%Q>Sv`&Mx9gZj6NNn8JFI!cmr;5VWH4rM7V;A4esS5`JYLCt@ zuH893)s{>NVOSgcZMo70ld!%v81V8+z(U|YslXMd7r%1^>CRwC5ZHfIdOL`=mO5#H z{iY5$QRIB4z(~gbgGxUDF-*9@H$he8a`2Ijk8W%AO#hwjF zLT=~jIrvHmUOy~|>guKN-U84>q<+^pu%f21pTbw2ZzC)!-$N)MxqXNj_X_gBL;A2> z42|jLM6NVYT;)EQD$(B)bc?5Xd7Q1&I(^KGG%uYz5$N3d{j?FN>Z;IYTqIp@-nO|<)Ei(V+o zhE9A9;w^eY-u>O=ng78uf_(uB3rM+~S>O04(!;ebx#aGfZDVuj?DTk}5{^P@$aA1=-hBAWPNNXQo z@%=}tz<;W%t2YllKIKf;7TdPJ8rGYtHn!JU-Q6u2tH^QBCpKrLH}C3MsiSk+S}tUM ze0SH|-Z#G1yUAFWt55#(wHriQ-EwsS{fnvL?;oG&cLl|pLoM~Y!(w??*H1W~y&CuZ zx=9U=*BFxKZ>aJUqC$bYTVRlndK>9x7L-~53z|;>m<_{M1P|IcP6gctkzgf;QK)jOlK=843!MxD z7;trC5~eqzynIfYEjY)zmf7dilqa9;+%5k8iNJm6cUIiruZCp?-1pxW9|@PGR+#{7 zGZaw}@8nC-j8Ty2?;n@mcxifE`$V1nJvFc98O{oAMb!R8m^^{&L8lx&lAMj-%vVrF zoC1aGAps##Q0fSdcRw3^!6pIOu61hEpFGte#^=BFHyl|%D<|jm{b7!ekF`~7no()T zI3j=*b_yW6PtyzNl{hryAqBFa6@V>XjU|ZC;fj*Eq$1O(-JwH=W#m__XqXq7HbS>v}tdAT! 
zrc=Jv&n?xg`Oi=1=+Ztt)DCV+_X)5(b79yn)u*0ykTw~sytv^>%ee-ED*`o^=wYGLfDJzoHWz9^o}}y3m=4fh&RZYwH4p|% z6DE*}?wxZJCYUfcX|Vtd6R!GbQW77GAq<+y8q^@QKjdIGKW*P_#ICgtj9{5(Ky2>i z?*7r&VEwes!|)O^E*wfqHx$GiZ^YzSk?-8SJHa~&t_J*%GjsL2_C9LCCtUx1hlREt z=LY>a*W=5%r@XEM)Cz0|mjjEsJ&fTfjVuO_MTyN34s9->(ix2_kC<1B!3HEnm*izL zCLnd~Ks=YDV_{sENxzd*rkCh(GLJiZqHGw0hTy1=M6Bo1(f+e~%=qzl;b=3AT5zKP zrZ&7oVoh`MOoMDT_jag?tN&A}s?r(1Wro$oJu)2XyA~v94p_5t2*ZG9@UH^yU0`|v zA9a3(I`lILeT*Fxj_S64^FQOHgzbTpxzYLWYCT@sT}YU=PQ$Gs*=8xpdN9y4Dz{&6 z9Snh{8}g>8^ofW4RVPj~YW{UzRfTNFL*?ZQL2!14goLDothKw87=BaGRXMe#=M=0i zFF)(JqSm8f&f!F>@ihp<>kw{1wox+ z403Wq0pAK!HRSt`te>ma)Sp52;9Uf@@7^>sX^<6+^I;_@u=_~sJ;c}rpJH@u#q>`#xx1vDL| zpId4aDTyBs>^c}|_qw`z3Yx+cZ=c$YW-^*5Ri_?n?MLVsB?XA3Bt=$6&ea5RvUg0ta)QRsX0dTEu?7CcOv|hk@_!!~J{r>g8O#)o;bbfHt#THfBbI&m_w zDfIh++?P3}rH8PE{1ANgOYno#V;#x}ff95=`GUq`>ks9YkDp%&={*5JAwdot5baSC#zQH5wNz^-jGGQ?dT!RWaVKa@g^YcUSAYx$208K<6A*6%7 zozSug7k69=EX*y|EdbBgE%U4f8uUZk zH$J$_WcdMpQcs>NUENvJZdc>S8xN+$_=~cD0k*<53q}C~b=#N7Xfh-!>UOECx02F^ z_LPe3UGNO1TVzI0gMplSRY%=yR?t~h%VV{S{~PA*Ps?|m>-duzifsk}-f{F*Y0fFT z)bIO_ABQTWq;QB_*X@F6hiT5vn~j`yZdEm`TwGD^-`j`5cCELdsly|be@O2a#!;rJ zaVx_Ok5;eg8Fua#j?pet$2mz~v{mzs>#Qu%oH@!jR>ytluX7eBr>m+9Co6C}Xg(4~ zPSuEbOSbNPU+W*|tkcx=;P|e27G}!FS#2T{7OIkj<#mz0$~}~j5&qfRbFGHN`3;&~ zL-VEw9CP&GJmwUAbjsj3Op>*vhF89PSyz(qt1g^6ic6S)O1{qj9?C*JfqdlA`%h_3|#et_%Ydh{30tG zab%&%!iCSIqn?cN+BSA+mV`)XqaQ>8wUQo_NH5ZHTPjAoYA0_Pb?{KI*n=Z9X2z5% zKXz;+H-SPqa_|S70^m@jU?n4_e@r#Od+(N%q}sN^=A-5-(`%u}SzX=K^n{VPa9kg` zcK!O5l%2v?1S8QM)WL<<*HpRb?k#gmrEWC8H6>=MqO$TmzFyuSm3$#+c(?EY7D_=r zCwyB*6@llo3r?|~hOOwosy(e{`#b)7w-dqQ|A{B5a7yLIm_#*Y^~HV^M!U3YYEdiv(D+r3!10Vo-n?H1m>8@oSI z@Oq)S&1-C(AG-tm^xjUWbM-Iupzme7bWWgac=bwG{iHscGh`qWE5uC!cD;Ykrch~r z0QKnT$d(HlZod&jwD7mEkC=QNXz~h`zVoz4KO zDd?ZJr8jOmPwI0+jV76+*5z8PRE)UMlhgdv)?alKE8c&qi8x}G8t{o1eSq$wQY0W^ zpxO!QM%cuPxS7&_|K2?@vDdWxy&G0GqfVU>U?F=6UtldPrtX&K!2WU3NQBG3X@*x#Yvzx;oDfiYE*;ZJ%6qX_v9N zlaGUJXmDW2^g**{hnjl4P#xr@J7Pq;wO_8RYxuARd_h}D_2B}j2}Ut@uzD1JgD8|s 
zDh$;+#KZ(lLaJhb-re^F->(JoP%x%QIU9$SeGUloGKd1K=7cbZMksI@EuBwV` z(KD<6>&Fm;+3I1ZyX2|)o-RLkC!tI@9A^FYJ+0v<{%_nFgn5v zc#dBx{PYwv7E|1y#*1 zp_{v+A8g!u2&U-ZKfk7V*6lZ{?>)-WSz9k;J~9qYP`WRsifHjFb;dVHU!BtVbo+A8Gr%M1Z`H^8tq&zS49vV)*=1+O&BVzO zn;(zApW)kh%YVj#?_NLauU%S@6!*7&!g9&GQxXzhXlMBTv0hM6&{FTyerM*d3EvaS z9=6n%KSuV7MZ(LFHHh?pi>7>?`|?)r^^H4ze!25}(bwdI5u%Auu<9E0aM<^ zj~i8FLw!w|%N{v`DUx}ZM`UsF5Dq=_YhflQpC*soRLyxQ=yCK?HPzKwphe2alc-Kh zUfh|JbuGcUw7s6>o(_LAZr+}do$_F+o?V%K(6Fw1pJ!(E2<+C~)2Gk#l9qveuZ_&= z`ufCtJ>Sv=*HfgP3_LNkOW>LLdp(UtEg4s;e0scR|JP|p+uO*5JnsEE_kL3UQ5{^o zee5nZmGypo@26zf1&3SP@0-$T;II0viI+}Im&%Tw80slyH}Hg4ho$RV*7hh{Hmb0H zxu4Hs&7@Oh69Ug{X{f)pUZ&&I_x0n=Z~4yZz94)^`bxXZgx10(e{_4k9{plThW*&; zuO(yBhe)=oU2u5835PxdJHLLLxA@N%jlZ3gv^&gR{HKq5*5EsmOOh|;eSiIuHdY+( zLNE-O6uY8-v%WdR#(gV<)(f0DSjZ-@a$bGJ+!q&a!u1xV8ajNsSoU#R<^ysZCUiSy zrnXHuINNwybD?5Zyc{z5lc%Te{i$e1`4!Q|SD1SG< zSI|rt9rkI}R#oY$@WAo$g9mT0fO8_aQNS|7>`cV;zJ5KGPB`KEf%Q`c1DTE%*g zUZK0bc^a=yY20qtw{_(U`Ad15=J!o%(<}|YbZ311PqN!jj7=VMJ^oHevlfi7l^nOi zp-SimL}nG2tq`KKD8mdU$%fiu@+{mu{>Rx`_#>h!j}YuiW>xTsE@WUgyf9S`Kx)gD zJ$@^dl-evE6NH3|ap$pZ*kNFGyptYLsc+v^4X@43ijOg@P2TuvnFy06@9sX<)K5g3 zJ0=tfD-vcN*LhqXH}L5_m$_?OfZOvaH1U9Ng3Ze~-SzopHPW%q|DC-<@yc4OiS0wB z?=g{v1K*DxJBu_M@wLcVQFVkFX!H6Jg+9N&W&rx2fy0N(um=U2#oiMIU#a0&U*a*< zW-NVU;_|$x+?h%2QS{3~>c{LK0;;gSN274;xw?{)!Vx(;wUHwq{mXXpBoNQ3%d9ts zu~R2p1TPZjq?9h4L6|IW=;8DF+h!cV;@6{nv;7;)7O)&4L)UZ&zOqz@eXQrAnZOaD z8Xm{LUm6v~i@KE(V)>pcr?lL7vr@L`TD({S#TTPiv*w~md>YkDsWT2woW1P}> z^XT+mxcCh~SKa4$dek{MZTB8L$Yxky9_KNT>>hj>0~G!LqaD&(SAHHxn*{#AKQQ#d z-u0h@r7twy;O5Ovjx;x@3X316&BQD5+Y7!tus^}C|#6I z5l38%;%38m_Ox$aL9G^ymIrx8UVn>|R#D$Ka<#DO@(gP&1oiF<>%@P zX`|fi(yLDQExj=Nn+>H7_3#AHK-$Kzchhh%h8d$aX;Njmar``#0inBg-TXNFFr+nP zK*);cd~KId-(Q#38EP*(p(F9!ras|>&6#cp-|XvK`fvjBGSNyWC#OZPaWy94pwGc= zYl0&qMe8F>Kp6Z!y?9h{v70=+Rsf}y4UWYlT71F-&iC^04j;aHeoJ5MNkHV#*x#1! 
zVUnoXvT0OtFJ60*TPnTNGvL~a=8OI5;h@RJqyfeCCFE2sn9k z>zeJm-8t3Ab#CVb)RtULs6>sT|HC%-Ub*r`b25f`M5-sz(haiIgw9TV#EnHG291Ze zYg~2TU9e8L$nyH%3^YlFor5w%c`I&2`AmEtTok%@Y_XR%L^Dy_JuziI1|$DrE?t%w z#n|$}fkt9$6WL7y=%UC#%d3o2;2VWHdvX6s&0}XUkjCI<3Mj}PvAgxfERupmrDz+S z8eRwjPR#3nEYKtodqGsjB9nUIdbq>5{+*c-zdvmoK%_8lJ<<1RuQ??8b?Y{Sy&VQW zsDkQdJJTK%?E$pp_igr#R^g;>1ax8n3R^~2vT)Y{=T)ARrtfEoK*f+S4J01V&@!Du zc7yx*6jM`ah)79)o~Wtr*#qkbxuPIQK*7b4LKsHTvg6DKRB#~Td^fl`v(XcArk#ek z{9jt|h<{%Z{$-xHe=t89uBEk-xH~@18NI`lOwUY>l@boE^d0|z3JSXt+E`%2@Z}%O zv{h&(XVTwoV)lko>OI?S(%&C7Y$@fNA10FV z&^s{4d^s-YgObvCVbVbLA(&u%(qRUD`NV`rD~uLw!0Q+drK&sz1a}u9C_tG{K?`kD z-BvWbHZv=pAOIINwIJH(De501Myo3n{a1pDGW6enT;`6yD9)Du{ZEhoDR)K5EB-MF zFcBsF|Lq?Y)m<<+ACLVuu&LMQ&!1rtvw7y*+H=;i``HHUa)MnV?k|^MZPPJx2A{rq zwL5%H;d*d|0}c3##l}Dd$JRcEn|0&$niBJ3OWYJ{tm3UvSy`(bhCIkBLUG^cN$=Az zR#{Jw-$(=)9;skaX#1uoLoaYRHc0%{Tgk~0lz03p@yD;G0 z&!~xyAMdm?_+5)vc)*9GBjjUDoq9(hztlyWAJqGo>2cnJ%3+Jxv0IWfnqIqSmWjFA z^5t`FY=RM$df(_idDREr4iLqfjjzO+^Z7F;cML3L&b~fPBBve~Cm46Sl{tWTyj&_J zf?gtpiJX}Go6DHw`xp0_r_kG6Qr{8)~i=_04KP^{|C$HmcIbs=GhrneQ?iASNQpI zox%QIbLY^56R;9SXG`qj*x@tIb1FR9D7t@*AILGT@8b%q>wzau-r}+f0d-m6Y346q_S4 zPx*-V)@^BNlVt+36zUMfNChcu?=#bsr} z@FvZ#RfoT*uB{D1w}$y|rzusbvkqnoZ)>J^)%EqexF*Jy@$unE@Z@xCEZ(@+!1Il< zekxyuKVpUFudl8mx`oO}(C&F!OP!rNRzH{8h~X4wV}X}0X~9PhW3FP1rnu&h`K(!Y z`Ppo0X=7t!Iz30G8^|h*8D!r6@um}}#*W7;oly3ZPtvnyTg&_xCO_RF3bMpIqRG)> zsM=0KQ&Uq9r(YmJsYr_sOiu*&XF~kh)YKkaLRnB1PbVgZf&V!XB(TT9*4jD{L@b)P zZ6>@&CafLG6y>JoEB76&93DJr z(yci!hNdd+GOhdYVW6b{_TTR(w+#Yle9I7L4}i};0vaP=B4DpD2UAxQygbYpy_ui2 z8nMqJb~$sVw=k6kvXI}>BP?fV`IeqxC?YRyU0va$e%Y^ey%iCGc9-$xvEq0qO4pbvmkG=XGbUq5TrdC`^cK?f(DCEPzhzQ4So|5xGEmWIb!FOA zk)4g~a*n7^G2^?(V+^%cSL@fW-*T{Z;jsJ`sg5cKy|4yHNV9$2x^=gjq7KAJr!1V= z-*Af=WO!roQ>ZqT7p+*KCNee=Wp{!FJ#<4;&)ZmAmy>MZdaJJ#R5I+HjiF-mUh^`06137VOBj_K3=5sJ_&t&kv2ozGH z$%|k>xGy)?MAQV{WY5G{#oK=`Ft;@3$BSD)PzV?)3(?iSmzmq5_Vm%+D=gK-@5348 zC~}$ue7lj%pe_RpRfXCNy2II+m>pu^4OVxcwA3cXTKztodTz{E!e+^!iEgqxAW@t>j%VVE`sI6!T?F$&lJoXxPI{AhoNSwgE??i;cE=02zU!2 
zb2)TC(IP^D0j?McOrO-(1zO~DzbZWyE}BMJ}7 zBtiJXyh5f^ryK%>BMu&vg44q5HL|no11C(xYqL}dt=|i^iYEw2mYSL>tOOuDckSDE z4@u+{IMHX$v}e0wr+=6Eh9k%XdtPti#38ht65^-g;jxFvEvB*Jg!S;yWb^HW=0)u} zgj_NjXkqJs@!hWDx@%&hqhI^9o#;NEmRyifav^=Qgr80u%Ik(SBcvy=E|?+^XZ_5p zv10tyt5*k(8`lkEDMCy=Jw1h0s-wI6Zhn}g1Sds1Ks<5w!8qXe2P_5-`BSAbrLa4&k+F-Sk@mMJmErfYL<=CK0BtwuXB1o0UREH>c# z*6WZl`2@w~=d)~|6rlqI@2bAcUC-l1NuCClZcen|n#bd{hcSZQdgk}))F|vgpwMLs`Dz%&cBJO7# zRE+rb;(Z=HDl3uZ^RXv#$6XA}{`t9u-Fu7iA`a7`A*JD2m;nNHb%p{k9E|+8kfhl z^i3;n?u4*x=T#V~Dn+v`nPAP_%#|$?nEd?yxc^ogVSwYFKaSn_@)`!j$Gi4gG=F{% zu%jb$7fOhNiPKJ1NtXLu3g;@|v33&dBU~#AK6ujbqLDvZI0khP+FnB0>}vk}f_!r{ zBJIfucb(%3d<%)b1LmBXGb@aUY24JDv{RBHy=NCN?CZ(D=w?3NQrAOC_R1RcIVl4OP2K86Bg#Ue7O{* zJ`X*H+D<7{O448A*RNlrF;Kd4E%ftey2{tE~WAxVV(F zkLK)ZYHD; z=OC(E&LP?WqC!fXJ?70R>}#Wtvn(a+wP*2AhZoPE`{%qC5olu+8Hk#4Z*y-l$LHUa z?neq(^2H@3Q={je=G9N?AAWyxjrR=%o>OdXkD0bEU$sge*VmmUgMY)LGn+j-*kte> z%I>UMZ*uoS;Ajz_)JLG}>Nuh^@1?kCzfng|Sy_2>=nLi=)!sK|W~Z5{F0()V#PIsn ztLqlE#D48(f3}95qqzGn#Js?S1Pxx#CHstB*RJVs?omJny!wZC zkSQb|ucnW%cS{fB$wo+kH_{U65rUS+U8CcASf)89|V5CL* z^RJXvtGiCf2m>b+A-EM4yC9Qw<%*85h%KjVgB_ksq=ax+gs%#g;=)~klccnPL586R zHQDqz`m0y0=s+7IfDmwb7AF_jboM$$KaJJEp1afQk~v~UrC>8T>cC?j-u(Hop9~Eg zn%_~g=W$mN=n{aaSt5%hYgzK+1Rsv7cl_dYm0NF0AGJxGdy;F9(8 zx6N=!nRW10z~hxQI4R^KmQAKQM&B?t`r65EHa@>kb#8mLvHXP}U1R`lF%nOvv>mYL zq;~Hu%BekjOxCVN!Jp<2SBLl0p>zPh$u2Ku)9-M){D2c@b;DoTxZG|e)b_0 zHfM59>i@lUNzv$L;1?H3`s#RVIX-*&-eaX$Is~L<(?Eh*ZpY4#lusq}1tEtH^+a(m z&>NtiQP`zfgzHx>sy}si*%&|RQ?;SJ`}BD>r0d^cvKJsJGz!E4j8Nbo!NJm^g>LuH z%#f%7%{&~Q4Eylry@^0})t@%1B<<2V{P5|kSDiV?I3$OR94XJ-M>yq6E2soAeyhf} z)CdZv%%F`Z@v+dAhhfMv`ELHgbr-bO&o8z=P_!$2Vd+&mcR@v8$(`BY|6i~Tv z3ywzmX{M?qtRnTc{?@}c(F;V`WZE=GAd6~GJ((_D@)4|g(y7q_`tEp_eA|>qL~9~S z7O=Q#hqhn6xReeIz`m;kYfaGSqDVh%M@4*x>xM6*ec*E7e%UutUETL}{n9m_j1hz% z&(BOgISPN%)|?lRo6jM=ML`>|Kd z4;$Lq+OC(Kb^5mfcM?~JGP#{(d4$u?btx$+p*RX=$rC*r&K31;ael;lQ?N; zS%?d1zwyxbB@PZMwXxb z?nf`iI6!MthaPCRas6Khialt1T>LPj_A>f-*UIw+Z2Ob_GT{{ymH}d7{6u->mFL6| 
z>@ZU`f@H_R#!u{6ee^H3gjt1!ieX`4ex*8=+Q$4PpDiC)03XlNtnKYY%LDbA+xXLa zpR~2~pw|SH{bS44UO#E5vfh6gzxCo{JXE}@v)a9D<4hqD-o($(fTNNFBKPg0E6-PP z3XE}1`UVd6RG!)gGZYa@jq#x**CvH8J!Hu2IdvbMttc+;4xjaWeEh#R%~@`m`hFRvNaR4FcJUI& zhW!0Y5*%X^akTQeEM2oE94v&Dt37`FYfR~tCwQw;Fe_57Qe}W&oM9``afv*_Wo6@o z004pM0!1A6z#O_%{1Lk_haize#?{poS;!8P!NS^Hc$wf5;^OW;L{(Laei4*&sk#M1 zXC`3Txw%qwPNKmw8Qj&@&W^FM1TpRJX6inu3ZaYGf=7>#ILw~GegKQFy0K9qW#OKQ8=6k@3;9|BNpq@W zwEmQSdfGT4PAEqsaO~Kzf!*cy9XTRHQ7MWOD$sujK!H)qlPoMM+QB80Q&ly9<^Je- zI?F*6U@k6MjW4brUHg1|5XTCqp|Jj-Bplbrlu|tp<6wT#ogeDpFXGz1cW(+!8h(Ub zRa7W4=gpbh{9$|yz%Ua^fg{F`1;O5hi^o(p1_y{hoN=P?r|;&$UC%yF}$6Npz5gx!4fbgv#fYWn|lC{l2XeMyK|1`Pn%VhD$ zY1v?^DE8^o3G^G&lrAJTXVKP(N&`HGsElzXKX9K8WtBJ*QzO;@z*|?ofm$f+mG0q( zcZpgO0fiie=j69wt;mq49Ya1*<7(Jk6iLthCvTPS*s&uJXlJg1BZhwZ43z^RMDECW zT}=ZeII~nrjxH`cA=xYwC_mK^ukC9~%NXqbZ|3~cMF7*0O;di`Z{n4~{b0O9ES4xx zH);LxV<6f99_L1GskJ83#~eNzS!Xp#fB_2cEr$W#*yO~jj{e;;wQ=`*X z@pBh0+=(>Pk+fNX(cS)4uK_eRZo1C2>Cs;3~1Q(OJSjk4~stO^xWa(`tF5Eq=+3_^0jNCABpKrW0k_M7YP8AgP`&YZ+NS9 zd#+2GgJ@~E9Z5u;LX!Z~R~C$esn0+iot|8eri{)QDPwoCmp(D<)i|CH!4jU|o0hPK zlsH$;CnU5F$+^Q!i#uyH;bP((5uY5LHBbfjQV0We)@;HYu$ZM?ZSTK}B(nTzQITZ7 zekZlMjp81D@W7%nV{tVf zMEmIAqhW!kFJD$-bl}e^3-q(p-F?QF)?a$mnlpo(`-hj+5+ofS!SxQZ!XsaIzfn9f zMrVw7!69}aFaJS-}3=6$K_ni8HQwy0iWiIwW@5k(bgJYW(_uc7~crIyu#7 zrN17$PX;0m79{>$h64Plk2$%?ly@R7F!9@(nmYFn&;PK|*tk7j5*^55W$BBt-k`uuz@u|vR3 zC^{)qj(D9?%*mHnY;Atqq4qVxHXy9oE?9iOr zgRlS__i|W*)i})~_j@olA)@<8N@}xMcPl!&9mN-(UA=i}1diN$8{SsBfCvr$kdTg? 
z4y{)`tU8`dw&Gl+z{VP1E_RMlO~L>EV^Gns=x#O-qD*70KvxRaT8>yV(mH>{8vT>` z>k8Kn8L>uvUEjD{b?f?1`!c7{OW!f}w=30J3edfl+N}`>bwk<6!VvP&z5(4I91os5 zJQtAo6FR&yKaK9YOvTmCTxkcxz5o70+WJNnhFk6Fc5kkkR#PaeQ9I{ZW#yypG0}5h znSi3;UL8ik``=fKB?!3!c+37f#{Me)ztj)vo}_NZ1ONAsEj@TW;ywNM)-0j|Pvu+~ zBwz?mXCOA*#bvoYnJ)-noRY+gzJv~5)ceKfEg$XE+SY-+$$)nX9RffCYOU?yUX(DL zC*#+z-;c|exEv!baA7>CHT~G!SBts9!D`ym9|UdsPI_5I*;Oxk9{W>_ZjkdDMOEpc9aBf zj!C)V3usAyKbJYIY`3AK-!Gf@YB9&Ects#bV*L(O6bL0)xxmY)h^Yd^R%cQGGWOLX z9puVv&YXN?aQ^_D#N58!A99%(RAgjiCG6GD(kLZoYZedab1EypcoLdo7;uw=P*G~?BD;=7>8-Y zyT9Su6muKmU0c)#q~@_o6$Gxr9Dy`WSxNvHIKoa{%%kZTMaK+ev@<+BA0j8m`QO7Y ztkomaw*Juv0hSciZ(lt{_F#GXhEddxW48VMZQtw+9408dsIKHh{;IdmpxVc{vv-3+ zw1bugGi;P(+u5Jl{8GyI4^^3yg`*Dw`N3nxq%5ca9M|*vC5lDTd!A-)+a}&A7jAHH zFfj~uEn8Wg^nQ2=ZQ>fwPuTkCU>V@*bedOj?K7VTRe`P40X=pgB$Bm;z%_ACd5(BXWF)EmU1 z#hb8-@1FVlb0@lW{0$3vNJHZ)?%g|IR`Jurfn#nr7m{w#g6J!q6j-o3;4KW9G^sCJ z*T=`_{Kbp8l;OBtO~y%~H?2Q6-hzpaiGa$BwHCC#E>ZEA2SMsv}EB)Sc| z$lK=7_2^;HIix)gb2Sbdz9Z>bXgnwR{pybYr~jEVH6ORWJ-A>GojXOJ6zf#rE*v2u zV+TsfoqsWM&zAc66uH7;@uBmJii*PG;5aGW`d>(Gc1A0a~ffIhOqq7 z`s@lGsMAvg);(xYe81z*SLr}u5aTokFpf;J0X(BSg|`i)u&4M)Ax-f^eZ7eK0GJ&k z>OBnVjGzUbJ0~rKw)#G9*T>x8LeGD*Zd6eQM={1SiU31W#~%lCLBo;wUwxeoG}U<; z$8XKHlUhStDmg^8LlXDiMBPy97LDC4sc0ysLJ@9>UWl89)lN>P(u-{rm7=_il}vik zYOC=wBb!N!7^K49YS0TNcRxR8=IqRzvwMznoa6f6|NnpQ@BcjC=llKso@Zo+d)Brx zZ$CewEfr~ayYp|WdahUSSZ;Y-We8B6$^hGbORXu<2yHW-^{&itUoh>rEua?h9YUej z@a@p$o}w{3ZLyuLEd$`P--dxIu*qc|o_^Skw_S&hFG8cxwLbh%CvL&rT=qHsrN2Ll zoKVypfGVI=-VCdo_woyr7Cb6L0FYUJF8!9ZmO=W&Ql34WFQSx(^=3oYmPWpP6xxQ< zdtiOXqLTy%pzuO~<$0%adyp%764U|~LfisV_krS-D_1_a6lyfd$mmKr;))o1j=v`I za#Kf@u;_?V4BLyozWxF)D{Y)WB^jT>8c3zm-DK#(W6a*tWlhxkB)WRqcth=B2L1N9=+ZS8c?Q}5V6+j z#n7aNf1TOM$ZiG6|lg^`~rr=09NTl5)lNVug9(E+=h5 z%drVIzW&=PV)ubJL=ncJ5tqLXEQI6|WiLq+by&0ki1=H-o|1wMM>WO7J1?}64pON; zD4ZXn6fARgz3dfXnzdq~+S#Qg;#<`+0f)1_q>Pr+HE(z7;WE#B4#*HK4mL%1;eOz7dn8bXC-^k3- zaZPRr>K@IT4|yB1DoN*ue7~FGw+NA(E(QV!*fg0|mT>Da!`+kyjb4fpE6m 
z-Vg$TogT9AM_-mU;)peaBRp?6jXHZG8?C#>J5NQ{;4xr!4JOc5K~K{iJs~|kJr`X9AJT)LGDG5FApvm@ggJQWt+~s&1o~(8S;@Ce)&>xQ02xb;xm8<~2a0{A4 zH0CQ>``Ne(72O(5uq0-B1|lTWV?x?;Tb+R6(-2$}93XbSVnZxhqi*s-WGo|Z70~Ni zsIh|I(bYZsy_wjdjPh9|$3DqOj$WH$5#zD4q(hG$Pr9|PMGQ3qBV8#61`DAaQzK%c zm=G#j3_{j3AS|oy?b9d&Md8z%IlkAtH5Cqbqh}n&&kF8)E>k^9o;V5f3lB{jPq*Mr zqJOjXwEKxWe9$ZyN0%|&-z&Fy_)Z|i&UqVeC|ITbQkJ}T?+rl57C;Ph3sV$5Df8p; z06Q`Jt(3(lT1T7O+OEh=`YKRs7R_}fLc`Fb&FLndG%=c_Zs3K5dNiVp6r!_8Zg>3Q zrnWbUlVcyO-`_I%VLKizG3Q6XD7J*wjMJ-_+=?&pX@MkIrTHA5V?ET7W53cd;lB0= zS?@FEB{uKs(FMnMyYyL#PNr|#>SD&Krr;!kFA-LY_hU2~jR44?bBTjbOPETPn4GLf zjwp$7-7uI~qISGC4Y&={yrw&~zM%m$HexBQ4b;_vh0!5O`b20(o$<*26$=a!G5y@R zcag>jP2^7a;L-w}=9D~ai8qL3gx;#z6#YzSk{fh_6dr_Js{g>`_*bzR`!6(PKW{YX z?(O||__Kf~OAeG=^?EO{$Jy9x!h6j%HN(hlun9csLWBW7!zfB|H*Z)m_|DVX+2{~w z_Zq**nKMKqvXSUp?ep6MeSMJ)w{{smq(wC$4Qs8Sb+_73YXScK^Bj|p9PvyJjW|)6 zn`cb@1_>v`+Ymb=WbaQa+c_pF?bOxF4*6bbb5&_q4pMPcO>9KeayqFeK7qFG%BLqh zvg1K*@COrJS7&-aYOpMeme^s5C`dao-oLibl6RxHB*?@aI&`mVP{(4@+2#3sC1sC} zLy#kZtYZ&eRr3d#9{8%VUi$gDyjI+)dlGYa$??|C&U#Jbv#)!v&TrEjR=vxBM_KIZ zd))!e#A$g{uP^H`w)*G2`?Pl+s>_Fd<&TNdH~7gP`lm&dx&8O$6epcu*7D{#KbF7L z86AhHr+;?1R&V%p&(@>rz9|xk))jR;Vz4-ep%L@t%fW=dzzl+RuvvscWr zRmf(|kV+L&>90fj^nP|gsLDSmaOeL%V7u)1g&Z(NoME$npki^5N5EERRb-fdP}l}% z)eebl)?7O#{>eGP4C#(&w^jW2*dPDfGa!6xkSa{#zR+c+%-a6ioOv2vwh%Ax2;Hnu z4qffSUtY0XA)Bk1HH#bIiW}X|26K$v&yG=ruTZH%B(8s?WH*&r+gi)5Of8htO=X~; xncTe8RB3JM8W0(^HYhAWa`8aqPtW?Np@2Amjge8&#mCj!(#yfs1wi9w>1EGl z=Sjn1qp7PXrvsB}>KanI+H$ZvX(=5U1(?*9l9$thNhxnHS1ngBOD|guOFLUnaV|Ai z7h4(*PgyxVMHxA9H=7T>9u8hGjg5+>i=8*Dj*6|1t&=z}Oe5!PWosklCELHFY;H2WN)>+y9pT{UCisSseupbwxEDZE+qx5kXkRe;X9!6oCpl|F!V&^1&=> z>RM82KsgO5pq#d%(LcSGS2U25Rh5%fl!A3l-P_B}+e_ZT$yWS5&3iNf8g3d_D-fEP z7?+Bzi=CG}jUY^;#RaS2W$Qu1CGQ0Las0rbyMM3t{aDPg{uKOu<_dYN3L5zP@)G)Z{BYgvL1nIFEc5y`_x$1dcj(vcF+TQ0FL!%kJ@dO5OYRPMR?^M{r+wPS+2j&?>cvSmo@vu zUdIYvatf3RR;B&JU+RLBX2R`NiyF3qOxhSA6RL zy<)G?V&{Lav=@l4M*iLr_XRy>^!+_ne{~9}8q4_kcl%x7`)h)eM~`iZ^#F&ukjEFU 
z_b-og`=<~76Yry6Z@PJ_H^=5wU%Q%bDgHkG{`;#U^zOOfbum>)z}VlRY#}Eg`02y? zmDXI2?vqhT&-85Xx4i?lIta00WP`48pds;N9Nv9f-5{}XS;Nf{7x;ol$)y*<8l0^A zl-<&In;PstF{5u(#?`!TezWjA#bB;E*xRoAU{!No1Ac9G>8;)S#a7$i;4H`5J)045 zW6S7kZrK*>T=P3*s)EAbM7Ou%+GFB(=}(WKsy?=T{|ind#GMbx z_KBpf65I$PHKLun_Av38`Fm-;$MFLGb2GKFd>cyE!xgAayiVqCygz0BY*^->b-h0u zJ}S$)9<{>Ta0fxD=5ruGWAL6(*L`&a`}7jLy(inXowza5a_!5l#9DUmod%ROldeq3 zjnUQU#l@Kj?>{L%K81Z5)6*=Z#`^Kz2Y0%mqYC1V&H+mu190|i#s7}r( zXVm7j?muAV>bQp_Mb4=@&2BTA)>{{sM$RBz70Rj=<*PgFmapr)UlG{@Wv@>Y^ApiA zb>=H8RR=liLG}YIw4aMYr|Q((zG$D^&CuyD&J6wn<$DOAGmhC}TTP;3wI$D)6bv(7 zTr_K+7HDM@_~En}2J7!(_8s{fe`PbQFqxRYc^FC$i4B@Gg4|B)R>x;(JAb&tdqjr4 ztqiF9Xkd;d79D`u6Jb4rP(?Mwxy?zN%F{ik;`3sT|bPu(1f`!9l zFLJ<=LX;LOis^)km9n>4phUg zyMI=rYp`}Z*m=K9^c393mi@ufcOO?*Jy8v&!-M3DMhr!2|$xt1ltrYT3foMFZWAEr!907t9 z>iop7(wy&Gm4_s#R|?v=H_4P@AEpj)2m^xm-P7tMm%drzaTI%{oVQF7;ofa` zwZ(AM3mFfGbkH@+$#T4R<)AyUBBEO-aZPHG8%A0-j<5V4Zydjh-Qd@(dZuMumqVq= zHEWy`gMGoNkhZx#+`g`w?AF|&I(!b(uBwF=t}*$z3%VAc9d^ayF)j3`0-e?N{`9K3BXqKh!HQQo*CnTOP3+GhvWusbY}eFx*52DQ?Sa zC?op@F+_yyw zeQju5?`-gHp>EJ9NC~HAqA>WR%GH-jt1RJv%K@Q z9IXWaVdkDw?TC5Kj9ozpJQbNQp(8vaQ5D&HtYasYJF)pX0&*{stH<`(1^EO^6C&=f zYYXmXQ`UJ=F1wL%F%D7=-K?zssK(KxCU`(0@QLMj+p0Fvr;3|?Wwi?O~x zDS*I$!{rxL#1)`klxUi)V@K#rOk2fbtW^2#1 zi<4_aqqvz0}jIa8^sI_I8*7>RqLY{$?-7BnNhlc z3>4m_(hcr-FA>XlKI6l$1$f~E?yJ5w_kO%Sj8Av|JoMR4LPCk)czj2wzY~9CD_inpVyG;9XlX=UJ}h+HdJD&gZDw? zEVhi32PY)emXHVZPT1D)irKZfHX{(#P)}qP%fnP~m>C=yyP(qz&@VSB%Uc6?UShOe zP*0k~Sf5YV&qSV_(>c_0dQ?S}Rxz!=!*ZBx8ae@gDNkFy3}Du#n+gw2>&pZ;8N(MD|KBrvaVHUXW z47Qv>E{Rq>6sq zv7+b`SF@5isX!T*8YQZ^C?;c2GW4~T9)xE8bt#!Zw!El=t9`wf4z%9O?9C8tOQ!U+ zuE!vfl-&+{WU>Z5CS!0g-GJ=6>I~?KBTDb0O4R)4(x)VW{Znn8pN?~fS|O*0Lq*Up z)phoazSwoIW}w2yo1a0!<`x2py<(;plG;&Iz;9tqatXR1e62k=-z8=s7@)0O`XWOf zo4%k_edMhuYBZ1gt+mvV%hEF2{8?0sOV631?62L>1cJ8k5nimd^FXxMpVlflKciNT zlwPo43Z#4S$MKOfh@3^7Hiw1$Xi&kl=%q6J*N`wR`>M)msdCTvd4#kvGv^~u3o~Q3 zY@flrG>l`-qAKT%IoBlGV1{Y40?2bV&P(RlU2)3}p1wV{v&!+Z*Xs08gP)Q<{LP9? 
zmk?lFl=GO0<6P@u^}eHaoiN8Tcqc46-oeekT@aI-dq3XrOjG^TC&8OCM!EZrxk5G$ zfXn&SF)z`dh9vbJ?TW+D4gP}L$U}-r#~RHpvgfwnt{%;Ik?F{}b>|pgJqKfToiYh^ zzM5P(z9jA0N^PtbtB)O4DZNHQUhUO}06YTX(_r=EB1ewkqMQc(h$=yqSnDrlLz`=5 zRTA85Ty5WN8LuA>nr2TLk&NneqrP zm*smKl>Uv2>k=8Y1pd_89hzQt%XVs-PIC0#Fg-&S^WM9Qy#bwaQnC3P=i{&QKySmz z)Jne(#_21AD!XmAEWr^|;gMg;(lAahqu9Bop{HVcN7s>Ksww93t=GnOftaKyY^Wy9jmb~Y?Ss)u^Csq0JsHJyE);&3!wDlO6< zy8f%$=(<$b=>VvoxlfgJK08xE{7%WY1W#OC}D*B(j&H7ceM7x(c+5rJ4fLX_YjR@_3(Z zLj|H#6@T&jJJj=FwRXR?Y>_GZD)?O4=r49)y$x>l)UGhO!7!AT(oBp8-DV=J@ioE-+{#RNTsiX+J5| z+{)=~JEEv<$0Jl+{wL7H$b}FisU80-3@vN;zf0%pqVN@$BVmg~vIdd=g1p;$ug_L% za3mBKr7Cz(2|JsMy3rMNLGSObE>ENhyCz-vdjr(>AD_xugKAmL3#uQ@fk_K z_ojT5Sx3VFDmrFeLI3`F61~kyd}DZgvI?fXF-II$lPK2`LH@8Q@Y z7d~Y>Q|9%XeZW@WqF=E>e^DIWqpcMc!mdB8g69kRx`pRx6VzT~oLUW}Cc~x_yzq9w zqGg50ZJ=TZ=2dl1uHJ5@O1;fvKgMUFnNi|YbvHabMio}pH_`pXF_q$87P0Bf^yjja zwhGE|hyS}OI4BlX-~+h2#9;VC#KTNVh(N%XjRgwgV>LDP4v=6NO_Ov}w}G=m;#Bx@ z>w>w0=i=GiP(gfRsGLGr z&o@~zU|z4NSg5|Ii}%5KNBUtf)wS~pBiCH=(jSAFYj)^OJ+^u;+sHu)yRE9sXn4xtpsc}Z-yaiLsb3cJRh0})Xr)5~X$ z`P1vxi}{Mz4Z@nIw8C0r6N;xaYAkKdBYbh@({E7{ZL$?b(ghhzjd`W&B3(wsrbjnM z*UiqDsz=)Sy|%c~Vz?0;dhaAi&lySq*qw%|hicwbL*)+Ixeq(Yj`*gI<~_UXX>$o_ zu}(nBECmN8;yJ)>k?J1pM&u`9_Zc#-D8R^vbq4iHZD)QFPH3zyn;v7QSed#2dnJZ{ zJy`?`?Of!elj!6=OHzq-i1U=Taw|5$wTS?ew{*Uh=XwpSQq;tN+_lM3vq|Y}p{8SJe$#C@jC=K5 z@-i0Qpj8yMK`4hBtzwq`lKt`B4iej#A zX0V|CKoun3ONXJg={&fIRXvCwZFbox&@t!Wln*mFfKvM)~F<$DRjn>F2<%wbBZVG6+7B$uAaX2>zN=vK$RKpLr9_ zGD^Q8XH;P zBseVTOuhN1E#T4t=GT_!7gvtKdspVcymyPi{qR#J2zN_y0O32Kd zKj9rc$0>hK5xOgvO0Y@`X_pf)AezWdA*}x6(srQnsOE>oDJjhN*4xo$v)+Q{?V9z) z84*RG_mAEu`wBk^YSde8j`wLs{xstqgDR>0J%3=-7iC1yYKrQM^Zx~Af8J5%Eyb@} zX>q!WNH=-w$BB_+gl?`3Jx%_hnvO9i(b&dv$E-&S1eRNc@#Ic-%*2(7otK&kx|o25 zKK(61-#EX}=5tBh(MHt@M+{Jbe=z7Vi9yy|PrCCiGJS`%Cv()!e9kV{6Vo#EjV49> zoFq{FZkQLx%R9KI&1;X7e6Rgy^!_W2zhh=nm1d!jmosKS9@+fbQ7fguLU8(Thp?K~ zovtr_wBW^yOR@mQGU5vhU4N>RGVFNr#(I>oT;L}5axrLPzs7Mx^UNq}PG4cdVFg~B zOG=AihxPQe4RU70?l+&}uo4c2>S9PS+J&-mi{xp)uuT-)c@;_3r%g3VQM}jr**A6UmuTeL!c*Y3el5 
zwe)fD>9$%qFR+gZRT~%!IM(VqU@Ud&S~5|nF;dAe*!3k?W?W8OqsKD!@&5p{Gxhm) z*X|u_Kj1p~-cT>y^ z(PSidvX1?g)?0k-S56t?^86|n%X}a|8LbL~%6i);Zgq5Q#mu9z4Na!$FuC#6$XJ_I z?!|Y!We%T%5g6EL}V7YrVxz&y#r9ch(a+3izb@&$>3%!TM6P z>lYVy#~Zw8xG)}Lyyar3VL8?qAFe(jTF_MhT)j+m2^ z{qz4wV6FsD!rn3F&c}Z_?ql=|VA<1Nx>)Pi}TqSVaz!6``S&sns+{Xr#h;^~VB#%f8i`$Cr_PG;zRc zjd+7xT2<%p@y)1Y!|07D0V4MoQXlR}Yr3+LgMJD`Zl097lEG29%*@Kngn=yYqzW<= zi*|!8Jc^HN$*+Sn6vr*H&)7KqXLg7QYVer~HP;=|cD4sBhoYH7JaZU<|o7CtEfI z{j$=Rjgfz&;4^nG{o5qrR2b;1t>3|^1Z5EI3ZqIY1!b}+yz%5O7(~bGr+|b?WFc~o z#VGsW_2)((S|X!b&@C(w-HD<~&I|RUDBzXYsZ(UpCbPw8B7W0L`LBz_(;K*+iA)0QKVnW z3TVl1ti=-?rtdOk~-+bB^w*^siu{q4$lIlVy6d5XWC9)HW0BBaz8iYno+(a~UR^ z)C%<8YGMs*qYStoxC{49$ccP`95Z7P@6-M~w-Am-#M4h8?POM_6`XBFl46)Z(8L)K zY2|X4CvfeFCmF_6RN4{|e~YQOo{P=WtTUIYDo3YaFQjM{=gUiKXXVw(n=8ZF@wla| z(7g1vO*%!vy>*p`!SGU*rwT;(gF~e{yDad)QVCDU64=R*Tq0zUTqgJDG>-kdHib;G zwW-=Ms#Ln40gHmdz#}%9P@h=sE7J5f*~YCe3Pj<)AfuAQ1DYM~7IN~=~Yb;bj(QvHyszDriC z)*Q=p6Q@U$Gu<*K`EJ&8)?TT3B5gbfcgp=~kuiA3GBGe!#2Q)XR%DO9c{|^UmnUTG zh_z}vzJPT@#b5SNwa|dcfb>8Z-0h2h^d-mOZutWz3ZiYQ;m(3hD$Cs$UE88L7V#`0 z)dSr+WzwflK=>v*gLG?*bx2S%fXr(tdZ4>nSW^vpK8||0|9Z5uc<8|+mY6RsyT1O; zzoj3*%p!`Bbhj#ciA_%R5aigFw4pbb_)IbQ&rE$LHOss)gl8xnksDS*XKpCn`z;|z zZUUQ{8#4j=8%5=3SkwAf!f`sPPXGHZ48vhqcr+%xNc=gnyrEUe+y>n1!OkDNU||36 zXzBlDq_$0rjN+m~pGgw=_ck2`x~e%pQTBg!=_Z@zZ{@6u{u@mFU&dgoVSaACJ}~a$ zZ=Ku*Y~8IE*pjL$>Ze}boWP%>Q|&i3;RF`v)AP)GhopK4Sx6N>qFX%Va^=3P|O2QG+`O>|#`G@%JMet)t&k<2LfYIpbSDp#!e1uP>f{9bfE=7kl!kl)+ zcD7&S#`YD)_tQeg_Eq{#_yMS9i4d&Ylz%-`8(=k zim*~}`P9kGn*#->R*IJ0;oR6Y41_LW9-vaxXB#9Q;K@hJqEH;q)|OD-S*5Yt=_Gtl z24T=LDu&hUaEtS3H6+WQ(u2P;5#N(Z%P1PIo*;`JPP#3I>x&3}%@7;CC&Q0iKN<2Y z8TGMb5j?&JH)R^nTZy|)6h0tO5wm7@nu3ziIjV#A!|X6JT3(h2eg~hkCVUS4+#vBJ zvLH}NEi^oFW?^+%B_W%SP`R*_rQ+bIurkp=L;$GDZ-fR)#PS!kGCD4-^a+W`<^$Sr zA_)N1>vp@OX0mz{KWKY0Ef?mdcRyN|L;FphJ0`r=*DrKf(r4NSSV!~-?d-r<>{)#Z zW!E_3cxYg(AnPt}VI9DSwyR;BDlxw3mo5dA^tYWL)l!6_VY${$ILThq{CFKI$6E>0 z(%m?QTAX7^Z~tb6F~YQKoa6ogq~{&eQiQ28wQ?g&hCym}NEjIXUA)e0$oG$)9@SG( 
zsxl?k-aw>J^4xM2!DY|(LnI3(OB!gBc}5`3l4bbqI} z4C)t(lIoh}lQ$crtU$N@!y1;K>E5Zzq%9rcQp7DCS>lp75a4duw8Ewgmq4?k1$ty< ztr7z!k7RkEC`4qe724|ZKc>9hhuIVB?5n? z6^mFZ9)(*(C%M$pUX4Is>cObpim!Ul9nN=W9dp}t_l7Y?g^J{Bs`|B*b39P!&pVlGOk%E7^_$SV-ujH60QK6 z(xA@xr~M#^a~o)0@XD--EoQbq zvthxOqL}_W|4$zp8o(}~T$CqfR&qsoJOB?+K>}t%;w(Lg`W-i5+{}YB!P<+-V00Y;BVpD_%abBuA#0|R zm%M~;U`BULZ6P@#s%;`{AQKl`(#7t`{4pNInzc{}(gkQdF^(W<7opq!o6A+c%p z1wmeHZ>^LkMG4>Y<-9OAKno9ho(FKPCC*644QScu4J9w3Upao+?>|yGcHXcfC!s`@ z4SMrl;OG9r{Z*5nqp}WFzlcFGVZ#0_*@(1F-n!yv_{~65_C+S}*udMau;}jwWqx0f z`#iv`#J){RfnSoe$`rJO3_Fy@vLNnYrN4#7{iLjv3N}L3)ipZS!yd^>DCH~|ZHKI2 zi*Ls7>1_aSKnA6dqBSG1PBkM8Z0>lJ*l5XYH6buoz1p&n-Vd}1nUHp>s--mRB1vR^vg!#U~~>vi8X zDcrqfh@U@{Jav_2Caz4wi0_pyM4;%Mgr_8FvYZAJc%dW96dt zn>ARI1pHsu_WxB1rdvhZg1R!U2dIxcK4~CH)G(PBfF7KokC>S6AD$3T(2U<$RD!;j z)P-WKL;Ef4ZP&rFR^Lw&J|1JJylx1#A}=m1cNl~%cp)!q8uu10ctr_;bISP|du}YU z6V+h5qx&SPcI#S`n2TAfx`u&naEnz{3$60w`F zODpH|)<-!!_39>5p;?oXU(_VuSYL4f;OF_hdqb*fXI0c<-n@cxghWkUisc#$-@jy7 z+-Asmp=bXaTY{jvCeX03gZ7*~Qqnu`gI$;v>Q|0$sA2hdV3OboB!F9V1YxzN>P6{% zn71lfS+sPtpwBxN9i^@YLbpgP72^Da67tM=o^N1f9l-gN?iL<+^@JuE)K}ajSw`-= z)~fneN4m~s@kjkU54}Es0q`b_*|-QVULeN;JrXM+e4VF$9!^8XatEx|Y508$jnV+! zRRFB)WuZ0qYn-wAloi0@b=3Pik^s~am`YF`VGy<95s76hom;9wxr`=WEsgS_Fo;@5 zpLvuZPh=krNxR`YL7e3d^HqXx={Kx&=4C$FtzGau}c`upr?d43N zR`DCyNRqt+xZ*)0Se12FGI=gRzYy2HH`T7K1)+Z{CNf*`yr7aZ*pzO{QKe|%$fvv! 
zIU~%YS7!FQ<~=!;MUA#hY_$3(%c?d3mV1?Co_l^&a}jNMA!)wgEOL896eZrv8cFpp z%B^7#AI0*2lv@Xkatr?tM&Nz5PZD2e=4Y8 zt&;Y43p0+t5k(;H75SH>Nk7nFQ^p0%ZRfee9>`Aa`8EO02ieEY5My-#_u0TG$^~aR zQ~(+FE**o5epFFwm4ff+gR}jR(k&xP2%$1Ch$R8nXVj^wcm`DzK^)L+f&hDrOg4EC zBd+_5?KZ4C`t=d+{j>c~HA5>^tQNqQ(P`8Nz_neWEM;Sfh*8LU+fnD{z{bS*M_^+IVJ%Dhc>l0pT#BW5cp=^R3`n|IT$$vtcmp=;?j;U^<#XEbyZ`l<#wm;kGYG>);8F#;#B#+yYGbc zj{vuRk@8we(BGLV;#OR$D8Z3bz+ARF;YswHdM~wOBX;0<<6RB~uDVXjZnFKSFV>=X zuC^Y-U9@AZofJIxKF(WE$&cRJyD>eATA!_#<`u_r@#Q1--Zloc2tDkuwj!x*u zc2n4W7P1Ri(jQ^~RH1O@B&&(fu6nTD#RxA;GcVG=DNcm*6rbJK)~^Dx8`kFC3AgA8 ziO+f?-c2E)!drs%*mH2xPcefVfrp#!2%onx6;R4J-X1 z*7b}@)gkf)DT;r8tq((wDTa&~-Sf$(yJ|{=(@wLC{VGdX7h)SL6~4QyQvG|hOE#i0 zQ%0!qYS}bHK|enSo5d_W6tPGd=Lsd^$fsE41W$&LXfcOVzxE@Bsg+8p`tm+&MNS8M z@|WF$5TWL><-IPNmvs3`48ara?aiN_tXb@apZU1x6lL2>*dhwQ5gWiUk?F*1nNI znp5@Cb~3Z3?kxSs%OWNJnx+p##1K0#j^@v|c6)JYpb2~w>12;!AcRh!=7B;n+{uTM zk5yH_^n=JV#6){f%xZDnp6--%;y3o&E@Z05wYN?LjO2{I@GJE{q105h_wE}fYwiUy z^XDyRDeY&v&WNVDhs3P0uh6Ke|E{A~IM;eZ*u(274Z1c%3Bz4}>TZqRKL-XswPOg4 z>K&uYF*1k?~&| zGBT1@NMFzGhCuMxexsDS^tN_35tfU}QU@V4bk0*0H+(2A{shY$y&Tb_mYXfI3(P++ zc=PjFu<&|jJK6QJ=XlF^tLLY-L$8ER=v?*qiXD7NrC9G(Z-M!ttR`` zA;>Z5Pv^~3#XtdhWi&a_dl9}1t`Lr(`nxW6Y!PlMJXBS33)Dm)O7*tIQ?8ek^4X;o zMG|K)@y3S{5h}{8mI~F@6B;GgiUwD-L?Do7z0>Z?e^(UKB##~zX~Kd)LiCzY-fPkI2x?!uLYTGZSfX=Qo8 zpO@IkBuP@D6!vMA@TiPab`rM8XbF9@tR zRSb(vmNb0G;YXS%X;^#@eCO{)*{#{9c74>vtvct-fXyLH>J7!M*woOcL=a0Q3J$@R zSn+~9esgFRRu$o|s^Q#VP!+_9R&lFW8M7>}wv zv*^{$Y0C4gY^JT2&;n!9ZeiBbGwzNSIA(U3W2`v5y?4Tc>aKCKYLM1j1M^BJqvV`~ zdco-(pR2UQ)jvp2>Tl-^M6!~%$JW;>aTt~=adc$X(KBKtqv{IAbn}I(DRb)C%3wG` zlP0v#oq|?vSs2?HAi7m*YItR4@sXmON&<$Zig(qX_q0kJ?yb48TFNHnOiJQ618!^_ z4iU0djKo8Atu|_SwlcH7Rnv?-IPnN@BsYEm2}~^jl4|bK2jzK>Xg+=Ze>2-oB1nhgBQ)>RN~X zxvum2?>If&bNs^vEmgnN+zc-EXzbT$IfN*ZSAuV-ZPD(ZjR-a=!YD>-;3_WgIl zROHR`R#Bg9tWK*arq!;+k`(@W_@7ASE$F`!gfwb0FCH2?-Q5vY{^?COsN^#yq*aP0+fS4u1l3CyTl=rwg%WW5-tz zqxi;1yO<*XhFlu|j)W1Mr(vCqdv{Dq&UNds{B*IYY^*m)iK*;_#lGRh$hj3?Ib2(O zH^aJIT4r^zG0J{u3pZZup!VSZ%*@kl-1<3y 
zpu}*d+-;*$5JOooXVSSOeREx&BWqKzn)+8Gq_fz0a|Pdd_a63L>)fu9W z6<$tHhN!CU0InYgp>rTh4H099yM#sQK0HlpI5ZxFq{@_CR0)_#NLG?O?m*o@uSW}fm)D-Mom3|)7`cP zn@AHEB5Eag-5`vo4)DPuN3%Y`0V-9zoG8m(2S{N`K-pKRNz?>`(%7txyfLDWBu!v? zNJ(oq5lkQ4huOcU!N#!w#MfZ+=XLG(q-^Ov7%S4F^t}DBtHIWEwMQOpbXODT%TcID znIlOAKX-gtoF%+eZ9QuOlz|69^%2wI+ET2ey+>dLZ`F^M^%x*%)720DPhhFKr?_(x znEL9^Idim;!@eJnpPEv-d9zF%ARVwu2v`CoVIVJ6)2DF5XKB{~BrO4j1O{wPcRH41 zM#nMMu!Ij-GIiW0%b1b(S#Et%fJL3x#|26K#I}=%D6CN1q{JQQ{x>&VPXLvpaYpv;OfW3BsH&{dqr1OXgmW$)S-k6}POJ2yKt81E&jw9%9A^*TB8@FEF zBo9{cSY%#a%frY&Rj~Z$`u3Goxz(F`Pc1h5505LJ^Wx2>O!3=9?+$*(^(0YpEe?Ts z)mW>PaZX`|b;qDzv0p#1W%Vl!8(=|qqIp@ zRCdL%c>Ia)#8jb`OT9)V_Wp$OF3p~rDOKZ$#nfM3#)th2+M*al9 z#VD!;JviS$eSEbuu`RrbYM;X=@j#w3w^0<^a;Rw0xF&eBJ-}!4u+j6Gz1t>QQM0{U z@E;kLRKhl!a?Uo{-e7uNB2(W#8~lDR0L3SfDw%%N1%F@NEQ5A*l{MIw8{8{=!4Py# zk$x$scRVFs?UlCnZhNFfe^BlZ)!2mvtI6iI+9vvbW5>T}MHLvWC<>z$qyI%K#<{VY zY91*1O=6_G{R1y#?c@2ZzQPQB!a~QYmJV#eQXL*>-8-tN=r^0sJkf-MCS+0vfIxguI)e0IX}%xJW}s7+lrTZoXcWoB#^Mm`N62IEf+uz!^0oQ5VQVhYs#w< z&3di_v`JrHXf_vyt1oKH5`aaeZX;w9?{ZxrLj7*@KkIDI?<2zq+y zy#d2B3G_T6KQ(3RhX}O#-8o04UXPrbqedOz!4>LMk035^afkw~ww4;^ozK;*3*d6C z)T2~pK!%O~_I%zLuytK&Djy~1z5&~Dk;$?uMlIOSMGzg|hV6KT|9wVkUFqCTPCDYc zlJo5-f-|Fdo+YMdVI82E9D{4$z9x}y;z=RGxy-U-b_;n!$;C>zd~!|iB>@%7&e(gt zD=v#3}D_sl~n$5OD(&+L=(b{##Ry%QCo;gHP9P}|@HTr3ocL5m~yrP_7D7<)$mXA*VN>3qjH zO}ht^#k?^!NY!vby9AxhdZWcm{haN3qur|2LoSugr9_59P9}s2+%6J?n^WBy79n9! z#^L*l1_qe`xU{Oun9m}JY*=*G>yHDiqG0w%)6Y7rf>YF%KJL#6Y6A%2iPyg< zQyQOfEQjEh{)admuYVASqO92-DEyBMOW9>h#51zovTYrtQ4aCiB7ECrPUt7z+u#4$ z{oUp9sn#Y5EKdQW3PK#I^Pin7F7#T6CZgW)NQgS|RUk#5Otsqpt*+mi-gZnKL_Z2f z(o0wW`-lKUA5r%hgra-Q;sjY}SA0*ZRYs3i1<^JtV{2ZQj{m|M7XiUIqFt_8qY=;liimMw{KEoCf^cb=~f5O zO2e8m;IG$hIJDGMajOeplOUruVPm<{T;4KLgpG->L?$)$1|JTL?tZXK@ipo}Ft7ZR zXpFsYSuv&dZP;Wv%mtT~B9a^&aR?k4-6KnI;2jwmdv`lL>i{umPz|q9m_t=}7#BV? 
zQv9?1Q#4u|qmr=F=$ ziU9ep(J^;VS%t_-H9hI3IQfcFQnruJF3hc4P@D0d1#sKh^0OltAG`J4G+{VF7oevY z?GVJZV-hwbV~5zf3GYT-6s|#$pdTt09QP=Pz3r0wqFD*~j&E?ygu)UJZX#GTHt%S{ z1etG0Y8MQ?HVec#H0Q{l+Fa+V64iA}=VhdzIXBtqvl3XA3@o-wN9bpuQBvJIMYdaM z4;vTxxt=mDWD2k2QC5V`(`rr^)ZWx(Uf<7&V3f z{oK(mxp;R05Kln3Z1Ir0q`v!E!F!WlOIMe`fIxsKo?wnHCYcGMJ-9u4*_oJL(ISug z@i~rTg}E3o2lV~?wEOOJv8xMug>*@Wq`Bk{84|BMZ%{o#*z1QUwoBkr*QxZc?>TSd zjWK*$Vy)XpTC63;@=H8zP)AhE{xiyo#(J}GReFgRljw2UB8{9DKY1q%VELo3F{iS)TWHbOYf%4 zxvysnEJv>x#`^jGm-VxiB}`;K;eGFb>oQLF?aZEa({0)|q+M0mDkn{ERO{CBz*6f0;aB1!czCtOE??kQ?RRxtz9oycmtHU}ogB}i4X7642|$C)a9v_*$&u?R9- z?!3f$MY~M36;n4S@wfpm024#*cN=a>|I(%%ElWU8+L5rZ3UKi(Pr{m;lI7~<9HD%1 zfZcaXZE+7CBUV%t=m4NhC7Vs=%}z{v;A{$8Jk~Xg|_z2OQ~*1-K0z zdlhIrZVVsal5KR8sRLgVeYeE&5<61{U{~>cm!(eF0he;veNPyLi=7yyDZ$Tgvi<-q zk0&Lg!(%UY^OL5KmuB_`a;*xX>5x@`q4mivRn0|2RSBtayP94WK(vA_6F`YC65r&x zvvMVZwv?Rt?Ga7VF~`cWLY=0b%+1x#*0R_$grA71Jdnh_>;}F!VZl4rY~ErR9I_8S z_PlnRM4y82B6FqKzAru9Ady&!AF(jb=oxjgd#n8}gpIyFz`4Di-HV0EZi%5>nUbfo z>5%`)2&41FewISbTXx+{xfFbihu_*h&gX@?<8Mb)O0mDO)5V$m?#Rp}V|8z;Ph*7< zG&Bht>-n#nmm*E}aJ3$$JIakTG+wa`!tW+JIdP?g2i3e|;i#e=Iyv1rR#Y=q=cf$Q z>y?ATu{O0#?@801=NY47o#1?quD?muJ1u9+#(sj1WDg6ca6vp(^A8D)d{yaw7&nP6 zYZ3`5(!J)SGNLN=>blY@ZtgQ1r(}|1!(}t5=SVRqf>wRB%VtYc*m3pDT&1flsx8QW zmvHP<$WDyv>Tc#NSB3gD{&zdevu!SgvC_TkcYhE2>^ z+vfRcAd~Iv4F}iJibegsDbkL8U*AfF>l6;qiLm#}gGpBU9?}_$^C0fpr-JZvEwyqJ zIn=;)z3sR?k~z9q)c;AHQ_5S?){gz!z-(#00o(e5 zF6RcBMEvjYH58bvPf|Vj_4cztO(^~urTgMvE{lubI@t}_(4Q?ZES$ zd>UHsH^11?`(`bZaE@!VwCiNfJi*u7^!?wPhQ8G0GlTu&!9LGhs4wQ<1IIDnHTA&p z#8r4x5_5DvbcoJR9Al3!u;r}cu%7oYQHb~OvVH7o%IQn$9}yhmL&6x2$A4r4&`Fm& zIoo>R^`b$}$rLG8|7kMM@Z)P1Rxe_6p~9joM<_RXY1Wlxz6B8hMjEeW4Th7w%%{5W z*2>B0aQK+|iyzk0@TLYAjL00!{U&@Zg0KK8K37NOraNUf6S^pWw!p53tYU-Rw@oKUStUHB%UB@Z4GI?~aGZUAFK| zR#@S4j75_)7t+y~s;s8uII0bL*hq~aG)Z{^iF6tT6AsSC;jj3D#(#W4YLhLtqCp2K zwl~qDmvT|%>A`B)#2GfsCL^Jd*khn4Q$71xW~#9Y@z{?(G2?`wIL5@u1}Kor1;{t&&Ou?PNX2d@8P|xr}z%HUZwc0 zY=j~iUL{VVnqZ$N$c;U+X2`5BCgrVrC7{rE=`A#S%d>u0?G4FPYbI+ 
zHAkt^o?_t9ZnYSvv2=RzaI7=;v>Xc)@a?Ru;YTPb`((J^;-CVtt%%ixW2KgQNF(da z9Gz};!R*_DVs0uXC&KaX|4%Dd9tic;{wq-px!tl$CdJriv1MXxld&&jM79iO7|RS} z2+@rgDT#d7kr}^PK1V zeCK?h^9i=QVRZ1|DeSDY)%imFdD+cX?c8Oro2>ICVr-ZZ=4u@g6x zsyN5Na4d+q7Uqy**LHd&>lSNI)Hq%|s#CO8oHi;XLFyh>6PASW<1bNGa-Jwx#zi|Z1CRW}YbrPUTz|ei^tYYbVp0I_DS$1)Uru7l4 zE4Zc@i?-P;VbP*Ut}(j9silofuDP0R+o@{(ccW^*TFUF(K?qEYRq7gP-6`JOP%ByS z`SmtWuU|N7QB#_eJ5)73EOvQ{-@Tb3HmBavg)vTkvDGW-BQXK&r`d9(9s-{r!h(et z8ZU@z+NI^){2bR+#c}5jO%7qKL7$xn5GKdmd~Cz}gefO-X1h6MSKL#n<%#ef-&&56 zA&PyJ9LN|_-;q`Ya7n(OGM>H2Q3AHnTz+h${iTu5h_@Cb=bbhU&tZdNDBKr14U+SA zR6Y?oJT|_o=1zG3Vhc}qRc;qZ2l|%LnQ+UTbGdvVRPi<4jqHODY*)HKt_M>D@<^#c4KH9S#jru)}#}?=2oZIY{=kkSF z%Hly0SQ>4G+Z*HKdDn}Rf2JUM-r7|ny)FGeyK<8F*kl6T*uPBK8(fZw-! z^>&Kb7;6VUjgJfekm;e#WIUZwCS1~GdQ!@_A4V`c9#h+d_2?NC4jGn^Y|m`5s8F+z z-p>0=s7_}dLw95xy)K>?^&wsx!K|%QvWYVTW>?lyiT1^!AU_c zA%WOyX9#sVg}(A9VDE3l>lCVBIe5eR#PkQBv2QVCacN?^?6kaU@|VOdNyzifI@O-z7dYec{h z+EZkwLXWTIRZ|1Pt|Tt6GUwcg5=lF9tUxrDI~!51%YAsz<2v4;%$alE1%EXu=(e2^ z-^$n03s~;1+P?j=M<>TZwNgA-E}t5*F)~v(rS-1UJBGfb8kc!MmL*~uEvpKo_uCa8 z2#wd}PRZ&U%h}@!89O^e2RVCF?3}$S)C=RIS5kIW1;&|!6g@s8 zt`LyIm$GZqD|O7Lvp35}B|tmhs80zQv@e=`joZsQq15{NxGdNd`0)7%KiCv_L@R80 zzZf?AWx&JNxplVP);IY#qY`I}#*F%it651pk7p2lms#fzC-A0psczJg?Kz)~T+V}5 zYF8?QyEd&70ku&a=grbo@NvO;{}ETgXz_f(+`YofTO01FwhB&WwJ|^I@0#4#LhOEq zuRafXf$mj-%v)|eZT!l?U7WI55MHzU#Quj4R||RO6O!iJ&YTqIm*3WDB=Z`_d9O8b z2*+886culHjY9`#8s~GIATZR(CD%E4MA-gxCtSdm zrMyhq3FI}CkcY|3e4|+?*Sj&=;um{nWS-(Ujs4E0jErP$l8#7!)qv|QQ7Vp4JkM~- z65(!miffaRGsF2$C)BOzd(o#Y@3|~00G3Les8~w_?yyTOYM*$+OHD4`>4csj{29Qf zt@G$PRQw{pmvuxap;$7*KQ1}5&hMQ?DCr-I31(qKRml(P9 zySew^z{A?v{Lf*gZLop3gf@qal-U+U90;F8(Tck3}kRjUr?r~o8#rbM_Psje4UB^9y z3Gmu&<-^;>ZR6g(%FIQd-&0X#0TU<`y(Kpi%<~@L6qR_teb6vdy?N0zqkdYsA_;aN z?w$Hb{`f)Kbgh(CRxe zPd}4f^Rt2L-g6be&jvQZ>ue?@A4t?Tc^onTACR;WC2o(Tc7%q8hi4C)jkR~@7|51G zjtY-N$w8(cTlGfnyS+{8U@bRp_i$9M4}TGR`}=fCm2|D~`az$6a~E}CuhabS4mfA> z;h{C}0!)0vJ|#@+#YHSs==AMDll}7FwGpmi&nLIkMzBmP>m$Ai^_AssJ8w6xo^0s3T65nroOHpgn%Ey$kK*6o 
zg}k^;&eBKralQ#>#fBPX^^5t=(=`^J<u4$8qp=#+ft2M>geTPjQrHRv^y+8Bx- zv$#_7M=`pdZyau*6mc?}0ZIK-lYwa+Nmgki`5CFrFp_Gk{02qw$VOs)QyV2Ut62^= zB77peIZSF$W4?F%>Uf$6MeR0|T*>(-(>?R=FiEoA)8|+mP1#|Sl)Stxl7%v)eVoJ|%|%fzlqjhC$yy!C9I2L!LRtUL zFO=~%&q7LpQssoZX2L)I;;hmozD7dAHQHIfD~*Jb%xa2}KB=Jo%v&`xQriM?&Xph~ zR}yQKg_4PV%FWl9O=;a|Q`!rW>A@}8Mqb*<*;Ih+wTSkXO>947mMee3<5paT*}1Ph zLUPKG+3tYE(Xo3g`K`Ymw7a2SG9QBMFE&}o``x~4~33r}qVw%sJFGriboeG*1Lf;8b z(??%CX!;;+jIf8ef!84yPp{&;kdJw*Vt8Yb39D%z(LZl{#Tm_?wr6YIXff~Lv5(s{ z&!P7=Zs|$SD6WV)qn37nY_DVoWgL$tYlp4{_o@!Xuut4Bfi%77^)x-diOlDk{knRI9pLn>T3XT( zX6R51hn}E&Z)C1{WRwnmsUX@=d-fS2ZPVm-w7fv~c;^N&w`zED-_^K-?aJRGhQTk zcBVl;$>Ta%ErC)1IS+ou<)|vRAV!i61_DnjEvg|1Q1c zY+LZdP-5ulPMNy{O$^8D98T2x-Z7TE#9`Oe^r)DP!tZb2L$pE_-&HoS^3j4)OoFNC zwdU|0Nefhoxbd7)#QW2nvwKajI5QUp@mY_8T9kUXf$(Jn{Ako@0d`N87mF0*+oKg5 zpA%Z{qbK;~j2TnWXtel21mp#hJ-cr2-X;P0=A^gNj8jRH_=RUJ4jsZP z4b6ufnLS4?_Uv48E=wb4-ybzez7rYMv@>N`+aN{qU4T3BNUv;!f-z&(@rI&tI!R34 zxI@;bME+KfRSzkqo^G9-Lw{dynv9Ew;FBep7aWqKaiaDqPwuvwIyz}S*6hydx#E`; zppy{2!m3L5k#gXD_0mLi__u{l(LpVc-2JFUf8k4^8GR4G0`fL>NBPRC`EQFWT1zum zzkTV(pIANWvfAx}30VI6DD2Cdz|z%Uisz?0w>!O2z7<;aAxtTle+#*OW%D-)?^XG} z)uoAj%k^Oo#LgxpN&eDze#K_$xD)>5@zqZQzxD+r&q>}lOSv0$uIfaq$aT5b)kYZG zjh9QWyLwIhKJW4B*jCAR&J=C)`r|tC9T@p)oXe&Y92NTKo}Y!3opDG!zglB2s_8M{ zogrSU8Y-oHXz*ZS@3L<967CnfU$%sOp01h;Y+U)+Gtd9%gY?Vs*9%i-6B5tWd^j00 z!Smt+BCRu~S+w1OrNgD^RhP%kUw!ubsLXGlE4~Ev&E1;OTABT}!K?Mx%Ot9s|96y2 zP+}uvvhgt4|4~Tr2U)LF{t25&2=*nx+3=rmQ<58ns7nii+9N>o8c-(^U~)P_Kr@|0 zWk8kK(3%i9p5#vppu3X%0qCX$o#tvqV%Wn0Bq0>L8{K?PmY@L5?c7JH`lfl;1o51$*Kwna4C>f4j@o&t}ZUB zWYz!HMsBWu=;0ec_g-6#Zmw{WH;LV)KQI##DB|XdCe!FXLfGyekxfo*x|f3)e3ZfJDrzxbDfw4THN<6(dWU~PK< zj_+2L!C=;7U=W=|hJa{`fUuYP_Y113qN1b%B}2d0p^$2-AfB-=DD_935=srku)o$J z)esb*TT#s}7zgf6y`LM1ams zUyJHi01F(94GPE8Xdr;I!z>hTNF~#tY^#EI2}3N@UIVMHqN|V4K_GOn>Z%A8l#;R@ rRt>9mNJmu_tD~;3kNM9Oj_h{>e+H4xSesCkk_rL=fYA?`>qGtpBL8fj diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic2jr6_nt12.png b/docs/graphs/large/l3_perf_a64fx_jc1ic2jr6_nt12.png deleted file mode 
100644 index b55765a8f57f7fe8bcae1ac37647281a96d4002b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 254420 zcmZ_02{_kX`#nmfc`%c)D58=v5h_!XOd%mcAreA{NRcU1lBo=tNiroGkjx30DVZrl z2!%{%?f3ot&-FXkIj8q}ujhG$@8`4kz3+S7Ypr|xtE-;gv6W>j1qH>9GfMKB6cp=U zP*AK}*t`*6@i^~FgMVx=KXvXD1x0Siwq;`~{P!MHCCzgb6s|lJ6nF1aQ2fJ}?*62p za1^AV7`seCAsIzM!DJirRznJZvC-u0X?cn@^1o-rX<_)v7F(r@_V}mjfBs+f=|~Fx zWs}31a|)Y=H*91QW3>t}=fY!1xN7M-Xc{~1vAt7Cq<(PMgbFW*eW1<2ldy+6F`LUq;ik_erE z{+-^>JNLc3q;I%o<1>?3>b#}vVdh=+&s=BMIiGoXa@~g4lUxd!kF%m`w7nbOnsDb> z^BO(fH~aml)k_2J(@&cjPIu(m((N*xv=nsOl0TByama0onYHtX!)eXX)NL=%+=|t@ zx?avJ_k6m;xO0h8mFs!0T(7jdcSpCbQY}+PsqU-h@^3H{%zN3_CVi)wO}1G!_w1&h zAMd4Y7iJH%{pL&C-9~fp$%shI7K%elNf)kX%sLg(fBZ!s<|F#^_1PaAxyqAOLZn%= zKW(PzVR9ARd-pqw;4aE^`Y*oayA_r>OdrQTwK-foAo7|maL&(I#z3X>M&-feP&uub z>4#GiUkSf=?fQb+dhUFV9V+D?BR2+{INM6(OVejvO}n?p zI9Sg#{S@`OkAZ#_pY;Yx+dbV*wPy|lt<^X`Sj%R-JPm zMUCOKN2O^_(X=M*D=3S6s$8!f;(D;?J_Db<#>HcGnrH6GSR|(AwsJA>b5(UFT)w*) z)3W~Z)&!<&Cms#BUo)WNT{&j|aO2g}UhbM4DsgjR6Zbx@eEoT2Yi*X-#jCw0S>c{J zbepbQzM5zF{oJ3DS!IJRaksi93G1 zfa7R~y(O)0QQDPayWKL2pCs62oRyYE{(Lf%bQjwwR>Nk}_4V09L;Ir4qMHjlog!7I z#>SIHRa?JJ6i!jnTN^e}%Lq1`x86@|wEee_`=;wZO6D_6d}D*(4zH}g`R=0im|y!* zoGWTmW7S(YO!fFbHU69YVahFfj(^;xI3s^b%lYd>tdkbKIg@YvrCX6b$=uKGdYUk4 zOce9nO46V^*TfujskBq`MQ$aBsigUYf7IAH+&8rg2iF zQG)Gzbl0_|$%*2r;(4e2uEmBSe*gQA(iSzsvhWv5y+Oloi;KJ|0|Mhnl?c(qH`PF<2QBC8PmX2+0 z)uP`@6&xHK_4MibXC33UH8p$o@28@pqhn)d&wceuKB?(pWTbC&^lstk>GWm`(V>wM z6Jz6hI++TYZHXP@XOxsqCaCvKOavNc^YZa+NzwFw`jnNCk?}ydvV$bMURFd!g}lnS zb9T3GDQ31kdi?mlURKrj@9ZKX`2!`|TsAb4vBA|LoM90WJ{cK&cn%+-h_zVf+0&_Uz$c8eC&)~U+-F1t)1CMp6KdTZ!P|YvjJo! 
z-*@@#wA!+Vo7<};O)c!xJ|Z*&)(!g}%PzdqE})m8NF9e22L zlZB|ko15MNOh*R#`%igo+>!I@)rLJnLNxyJvU++fX=?GQ4x`PjnHL(T^-?sOUS2vr zF)^WpcjpNUef+qhE8m>&eCl?5%6>jR|1V$EU+EqG^{ue_)2HV}Hjy=~N){Fe?0+;2 ze=V@my?C+dt)o>{5UX!Nfmo!#C6o3S7Y*JzZ1I*oH9Jzj`@|h72?>cqx;b}C+?O;9 zEVU_K8Ww*_R0z<0WkKV+lYL;IG9WoMHP2)1&Wjf>es*MY?&6TTJU{vUikVq+>N!@k z#<=>|*58V5&kN+{=CVn<(jPy5TxMxP|MwvU&0 zb~w_A<<74E26B&=ac#lcMD(HOi4-UHrI68px8D1;X`VE?dSLX{kM#_XK~|LY}>Yttp7*vZPAjB z{B%4jmnXa5Y-c?h7|MM*&u&mLnEiOwW4`l0n^U%RWE+{+KHC2|TEw?Dj927Z^S+I{ zI6fEI4R+>T-L~}imnnYol}&HlTgP$6`Cqgy+iDsb%sO+79VWY@u?*BKET59kgjsYK z37va-^y<$q39T9DeO+cpkPph%R_9NwTfcs^HG>WzUDeymdE&$g#A@m4zk`T6KcpHbRU0E=zy0ENE-r=2l(Uh#g*Vtus{#|W zGBhjQCAqk=sj_tHe0H#6Pt5SS_}bc*3WEfbk?H<&c}2y%8{Z{g+w?XdAHA^~-1z(V z@9y_5VVoYzp?FUcIE+UOKeuIGsQvUwHoL=qpkj-xtZbC%HQIN#ewhq^3cqsYN{;Iv z%aekJiAhNpi){BEzcukpEna5gYeB$~k_AueZ?A={zlu)wmNsI4YHDlu^6>Z!*G1;L zFWu_7Jth5j@_U^h4R_~jYgRg5)uEC4rxg_y^z(=5T-N*h`8`t(=XZ8?{*tJ$E#ipL z4ZkOf^787rCJIv0(i!@N0w>l`b;dhR^Mp0MxG1udQ$|HqbteU$ED`5gX&)^G|Am(- z9WO5-&!yr)ko`&)2DZ7mxmmQo*exI+aOLV%O7$eg9g?>u{POcftiQfGm7*zhrH1-_ zaq+dLM5N=MM<-}zbiQ*sg43Pq zzVvr)tZf@A!=2Sda|+kFpZDas5?ud`yS* z`$&uwU!I*>j~5M9PI-J-pYiP3vtM6Z2jHhQQH8IIB`izd&zcPS_ogV}0lhU*pwLr$b$ zdUrHSuOQKGu!_Rr=a+k&cNXp~%}>!WG5MyXq!_c`&CImAcR=&b#=9y!BFXLRvD?qo z55&FAl1&g~UPbt>!zuOI#o2Iw3tg1B4F|S6?P9)}R<DI5WKJFJXb#}0_vL+4w>V6aU&S~mVM1)yXGRiS~MRoOT z-H}q;g=ym}!n>zRmISb>!)0EZu6N~mqYUOdjB1ix{jt1U;{bj4TgL!olD|{EjW{9x z!NEKz*x0cCpFfq>R#(p8n4w~z7&O0gvXNPtJC7`)Xkx-e3Id|0=*^qoUFPvLbadr7 zWTzArH^$%j$5(uNo-w|kE1Mo0jx)Tl_#^%ecI)b=#|cQT4&&`X>o@JR{X20{SXh`u znDp(x`ve4n2dhH{$H(hlT+C+{zrlt&{lehQqkQwmfr$x~9}TfCYb*1ib;H*?b0|?X z1PqEk^q2ec8I?$_-?+UG^#f;gTd`qdXRhhB_7~B|7X6rw58u*zeZBVCDKFjk&O+F$ zy}&~p;~m+jQ#5HN-8Gg#A@esq06k<&`k;c@U0(y zt}HIEILf?!{hBI5+De}$lu042Md;>`QhSzubv37lil!#5*QQ+~kw$k;Wp}h>8=W|M zz4OCe%Ju5};oCW7Vz8wYvitbom0MP_0H#78J>}mk;c6P3-o*$HN!Wn0(Y0& zl&(~5;&eOd;2?^%xzdt)?x%w!@?6!wKjVuFgCVFL#x)Q3zR=BGN6oX{-Oa7D__p}B zLYp*114;z*-z@b5TLV{6&t?#&&9APVANd?D=j!Te*SjvX@G)|8u2qKs*+5Chu?nnK 
z7g7`w!otGBWlKxtlP4)qdM7$_y?zf?+oMoF5xutVL&o_uW@%T^!W-XBTGKU#hKHL< zJv@*wfA$m~M^-TZH#chV?p6@$oCwx~mX?-$x4*yt#?!AaWE*O|&<(xTl1j6C_r>l) zg`vaT`}cb{C7lj?`m`=xBk6$lbNaQV-nAm9DV#};W5_6i9~e6}z$ z?Q`u+A1|-9^0{kDh54r_2>p9RM4D%Jmz+V6RL5jU5C?lm$52 zEoMoNKu9$zz2o+8H0^Wj@yPIl7iNAH_731MA&rwDr1ig#auJZI#AUOg(581}JiF8k zJGGtd*h2s(5hUx^H@+(qkaqMsvysP2LI3gJ_zii8N2Cqa(dLwEZO<7{4*-oxu}ivs zFHIvsmI~`sg`zjwmdR8XaoBWWx_@9^VJJKwo9;S4d9E=|nj}VnODs4#-QC^TfQ|Vb#&^Zma;&j9DsfW?gwZKPR03qy>4$qK=w6gTt4$wVQ zjg<^!72iqSJbK}!LG4g&n4PWd2RxJ7i50CFw`Dq1l0cjna`1jOB@y@ur0p=(!wzx+ zIqB4?Q`kfM1t(%8OiPd3xRF*^zk70PL=Rf7>QVx9bc3QuF+IEcAwr z8yV+^(I!yihyl7b2u@ze(o+CJ4`LBZ1eP%`X>?FD}VZQSa^6P%Ho5iW_kJTzy-f?Hu5an_)}G59~2i$qo|~MtgWJ!=b?fD zKh^?t=DvQNid~zV>g}Cg#U~7mjVY(8#s+SWL7G#4>u}9$FOW zg^1kC$CpuL<5iU2E!`h&vpgLAPQ1o?5!qKM=9g0WwV_`3^uJ#3nX_)?6Rpo=6OCkGqC~T-hdKlHB z`oq2T0ZT{y-z%WUdUDiS8P8e!`EvzU#>&d7yeRuZHiwvHEBW=_viI=ojbn8(hw+@9 z-@df~gD0)L2Kf7s`$XL4#@yHT_6VdQV0E0eGiT45Rz6^Ob-nA+ROwpW=g%s4mVci{ z@35=_l4R)I5bDPDZ?A*yt>4s~bb1$uv}?tWc#j68$oSD5nKK3kY?(S)17T`1*qoio z$;pCdpPBD04l{X`^Un-`IS@kfo1UF5Z*0u|@D@$1kV(a6MgapQgsz{zfB*90EP|xE z&me+d`$0qO@s>CCKh{k(s%6(!_w;bwzJ0sg3NHl8;jV`Dmd)2<7O!pDlgzg)?RZLl z=&GQ&xQCgM63CBre5Mq54il{M!s4Pvv8$t9#HFF2^5$mRvuEqlUfynZ8UTP9n(y_% z2TM}%=*!DLpo>jiq}Zrl*CEO=X_c0qoKGpx7cYg1Dw z&$%bmtgL5cW!E8Di(OCB;zy{vJYQKS=rm~n_=zTezmQNU8spqoqnsd=_A{d!)GM(3 zPldL#!>RCR5wm2zzf)em=N9r5scs~ObaZt7)9)#GzIh|Te*D%DP>q0K^Z_o-81(Q$ z?*AOnTb-WjDGsf@LP}O;0MmMPyrF@C0kfpT0iecIS?{e>lGM&FE(toBJ9bkc6^UMJ zt`OXN4R{A>m;jKTo*u*k>vaxZUfv|vi>Qt0T!VN9732-6NUJ?tFOK8>04?ujGc(07 zo^#B?SNsx;{(_%i7Bbm@GgpB{zzYCFZQX1=kA}SrLC{rXmt<`4Bl4H_OM|ZT?X@^0 z&z|imw*vO}^z@9A=wCFF1F!Wb+v*26iTmE(O@OFr&)cgiEA19%t|qmfN0WUFs2!^xh0gs$3W)RbGBWXlN+fC|YLbh4?(}=H})i`yau3 zm-tVd2hoFWKn_(GMXd4ja~eL$R!cDyhW*EmFi98GOKoO`F2v;u=79_yq@pQ;hxj(*he@ z-B;!nuyj*ie(UtlChn5unX{5klPWkwTVHfo7Wf>MM^g==gN}ir5?IwsC$sX?rw4d5 z5rCz21yNjFl|hEtKTx>?IZwo)m(;*9UH1*-!!+2d8N5C?j2pT#AVk0d1md8B$k2Le z5RDd*mX-DCK*=E*n)Sd9;I(x$v}la_=mp9 
zNmVT^dO#}b0uK2HK^>21*}&j{`$OUAJFhM7D9H|ZI)EQSzselZ`^MfEy^4s%mpz~o zl#mI~k|d)|P0Z!iOeEX}-qZx$p_F=5>3x*oE#Y#Fp0KZ9|Y2CVYXj@JY zMiER2BUP5X^7h@kcTb>Q#c!i#Wc=GI^s-}I;`ZM|-(FkGrL|Cg%y$QL1*>sqp??#2 zN8OyulveF8Xj=ZtE7Q+U^%591@vSfuH4KQ=;)`YHIFE0%Zcd{lQn9D{t+k@fCjE8hjCMgC5M)>;pXl5Hq^J}HwH4sR{ z)&rA2`SRtJ*2#T+ErcPx;!(_&|sj2B=zb_}~x&Y&7oy`5BqK|j7OZE*6c&4Q6 zM;Egf)gGmTVSc~AAM=;-Kf3z*=d<+0tK!+d^=UV^Z@PkW1tg9mGgW6;93pVax>HG1+7mtZ9uITOC zo9I&zd7*0uviE<~E4*z=)Kcd{j4#9usUt zzd6TQ;v*xg(Pi~K|Gi^?*+V+9D^n<^wE#3pew`Gae?awQIKhGxoE$FWvpv>11%qkBr+za3!FetrGoW99wm?c7Fw z0~Qv6Z_>XomD*+rBySV@)=?waE=vbgl8fcr$+p*OA`xBN*at6Oyhs=tfE3Ht^xy62!#I`Tx7S-jg6K}0 zQHG4M3xL%9S2fDwAkG~4DiQp0E;`c|%Yxj@K7pttggz5OE)^P!7* z8=Rb+NZJONyYXr@zLOs@dPx6uAkM`3T$4(!cM%=uG=jYo6AySi&CAOpbR@d;Bh8FD z_{Psq#X}+v>(f$Gt6whG%#h~^+ug-4936;1=93~|B|wF=yt>v>$Zx%1%{Hj8_>G>4 zsTzHQ#kbeLo(|@49KRI^7${`kz$Q1vh4P6e6xG{%_Ysf*%_(PF-n-b7plwf|yJ}|k zC^YnUdz)AU-}#MBPV*fvAEma8rt4&};2a!m?kWik+#Yf0qAZ{YS~+eBiQUv@9e^z^ z5mgtBO}i^9?xRU-$~e!9`q}$LC@wr)K~b?v|4;cw*|kA)zOfQzF!AC1+9Z1`NG`Fm zv)3cWJ^@H0);DrX@83Th9mYrNpR68!8Df#Gtss$@nwoxRY0&=#U?u3`Q;ZG2f>_ll zzloS*c8N;aQ#GO7Y$xu-U}wW3Bj=T`GaNj4kmcxgC6EC<&-Eq~*|sStXsjMZyrHL# zW1)1vl&-~pXX%$X#1;fKx0o11xUzzwAqO8HU*n%7z&JH{L&XR zZJ@~Bi}^_1EhzY-Z$3n!#?bG|j;29w>sgJ|!Anq6ko9L4ewzZ0Slg;2W7YtE35tm+ zWN3joeyE%>G&J-uEX*q;L@i-S3=G?1=GY`gm>nkcYFflP*zkdCJLm)}M zU~389*U2_Kck9+s3NU18;Bk?nSryK=G&kdk8pWD_O-$I{x+Rb|eqd|Q>#6H93fJDh zdH0UVWu%p8XrDiecJkwJO5ItGLjNsvXW98hR@NC67193yt~jgA&aSRv?FN4OU%ChQ zAIub5B3&miOlEa&B` zSFb9qOd@LR;@|q!Y$KWn#J26(HW0A@*cK-88xLG~$p-Q!=7dKaGUED=`+{W?fQa*p z)Go;nc|(|t4?QqMa12lc1l5no%>c_Z05@2L0}x-|ym4^gmaK6NC zvG8Vu-q5dod-i-te>nyGh-a>^(b33k0|gbn>%{Uys7`3eto(HR{T^R|Ccdrp$BnPA z{?jo4cARvMvq9U6#*Uhvz58vkmt3ssB7m>QeGO?dMwlV0UN1Y2>;s-o#Eu z9kpgLGc&7nm*C=}%E`$gEFNHlj6EyVV!WN1I@<)`0y-eK`R`7k@0M3qzT6{Cf(MoBt8x5)IPXQf@q*Sh>56e`Fpa68FHnUl|(EILcvk3&~?AFt4t zUzJ9tE&uxUZ3SPGjN8J)xj6?E);Rrxvq%p_T0G6^uGO7(So5PgR=u|*7F$d|ro|`* z1OfYx4cjCr>^BGK_-Jvx`@zoUvv&eY$$xgJ}<5 
z*a*-lH=)HEkg*P>1*5jz*w|Q6I_vCK=qzv~a1y%pRoCl9p^74&41ZqBc zE;y&_YUrk_-_1Uvl}^##uHJo^esIvZh3A;#)~^NU6V?674gSYNGGvX!9*$6BJ{(qi z1!6#x`UL=f!Zu;c!93@DkU{1qM;F0e-QLbft*n2e=Y1_Q0{9oO6puc9IGJmIMrg0z zj18ZqSaRC7hNNO?sqAy6iW{S%w>11{Q95()OxXPulty{Ib1HrR9Q}g4>mvKV?A@EU z*l=9^t?031cPAKva=sNBf0y+0r`)y6)}HRKQK7)l(8)Xg-Eq>FO}W{e%GMRRocGdt zesD5G?*ABIh8WPiDWFRJo}L!`xgWR^*c=R55;RWWQNlDsu#6ka0Tzq|ZIJBfi?vs3DLf>`!ry#WGr=` zGidS?s|(!3o$&Gg7O=68fWRSA5zhfiLNX-oO83zu@4Nm1{a>0G zQD`G~W#9X?)qL(83PwitH+c{FyWhWg_~@NSY2Sy}R=n>^?y&P~dwy0a%cu|eFy`<~ zYiq+?um1yDw)W8$Dqs5hn*}!0^e)S^BN4?1ANoI`t?33tK=^u8==u}ad}x`NPD5-5 zTX^z!TE7(_a<*ad<{di}_*tGR1pmy5piqsaWy>q`qDq^(>L22Jt>tp3VA+|K+s7q# z?W(!(sC}Gj)2&7h-M3n{%qRYB)_X-AQ~cJ^c;?C}b%3$GzvjCoets*oHUES9{h{vc zLS4Q06ESZOFE8U7YU2}2f47A8eJ$`d{a&(Tb_8m?XZ}+UPGJ$_S+j+)OwSIgJ(1fy z{?DUx*=$6eftzQPv$z99F1i(IdkBJ3FzzIxJE~bdWvz##CnzFfTINXwltaoc2z1c8 z5X~6pvzF9m^w@P$G|zyxgY-t6PqO{?=;RQrNWY;AEzz$(;hJ5_!PmQdF`Cao&!I#=P&AR z4xWt?QqOv#9`8C8Hy!7J0H?NFIX+dgt@w8Nmjv69?YerhYpXf-Y1A6wN*j2W# zWw!1vE-v_RK>$2RDTb1t#s&sHLWQnz>UFP%riWjd0F_r1+5#Z}#>WqSwO06ydToPb ziDHM7_q^YCkFf9~a1#h*U)8iEyfShApc$aFo$Qu~5V)j_qOx=6PB(N|uPocfIO`ui zc9505Uo-ZY&-T}ohmtT@94QQ874Hit>xKp+QeK5iI2IQGW4x?ssk6ztNk847=RFNjE*n+@6eijc1J#x zhk=0sW;Ef(E?@=Wt>^<5MC}?J7?6dVWwa}w*Rp`n6$CBANQI7>h+og1J?pp`%w;3P zAN~Nk7Yzr8DwKAB0n&jJuHxatld#wsJxzF_lX>yY&HZS@(ojZyhC!AQZyOpo^dqOC zj1esrUH{(W$DiVx^6h`9V#`4}269M0X4wE`ja@);LL1`} z5ZGC4s0LDm{1Om|6!<1$bRqtMq$F;*9)#dYaXV;i^ zdaT8wA%R$9(V4Rs#8*XG8D)W9Jse@W`DQx+&q0n%gFh?P^fmx9?)6}*6!g!36j%h$ zNn0Q2hLzAXyTEUq-6XJ&c1u}V*$bo6I5aFb(IFH2lJxDP`me7W;~BNm)%(N0Zwvh~ zs~7hI=a!gx2zd>t9n}RZ;bjnlaK}U-P!iiLqs6QkiA@YE1Ra-0CF&%YxQh^}!PZ`d zwojTSZQ*7+yBs@i=iBA!8lDuIh8%1s1mrInRsEH6*Hemg4=)YX%%FCZzGFar*)_s+ z$!4@!A-%xy)~Z#szrXs~Zv~mt;BHFWNB_8jC?GBna)u%HTUnijo&Z|!2bvvlo4ajn zY_KX-0Ltm-({?hh?Q8refoe>>dpB`}S+t}EHN7Qj7z{>)`iJ~Q2*_4?BJ2Cg18crnRyUQEz$Wu$4Jo4rZE8R(-YuJ_6Pj{iEVHz zJH{cd*Fc-*L)!xdLorK7#I$NVu^8Y0KhVn}756``Bly|CC=iyUe`LgLXzqa@JukZL 
z0pQ@}f8*I8q7vFHcZ`EpDT7}4vptKIRCTARo~!socv*?F;^W7UByxJ)W_LghG3o#4 zO~_~*tkU)u1s)I3fdk+i^%J06`1h|F&D&14V<(~68d`dQ`zH^?i>`h zWH>L#$E!R&npJdU=|rWKoi1#0)mT5Jvu8z=bBV(S>2tS;8EDLPAHcGHBAypyNXYtS z=r<4mfFq~CZgdvear3Ihxec=G6dO`+e(Ov#(!ju#SpQ@t#cce?rrYx$>z{h+O$AIG zIi!;A@vCk8PmKNAs>h#;?w6!SUX*E#C# z<_$6ZXrfRqAX??TeqGhk5s4K9uetdy+T5>Sp|yI#-ubujVXIDNTlTP?u!smTp#i@g z5UiS=nISd=1mDpsRU~0T!$s>?2ge0*ZGcCn>VsR>9K0LEb-HP;UE(yMTzlZN!XC5o zJct`YIYgugUH(83v$h=i5B^#7CP&aR-}m(;W*qD`Y1=^j6UQo6=fTcQCI5baJ%IOn zzPmUXqYec^q5`>WSnO&H(1Z?`IIM`(lY&<@<{9)PVlg2rQS`|oBkZZcS%!mh(WoI3mjiGtgAqSRGMQ9TdBZqmS}u`p_p_kvki+D(0T!nc8rF@l#8Ui zn3ba~-@}1sT@kgA*froa7SX!^#$0A`h^~BJ1|0b?PW5$Pg0meOr&kuN07MhPf#wU{ zX@wMjI+$&*q$CSTUpd?-GyoktdjJrJ!7MjmvoEN)#ne_ri5U>4T@|jp_-tg69Xy?K*Zqy1F24y zo_BaS1Hu*+o#-YiJmtt}SUM9(0Sozsg&rA_(kT5n1w?H{+CSVp-k>JkH2wz#D9@~( z1)SZrp7#>K@gMQ)Z4bkdr>m!j(v&1SzZ)bws?RkD7YRGx)nuMdYI>%b0d^bH3d|K8 zkY6MKd$Gs+QM>Z4)-}W502k>F#Si2AQ2l_e5MCmn!LkRxn`P9T8ajTwwE7K`YuEg}y{Ql%jooR=DJF2P0rE5F**sVWL`}L& zh@d7pS;*I*oCxSSCMgLq9gZ+WdG_7>-E3?scqB3_F>~UZDje@V680m4j$>LrJJ}y# z00gpEkYYJ3Yx3ViBMEE8+fZoY{ztAAo_Vq)SEup>`3rfeIY$=+vVvH9UcP(@G|2%!7NGc+ojV1d=`Fm) zOinT=0H0U=i9F)sk#Jt+-TI|>=+L2szu#7y@P(c(J(b#WN@K{DAePz<__fXV0Zy(i+C$gR%YOw zNA(PmEj)5P@F4wG7`OG48~9Ixktjm#O!x-xJ7T{YD-ieQoP|Yt(>U@)F8+#(o16Id z&?16g%9U@Qj<`aW>fKbySx8)^U)2(Jjssp^EOr$$a{En%F(0rqERZJ<&gHOlqYq4m zY;fezLPEzl%+F>sL$x@|1PX)I+|lsBK;SfFN?KGL`MaLFo-Y99a{ zo>0=-V`xz;Y|pcNF;D-1nc1_JEx)WsKGWzGS5$Ayy6Wb>_uyIGo)T$MOD(q!U{HA+ zF;IGac&Kg=M<60G7z7Ixv3USve8JQXoMgcU0?4XNICe=j{>UgApW433f`B1)6=gS7 zBS}#uR6+kO|=u*#gZnAfWDGP{pW&BxFruECTxv z&Bhb?VWw;o=VcrE|~Dt@juN z2{Pk=F#|{gzFQgine#TrnPplOe5JkgX-4v4-7u_?-uxbFj`Vytw-)@}$swf#HkNdV zu&!*lzbhM!AW(oWz%N{GPnmS?piB>8yNOK<^BIXBKFrV0{{TXP>52@DDW`A=3pb~a z6zqEx&6oaF=3=Dl=%X(;M+QTTUhbSL-$ivQ#VsM}!^wmZ$5l7oQbQ%)8{G@TdY$k2 zp_2ii88;;=ApKl6HI;)^khuRRCnrG$HpHJ$14u0Q*&*V$lQLV%Q}0pNTgQXM4~oV~ z>$z6#fq&W=n)?ZtpqT;pN+5J9*u>=V69G%F&JrfWC;Bpu9oGA2SN@NKtol%q2Oy;( zp{s+)hO0sIa%v}fT|zL=XC2TXqVDqadWdA7pNQ2dJ%pC4iQy4=@l2UpU}Y$)s2Iv{ 
zl;z^uABqN!SfoG>R70)V+R4wruhoaHiLA=d5DMo2{$oEV44T>>q-LU9h83teO^q{Y z^~H2#R-$GG#4qC90_vsBD1d_s^C64pJ6v2{n^9e5^B=7D~2I(oIMX-cu7QF zR6Cu%lK0AZRDky(ybXM}gRt+Tu?)KZ4Z0khBLf)K!Kh0A(2zX7oB=2Dp5(`9f1YAT zW%3;jfUAHV_aH1We1SaDt5muABXz@VS63OwSYO{wFj+d^c+wFD#Ao|^Kmo{5@yeCt zrng{=QSmOq*MSd71fNd=6SH_Fpf(0d)EbAzwD&SBOm#hJ%iQL1q^>vrLfvDU2JwpqG(pGp8JfsiU-@I8U91F}!G&jAStv<2bHElpY`t1dq@QdnYI2@lc7EiG@|8AHxlpe{liHdRXt(fywhn z>~MNXlLoOF$d+`%M6W043`d8QAP~@_O-a5Q$w0LXL5im8^BejTJb*^ab#FnO9ZQIK z_>lCc3#&_0FgFW7|4t*s1KJ%aTNt=r?s$gV^71kneA%>%BN>Gbge_js6CEjB69SXx znP^=0lC~TF1-ry;O8pc7nbs`*Na6tfH+PK~fgu4C%}d1WGoBegAb^Del|j+>>sS5a z?2jGeusJ}bBLo$=d;6uo)|kT~a0H<$3eZ=iR!RX92i)X;d`>K!Fcwhh?OKgZPd{+f zrbh+J-S(a1_|XiUFRW7~IxX;D3*mA3I+-c({K6qeP~*g}U+Ay~(NX`vfJ-tg`X~$= zw=;jhkCM(E4nKqU&Y{?OA=fzzxLfN{RJI>oY+-8$Thff#g_)Td)aWa#%ZtQK1>|Eq z|CV0gh|XK`1uO)_C~{KuonF=mST|UyOSO~@;hVxy1(){X!&|7hgBX@VvwJb`3R%#7 zh*3BonZI&B7IlzJkSS1&?dbSd*{bE&tQxbq+*d9`ld2G9h4_iM`1!Y@{sH6E)YWkx zJQ#p%@CDN450+jaTJ#u{9|O<9YDFx5=+R&~g_;a!wiKdnMa10hWIEk}mrt4=odgL6 za71R^z=R|@#=FgIA+-$~0Cdqo>_q4}zd&qImj30nvI~L~8R|nDk%-pxl~u=14-XIU znbuL8cT3#d2b|uIr!xU!nw$Lo6e2Zgevtf77UA0848RJLeFe%hIjE5RAX4Ss99EWd z_=_xy{<&e#E3TRA$a884dXN?ZIXA#MJB+pZsCm?b1ePBa|tKNzgek$NH#)aa~t4k!I9W~*xufL5dEgwcbPQ~3}i-wP6Mr6 zeZk1x8{=(uM#{O*+AN8W8?3p7K}uR${`D?lFX1#Vuk!Qplsr5<@U(lOt+z-1ASx^v zqB3aHq`d&DtZZuwkBVZ3-N{5fWg{~t3?h#hrGh{}R|n6jYSO@=?MD(|+r`3oFuTj( zfep}fWPCfDq6zx*Y{a2{pogL1VS?GkZ}40gDwTup5c}(^@#!+Ya{-cm+*|k6TZHpM zqU}SVLM`r}nlds0OS^WIP!!+aI*LeD*Voq*5)?jOw~^a0JwPbthq(!sr1S4?SsNJ2 z2c~R56JCRbBxyK7luTEXQDg+p4C)=3$a8W!Ef7h>LZk{1J~%;pBkKs_gc(p4LpoR+ zf@ARVU~adwOVY!e4(0wd%Ki2fN|3Y@D1G>Sj0-yg;W5Q{MS9?e?(P_v-I>>&fag(8 zAd>Z3G3=ydw(OPOuWeMm#EgYEP1nuk(#&{_-*z6ch#79ZZqB;1=hMr_keT5~(#2DN zx7&jTkkl^p=Zb;Mr?BZ;-A}H4f8#>qdzwFdV0=6vJKM&t_vw$DOGkBe-QI6ZU z1;Q4(wj7K;;7F?gBtgy+d65lf7=SPd(SiU7jf(Q??S0A}Amu1kT4Uec+B(Ns?*M95 zY|SAM(E|8$Vdz0RmkrNiXbx|ZFoMivNz$@UNNisz*0ZqReBi0McvyDVo zf(r-TzZU;6O4#M!1C`rZL^s2A0@s!&I71AF9Wp9mBUC>f3l0^MIBa6pl)hquf)8V2 
z*f9KE2XiqgXXt9s+RpsxFj`M|R$wYnmF5@%^s$C>Vf!~Jw}qT9w}o$XAHzV`c2nFW z5DHt3_=t%6BiX~_pmqik1vn~|(Q+Gvl3$7X1*|tWFS0*Pt>AiTcMc2&AS#$8lF0P{ zXnPzcIzur;Cna9rP*!#ic;0q(*@gUK5iyVD-<$B!h!v6-n{=7@G{H9z(hhrJidJf) zuXa)sHArk&p8_q_jTk{(CM6{$82#+cxR}p_bGq)n=#3Q3#i|C5|6HE{LqT31Xv{u% zh5+Lp!Wn@N!0J_CGz4n{9EZlD(*3!Ol>5IqAFEzRCtt>; z;P?ed1O~VZz8Z>7H>yF(oZC#JHGno5A0-K~sOT67K?MYnGmifB{CqVmBScCtsoa`m zjlq#mxAAq^Q>lQj>tu7raoWKSY=pN5l~K=tv*kppy03PEI#JSOmZ!@-T5bMVb06Hs zqe7vtU*ve!?}Y$*uwVvg$S9+Wh=biy&VqZADQ!TLl2QPOU1~d}5UpaikT=|tYkJzU z<7JRMSK0UP#m8(iYdk2RZmOyVmkA*&9fX?s%KBRbUWCO=8qRFxl$nCUlZ4Jd`}#i@ zNN6H?tUZ2IK$QHNOwfJAyVej#)DV48wy?yoBZ=b+0iX+kPZKi#!EGWJyTzR3?-+x;2;Mtsav)`PbL~r>g;L&&^M$9Tg zV@Mr=Jv^-7;4pXa_z#pt%trK*0*6IY1QaEM#uL zABm))b@}voDK~CSCnJW)BXTSMrdO9fGVvcgtF+}#8(j2)kT=MM20$0i$Ww@`D*)0& z=E0ER`<(oLhmtj~B@CS5;>vzxbZ5~U6`zbImX@AiJ$jv-S;7#aMj;F-`+bd)n%>{K z_a71r7Eafqy7b zatj`R#;6%E*bQ!}qGGc(+sew`Ln;wtaC1x^5Y=tMs;z6GxmR#MVC&IZ#;n0D9$NQ$}72j~M!;&$b4J^-7g$EuWa z$|JNqFO5nK_v~7Q_ORM`>_rGi*^$x@6I~wl|K2}8s;Hr%LGIolZaL_W_pnuzokl2}yDH?xNgy2Q|b*TNo|HisEtgPT|-w$U&O^wn~>#yL93xf0lM{H)f zN(LLjSQ`xrpgZUplNc+Xn%hh()^BWIM@BW zy%j=_y#Rd+eU}lVGLe>z`-x>_=2k)z;O3$*;f95UX`w0;Z?29GBZk{AT(|&mb`LyJ znFbW7RM0ObbFc2h$qja{AG=L>DI$y5oR#@lLey#jG{!dgmtR8(iKaOG>Q=`{GI#3V&7 zKmh82HYf|8A1Q9#GaPuFXWR}{jV2Xj;A`VGC=<9NCj{98`p3nb%lDdoW1?RT!Tew; z$%SugK{Ez)akUQ7P{6MNnR$aWfQ|zuc;p1#Xp3e4S22s;)wZc6;y$mc%5(m0f^tOk zc!CO#r9v1a+Ybjf`K#Op)nxgw*w}h>=eT}=G+0=X^DhnFn5Skz8zqhly(~q6NCJIe zVGKxiet?D%owL&L+Hv?lb`+GC7^eH-JU%b17<6@fk; z4Zu}QK0^A(9$$QS>+m)vfgz8zrFi3~$Za{On0S~kmIbXZE`}k-Sqh%!L%GDDD)KeF zLT_yQJ)3^V2X>b5(U0Y%rlrj;O_k>3@)>ly_o2xGZX#&$H8n)|wjdSQK5p0A3-r{a~n4I%>0B z?`zsdbUqj{N_1PCQLN*{2t48l268Q2*nug8@>xD4S_BYCC34{n2t zgD8;;F>nuef=dtyX#&|eu-s{~%Iv-3Aj187cL6oW8-q^&e2LCH`L%fhK29GS3k{|v(Clw@kB6HMA;upu&g60w%oZ~Fp0L_LHh2W&OIH5sKYH{C zInE4Kn{X<`mW{q2hgBZ8dw@SCxD=IV_v2^1S~xQVNF#oc0XHN2!i`Y#z;>U1waZ-s^wSkI?=KnEwhE%AcbeSqSLQU=BYE-Y({Kbr(GuJZpoe0aaNAN1G5 z*w}Mmqj0(^L2x5g@7}-*d$Mr!_6)6m?HkwQRZvMu3yE{Z?fZxDOz0SpGatdbMMm$B 
z-#SFD-Z*?X2&yQ8#p~U><7hY2fFW=rMo!U@QO>=gN?TRlt|NvMglZBl)A_oJCiyX( z6Rd~xb8?)`Tnw?$JAQE+ZGfnL^5h5mLK?@Taf3N=;9Fp!izHHS$!9pxF;ne}1cQ{J zh^uRa)}H=T$Bhj5*Fkh#ikt@EPf>ITO}2fv@O^Z)(`jDe2g}+RJUv&xmjIuT7eFo`AfCvgm>5M+_F?vh+{8rgdO(MhkK2EU{zyioq3lBTAZ+D^4I2o2&ar6X!l3DL)$^m{ z4wB^CP?T}!%Buvb_5j6JATFxLgBUt zxSY0gNJSxLK<0e{3$YuM*Cq3papnvm!$ov0Xt$zV3wq7o5L+&Ma6CNTQ;T0r z^w#QeGRg`>KnB*;4_M%~51=VHamXAusHAj^rb6mPtv?U12#BYErI)LwGuc*XJJ!-U zS|Jo+{eEM17-*yCIcK?ev|DIl>6+hW>&f&uOu zMgv426Yhur-Kt|}^vKVT64yqe5W}NVj%M59JHmk6q$lkg#NW?q!xST$u&(}}SV(bXE3auZ`A|p^L#6mz9;^&$BsR6rRJd12LG;rXe{YXLR zbINCMS0&cq_sop&p;?T|^6+fLxqufClDH_d5;h(98EMhrrvjYPxHblc|Lp4@hQ&o! z#||!tpTe4{fd?#I`OAuGlhS{W4@QV5antegX>?EQsHnp%+e}Q`laR?m=e5$xV=gjF&E5%FW5y1279^q$$T(u6&-rbTC6Q z9Z?uICzDiWW;%%+&w-V2VNf_4)DwR$PIE!Pupg}20i(>m2HTPii;Xa!Sc}@P^5^8{ zCLm{%i|NvLXl7`EGYJ89V2%g}$;bt&$g(EJh2y9PDPJW21$o8oWOiihwaG~rX*2pxhv*9r7vKoKf zi5=m}-kKSQ`t}4!abl_ybAM>4V=y#_hRqmvGawbF2#~D04w} z0z>AIy_fX`8f+pJ6U!SNo$0GdxZwzSa%+J$ zgmSbR=>0TYj4JNxWx?DqZ}g4NI~YzcS0NB-|K_LEaMp7EFS_14p6k8;A6HSbDm!VA zJ)*RP${v+HA~TANtb{U3Nmep4%9b*cA|fk9w#XOv3Yx=mLCwU%|ZqxRTp!LTG|SMMUZl9*AI4?M$?FcTz8B z>JqUebYcFJNAPdKfrSai@4{U)SlvtEF^C$D&ySK}8{X>*A{!?NnuN||pqW=1L#}QWU zEafo2goF=t3!`ln99LH80a4*}mvsHTt9kWTz+wb07{XI3NA(4mf$5_?LYyh&{p?-! 
z!!AQEORn!dfIFZ}q=So7Y@ZocB!XOkoIo-scmIZv7RRtUxqrIyeA-8G?|_xMfNcZ@ zYi#u%BFl|>BVNFJ@neC%BDgQ43$+siCXx^ZuT+4UeG5hg49DHtdv^&EAmg4lsHYQ4 z^Z=_+1;GA;nIC^lK$D4o#(suHU0z-u8FNIS3;MQvb=eND<*CQi9~#MMoFX3(WSnD~ zu8k&uZb5@P-2C!ob@&uCKitD42HF*WB^GO73;=yI1<+)#o{ggE7M#_$@7!6PD2(?v zqCzn_kFN6{lq*C}26+!Yea;t!fq3y!L%n7mRq(oLRrkgX~c}vQ@ze{v+(UNQYjq~2vN7ack{R7+ws{nbiOENCc`+Lsb_WJeo@lvdU zvW|pwd=o4TA=BqR9rgXj?(XK6jy8n%;zd&ctAL5&_F8lM6@b1I?Iya+TRn+&v#=PX zp&`Kx6?6CQV_hNdAM7zIJgwd>21s7R{CDIV1&e0^NioEm#jxnEDI zd?2YqH|6L;*+R(P$1riRt^9i3`pxWWVgC)EKii};d%Py&`|TOFpEv$_E@gdF+=fkC zXc)59H^qqE%FXQ`YOt!3q}=G2pTG5zNr9TvSsfk!1yNJFZ{Pe}d&~OQn2_YaPs{#hmrI73VqF?~Vxv6;;roINtfQ3Bkd@aX3OZBCJimN84xl8agQ; zei}I}zUZbS8FhSJi3}aCMLg{oP-iMyIO|%!AoXJ%j%?^_9qE=N27ofD?9@jY12(Mn z?jWL>k)DZW65(2hjbb1OluPk=-XDSgwrpQi2x4$`o#jq?2SVoh^Bk~aHdfgq#rk|7(<4iX8#XXMW7jAk^{)%6oi0vdS& z(P7rP<)jHDo)7_MLNYG_*WxT@LJx+-Ul~1d-4_a4G^51C4E4r-9E<~}T#ZZg@4Br1VH(=ucWh_;QLe|6SAXvt zl6m@+_WKb1j{bQEuCEK1Yz}^SB}loc`qn0=UmUM&)_Ho0pTFJzj*p+V<)WP5i`E+y zm9kWog2-8|Tq^b6QRr}MzvXR*lODb~Mt+v@(HjrhQa6rwY~y~pZe7*d?B22H=&32L z({bF!$_tZn2ag;xQqj2F%NiuxltH}H&|PdfzDN;wS_y|f&O&4k5zGvy6FP-Tq~G8! 
zppYp;&Kdkq+Kd%RDtNDm0RaS$1!6;TAOGlp*b(;tPcGEDYR+a)>RgHb9A`bD?SNfV zR1L)=3=$(WhFoXxQ%LS2@2LTTOwR|!*6p|jYz)mT`82q-8~k$6g@i%!3FL?y`6>7e znvgD|Z6oI%T3A77a9+X>L9RTBRlqnzC?e>8h-rXurl@37AU`oE)01psjF3QSgeO1x z*b(#wun-A!2D=Qu1;M_!9P*0u(MJNKF;GF^=Zb`DT*P~TvwS;D zk(2tG=?Mw1q7RWHt$d#;jvlL8oOW7xK6Omw^j+I%Au6zGPP;YE+x?Bv$6^gg;oG@xhT zav^;sK04=duUK?JB&!%01({b|o{JISAL4=(=#3D4y^D=4#vmfq9-;3<&7Ny2TIkcr zEp`v2Fn-Pe-gQF75&Yl&(@9`K0tg$x%shx+3AGXdVDJDUoVHZp0RK#FKV(CgqYQzz zwzahp0Reo`ACUV2O)WJYox*?0v0FI^cM>jqwUI@1W-GdtPl0RM1W> zIEx@8SyaTauO&xt`r=?az96HW*|CQ)qs;~b2|7jsBzkxhX_+kNoO=!IIt^YI|4ALb z$nU>?t;j9&aEqy@Z0o|I@_*XhE%I-``0mBK%q1ZaNxmFTeCw`zKNfVFG`0{cDiG$= z(30RphdhF0St7zn7G>YkvKn@|#j&SJBxfGy8iGFWVTv!Ep$)rxd=u47`Qn(o9=VTFG9Y8HksJ_K$NmG)8m0_CMmy|Ytgfu2jIqgoaKNSJ@RYX zvCm=j!S6m8fbdQR3sv&c!7@Wgo+w^G6msw)t7vFgw?9^eWQZhd;O9duORQi37D1Cr z;W0+m6l7{mDsPn(hCG2NmAuWLbNeqv;YC|Mf1f@)!8Eut5GZ^^kQ=nVx1lcw8UzYy zhdm228EV6Jiybrcm4|2rDAd*8-BwGB00IO{-l{g!e#P;P^~=|<&|wv7 z8HVGj?DP363Xu?r=S0NG{}TAQpMj&e2q6$@yh+p!CQNAXI}Gfyd3dNR>9*P*-o3l> za;c9A^%IOY^O=#6_v>T&+}?HR3rJqCsx8HH3qa%|kXJ$~LP(ZpQN8hhfmYLx+ibwQ zikHF^MaF%Q%{VpbQ?!tdC

cA%BUvB5CPO|_Ly;`+6=q^L!)1U4MW z7zc8&vtzsU58+t3d|$%t2tFowo$!2ExQS~|eD*i20T8wX_YAL*MeGmQ7&kqm6A;=F zE#zNv*^eqsx)bA3;^P9`?%aO8DAWH8Y;eC~)ehq_{XltrnK4#*?$xj~ZXiIZLBO+;h6En9p=9ZqPuW!1Fnb|9uPe1NF##$U;WXca7est-WJ7jMx6em9vcX;A z#fujynsuQ#mQaM7=_~#=ia?HF|N>F zD6x5;@f=xY3TUe^r^2}7DJ0Rf&7YKLokU?ahEsiLOt+_c8?cedUV6_8%%(X1L*5B+ z1cRK>w<#3p2pYS(c7eU4o!!vh{zUn*FvbX|#|EtpBoKTY0zwVp8T~%qpmozk1gARZv4^SJ{Pp6qv(lM*W*obA2Y*$ykUPYkwMisa+0vYTlMl(}K)MPsH0kah zU3_$E!#Uu!Xs!T&dGY#PaP@>t7woz-fFO5Cx|ijBJ2&u03g5}vItcD@GzJ@esh;8H z$7`0cjY5scQ=p|tBrIEZ9bAJ)RE_v!7*X7UsRAspxjS1_kni&l&Az|rdfjAA7_i{! zkvR(aQYb$$t1LjP1y~56kI=vnO|7kVW16<+_TkLLIm z2KyM}TmV5D4mw(|4kbUiHRzHaP3uM z!iQ<+iKPTO7&sOgmI~k=29*cB771Gi&#^tX@;|k|mTRgm@GVl5!uyQ`?L9+|V|d{r z3$DNrd}%P=DlG7+AzX(3(+YXoxc87Oxi(qsr-E%%IQmn<{X_#j-Nc_y%q-Ey$pBR3 zquMC0xk4N(MBA}`20{ej?~`=|2mc5A@rXrzoK3-f96!$ zEu>JjJnZk_zJF9TKvG+?Kp1coxQSnJrK02~w+xQ$BT$EiC(zIwZxP$r%{W4Li6DxDn z-ei&q&`r@vjgS7>!a!Bu3s@pTy2o*kmR_pHbLBqV3&}-?F+Uj}0ggiju!hhjiNiFY zZT#>}&@6$zmT3JDCpq5XbfzRH=lWZ>9y!@$80c&rU_DNos^}WHM4aRB8Oaqxs&9m8 zU~yn@n1U_>ZFM0~Sa1YL+y;*21_G{IZ^lL>#S}a_pQ8pp-(AV6zL99bfh7Dp??HsY z13&A=2$QS^tmt=nd zAb=Ssbkr=+`ny|wmj`%4ON-Fshpp^)?xHkKQevEbfK&H^jZOBChf&U+#FrN;d#T1t zq>cxsgzrg&vM`zX7oud(QAu^>=t;!pKx`$A82LSd6t{DISL}D2gU` z1UbThR}D_Em@^Y`0dY#kV?}yUrF4qe|CQJG4^P3Sgb;Aakj%BDO!`j*oL`Y;88}~4 z@VH$Y7+hyJAC3(ia;T#U;L!(s36SE8&!3l)N`Rmeq!;j#xcw3c3;?>nLJjX$1;a8j z7BHCG{pZh~G6zfkVr!;@h;>lNt#}aek0XA*yj6Ww2plIoctVJRd#@~*5yX-OjXkjk zgApzkU?X^d7;o($=L1r~dPcUPwoFz+5V*#DHpo0kC00 z6*RlYO%Z-A;DD0TF7RYZmvJrXE4*MrO8bTO9EkXsnMW2x0{ujep3GiG@WDMN2wxCO z3gl|O@P^-0Q$XuS(ZiwJT9lT(UtV68u_bfFKfeCg9p!z*YK8FZ|8od&yk+vAizc(E z)lI3S+NP$l%uZP?J~H(n9+CY^rt??&Q|HFCx-C_c#Y|9CKo(8>=y1nEdOvn61^^n- z_JH65g+2&sc?CO`zz!i+eh|EKM%_^O=hK0D=xb7mB8rAscuk=$#7!j;CNXtvfgBXwfnOaKX!XPUUBYCKAfk*^>Sr5Zx)H$`y&Dn2k zqdM$=J-)B3=ItqtyXOmx9)ON9!Ml}sU~6!H=V}V2(5C_84;=OgXnP5y18b5sI0P|a z8p(W^2W6k4XdVQiLA-yXqm^R5E;=ogu8RO&5Y&kCt?3$4$-D?E+p(<-UpTrut^j|7 z8zLd&n_;%!8UL@7HKud6>YD`Er;oidFYp(0LCJ1l@VTkCfi^H8z@+z?OUE(lK>rzD 
z12`iUoo})WWm!9-uYCD;Xdusc(4<|=cQRZkV9U&X z_%M3Wwx(V|%0PQ|5s{g|rS+T~zAwZsGb7=oTaJTW{V%wcB$k`>N+kCQbcVgZ_bF)Z zKVthTUG|RAE~cI>&PLBRx3(UZk#s zf50t8bkx|E#3%x0eB4f2fqZhZw>MoFw6(Vv*snZwzKZYD5w3>oftK;h?(9!yd)?G} zUr3;?BUcm@WyI0@eK^OwRqMk`&%!nhMK%9UJI*Y;KqLmI<_E%GKUrd42M>-C)z4>qqNiRKHIgM>Sv_ zhYB60N4E%q^TFV%g4X@cn_2sN79w};K4Ti{Y;SKLxNV4*wlfw-5!yRNtpqmObthLg z)GV(HUx=4>N(TrAw^?GI}xAthvB)nliA1n%;>* zsoC6h7DM>GLOaeT-qy}d{hC}~*b;wY>Q~#;%DmbKS2GNMh_5UNj&{I5F-Zou=>p2) zaA&>4NdYgiKv@C`arp4>u&ZyHo889l0BL=s$rt?U`TvrED(#^>>A0CnF213$5#(D9 znqll$610m0?wfu2l_E&NWRlcCGsH}*uDyLKuuuZ;9c_!mT-Uv2_cD8#0D-83XH}#I zp5kE<7pRLD9~0H)oe3s&XOCz(S%L%-(ViwBaZX2tvkcglkd{&Xg4!9Zyzc_&t0?!x zqZXgl|59@4=G(z>xRbj4%T^m3yZpJ|^d9a#6$;z4rT?JS#6p*u2(!49Ex!H6r*iNhLv?EFe^%r^Ev(SCW*m6DR0=`Y?9aizu8CySmNg}fzH!pIdO*5oLa z&I0&N9Qg%Q2edvyGoCau-SB(TxZ6^8K#v6f5a1d6(*%tG;DV~?I|C`HTzS>8HHCHlIheb7zu5du6Fr-)2hh{ulg9%JSmbcs;~98@(}r)-}@izN0bv z-?GDeqw?`H*Gn>Z?+dSQa<#nyDq5e%{2o;P31=r$V2nQGqR8Fk%@N}VR{|2VgKjVM z_xJQL4`5hF$Y;k0vs5`rjR5KZ{V=OoZALV1&k^4aE_{w*Wi(I{ z({sHV;k0dnK$kE+OP0T`g%1u(f4Jm6MZnfXqX!&Yjh$13xXb@0>DrnWE26BA}(AgWh_xv0saoeye!If zH+gRYlpn0>_b$etF#YTPq$F?r5a+Z2B;RBP12V@4!FT^Lf^n=LT<(ihLygU^onr#i zDvKjK43gBIAZZ=ZqbfiX_?I;0#Z$n0E^z-dV>bU0I3;mM!Mu%(?Qql)4jE_&C!=0u zX=Dc6t!`=-qFD2FgjfIQ#IGpK*&`utKy+u5Sz!ir=)vIsf%fveu5KNaPB2>}FgL0e z-gh2q+5H~`)&FQkODO>v%Q32Y02Jq-%Z#~#Zsvn?-#Wl;B7cZI#o+Q5=nQx{*0Hu~ zDQ};^9uV-+u7f70!ko}m;vtW1Fo&lOn!!Q)B+cFn8a00hO+|!o97c=-wlkfkVs@Bb?ch0W$Uw}iMmn)mk1XSFM;?T zwUeia%BoO)3WqEOCTsY2nruIV00oda8b^^*%B3utOP7)@hCL^~c<+$B`O(OSTj^^` zsL3Dug@p@U&zLl`n!0t+r=d8*g-D5`hQ9#DyZ;nUOCJfIxQs)oynq zu;=h5E}~Tt^YEGBvxv?(7IZAfZykY!WQ^d0aU2*_v$+ps=lv*ViJkkaEj(tcN|>g> z`|O>U%{m}9uub$xO^Se&8#79xAmNdlb+ZyR+xrBCm?rHq3-KnzY?@FgvZzX&shYVk z+0YkQ+6f2=bw}&-B4`~^zh`7*(4IVmI*AC8!@|_fUGRHAQ{kp!U;bNpzi)U*D=GbS zavVIJEudn=3SVEWZ#_OQtNfsux;9LTd2nOXz$IdHT11NGSr_q z@xOU^V?J!YafSqckO3lkY_pGgP}65#27cR&Ck|uQx--TR(b3+;PE>IGSSp8{$12tC zNZSDb%>*V!-$B(khv6rN^aI#+2N-7W-nrAz)TCJY1x8?)Bhc0aqcSJF1~}7d{FIaS 
zwL0he>2o=r8ass}>pB$|-)7)#WaNRC*iSN~5TqhF z3Zl~{)QM2;j&wMZjcOi+86BD*iJ23H=-ct|M!u96iYgVaQ_sdMJxO!-&h<~9a_|>; z)$kg`A61Qf!a#?i!o4_w3EY}nl1hL7=r3&ZErWkPcn^@=E&K_*`Dta>u2b(Gn=OAR zx$JbzRrHZm=wU#pph1%=u77B#0YW*}A0GcBmw-2)3YFY0@tgcQSCX}FV5y4aB{E{{ zN9cJ^=aOcmvpekpe_WsV@)~59;91L)LE#gFKzWIop^l3h?Z%B9CQifK$3JoJV-&yi zlw@HN&MgoKbms&;wgyjNBI_k>s2sp2nZL-M)%oP|?i|A{oN`h< z6<;T|7uz7&m3q&%Y@IWSlGjFQZM)XIOAn46NmmV4cS!WumnAOBT;@#E>%O!4&!$f$ z9#fJdqk%a}mDiaE21NL07mmUV91{^9d;e>MxB<=>stlNcF{$M2y#qszyde{9Bp+4(_n}cy`uNNj{REFNz9t zYY-JT1W58yuYC>F=N(22P-hWrcCT#I=#L+;2VQvp`79(#SUubqGr3sV(rELH=(j@% zb4_Z;9@@6ITG`Tb4zpI9x6z-hS>{;eTUBa#z`m`3@800TW+70uCXM(g#&M$@{|X0~SY3tF}5X-3AJZm?mu( zYEYU)ZI7L&)eGm<0d7PLi{aY|c<@xw9wuhy0(gpGZ)1K;^qJtT0f~Sm2am9@`VC_* z{kv@bndkZy{O=I+RX~DCK)K-g%)RmJ@Xi19VAyOZwTUNF&kSlsya1Rk^UhpEZXeKU zHn$77?k~bLtD+A(tr>;1F4&BK9msqI_dnzS#D5+4APUr9NPiNaJEC!}|I2Z!ce0y1 zz6iY&WLNJc;Ty=-8DZ(f2PEzdC|&>`ZG+ls2!l|h$ZZ2q3!Wa#u#-^PrS)oqS_yfC zo7Bxwe{X_)A_HkWga)k5Uya?g#1GP+#t$S?W^!BipJc0ZEY!hTc4 zin{2EU_qAGUrKfLDOiwR*WPlem*s&d&GsKDd}6b|6ThhCZYE0`Q>&yU58Q*GvddX! 
z;~YUJ{D|gb$xDXnmL0^a`@j6%-I4TL5_% z`wQH0Z=w7yc~t-&RU&TPKTnV10@6;6Lv-r~SfTc8t-@_X>}iOJ0O3`FNrE!cGjND< zwV^{}wmL(T3WM^ddig9+^fou+1atqfZJXb+8B%u41Lz#$#p)ExUA08FGcn&PJ$nD) znJWi2Yzk^CcU)hK4|NzzkGS^ z=YtIP{kGTm#v{6lmELX>%KOJYxr>3Y&JHMHSt!QQ^8A;gLupU!eUK-18DE%{0IbdgIGKElgk=QbolUtTQ zYNN3|V3tjo<>;YB(2@~m56T>3pPDs(4u{#?+;*@`GP!h<&xgG>Z=hAfzQrHJuMqy$ zc9vtcgzyZ@btFrY{>VA&sfyq7uh4@>6QzydHU&nC7)Tr=$V;(ER1Bm_1wRwoE&NNb zo86wx>|Wy87P_p2sY=lC6y$ocgK$R9a$L&(7%b7N$GO8bAn15bbv5$4?~8lxNYUnx zV~tf7k3Y=Z@4lbTtlY;(Hzsz!xOb6gSe^AJP2b!dtaN7$&63*(Z(L`n5z^foCvR*l zth2#avEgls*$JT*Pef#0=kX`I0 zH~tiq?c|N(a37jDFy}IVBMWu^c}sTBjdWKac{FVI3zan3JeuM03_$_V#_D zd$IfHetY^CPaP6-q~30Bl+YFGyTO$H&r#j7SL$Ob;QtDxh37n;YO<$DJh^ zA)#npT;}n8DDZRA`2(I?w>4azYiG~UQZhfGlg|+PAP#vTO7$xj+Zal}Va2k{9J$OJ zp6Os$K40inQ)U65l@$JiTwxyQz1EOihXULGR~zGCmFW9#!2cyv@7HLj1~->tGrZV> zzvx`HwI$R5LbXEZ0nwgYi2Tw|q4_@X@Tz3@63Wcu2$eyfn=yWFpZaq;mocyr1`)Sn zy8DKsqX2^wA_z(Ljw;J>1x4DUOCce%10084T%@Y0>m$VXp3+?}7aEipNhe%hW}`E` zI@Ij9(Zrl9PNAHouFj-Sl1A~svU{wu?%GY8IBYd{mD}I&XF61WboQ@RbYbB=8Am%B zhRvJ(VWneQ{A!59e0yDzhZ@7uN8@0jAMK}?mt$T>1Pc|G`YS{P39%gp%SZJN^VIK; z4aPr)c|0c?U&I-wQchXE0q|?a1ktu>9lsScShj$MlR<*S$B$v&@i= zvWivKuGb|L1f+ZDutm1=as$igqR&eHB6sm|fP=t!-An$0(=1Rt8hvAqd2LSIX1cZx z6zm020sE~#R>M+k zogYrT_Vwn-j-T!Q{oPyK+{*F~$qfiYUd1O;O)0HsLH1#bm4^nb4%D-4!X6rQ%kfW( z&-A^yl$1O3$!EMgeJ{V#H~9q7+vN}wVZt-7t5{Fl#;{}CIsPa`GE#r~6zj_yQ+l&} z0HZI@S?V=2D=ERXCgTy)?%gRd2?q-7uUz36y#DL4b5c~)wUVvd{209QjRso$wiiXw zhcI!*sh6`D-f8uT6Fu{S!fWYGOGBHN%RLj^(D%+q-g8!lglVZZd|tv$+V2Mak~M-M z>0xLACn`x4fK9zoB1uoK$s|#qwXD2;@JgKXm-`Qc8?|I_^Xh{9;}%pU63e@<98_)f zxfA9A&b|sBKRSsF*u3D#v16KZ3G(uhZ0hO*)!M|j4GEGwmz8m?(LWpGbvv`4kRyv<4ovV(bq%m~v^`Xo39c8W=D(J8EeiFru4Qmym#4 zX@jVsen9J%V&>`66W0-xrwj{O+D)HP3Ia{*6mSfEkaZj!q6QyQmj?Cf26;LLUQur{oT6{tPK|^Ubf3 zN$MAJGb(D~=rP?le0M?_Eh-Zd%yOU#Zo8`%R{xF&ll33`TwiI9oJsn*aT>v`p~3Jx z2R1~YJL2ncpo8&SdD=g;ySwD_ZgkYd2Wg>4XVDs#Gw`$q({2hl!&D| z8qE%9h)6QBB5tz>D35^LA#<7$ZTFw`euBVaaUh!DYXN2Q0ef91GuHX(Q>mJfv$g^% zi*r$NT7j0d5a4QL3e5CC**9$l260$jC@V!HxX9)E;T+X9H=3YE}=d 
zOO~$}|@HmW3XhixvcxGp=^xTs4keNMLb_-fN6o_O_5 z$L3UWpT4j;J-fb5Hk9FA6edy+@%QLUkkRp?r)Liulb%DTk?H}DBODH7DYmvwWXirJ zLK;At0$UnAqBxoF+`qUlB5=AB|K-{d^Ubppf-SGyjd+W+(C2|H4H9ksD^1_@Vy%hQ zIE-K6p!V){bM|;jv~vQ|RQ95of@zWolIvl(j2c?zgFileY-}Kme!wc`EzY$+`Sd9_ z#rrnAWmMat7&egclXeHSRp1h@{f)z5|DbdtT9+6j*UVRb1_98VAYvT-2{HK6u+V`4 zC)92v02;$id=spt752SU&D6mTT`f%l;U-2MhR9&JAd--xUQu6? z8g$@1F$fIfqP&Soga{aj_^@cdNz&8JiK|x-C@sD?l5%5~3B-|?FH-h%bK46DI#o$( zX?-2G94TaEy&lemW2bHT&o97%a@n0QfFRQg_;4g=WCRd03xwPdgOK=k@c+}%1KwDP zMcGZ@`$d1r`G?xsyE`5xY(KgDsQJiA+xu6}j(vSzVNquDQJkLD6H_$+;DuIiIXny+ zkg9>sCl=2h#ukJS1F$Hiq5>4W=cwSY(w8rnYFX1WFoZvR7#tBnk(YN{(xJak3Reku zJWrf{Xv3B~oU4Cu@C6bUKqlFbPMxTy5ZkyDnOjgLs7n6f3KgGuW$0n#a-3`5*vzB9 z5nKu+ToiQ0>2DFLkOHjA9=1Pc#WE(&2jaE`fPIo)OTH=a6q{azqP;{;qN1K26COfE z$|h;}#A9V}fnowEI+pwg%+47IZ#B-JSH@q$w=5a%J%sQK;Y=Z>s94BXmkznR?=8(- zACousme%)nk=>vy+UWa-EVtLJ{ym|3az)e&L4%-s_oLJ&11a)ouw{rlCuU%Br%ylM z;~{Hp4aRqi^UuRQY7J-n9e+tl`{-v}#n&d2>osJ?LWo-6;S}Hz1G6D9D;R^4_8Z1_ zbm-k-Yoe^D*KO!^5nMLU-&3NpxwjpLzfPInKfW_O{E;f#!e2M%ap8sWLx!}6babXm ze_stp))$m8M3DlTEJ@#i>&g`Q!utAISF?%07!!Pw3_gQtHqu~YPyEe+vmPd2`}eCq z(qLMA1-RIN3-r zB1oWsCb88rGyhl>x44XQ_dM%-uenQ9UNp;ZF*oH|i;fgvwq+jS5S@nl)92LDk;af$qRjj)Fg zn=w#jf(0f58xPKYBh$w3lm2`UkI z{C=dw>7{0eeJX3Sc@I%kWT z-%?XW?I4HkH|rq5$IqXckpX*-I55`UK{>SYL>NO5M24X*Hbrz^=?asALiz02kbLF! 
z_E*}`w5;~HJV-f8f>}zVqN6z!Xi+DlBbERQyYmuLZx%`K096)Mt);f@}mi2#Fd-JHC^S}SUhQyHUTPQSk*+mo;QI--l zq0PRe1w~4VDEpEnLL`++WpANH*<%VVNF{5_kflVX`rRI8uIqQs@0{QFukUrP>vOL8 z4E27$UeD$6xNncBsH$d%Nv`JwCs*vUf&8uAVQXVE9f2$lXF>XwW(c!p+S)P_JzMpy zMc;P#PS9|AK6#BPrcLYC`t~msngsdpud0Y^QD5(A=pDaVaruZYG8KsW8|Ka_JL@CX zic`mw4^hmmEB3<3w}7ocwKuQ0RIzZ#25i3$ZBEq9f(77i5CanAlcnW=UAxXrG|f&u zViG2v*?-+AYJaaG&$@a|EHMGJLsGz!m#4!1yN{Smv(criR}sq0nmgCy$!qeE+O%)K zmFGpgMfS+ke+BDKMbKCo`QXQoqzTnehkp9S=bljdxQk?dn2u))-`!V*A)Dcn~%=i{w!f5wa$4{ty zQfl7B!q!&Qpl2f^muF`sd+eBGY}`yDm#He2av=D`+Bhy)aN0Qm9z=<~5`wDyOdBZm%+40L_W zR+nplcIQ!?-T3fnM^`_|ikxn}CgXrvf6Q1J_l$3NA6xnE*6d53^$WHrA7cP4i4L%3 zp7&>X4E}WvVyJ-H#o1_iA}pl7%+d{aO=Ao&STFeLeGd^VwDb12)T7o~GE*8b&Nl}; z!fH7DG_Eutt#@#@jLj}d`e_p2nyKS7fBvT3yQiO66B!#Tx`}D^OQ*gcHECk$yXkM= zl(y-w(&^N(P!Iq0s~R_2O;X#_<5w|h%HUlHP+*TV-76N<`Yf{ife#O!II%k{OgF>t z#`x#o(^Ew{SX5MhST&|1ljBDmqJZP?X|ht^PKi5~yDHIoPfW~*k>0%~O={IjVfwMW z!ts6jC^O!PD@kSbM}rI$$I0)CHfncqRcygxqwN%U46f-_fpe-DSZX86!o$O1b!8`g*JFLv0&Yz~fRgGjgl4pUTg|yKti6{+X}ybN1}nl4w2qV8@Xo^G&Dp8utSI6tl&WBhI+4!!?Z|C>2je zf4p*OV=J%w!hoLWo*NVtq~myq|2$^9v;FT%lzz!e#sv8)yQ#K_>*DTf8NK+{4wUSA z{~WGqQgUO7v$HAiTVE}$l!(_OmQB@zc_88|8^<9+&PFg!VX2A=gw+MTdbObLd{BA$ z3bd=s*IxTN^TXeLPmWUQ)qLl*xvJA!ee{>%V_0?ioZOI(J`okxW94>mMm)#Rq=1&g zXaRZb1#zz@%ncf}I`PDjb$vQbT)g;?;tgMaRq7~&>1PmMfyaWxKELV+4}N`P17?)d zW_`=g#d}=?i9x#cwD@+L#Q}{+jy%6^FmhT@*GpOD@6HAt-CmNK{$Z-A>3izVGu{pL z5;7!tEHmbXQCYNW-@eTA76v;6QK_L7qlMD4wJrKy`~1nZ_>io*-?h2C=fgmZtInAv z^PZ4P=d4e#E3l08`hMiZ35F=PoZj$kx<~UX_skl`ME^GujE8T3ex^&hZLNguM#W-r4!=jNofIophng?=N1 z0!`E>6z>(kZZZh0*DMX0_DTUD;dFQbON&z*9J|>CXbg@!9&4lG6`r2Tk=I?wAJ*#| zTu*Mcj00Yr=A$T%UDDGR08Ml0a%!X(7dPw~U%~rntfZ8Cw5_LZZf?>9x2*0AFIw_& z^Hs-5KK);u;A>Eo5~h21q@!3tB}GYezH2Ls9xmLgD3yjl2i9_?0`~3uqZihu_n(nH&16E+Ooo= zYDaIQ_3Ov58BhCL4B3{RZXlf9wrvT$uT>MF?EdcPQY>Pfs7~%+_v7TW^(YH-csH_e zn@QN)(wUGFZiz8Z3~5__%p{t*q;}dhX|gGr-ENpg)q%^n3TVQ}&8}UZ$WeU*1nV 
zQTh2bSI390eYz^z($K~A`r1TQ{j-C)`3@dFY|j0KEPmemk+8+`+?do&T+e(2#(*P{ye#z<+jZES_@lu&Ix0%wb6K-~_5HUfPw_3v<37I&CheS*#k;0hCnaBT z@ZdOycOwc-$abKbT~IRrh6a_^caqDTlCmVZzSrJy1{9^RUf1TI4yu?L`SQ)Zailcu zQtr?q6rB`WjY8LxP=>3S-pi6@cgNRFc(($97Czap3`w-9sSfI5V@{Bp{XgzDY5lv@ zrPmF?ZHFGo^@*3{X@k>H%cr5$-{Wu6p<_p+_cyk+?lv=8#p`x@>UO$8VFMulveH3!14N8#r)KVGFO{H@g^8Gp2c3Yv5=)O23o4{$t``y&LL8b6o zMVlf;vcZoeB#dS%Z2$0&&DNcn+2mXeQz~i>cm7kUM1QffopHR=&+`|{7nGiEYLE<@ zc6?*9NlXW%yv5$iSLOy*ulus7(7f4#;HXa;{rmQnz&hh$ue5Yx8;3j^(0}vmJ0WXV zteZKrHBn@Zh;wZQq_9(&*{)^207f+;ztu{YT%vce5w8 z?LI-ZP$P=w@c7O0f^@_Fg9oc37j1q1chzJH-V&(w>APb~p2Qq;Dp~L8+4|h?swva|b{|M`O!2CdyNM4v<3StSfmwMd&gd-j&(UY+kk0cek22`3|F-I%fDcH2kB-KcAiLQo2v; znK{iCkkb;{SgY+yZ?;~mGE+JSjs8!icSH20?-w-2O~Z3cz38Xqr;!tl|B3mo(^g+R zexQ%r&cAjhMx|#+B;=Vh3v}GR^whkW9oBlwPt%wt>(-r*j}w9JQjSkSj@ouyU81Es zd;bZFUOi)fji#GFy^LhTA(5{7Fcvx0%eR60ErMEi@68{uvT8ij%)X~zd~?;X4F4Y5 z9eeitc7SG!J*Z-%S)!OSE{dlYdu-vSlr4A_poVY6J8$r`sRMa z$&@dNpC{BG`jD~d#DvNL4R7x^MA-IoZsyWt$ZxfclzmPOct6uEY}&U8J(BM9H}>f_ zRGw*c8V28HH~SK&o9mDPL>AwsJeI= z)Bh>AxS~{|7@oUA=Fx$Nf#kv6PDEfscbXJa6A=~$43+45&-ZlC(w=P=G~aE;*-r}( zHkAwYy?K+Fkkn)d*`Rv|jhI2dW^MfdL)S|-ygVHv%CHt*1|ODo~h#E!?q6r3hFzOq+y z-L${8{f4({LlnDaEA70}pmaXri$^ParN-9JKKf``T`}r7I@kJ2@_?iX7r!2IygFf| z+1SZ%FGPQf(fv5V^~R;ohc4z=KeOBTHu-SrX3Mdzo9w$J+5gqSW}oE_t-A*s?fLU+ zqiH|xto!=?P(#b)TkFoItd4D2`kPHJSN-s37C(2T?k=A6LC>`4c8aO>kIJv{7b>?5 zXy~@S!sbcK@cE02+}}-W*P+8O6vF-c_ZOGrNw066!OdSoybpDK)VL%?DtAdfee|d` z-G;OiVhkaR&8eaeiXtI#ecNs-$)3I!E`(%FX%#zi-A8M%Hpb4pVmTwz@JG|lgx+8*Tn|&`b?7UTttPFJo*1rfAk|S7Nk`p2D3`l^A%{cvpKBN!W zdDpe4v83Mj$7e}&%WyF{a<;`^d1G~;_l!=RcTbAW9kgAe>F$SoF3Gk7YaE3ms|Pm* zzD*ZGOPP1JIp?8>(Kz|aYrDH(Og=6(j`4DdLHRTRdvirt1fQw+E}GTpW%%lw3OrA*AqSU%7z)73lcwp6>AK)o({eG@DpI zE@Y?o$NZlFMavaCp7eR>cu?~;o)sMbXZcl;Ke#{7;Pghf zFZDjE;dC5dw&Np4%}+xBGvBoG(=gh2zn(Um< z)T$Kp-4k}ci$R5U>d~_Dx;2!~$Y>?J;_x9c2b~^{SVVwE`He&PjPW7n+n$2E@j5#@%(Ll@2uX()U%9MaDN0~UK zelEChpl=kWYiXgo>^92ztUSRmca?DfjSn6=qmzr71X5h3zmpUIbX*rU7W;2K`?@Cf 
zTT+ci!kl@xGBSK=Msu3Esy9)Y)3J+*VSQnq?HyoTj~1wHT{3v0JM8 z$*|lNl|OnIh5l|u4g)?T!!Yyt5KeGn+S(oIsMS*5>n}Uk%wQ03E68JYvf1Lriw_<; zROnh~4{)({>(-E+T213`5fO0l;;_0bB2Q7}l|^`1?`3P6EIQS?dn?3B$Tstri$kUG zat!UNdQ`AQNs08)0#sZz;W|RfkW%8wp6GrgZJzt55m9$GARplG9XNix1?g8r_Z>iw zsFt*Bvx^;xG$#O->axiw7=dCPZS62QpM(3g`<(=!bimx|0=Nlq!uE5{Z}m6gWwRx+ zRXa1Dg4t$$nyUJ8wXI=_OkJy&P@ycLgU>RfY2HG4K7)XaAHK*7{`S3wcNEHHJm5)mYPO|Gs@` zFex|L2E@}Xg&E**)1AFWZ;SU})aVe=Rw*0C;N7D9lB5pqk>#bQ`F+4R!atF}aXKj} z=+r3<;I_?Rwjn!vH?4U{RNfERjsyU7X6uN>4$H&Nt2;iKvDjpG4Lehu!~l^;M=XCl zJz{V(1FsdXiUTI_#%|rZh469581;0+$n)p-p=yT~QIA^bu~{MM_6*!4b7MNNYKF!c zyZ6n#Y5|aOvnwU5=JWmr3m^4uWU1e(@&Ekq!ax7_|J`C_-1yvPO2%LRYY?jW+yDCq zLuQ)G6t}}6%|AO!#Mk`!J%}H3yZZjkoAX59az^b1h-SGVg;dZqeuYO!?pzFdbf)G@ zUtJ^nj}S+(r#*fQr)7dSVPzu2f)IlO9w>^YGGV20(V3MIj4!Yfy+c>46@ciqiSuBe#rsOz?Rb<`|VkVMqS<2!0upTfVT`XJdjL7M=Eq;T0Q_Zhc9 zxc#~Mn)Pno{-V~UFXq(ljFOwjP!-4gFP4B<@~K&wbEFbx{!HV&`J6fTz!LC+C~#Wk zUyjer%#`TSJA2jh;}@=|SiUetyY4_!E43+nuag_TG=nT*7d1-yox~(3(b)o_(SE;# zwhZAzU0#v4s`koPrYh*|2yoA9ak(OdK#Ja{RO2u7_<*RH!5N0fYG zDzKe66~%7H5~EHK7?OP|Y(5}{%n9HOH0M#V)}FFwiDJ8fM}{WR*7L(iRL@3yKytMR zM5)NH*|BTan?rxv_JGZY$hiPgf~eUCA4Z7x?sSzY0TXf3NZ2?i9&h`%B%{A*z>g58 zO)n*Q8OURH_;fWkfx>r6d4Z*Ic)NtC z$Rvqj!-iR1SIhLS2&x)ks=m_EQPHLs=?7y3$l9^vcP?|$ zCh&2PNWq`n2g=Q0XZ7pfUj}lE1PVzi)d&merGyq!Q=>4KCl2mS#TA+V6;xCXXUQVs z=8}bj5M>)g9G~hvV%_WYwG~7yt|I1N%*K=$P#crA0=4gZ9&o9|9sC_$lf8Eu9z%iB z<16&M*Wh&O!4At>wHa0QN(FAYe3%^M@eOtyfgRSu6iA`>Si2yi&;%oOewxOGHT>Q6F-D^LJE+Wh(T&mKjDm09KHUFrbK_!v{1PmrnW!CHB~zH6rm%xnHi-I z<|YIXjR!Pl;taES>U^>FRI_sISy))ev`%s+TXpFo>SSQxX~#zG#u+KrFV5)EXf_Fe z2Ei zSXAa~*Iuo<+17LlZO8$WsZ+y1SC*qtdc0Yw`59`uetLSg<- zm)x5#(xo=PpY`OqbAzf-DNL1bh({re-&bnKjL!YIDhJi^Z_F($4jnvsa&U5W-n_@G zk1&ANGR(xkel14#seGr$s9wT2ET>{qKcs0K0M@Hn{h{Db-o}Bb=;$!Enh6j3@Z-mi z%j>R-Xk0!B$H@NKHa3HpX65`CbUO^~Z5aqnAhS*Tg`GJwWyO`bum#6*;4Jzv!X)b4 zxr0UP-*+ox*#%O+pMkk3BmVv{xbGj42?N8a2u?XzaS!5%y^c%kS>%L01~UCq9$ALbwEDp#8&iXBv zUHwPRF<5V=3?@W&wM)PEX_{)6y^xY#+qc11B)O^N_J+tS|?N56-*#qt) 
zi~&*}x)Il}k6X@Od(57fCm4D8^6a?@E+JLyPZQSnV;ZzD65FOwu$Zr3pD4fd*Po}j zQTxCR<0-F`p`2$Tf;~)DFy*{{V+@~}E8K)_nQ(C5J`Fx|8C&5odE8D>Ry@_mX>)HW2k$L zQQZV^Wk)+lk3;?Zm_#uRM1NC5jdll!T{`mOMf3gx2OcD>V|rOwpltv>VV7z??^v9% zkKv~K>?Ys2{BhXMlzLugS!6_TfLvGdx>)X*04l{6^@*#lTjXSjw;=r5yd);woo+lUR{UCMZp0Y;(w zp=82t_$pU4Y)mEU9?AbqZs(vgmeFCv9ZV$_=VZHX-3pggPS@^vTWf>@R0gvm&&S8d zUDAarquCb~3H>LDM7z|q_pqz~8x6;vNAJ{xatcthlir5Wu-svg$|7N5unm`!1Qq1x z2QqesU0Ytg0Ysg(6^4C;l4qf_^IoLx6gZr6v;{uA) z=zKKus1>TSu9l< zWyJ;n#`uJ!1&R0`q>NA?JqV+`GS{)xF2bc7#z0cOacx>Hy+((}kR48Yl$0bFrZE!m zPIu2;93{Uc>2O2IP+%VRuWV@|ak#AnEvMK4V7deR#XI6GF~-QpQ{%8jiAk^wQ3-oU z4uA-UL=0e+aa;4r!r-pBsb%0EqwOevIAV-Eo!fkTSpB>|n@7UKko7UhdJPOdJ!(%h zXpG=hNa~)$qr)w;i=WEP9*W2b^32OqEX@2wNuk7{4fJ>l$%Sp!V=|XTdpB;Jb&qu@HB3ITZz#C7!je=g{xO@ z6E*>jC+~yD4Z0xlN*Hlj8BB)g#wz*)*C|I2d>kRE7s-|e-R zTq++F2P9s;e;|Ztqwin4}c(@-m zPTf_`GPZo*fde&ZIE6~yr=&1`{CG<7v$3(g!Fzc7bwdoOnW+KL-E2q>o<*_z9rGio z32S9FfVt>M7zhW`CrvYLk7SkLY~wolLswUjvkBTKA3b?8YkbS1Tep;uc(HL17>OYc z@=NFa@k56uahvFS2c}okU=E$1-k3|(^c(Wv-!njv^6aE=M zvOsCYC+S(&tjSV6t}*X?TN zqG5n;eUXoRJs)!UG8?4bgg^ahfvXMYfB5@vITqN;42$~L>y(!o5AMkK%K6zyM_E}} zMjxXn?%S`Q#C%BWNS*n*tjw6wKmx5G4YO4@IbgG7M0P~0sv((w8ce$|l8ij`N zZ7Hl|#3t?Lr_Y~zfU@JGL`jtiKQ8qcNC~a@sB$XPbu7c9fcPl~&KxGbi{4gnnX6Gz z{qJXH-uvsXa2^4ry~Fb`j!Br)00st6>m5D(eL;SHyngID8qK%dB*c-(2o@SA{to)+ zc0Tf6_>FW|)s74++mxE>9STf1CF4QU4zZb>2rNx#!L4-QI3OuYxD$35h5pNFS9P42 zt*ba_zyM}Cn%mgyw${owjXj+C79*D#ElAwkQJm|DNqDQ`dE?&n_4oJRSRaf|8vuSG zVJy;`(hE^L?dA+Q+vD{s4tD34dK{7(qFek~ipG>*u zD>I%7sBoX=_Mo6Hi-sLdX(`Ksb?RmnX3qIqJ5p_U~lo z3-}iAU6M#;K>!=`+TJI=4-Taxr1)pstSQtvXC4WJx$Mewoo1b-eHt`qQh=sm!~HE| ziRWPg>wFkz8B-z661Sj4_o0sQX-F-~?R^oomLO-IfmC%BIkFM?cdYMo`4`EQ!^+R5 zw5KMOriHi3$?eA#$~ChX%16eQa?SmhddzP8wGzE)Wo6d`R>`S|o@k6mRbPB{q5QS9 zD{fB_P3<-^M3vKJ_q!ywFs`nm$;2P2F|%wM0FHRZI7-R7B=XvhkI;GOg2o`{@2Pdz zA@-+>%*HmN{-ckQ8nE; zc<9hc1@(9`ZB6`rgM722_jo1lNN(4cM@C35D}!>V(3bM?rJv;tlFNmvRphzo#@|r= zT>p4s(XeNOvYzqRw_JOV*cz?EEefkUEOO!)GdPXF?vK^$Ia~Ov4J#<6`fG+Rm$yi| 
z&wdtrYw#Wff7RN^osA1}IKl-#ck0z@x21%BQI?|OQ=4;)mRT@DxEL+i*GYwBCxoQ4@ znujBDC<9Yv>@<|FJQY-55F%*%Rz1<#;C6nUuF87T_zM>#3h(97rNQ$*zq%y}Mc|Ud zxF>AAsx64Y^W~$jo&e{kwQU;OT@m?p1sJtF__tWufCM}Fw-{QQo}%Kn~F87k|O zQ5%+^7URtxVo=rb_5&=3(8ID8jexR9k4ZxuJ+%YjtO|LP6X~? zNtB_|+s3+uz?H;IjzE@swPAmUSuyY5z9lX4jOiU4grb!X>CZvmOECJyy`9033qUZU zBqaWpnRJ})uOxzpqmlBD(d`pC8uw+s9X)pJaJmm#_rbyI-*skDu!ANBXaWULAbcd$FuiO>UjyL3f#+&5sif`gP}ujfNU1`VoU6zz&&+p zf}KB>#j{aS2ERN6{RR(?h>9{fIX@wVEg>1X|E7pi%Y8z`^c4lA1Qt-O9>Hxu$jyY_ z2TDpy?^4!`29jb@3@3M9WCm>m|4(^5<~hOHO(%c-@`bypgX-e$6Ik!#Xdgs64IuuS zhzlsPNOi3_^ZfB|pd#iIqW4_JB}9qGO{e3=2UWm!?vd5$L}1_@t{fclcPJRS(GWn& zh*BIymZerm~<#n`LR~M-U^&L9S}x72^g)L?kPSOUFJ?IQAn(DpKrDmy^ZA9Qa7<1 zA}YtI;rfx<8CmTk3bI)zH}vO)BLc3Ssdy#-EvOSGJG=AA;A>~i zBdanTcu?ha9ubz&+xPC3Xe|*pjvGhGu@E_e*tGftyKm+i7=me}u5JncrQG6lqYIWV zf7Vx;ly~R5?1N!I8u&-=-t;Cq*$i4|@Q#G75^lu##~YghxKOVvhbIq;d#e?T4zQI5 z({r({MT-G6|EK$d7`lUiBKzkKIO%PMsXTfXoFhEHdw*u1eo$ zKqkp8$$duTB7(uB8L!Z-?0sEQGPAPZr>#E+BcPQ@JxH7pH1Q1OlY;;-jo6q^0^Idj z`=i5(71Y1k`6a~O1CbemzxwDsoBF{euoi&f77|JR_1R0)qjO36gP9&mhh5v_I{(T3 zl12gi233+3KysTR_bTt5yvKvIB_dI7SeVR#k~~T=(NNC` zsLz6%h|Uj{EK;Iv;99)li6|!-aG}8h;C{??NF+JI*wl0>SB;FGX2_2KbzDdYrbD5x zB_;?+SNa!r%n!h&vVwTYW#tSL-Aqe$y=mMPBr4v)oJ1*uui6mU&-qiU{vgw#Ofzu;cLA2`+5l%y*m_X6ho1MH(G77e!21;$d)gUW;wRtvM& z30Y|_Ju}|}yKM{J5Nlx*jS7|Ra?Bf&z)f3V4PVQxC^KLf5!2Uinw}4x)YF7jy?Crp z+lK+r2xq#@1UCf+(5O+_$zcH4Ao0hn2PhZ212zbXC6ewC?Y3pnbOAm(tIqK}(DwQfaASGWsKU2X-u5y7 zJ3mfc4q0FUnxecoyS*^BocCNVZM$}T06Zhc6*xjsb&>&><6ue#trQ)!D7Z-UIRh!C zWW9njwC>ai4WkmuUX&ukrD2xrNnXKxXXj#|7K1v<&9@9|e+^D0*g1D|Hp80gvR|_9 zWJU4gi13zBN_s0CK@;fk>ABvL!YBDJyvEk-<5cAwaWJx$?|{I4CtVlr<5!?z!PVU$ zehU{bo+P=7`LS02dZ7 zYr?Z5eeVjFynMza})?y(>hJFq6Nwf$44r34>?K z$Ur(`8oQ>bc2T%;-J&{Yc`v0JCN7)hox#)!X{R5%-qTI z3Bqnpi!U4sqzb3tYfM6@i8?VO2bn&T%qbt@?0&n!02xA!4I5^UCqhapFYsJCj83F2 z@M)s+d#p}R9t)%|;cnz@0vahM>7>BhAyJwD-6(gt7=VX!IY?Pr7*GrMlC2%Vw!%_c zIo6t!h^1M5=e>{TDl9?-u(gYEWKXAJV>Kv!qX^5pviDdri=6W4# zh(Zo**cgKlGn9`Hki?&J4zwjxk@n%&xaUEGXsv(St5>`1#Kh^<`ed4r$}Utfj0`t; 
z3$;EOjy=Xb+0G~&>w>YbQq^*LuKSj%K%N#+UkYR3tzk_KRA`%*~&`h)KER=pUB^T_^{F9#bJ5_3UVo; z@7=vSK`T&#F)F>Q4C?w18U&cE3`E2c92beH>K3)*!lg_75fP27%zE&EwzwT|U|q&> zKT!9hx zJXlSpF4A~0);OT5B2Z`aXzN)WC)DgaosiIppXK0Zkf)`1`q47Vu_wx2DgwYz!PQ;) zR-R=UIh!DE1PYw+iM{tuSa_YQAABGH-vGqO#^hY@lwiIhodq;}e&bMBCt{5xVs+=v zslK~=Cnm)#0M(0*b~xcN7XS)&VB^E@-nJ2H6)e35%rKn!wbj!a=5$<9P)<3Cp2eZu zg?Gi56Y%iOn;iQw{r(jXNiDQ+;hKlbZbldM=S$`9TDk08O}KX(#pwSLBW%v@yM1!) z|F1UT|MMq(EdE#Dua{<955}v*0r>85=DIsoR+6sF# z+KXs$-F~yA3zpFn0(B5R306V=mM;y(*V53vEUbqxFk4u)-t|BF;3(-ibZBc@P&D?L z@DArMUw#9V7=Gr=7RJZ}h5!vO1c^Y((3VDjZp=FXq4$uoZ#efAUsBZ((Od!ojlbYm zKS!P{PY0#rC$s^lX75xDW&pl>)dU(4K0mRm$qFReEJsA$OLAR=wGCIoK;JAUp zsIgdlV1q}ZhKKeT)5sAHV3^`9$750=iDQ#tO;N(BM~QXgo5PUBrFHpElW`THt%o| z)T4Y55*rj`^3vRRr-xj;Gl02Z7Gz9=B%HwWH1eUQXpKMvg(#3J9u{BTQdh(}XvjA6 z99LZFLiN}L7+-<(2?g@agdXB`JPDnwy78aSlj~A8yIcd@h2n_0^zim=Kk6*RPiy(I zsd(SHXsBm*($@(~q-o3V%>%V=s+2zWe- z`gX-xG{KNelyCHnBC1^&Vc8-+SCk`T2lMCmb44N~ zqg!<+AMAJtdXc|IvEjI2WDRgUGoXV*)DR|sKg(YS$IYOto{sff;FsC`ab)=s{tL(~ zv>uGR$ORBvEGL8n+=W9!JW5m_E*O`1k6ehCaBy*Wi18Zord_^cNlH}AorVpqo-)3V z+mU-uU?7$qMpdy2!H&w*>eX*epzC+R2U1#55DS<9=w-;Y%vlAS(~>oeDE0vjz;fs{ zZuP!Ur7{H62W~%=8ftseZgfFPEH~OsQZdwJq#VkMMlT^k(Ze+c>7Sq-pNj&S#Re4# z#w#{kR*|*0YJiNsW>>5ARYgzf7aOPC)hy9 zjSkn>8b-gn3WtqRN1R6sr)2nhZlKFW;eQyy54Ec$x6W@ETAx35NzQ zuqP0NKhe3?p>1~;#x%ac7gc*wwfnd!fS+^Xow_OTRj-psA)gJO0NXJQ-ILUWUGKK2&#^X)B1^D>n)x$S(IpC zLev!xV?<$yt8~dAerT*W!KR4J4n@isA`ivpM(i_tSbAxq?eAZbyYOFizbua zkV(#g^_^hq$W>{KL0`BTVb*C}$_lS(bMn&0@kX^2l9b;y?=Vo(GNrKrQI~i3+ z8DZ3uyH>vK01if3h?>TfIg%0#I3X?$*j4TAz8@fS36?BXs==QY6OBU6vXXo2y)r~RY{^G7DUd9z4F zBgAapX0OTb)lDSSnf}C*&v$y0@L4!vm{*k6uTaD0Z z&K^HvC3v4aEMzToN(wCB)99c%WaztH$Q{_*N;Ix)JUR4G4hWV>J_J}$u6im91yY2x z08x6-i%IAQXAB#M{UfD6;2zzgX{lnSrWQO6Y^41nl`kmV($b|7#qXGbq-yWbc5n4BA>Vem zNHE>nid8&x>1Qej&}Lw ze$HyH`o@5p+@^ObQyXT0*wSUUZ`7nkcBMM?2<0(96ORZ;ciGbmp(zZcyH~EcxK6vR z?ztxDZ04kF7{j@NZ{s*j(S(NQcy=)viE2V99`{f77|s(J?uLK_`dp?-?`S_jyeQ)8 
zrDK)~NJ5hR^hWBRWm#+$BM~%Ju-CU|I&nvXf^6d_57*WXf-p@$3;DiLdgYDwO?44R=%dEx3U?PGaJWMrA+cr)<+wyGsc zzJL+}oN`#o+@~jG_Z<6Z_j8D{YxP=SK}iS#y=ZgDWGl;%rIV-^xM|se7kaG$_^0&_ zCA)~a*!Jt(4{d9ZmY(NiW9y5@S}ras6y6$z6gScUxd#Q7Vc~9QsH>@6tO8p}E>2Ey zU%{?0d(;mXyEFHjA2*2UxX%+$RR6?DHz&dxNJPY=+`(#ce@hC2z=+193kxd^}orpG_I#1i--#gv6oO^sj zb?%??`EXMdU(|vJuhRn@XxFaYCn}uNmeB&b{x=qbksCL&Bfh1zR*Y7}T0!}=CL`hi z>qjOb%X~5(r|7F$eoUm@yRpBV2_U~Arq{u&C3$J(}g z_rK_=GC|P#zjJN6+Y*R?e_nABH37Ze7of=rC6u5Z=O3DpwuInnLk}e4IB?yXwvdF8ZP$v$k%Z{(c zleVZ*A*nx(rDFedp2sI=WbT*)uLzLp5Ec_|H~Rleo_ z7K4U?KdlCo7nAw2WoPw1C3i-}(c3im0oI8!G83lzTI(ms=xc~41>*pBUU_u^?@?+W zI(Y8-FD#lb9C2z%37cK=adwIF7ab53OGUhD=V4oD0$(u(3aio$>f#4fQX3!s)Ya7$ zeuDDp3>PVy96FDO96%~wD^xX)YA*RYYAyy;{yWM5;TR~{D7lp}_Yb{k9GPpg{CppPP^Z?>LpJ2&{cP3q6O-hysS-^@W<6 z^$N_au0H?m`MhiJY2aE_vyQ~ZW?eb?HOtFk@==?(kImET%;x3S8*2rUC5P&0AqoVr zrYFQf95i|B7FWSl%*fZ@i4erUc+Re(?xf?s=c&=pi=IDE8WdZi!^q(9u&~^FAJ<`5K#!@68n z0-?MOlTfiT8o?U33W)_7%1prZg4I@>Z`DIq$Y&j z9!?=(*wLny)6^9!%163chna&*$@m~($6u`k{0vQ(Y!t>=u9|H8w(*U9GfwKsIF>m} z=YbYdL!vjOy(4d^1AB3VL*6bDy)oXVR6l%j(ieCST9#xsLUPzO)}RB6IP zadv%q?hq%UQ>1m0IY@yd&8=U3Y~zxv+W;CQmQ?c1lVs}=*ktOW})qmA^8F0%E|HSD)Jq2GaT|L2U z?bx-n<+MrKDpr3O%Tx?V&!OJ>34W;EpnFu3@^W)K2U0j7x({J06?&OZ^s-!V!uTTK z5Nj`EIq4OgJDrcpT>%_<%WQZ_5(pXKh00-fhL`6h?s@<2-4_@X8CVV3Abl<9PwmX1 z1Sd8HGt?Y9^s}F0v`JXc;lpROQ0ovIlDk3(>2hl?ik~}xpABn(!?^FH#DQXxWJanu zdSCs>*|I6fg5qT+&rh3cVKIsI<(7ZELw(_hF1t7O%kW#^;4p9Lh%}C9v@D{2pu7d- zg0Z>?y7QlbcIe7!O za%=d}I`#b!`KU?kd92#w_RvT3U_eVMe0}_>(iGucD7YVIPfCn~Cr25wJ~_)59TAdT zy6a9ztC^A1WK1!o!vPpCL`DjQhCx7y6}~$WtG+BPbuUcR6tj&LwI=itP1u`|#Lhri zbjMQC0LuL==Xh{Cyd`QdHy*bW$Ebwgs$4btLQ&340S!U}jRU5TU6PqOwIK-tR8J#$ ze^r&MfM>Lu-J{N|W<60w=(*0CiwoB`*2aw6W~s_&Kd_)%1jiz{$}8T>%n+OtcJ__7G6Ns}4r zI`d<+kgOE=VPT7tH|{YC?Lox?7S{Q1Ycw)4KnLyrW9^qHI?5MKo!z%x?aDRU1UX55 z7J_3U81jCu%Qs`OW9I4gD|RF?jwizFbWiq{HiAuoZS#e_qAlPkwlo=0#oUE0?5~^^ z)gv>VNV@|IaB}a;9;D4+so-pn{OhOsB4S|EDY)!a+m=toZ|ozHKH_*<6Dmz@}5 zW@c-<@yWvV?(XEJDImyiu~x69028^|mm+kiw^=bC6HhGI=X2%T_Qi;6g%x^RA74K6 
zB4G;jW~$TA|DH5Ij8;NC^7+W6o0KX2*|!~Y(l+7S_N`7L0pR%M%a;pGMy-h+rZZ-Y zpya;CCwozCJ7xDUQwa-scb?Wp`W#kji(%bPuXPnRfqO{1Fn(=kpJU5&qYm9LGRO%; zk}>Y!EQ>CygO^};NBleV?s`iL3$fY3QM=6h8t0s6$;$`+(w%*y|A>|U`ok|0MNb0> zJAYr}J(SdKa6_lDM;||Y-k%l14*n7HyJbnqFNgS-PoD&JW_Nn(?9u;t1_1)+(SEjI z{*QEoYg8TprW5j~9%l6U(ityPw_0m?FZ0v3dwBP5X-Maf>jro?)M@h~&JK^yMxKh+ z%FA5VEJ*moZr<4j4F4!mWOFXO)lg@~-|h>pLZ>*sP=G?~9)rIa}|bVnu4$ z9_;}iyZw2h69))MqXE3?|_d0Aq&!1D?YRQ~6e_)Z*m^BqNOVK*-L4P1k8^Y3;$_qqD{n4KY%d9YpB_Zo6M; zp8Khss}?A;tJaTNv31LqeW;I+wF{sGc0Kp9#TI1~I@W@#j%<|Dpw^9%5mR@GdvHR0 z4xYj1gC;*|YTz|=N}5M`PCo)uT&J7P;}%EMvxGm3*!&aqs!RZvz=Jy7X6fAspF^F{ zJAcvDzY&zcRILP~1Z>HRwb{u(0h}n$U_KxiW#sVohFXD7D38)JGR9!KWmQN5_hpCt z8K9hvCNaj5g0kYO1v>9-1|eskSdvQYT^jD#R5hM^hw6cXG%0)!Eh}ThT#>}m-y)?N z=X%e7i?YA|?Qzolyzl}82RtmXrAoFdDZaPr+{o zZ1{9TSsHJF8nJ%r0U!FWzPtUC3$D>nBuPhtg39CXEgA-F{-vVrR0Rbe1>=eOv!`*c zp!gX?8H8McK9bNi#lq;WULbhMsVLwErXfwhIxns3AB%^E>9Dvqhfup0%(i464 z-OXO~b&I#wiU9MJR6W)>J@l=l4e5=_=%5onu4U}rk)aDq8NG}Rg(l(A-PYYlv*WZK z#X9+g{%@9*74n6Lx-od%E2yPgs?G^+QAD$wNvVwMxIW{f|5@ zx$(zudCCV(l-93LO3S|aChzCeHtjm3-`dIex;O92e@h~_NwIiSqa=;E>1w6L>h(vD z{CLoL$SA-2CZDRd?D#!;uH%}0V$HN&TIV-bTi6T%!au)kf9!^l zBR>_cGc!|o|Chdd!hkHj=wVfHQ~zVaw_MWz+CF=+&FN4#{U!b_U2>yZ9}F~fAAN34 zX!HBMr)4W0&%d-bM$2ktfn|Os2HzHHO$gHo?3i#Zqxs;{Z(eU&+qCj1YjeHEw$R5S zbkiSGPuY!snylvWZHw2>QgwEX#F$5p zRUR;4$nXK_cc*A>X#L;IRNmjzs^Tb7;7;?*PdEP2EPlt)5<;u010D z+_ajMLGxZdF?IXtFak^^Dfp4<_#7o?W!3b9{f4ZIja>(f^sxFxicL-Mg@taqN7k-Q zN_(7Vson6^8y~i{PMUw^%9vRuCa#Oia+~|Lc=WKN*WW{JCd8P%xbFXFh)>!C zo0rKB9j`a&UH=vo+tKxID;ww7X-#nC+3wxGXSp0sVp zW2Fs&T4f%sC-3>E!S?RPI@deaiRRk>j2dFr@!Sa=C3Uaa@$0^=y}KyMM_aGHp^@cQ zgO1#scR#N3($ zCW&sowykO5z9Ln<>^f_;bDLxR*S?%kn7Xs0+ZrZ$xGBCXym-m?;K7E5F~ev5>DR;X z;|EXk{%-FZ3fCpA`Mxx3<+8I{qivJtYTvm!wPRsc{Hu!SitS$I{W^`W|JZhN;b3G2 z+}4(Hwhtl2h&Nq~E1E(`5vmd-*9wn4RkR4L=^9CqU-;CZd3rZ;agZ`xf`LYaRz~Kz zscv;U8W741>YbZZYuLtw@Zb24I~QvUo0a+rjv^MmYC`e=ip>SIg!}`rJVLwosJQTy z9>l9nHgkbdp=IkW(e2x{lO9R@JJwoDX|@Elp&&+M!{A%#$Y~QWgLcw(>WV96r0a#P 
z-u-?gtH1hY9B`9&Av_@@{j2c}KU8HnE1V{i? z9`o$nAv`mnAT`w~`0cqlLp(=~9#iHqP1`w7r9x@SjJ$_xZZ6r?J9Ng_G(7!KQ1LEm zmFxS-hR96S({)l%Z}d*o2#qaRJsfbGp|XksnC0IW_#@9YLCA^xGAQF{n5Db zi~6C3g>J>z>uMr))kq2+H!fHyBF6q8g|$lWo;-OXh6ya9B1aTo3m^5D^KtjG&A1rugl9KD|AzAii(o9nC%wrFXHz9hJH(6 zeab&yrUEWUTMr_$9d~cJgN=PEzesT6Pu@WuH)cxk8Nt3tz)MK zRcVX`w1uTZvObeINql;7sr5RN{|FH(oY%(1t1r+ugX?k#8d0V?4nW3BFft%w?Hyn4mzPqVxBF6c zm!QN%rz00l!j7pjA!6(uWn;ALVx=88a9YEb>mlrWMZj26U3CGgB&vxcQ+zw%(T}OQ zcf~G>flJ6h1;FDhtUAImEoxYxx^pdbjsHCRXMm>UO0lx5zP^nOf3!>vJ`ob1J}tAd zHH}xC|6=jyp9ca03=ry|>*?{IzKvWlQ5xWRUgheVQ(1{al@^Y5t*z_CpQLT(WIw%f zTS?dSM)MIXsOSyqJAS-fv8AXgcnZ|%U*RFZw^3@}dGMgc&H>H-fP^3bc>!tw?Qu&! znAL*jWv6&~`fiLccr8L5*m1Rgg4Pj{fP7H4fr8-AwZ_W-51Tlbo zA%Ugv(-Bs9S6z>G(4T;WEo#Dk-)(|xL>kMQlMZ%8A)dbNAQ$& z7-%JehAPt)vEh(Z;KzfLtJ=t}!)OFfI@8i}ZAKOcT;lRE2(^XqS)v`(X}$Wvz>2&^ zCx%UmubLL~`i|Sw>=%ET6`Mh0r7 zK?jOFK#oa%vzg7vlB@7|;(mZ-UCjh$0g;RicZJsb69V%Hh+haSd;y}<;eWJWG+2H$ zKpus-`fR?AmUT4N^MT;bSOeN(yhr9Fe!QGnrX>iWU=IvIB}5#GWtHb&9+mH)IF~2>v+8JyEH1mF$D9lxfvNor}g1dRcvEnu!VEo0xvVIyjP>%voVi83At7 z$|t_aVxyJSQwfg4hM;3Ar@Q8dRD1t?`2@?Xa`jN5gwB+l7oh;zr*4BvZ!_hJ3 zWa#g}$lFjG!kG~D??OTuAse9SAw$7Y(=VL& zG1c4j_sa8E^A`U|KCtp?ugRlIBVVup18-RM_Dwe^oO}LCjTO(Ec$t)VNg3z1uJZ&z zW%Gv+p1|wj-~!9DII$B^e`g9bpV;>_7L7RS%^HgM_n=d-2m^$QvkvT4X_(T*Z}%HKPNjBt$3f928<|j{UQDKqXnKDNT--#?0=| zEpV>w!~zmUP`ThnfIaIoXwdYAxyGqc zkC}yYDc1gQKc)5!)1hcCrzCm|HFfQ{0d!_hcU=>JBSi_+}ZeK`58gXTLth>oNr{4e}FFzM&WoG7sy$TO3%nJ+&hnNujEY?+|qaxLc z^?uGcuBlX6;vONIVm5w(&fRSbJU3p7*b8{qi_-#K6idE5XR;wn<~`CYDHqv(VDggH zO4O6U?a*UxgoA>(nPB2KW*O)Os!ngO5r@?yQM35kT zv@v^`nY>lZK}3>MTcVnmHVX-)XjS-h^j>}7RlwJ|bjJ|+OSpL#$JjHiDoG87&yZVd zW~fm~Ma52oke3|hVN6HmyHwOz^gl%YzKGkAj-}%j+i9)Uhiu+E&V6pV>x!xn2P6`S z67R*UJt4kO#JDrl9BCM1j5>QR9?Br|Wh^;y?A*8yM4SHXS!W!@qFDtbRoJ#`S59%| ziiSMSG1=x^mK?Gb!xuI;Sopr8B2(ByxS*^oRTepYBWo|6IY!*CU%6{&hC-sZ8pDf%48+*0kf`ban62Jl&XX^Z&_ek@)&f#^bQ0#VfmZ{W#Fx@f_R; zEq|?j+}h;)Dp!|Pn}p&-%}Cu3cUtGD9?$uL3|X!ck+!ih*fvQe2bN*BzSd#S|5Sor0rc}0nRSM|PC@ah%AH0mcK#b2{0A4K 
zBt5{>BVupZQ{-Km-9x$`n&79zPn-zxgE5neoN-318}xJ;lBGwtxr^3+7~wwuM%>9s zNZCa7@dIrTl4nU6g?=n&mJdO}?R4CiaA^R2bdHIidoNN&+WQ%vJ>7ALT8 zT}uk8^;orJP%nZnY~Lgw+-CJt*q~k7BdeDGQ)Mg5H$!w7Q`w`#mfNu?DnyU^AXxVp zVun(+udE00XtQV#EIc60?r!K_z55(-%NcS9Y)y=+QpwdEz0#9I7dGLoiV20lA0z-O zX&m5Etcuo=eqj;i2JD_ngIx-`+pOR4eNm;|FK} z&tnurYOWw3h9y_wnJqa8aQ^z3At0j}n{f62o|0k) zzHZctXI?BsF7xMi`$CMg;yvPb8-~|f!IW&UsXyP2U{#0+5)``H*(yUbxhwyyMcUDD z%>_LZLm~Rf${OOlW2Ro2L}T`Vxu>*;BPSI7KUD-zcE!iB*8nL*>B)N$qHp|htSMiS z3;O60)b7uH>uiiGeCxk&Ft_A@fAj_7?INbwJQ^A9jR{u_N!Zf@W57g>~$uF}$q zJQOONcm4}{=I;J4uaWm8&Y|XJF&?~4V%E?hL&VyHU+l)0&}}UmAC;sy@Tpyf578i- zzgDckZtTe0Mg-Mi9p-4^`Jjot_@WodD=3Xx`n*}nh{;x(sY&0DPD+w}mQ>Q^jJo=y znzDOyCS`cFy}y&b{JfG8XSFC(`0;>Cms;nZNCka@-6`bzZ$?5zL-*f=Tw<`D@(;8v z$$YtQN#D-hmcxn)%TbHU&ST_O!>{c9<0EIfdg8stJ`!*s9HO78c?VxR8DLLvE?kf@ zN^;Ip$md(2IY|mrxn5R^x8yr6ooW%s0!W<*-~nF|YG_z3Aq!^P(}E3Q@Wew-WU^_V zb-Tb4_<6R?MdVfB@`g~ z8ot%84wbWRiEEbzVvz184V;xq%nW_pC9QAYz7k7z1r!+XJFrV)+V!>@p8xh96t(zC ztmftWN=8w5QaJiPqjV7xF}5!J;CN)PXTb)4#^oXbLtF^bCe)aMmm?hlb66*g6bQRu z5aGIa$g15rYkn_t)2Fd?L0#uME9su5fT1cE`NgPeH~>F={3sMcqU|IwQ#?J^mxq^E z;`r;&u+OktPIJa=Y_7a74G$IdcT@iNev~N_P6iQ*k(jBoHevww%HdasQqX9dt6>gO@y3*x(*GsBR%bgw0C;= zo60Zuh&A(59n`e;e%`DM^~A`AWG+0|_IY5e9JFYyUNtopamyW`au-+u1)ezRNmtMW zrTyPb9Wk+pJu0L_aJ|V3wU(mu6J17xUN?`1Lo&t1qrSInpS{0L(yyId^8aR?(w#uT zBSj%7WCvd!q0o2Qf4co^%Oql`lfeESKYrXeGKna~;Wsfe(cy{)Z^Lnq4dBXPnD4in znVLR*v>rj7ju@z!R)6eVSd?SD!{*^7G0Bxu?$%kKb&YMz&>$0P|sjXCg5ccDqGYMbV&v` zw_*V8_4mi4?Ly9No9#0}09j-mfDvgdFB<6U8QA#g>sLeO^S-y1)6`Ty8Mfl>_>i8t zzt19_M$}8!Jr9{bsW$C@ri-?7@1Q&3r%&I@PCM&~hK^xZ0=oWI?=@xpTj584`#inT zS5HAzRW)^Q^sw%^v!?%Vaxs1Y1n@1LC#hKUlY4I0?E=-wY0^|V*@X?~#EFs#?PA-O z^>H2?Htp}zq}1mpFa4%}|BM+At7Ptuj-AB-OYzj*MmRti;{Xwt62cZv)tm349;H)m zeQa%gRhQr0)5~siljgKlYlHoa`!zqNexlxb&0U+_oF09H(+y{3&$sgbkUv_T@p=uK z9s1I>*;?H?+xS696BLzT#{4bWg*x=$q z#-Va|=(ls#$SH2W-cJCoV>)#U7cQ(Y6tk z&Hp0GWDx<1Bz`J1WHMc+y`e6YWWFU2ib#!qM)}Rt>C6#La0>pO$iIxNu1)V5XDYEu zbqx(07evoQhe@Cr^VZ`peK1@tQWiiAZ%=sLk?>y8KZ!I?{6{oU1>8gQNk1EdQ3-JU 
zvP3>&_?6<=0QBV~=Ux|BM0WP?4ar_4E1cyBhNwjnJDcDI^~wC6Uwo> zckliuv%LbRM;Wv@E=db{sKCegf)xa{i!aDc&leN|I}IqvZi>aPZ{i*|AQ)%2zR_`t zaZxf+X?imLg=&rW1n@7CkoZ)nxbED&`zH9CDtfdtXKGdl#iag|kS>5T{MYXTMM$j} zMGQrWS@j;7Q7{WokZAJy*sZE{cz?0;GvqRK#w3hi&jrbOfBh@To1#DS`1o=LKd|6U zI@_BVszR&)UtX}z_yk}USmDlS5Na!CDU*&<1PsrmF%q%Rjfn5h(;r_^8|Z12M-fgq z^$}7>C{os}pQOX?g_-GXcc_~gs0xSH^-p&O|BSJQgb-UAyoSL&d+~bgz_#}p$DtP3 z4YuGf)4w@`ec;M;BVl;k3wma(nOB>Sj}MqZ&&uMPaa@U8(DiMduDJjWS-1e)Sn>hk zaPx`)A(mh(m5_)6=Quq%ZTnR4;QZCfzWw*yVOqQ_=O9ry=WA@H>>36J#YuRQGI2af z#C_#;?=fUj43dXlWmZO)^*xS~pZT4>Ta4UUvu4fDrRk(H@hhkaxmy2tIRtBO=?DC$ ziCm8W&bAlsyIX(`Ab+L^MK%TgcQm=**=OHrFy-H(T`;e!opvnsi}Lx$VHI zzitfueDTIjk>x{5DDP;nU5ZL&Hg;RkR~!mV%Z+;2|2YP5krXAw@t_4DuufEQ0)D0; z8hq_-nX-`-D4>Ad0|ReUP!b3fHnsD2p3V=d9;PJL)6Coke?eO$+@c)6z3qMtK7QXB zE86~H@B5Peq@}gq#IDakjSc~Z(aEojDU(J2f|+n*1wMY1VzK!kTulhXSu^k1r-Jj_7<1`Z^cifzabK!9AnA8(3{sSt8qtZ zPR>r&wFrHn1v!@toPVEKr!H5QnDGZ2ugB3=p@iRm4jV(w2{ zh4dP*wy5)qJkei?C@&~;I+fKakCOV>u59!Kq82e~K%yuVMM%C*O*{uGyBFbYnEW?R z6pa2*cV&j-A1&0l&Np-zzbNs>?@LKkoMNWLW-bary#a=f4k}>UF!qZu`@8rSuxL}& z6#tJ9?PHMW_iL-h_*Z|+r?h+=U2svNCadC$S$6=>9p+N*Gh4Dp+nqYQOYuyXtnNRJ zC8byGke(5!IPCn{(O3cuy7$xBBh_7L__=TP6DxFT6eJdQS5^v?KG}ZFbH?)JDdG@rx~Gy^0&#gph|#AubB;+9Vs%d{JX8ea_LWCzO=P53&j#MHow+FEzJ4 znK$k4FRR*?0@^-N7*l6*D2u5Wzp#>RCE5TL_QqU1J zqp|wo_a0QLwN#Lrv3>hrtx5KW5A8XpbiRAoTTm&)MN;XD0*qsFUA0BP0OqKGBS#+P zDco{8KzjiskGwiHH=u?+^O*b&VUzyLxIS2; zB2xc3oF z63kx};W0#EP<-BE1llJJxd2+vu-@S>h8sPxYw;i=9v;~;re#oQw=p zVrn83%3KgPzr*Zr;lUOog)vQN4D2l|siGj@6bSKA_a?8(oFhZ!%mz#Ov`e)|rs>~# z-X~3|!@1bp*tzRIpS!ob{fkv?X7w6{hK8kXJrZ8no}-u-GnPfn2QA2W2xwmYJ~*%4 z5>uBxK0AXFO&9}NEaG46Z$vM92t zDXf+Ylt%!GJSl=nfQC2>(XNp@T=ODWQ?7ScLQU?mzlFxiv@Vr+5tvJx>fWl|Eg}R# zDt0$#9{mDR5LCOhR}3n+W&~0gTl0Wx7jA@8Nr`Hl3ht0$bo|HyS8Xx}^vM%99_mjf z(0@uemnq@9It`rITRzY(kVORgIvG1GPz+ZwpaXjXB1R0w;2bJ`w;nFS0=$3m>CK_t zU|Vj%hpbrfrswv}dLAWVS$qL>HT`LO`1M_PcQiT<%>H)0&U&3HyXa%=9Pi)qy4?k0 z&K_Cw;`sZGPS-S5ny~x-fI?@w^ZY7 zS)=OuB|tHUhE93*C5x>BB-cP)cyrw1{Nm!&Mc=ATjU;on8SRRmo!{iPyK9Pj(7HPp 
zgOX01=-f+o$H87(|6E;Hba=>4c&z7kW5#G~PCHy=+PUSr)7jOF`|!Z99FYtAgZ4!{ zN=gV9WMs;_M4uMJ-5~%Hk#ZvRSG@;noXs7-{E%V!jiKaHY~4Cc!+i3~^pzx4*2%eQ ze|#`W!>OzEs5u|1R4>+?w`f>ZVW}|t?(3dIGw?|C*x&?0Bj8VfM6`}1C7g2Kc5t;! zw*9;ZmhSI(Vr*Cmy%hDoi0QOR_b0E`kt<2P9t>82s54On#EmHi_7k%;3gAJ#0 zt1Pa@RTquMBqQ*Ta6Ci#WG!D<70~W!yFq6tP494)F{~w%?L=dUFazpj0f-*_$ z7?*mz+=k%cQ9J-DcK6Z*KRi#my1$N{(#t-UJ9Dx}r)No>SFCtjF8Tb5jG24VBFC;1 zbQVpfEKJ$@Z*PUZ`KL~=z25Q6R>kMteKzk8J=wWi)WWPQ`<(9HxK(ngzryy)1AT{X zi%!u{NMCfnzVO2!ja&OUN2(_d#{Q?Z{;fG?1!HKUt3GWi^u`xIt~?RmCo;0`dFB0i zC9b!}uC#f!!5JU9!7fM=(0xsgEKE*oO$!NE&mmvX3YB(JCMur!HIcO z-L%$00gIlCPqZ8}gtGig5t6v*a(5Ukaw&~yD}}C$nWZEa(GlKbG)*}ZT18|k@)mxK zT-IIPHZmmrXh^p{gRB4EU@eGIwa_d@r~mD5Vp6E$Fll(}I_&Ms`TC;57I8t`y1=yX z;;$iv4dO~Z;foNFta6jq6mgT_DT{_F4$@f@U3?FQ|1JJ+-v=E8c|m{ogm@ogJ%vzz ze@+OFJVhEz?lb^6Y$OuY#R87tDrse(Twu}CG-UOsWsXEj1D3HF`qfvbiQ z6e2{Nj1!WgX_`gO4HVF_<;(Bj%n%bnd64HQ4Pu)DkoY0SZ0R|4J2vnhi(|MN(0Gzq zBBph52rftIfdKRqge!AV?@*R5yzFY>;_$QE0i~RGB`KRU4=G4EZ0}=Z;~4zqPUw@b zA8M<-^Djghj#w^paK+*olRck3`&i~6HOqfws`Gu#ZGWqd{B576Oft?Om7u|`Z^Ay^ z{M+T_%MLFxRF5jj_+)n)mptW#H76#UYd1lm0&LKe-aSRF&z04?$FI1(%Kg%E4G%@> z75cGpz4QW~$m!WG$~`zPwC9KWvNWZ@JUBs6xiH zdoXhg0<8G#jLwvxvzITAZA!73p)EIe|I1x$AttxL2@=;TMhS9`3O^~#23`6)+Dn>c z<`wkq)$5>pZC2$1g=g@pqKneP32k*sF!ioa&r1r-9MzqAZ3^f_AXk;>W2~$#V=O=M zu*r88st^83JtXZ?+5>XHTY19O-jU4#1AA+1T=h;OWD2hz=2S?uOB3?xPDzn;CINK* zq9um0bzFAWQCNsPd$IgDpg7ZDcZO+b^y5sXS%(^)Cp^LU-l%V^eHKASPqx?NcZrM#$P6CslZ$p!Fb)yM62(%BJ>>%CV5NFaKuR+9WtAk4iy%N^CXA>F zOx-@Pl5f0Q0_K-Tbe5RsNgXB>S=62dH-1S-XUlKhkB8RncvA zyPlKo=qVf3-5<4~DD*@j0j_xuFHTWiFfn%J0_CzrK_tsEg7(hY8H&E&xBuOpd_(He zaqoheLYyI|y^rm!we;4GE!XCuS@T=IJmbL3_|lYh!=EOOx^v=G&k*&T!$Wf`tVi#8 zRdM}Cl}S-$<(}~hQk~kr&L|B2rap7SO$2O!K&ThHGc+WY%O0!?tMUV|u?`kfsxd#F znCzOGWEpPtJW~LnU#8Q>Z zDdtb5&s#@5dui`m2$U{Pbh=qchMobT*@X}s2uMs_%1ED2^EI8Fm5E0IMTNHOQ~uo_*pRruFZ#z_ zg~#YPoi=((^=&Fmtot?l@E)`9Jyx~vdQ5QNH|R}-p5NzB+yCZtQwV(UZlKUy1HhJT(~)vrtqJ$>5L z+`J!K022BH(URjI>CU1|m4ALa)Ufl68Fg88b&8#=yL35vvitVm$~UH@uRJLCp-Zoe 
zaoLZ;x@~yB?@XTmP`66kEe@|T#|13~P!c#B2d5tx$e3=lju9YER2N=_Zva|o$YUvt zy(=?-EUPCK#8-9G8CrBkB4F^rPMf^FGY6;*%(j07yt^5Jw@49weWjOHjN`IpGcd%r zWd7Oc_fBT6$B8zCA}PcmlSrAarMkNC9)h;&RF}Z3S2xFhyJ=dQ=cgj~% zYwj=L5(A(S?jhh}Ft+KD=2XZ(kmCWlJ5U+Ee*Ib)m@x?1t`0AC)H8Dt}{EGo;I%c^7!}D=*wpX@fXgpoLI;SjSmDBsmL4Akx z`)lJ@o+A2SIqlIC%Aau9Eu5Yzec-^w8i(E8YNoC`c5<|P`0wBOKG7CMMcdnx&Ntrp zG$Enbb+n4g%;d7L+p`X?FjO{jWS(#L2uW9O!MY+drgi<$%p zW6io>-#x?#Q%v~)rUZiwru^aX+2F8yOFyNNJx7(6Sm~OUx)1MeZjXFtHj9y=B?p@> z6j*OjGIC_)ia;xIB}2#4s9LT@))BUFQP1dvjUS%0>nlti)?1T$@v7Iw z^>2qwJK@|MPRA((afHy(o5y;eCWV5=L(zP76Q$v4);1R|ynKD%N;KMZeauEP!Hi?F8;mgH2c`atYy3V?SEh5 z|3c>(BTM>3Oda~Ot$(M8{Jp=c)_F=@%7m0LU2G8QZ{l<9e%J z$BGqoHwH~I`^Ty7m7nn!@0$JJCU*~OB2Ixmyl0Y)3FR14WfK33l`473lf9dVKsG$- z9QN^Xqu&8<$Ee8?ew`kM_un|@((<~iM^CBTkce5fM!nOtlGyzn;N=w`e;GEazNKQB znprO~6NZqDH`ECl_p`jw@=#37-!I!``(HJAsc@bB(kB9?HDm1TiFOe>tTivT%rT6; z7FpIPs^GQPKN$v@kG^>EZCl;lC!e3)ODG<#*w%IT_nXnLk_?$o9A)VK#Z@yg(x%sl z#>c}gMm@Qqw9z2dnL_xRkT>Xr|A6|Ctu4DhXPp5+qI_r+)rijs-+@ zNsJlxifznrMe(qSNs2=9np_rb84nPfL*;t;2yitq3nC0fB=}LNzCx9DJiOF>pHj}B z-?{24>e=Db6x1DL4>#@XtZ0|2O~DN^L(xh1uZE#RsD*{n51cu`ZPd@O#tQV&nB z+LtpvP5s)Iz0KkE>wVtqs;<#buY3RJM>j@$`siI!+TU_4pwE{UiD`dEUzxVW=JM_Y zyTLZe!EY<9m{FZo&K-ybRb$K~Hh>@~1UE2!@`bLuBdt9&jie^t511jHZC@^X<%;s^ zqff0RHI5wdr`R=h`}yJuuB+T-5AN@@%X*ppt$AB^WmUduYm|ZFrVXphB@TK1djYmf zh%YHh;hIyfT#4QO_m4>0rHWs#GZcNt`~>@VkB3Y4uu{o?U|~G+^HA{NQS`K{R zW*au_M2v52V-tL#VGGLX{La<6bv7%;Pg$)0N^R`L*&_~K>fHMIwn=l#O#}V5kIO!3 z4V++n?u3ljm-Cx)&?EU)oS27xt*+cBd(*N%f3nuOuU>U~?5(%OiXRN)PPkU<4krA@ z#X%)hMcD9LuDAB3trtpZ1mb8z_7J4P`S7KwqV;0MEL8SF42#GX;a$F{`k+Ew&YUXJ zTdI8^B4Kw|JBQ_NW0vnCY5|-p$Dxn73f3 z9d-NA+`PSt9X3t*+AMQ8@2*0~+KmJEjJFKFzcj|OyG6htt3OWntQ9a(Ae-9;qW80iI+Jk7bVCDO zMHFY}WSMEcE~8bXB;M`oeoX4sn{JI|dHW<4>(1NE3Q?b_R#Le{`@q_#bIt{=t{m+? 
zQQc0ePX5Z3x_m-@qHNr5b+G8>`73ji$JJ{G+q1&J9$o>9JVd&-_8H{pdJV&mI4$|#9G2^AZrG&N`1F^sJy!6+VIo2Q%9Qi>*O}e zzvq`+Tm#_g+Br*|%0t=4Ils?ENZ@_mxt&6-=s2gvQe zpxNtSx8z-R>i#0YSb(Uo1B{FOZ>S(z8gNF`8~!_X>`*z@wRc_Zb)WR_BdQV)LpX>a zaU`RU!MxmyfK@wp`%z+GAU(wfO(#Yk!1X`q{Rn!ghWSgT6#Bsf7_^e zJ>PuI{>JwX4L$FqB{zJ;tn0dKF1n><40xyI<`z|-_vKw~ihI@1=C*mX(Rk(7?;c#ll>ba;5M%IwB|NDZ(A^7xH=9|^xA?Z9H&TQXpRby>4V&gL`l+G* zzAv}$&5_)y>a=+bAeAt$aBGXXD56I~PbPmYGuYpMo!LPX6>Yg=sh{sEI5xb$qpmu9 zxT2~-^L0fldu=(%+Y4{nZ=Keuv)@^B51V!lpsH*q|2Zc*EaP3-g0xDa1ZfpgueYrT3-W$hj==*Je?OWe*gmJ)p(M`(|EhAiCn=?FWQBX zDl25xRFa6(FD!K2dwI5mW<=PE_O)vaO3&`RXM3-(|HBP#U*0PZdG^+|_CknEOxpV2 z0qv2M6D}aVLv0M5JA<4#Vc+)m_rjPa;%nj5L{t(mULlDSf**>8^WpL86t zrikn%hD%_IV=pVU-XoC24Zg)H!P0w!^0;~W6(uDG>$SBkCwp)IH9WZO?Qr*r?@Rj^ z=Q}#7jhIz*SlxD*eCDLV9>(=;>udJ*@$>R&9Wf(*Y{l-TjfVyf@0}cPW$?Q7>YIX% z_JNDZjU1Ra$3CL#_ErMYlb3AqAg|7Y~_j_R{@cL*{b{l+X*aT{9K4GwCHZoMmUeCGA>r4ys^dQFCw{j$nxcmt8V^PkIJa`n40K*Hxd>bS$z)D!~hf@Mnx-tj|TSc z%H?5@?bDXv8+IaL4i*HfSW}TA2js&p{(+9fmAS1jI{>L7>)8ULO{Y@roVwRysyN72 zktO%EkBYud_+dC<50Fc&IDF+iy$ThXojK(5?|VAk!J*hedrEurBTdRcq>4bUB0-Ka z<)(4@te$=vPY)irJ!M8@zrV68d)y18SQM-Ta^aDZcN`as0bHOA1lTa&^zVoPUk)6W zoe*+RcK?@}C-2q6v?U8}R99zsOUd+%{AZQ69Aj$MZbqaqaQt%9_`3_@Z|h|_`KPSB zxI}VHyH`W^ZdXVP66#zo6~tZZJQMQoDjaMfWux@-S@V4hpQ$`1=8BRijn*_VQv%(wH&vd-8O^>5x0RG71ZZu$~*&5x}ed zVtSURfvHJNVODhceG&D$?x>fEbF(C1E=mC5lNSIibBCaLr~djl3vU7aOf7)|=Rke0 zFOcnV#S`D#LLvf%R}65L-+T(g2;#x}1qF3TR+E|vy%v1&k*xCa%93ApBtWL%NYbMH z{#kKxn{v*@_Kd!e8GG*gmhGcDzP_&e>CpMb3nt%c|F&R>gRX>x?bWp7)n*9QK3LI_KtmsM+1sPNxFtTLmuMC!@`Cz;W!itc4bxOAGCnRNE0&z*%yi^^nX zn4lW`&#C|mT zFYNa5X2Vx^>e})CD{6!v>o|V-zUq6Rwj)^6to4$VWlr0E7@QlG;bv8_AiZu(M&2S@ z+j}tfc_<}>j-O4c2-^U#?bTfQFZTqzpiuXDpU&4D*jr5WVt{qf9FyySPcXv@!XYDY zR&aId+E7GxpKte{7r64Ydt3Imzhu!VLswkM?~~)~+`JRxZYq)KQicB~AwtbFAwX_| z==8-GKo!z+?b?=Z#8mC;j;6`@Tu0(bkD+vd?##=JP8;`VsOG@idF@ZfM5evCk$tj< zdAz>%`X<}r#N-Q#9Wq7?qy5O<(#vC4dHL}NSuV|oVSU+U;P%0`BtzcBfc%i&8o$^w zM2mtA6d_r%@2<{PGMdT6WCSRHQ%DjJp!7#|ibS;e`@blHtFLOVu#8>8!6F8_Sq(DK z1UD3c;KGe0W3j 
zR+d7x5ShFOvQO81as4t!?E@`24%_!{oLhB`)3Dcz8Ie0SH0u9Y>hW{+=KqB;f@f1O z{DrPW?nFy0OoRlI&kemK33@aSUefV&oY~H}xNhn0rH=gm_3yNH%<|WN^z!a8&v_&N zQvD+V9Kalx&Tm%A+_?Gs>tGNHCkh?0tB55RQawkjRgIZ`h+S3M~hVh|rP zw-Dz0EG4%Zn#$a5k&<_`*NWkL-( zuoA#rp94Dw>+G^)AM&Itadq; zViFWy{r%JOmtK7&#~5;+C#g=eK~4SHQV7Z$fHWIW0g*5gR6ZMm8RBsno;9j@4_s6w z*N{w*M7q-WZ+c1!?>htqYKV?{bn!mJ=p0w=Q}53mme0=U`T!j_xaw%(hAj86k&c+8aLVT9jq68x~w#0>X50Pm(3Y443x*{z_bl-cz0FyJwth z-}cGx`~!*J(p`V-lg!Y$0LuXDxL@#fEZ_bxoQBc0@E1$2t&cBwctQ46%edvX$@Vg1 z9_63bSC3o`1+(i$&4JN1-1nZN=CjX8Q>19}Nsbe4D7=5aP&| z74A)0^>X^~ffbRn0JeSSm~E{8kf9LKvn5Y}X{GKhFU-c2^%-&&=l?GFqgJqE-%aecwLqAD&vAzlL6`yw-BqaGFvbN_9 z>FAPo#^+`(T(l^vtkFdG%-Y$EmVo0)lFmy?16D?K8-4Xz3GCV>#!*!JG^#Nu^gdBf zZsjS%#Rv*CZh57pLEuy$8O|^s@yOko_uM(zy!&j_b!Cx#XGz2_NUmPpxEs}pam4Ee=Tck%EF3W^?ffLIg&E|fbZ<_<2@d6W3K8jAS5fUQ{#Fap%%vr^$>l5 z8>F+ykrX}?2W(nwD3jG=29DaY)E~TZ}62MD;l1qT(mI4?rF zYp{)K)ESv&{!=FJd+TCx?EJx`HMDRHV(Uhyqq^Qu3j!ZO;zt6*qwwk6yVqgZm@)4T z?;39$^Z8;;9;9tCe`I!b4-Q2!TZOkqoebjKk@kq|4^@{4FBWrXAjx>4)1F;=(18`? z#CnnODdOK3Gv>hN#Gy`*(#XXzm+f0)^zO>^?81vhn)G((;R(tHx`hWGocXfeVvR-C zmA4Nvw_df~=5&8|v8#}$i@Fk1Z}*G?mZ!JevR$};>8s|dsHqt`HXc|ni-~Ooaa3tK z$)Uw*BYd`)H{l0~h-Rpxk(rtL{FP@fZvk{ajl#hC=Wpr$BRXl5V!kD=U%$FE{IcXG zh-mX6tD*X&zKlOn&Y>qeXHX!3P7)H zuv|YdO4^vUZe!&c#!W^Ym*qcuB+oGC4-Qv&s1O4Vn>tlK$>tzraM#^C(THb#J*Yi? ze4aR!Bdw}C96?@+#Ybei$GWVxRYVtt?Wb0S3fn?r$>3K9$LR;y zr}l$mI$ykC^wM$-i^NHu&iB{9q%;s?G{lG^yzMGqo==@4-Sb$dY_GU}Cld~*4rO;? zy;apti6AM2m>Oz?>!{2+a4khz#?P5I?+s%UMG_6NO{;i2*0y{4wWsZZd?oulK56Y! 
z_fPi7a*zMsQh3lT4!^YwQh5#U=1R*^rSc=Is}+-eyjZ*J$vNXexxQkM7tfD2{#*JP zmU{*rb5APV#ep`neCgYD{S`v8^4DKoQwKEUB8zyFDF>K80L6Cj&8@~be z|3z%3JLcc-7uU?(`MN!ML=y2abii%+d z28w->HmsDaAJ?Z>uT4K(Gz*`nEc5FLJ|bG#L@7nK9^^;`N@pb1$%AO=-ok_{l=xe5d;8 z##G<-jUwL;-6@X`-&zRuB9;h$LiG6lKvs{lcg5T(b^lbks>E<~T%z^5w~yO4+RMw& z9PrX`@#4f%EhGi#Ak35C#ZOHd@WJ@?pG23tykc&jkU6vLNyXw&78!W-E4w+1g$<48 zf^O}9q5|{!=Yy;E<#ZT6JZkp4K2LhO#AddvgXjgi%EhRS;WEv#bNuqirCWs zxNqR-kUU{o73MF%vryg5S|l@&gGP_D%QF5oR3#?Z9Y|yuzO#F={Vl%PIEw`=qJPB} z{@{cj8<~=**o8Nl9^>fAldkW3&6jt?K_s1ZlKY)URzyDj>*w=B+jAK@&UV@MZhYZ~KjDBV-?!8Ff* zzJBOGQ^C8?(Pp&1&d1IV1G7{?r{2C!-ui>#QmV&3&p&M-at!&LBG@=_L2?7%Mm#uR ziQ-I;S1MLkcEuF0SJu`%WOAoIT^GJ}o1nL)W4u;SCz&{B^Ujv8UcK7KEW^FD;A!qV zR;3u_O%U7s%)C<}qxLM^wc8-&akYJSHJKIb7cJ}&89F#d8tiVTpPxaH>&@0->pUa} z=>%zav{6f?1E{IFd0Pusm9IVnq_hKTPf))rptVH|J0Fa#ZM?5({)Dtk%kE2^oObh9 z{4JmC2*zh%2-w3yg6{3gzn1j0qR!F%nRKZ1kr|yV;{&G*%3`~U*Fc>#!bqyRx;oan zE7vYAO)*;U)0kNu=3QhU)Sv8-`nnsX0~ z9C5r7iPlAhx_eP~5Nf?Dp$B5rH&DA`o`TTOgU&F(Vpq4|PMiK`R`~e%;iz#E4Mmb2 zBO_-Sd|2UOv>Vl1 zot0ch-F^?qYN4|IFBv1U_c-;&@pAGIEODKLysaN&<;w{(WB;wINbE@aQ5N-I|B<65-V2@AGu&j={fVjRXauQ3Q8(QQ_TLOCC&Z>W2yo6+ zSyg8$?Y9mM`)ASG$jmjgiAQ^Q-gOK0-vQYQOB7p!=Zps6E)&cNp2vNuaI)6?SJhVA zct1BcbZxrdPOmdtYZjek)y>|=NdHUCwk)4>$(80l1?=F zW%p%a)eXDJiQOa)j13t~Rmo6nQIJRYypNYUg?+OF!@y_m7oVE7b`IcAqS9vbz}K&* z|Nc31M6_;?<#4X`AlqVsQVi+r->Rm~Z>Y6c*@}fG9^pPWl~^Bxoza_ z^7s?$^jiY6Up90No2Yk4b);1|`lC(NCJw3YX3TfM{3*f!w|=Ygj!9TuV7(o2qpDi= z;$;6mgY_#6&P>(Vkg#c@gq*c!7C9K=U=pLs#k4chHiP=TJ`pdk+k0{S3@7aaP3kM& zWUVZ#EbsO&biF8YP;8cP#oe_(mYrawj!L>XWlfb1=h-Iy92RpSxWRIB9Fd_-XyEhf zy2H;~%8}NvlyVL>{MfLRH!o^Ncb?>43gtJTft^_xtc)iXXQZbKk z$M9KuQ=ly=WD^eMxv%UsPuJs3n_=`Z5qpCUGd$-OVn9N9_>{bpnlam(!T+G7 z#6Ul+lBOrtXqEbVPPtIpQB}1#tkI0?{ng8t=3^9lXlY4^xfaivtw#d)R3`=qod`k?cm` z`{4482mt7MGIHms=-KJZul0xEqu>GOou!@@yxAGVX&J0Z{f)Q)!|pvK=f=+tklK*Q zR*SJ>uBz~p5Wu^}&0BFq>sg@3g9B>oLkxNi?l=Bo(&CzHA6V;y)?cztIaD;_R>h@$ z3gd2cbGiRhYhdpoI_vid+c}U#;l%9|;+FJHWN&dED~}(4$@B&PQv0kHVvAwd|NWg5D$r9Us54^ 
zw)g4U8P{!W_3}vpBaP$(;-18_Of#zQl103J9?7ps2fm7n-DN|n6*S9^E-t_8K z&eDGMgK~1BHaM8}u6(L<0$;B%+kmvIwtv2F-5D1U%@oUpLGRoFZ6=j@5_vXP7eqz9 z{8r<4#c<*C?u~6egfq_K4*?S?&W+6R86-{y5Xx5{lBgpHTBi! zgXd|@51}+=N9JNB;xt9j`>|;JBcV(s6iDQ7P>V1_0|f65GauiPV56rnj`WeigTp*a zog3RyJY&p5Pft_0kfhq4aNl|W6F32aMd~akN0ep!`T2>m#6rjm7Z9xgcWNsRmPqr_ zIJB1|6L^{!`=cJ_UT_4|TF_;K`LckIwfL*D z0G?aq{Sdk{lJpPSTPAMa?=|lF4sxh@|Dq1(fBOU&I;G7ge#A2JRiIL)XO|nlEcYEK zP2T?P5&e3KTggZf_`ugiyc)c!by}xNtMj*)YOLe(5Hr}Ae{~zRyf7qlyDg`<#d}=y z)e8ZMV0Q?zOK~Zb4OX`^W-_6`=>a@R@SghnixqmIJWj40(T_lC{mU?vt9KTduWJvD zVr_V#ups{0p`y&LL2@~$gYclyGKyeF*a%z7RxDc-?cyJz_mdG5eyOwh6hN47B90(r zoDM=&AoHX0royjZOPg#f76y$`;5CrlAx*r*Hs)%vf({bn%}86}(Gib?4LqLK3~<^X z7zES^6mr4W$Ax|@DS8tzkQE9c5M=u^$<@OCF`i(I3-#k3jxZr&kRDbT^#mRyDg)9q zo+UdRu}>BpF|e~pgckBymKnYq$u|dEHz;Vx%0?xfoY}Ge$XiFI5Lt+H^R zJW`A+JOy)f0l)~i*_=#vn>X)~*4piDDi;6>=2}9@ zh{@r=5*9dQcp!*0XETooi~{eY7!VBE%0~_&_{I}?g}+vWUDm~es$+r_N7r0Y z&=oUx&GEgtc!*(Qsg#^t!T$j-L_~&6*{SOoxOn|LQ74m11EKm6V)#{a^O^(p;ESn9 zpJ~JMnRh6@qRF{fF%5~g7OGeR+6lx|i*22?E}SG#qu*9;R;N-HIjTT}ft$DLgwIjP zEM8y_;tNGn3qXFitSlof`D)RKhnx$9=W-lFAaNDVIj+wYBS>=%jmKYx(wj$vxe5_H zKFZ3HPeim9mfjdP+4f6QYinYYs7bA=ArO^)mbCx)LH52r zP2mHR#z{5?LZL!GN{(tM_EKW$?J8y+k6#Kbox>1ftoTSY5c_lk{uJ0ZF2uRU*Fom^ zCYLKKmbwgA?DVs3cb)8u;9$k^cXD#XfJQgUWKM2-7YD6S_d7?2&)QVCf0XH%CJ{v& z7Z=uaRQN2?SFTXmLY(1Uq<{QpAJ~oIPGiUJnKw;l{Dt6P?`m~yAW)5mVf}^OnZs1E z+Zy>91rzd4fgU0tLX9B7aiGJ^tsMJ2>B5D96WpainWl=#D(dPNZ@)R{Czu@3j62;q zR9g_Fs*4+&mw!AY1QTKCyI#&jOj8aJsTFfrRZlOR`i9VjSr*6ojT$X)LqE;1;}zU@ zZ0CWICGP}LE)E8r@QX8#eBcGsXVF)Sq4Pqi&|6-2AYOV9O-7|_$Fw~E55)B<+oH>q zJ+^-kNXmgAIm)S%x^<{UC`Let7O=K(jm|_5a5#m^}FsYL`?3jz% zQ!f^(6*vRiNQ!H*qL`HlyGsF;kJTNj#ad;4kHz_9Ts?i0sbnOMVT`jOn};0#XNq^v zyzGH~o%Z(G-SD8Kd*8ldWOTygl_Of0qQ`OMgSp6#_ca&AOhbbqv(E%wO!wLeKMJyy zQKOh%l!y{0J$*y`w@m6qav3o6e%^VAlcsB*YrgC7)0L!$fti7dt*FiQLm6$g(I#=A zfC%_*5WMZr=UpB03{5IeRro=NmP7+?xst7qvO44P)iGjxGWQB})oA?H94qtUN6}e8 z-?3H)ah9M8N5L`pdbvGm#d&0Am!;i1a9>vo(xQYW+1PheX}pzxT!ha7j(tzg3Oq?Z 
zcUP}9J9BdoZxj5-*W`c+o>*?ir#BBu&Dc2cg2T8&;gPfIua(a3NVpltl;QO0w`N(5 zQtJcgEZnS6u;ilEvR+31?%O9t+2~rMUNwFdE1yL!OSR=h1|Y|>?zc)W<a}H_<1%)%lS0`#obOj<4FPg2dQvV$jIfHCNxHO0vEzOEe5BWMIBPHb}b<(+< z&hL+8EDxBo^QK+Zdr;ENuL35CsUMtD+AdoQto3NP#ccB6n7JoHc6u)C%D#Eq=JQ_q zO>AXHzAV7Pn$SK4_wUb$E;h%Cz&4#pB_UFDw*PHc5Yuuv_XY+SQk%h6@c%$07AVgS z4;dx><-mAZkn#Fc^Fw=rTAe`PBx-XxZ9(*dbhk3JHWikQhXPK5rf_ED^tclCVZr>} zJ|l?T!)I*SWvP_Z(&i{Z*mG@lGol7g9zo5PtN3K! zSW-Xx+bmD5J$^Zj$u-v=F4aic_+b&ch}rOGEiWXG$IAz<+*|tWnQV;Z<`{Lczeg4J=*~_1dtflPVDGxv$$7epgzt1;bzw}I5>Bq+KufL z7OHBV=$Fw<Y#Uxr`>NSeJ*2R(WB z@}A`IF4NWwIdAQm3<50@#<*S4C3M`krIL|7(p%l0oHN=Vm~!vl9QI~{bMV2Or7wYs zMZNutzKy)&#fP61roa&0`1xM7q-D|zl|O&}a3Gz$(V3zd70{M#hii%|E-8M=(Kk2Ja}q?43`62Q-uq=x>+WS}H$O(eY@I-B< z8$md9N0D_BH}8yi2ZGE0&p+OvcW_p)f9soN@18V<9-kD&ZHoJU5OaXZUkmab&OeE0Ko^AzkNY>Kz(bMT%8jpb(Mz=tyS0j)7Y0M?3#^=7wh z*|J4gs44Pqp#XsbZV-nYN2-gwlZB+S^O6NU$c#P5?HA-5?jqkGDWfPM1e}cQ^Ujke z$GN%544Tfk1ZV>yj9U2xVg@>SxB?(AF8&m8IL;;!G9vRs@0G6S`>DPyn`hQJc=I~I z=uhIrLRO6~po4{vCb@UOOu`=U|MiNS&UNJ&VhF=)UZR-$!4bV+;#Ozl$2cS5TJN({ zNkJrxQqf3Cd^_x9+%^BdX7Ki@SCn=hd1S9R;=!jcP4RuagXnl! 
zutLNJJb#wLm6dG8Y0YozJRQ6gIKCG>p8B#x-KHmpb7yw%vGecmmaiL4JZu&%bkA{- zS1U+bqe*@ox3KMvt^eYC>_x3a_>0``oauCE6eA*a8ogVa?1)MN*}<5@AXGe96f zv5wY^LAJLE2!95;l>ehg>*dY_$!M?Hs^>S7l3>7qkb)ZzwT?)_>In2eB-yeYVaQC= z=W<~4keC(uh;h+#$bV>7ov#J6&Xu?L{Yg(R!S+n(_1eZ+OG1ibRos4P1?R@Mkop$; zCYyeg3Ty%$oqlw&5KyR;HjSoP2ERdVVJ{Qn{|b!nlfq&c_K#k?!`U@|L-VvDZP6K9gtV#kVFtSg0GxpO0;0 zg^P)zqGSho6rS$KXF?XKjdf5@>m9MV;cLAK8!M8c-ZQ?&Y0lN5HsCTM;}XdE4x*w$ zz%gv>=w3v>B&5@T4x49x>d06RBo`(Rc7OZ&^~Rfxopi#@yQ?2AdR`}7Z)C)j_*NF0<-zT z%0nvrjW$g_J*8VO*}w}g)D4r%E_uB%L#E{5=KfhL=&s)H{G_BLAR$!FPNYVZK$A;R3&JM;DBwSyPp~c-pLq9fM@>EGVWU9%jI+lcKTtg<$ zCLE1V+vES@WlK$;QE_9*?c{O@j$hLvl!t5FR*xLfdcL1jC+53&8|(Dy+xN~=4O(UY z0{H|}Wpnl7m^2ZzM(+pQdGTX^%h9(VtXbX_xPS)ouWY*GpwQWVU%q@<-1}tZlJc7h z%};!d4I5Ht@TK^?pef_xtO~3b0wJQ?ExGD7MdUytHyGx=;MJ>FZr)zAZ1@d(&Yf!< z+GovdfI=jWwJ{;~$6p-Qi`A9(pMPBGsr!cbxtA4W-@Ti;-Y8z$J)gQQ;iKj~ZWPqP z8D9q7Sl`3&aNx=XMBs=SgccSSWF^VQe2%>S6$N!zl%m?~BN0(9CNijEUveIci;A#Q z*3ZaqOM*5lX%2k(-Q$?d#;hO8nAXvd&SrbLv36V}7Q`Ok^t!q-HMOi)Iyrf-Cw9>~ zGL|Tisueq+) z;DH=2FR}h25-hpC+3Uw2zVE-AE%)hHFUdL;7FI|(BywPg`~pC^mgcWwwLj2g{e3Dm zy1<;Jqc>!6Kp9VoytE=GXU5a7eDIgFp;E*KFT1Q14oQ6w(X1F34nZ{8X1uO*Q{(KNn~D$^Vb8GXdwa>)O8NS)=Ar z2u+ewNi#*Lh>Vp&C@M;%xim>-_$$qXLWB%OkwhVtBn?8OA~dHGeZSp(zu))1@9}&c z$8+3I1FrwR_Fj9fbDih8e*UZ|)A2v&{+T!<*fC&M3y8JkPWiJx{82XgrgSW`|DZwn za8hx`R@|t;D6Kmix$XW1>bTY5-QOxMF77qCS4gY3AOP5`oP@Q2%fO;HcSF+C-({WP zL-1>$Z$QMlf278Aiu^$7_`E{RLhk?H#W)XHA-vvP7G!U*vwJgIi3u@7-@DwMx`waC z1BJTdDD^Lh-#=cNLG^zK5Gz#r+G8R~?cljQ@jP&{j-N{E?$$|iI^lH!6mWB!2$?>n zcD{m9zG9Lwfe~C!ct~@A(n&htI*|43K)k^AR6G{#G9KNT+=#dIwEH~J91%zw_laBf zZo}n!vneCGVc&%M5uygZ7@hEGzz8YFq(jG!!mo!eKHcCyGg@u;Ywg)EYu(3hGg9Sr zP+fc>lB*(2>91x#G&$C{MQ(L0PWGZb)vKB^uXJ?x?)V@HD`f_|ad&C^{heMaa8e-? 
zqFKyzGW661w6KMnFSkV&rhXv%%7uL!J}_JAX35l6VTMW?msDj0QB`;r3;i}tb#CxM z6bho72<0QQ>vU*LGSrR&3>9`@m{cvPucRFS!+-TJtIPYYPd%FV)n>2)FPjMX37||6 z)!B!p$SISkmR;lP4zismfm#Dl)(wOFm&@lHgc$@E=;3b;qOJ1mo~>#?X>Nmz%ze z@b?rP_^xZ1Lz`I?rQ-VLn=SCy76vnm3MuUSK8eR~=rT2U!4PfhQ6a2f4{u_~_3zJD zi-*$jdy0CT@EE)-eH$s|N7nTdoYV326llbtbvh=d?qNzcC+KjniT8F_7Z>^#0o)QD zS8=jL0tN{3ijD)?0^wwhmpShG0zwjo&a^0QlZ^hviAX?2U?R5D_R8t-lLGc%|L_7Q zt66Q5rv-Z4!}wO9CuD!)AvJ;8uo>;>un49NrD9j|3o=w3sJn1f-?EV5yhUKpFKLHQ zqick>EUFW+-1I5JgDmKz&6`)AH#S`)o`?F5YpJfDtlCb)Hd0!9xXr5B3%i|~ymGc< zWV;8`m#x2Zgt0LcmDd5C#Xp5T%MuAIQeigC`~A-XlWh+w=AmE#s@vyQ_qzF+YS**S z{YB#PH(y}3Cp=70SS6k+oTJH37rB|}AzBv6)-Y*c%5d9zG31QhACva~}#KGNhJ-+^7WuJ!c=yw`KFB)dGTC_Koy=2Z8;8sA%>dw$roSv&T z&5%*h93(sZRmo(<7`aF9t?x+xs8ip6c<-D12|s??mK3kgoe%0j(ac*5*%R$|{P_NV zTBJO4L4?r=JlsUq3t@yMc2>YhZBOeV%8Odgc{5Iy8cx_lG0=y9#UlzcR9;lezm_C! zwex^3SY!Saq`WIV%;p(~hYTFGc;`N8FUBM@I!qW-_tV_024y7R#gUwnS&)`{_UtL3 zPh1END;b_Se*7+3uR9;~!--Y(Z{Myun6mp_^afZ*Fj}Z)%7I6*7Ca$aL8cAwC7#xq zVFO$y{AjYU30$>myror^Dh-0%v@})89yy(VnWr!7`g>~55xY$E-8T-UNd>HmYiky8 zpAN2|dXH-PNTN$XLTuO&QCN-X459>S;3Jxw{5XJ-mPh?tDJcz2R`lEgSDj1&DTYnt zi%abRn4DsoUrI??_R*O$bFt~wb-~p?9aZ`Y6azS)Gmi(lnQY5t$<9_Z9@%+e^?+MP zDLjd%_3vz!rfldyET#|>9Mj`~jl}X>I2cp#VRR*hEM$>&@vQJQy<~Q3w^|aj{|}*{ zd-qvGLq($xVkK7EOz6qnf?sVX^?u^e~ww=pZiyPiOOKz85NzYCr` z^X%;vH(vf0C_Ig!?u4HxB7uX~F>B78+k+aEbb4@6Vs38u;z3`Z3XN&b15wv9OL>T> z7=)rY}c*`C5TFGvmb}DMDQ5C?P zKpU9U(r7Sy)L5qHj`Mj%0|VYsSFUbE?xG`rC;Iq)uaza-J6e?N5siBI@iGEOAZ(QE z{7mih1z%Nja&~TO+Eae~Ugb`X4tXqGyd@Vi%}+J8lvTM(b54-?1=>mqMH5MW*D*j; z(8!LRI?aE%7C&IhY+?3wxNRPzyx+Y{RR?l}YQA4I{gA-2)5TQQI-9IGWv_zv_~5M1uFS% zc%Zk$Ol6wBQ%*~V89JP3W2h|IlxR#E1+yL?j0Vd3ykB%GA3M&~K78q+ktD)~DI6W7 zrDHxa%=XDQCg^=cFpb9G9+ke-r)}4snXpVFi%f8Jr2|WwZ{M+F#eF9ArVH zMRr77>*DMz>ljs7tv7aH(4D{X0~FD`5U-Kk zseh&>*tbYZT6bZt)!AteBPBmdd^Y=t);=L6;^UV`@4qQn;JWdz%kPJ00v@L$)oc6q zKNe4ze!0@wSzmjvhRmsMaz;5!9tA%cg0Y0ft2_4}JXl5fDpa0VQ&X9IGV4T%?ZNba zR086vSv1forIlyHt?>^a5Q_PVq$=UJ9$1A}fN6)b;upQO_SXA$OnePd1pm|JXhxX~ 
zC_04Yn`d8Gj)x{X0sZRMf64xOg3?p7&HMaGbQ6Uaw7Nu*@?}jNyb|Dvt3vX~a~!=P zF);>x9fwNLKJ?!cNEMZpktz#Gwys6mqPhk*?nR>->GmsgiP7cDb_+pXeu2QZEG zZf-HLu99n-ZQ69}A#11As+HH2KR0*EzHERQ^nwYEd2K7ku%eAr{CWM}CNL>7O` zs5PHmj14T?+AwH==C@wLYSlNQ5}SQT$H*KLcoVkqOz|voD$`-HGc$t61O?f}=Qgg? zj%zY6m~axl0-_!<_EL}xxR*>qwnRQP`NswgG?WA^?BKAMKMYcOdU?g#cG~jEtFHhp zMP2;R^;m@I!LqtddGRUlw<)_kM7$?hBVbPwm=Pr|M#)Yu0{=I2KJs4oukY+X*-GiQ ze?(m^2P1p?}h%n9cf#GSI%Xe{V?~7i#ry`o$sD~7?tU`FXk@=_$$_!Z5PIUjLRQ$ zH0IH~h4xJsx_gfvvml3jsCH)!Xh82Wj8~-cROFvl5ncIk$KVCL?D3Upf5Mp%HM0<2 zF<6sv1OM2Sl#*p8w4A)HufPB9lP4`~mq`aX&bZbXt1Qt)+oCT!j=Yi}OJH+8 zF!0xp9|6Z@u*^o)at7K>nP0_t%7FjGVxI@$D_3n=yLNr2S>B7`yG`yf*60;ho#5vO z*p&ov<+vE0RZQO#?NO5G|3+Zfy>~0n>*`fGwlPlMy7+t6UJ3L#C3G7QTR7C%S{tu; zNoVFiveQ=a19`hmg9Z%>!rJRUlGW|Ep3OaMKKjr= zr86VA^}|UL8cI@4-+T%w8GlJxa*#?h&@_CpvZ#L6K64`zls6m%dN#lAg5!Urq1)Eq zS$w|j)P{kUX#<%#r5_f$ds;VD)jdNT8q%2xgbgP>t8_%$-R@Suo7DGs z9Vn}-y}h!j>R8sA>MswxpB)*uQrR;>vX5I4B1?MZW)cxG=JYQCOYis&Di`K#=4giD z4rM;W(y~=e@ml6-NVdLFJ<@9Pjnc<~)dqqQB>ZI{g``+KTp(aOe9@Dn{T#G1JYP5IvjQV-l%t$XZ$aM~pcRY#5l`D=LcDBHf5 z^od!+2C7PQf2W_D@$S>x*X`$CG#C^YZ9aOST(|qa427r#JAE~EW3_DfmKJG^29uWP zpO{e?KI{Isr{m(B?82YAeYKCP+uKF9#K~EDZJJU-(}J^m$ENps->TKC)0=KjGr8RR z$=(d9wdtb=-v7Fxz}GBF=8omng{Sf|c2^uXFj!$Z!EuK3nBQOh-;G>uE1%sVB;xIZ z>E2)77cVd!&qGXZxoFX?XJ5bedONUp+X3naR~KE4d)j;XrxgWJQn=;JF`YKDQ{d>H z#WCBqS`T{m?0{=&n$?=d3h&~}S$^^rR}t@+#M|iy?@P1ZZCaK%vh(0khc#CnUgzw6 zaQSn!tKl~0zM^#CioE*RNet*=XcKnTT;63#Ss5?)AHIL{-lbM9QK~@)vl#*kxhxLv{4EGj`=7iIbqftFKr&y#K0~dM}bH-rCtcNZS|`6}4{E+BZJxw;$v)H6=Q)WlQn@ znkE!^QI0QqKh4FXvhtgnV`zYkyltc$GSQviGo(9o2qrsn8UHgC%evOb@6JQ5^QmZs z!Ra2KZqMAUyWqL6k_Mgd2Fd;7XS@0ZFi-DNf}GKWYs)OEv}euE8kk?6v95VbYm5K_ zcOO-?4KGp3@RRceLZMox{13)0NQ@^VLC#Llk28FVW0YNuj+rm*-oOp+uC89YVIo}r zd~n^=%8ETbJFZZ;|H5L@(LW8_R|Lw$Xn0j-XYA=9FK;?%kk?8FI|V~S?=BJ9K3Na) zH@sKa=6w6k+}{ChG*aJJr>Gv$^zyovu%=(v5>Kmh*vO&G6Kx^fVU3^7j-aI~`!j2K zw+`~pp2fOen0Ky2=Uy+|-S@kPo<4oCt=ZLua>>y9i-t{*S+gd7WR`)SZ^S%C2*k}k zo}n!1R=N6VthH^|`QiN$X-r_s65y1`HsBQ)0Ks2lcTxijb33Srz^|ELl7RB 
z$Ta%Il;5St>F(Wc>1=di0BWxe)3wiE>}{4dP+>vuqTZOWnJo#5 z%)I8>O-3^BPqWOjG+-3pUU{LN2P$rxjud}w+E2TGo*kDqymN?jt^!p0^V^l&|DZ`! zn{z}k;d_CwQ1FZBrObCy`K=VSYmXjt+IVLS^>b?3N-N|@fnxKGI6c#dJ*&>j z+^;#i{osir!>`h|$_I}3UAJM>IfK(C63t%yg7zWxs4%N4AF4aGS4XX&HpaF#=T1ql zrf&;Wb!dRPumr$!N>6&^{#vfYq@*I2p2o_g_2(C!l=ynw z-tTR+TunhL@q_0w+9LfhH3!WM@?5{99X~UA{l485}fyzF&jTNf8DX@ z{GPzF8wzbFI4M8cW4kJ`#`k2W#)bh8t2++9IDT{~Eex{aITL+N5nnAA=UPNQm6=#AaTw#1AWuP~m2v)r_Ez9E)`F0(x%OA%k zYJDqljaT&33xlHbld*E`J9ic>j2-QN3H>{Qfuo>Xw0VYWA5wJ9Jzvnipy2-1l-%l@ zsuDr*K}x)w{xo32-Go}9U4Nd?fz~+vFmTP=ng>2QB$`8gRo1`xVH;UeOPxMr4}Mp@ zu)zCQG3H+_1# zYVuNDy*;=azAG{7ZRr`J2%=M*45+O)QvrR z>SDZ~w)*nsenBr5#e}=jx)0-93osrI&@dgt1K${Zj0Y5qB#Jpav7;0Z!MDb`idgM} z58~I&Hylo*T!$Pf+S-5xP!|7n2)noPvAtU8$&=nO@8WOz>B&QQ1CbJ~W$-6|fVsjH zpj!yjJOA-RWsiNt2L{$&Ucy?PU8hcs>aTXSVOo}|Y6DCnVORn5g9J+m9EX;+fNM}L zl-u0)9O)|*kHUCMwX9N|zSj=shD?Zb|G3C+#^pY`$Ct$1Z2W+WRY=>SvSkTA+dgTb zW@Z~A$DC_|S>E+OyIRiEJ(%^v z;>pva5d*;(3Cy4_dkI~ErpLflNdX~z+JH-o3LIL|vBt?~0u_4Q8#HJbj2iHeMVHSE z4E0r-UUkA6eNDM!$G@OR?+$|51or|N5TjJg zLq4Vc)4vtQeVa->+DQQ(np|G057j5{cVpmp!MZKzXrOu&RF@uKnL^gyL{bXCtS zIC3VTgFP zHI?WAz|eXK-y}Rem>YomQS{@jZ(hG9dD{)Ju_WdlGevN5iM;hdKfLB#VKw47B8u^k zxEnEvTdtq>>4m70?t$KwV17DP#xchzn6dB$l}3$Hvn?GsX=qCiNFMCEoQB4g^RT9X zJ8%0$Hl=4nZBY=!Wt!9{jCb_-@G|W#;NGd}+t0KeJXp@m%-3UgoZJ_8I`(3sr>ty< zx#FQx(;*u*=ypxw@H96^L&o22YBnLVc39qG;glp8RY#95d?mfw1N==v@>2uXj`?u3 z1o`0Jy+bH>9iLqC@V>ZAE2NCg{O%PZQP^_`mAfPSZc})cDzkO+g1mu?`6m_TWfQMcWWHi>&1;-Fo+iJ}m`l zyqC-YyBmHrsEeTNa!I)2)5{e@;dj8&iO ziD_5^4E64LVpMel!zK-{Y#glM<<+exG*J)Du)3dbtxF~i8)}%Xsuk4kCQhC?;8e!X zG8KFN<76}1JVE7Kc-fV?bA9?7G-$pbYII{gQVaYUugce5k90Y2mys`c=FH7ATK4MH zz;}<@j2gXBT$roQc!bZI{))*?Kcr_XOi9_~{0)Xa3{0H~EXwH}RruCh=GVd1%WZ8UdypD|snUV8H+Mr% zFMwF1k@InTOeq=Nt6p+z)_!3nK2&{4LdbfTT?Oa8mhZozd3T$a-Z&BgW|Yy;g8l=K zD%UpZ)bz7C4};A`m)4K|1E*F@ZNpLrCIITr+|a~DdNy*sWHQieN5F0I(tKb$SU+;H z=`OG}oZ)aK(h;!)p6$i;di==^wIh?sVB4s#hdTRcoIodPC|Leh^Fue+SFc zr@ushdgk&Sr!|80Pp*OCe1~LpWMhC$37pS*Uv*uA>@|=0~PE}a7j@q 
zkxNP(&SGoFaHoAMUf;bN(1(^ZrR?F$moGb*p}7ds^qeSMAix+%@Sp;>ZpSQSe+z9POya{;OSMEJnAKm-WqZxkjyq?c<{O)Zva*&Rq zOyD#okGhOQAl%=p7q-%}d4%nh*~SIoy8qIq=Z_j0^Xa6jZLVfrp)hSeI#)Yf^_ta- zu3?5clnXm|?rgYT#yLP{bKv`=Dy_@=uB@_tF>zRPLEP}im%C=}?Xb^SJA4%7e{rv) z*SaIHTMApo^ZDC?^GE$c#=5F3)qZ1*ieX)zDs)eR1@>Vznsr9OOk*jpDO$;XFif(5 z1gI?Ou_av)P@&i@`L0482sLmwUebk#QCvDbn&)S))w3rps~utTOir>hd|%XayoIW& zM1S*?UNT|E-hr!0xMx`pv%I0~duq#}^_8zTEDvgY>+51_E529cbc$n<=vkgU;Jtfm zWBF&>SLc$PyPTe~^OD1Gqawvok1O7U`zN-y8Ep30@u9t6POfe!XF;MlFlesSPvbE- zVGK92eaPt8J$nMoLWfT@QEWOhsYRkA_v|fe$tok;NMr$ba&lD{i^m__KREbY!Kn*c z-gYigIzjDrk!!(|ySum_S$Ph3ywl6&aKEReof8+>=F>|N<7CIKGOlMF+eidHbd)Wo!XM8vG>B^P=+ zIfR5P|LqRxrnauiO$`G9Nc`xdM|$=J)%VFe=6%s@yf{26wmN#qldV7c(>$ z??;W{`nRm37qu#R>Gf!B=H6S2B)$>O$c-)BOl=1{wCy|(R?miO`Jwk(ozI;oB*jfLZ1vltZF@=i#5h%Y z)ctbcJ4dfB?lSbv^S4uDtS*}uXNO-)x}`GBgB2Uu7 zR)yQJ%ka!#KaV|JXF^S{)*ngBos5j&drgfb9XW)z76r4Z@Fs2P+k13N3yXp=GquCx z$y0<><;9p* z{z-J|Rs3Z4x~GqJ_HY08($j(~sYyOv*7bg+R-`@2y#Jn_{WoTsX70?sDZi)EuFX@~ zfk`LYk1+74b3d@-MWWNSq_Pw3=icu#X}i<4A>BGYZgnfBuK4?Nk5h9eZMe|=@N&&P zcL!YF(=`*;x=atoW7L~+0?Oxre zvb0gX$14?$rKw9hmfuSp=Q*@xixaE!ydPXvTG!^)>%Geww+xSI*J0{TqvGc5&9R3Q zv-{eQYS{WCmcmwg zYZ~nC9rlY{YkJw(-27TS^o6FX@d3P*P!wnZv`uh)yqtPgSe@%79n=ks35=)^{!mUY zvV}hs1unk?R!F86qjz>Xo|({sN4lOfWProshzoY4<6;OHE?BQ2;lm`}&({T@6@{a6 zGQKXf&BLDK&R>*LSZYyvx${}4Ep=tzH<^~j9lF!*Rqab3gQ}-%$1Nze_GlQ;T=w>P z`s}sO!+$gwY!y@M^|)~h3#$ zymQ^-3r@FVzN8UIGTXX;(O1;kuA?1(ODV~Cuk@f?6tzM^ikfpQm$n?0S^T~c!(Gb+ zG{VY$DwsP&xEeyr%Y}Lh=HnW)MnWes50t5EfU;ztJD#{)b$wCIP=E-Q24$p>NXLH& z?YF|sK)UAc;+djFOmRU{JCc9YdC0VZRABU6&7RHX;nzY0Z6`hO9N7UkWM}RhA(cQd zLcL-8e4pF_v!b!-dyXybkr)Z0c!2w(vbOe~7?TFFqPh7i+Pvf~Ne2(<$%H>Op=;-z zV*;(xtS<`4ntQ)dB8BLavRWgtBK} zNj8Xy0|P}O#s;C%zz^>YH?l-Cp(B)teEfp#v`Y z$pDhhAJRH4&};*S`@P@GAz&z|VYvYZPdf5NQ+geOElVq_PanHQI}BcBQMKrLpTR|& zj68(M5fNJWrM~1Z5QCuq{oY!C?bj#8(#oPwFBgXy?tXxI;XBZlq9X8!K89PAOJj@YZ@_pf8MH!DH7YLKE zzn)#N+;!3k3jnVxS6m~K4w7l3-v9!dtGg~j_Jh~K5ZN+)zU%Shrd2I}BG2RDdES7{ 
zM+W$416x90br0PmHzdpRI{z@MK`SV6fBg)Ji|KrgXBxqam9ZTt=M7=pF->)BOZ12o z#z?0x*9K2r2=)Y5uVVJ56v$E2rw>6A>Fqt=_Uv~nrMkf|^~CM>8!QJo=2XLmtFwaL zU)Qh4HMisB5sZX{waO-zA9y=*znsoIzYTYUn1)v(oJvSj*SqeXD?fi4)OA)yY?QE& z`t{wug&+==Y|XU|Nq9?DaR*r!uV-<@(3&Pg--cOQi+n#Qa1Y^N1)7=m=d`gD{E)s!Dm0~K8l07tm4*2|va;b%T}K+{S__*^do^lo038uP^BMZ}3z;1Gul^HkGJYSrml^iY6O`zrg?~MZu#Go)ab(`P+^H>L6Va@m zWn&ZSV8%ogK%FC&x{`a9-gH;-u(Yu|nC1?d7J&VkiLZx^_%56)V!0FrIkPV%PBKBd z=XEDv0(-(p<|LeT#u#4P@>5l${Wo{Sf7SB!jrav5$~@Zp>*yH4>Y4NplMdIQx6PZz z3^&T*DTI9r$Ua!5EISfHiI^nwr6%T_+;CSPXgPkqz8O%4Ap~K}y&n6Lzt1BWA3|UV ziI(BsBH(Fj?>$wZXymXsB49ZKeudXJGM77i>jU22=jqbM2M_;p_aP=RzF>=(w?YnddqoY|A-i^0Em8Rxi zuodA44h*L3If<*c7sI&mJj0A%B-(mDZ>Z)6U;!a0mPC3)0_SrFA3k)i73{%1L?CX) z5yv*(rM7rvEg?Psf4?FOCtRXD|8Kvt{!M*F`YV3vN#>B5ip>7MenraJQNNzCdbZo$ z^6vfnNL*GpIGC*fi4xj(G)1frv3_pwLMgz00vKdJa5H$;w}F1Xn(6pK>>W0GbQg{| zGc~KPboH;OEOoG?4X-u^2V8sr4y^A-Kru2xP@7hDs)btrE>J500sc;?i3~WGcyAB{ zAASPHN=sLGu&bd#KpDbscK&Z>+KSn#{8Z-Gk8X%%jsp@}RK7FR)GqJ$95`ot7JUUA zAJ$GzBEo5EzI?3y9KPD^s|n`O)t**cx|aR8F-B2wC^`}(XMj+PP{&&v&-IhzWH@`2 z@@#}_=AvJ?c(FY`MHcn1kk;au>?>Ct=xKVP(3jpm&%|+&ub1r82bW`Eczo^$lZw=+ z0(%C53?DJ#PC`s%`bnGmv-q8Em|}&!!g3V>)c|a6zR?``_B2n&$a8S07ioof!ss^~ z#P0HdXO*>!x`jNkhO-J;6jt`di60b#8Y|tz!5_|-01Nf%m^FI)gB2D{25x$vZiE)t zLq>><-IM~Um=nZTipBXZ7GU(~{Kx9QvOhQ*Z&QDHsJ6}GZb(|`d;)l6B7oD+!ya9j z8PN;U4qo^$RHtX@wnitWHtD^`%ljUluCdI-z|@NdM_Y0TDIt7j%LzpQOnu>D2X(6| zIK4BDjk)~Afy0M|Cmis}I8tjoI$PWXVfYEn6@7UY-r}5ORVLh%1F*$^)720iDva?3 zaRzyW9an=k=7h^5fuaa=d2(@9XC=N!KiPc?)tE zbXOvGQZP8RAJA9o?Sr$c5G)B3ohx?wXMpnn73eoH7C?*%6Iv`p40Hem4UYVzHu&rU zkTkw2S!a80FuXp=eS`rU+Yf!fCIQp16+V6YW(%wlzAXh7iqO}f{^VQ;?$PN@rQ;&1 z6zI<&3`9gtTVW-we=@M&fy)E+io`7NrYtptXl31 z^x>=EdI?86rX7mezCfgy(3bmr}LIlf2FQmmC;(_W$ zO!(sX2r0XW&qa=c+9KvlVyV?4vOZPkO*7RvpFw)rvSq@a)ciZIL^RZp`HI|4EC9-N z!ZgQRyvC=rA>p+m!cw6>#ax~iucXGBC+r7yv37FO$GZSW_m*7PU7NNDhbvG>rb=b7 zCtzkG-F#pO-o+R#Gscx18@o(9VzC$pA*8PUO~%G$`l=0c2{!mcE$8MCgu5NcIj{~I z;Z|S5n2k`n9HONYZXl0B5&z0V2-cQ6IOuTcxMRSg@8*C76H=*i*kqHYO!-{m`aqC+ 
zEOs??-Hpwu7g7`kDDH`h8Yx(->^9-4#0}yf4Xa;^-}l8Q0*oc{(TDICvN=;16F7Qx z?>@|_(}SMC6VSoxBt{KAb2UvYLsQe^Ur~PRpeP80@E>Xt6-Gj{?8Kz? zA-($dSLRk*&|hWwj;{=x(P_JTX72s_fzY%^cY7!vggIOy#@^$ z?Sr%Z2_$v&35E}v>@ugns?Gj{Ek9j!E*H;7(jpONI9sv7G<0!^PoSwC!CQLN+|($q z;};59R5J7At_hoImBX0R|wx zTu=(&^!f*Vd?ed`R`tPqYUv}Y7x7T6j4lV|802}mtgI{@VZk3NelkEn*QV=l9-JN9 zrLW42DO-Of^cpZ=Wd5niM*XL4xf&1@q~uhVW>r&qBQ*gh5Ju=N`UQ164A^fzX^r2~ z%g2~vpwy+G>YR?!(t#+BhWiZCFk$k^nb0D~FkWpnjn)7R)#OO2b+By<{MGPbtoWX| zYD${k23+fcqsb#ATcP|q>7}_{r&{ zxPJ zH7^jzs}qZtqS8De2QO5oW0x)xVRIw~;2AnQ$H3;D>VT2wBgjIb)JpT$-uM=%#$ZXM z;lt18rg6EPUSUy%0^3h0kTGM*NvOxkJ^&x3(N4hEu<`>1lQ9BDvUol@9lr!T<023> zJ!0{~#J6PNuHRSu)^sq}=COSLeuf8{4X1kX;YsY~{oQsnC)sw>AG@Y-F8=NuTy4ic zQ|7ez;SfK}7K%D$aOBtf8Xx8nhaLg2ii4^ndYD$x3vXJx1$O5=yl)DU>W@%1C zB($(P^!oLw)z{vh@#h0*Dk&@LK2O`4`WT%;b$Q0_u1%-7tCYTBCuDwOb7Fv>pKKFt zenZIkVKBpW{BCbbrnvvR61-doSPX`=0K%wdb&tIpRGW&hIw&?&rj~Ednqq3 zZ}_B1$CoVi@ZzNj7q-=xZ)CIo4?u>7#E;dO3IRd%DPaJO9}2?YH^z>^|LEwCUb0R!i%AoK{Fq zYTjqPiW}h(I<@^Jathb%~ADeVs4( zac;Z8&Nei1vJl7pq`huor5>b*BWZiymY4);9&INs_<@s-(yweOYEoD@{&aG zCqWZlY@yd~+g(5DBq1JjgS+ z$5Qau1=a&zicT(vzKD{dqPQOEb{<6$#vdFwa^wQrYwE9e=RqI#J%4T329H`-)8#}FMQlhg}q!oPRxWHJ(e zvTQga%tYR<=kD$vmBdoY9?ScLcsPc8VUd+pK1hI~5)e9_la z6N3~<3CTg9J|j9rCJHt0*+0K8ynbZ1Y}o=^TY03!l=*B{4A+K&aro!VA&=pUFgYb)YbhkR_#n#lyPuO2OzGzL-jtpb;trR=c6sS6#VIF6ZH=DVv?K?HIXa9O*p>cMof$}CFR+(XJ41TtF1jm!0f#Wz^Y3x`Exr;sqj6*S57ONmH|A{l49B&MVizi?sAS}*(=eE)u=Rj2mtXDIjw7ubsieX3iu zPv>Q6Tl(QIfW5vD>N3a4aRGaX(`7OQSISC3>j_B?I>g}RUEI9>mEIMPQC(M zkBeu6`OC+HW%m~4-nCK1@gS@GJvuIAVu&yOb_v_g2@@u$O`NzwHOPL=n%g-!I=}eP z;g>FT2FYRzB6m3GX@onwhX>%GQ_;9<^jGjLa&j6i%>A(|#wZD7`JiEh9MTZ%!yiF;U>AfPXg(+i5mdWK@(zPAu980yuHQaLJN=zyH*w27e*E|&pO9sJ z7_rg6ywrX{_SP+1*15U8VY3lNhx+G0NY;1l&_*wA=GO93UCj-MwL@#+P(_x?>uwIw`Tf@B$J^4KhyQhPpjtwX zUPaMNIC80bK4EXyA&1SQsU!Wk%Q$Dx^UtkP4XRhpY;e`vb$F(NzX0rnES5#A#+8gr zw}gMTYPzR-KlPLO4f=y^RQs_riCr$0*?Y>hBqjy5pfuDhy0 
zf7jvJV~%|x_5o=QT)OS2EzULaOI`%FiIU%Duy?!jwAy#$2rAvU96@@>S#Lu*r1To8lF7Ytp{T=`1=V^fIuFNjZP4%65~J97-QqQJH&6GR_u190y6Myh78oZM zPA-1BP*ZnpsMy~0YokaF@MQ!Q83y)>H<4$`hsKDL+YXw>PMg7({Q182k2ZM&hn`DB zI2(T!=C$SJJ92DhjXAbbt$)~{Zhvw3PA=C5a%4L!@SbU{*8jy-TQ{}Q7t&b};pQRy zzhTLUULbDNqH865Fq(h=6unc{wYhS7^KQ|gBcFp#DQ7G1pM!Xx_=RDDZ;s8~s?R24 z=gdFh>^t*Y!#+X)$mj^B53qa>Nz~B{ilQA{+4uE#gMJhHcJ}L>eD}xccuuv&wJ)Zv zxJ9>a)dU(5M(Rh8-i)=*bgz;uxqI)P4d$TY`tP&U9Y65_3_rQWvi^ixh%?caRMir{ znxzW&A=(_c>K*f4Yv zxAG^EH`)N)>xRW|HrYTchvDUWrsQ{?+4@tLPan7^slu*G>bC$g&Mp51tT zeSM+>?UKIg&I3VN{y~TK5sE2Wg;_T|;gzaE`;Q!11y5&c^N&h#GhWJMS=x8#U=1sk zIjYTHFHHXg1EjK|LiSnMrZKO6sC{@MvIQHbiF;P^UyYJ9h1!`Adq{J3uhf%Pe}GKk}9}&sw?ie6Jzm z-lTa)edRw*n2h7M426r4oQsT&wTOo6JcpSrT)3|IWyWDKpW~8wqVBJBly{g#kY*x= zQyeo!K~{E_i+t>rE5oNwwLI8Uj3X!Q_KQy?5h4D=I<0^ujb#m)i{k`4to}`6I_xTI z$KR3q6AA82CMiHSGfR~ye=$JJ5FAl``VSj#c6zxR@u^W>Z>N~kth zNB<#aa}bTd6C5`B-<(245wcJbc91f~;dwq88U8~)ZF$n6GNsJXu4h3s;YwUAsxc8y zsm+AS8<+BhQz9-Cp?_XHecBB(D(V<6z@^n(l# zGoLW6MD_~<1-ZqcR3#fkoP0jpbZ%F|(gU!upO|Wgg`@+zXi{r{;&wyoOlpgv&zuW| zi_FY&C?5N&ESC~TGc^_v8Dr52Jtxcq?WIf41RND)aESBeG?ysO zX>Nq$n|j({E_LdQLnS*%E`<7Ft-4s|&CQ zc8Sh}m#x!>aFPE+;rKMc!9PF`Wvo9{h)B6+&1#vLm^hFjdg0*;nxHe2<4$2$355ucVr*3aqJd1T!_@?NMAB*1ZI&DSY~Lo`=U|K1@JOyhSJW2!)fd4j=&xls+UH zk7uWC#irA5jf9d?ye2*#$t+cq>bmlq|94mOl@JKB@S+PFb)LlnyY7s30WOF8o1j(h zfc1jqY#xoB(F{-(T-(_5D$GD&YFsNJetjDpqClS9_uSAicr2mlf1`_5-)nn;b^> zy2N6_K@~(0#CFoKNTW)V4I^wANI_?!2hlHV=MW5-I0j;BnR?k4f7rsWXhl&0VUi%Y zqUZ=1;rw^r9ikFEJp?v`5NZIvG^7+_xE{M!4tM}Y1>JvKokJ&CHw_Xq0x8l3M4B*M zlLiy}Zs&`BAzXqG28ZIvLgq1FUiG!hWPvvbrz?;s?uVmDO|kI5RB!%{!O8$p9#CGN z#@lwHtbqk3%qeIaWq{G#o;zeKuF`>vC?VFgRctfckCaDXkv z(*}pp8`2aJOAh%&G3cILvR2XPDTFzH)$KuUzBjh%1Kn=}}F5iv2@ z%r-5$mA71l(h?EsTs}uI%>Y(NCvJ^`e+M6JaqvZKoiIU$!am|gM8p9QaY}>vRH9@! 
zj4>^(iCf1S@#b9rVG}W`0RKY^)vHmFkwVJ=vQADwB;3U~(O#Wcx2LI_#?p8eX6HmA zCEK)TYr+9wMU!ad%GoS1Fpg~6Wq-<|QMT?T5sRZ^&LW-Q)u21XR>Lyt>0wrjz%mL5hz1@~o1a)bjZ0gkjku(bP?XZHYA=-g)$Jz+E>qp9~EMFI&XD_lu=ZdL~4FE7h6 z7tl-5e4%O)MG9}3dFn4Wtt!&rRkl~&B8xeH@1KmOF55virKHqh`}XY)gD0NM2TSq6 zJi9wKQDB@SEvzmVyW$>6k&VeG0Eo=oxpx|6#YYySw<$8a!~3QO)@ypRkp;qq!?dP| z3mS>e@q0{TJGO4QbH>na?fK6+BiY{%{@-tBJXGr5$t2@)wvDQ_O5ZtFi!)|>A6=i3 zX)r%XdXc)}$5`1ne}AWRV*0K_-fg1Fm-rJYFzPf9``EkvUwHFfpr?<~5vzckC$bv&}s^VhHc{^#56iI#PLsziRQuHMkqI%i`a~~y^W1=(f1&&b7wtr}6HqV>@O@t`jE%FX{?TLE@sQ|2 z+Td*^Ms%)NL4W%jNt=jjn3|D27+S5#$P|Nn$^AG(pxr*&=Q9xFvC|0_KX8-37#%H- zNjd!1^4G5)4{KU_&Bco~6OL?Yf0{iZGI)`ta#s#x^onEHjQ#@#Jfrmv40CA6^t9Qv zV`jb$2RI}d5!VE{^b9^BaYM}f#-Z%jzL10C&26!(cm|zX1 zqi4TiWCi^KhP(L_;3CT8=Ld?(2(T%lRZB_3#W_N4(tNXHW~chGq{?EKNRe)F z;QH_>L?=K{>IqvE)6-~L5f6l0yhdml`0nQi+nfEtRWTAVpmRiXq?~Hw`j`6}?wdEe z{V)~l161-1A5!{xTb3uvwQ1-BuFLJe2eg$&3NnB9Rmi1`Cqi&L*&-C#O>`k?njbMm}fvmK)L7kSC>cHS_=Qyvl%=_|J_ z$5><%qD96YRbBH`g&T48`i#vW@(8Z8KDfEKFormP#Yq(B*cX6y!ZDm-X4Vh7OY)Co z(D=`vT@GolJUKB)Z!f?BK#o&Po{cIs@DETRdR4919Kxl_eWpwzoIqwbPB>`MXE^3lNJNZmuuox54FD={sN5$CJN zd7qgTe6z6dO1|?d?}5csCU*J#olCBYQ9|4y+a4aVTz3|M+;Xr}$`tSgZH_ughZt7! z7xr#Lfe`|xjjOxKS*O3S{G*WCoGKT#%INxc#g8AWtSg`K=Kg8?Kw11C=vmanqsN#= zMMb^PPqALPk{b0$)-^LwE|tDd)3$1$23rBn0g=Yr=g#zQ=q{k$!qwgfONes#`lsvx z!ke8QUN`8b4h~9!{YTlpX8Z_#TEfXw_0I82EG_Se91s-jXkHbu()b?1_E@X~y=3&V$iY*RGTS?Qgg zS5zbegK=W9#cL!L)bdZLhI6qTZdmwSdDciXW_m3kH}TD@LOOfSBns%m?^Qi5ll1iM zsV!4rl?cK^oD*rS-pPsiD=grH0_nHGQ-LgJulD)4AIT&=f^9>)dZH6MKyaoIgIz58 z)_WymUK+NG(8UcioJLozX>$wGc|Xfxty;DeM?|#d^3J)g}AYL=1}I#xH# z554ypIAKW+{y8>WLG!#@QQLxSKG%Q0jUHk1_vCc5#gpKgi8<=~_wP3;W)?5Y$%hmZ z@ZcfYB>Img2$KrQc+zA#yw4L7PdO)! 
zvg5k2dO41CQ~7j2QvfkRlv?m}&8?2*RY7^8vb~MT7uY{Hso^M-Kg;=^-Fo)i4U+Nk zkLsCIr-boD@IvgWb7>xqx*sQ^2`sjRA1?{x*D(t2tv*YSN9YZj2O0v)ZZnq_U$haJK%Y^zJmWJz-s?9XG+8MG2o@^ zSl0Yqo#rgIusO3=N43v?e8A(CS0D7=u}|HSe}tT^e_EC=MJThz?6`NIUM|;8ojZ4K zVzItLPxj@&X>^{#Tj8d0&C+1`@^e`aa%|#FR7+iQiv|Pi z30r5riEdbr?c4wS{U%%&vm+L(BC2n0O#dV1Lzw-x*m+Kqw<3I9wVqueVV;zJtIq$A zM&_MYtv;I}5 zu4YZdacSZJ`O%7!ToctM*8%ddkKTyR-_s{&l7n(=L~jLB2Cl#c_=K^TumGxNPM?_Z zVSk673MwwI=XpDA>U1{NCMrSwhz2w=Zp`HOS`W|ES*r$Jh5U#u8;z2W*f)$r*814- z+wN^^1|2qybz1=wH&yGUhGU=WRbGw{tS;T~5@V==%th~$O)3Dvhh!Jj8iT*C?D^Z^ zXpRjxig$;J^$MQpy|Gn^j&@4*Zacr}WWF6!L?~ZxVqlF@^zV3AoU7eT4clg9KDa?= zi=w75#k>`l#Fv&IQ5gj?e^i_dZz8X8co z7Jzw(p>yb&hbbszhj@m~Sbo4$a%0(i&kx>u3J5T0jfhSbW;i@O<&c805liyTXb}PO4 z+gKK1Wn46UDG~deV#*w%XxXYjk6=9?kS}G9e=v2pV}g0VKhR4&g!y#e)rH4maNDJMKQ6lcBZ!s~I&Qz-TY+_HeH3zR%D+o*q8~$=3YL)g^KjsguJivq z_tc@uLFF4=5p?3j2{=*1P`c-rQ*_})-J<>Pk0$Qg&%is+Jj{BwbY z`s8FV?t|8M;u&BQ^h!t+p`OrYkd|AexX~w@Sogia@YoB(>GUoHSXo_tfJXIn!Q=c#a-6Je)=ka69ylY8Ku^>TO;n#V+f6ks~YZRtU zD-H3DHjH>)ci~2te)E$AeAsq(3&H{|kHKFT42@MC?IJmTTruk;r4+3V(A9wE7~&<@ zk@xraH|{RY`@Q)>^rH|XQi>Ok>zq40Xq@o9TS<~J|M&pJ&-(Fv{JneIVz)gc@}qRn zwHxOr{8te!>eazRL%K=+NBz1YL-}us{`U`e865cEb$jni|F?>tG4x%)|1N+3w7a)Y zvUsaZpp}%Al>7!Q(+xJ(8*Hbnbg|_>QmQJd<3_8fjaHesSatlA3FD`zs*F@onWCa1 zf2nl!|M&u@wKfiR9{>L@Sp3PZm@kkOKVhAX-IRF_hPJM=)^2pRad2KWYpuJ~xbc%G zx{n*DI({TS!rfEXlz-jxzy7s}t&6L}T4$++|7R|4x`uk@do8Tr0;M%T(iOP{R?cA8 zx}dn&G{n%<%Gkil*izTP6zJkb?`mHF^_ZdP$uA1Y&(C3S_c00xRxs2v(KFIf&~VmN zFa(Y$=mI@&px~^h;GSAqm6(~A%CKc(DYC7;U@zmhRZPI~B2W*5r>mdKI;Vst06%-N Axc~qF diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf index 3249a9acf8b3f2aa2fd0e3ce39b781a0bb00fdd0..b311e0f5dbb85f62bb2a1f6da550eac9cd2b04d7 100644 GIT binary patch delta 22136 zcmZs?WnA1n*DhKplmaa+UfkV^mqA)&aED=VcelY|iWip|T#CE9Q{3IPxH}Xr?V0<2 zp8f89&enD@VU{vV< zRHDR1cPUHY)o`M#)wepM#m@E1ez0VQ%bue9#QQAHDkkTe+?*rzxwI5)qKDbe&-wem 
z;TL~zI-yV3p}&Lv-VH$i#Af;p3JdSHpgsRwwefn~KL7mtX6ZKN`kMT~DoXFWx8LJ~ zI1$wDb7&&eB`y7t{O0`oneDiWxeHVt`uLOLAbe%LSZM8UjKSa2(A9%1&Ooq3O0$rI z?np#q*#7tJ&V3JZTQ#owxq~5{00tXwrOlsb=nIW*L`rvm1J7!q4|JXr3ajgjas!_hgXI3);|3zuyl%4kC$4+uK=xY5z>TYd*PJ{p|J# zzML2Nb2!lXbg}T+?|uMH$y%djRksJ`<#}&v?Zy6VyxVbGG*JsQSa!bL)i`6(OsUkf z_FZSWuSq;OMszH9JG`tX=>ACBG~YY3+~Jq4E~K9}|2mM-Ip{ApGR<5H9vBdZXau+CY1ltW$(z_JsK&Ct~B|z?A|;RXNB+e{pvdB zF6^h7w;FWtp?Gv({b(kpbH;u3PvQ-S+@D$U-!gx=M4*!x&?^_ngHDG^6nW~=!*$@u z()I7=HS05eCC?4;U)<+?cZ%VepY70+PM_w(=QHntBq{5TLrw~thxrK(q56Qr5Qf{A zBzt=@xz$LJJC^H{OpE z6W^_O|El#|$1~J_oS&Q-R{EP`k-up+ohHZ)(T4%T#u|N2_xUEd=%0mJvP)TyKHtu0 zr~L^EZSMD2ckOgrhG7*ZW;|ZA!T=SL{HSET^iGX3Y>?f3WPwxAcTBJH>F*?UscJ(9 zyKW3H{bn=Tn{4m8$Mn8e3~NeI?sX!cXOUuaP4HQ zuBD``8>P#X`t_5qzeZ`o&szLhC1)d+e{k07vQ-Rt8Yab*hX7`E*tRu@g|MmJw`FyC~G6b^x>?|(*BQNsKNw03&O3HqP3O97))f-bFH+#QqKesh@<0W;Uq@NB{*_dX;3q-vaCBR%g zBRKqSG-2^l|D>q5Lr^b(#c3LMQCQmPT}%8_44%y8w&xgSpYQ?sE_PlMTDusxY+Q7z z7aKETh`o*|Pns{mUMqGh(_>C2cB}vSR8ZPU$xFZnzC~^$N+=cXJ=Ja{8s^Q{!pp+w zWYQ)}#~7!WuLljraN=kQ@q83^1DAKLtu!gT_6e4&) zF#GPjCWbYlj(z&R_JgyiEFhQGgb50peWqZ27VvS7BwSE?m1i4UtN6oR-euh$r6hoH zD2kIae#}cz_}gWZ3fpwELIrDIlunamEcIKAmtA2IaZoIo2_ejnoYUpR69U6==RK@a z(L5Uje>yLr5j`b%t4?dHgsU~<_GL{0IF zhzwH?s@oYIU8>*TFT3s79vKxZdFMk!!qT4m%*B^uv7S_2J&jwwQ0+9x_`bitU>{&Y zBV6c5M8X|~xyuEhPiTpW=6z>%Iprkxom^51Fw+tlb8t6c=F_`E0hD2rQ-t&n4Eqif zQ5=_WQDbH zMV|!*QY11+lq-?_wSLK@!bt*`V2QK_ZD;}zq1RA=D2Kuw$Yh# zmXA5V-Tax^XA47l8%Citk0WfA<_-A!r~g6Ob+fsRp5hS`ed=jIQbk$p;Jq4W3^}d> z>ZLOZ%iV6V;vK$f#@R^uD!7WvfML$6!?W8hO}@rIvaEi`+Anl z_y(&NKK<(q$4l*s6|eEtGN4lL&CNP;<}S^O$G4C305Kf3fR1QTY+IUs- z5x=mByV=@WXi~ZB7iU8c&3%$p<0Pl$&wxuYr>Wo*;ARIU7J|-oM8cF`dDK7Hu8=Ab5r;*&!D>}F1y3@X|zP4o>!Tu__A>+&%vh}{yYhHP8F zu^m1)dsrkbmUVKEg(3#u5Yv<$@Pf`$yyJ)DbygSs4eO#)k9vy2&1#0RnO|6V|EKbI z$4F;rt?ygUdm_KD0$}UrlX^sXHB5ejZ!LIO}`&O^)wz`A@rp8 z(kJvZ_xfZd;kn28UEsUj)##0~_3VCufUt9s=yy-c-xkPgw*;wT{Ok6fXC+fnNdjnL zXR=B)b(B+4Ml}*)XUkj_o6>OGmK!_zsj-J6BHETXOX%L@Pudong*$cpmdoM#qDcjJ 
zg&aB%W7;U?q~0Nyz^$^seVYNz0gN@o6n=D0u=}?jNQ66R@>Py`H{!^n$ zW<`LxEcC`=i?$F*09Q{pr7q{ChFuX zgV^6NIE5SXF=({Z3rMAmJ##ulRK8Mg+Jbn>eG4xeD`4OJNsd1j@@s;ua!&id_u`+Q zI>hSIE(AO1J6fQ&fs%BJDZTk;XfeeXVo+e6vjMq4bET%NafhN-_ydN<-m!$@-m$7| zPKTIQ_+z-n*$5589lo5*=mL`*=f5X;XQ#uMDeM7Y#Sr^CyQa z)fegP3d{~7j;~oqwc9JRx2zqu^5aF9=Yy~IKSuniM zG=)cxEH)m_#c45r3~0Z!{9nR4F6|5<$@x5)!>OVqj%24Y`kAj&RIKDujrFejRV&E> z*DcEjuK5RKmIYMJ_ng*=)ro$X6C3?mL^vE);FN-QW!M+StL$DZ^m|avb!;garYrz!^z2^5rZ9#W@%HO?Y>#1EFHClmrUIMipu`6RqAbX2Bq%1ryH*qvsAE+x`_sc!Jcq?bS3PC=F z>Cc5hP<47)FVR}o`8396I_r31dh&rO+B8R}V1x}ER*@xRt#y?qpSaq+SvBNrr1&Ek zhhDD1ihD3#o2QM(OjQnsH42w$G{A0RO#U=VvO7(`rQ)1n6PE+6& z-q?&`)bHsO%H%P0xV_!Kf$``{0E-t&*iYHdfsH}2gUhyCIDby5;$|rmN9wKRRtd7~ zI@mUTKarq;Nvc2l4G;#F-ac#`{i$YSW!ra4SrGP-mloA6TTKd8IAaM?_{VKS|MlCp zTn3zoGBE4|L^yRL*b@e#{3mMtKWxEOfXaqog*}!Kh5zx|Ml+X??t<-qLsWd)!vAzB z)$0Kr>*Ts@H`#*F!l^kMz(i77#YIP{k7v=I&D{eL+s}JX7uR6_ificOa56*7^du6i{@h4C~bE+^uWG_=VsBPU|D)gLw$#Y+SCnf3un znzwM3KjdBNYFp2zDzzYy22Z|cy*H`0VVUG>=vkuP zxoZyID{b*(pab9uf%Lj&EJL=^saYJtOz4R(d8tyBNygZCc!nSK;y+k9sQLIAGi(VxxNjCOCtVl;PmT`cA`bENl7@P69XpYe3Mhdi5#?=mvqEDL|PoUWG{rdr%lv$$sr%Tl1p^`9NxppK4YQb!myuww_=UHM_MX_UE%h$QfXI}G| z%D-06n1-@VeHw5b7GORrP$*0zOmm-P>iY<^FFu6#XPV>Z<;+AGkIe=u@)M+$1Bd5X zrmblX;M9yPJBX=f__*vNaCuwYufy>5-aNV#=9wt7?v?;~{`oLDbgng!^>(zp=OybH zrJ6oEOc;+@P5-YP%^)27w5&jzco;0ZfYgx)ZRl3-W)&lKtFEFsIlsGgi0E>T6E^7T ztnVF^S@H35`QvL`6`;B?lAt}(2dg7f`g88kk?rLie`v#}{@Fv{_382l2i4*;u2{{M zwMnL;rCFx&3;N9HD9>=dBPJA$qeC3O>O#ZZOm@D#!ywQe(<_6m!^Px6ON6a5!WMc! 
z_aEawBHq&Zne6^a)<5iKXZreuJ-_T`iY?}6uJ#L~rsZ;C7OtwjZjnzi-_Beociv5> z`+1`mqHAzQXJG%>zM_A*TkXstyeqz@@6+bX&BZnMj2}1uef@*cR6W1Kp=;#cZyN&h zKRnsnsyLA-QFkuWk&57&BGfo2UQSi^${Vf-0?4JzD+cRLEuo}^clBB+&y?9i;wI$5 z=euIgn$g6`5`%3@4TO8u4_*Wcm8NLwiEnLeqKb9Uu6Kx;*zMqC)GXJ#9vDg500S(& z>@%yu!DJ=2Oq_v1x}LGJ+uGZWk?bYd2_bvf2sH5wx6felaXD-*sYG;&i!+9nwfBI@<*XJBj)YSbz9W|-`VIwd}u|j3MSex55rtL*=)_d z_D8i@8zRlt?ul470W)dn^Sb~H&judq>-vj#uqQ5(qhme{YBPO}y%JRl`d63_e%2pa z9LibOe>GJ5N+|}eU4075OtIC^{=E76#Cul3&0{B7!(2;k3?v4w;GAi%3d4NnJ1XAA zuSCaJS!`t0x87sZn&mo47J5*vl9PYFT8nKqNI3No`UpxJ?xd=MI-4na6?(Tb!YU*( z^I33r{dk#OjJThHDVW|ka_aQ+DUuPxy{vlXa|QCOD1#lWYJ2^%t`WnlE8Z9?^j1a( zxEf~mwr}DT`k58HD(%GMWntZUrm2Kh(}E?X!t`Sq_3NYXW&vY%(bOy12F5WlrS)Fs z>AWO0Z=?kJc4rxA;uR>>QwjZpUg~TAph&}MR%ydT#<76+?qpU3Kj+P-^X}%_ig@CT zO6X)^HiiRn^ww`})Xnw!{OXu-e>v6A&B3)A3H(aA9zew^DdpBXyTi6TBy|KT=CI_^ zgduQpE{{`WiB|m(=3nmc+J`-EY>S@ot>~@Zy}s3)Pb7TM7xj#a=~kq6#_0A9MGCrQ zu9*d6`ZXc(1!EyzgIQyIHA;MJsa&MPKS&Z}zkb&o@t0R9^LH8EDikbrS*`YRT_vGm z30BOgO(*c`SFng$Hs2S}aM4=y{HVSe-oATZ)380F6Hw_8QF?UqVxYtzxD?PiJSnmN z&@SYo9Y7xr)i^6vhJvVvhqV903$xtK#brg^)BO)^Ot`0beMM5gyY+97edUhR%*nK+ zYA}!E`=2nIP*qRh|6rL3A%=tdsQ<%CH*@555A5<=kMt$!G*=q_($ED-8FMBWL^oXd zdWc^=L3JrLc&>`Se?GeA)c)%edCTYn>l|HpS>jjx)2I0uiQ-p0M+}e8f{V!fMSCSU zu%s%5=v}15Q``GgKP2_Jy%kZA>!@&1iSOGYjHh>uc~W#(W8U|vP^ovvHd-JaWLEij ztoMedsXdMuGhbi4tolao|Kg?2N~q~J>g$j6Kzk?+GSaJ$g7JX_ls|S7X!J;pD1Vq% zz)ZyuFsI9MTo32*I9Ow@tbW9giym3tHgHlV{XLR-t@B4ZWO;c7*IPP?uS->Kghn95 zrB5jk8_B$*Ni98rpe4-|T; zl=R0k1#<_DSMb2MssQF5Uy9^U?)CCIElSn~W zz4wHmX%#IEphCh)p_Jx8auI5DNgk6`-1>^TKrC|^T~YjV*|+#@1J1j+{83w-$z8*2 z6wUE3kezQ;>70-Z;n8H?scS4Bd>J^{Cs(30Ku-wg?;P}4Ts zjUBEiY&69(?J;T3JR6ORL}*l6ir?D0)9Q|0IFc!^if%Q{ptW&^4m~x%o&b(k`N>|4 zC7!<8iTj-1;>rov^DhxTKmdtAR0G$l77sss1s_cT-XHmGIy>biHofLu>JpkI2@ld( zIt^AT5z8N8tkkyL+CEY4yK?bPM6}6mMH89oQ|m^`hZ)`+AjIIH3axZn9Br`b4prDG zP3m=XXJsKS!nKD^QY7acb3qioDg$e_FPLdy24dO2fPSB(n@Ry<1=bPNd}0A?Rk?re zDTjzwUWY~5T%FhjNt8?{@;j2}QRM!4#kUy!Rcn{qhD%>cTKbU7#f(3`yO9b9UrPWZ 
zcHH*ChJ?DBHI8P)8hQaq`XsS+!WGv*h0{q1OrDe0 z^f(vuiI)1QK>;46`I^r(`=im}rUhM$0v1~ZK*#ZuMO5-74W3xY_Reboi|Dd^Zprb@ zI(;l#mhtxb!G_eA?-JhOxQ_Q?IVj2#3rdaSEotD!Iyd8EOr-c)rcP`Kq-TvFFJgOO z6+-cv)wn&zdv7F#ScF0=H2JYfge(F~`#tFb_eVKvsNL~0xj7Hl;_P^ef4WL05t z_I*SP#7F?2;NRvfUo5uzc9f@I0C4AsebDf;#5;{Xm`J?_;Pw%30nq`t$C|&ViA628 zP=8Jv2mVkW{vmjv!9ya^ZZ~N;XI)04@*3(a5tc)lW)aIi)u4`2pp3Wb_}daA|L+mk zeX4{2OiL=3aZH)FxCDo+C?Q=?+SM>;B>!#6RF{LoTVk?LxjAJLxh96=8iCSipLeAj zi5D|hX#DLrI?l)>02k)PvB10?tp`u~Y5%eJOgIO$!i1L^ChTTRDY9rC4AfG^vI0;u zja*p)Gx}sE5YrezZEt`eAQSg?YM*rwc~{ zpzQT2P;w}052BFkIz;@tL~>{Z{tddsGkj-^8ih4_>rOivH-y6zWaV0xzgA-fU%SEQ z8U8F=SnUoktfk>|1yC z(OS9OW&0^~z_mUHM%BA}H@PU8lyHp}Bey6yg-Anu+U+oA3x5DW*bwApGnC_9;VQTc z$Z1em(dUDTU_UnDPvP~9d&jf&$(4q?9QmPY2r@LnnB%(GM#bR&zrg=(J_cfqf7U-1 z#%JknebhIuShz{!a2MXl)xX-!4s!4wAHeCj`^7#DN`<~NskrN^c*V#jTmz9zO^xJf z82{lF{3Q!s&iz7d6T)k9{3a{cL)*kQ!{0q}rlNm7J4G|WUwH}eGG9bZ8qhtTjs7KZ zr=}Y6YJ<+%+3fgDl~L1At_Cci`y)EB$#G;<5T8MG4a9u7CEV6F!)hhwZMisA_7>yY za_%_S-9hN+Bmnm*WY-z5oLj|t0y`laLeCJ0nnz3siTi}(EpaUWhf51l z&yu~9I+nL3+JX{mTOed>wp@@cP82-*5^?E?sT`NF0Lg9rP0`CXpsrz z+LB$dQnYf_jqfNVo@h=1U@2M1E7*18H^Mg|(D@V)*ArSfw~oqHaW=$!q$Y5A5P+K< z`bnc^y!UJgX9!RWS!${MxHc)#j7t ze*knp(e7(hVJ%RczMjuEl(@@lIY~R!_d`R9*b`Eat<%ldx_u>;!2V#K8G%QUK7#!f z${TBgs|pXpV{Kl_m6mHD)CGnMC`7#|^ZzPts7kN8&a{#vZ8MJwCS=0lr=vc6wa~gi zIqeVT-ltTJCJpW_VJ6e8A*f3%aecAO;$i?g{DBpYo#N`F9>q+yh{O6p7{v6KxND?X z7HwS{$qLa}PdQ^Uo279nJA^_@TP@Xf%&csc)!iK$TX8; zv1HAL7kc7ShFm{h@gKlB_K|-O7a8och90_ZhiL%|@lcl?Q4wwn0zBs$;VSR3$o)yAxp?k2o!`C~NLp5Zy8=zMv3c?nlUM4C6l&>f|Qd*ncJ5_@o5;z2IxMBe8qs5a}> zQg&cn`~;<-uG0XFC-B74bQ-mnCYG$i8V_n7rRru$oV(qo>PAmbB!K9g<;n@mE@gXm zu!D)rLJLK72Ek%W*@@-SAM;O?<+HERa!*Dx8{ULTE^PvvXlco_Phz9aKJX8ZpO}_X zsNoG`Eq#cUXA*4NV7QZG65P>pcPBBk2;iXM%|>2I@wHg%FO4Nj?L6c^eN!WT4Q-^= zOA~c#nm4%p!E73;ovIO!Xb8^}&&~`7$4@-$@OyjUWcsgRfrx(=N3Wb0{l#r?jPHHS z7q_AXa>;Y-L97+|dZWQ;TbB1E{wJgUKUp$SYsCGiUt|7-KaRV9t&%KJD7lW2Y(y~> z%~3;EgfZ4U=SziMEmXP3GJ;p2(TplSp)LA6;gAN;DaJF=cQ|Ao@7#6_qs%v3WZ>OZ z_H5Ks{;1@NqRA3cjY<+7ss+b+DX6<#>k^wvBU% 
z$rwf9_DZ~fi}$}Gx^t$gt2|rKqOf9_B#uj_sw3vH*`u&T!K{?JmBWhox7 zuu|_r=+<(mIF`S3<=Vq>;->NKy|;mRCl*WnSVaNREy%~OdUD(+85^a;>cio8$a##a zkB7HNYWRnmn&sr8d9yfBe8AToOVO;Cm2Jb?D9oR|UqxZry0h?)7)lzWn+Q2viMU|Y zHldghk_hf;3IXD7A>Lm=O34EC&k-GiOOAd0a=2T+n;8Nnmz*QRP7Xp%8yXLT5*^bVHm5!=KcJ$65)|u)T$|xEZ>RXf(A|tuRvqONz2(ay6@cdhA zWazKPPr9ZXOTvL}0O% z?2WNz0kIXL#dsCl8E%7j;dV9gt{!7S7W!OYne8kLS;}n>m6ylAB#H}T+J)}R1E(}2 zSBUPdo2HB62{05^BFmaG{Lho}q@mqgEd=>W==S^Ywb%BcdoSb&M&2}XH;;x4G7LY+v%I4q%;F-hQJihO340w& z;Wp{)Ty=fCHxr}l8>fRs4wtm2VfV^rdIyeYP$M^zpw)7BTgH(0m%3ngBPfzcSr*IQ z>8LIn24W5ha&pjMs5nA2j5n}26wY3+ zrqCTWKEG+==H^2hBA`V&yTB4$${X3@xY0clsh-<+6GRhG9IfDRt|N=77%ZOqAr-f@ z{zlET-y=Pj+TTMqPM%+O3W`qzHl>%zC1;;^om9^Q?&__}f3qH-v ziy1)5s5geK!2g3Q_M#8P)35FzZuKbiGce|*&ZOjy!CN8YLntWbW!6`m2e2(H?bzu} z*hCCD8g$nKMOam?;X9)jJXbe*mJn>ASVf9<@Dj(ih;&JE2t+OAY}_QCNaFyFv8H|L zovR-85;DuZRcT5C{S!x%Vg7;Y*O(<>J-%rlnA*CWl`}1o?bj|{o7Lp4`)b5UFXmmM z=fhX#=`uYRdUgeZfsQwc^-EuhrhC&WzX&Fni|;!Tv98ptMYE5!|61d>qW(CR)P-|c zOkNx}DmqCv%dF7m%);#Ap*WH!aO1sDA@AcVABqj4uOTaufNJa|kG!G*(NnjUa>bP; z6Pg(_zB2`|7p15qryqV$Np|5k$uIhd9X42wW5oU)l>9NJPoINkw`--C>k_HXdC>I) zDCGsBciUkWRZM8U(EV!EVq4E|lrbvNkNs+GDymGmu& zba3z#QQzl5RnJsX7vVq0rH4S00Ou0EyJ-we3MRGsh4H$upq9wz9ECC=20G@SIfu;5 zEM9Ma(AWrkYmVNGoy{5PL z^YlK^jAF{kh;3@5noA0C`R0A<`2IBgKgLF-gMA4+x+j`%kIlh!P4%sEg-Tl9X$d;v9xXa0^zBER}ghUEuEy?f%#*c;KiyDN(D z@ub~owZE?ZdhXkvDkl{>IMZ`RrE0q|t>OWh$Ui>j@RtkS_`0s~UL&#Q#_y?Kxh1@% zvv2oK_O__~ShXrnn3-?>!Z`uazPrXgo%*|+XgySFq;v_?vN(_g+0|5TJdL=C%6YNclX z=qKsv)cr9Cc@U=#t)9n34}Ce%%QQHeWI5N)_6gTDI!l;%UK=$c+cKvAJ4uvBMu^M2 zbSL#^1tj(GTT5@ZF#R{DqEc2=_1VJ2Mb`p3!bQNlVWO2lZ601T+nA7I28Jg1spwnn zR$JS5P=Mu#km?dSkKRnv+9LVeLsq0jP5y+FG+<@kG)UrWseVv%eb#L7VY440Jz#>s zDlVb(7eVk4TP>N9hHdvOGYYmjaF(`M4rIPdRgYq(;S*aTX*Pd&P7K#lxpr2e|CMr! 
zO`$^1yXajxT=pCNf*3AiUr_&zi^aGSKSdB9`r%WUIVc5xP#|nmJ!N*m)kHj;>)16s zlNly_H@I5tK_2G`v=86Q03}1bB6GX*0&npNw4R?9=|_hnA6~SVjHX9)?-;PRQDD>a z!(T`2e6OOu37^QIW~~W;4*1uh!=ST~lPrE3vduU~8bQI_T<#wG4iVjDjE@~Zs9CI` zm=YMw@_sym2Q|nyV4B--r8%2Px=uo2F82dQy>+mjhEG$B0rCu8+G|5qR~d(*UwkDa z;Y#*9WUex7A{J2Y#EKk1YNzxTG4PdMM?nFx-QdP85zn%6SuIUsg-TM6!OPk%*tMzP zn@q-Kc5Jx?tocbvr5#G^o#}eexL7+hGP8|vv00PH(9&QdmjN2s7CHqh#baHecR3ef z6Iz&Zk#m+nmCeY==L2Pt2{v*d?<&?x`MaraqaTcfjJKE1sQA0R&$lK82(CM$wuOrc zTB-;#*A*5!Wj?)8FN?NP#zKak-jB!S z4S}*ubABpw`ny$SJLgS9W*?%mCdNmbNpEE6=bE%~l$^^*Xyw2Abg^+&n+ifmKMB$o z@H!vfJDW;t_`CgS=zW8{i(>N|h=wa0Nm8aWpZGk}qzbSg^ zg4}r3Q}d-|R8N33;;J;+wLlsgY&t#u?J767f9!o&Zk@bx$Y40uQ22$%w_f^4HzMkX zEb>t4DgswBVCdj-0iD)^khJRACkYLTzpmfP#yVf*ikqG$rqoj5*rSY_i__jF%t_mq zGwLK<_J>I4eVsA;w&OMaNo;E;-meZD$2-g!rBUO=UOG;806{gjYWM)4jzCA2&#h!q z((8K-Td&di1vlVBBa^PVQL#<<&LfZ@6dZ9458I$1!iNMtu`4A_r=T)Dn;oB^UIq*H zPiC1hdR4Jkk3iHbvfY(k8rp_Gx>C}&xLxIQpFOt4X-h z2JP4}IDPFMp@M6WQ9g)5`@&cRR7j7)ny`=7e8{H`q!>u~)4OF}BrA;it!rm&N6+PM zucPG)S_rT)%vjkm2s>#j7R+oocO^BV7%UU`c#brVrxL@@Mzm(YUk=A7xL#PXZPKos**Us9I2W4s>EM;f}2rXzrGnl5k*zW z*f1jzgaG+=FgRr6!0A*wWd&4~Mfp&sS%o;T-DRa~QN|0McFiJg=c z(?v;kzmW3br(#BM;SZMan@hN?Klns~7|#PpFL%W9#skPVjLWluzv9}ZgMJY%s>ZE) z_~{ooSd`%hi{kO+N82APN9d)_H+UX%4b@?9J^K8p&%mX z_?8`~%37hZfrB4+T!pcc9}}U!@rrn_q{fM*wMsTVDgzzYAv1;z%6bt-zGp3gdzu(a z9&e7=Lnn}~METEob&MEY;XE`@%<7ApdI)pfRW6xmHh!Aey?=_>v34q$KUl+-9osyg zxy~4d&Y-Q3jo%kf^SaIpKM=jn83m^+?(Lk1Y_XbuK?y~WlRy2D`!Xt!z$g;pq6D!T?>EkISJ)GZs%ms`YY zefj3`Q$_6~k;h(@yZi644dE|XtfyW$`(KI15ie!E?OIF`yM)Zn<%(fG1N=~Rv-{jW zr;4e(uA}CyG15sCqXr^IOq#_;ly6#?;O|tU^Q7V0H)hvbBq9Y%pT+YqNAfUKKe?dQ zhQ-a4&k3Aq3Ioo3p%N`Fi8IbQv^Ij=Y5%k|Bmev2bX88nPBZNb>V^D$5hdcL#SOh^ z3KAheNUH}_&sY+(k}K85jsM9K#D5zm^P~#*oDw5&F@iR3E3ulfv(qsi(ZOqrauF|) zgc6+%s!{rUGMovn5igOFfNE#Vc-hGe9Oh6?se%4y57ymiWAt!Oz%J_^;^vgC#cI}J z%nCs{+GRaD)(d79%X(seS+U@n%Hh8DGnXAfV){9>jtxAv-~GZOCA*6Fj@h&>mAAcm z7E7qD2^K2Fl`O8QU_Ljp*n)Le9DPt&Qov{}xhJKP0>5gF4;IUwM?VQ)_#I@zPz5c; zz?+S%DhE_vdQ3XgVGO*@QM>H)P%DI>6u$xfj~RcT0!6eg9D5xCq0 
z6wIlO(Q56{BWF6zD_#TNKV4V{W2{P%}CUa3gbx$5|q*r0Zoz3#yDpn6&%++kSPQ7!NYVI_;((XjCyvr z?VkymlfAorc(S+1G0Vn+!q+(RC9?URF7nawzr@t!D*IzAAC@Rzj-5+Km)(0fVl&M@(tgdW#0gAJ&;FMYRDWNX4&V5K?UtDmJpa3!0$W0S0*nKri?mU-FNcPSZ?vk#r4-8_ z^9D6O*P1?-^>j#BK9+fK?6{YTuv`f79@Q#}XA6Cu3f}S#rb}N6TGAHu ziPM8syP4x%!7BK5!RmkWZnSyMQUt2agb%H)Rh=uB49Q%CoXK1q1H+{Mu%2D;Ck#aS zFY6hsc6Jn`_P@LuIaIzm_q}{GL!&de>*)1Z0Mj*oU@?tyEtkPLdh;8RJH3A0%0pfR zo6;%eyE7t=^z%0DJN89JrF+}~;M_Z$bK(0-@qvu815Io&ev^8!iE2OoFA6pD*x{xv zBRk+5Jyg0dG(u5b-XHc&V7^|E_*|loo<261jqN!AMpZ#90<{;U*)E|{W6EPN({&=( zZgLoIvMw?j!WNgpmQAp#=Byx=o(iZ{>j%AL(iURb4&TCNOXuqcZ8K4vn4fCeM@E%!qi8<)lp^C0Hmjj;z!0qnmR)yg!JBIL1^8oxx8k^LOCQYmpQ3mup%%bCW0gTig2814x?`>(w*-`X=x`t^%eFWK?{4;6r2yYsWe57S zfY1d};T{&rvvjbsf0cwWO|y3=tEbG{sr$mL`m~F`uE`hZDzD_oBO%9W%O_YZ6aQ_M zztq`D>ta^JI~}@3g)#LXEa%9(sI9h)@PMDj8;9^(gHWq1xW-if;K0LU_%Ods!?nYy z0lLL~9daA8Kto#h0eE;{Ie}ALI#dwneFzi2oabqd`cCxYH(8**G$c!wtu<{6RGi?a z&>-uiAz9n5RWYlfCtRX_GL^)vsKy*l^b{=%WdHL1eSx~+Z!n4^pE`{-b{R9il16b- z$?Q5@>qr*ZXKbTV{Pl08U({e^7@nlp`jW3Xfw@!I06U-CXtjsy=#z)*CX{p?XN$oO z(}mmty*k->H;2O}tRs)@u_n=>Q$feW)u2}4sSBotvqXp23RRV-ZbfJpag9?7KYR-- zG%?A{iSnHOII`;TfnMaMa_`6A5~;-QvSA2)>M&v0~YAmr;*iurk%E6p%+M8gY)VA@A zzse;by_FSX)u;`t5TZkC0J0TZ$Cxo{Q|x=AqQvGwmMTF9-R?oxFRT!bRHF092xlwh ztvP72ZD)22y#$axFqO+)0NAvK5;h^k4ciC2nEIhP;^k)C9asvv>3`{cGD3)gaja3Q z#Q7Co;=eq(z>{gLdQsJLnQP05+o8D4u{Ox3ym_oVGW+oD3SwEHhQIT&O|fSyFn%ql za;7T3S39{9I#Z>UBhi7?m(>!gau0|q%xGzhdI0d`7I%Qt9{_KE{nF7%uAEuEAh>{7 zTC2L23!$CTe&{M^^?SsRS`KZ)2w$+$TvoP?*?Jezt3v0+-WKa^oDm^I_}Oij1fQ)ij+=iN?Cx6vqvDT;n^ER*v3H_fNvw*I zzv+w*i5lZEU|FeWm|xd?{!+Y;wQhozg_Hax+danOw7NTG ziRK8BW`h0C!RN#A|K;Z{<^&=dH|QJyXbXIT6BF6Mh9OQtNhFQU@NzonKMKX;I#Pq5 zAAEwX3B0}-yzD?I6tkrMqfkT^4%gVj{53&$1*@a-awc)XvmI61+e#U1M?7;s)J!Qw zTrRau?ximB*$DruSV?WR3Ql_X-yEI)7chIL%70(IvnZu<19RXM2$2Jcz!P^y*)QOc81}% z1oJSh6P(CpRX-xH(C5bj3OuSk&XB@evVlAqu@`&_sx98}G4#5_2iWjpL=8!Un?Bb~zb zHMHxDF|S-vYK$?t3R-YnnHwk_u5_q*X-xuPq&i=G7e7*08e&?iGDAU0vTTfD=2a{K zzLxQ+c1JX`0Vk$(xdeD8>4(@SYSTOzO{$m}SL--sjP+mptxV!s8ot_5*Jr&5qKZz? 
z{-U;+Q_GxBF!@HS>Qw+MW3s}CsbU-|>hCkM6x3kz2Vy1W+AKrDm<{Y8E#;3=+cs*Z zqFo;dvqg9=H8ANAug;(&35VhXj6<{EP=g=2^RXGLv*2Yv8#s+cH}%-!G`}pA<||9G zGLlk##QF076mp(HO>Ju*r$r$mHK0I%5PBq#3WO>x^b&fN8l(vVDpg_uBmoR1lz`Mw z6%JAa1O(|FMM1D5O|eoGL=bMyz2}|}=e{%Vp4of${y%H2J$v@756^%7o<#;|eaeee zY`nsj1n<4ie+;OTuhhjMD-4S7DDags5C+jH5Yu+Z@OMK&PLr-#H-; z$vzCfAFE`4L$TB`AoL2z#VtP4aejGQmh?nX6wVT7%$?X}hKxHw+jbo2v+OC=V?3D8|Li;GMz1IppKyEVmX>iDDQH=c z>$eMlEEw2l2B2?ig$zc5WIU}Cf8H$=7ozvFWeF2xJSWpYCFe8DP>kz$^fKf5 zoZJebjqQ~ejyqKBCqhgP0BERTL{oW`pv=7wW1U~ZN zlY`^sUR-fc&Ik1okYZ2Qn z6EH0S@w>&G44oXreOcmAwyJWSf>lvtWxzL4%IooRv_ zr1o-ZJ5o~W`$`Uqy;}^0tLArP{rq*|VHF>h<}(#&I5D{QDRrJu; zw=sne?Y2$}5Dcjd*-cHjek-um0+w!TY|gEgh1z0mb`I@7I^pf&A%+;o+->8{cV6d> zAGqzx- zcv|OaNmPvIN8(5whO}WpC`;;jb!I8UC3idZ?8vSD?*YDYn|H%U2Atj5k2HAs!E=-i^BMHU6CEDO6P0hLzDs3CZMafC;IBE0f^k87Mz1z68|5spoXQzKkqrr|vm-A_ ztFkwUBHh$)KfCf=G+NL}h5)Q19p&28+kD5B1L@|?biCET10o%a$nXmc81sJuG$)mK z#GQnMuM*#8cyX1VukbwC*q$4c2~llo5RgszzG2%-<~T}vByYx-*KEu2`glx-@wI(+ ze=u}-U|2{RDjeF>s`R_#N7Kt{0Ye}r-K2bAjwiWE(db${PCG+kqiED#nk=y6GZK+( z!Y4mdQY)}rC{0q=&w(5!i=VB(o=S&8g-3UBV4{Qnc`PJEi0TGYxY8^dAd?#PlHY>m!B3xMCcFHAgd^Mr__r9tU>U z$^LrMx-jn6eevcqq?_l&bT?b`#e`g`T%Ya_(@O#awk;^n;wxAuDb`?Sg$NiRwiWrL zVO62ZbMk2;%IQx`>euY$r0;kBoAy+DGHbzm{;mc8J4j;gjN-0PoPhO2{{F}o`6ujg z^MlfFCdc^eN6IX7(-h6DmgHBzLbF=_nFdh-@!vk(Gj**aMQv`Tn!3gZE-3F%hID_< z9qsmtHht2qq8S86h(6Q5^gS5oHL?0u2{XuTDzUO=lZ*;gdDNbc8o;vkk&fAp*{t5D zX7nbaXvmiffp-YWDA=UPTxCXC7bW@Nh{)LK!wbDU;k9fY1*q-gD4HIZKo6S;?4j~K zP7#>N$am(|>*Hq25X1l@pSAx7G8qWn1}l2Weu?nr-Vj}(@F5BoV7AXr8h~nrjD!?@ zQ??j=k8Ou+^eU~>)1*igOjCJ34LK10g>n*6up3-3deR^S&E{KEx=HW1yv{8|J1T_q z9^<3WoFb<9di0hk5^GB9LV$WE3I>2#GkvrzGZsMaAS44#G^A~l)k_q5?2rb4qsm%rHhQz!t(<3Tu6!0m8iYXG^pOVO&JDjO2!jhG*l(o=-pN1h<7qz0R(}5Z zW!Iqb34Ih1SoCQJ3l8)^cpV~X`OC`r2lz1?y_}sCniS8)x6buTz{(6V+toYzInrMR zEHTx>ez~TgU89?v9_1rNp9N$oua7hNYjSMUV=Tpx-8#pE@iTN42tfcpa_2#W+WY5K_>9lXz>krh@SLNKGH%-v3L0mv*DPv7L8S7`VIw+k7|ttobeuMO}&Pd_uU$MYBARbAPq&gX(0u#xa|Umtbl9$hLs{% 
zA9VRcRM3E{*Hz*h-sjiN(5g|Xo(_RSXJvNbh#Zo?{oTqDHb$tC8ifhI&zY^3K1s7h z;mjNgHdY43122nQujl)1Upa$nfKAQ|y_IfIVfV}`w;<*R=v94|&x1Toi^_Z}loKey zd`+|$`EIuss4PHWrI~ZX8vE}WPq{#stT`Q|(V5bdzxR_lgrB8fJh4ZAlW)oWHMD+; z^dERw`f5>gv^nQ;-4}~)t8uR7v=2gm%)4Gw3D#sjw($UqIJU8R?|!pmfHRdT3%yGM zKzrr{lvttCjJ{7DD*YwrjJM@-Dse)~_|cKiEo zAIqD&8b7rEL?2ancV0P>>PDI9WA`uH zknBnHwXyZJlct?whbei=F*~`W4XR&TMhoYm0^_5>f_tCtSWIvxWrJ$dQ}zPaY~PJ=N2P*dFj(iv=%BPS@O)1R)OyN1{`I!wyp9LfMX? zei4T3(;Cfyx?|^ap%rQVPMPtvqpBfBn}j0(lX(j>ow1QL^d-|?V>-#n_~!tNBoPwV zp4I=*SVSy0j)O+5b{E@ISh91_(nfC+ip_O1L)IB-yPc2kff8u);&2Y-B*~T0PbSt! z6KIWs9_#vMx?}UisiL-N>Zblx-aW!E!jaQgt7DRN$J(X)f^GK+Lo>D}c#3ZM2}9k- zKKs%-W*NjWW*?CWQXqi&m8B^aL$15VB5yr~hnf26fJpZcmWr;z1ny}TlbER2m=G43 zrnM|51dG6~23OF_8k`3uUBP`%&{Q`9 zu0I$xm_>THWnXiL_cBUuX7E_5CZ|wO#j!-|q>gpsFgS^VH6gcI)C31g3|^r8)?^qK zq53q9GazBD-aHj+7hdjETU0sMDsR<*f{UK`)|zY}Cncn=iFN8PpQy!WDuV{f%|I46 zrlE|Gt<^mt8OjZ;bUF8_<-%cD$%Cyn>KIlSLjqgP%a!fDf7CJ*;$so@H(Ol(`PQJD z{=WaYvrjI%TJ8JSG)51;5Kv1DaGU2_gum4gzF_$Lgm14CWQ@7wrR8udPbc0zt(uuE zjW20M3N31FEvv37%;%Q~uFx4LSLoHg#6QZJG{3^5%Sp`hFF>E6wL=mr*ZME{Pu+aw zl(PAksi?o9@F;#z;SpffarNJGk;a^hw5Wj~$FHBn8%y7fbr*WAV;?Zdg~Od1YJ7~s z!pnp$G+X!9j~8C&88Yc=HV>4+;J8;euKSWk*ToXA57~4kUnlV-i#pAKx(|V8UAsoV zuH7ZkS$gh6W>DW)`JA)w(m5g_Epl(vGl|eZFFDbd-!=P>_%FVAD!>{f64#UzSK(Ea z*1)#$YVM&$47Ud!fv_9;u16WpnV@Z4c6ukMNLTdT&Q0o<8RruT7I6qkoS@~+@`}+F zRpOJE*RIf;EE^2u%DslEX8$~n&>dj!dD=$nv0)-`(c*{1X59254oO_TCK|FO8> z*4Ga$x?*s~2s^!-mkkp{jo3|hLxKJ-V>H~W*j(F@dHLgsm!&$?obOx6p1mBFHf|4o@8d2P$X{4T%-0T_y&XXZl7x@7 zZa&E$AHarvG;X)(;=Z%To;YrpP$djLl}A>q`p|YiY1|&6_$_mMz|Dd7OwPTk&s=cK z9TuVOoRyoboW~x}iZKbx$)f>#*h`XO?aFp6b_$n}>ss?N6y68#LF<9vsY*tj%F{k# z+!p0rrAsxo2@X!6K#5SV_z(*9!ZBj3}G$CH(c!%v}8Ct&I&&ZIPy5&aL9XT zXCj6LmA=_g>|PQ?`($pHSWN^uZ#H!YVwSd6VgTv>2l`Pn0r(7Y*@6wys}+emhFc#u zxm@i6@P^v4843^DZlVRrtM`{R!>b;rO?mD-PtixKez}7;EE|yq#H<>H;hXAl83sc; zqu-D9Kp%Cr&d%M@!WlO+v~WLU+>hkQzmLV!jkH2EK+FRz3^tv_a0b9pY0iic9> zTPDK8KnlMp(B1K)^DPL{o|p1N`j`AcS$x<0JaTw(8u_iiDk;+0JZ-H7KGEMLJ8*-d 
zo7&y7vS|K-X%6mg7w!ucZVMIeOKRZ&0Umw*d-8)U^{MHx8dpm+D7{j9IN^_St~Ggl z-)jra14a_ArE7Hj=^1V^J;RffzdSCcUiQCDrr^m1vC1U83p|n4P4@gWBPys zl=7SDu^sd6V_Gw4Yb3<}Tt|9Ltg(6k=e=)o;Qff5pAmDE%X9N^fX4dyIP{M*i z9GyhaoFbKLU15N+&#PxV4fYoyd00Ls_#x7j2bXKo!p@Do@s~3uuICHY%PlMXa%m+w zj}k;G-R!zJaWLsm#XQg8zCReS%U4DTF_P-J84Q&Ft)^-xW}By`_gHb1P{--|EVG-7 zE3G0-t>pWyOzT#0x16gc!)KlUQBkUgFbX3M%b3 z44_T~5pIzVbaSAKI$SxLtypEt_@Ec{noz%>py5Ay52kio<`nzP8AW)KQxE%EazXo> zxEYy*hSL)kv^Y^meaz^^iZ9U7I_zWR(Zs!{Dp?nV+sz|BIBhYP7PsiEXcras@CsakDdn*^BYX8~OP<1z=GNbeLM93m@i1=$tdh2=3!?Q^;=H*J@iy*d4d34kE$q&_lv)*&KEkF3;| zCOZfY4;7KDI+NKlqB*;gKIiiJk0-o6L`SvM3@I3CmNLB1^Q&U%>)WSFYr1>U`U#pCoXp`ElSgrFapQa>3 za|cT57I;#JZNg%F59Oz1=)y%Vv0ska3xwZQBiVlelNye<8fcpOQ8hUBj|)Onzn;RJ zm%M*g>sPg#m`=c%5D7`pxkb8vlf>TES)am^{vQX6dV60>RXpD06N^YS(=D%idCVe9__0cZaN2 zE1(%-Xm*%`C>svdoa#VgKaPc?u=Yrl1`?-%L1&+EQT~Tw>oOj1`S7GPfz0a5Rdfr^PmE3{InYb`q@3@rIUMKpK=b5n7({Ebor(ergIoH=T56(#zg~9J0_w>l! zU&VmHAn&{nk2E)Tv;HCnY|Zdj6|bJowSdF!=lTU_e|9|oZ0mUd=d$~&v!=`oMClXQOh8cU$Gp?MJJE-#3+7 zpsV)lsK(VwI`|D=v>I0X zdH=ES_tj$I)6J15O;?r4k&iE~=keaRys9JkO#1O5EGn#`NB6ryJeAsXcg#+=85ZhDjMKde(1Wf z1E1!o|IXVpw^7*IIt%(;UUty3c>l}D$|vZ|{dD%vGU(}q_oh1$uyH^NgMPZ65Cr`u z{PW=dXE5dAiS*|KEtD(${mfzpXNXp-$${_Ly}IK&O|`m*FMrau7GBN0>IH>VLH<-m z-!Xf-6xSBFLkfSZ%&JaLmht*Kz`7icXOn0jKFzupMV8GZ4R9nJr)X%!c|`P3aggr> zE?$3%qoFe5&ciFT*Byp{_098!KC?iCf^bj_Ml5gPMJup`Tx^c9Dm$|Co~!E*<^lxA z+A!70YcN=2wYl&|iI%l21SGzA8Rcf^e9XEdnfH5Wcn^GJ+VM{12HH{9?#M$&n}B_| zlg1Hii4IFr=brQ3c7d6Wm#LBOX64fLfHOsU#00GU*_?sQPLzVeMIp@YGOyTj^pcOu zcebFBi4Rce!;1-Ki@E@&&JR)G{QGmmi=v~4RH}#Y(UOEPv~CP`P|gi)cX8P<+_AEH zoy~bc_W*w6vgUBH+Cf}sPSS03Gr#t{w zYQ+>g@Mhd6p$LgFC^u+KKKB}k+G(w5cVUWxz5 z=OY61#Pou>Gnu5|IY{EWCFK%Y4sU}Unw!Kc442Ia>C*L53JFa0I)%Lsbg3wj2&(al z1x9S2&2;JQg>Wj-0?#ip$-xWgrlnMO#@NSV;k&9(G^sNc0TZ1RN)1B}nG_(5Nx(>4 zcX+v+<)sv8VR`;`rASG-Y_@b%>Iiq1>P8t|Bh(wF1Yn!g$fc=jXMu@`pQ&g?7sAKA zd4qx&p=s(X*JLtDc^|d7p=oD9a^6Zwc>pO6Dg2#+C4nWXA!T&tKt2}5CX?AJbbcC9 zm;&G~0MBMKAkr0;{|qwHd}uJF9HWgb6i#)dlLR_252tz%pYqvNcK7)9C+QFOLWbcT 
z$8?-OO}Wa8^j8&h(_wjd3vNqQ84hzT>qg(Ordr`XV5ET|@k<)s?V%G>oxHr@ok~cV zmb1JCII;|PY7;|8gSj`4OO#@#XhMUx+ah5Vu!kH zy)K!h%pFyqEwMtp#_X$IXCGaUt?u)iML18}I_pu&dD5J8xYINPku|TCc~8W#baS^| zuZ=TBink;eO!nA3X$1Y;OBk%vd6I0mfE0HIgDMQ;KAJCJ#h2pMte91Z9rtM-_2q7lRVXvsgG&jH4KoCPs`aFDN9vVR;@Vv7zrNuS{KP4rhn|6zUkZpsAD-sTMk1dVB5DHAy_q+a z5AhNC`zOXZ^-g;|Aq|V6Q3%C9A&-~#6li6Qxb#oNTWsssye3AMNTHjW2vMlsnS3sh zQZKj$3Jl3vzVxdJ-vu^Y-vc7@WuPv8A>AP57G1fl(cdyL@?UfVdp7l>?hN)O<%F%O zXGcwYZm-X*M-*${%>>*wf3O|_{dsjx{iY^~qR-UsJ^!`iCNyZL;;oCeWfkX0(r(Fn zukG|&Dc@f$gu?p}vm{M!I-6Nt@C~g?CCN(}5F?MluL~&s?E$5*E8GV&3MF*%V(86m z`lr{Zg@;34{uK+O@0tGA0~mX!VULc!lqq~v+% z&KaE4IlRV8`XUvyxJA0!BE6s0ae)(YM1m|1rFAeZ`u7QjJ_-!5UFuB6eicJqU#O~> z0U=x9DsdZh{z`h}nU(yK7D!h~?6Q>wJg7E3O)OwSPGx$>39yh18k=oeZk(SCQ^<04 z#w2ByS=~iW6nu-jd@qmTv+kN@X>~$Y3Yi}qo-MxZ=dO;FK6NNdEclqncSVhRwC6>| z{*G@f9WJ2m#$OdvW>8#!DJ5HQ$z?Wj4f<4uz9e>0>_PNp5r&)3EDwIfTiJcbt9W{U zEb_cJK;KLGuHC(hhg^L3+D+ciO=dOa8rbQP=+`gXlU2ZHI>;jMxFV>yx*mDv=8LH~ zccWVNe!G?1HYSPIftTpLI=O}UFVRr*B<@c6#o{Yk0fwjKZ;6cRk1iAO1DkOX2_U*D z4S{Wb5keLS5Rn*WSG^@461C=$H=%OK=Bs)&S6+HLpnl_bshou^m`Ocso z&qLQS#d0Ks_Q)v{1~!h@0a zZnKvMBMDYgvFUDhFV|bf8Jk1GDnK(9u9-dERlch{j4>>lJWe_V9~aQM7)S%bDrsS& z=?4`b)4dG#RE$<{mpHTpoC(5nB&+O{4cxAF322@R1?mo0<0p=%SKAc0@P(aa9<86! 
z&3$%RIH*M4BWHN`f<;?8p3{qv4YG8QAb-Tn(#;KFgXnYOr!5HkKLFLsfS@P^27O|7 zi<$R0$;$qNKiQPZcH31@Y#GYI%vk(s6Y$_vc8HrZwCj9LJAK79Lzs^hVv8dniWbfJ zmS}2UUcLRFta(3FXXQPWQ5O#{W@8$}1fAvA_s035lkXCi)euxILx+NaFJHtbWT*1H zXulBHf9`Dy%j~ft)S>}(R}3^0&rgH~{@U%aQH^^59I@|nBK(=~RZhK{YTP5%5&JQx zDV#T@aMMVgo!^R8YB&Uq^KS1l{bxKo^ZrR$Wp@1=KWX=b!l_l}@*%Wuf9t8ONM2D} z(U%W}1_s6d1Q^QrP$0&v@33TBZ;0-0!-fd4cEmQ0lyE>zTe|)|n@f>*!U+H7w{o2w z)&0xA>0H%;@6IA>zUbi!z4~*V)gEOW;V9OzZ*s`|^@=~P-42I{_Fyb~P%#qHO!CGO zBUH#`5Z*3~RpsYPt8p!@SZs-7YPu_--)JHU&VhJa}kc0`WG~pCKKNJJ?K+QDF}Td zduN{~W5MR(ENn^<(WxR?r2&!hAA7!!86fT9Np44z4!FV<@yTa1iC~>QY*evwp}>0vj$Ox?(K?q$oOPV7N5+;fxa{E{J4oG*k{aBH<-!@d>1^MVdqg; zvOC`ViR^GsdT*DCGEvYv#Ard8^R7g6WMvPk{(_)+Cp&pR=iKCQL1MLxx|=5hNP|>P z@Lf{N&h^dPkUi$+2O=!4f)z`Z;InkVvAJ43z1#@UB7$|4)FJAD4(J+S8m#lHxFGCt)_`#^Z-3sFlP!O`5~#ebJpY6E~y7|Gq~w#eG($+i@!4oZ<3hC=#2c zg`Mc-PZb@EisAN|p(!|=v-1H(sed5jv!sqB8X>tXzv7 zF<4L=3H}K6QL&o0k9(jmsP?j`8(5@}0cG8Poxtm~qfFLN&(dcohEMo#KQ8EIFhkUj z81uMDQ%rX0CZkmB&W`eUR%Pv^!+DONN%dEU;QACs{KdV2$qc-jW1nxKzAcUeOnK2M z2N>yYZ6OCN>Kx8NJjDCqOZSs})>k?@S>}!7dr%%y%X*kLP=#5|MQ+UJvVAE6#8u^( zQwHNV@xhD{WKrYD5ook+Dg$!~Cye{(OtD#2!F;S&x)Y03c}y6g;7zWh%`4?CAd_-NME-_L?0C8qhcXb5<+xPo2^JHKZ!eYM7o%7lqa)t7|%hKlP$S^ zjkz3xErr@Fp0-dsEk5CPmJ(BVN@HqpI;o0O4%N>Y#o8_N?jc|F< zoO&`u`)3Dx)iE9#g=l|=@4vPGmF8=0Y_nVw?^G~iGz(bGAKy219Nh!F7cc+Wyn+}1 z9_qb>nNnMwr__os{4kRc1c8EY!xX z@Y?*X3;1r`9-kDXnzCiZ3tZvz*x~k&R*m@AQq}aK z*_@9zdPfsN*Ud+$pB1G=9N%rTwN+UNyyBise`LwMhcjt^u2tP;Me{S|6Tc+^15llvMgA6<88eBaBV>V@F~#jGu2U3Drc#;pbp&24zegST||= zQuqzvhzA;gQeeM63@O%C;&szZZ%kIvJB5G+m3z!xgQU!oovRzN!10gY73!X2Q}4zH ze>pCPSyiL+UH_b+o4Spob4yB)PPq3EWZ=lU<>8oB=_bOHS1%x%b&Btp@GTDe8nAJr zk_LYJL8H)(gFPCig{ASspl~Uxj$k}bInJCG!OO38XEb|#?#2_42`lI zc_7~uNM7}Fd9GByj8ZkVOo^C%nEC;}M5h}c)YSEcko|6R5{F@reSkoylYhtD%|zcu zR9THirc_d^Jwkrn?vzfaCuS*Je*N-TrbNF<-@Qbe#^06ODMK|=-lIaNz*CQM{&0k0 zheTMF*Pxj))ry_z5_l(LqZfa$h!B-MlbK_^l8Jbhl(*RvwEefg!K;<5kd1Y{J#)4%#eIlwl&JR+H2G$A=W zrzS~Fc}?WLq=!+pw8(s6qt&k{W#KlphzXt2!}&P0*vPLnkI;G{v_=1FLBE23yit|1 
zkmXr^zX7dGND98vo{)-quOVCC0VM5UGZtBg+t?AWX z#n#7KVk;WXJ6%2LrSHR|KUW#Em=avA5*mL$JNjmpBb*E^d*Qr6v zZGtJ&q!EqY7TE07%K`FI8>Y%VWL$jp+Ya_U*-6B82C4w7DOr$Ga>-aKXCLg^L;M^@ zT%`%q8UH%D-hKF3JM{2s>HDStO%mxKInN=>1ON;sF{C+*T<0^v043GYY-*oZIFElV zuuJdJsj&2S;1yI8%~WlmKnYbdYcoz|RnL!>4@Eyp%7`C-*C1yHQLM74iz>+B36}A| zyCX>FY2vE&>BI#nUm0SN?CIgSP4Z!Huf-+rW@d(|ioLE#F0%qzg>WnTWiT_^zv#lO_X52pIY}Yi>H8re4)Hc;x6V9wonegq#^kVDd zl&?c;zxi|USZ8hM=MOlFpKIx7?PvQf8^+8H*L7}2EE{H2R~|2Zak{zXt1BpXrde}i z`?=PIa1*CnDX6kosg*mj&(>uQt%kP@GbFgd?PURtEX>{FH6R~nO*_oh1R>Y7C=;bz z+e1u?++ed&t#bI`wI#PIVS1b)!$M^Aqe5QxXzHr{b|l5_porgDe*JNobWxp`zPHEm z*9oGkxbz;|dV3_L!)nZOZud()WH$mJz#D5Q>D=Ri@PGi7usZaI`u$&UnJ z+{${qiz>HF45}uq>V0&t2H;b+Q!mHem{$3y=ig)5+c!Z8EwAH`hnDonb@H90eUgrc zjM{(bI5O8iwpIhG>wfrpx528Cc{GP0$C1NvA)Vc&Vrk6AhD44h(`A+0ZWtf-@mcG% zIiC?bQvQCa{{c5?Ja*d+N;ZMdN`Rt&#W9l!gl!D3gZP@IO2l*oMiAoKEVh~G_J$HM zi6$x~BGwsh16XXc4BAf$EpkHpuj#7HFSToSQ!7}aDFTSD4kW65HyeY1n-aF1%v$-) z9|&MCa(_^^PR;n%uvEmRH$`Mf5!bD$CG&Dh^mH-ZKlV()G!__u;%FuGw5I`Q&bSrlZEwvpErp z#`%XOP-h6Er-tbfV7UKRYMuz(`(LP;*AA(b;=g2iucrao!Ti5BSKkR#14dZ&9&Q6T zzo3gT^@8f7Y!qw@A8$*0Xv%Nvm7G(M#UAW?@^x=rZ}%rHaPOg+A%SJzKIUFGMxTdv z=NLvA67jmde--xihS<~~@4V7rp8u`|_Jg~HqeiBvh(=tDsOeJpk5qS!Cz>HH*Rf?9 zCLi>KElzy|m5l6Qmt9N&S({hql0F(S)Zb6e2hl%!?wEOT*iQG1B|7kLpL6z_YO=%>{D8_o_AiP<5o4U-Jak$WvR9c-G2D94}>KE|3nj@>6tZ`gTRM1H=bV zvO#31H26je1|JmQ6DZlnv8;BKXvpgC!*+FvF!+>21X7Wn9TwHj>V_fNl{gn;p=OVX zrw7P{ArYqI`Gl&XWJWk)9%i^H^KF-`O4AnRSO?gNx?3?QtEbVffOZN84OoIto{N@qGGV zl+EXH#0C(Q7gjD95awVW{6!h7oasMMF7$=SK%xyhaWcimaISG*!w1AAI(z{J`X;Ke z7i%&Q6v8K2e0+ms=bZHlRroZBMqixoWdTEFM6;wnCXDKRZfp|FhYwPYYLF}rw8*LQ zq3a<%ONW7zCj{O(!0cX0fdM1($u7IQIEl}a$HvLC1?!lh;3uzY>!#h0*)o|6l}TkE z`ejX&GEv0S80;Z?NXSeR;4Y`bhiAa$zE3;e2GISkPFWrHX|+;#9v8?07-r+}@k}jG zsm(@(vGyBb{LH~ZCpQPc3pHqyhlr3bMrZSoiNQ( zc(UO3mdh#WgVMWG&XT=Do+iWRTOKxZ>a;oIEVlj(V!vYo^}`Pebvw#f6keDtOvXx69JECI)hN(*3Khc$y;O$v zv=?A}Hdn?*8;$7k+N8*gYR28?iWk025_r4X!ycT4@KFy=f7~+eHbnxD-Mn^1f*WLS zwm~H#&P$&0#)qK2?(@6s7r<>|T>{d#R7I1zv 
z!Ec?Il^v>eXR@492#?rET2tKC_^w7dF4rI^0{-x$OfIQivYIeyW8jqVPo4(VpRQKG zD0Q{cXlSB~-;9syjBzC-e;$(;(>7lbiuKoT|3VYusITW|M^##_C9P;E%KGtUrcUBE z(oE?ph?%JLFLbKB6dFaZ^IvpI;Adrxxb*MdD@Oja5gFpr-_Ze-A$=z-sMpX;hc^fAa+|60Wz zV=PzsfjSrA0o}x*IPe9eg+xT7a<5Gt}7LNJ;u1Dy6SRD`g z8f-pHxZQ_k8Nn6l=J<&~u8&k0w`e`($a`U`_>ovZoZX^KI&B({Pic~(d^lX&dSQ59 zkx^w4_RHz~h}Y;yCnb!;he*3U@}SP;>?Lw8d{1Gq;NIt9cwCs71o(%2B{*;@Vzw+V zsVZb@q^~n>dy(+hM{LwkeuUEGj1vRMrN%1|c6H^;ybvdQWG+DEhD;q0 zAvxxVO#StbuZ1U8dAk3eXO--OKDR3Hje#n+ObH4`^h&t=(X->1wW>fq&;TF8a0EFt zWdfwm4*3Ou|cju8K!N9qaGDwDa79xI)FR^~aHTwgyhVN9~OF z=<&Qat(-X``2rKNn5-YI2B=`9;~4!<+O_*J#ylGSSdn`28;xS&P{8r`S3k@JHblj1>5cUa1b0?xNx6-FdhKi^+=<3 zs9swS-SUqtBjws-l&CoY7}tUC=Yfh_iVeECGf1X-9fGQhPnUa-&8eMDTOes3JL|1@U~v?peHTpm^o0Il>U0n1ESai6Ddq+qHdQl*t3Q{ zPt#S=&4(&i$#W7DvRgHTVGBZWeyoan1-LGL(_Zq_P0m}4>QcA<(o2ahgAJZ#7gzy}p`Og(yy+sJU^f~2$- zoO*=BW6Z$_0kLm*MS2QQ5%Lg^ zV3hw`{C{bCBtQ6A(IXlnE=s=yl0O|Q#dU7!diQJ^gr&g7vtlTEY5~0^-z2PPw=J11 z@muS4R%ww#hw)944y)7lmwqwZpMY@6u)KFF+^EkiEY<+#;zY%^{AP}Ckkkn9A+>U$ z#brp`_9ux76ZuWUg?xS^A~oTSek`2@Qcd*^xfcB56Dv8!R9qpH_9yCYgKLaG<0GGaG-k1?)B=`BTIPjrF0gn%DGzuaYR4U)!61k$sZmA*b1rC%116e4iYgiQ z>_Q6kb<^}2d5FxukVW($#9MBC^!m$wM-l&SlL7m+qpwXm?zG7Gj)$R1QrGu&l znn22=s#TAm&Z;DkDSvW~=cufjxblW=B0Di2K~j z9!>Wb<%?^W(n}otFXg*OP(C*Ge~2K$Scfq7|7+|SCN!(6R6}L4x&SrmXpvH00~t84 ziFSA|fo@ZJE&N)nbpl%aj|5tyBa#XTV+qGKk?r5cMRW#UIqN63wAB;g-ARj3e=Eiw z$(0ig?g#C9(Rb3Q{Xu3Ow>nrPyoBtJm;5e{@8zlk*kryZwz!R#mb_J**^B=~{w9S@ zNJ(1)rH`zC1q0;lnBjsQ3uzH_S&$TG{EkD%WA9x8e*1i%p{cIiZegacz+Ep^D*O}q z^vX2(UlU5UMCwSCvVLy+tSwRMuOTC_G}C8KPM)A6f${+23~xxKvo|M|`F`<94owBq z#qo`jS9w2%i(`>E_;x(;6ZxV5E^h0fimF*Z05PHiQB@(tUj8+-OL)AW7Ng3;KNW*r zEKyqrO++X1@t0ocM`GSxY zVZ(z&G)A@z^0xpFGFs#;YFxT#Qr|+q*nSma{yvq_lIK*GA2;*PfRg3A_MZSXbQ5I> zU;)SmDnS1|RYCAQB=s%J%3C-4b3SLst4A6WHPnTf-By~WTrGqy@tYCiE9zg)ywFmS z5M)il)_=kfu<^QWU7nFinL3CMViLE20=wSY%ik>b?N)R7_(-({mM`caRpn$Ur79Q= zchL2dZKSD*@lDVy8ut8(L1Weoyr4sbW~qbE%Gw-7#w>bV#;oFNo`9JP(^1OEZuQG4vpU-b?$p@x2ao(Hq~E! 
z(pW-yn`#y|RBNm{gdq32kL6VvL^Espfbsc^xL;nxiNcUK9ZDS3T81gEYR+oST55rB zFeAQvZ2hsN>oB|?EVo{B{3E(a)YZ}?d(+D@GA`rL$C>SbW+b>40?)8|2O&e68?} zL0xkoZfGnlY2aoGrq1VLqZX%r@=k|^<&!^?;+6bIuKOPq0vgq(#yQw_(+L8BMpeWR zI5&I*5~nEMmZ3a5qm$DyOcFJoXEDX5&V?%7G4_lP>S7WF&#INbecLpf_m;p)d2a9Q z;lZJ2eN1Voc8{x1PP_>`2!fnAjf{1<1A1^^GZD$6{+BpXehO7=#A%c$n%aRSaiJXjZ?O=n&ypDqE6~e^h>D@aAV(deAz+ddV)9_ z+;NXjc`p&lF_k3!d{DG>sl1%(jnm;-t@4c%nb1bs;k@BuT_v&<#jF)(uI76h+(GsQ zE8QX`Gn#nzxl-#0UAPKbkPuyYIcwGHl;|vaZwRu~C)1y*-7afoY6ZziQqUyBu0~`j zyYb990NLUEg`z!?To{e}wSONo@=X!gVORWFkkHIC5I{1e4TKx!E}0U|6Phwt9W6{O zQF2J3X~Gb%`Afb+bEBEkK{J2`xCDOO2%xdkBW@^{d~wxWW+PCfH26BrycqGhE6XDj z*G`*9cDxhUU%?cqfF->=tbLsVCAqVYnNGNn)Ex4rG%MSAgQ3uBOCdOM`DyMA{kv7B z<7tx1RAMt1(4>euOob;$CY?CF%NA9~Fp*q!x1ywGGgnd4Zuo~D>5_p$3(y!y{7abe zs+ohcgf+pqXd*s;Md<#e!9rkXazhmdr$f448njBaEAT97fLe+5&0%Z0&UU_VTum6i={ z5^&~0H@V7JGVzg#i&gGHgR?`J3(dR|I3}@<^BcCcR3%BKw zvBBxgDSaGmn#B)2f-~4e<*&SrKZ?5(EKq80SxS^NbU3?+6o-ur^zgZgiHF)mM*BX) zc9L^jI4KYn1JpSy5UzOHNWShjVwf+Gd^HB5nBg`GWct(yGaH$!+1nfoOd(n=aARdk zcS1+z8qREHwImH;`m{JVlc3VVdXk+jFVUyLVVmYOiWG>A%SyY%m|RzB;3uZR^eJ6Z za@kQ!kzF|SEzS?@TGiY2FvV(ol6C$yQ3kH2h{R=vf!0dqoEw%|yu<~C0%}HUPADOW zwDQ2v@5f+;Zr~3)saNT&QkQkyCD3L@2_HpWyE}C*1IN2>s(Oy2qU&y3M5`m!+1 zPUh;)uH_d0DQt$PMb*xh9X?CO_GYOx$WFVzS?s$ z8Fsa(VUME^zvF}?a=wP!oDk%r?_Sq{JbW!a$r^h8Zi$iyi7P!AaBZvLuTFRXiPX); zOx@2VTyU8|8X;HTFE(}-G5UO7JxTqYU?Og?2iJ3}J6;Ixk|&%vJ7!_NsQ-(!SMgDA#S zttO2d!)jb9dJQO*fB$lDqq~%U@D4;7-~6D4GX9RC7Lu&1Sk#_6_H4Z|Y$&|i6~)9o ze2h|AGQm_1WqeY@=4Wh*Eh^eX!f|8h5Y`~MIB-g&iTl(;9LV}oG;#p*X__TDc*`_@ zCwaMaM|z}K!cm;rF>!GMK@ix*LAD=8(y{*oDb;%J0;;?ZEopf}QDV+EGx-=A4W<3(oac*?$~nI-`~1-aPdIeLzP2#~9PmveQOXSW?KsoY%K$&-8X? 
zo>EaNJgbDQ&W?MP{B|eho_$%#OF;`cKaqRc`QZ}nln>+Ki?kSqb1LWAvoU{YYIVH# zD;f-eF@wE)EO?Ign1L-@Y6iwGj9C3lz)6+f2zT;+I7POOB1qkoA+>rWVVg0gjvgO` zItDTrloYyi#qn8gAdh(oL-ko!OeYo@>2SfxK1k4+GC(Ni>KU$|ah_{hgSKH)6%2(b3jaxsa?mU zyz9%k2cuW`aNf^mgqy&APm|#@EEm5i0TP&iB+^NX@2ZZxfWJm8w5tz$7ZxVii1vm# zm9uFc4=Pg!)!cYKKrlueQ+jGlZsfjU1GF{wR#;=$g}HZnCU4^U40}|(sYmr0W&~|Y z35*d3?~|?28R6^NrfXt#a)|J6Wb^wm``PnNu1jSg=@!!++Y&*wd2xVm6Iz(BHbZCJ2zbmgz4CM`_>*(OiwoiROn>1D95|4 zi+fh+ye9EMou7Yl(M?Ps+Q`q0jo)=-ST2goSo8@Za<0(QJtjx&|6)r9iiFZ^T*s+Z zfVdw>tH?FVl>H9E^hw#~r`Mn~K8EQH5Fi{=dzUpg`DffJM| z!-pwgF-YTmaT{#rF1>QzUH?M+kZ<6=HTA@n4nXs{)`xMR z12#?JZ)K(xlvEL~00Y+cPku9@r zgl`0i@q%0EjTk8On(*mObhxOz6HlzpMAZfP(HSjVS`$GK&iUNWHPT#%RZ$QH;f`Lt z?6P$o5d+UM5e}fFU6)!NEn?wifCYD1yZ)e6o>*my&2?K-5s~+D!$)9zET?AAx5M?9Mzz-ZATw1| z@8#5=)7^%0=B*y-KG7X6#=_bRv31t>my`1VuW({ z-l`HY9H2Y{7wiv{(1##_6h^OA9~kr`ch*bQ8#TX|EQkP~Pa4E4nU^Rxezi|w(=F;J z!ei%f*KoWvFQ0B>@qx4z3t=0vOE=>im^RjDHC*eLL+-A*WXg#X_OGxwY7=j;8EUEh zHzJgeiprN`<6CTU203KP%)t!YP1_mUN(!d@%&*>Rv$OUkk?Wg+8gV2POqmqg0O>;l zwQp={@a;_TU+xjunFdO^G0hvOcaNdh36z(1qIR+@Vu3@cWh#iLLM}@MYJVM_(=Eme za|lCQwgq=%m-JQ9{RU^^R5>#nr(zRo`aV};lrU0SU^{Wte%11tvgfEZwohl|1FHp) zsTi2@ORO>QHQ7vq67?})x|$kbM;J%#U=x*kx!(e2M=L*MK}SwGkVBV-{cFmSylLZO z*eiF>M#mM?@6Y0RcDFcLz5(f`SQNF|4G%hOZx1_C(hg@#G{@sIYEFTkfAxuMr1fj6 z0~{#a*wr{)yTM8piY38P$3ZA@h`6RRYV+da+lkGwF$fW>Xtdz=*4M9+g&5jr3S1d&Tn*E8Fgp19&FIa7dB99qG>utTycQ7_b9YN57TL)>LhG@jDR+ zDK*n6v|+!|xHQCVAGdfnXx1vy3i{&|z=_9V=cqkw)>;6ZhkdK1cCNnA(f+s`d zxg4EzNU;U8=Jgkce66Ku^T`rq--2DP{s0~tF6FF#k5BYizBng9Oi9Om_C&S?qbD^8 zFpqC{S)-Cr>LOl0lrKWe7&9HhGi>Hdt?!@kIAN zS?>__#)V*Svz=TRtH&FZL@|8)nV3I~$W=WDNjph5MMO=p#5qG#NZ3_J*fh}WJGqdl z<9t+I#eC;`>^Rg;fV*1YBHKYi7F6d)SGfGiK=dbkLX$K_b3`uw1BoZ?wUPhI168_H z`JcJ;JbC_R2J9E=C2IzvHVl-WDU`(Va8jK(wiX80juoi3*BTlq#*qL`Q1O^0osED` zsZ8D{w=W(GtwlBs2-8+S9%B@V+K#JXo3m-&!dEao`i20U z@de*wGn6d>110V)FB{B6B760X_*RiUmpa-fGzbi-`LRhD}FZr zt6Z@PQLfnaACMWSY@aN<#{6z`U=tPl+Gl{>moS3o($@`$bL1ZYd2IV1mUA zlBYg?D;%MZsZ%MUPB|8@NPnz7XzczB^3pFrNMG4Tb?Mt?In9x8*M$1InCwzj;`=n@ 
z=g0b76Od(Gtfi{zSZa<$$WlH8wy<* zdX>b~52R&yv(fBdG*SV$&;}9{MD{3Wcbm!JM?qi{Bm1_({^jAep5n34X<b_}m?g z`b+gVfw$Sv#ZQ?kGW5q6A-S>vkMou(-m=J$@lUfGL!UzZl#JnKyvL3%6FHj5+amGuq(a0sc&@1|&oL<>M%;vZt z-|2mW|aH>wgNoR|CGn#3*a-T^1#p7MHB^Y7KMp4bJyK{6q||C2$cpoSQ^OOsr5 z*RQ@eVjYoHa+E5zxsE>|E!L~OdQcfNat-~RV)ah&5R`w`CHt+H1NFpU5cGTD!lI4m zg^9IU5$`wG_1FkScwgHgU6lLYX+v#E1NSdjL#+;Vo5Jc(*5A$jZI1$LBu{P)wZFeN z)XGxNm=nMJ0qP}gi&FWx%-HrOC6?nnUPL%a$+Mp8AVv`DFyRb zxMWsR3l^x64&{;|;R0funpNIT-biPwk@-ADmTOExe2&%jNj1JBJh49@#~R;UMy4R$ zrVY(VJM}9Uf5|qMYSZ^%TqA8hT(9D&j20roik9&3lXZL&-KN)AzlXiiqCDh+UE(>} zHrEsjv}C{#R48I_X!;%FrZ`XV2Yed~I=OiZ2eH)`?b=JJ>=6^y7~*TkUtnQfDGbX8 zqMbURT0WyLRZ~qg>buisn2VXo@YCqI!otb~w_tqY=yE5m-!-z7>3?$(--Ud(@85`z z7<|g%0wRASp6eC%UJXEN2m~}By8yvca^yAdUl1QBFJNJ` zvfK8!0*NMohAxQoPz6#5MS2JXAqasW-5`V}2uP7`q!W7Yy@w(oT@eN8MJcaRq**{f zECf)%cH?``Id|qgAMP`=XFs#{+H0OKYi8~L{;g*lE-{z3&E~7dH%JvgmNN6G(q8%irflKWzLjEoi|O*eC&r_0IMK;hTJxnSF=>EQH|(9d4Ue28M)Dt( zu0-&|yQ}P3vnrO*CZel-iTkMyQsvkr-WLu@^QysQ`~!t)CaD@e?NrL{vG#30;GQDtX1V{39%39N!vQf` z^{s(l z;5tjYYb0gVg;%r%*n0bDqv6p&-0QB6oL2(m==PjKGUP}j8uU%QBd73Lv1173{3_dB z?6#ZUkLj=J62qdaHTFMmO0_&<3fC7>eMzEil=SK3G~rXWu}jDdnAuWv_BC zu!3{J9ST6hs;+JJ$g6z=lNAkCDR_-l>^hxn0JN4S!r?+P{mZT zC9vlP#^ZIZHKT*OJ#O+o#WIpZu-Na25E+{KsZfG%AO><-f{2M^TwqkROepI9LUJhK z>a+`S6~49MRG`-kOjDRyFR);JW9>|dQuP3}80@q<><5lg36YHMr{#*S+CEFi$-G$G z?3jx%NcFNGQuJACjQ1l%WVGsv@Ntvfz8@bFA|mJHbx*J7CJ;B$goszpLJZ#R*6;~u z_H17R-EPn>H1RS+jhw~mg_}o)o~12Qfv4WXlS=!-U%daZoC~~#q8-#RoeEy|WdgmS z7}Qh216MD!>yBLLN#uJq($}4U5AP{V_L}94`zcx-copE_8nJ%b%hQ)x~hGLi` zoOHN}PVcZ_io>=Bt@Ja+of2+(RuOXXjy_S_qiAZCJGP_`*h&#?{Lg7exZ{}>y_37n zdVv>ASMvxjOsx9o*D6G7?$DsoZvtih125}E+q|?qie4dlO!;Ml_a87daW+omoPC^M z5BqVWcxi|qaDLAhNG49=`9o30!(+C8l!VBpPckgM5B_x&I$Ep$4yK#N8<#Aa>6+dl zUDKnR{?_ztB{dG|7~)zz^F-r>McbwF);lZ6u6_^x0FkYJtQvImLfD$TBk!$5i|;2NsGqt9?_rs~E-d|YmCS9k&uCu9v)+$~$Gvo}nV&zzR=NRuCUB-UptO^iEMeIPb> zh~^#rpiYZM03(%8Vvo>4udW0P@r;d(fI`dqGaZE^K*U<1DVL^ro_<)@lpd@6Kx7da zt!Ua~a&j+rS0SE9`todaq8tVQ)>C`{@%L&aK=gQ|M~K|#tC+Jq@X^!JfyGFWm;NRi 
zKxN#~dyNKs|9*?v!Mn^f{Zq+JZ{c^Gl5Ds&G%1~=Ke2`eJj?*yiCaMPcZ0gI3uwZ9 zPW?)cvIu_XK6fdbTKm&${oLir_T-!>Z%5|csq5UHsm;-WT`kxX=LDhrqE5_QmC~aN zS+WEeyL>TyYAza#^AP@&>!}?9vWde&yU)6M2>1F9`AAvEy#ffRn6yD+Kg-Y_Zyv|Y z#4^beQjyQ*g`mu7^ipGT3hkF;rN_uBJFqK79CxeC-Z9!e`GZH-sOM#uh+X1h;?LJo%oJVz}vmDe1{m&is}{YE$XdQsS%! zcyjrx2`bt^ZWau7I$uV%r=op_#r?F)2&Fi8Gf6kgQwF#)-h`E5HrJQ|rm8B2$*|kJ zo`h)Ul>nzx(asI)&$LVPq`+)bIcfId?a^Qb$MXh&D2morZ6iBA-1*BVz%+f2<)drX z5Z57xgOOk55yFnL)z1^xZK!}S=Ydq&?F^e(&IMwgQVqX$DVaB?_&uBJ(3=B7q}FJQ zxGz8AvGmb&!Pw~IGt%vJQ3zhEl@(EXW$u?@v^ ze~t7OQL$!0alTN)sv{qZpXzlYM6$s4w~ocohSd!*|Nj(9a3sn$+eGEp9wNQv)XUk64p5))z@|8t>xm(mEppLDoFSRoqU>k>_zG6fZpNsW-_zq3&ooUz-0t5< zuEeK!+<9%1g1Gd~#??ut?0zG72EUCj)>W!j+{SO%l@*`ak7H(fQ!!PcR&zy-Z(p+F z>~j~zP5e-Pp1=Z$w?K=$+lts^)7>>cj?z4$2$zp4*R0p7DNKiltwQCW=jo7Gt*9jIvM8ZpgK1ryxd@)bQ?8 z6+LP1*``TB2^>%Xrzq+ENYi(0Dhtoq46{3B{gfMvZZT`zg}x;}`Bko#d(yVOy% zz2a23!*|)_AGnfvh-Y#fwBq?r_m}3oTO8w`YrIeBmbk9CTWOaZ0^O622euDox9Dis5FOK}_Rpw!ZQD=&LYahzqTfhw7^)ZoH!)VOB0~9J*ZbuJk#j z;T~TL>BQ6Vr}!YUgX8F;sH*q%avY}dc%byfB0a+3u`a((Nzd&Pi2|dNCrl$Q^)y>9 z!z)*(-lJU-hq-Yj^+sX%YU7n&n`WCj<0oAC`j~@mn60VLl!PZnpk#|f|2N7PB58m@ z);&iRK_zLnY5|rJYAuF&YeR%0OM9~_Zc~3xn2IxKn~Wg_R-QPs)IG0Gz0?xmOvM+a z))(8nps?rR75yXL3R#iFO;SnKYYT+ZKsz(#m^zb*BqGy;mbw};;4=4B?B>|fY(2c- z+|2U7Z@6N_R9$w{A-!}U*taO1qnFqE;ND?w8?#quM8mibwN4Kv$P2d7MiX7A3*<45(d$eFbLb={D)Hn^A+#JQUI%_ z?!8Md$mEpmk(^kD-bSUzB5SJj90Y{Vx%GH%-<)2Xf2+aA{~h)8GlZ27`H_$HmvF9I zoGORc;~QGye!uFUptwdd&8R%C*3SmBEhJN=kRcLUvp(%@_Kb+#bD>u|hwhTEC9@T^QfFZ5eDR}XhLbREvM^C4d}l`0fw`$;Pt*R$BIE5xA-wu_J; ziz4up0d*irTA#)}l*y0>zPpw-Olan!G7LuzW7HQ({Lt6%ESA5#-@$jswgsqe0AIrW zWZfVfR1lZ5kx-Ww^57$w}J<>ZcFptF*mXp(eS z96JRED5yB+NgX+ua0?LD7tfm^klXSJmJ2VZ$tMb8IF-d9z$|W@AI4)*fZH(FU^h;F zzAX{NvGCK@Jb$W6K)dR+x6|RUIavI1@_gIQ_rWfMY2xfUf87j&v^_-{V66qYR&83! 
z+KpV3zGG9+36b9Sh1@cRr0nd*YxG9mK{m{$cYjKonDnn=vY=2dF0 zZPq4kz@95LaZzkdsezjMR=_N zl0nm{o?<|K>CabKoB7Ew;G*&)B7}4??K`lLJ6IS8P<|xP+BFc*b6$1BQti@)<64{d z85v4TQFipN+?*(7{7Kip|9yAaiq0{vl9L`s%=TVS`W>lMbDG>}m>c-J`dpbm9YmBk zLBPWGNt(mxWeoEHdxWOHFNkfyxEYWAgD9dk_c$3b|3MVLAdTT_oc~4?`!)7BBX$0@ zc(bt7nBLN?ga7Mqy3rypHgrP@7*l7|2{#rz&CFW<^=VXelSPVrKo)s9%>R&P*`Q?p zPN!w%Flf*EPGzo8=SGSKKjHqX57TSdQP0}9P?^au_;Xu1Y)((HCz&oXK8i^}KWyMs ziouRvGH8JW5`b5)F5nna#w;5f?IoSDPSp*LB-Ks?rsPQUNS} z1icf}qL;w)j32Smm*PdQhcQGXk6H3w1C#;$2rKoQ(IA!h)o{5-+&-K|oukSTX%(A| zBvIFJ{;Zak+K9C%`n+YH^iEPwTCzWB4r{7#v(z1k<(ilVshITP3r%`Bp{N7X?AJ9+ z{N~M)mb>Izw(`(~vl=J-b*crgW&_L}c$WCeJpZb{Kj>XImbh%}-n)tLvMAd`T+hf2 zo_4a(EBWA~T8a0kfTSosMmh7I*1^lw;|9yO8WjmSmSFKP|5>Qw3W6(M&IYcZzjdF7 zk=iYnv@2eY(V7#PPV$^w+ZH!i`6i9Ui|CuZj3LLf?-%2fCDRh6Qq?%1@8S0(4Wdfx z09??aN)q*6tQ;dQu&Gy$S2?5L{`4B!9oqSGm`S)Sdlw7n7YAzPC4$&Vj>bLaY6B z$z9ThGzb5VZG@xdn-ex-O{2Rv8^(7Yu3WI2F>KJz4qCWpyZC(|`0I}w0k0JWq|mV^ zxD9MxYyK>k{Y=DbZfS-j{Gde-r^b zfADKt`}?QTIl0g6of%DD*r9uG>O0a~ff^bsqak6j1;r1JI*IXuzxEpOfxUYWw4lK4 zmUtK6__3zqD-Hc-cRqH7k(WNq9VwOk5|nV_1ZCu!^D`)@w04-28Cex%VJHF|bqNW( z1XIdDTgm;y5@t0gu7FT-Qcyx5WF4Ipugk*W&Wf@y`XlRrguzLU3UEaP3MC<*@!3ke zwbYuKK}G?Ff}@Zym=Zm<1Pmr21(rARB4MukIZ0wJAz%nN>^~kEFKu%*U@)OS@nAxp zLg2q&5J*Kh>Yp|QOacB+8(f9{(G7M`O=V@pf36|mD#-t~DJuMz4S`b1fLv3z!=Mx} zx=K127#xYzLF%X=kt&M1NVJZQjv@+;)Yedg{eK>IU=0nhuiteazd#=+XD|Y(j8pWgTS-ilSKB6)P%yz1vz{Ux$JsfR}>e!W9aNzxdXLuM`yC z2Pi1MTT)QSB~ehYx}{W}P{0p1p4B?0O0h=%?_N!IJifEpP2I#3f9wADzos_PEd0kN zFAW{FO=DE-^xUk50yPHs7)k*K#$NhXzPsH#TSXZz{wf7l)H*<3#U#)&1w*L-Kc=$2#pai>{7#7m|c*secv z`RU0duSvf;^`@048|^pgty|Z_yh6LU{y?gFj0Qt$&8N1?OGbYdTYnhy1hV;jqd4t1 zvj4L7ro9S-2QFpHd8qXT9X+t_f@A3QsQuon+Y-;thP1VAWA}1?Q*p7yO+eyotZst1 z{*{?9dTn*f@?A;Ilu1n8$DSvvt#EqYO0RMa3~01JC(HIN?nTg*q{5F;MtR}k4l*M` ztTvA+)@w8I^EI@vNSLzQ*_Qkj+SJqkGksflG3PGr7cUaG#hmq{mVL(=-uve3ZuO)G zy-7~nUsB%ZR!L${c=Pj&^Tji&eaAZN5mAOEF*TbVy1#vbZ>zI!l4bKXj=&D?+`rr3bt&HHV49yw}H?+J^#F6PV6 zW~n&wiz4alYuW+26V;;8xo&S_6V|ie3g5V#slcE1i}Ck!%PQ&0yUHGumWBi6#?D#0 
zr(dqSyJI?->P4h`cho`a!1||0|5(l6mD%ARP3!t}gvq2k|8=Rr`qUAH<(|8D0#50F zJ8e^_p6=Ct{rB1THD(^WoVMzo^YLpNR66TfvUXsjz4b}!!RDNibAmgQ9M!EflxGKn zGkBT}bw3YfaQqO_RcP+qE9v9BIxDLjz^QCsepLJUS(U}2I0_m6`))tG2anvWYOHU$ zJF(3y|7zE7VV%$0|M2wgOr(!5Na>-!ylEso$4I*8Tdu?-Wv2*R=gTSq9V6#Iw~feu z2=)#zvfz8*o-uT^?jP6T*W=7vN7ftty`FsF7rSn{jjFER8b$b5l?=tzs;tnFJ0{^Y zoKt2vUE8g#bbWB7I`F^k_?rA-EqeACe_f!^P*pbY8=Ok@HD>DTyD)sZKkJ}?=BX3P z+lBO}qILw%>}yp0qG#RLuQMgJz{XhBG!!^>`Zd1~{R!pAmo9l3-~PC?+8-OebGuFk zrQFQ?VoPUgYK7;Y&UV>9l99WqDY^fTKWbYXt$Yuw{_lTylvKk`{qNt%ROc4V`=5W+ zcT@Q8u2+JCi?ZEaPNL~Cp7%W-k^6qk12xOr1WMTIgTAfUg$-}2l!?W0FG{{8!xjg8GAHvabQ zaBbcr78dDwdBQg0?4oC?cS}ex%FD~Qx3~K(OtCvUIvSgpoXAk!%)_vmCXBMwG+Hx@ zTTzj7{rdG&Q&Y!t3=~3FIkNO4FJE3C9v&Wc$bg%R-l_CXqNv5qTeq%UytqzUT6$3S zcA#E$&=Nzk_QM)J4-b!)wzh_r7V1NX4pH3F+=rK~FptsCdHMYL1^ccvD<`KrGqMum z;x}4tm^y?>$=R zJ!yy~Tl{=Yw4`}yb$RC7n=IPS&Q4*A+LU73SH<{ono_84&qE!@?tAO%R_175zkcoK z=XW4*;hf;9$1jI!0{vGO#p2@PQe-_hYH4ZJ{#!6pIdVkSXNrxTo!x7yKThD}!`<<% zG=a}l6)E33NoDJ$+r7?EQ}FvqfnPT*JJ;G@WGyBsIXXA?O4zvYg1qk!%d=-yp4xS= z$@>Un?=A6>wy+(b@b&ejSY4UdlXe+=*=G^ju`vBU$Fx*XD^bL@<;ET?h-sj%^7dzb zv)pFo=f|pMYG3s9JT1Jc{9lBQ*6od862!+;d3Ae#n4I@_YAg)nhmRk_X1_GkV%ZzQ zXj!k^k#eS4w{D%vm!{~j+xyLEq?~)1M9x(6kB*K~AB*9jS2%PitSNdYEq>?H)vH$v z1OJ>?&M}~Aj@iZN^!CvnJnadGo`=z&rl|+gu|(H!m+H>=c#q z_A9l)%QX3>r3`pC-V%q`DhLCXfImO>lmB*joe}nFWaPh>@>`m-wW3Q_7t6#>H+-tU z9QNoD|48@!BPwHU31N6~+Dp5sb?!umpu9NWr9_1l+{4eW{w!#z_;}ellaXtpwc9xrgZMe*eY6Y9%2=HGp6oZT3pq`_ zZ5zvDgBC^7Vr-ZB! 
zj4ImN*WE@MsOkp>OmN&AzV&2>eSeorn`>0SQvCEOZ6LL(q8I}M!xaP{HLHj@|J0L+ zh-xF8@+(-d&DU5(MW&tviARX#g|+9L%%EJEcF3j=pTX#=5ShrDZoa_oY+0hDEN!ij`j98shdI zV^a*03HFS8HqQhe~hS3rDVQ8~OH4qcMtk-2pMNk=%mXjG(1CNtZ#UBS(%5 zmb>b`dwj~>)APZZnt;rQ51p2e*93SEmO5WeP2Gk3z$Y(%xBl{`lL(SKl8%?Mvi5!Z z_U*qkx>;6MRv~Ed2Ey#~=&0&lX&2*CC*~(lo=|+K3*{0O6-`M^-AqkwiI+FXHQX8; z9K1vNTnnBJyTqTM8M90N=+$kzZm9Dh)BpV4XVLTE7##-(N67^3mMu1;EpflTzP^l1 z7nMfI&6V=&mlqCs^MhltG*v>}+%wFjrX5*YiAa<-R=#8L%3hmAY_@FO+S=JUj<;C+ z`}fU5ot*&-QyQkGJCKfTaW0LGjZyfJYd_=ZC=^0gl6D?EUx@VB@Z!Zfhc}sMLWhHX=gK4c7;alB3JSfSEBNDnZH5gHQPCYhI zwy@wTH9g&bgjatHyM*1A`H3$3-{0T8eSGQ~Hp}57^Req}^;oU`r}lKUwY5PD1NM>* z-4wL6v{Td5vhJTZEG#Un%rzIjSY%8gfJ za?56>rj)xAUzN^Y!dg(`m&3R2Y`U_U(fFC$9`EmO_w3$%0o7;Sfq-8QETU)H|Nfqw z{qrYVBc2!enxebN`k0*0lzLE*!o`ah``^F+GCBF`#Z|hZl9HFNUeQLXW*jo|@sY-N z`1b7!Lz3pERS%|pSOcd#usaw-McTPA}-uEGn%F4=o*Ol;a{anNSM`Ji5A3WI0 zrE@Z;ZXZT2pgqIAnQ0Jw5%@z|*U@ZZXv^jqeQj{e7R}3$-Bu)bP6pU5gR! z=Ra1ep)P!To5P4#9$%dOVqW19hTk{LH&x3u%tGB=G3?eM;n;i7utJQ6yXHOV3f5Nm4d!GtK+x_`G7ga3AC>v7>J zb9CC)?bxGmuvaS4C@iJU+R7@<-b{J7BLsS^Wb=Uvcv`|Dtbo7BZzF8We-)=A(~|ErX(@z z{%7ucYXbl9Dk!9^TfdQHRg_#L1KY{o{9oUCu3ovaVIv)9J96c^j!XA*bA@mk&kolH zN}=|u>u1<)VbSqon4~8*p^mVSvf3tiJMD|mQ{qTOwfGC!4lWuG%8va zw@=U42^!`_6cvftzPjBwG{kuUODGYt;-{OgsDm>T853i-y0WzLce0?wxi1ER%^ zN}NJfb90W6kPv<*nx>dtTE@m~_+T$SeoW;GS-gmNq9n_7^g7!`JjLZrbmQY4cgZ6n zOy88dTIA*DFO6K~iin93nYz!<5W&)`qPn;?^=Oy8kImahC$3z%;`aIFW+eJ%tS{2< z>!J&G_V&*EQ*7IA)z{Zw6f%0^5XKs&a^Jh4y|g$gf=OC0U6CLb?PT$=)ql%9cn@CP zlomwc*TGUDJdq9R_OI#piLSLv;aKBByN(?pE3?Ot=o>TCt}QN2q&vPZ+dn=&9u_Ri z&F$KVGn*{w$ly8NZuR`ax?huhvB=#|QJQufJl_tufFIsmNLxgb8L05&BhUW&bzGO? 
z@^!rUJ^dVP@il3Bi;%D1zP-*fkr0=Vko6d)?#b4(N3a8DGcO#72)e?d;CBPX;x+Ph zroKQ-ObpwO9iP8_i$raBmuu9=yJdZJ;0iF)0cq)RAUWUzF|V)3??}7QJja)gA1kH) z{r!C;-;j}wttFmMuVnbo&!3i`Uq&K9<1>8i&ENUdzN=|yC=H?MIXx)#`SWK$27N2K z=$2zIUnPpl`u^ZR;#yo>?0$4&@5;Y_C-|9=*9eaJ_4{{@S@{#goIL$38l``~7|}>% zW@fsLw$Qt@oS7f*$kES=is8F}@`y-VFB$#_>FI#?cLQv#fLWRFaM1i#Y+TqOCN;{} z4W<#m9yoYSk4`*5mA;3v=`}sboS+#N8MzryX96plnUxiH>()!44h;>BbhSx8RhDs_7IMu6SU((+!u!;Ndm#QHf*k*YVdcU-)7?OKe{MOE(Crf$*K zuhW>f)6*}N8=IPv)`k=zBp&K5TX;}_XqW}77vxa@dpeI*n3KLCSQOPy!Kh7I?Q^HX$a@$B2Dw#+SHQLA9xbp3Tm--iz=qP2^c#BJM{ z7Th9(c=`tii+!d|@hn|P3qXt`?_KM%T!(90#yooS&9YsdB1xg49`vW%v8N6x3^8`< z?%~$P#xTHNeZHUi%xJ2FbFVPkrkv9k2yS4P_25A& zYWEGrBkrm<-W1_ltkN_*oQ{g(@$2ge21Z85f#Q9D zu`hdjS;AE65yqs^)78~QpDXS}EvABh%8yy=o|8 zTL+)7HKMCkemN+)iv;=J6`0jDSg6BZ7ElE|#GGT{oxEpVvUpxNb z=vE*xu>%L5BeEx1PU35{V;Z`;M}fcqr(7--pe{T)Q=^IUU-fEYIr=>bnvQQD(PR4ohtE1VL?AAG zh7eJAk&oh2Qm&heG&eM;ynm>lgZ4?>(9lrQ@r`Prwvmw$PSGRJv^QB=lftzWsEf6Q zq86ij_X!9@A~E)qIGzmo`;AZk8Ic=+AB zW@%+eHmI6gNvSh5WI;sMH#Hs2z0x-30YIpRUUM#}uf2T>Vtv{^#~>;(F|l*P=(NXb zn#;SvLDOb^XG_Z)&ECqYqv@75Hn;dT8;gK8krT0|K@0_$AAe!b7Io3vkeHLve)BhgTC@7+7Q zuply}C3HK2#+iN`Z)~$LHGo`737%&+b$g2J$)L>NzecZe zg;*l@fJfl~#3W4+kW=aJVkF{MKSy4$OF1zO4Gj(I`%`3TCY;5ikfJgDzTDc<@-Wh8 z@&PMK3IJF+D|HbO5mY?~rJXx>e);lcevqn5Z)@aFPTI14e0+>dOsaU7`OJ|`(G@F* z7`Z3ez@N*1W*9j*u0MS$fl9ZM8C?A!Bg1d;=N|Fo>qyY3FMOtVqPFX$DeO^HJhSuI ziZ3?mDjnBDpeh*|8H#%bf}4@f(H80H>476@{OiuGb{mZomhSJMDkF+sih}=5zXybqW`t`D-V>+J3q&WY^xkd%>tI0&k*U zMowkG(gBBfofT8V;kvYY`;HxL=yli+p69^Mwm;R=)TBXsJPsrR@HJ5;kKetQ1=a17_6wn#?4xsJ*l!(3UvpS$sD-I1w5zrNBF z{POAfMFQ||+}Mtsdie0+3;a_NOag=h0o{Qj(eKQdawRN`P!))Rib)k#(KDl{>cXZa z)QB=1fEP%i-@5N#PU_aoYypMTaB=-6nUED39m7T)41s@iK8y-am_EZO}dS7icz@niG&>PiE_8sMd#d3|HR&i9@V#X&%@+W~@u zG!6gs$+C9&hdi)y9^)`Cw|>%xn!rRPXHSq-+B!NG^36sCXF9NqUTyj2&)hB|Sr^%~ zG9WjAUYB12Kyd7PlGjXpaWVTylF@n3%2rP(|;M zQsIps86Q82?nf3Bs#R=!M8r1WEMMP0eTvVRX{Zp>4@3U_m3t2^JLhzjG#DsV-uTyG z@;G+w!l@LV(qf>0t;`#LB1@AwH7UgixpWlE(A&D@v3?FvSfHo8%eWK-Lpjw6UXOYqIvy2S=m%Pd#P=!!9!QieX9B 
zUzDZyLwhMQ1tAo(_k-mwb?$pp=3@4yr^kDcODW`fWMrh_;hU{CKye<(tbJ)(=ngnI zpadK(dU7x^KK^9hvM4pBQ)y*O<3SsHdz`@vUrwzayoQVL_hE3}Q^R#!%GM08YuK+;mvgJr5!@k0zB7x&cU2ScjK`S%{gdk=%DD!*Ff3beH5fvjbu}@y7-4dxC zH9j&T;uGqF{~#E~ff7eXLbi2v#bW6x?o2#KZt%`Kfo@Lu{_&sqwZ1e_FeIR;zZhKggxv_Z%H0s9FZZU7sWE)~)+??>;P)J8BriuWslM z)g+cnNu;-HHe+$iN;Z>PFt@z?PCmYD1!j%a&0N2tm}sc#R{sj^)4H7rQWA{qp)M+o zEd6O(3&8~7A$1RrIp^3M{TvzehO@J?lHj|I?F_WFw7viYg6tlvdSk{x#2H{+w8ak! zt0*1D8$nW6gsv%~&i=jE%Jsyet`iJE$+vC`wfo1vRtwvTvx_I^J(o?peY*)QS7C8+ z29^wYkEQe8>L%2D|J4=$BHfAA#aU}{ad8mE9#+0o-NH8JRX)*4No>d;f9l0IW6_a6 zj-asz;v7X5lbaqG7?5kVC*)vj>z=)Pzmzs#kBueGz_W^qk?HBv!U?q~2(zQDjA$ik ztAy6>fI9yC>C^Gd21Yb;5&Ahn6Ex1spbTz{TErkD0y^4|25NfksG)|Qo;{dVa#-l- zVjI}o*IH04j_1B>EL*#$Q8+d>riKUvXL(`#UkJKTL4!v>m9>nsva_GMji{g}j>^uq zPxA>=xd^-&lb2CYfIQyj9Lvta(uCCn5Is@kwX^d#g}1l&yW6qC=kYO6@2=FacmGBPvqRIBqaAZ0jycqRrM(I~EE0eTPr zB_k`#tfjnFlJ4WJTerC9Z|LW~BP9UccS!eLX+$;xwx`hQi17aX{=gzxl9qTo;wDt-J zd{e5qL3)Gh6B{d3R`dan zM}6sr{hl(J6A!i7K?Ox!KTv=(Q&!g5NBOt5rL)rx3;jxXlaKG--HDDw?WiH5K9H|?a?>eeoGIOa z067pE8$T`0Pw?>Y9B}(|7%bDZ8nNb-?hZ7?uB_Oc5!hH4Uo{w+d6t@U(T-g~lDDiXnxB&~t+E6y#w5&IH z1vD?E#hQ2(QHxp|6#DC2N}NbrpTB(Zwn+ffaxf-6E6cX%l%eudzvCYSoQiL2^S%I; z18JAerxP)vn<}RUq=ImulHB(fd<|jBv6pD6E@1<9ne3C0U?e4w(+Jtd+1X4u;Z(P9 z9?JyPF^b^ee?jpdS$H2leAquQ!2I>%oMGR>9{GQyGk{f_y4MMDLJtBC=4% zg9q;3-pDa-II?G_kMhqs0)*8=qHJkxT_>4)dN}ycF4V~^R9f^=p34g^2;&1%Qv0N( zZ=>}JK(pI_nw8#P2!Z0}?*2Mt+t#hN2*IG`>GE5m7MxsMT%^-PFs084-TgZ`y{=E(}T^8WVNU;3y+|BqZViB1R@AjQ)F%yt7) z@m-o5gHSt*M*~TG5qy>J%n(@`{DB}95gk2-N+LS-<62Ng(DAT$FJ2tsTNr_21%dZ6 zVU8d?LJ2>Mk{0y)osoI9UtB~4b?w^faswwHeO!44cm~43EXyMIUm$QD5)sZUP!*x~ zk;RLg`$XP4c^Ab5GA@-Y27bO`K52yH4Kjt0JHIFU&~RJAZ;H_9(T=(|kRatw$tMs&N*bR|6-o6KI*ecbxD5W+3-t~O9*ZtSX)~+)p(loyb90s5y%ljK!<6?J$Um>;P`g@_% zl_fEGaqjbPr#Lf`jtN0ivp@(#56!r_xAR>w*R;jAO8YR_Xt%u?s^@)fO%)yA!HMixYjc% zuW)9tVNe3UMZ3F_(+KLGY<1X2mpJZl{T%W&cDVd-*4&W?`Z*C%QQUldo9yi;4%#Wj z0w{vaKtr8_P}P5l1UUNq;`$))5@8<`6h_+w(L1?ntKI_i27wIX-QLRAEHeW5Lizad 
z8@f7^87hci*d5Sd)rm>}Mixe4dM$N^hwk9{54&Y$O(wJET=`AI0#><%gmTZOCLM+y z0a6p|B#$OHH`V>@Y@<;IB?z(3swQfq*3I?x%90x+nKnmROjF=p{@0&4y-VDVR?PqF zYkGQt2zVqQDda!Z0oy}IN9ST*v74IYN^BM$Dsh4hMTJN}po$!{e&!emqWOCcAg8XW z+3@n^23J?tA`gu{d*Ehiyp+PBuDj9GbGv8xe?AWK|InJeF0fF7)K;>@0Q(0zW#p)C zus0i&Ey2K`DL7~m_8Vty3zuRF_%Nc-ndaLerwW@tyK!@$7Sfg$QY5h>pnm_F8aS!V zvv`r7*)Wc-bKvRmS_K)o+K;Q3C=x~LMe7bXMs;V?$;d=R<^JCvp(XpeW>!Ogf6{R1 zS}Kkv<#txqX7H4+fH;ZuLOgj(n2PezqfO4Sad+-SndWblj1&u98e@f=^&BD~XwBV1 zLf5eOL|_3rQ^w(;U4Ek^087v4<+#+XcctrJKL`~_WoW%yml(hP_3Hy(kH*=q(j5tq z+rm!iQDMiWe4+a|^_I6&^PkqeeN4s99_|+qb${^xF%6|t5^#u&2_HaO=*cbnw2aF;qiC3m@dW>4F@nEgH67XP z{805$L9H_u8!eQ@drn5i7XW+&1N{BFX8uk9y)C#a;WO22IR={8oucAm6_Lbwu&>Mu z-N84jjlxa=#!n4MV5_@(de%!`>_U@5=vBj9cQ?20jWX+j4*&`L0Dh2gZV8^!Ts|Gw zz$e9g^X8}3;OLtUwp5XkcS5&qbDO@y+v)X|pZ^49rhc@F|AO$;FrCs0#qHbOS+pN2 zskZKPTJYbcq49Hc^Ol_piBj)AaF~{Kr|jKF#l2U`uwTmgdH-U;m(7$sd`d5#-}@w& zy%vUOMt?M)8Qj&`$;-uBGVI))) z`#>NOXiF%2V0f;ATf4dDn+(juD=0_dm~v7@p0)|kp2?sw zeEpi<)6>(X$~1o*-86tqL~)dQ+|ew3LYh2y@SxW;sp`<4Js06I0itpD@oDPrKAwFD zpAj)|&cy}m9FSccrE-6_nAogE@SmB|n)E|~M{#sY`w|lp!YV3cktC1l>CJGNnP}Y> zZUxQV03DNEDMSHBmx%Z1aS8fD-BhiZUO#_^xfei)s9z8D(w_v=6Hd{-lLNK=1RyKW zB~r6Kw1odmK_Sc22N9@nvEY{#iORm_6s9`*kzpt8$%ltOR_@5^{=hLcUH^XPmaX+I z>hpi*Ckj3GEiNh71Qhz@A1&gLH+=B2)^GNbX{k1wOn1cGSbgKz?}=&;HO9KXjjWaL zU0kfXcPELhuU*>c^tOfPz)wnG_d%-dksE&fUaJs70ivb^pHB`26kUh%?99xDfjabYuv@`&eN(NephCQjOvu2Cd#a6Vzq3?A2wi z(3N_W1+@0{-Q73P%ZXV&r$7Xs5KJHfBFJ74JC`6kuPqO+4LOFJA3S)_g=Kh+)KEq~ z-Iwp)F?aL7fL3jd9LjMhfJs8)X{eV1U_o7-5*}>~g$Y9M74+m1kQ=}PH3C~bGA{Zo zzmqoZ;QX1iJb1idMM*~&W9{Vh`1AnjNrBHF;u9j=HYI2_qHGX)weH`-POz4n8F{<0 z+>kDtk|pekPKV86>QS%TSZew}otNjr)n*IB7T@5QBUg9J7&8WbpIQ4G>~cI8+|nHU@$}3uVh7&o~ZeAC}0)xmTjYOGz6xZDM3#Xu#$;4pmp^z6Z5C zQ0mMI1)i9}i1h&Mgr&3dwXiTMBK$U7*^HJv0xu~B@g3`u^)x_Kf8{to6Nil31vIag zc6Lpm@S(+Q#L2(F8-HKr{&N7x=0S(2U|Z!O-a70zF~1rp?WLp-FxmK|b}dDsi%}peNTz)a5-uPO^f(ICju} zsM;T{A-k@$j+c>)OVj1U#|0DMl(`6)hIOiQn;X?2P_3q>CgzP0xI2JtcVJM`ZQi_j 
z6CCPb8Mtr>-yi03zCYSYIYkm8>?xWbVkx+HZ#N#d8Cm-d{w=9XyzgM8URmC+We< zT!%a1OH>W}?aH69&wJ`Sa8*;iX-CdTnCjm1+iWt%HJ<_U@@qqN2UEVZ&&5 zuF;O<A0XVC0$5kgX z&)_g3X`}{y)JAdE0Fwy%XCssR7dOIUSgu`_V8H`(4cnfj~=o|>1eiKj7eCH6K(Wg(J z6g(9GpHOfPICSsMx7S`<{YS_(Vr-T7je)Xu9B)cNtWQWE(AWD+!_`!2?xI=r`SC%+ z!NDP|45lgAi&gP2QLxub7Plo@glJ$r`wA?Y-@LgA@7q}zoev+Tz(*T}-!!8^P~Ya6 z?+ho3jgk@OKcW#-`U&SyQp6)EtNw1WMgN?vM?6xQOz1wirw=1@gbS` zcmDnLp7sHnA~vhG+xv4rK0MA3cOJMStP*&W-pUwuHa0waLg=nr|B7zjet5#L{M<#8 zr}s_?`7Lh$F%+ysGx;j<)A+*^Dr=i_ayF;P%u4LpvxmeA@+j<%8=gnd?`4BZt*mUF zbg)3Avr{x8M!K>4=bp4QUoo+Y&IBR$6#4AuZ{9GV7y$hcRTPXkwlq7Nk7$`aJtuc+ z!GL-MEKg}L?2P`RG?R!iR1!e#$$=4Qb&&d|5%DrXd1+QqE!mkc&v|Mmk39anrmdWI1U2owy?=1z$j#p ziR|={46ko*ye4~fCbt1}9zmsqltuhy$BrE%ekLrD*LWe~WkEqRYp%w;9RKp#(G!!Np6+9Yo*RsDq-q2BQS@2aWgR^|@i=d33jV}zuNDp+eYQtW$sG>h zhq|ecdu|Km-V!)@v}*d4$rzFah!Dq*LR+_NL5*gxWsc)kLsY?(Onj=y^Z_H@-ah;Sk#+A@RChxxD~c-be;z!*dhX~4mcTo@1eSZK8#dKUWH zV^tQ&W<+mUdXk^-lJ@1@lhZq)m>-Sh?C>29z6xy-{_d1RfwysP6}q;>{ps4d&p%6F z1A+sf2+s5I9Q}Qe$iyThmYbt4fLb@a0V0f;N0Hf`t!wo4^}GA}`ZnrJ!uJ6$KWv|d zpWH2~{p#_A-`?f+$w#vbo%5Lqe|&0W?T8&k^zuxslG9D*{pQpxEO(4`%*|?Dr_gOfb=nE#02*P;+V!KB7w`#014W$wq;!f#vXo!>1^LwI3fpjSSCoLvc|MKM( z@DP--?DX{Y*bc|RlAPW~gVBG4?gLb8#mYd8IIO1^`{QF(&og&x@SEUCiIojt67b8U z(u8FyH8r}nHss>% z9w5POT#D^z5r@pSIEN%9C3RQ%x*nbvj);u30-x>Y@9%heR!3XAWH}K8O-c#}wunaI z2W$+WZtnTdkDukZ7jYZ^zIJwL@d;_QKxS-|ye3A+r8r8E4B!X&tGJYuyuG>L)#^?d z(*LJ};T$AWKD`?iWC|YiCSKs|Kk0ppLwgDW_S@OolEDO@nIRp}R_jQ+j>bZ$JUJ1rQSzatHE$v+%KVk_v0{xG1l~4XcYI1!15p=o)3WL6&zH|ZUj`$32 z=|F(3U76E@2Q@#K4|((sHeTX?%!ijS0)iP7FfBO1jn#YCE=w0}XtOsJghMSW!$ zx!?@6gMSH@#4RX9#1Dxs|L5=DSzUh8@k|5$ojMjy65){_gNuX zq1-46+DgfrJ-89WRghcADVM3C|F%@9ntM{em}d*;mgd+a*}7ErL;vLU(w)ph%u2eA zT~3?d)45wW==$-}P|e!oOXbQ`d~X((6A`eF2yscL@Ue$e4Wm{Ev=z za^VP7dQUzB;yzvJbsy1B^~0i@Bk|Q3l#$Pu1#>A-0LIX2QT^B(r_Kh@Q4@3OVI{u7mfS|L8kxWn=Ra z@{E|2R2I%co@pr!EE%9N|McdhQYr6-mZAxA%J|yffFl)%}(a%xrwkK_$@Yq~{~;r8skQ%*BEk zEycADH?D8l%d6L$k{cYzoG3bRyex0slP5dYd4mOk;r|+9(CH(C5JYJJivg%1KsK;e 
z9ytF&CiT-Fy~ww34G1VEL_82-vSM&$ub|+6$^<^3D)%0YEr1Z|qxs9dM(njfuSbyn zf!?yvOF(?2-?b}0^$%(srDWFwxSo;N2?v1TC@k2GJ9pZUZAzjZp<+weweOacyoKKZ zWerx!60enRlOe3UJpu|G^5m7PR|(DsRS(jQN;3-sSVwhrV|>=kef?(liRzng-*6E= z3JI4%Nr*$}3Nz71$`yn1I+|~|H$fBKKL*3N^%d|R#OnZ(@ALQXZD^HpjEgdQop&pl zsAcH~yJ@*u?Tv4RW+;x|Ambi$*v>SVFY!dg#(Mt4vvm$3)mRc@Kp~Af#DZbDd22j4 zbVT%`2{7P^biKsb%mgAHoCtAdLPZ0l&pQiEzW<9=yKMZ#a z=^s=XD0cfTSmJWG{1BY&FlWtzZ$i2!PB@qhnP7Pzd;P%s(YrfI-Sm4$weQ|O;*plog&%~}cqQwCgQ=~LY8xuAPPR+sF-jwQ+aKG9Wy=pA+S z>Lz5MH=W{7vY8hgvgG3iTJa?z!Gerei@-cnm#Jwo?xdHF$znR#bw?x3T712Z+JQiwL7D8fm2$~{1o z`qD$MS2-tl*m^Z=C@c$qYarvhjb;Cvw~`-AG)@lglVF5q^1C^{^;6Z1q~QP+t>m() z2{huPNsw@6@Hy%xrq`-%BGsQgW*|TkVr{$v=`L>Dq{j?-*kE2cgh6U%t?wV2E zjPH9>^~aHL;{^xk5V+YL4;KaCAsqmdjm)?YN-tO)=l5{EIe=eAFmdD9!Z_sq?ae;e zlQ0~kVdYz+4qmi$VAUhx0q`k;QVaMF!ow(v9hq@KQ$sQyy7S~ifzq&UY}p|hfpBsE zA5Z)-lxBZ8gg^$ogdhmpjmPX~HJ}0R&MyaVBqfo_ATr^Au}D=lHOKcKK74TKFWiIf z>>jWLY#uICK!f4ud5E1d0H4 zwWl4b`cIHM3f~j5$)kP(ESd>u|B@m!ChK-w-g?#vBwJOx6;m6;*X2b-^S{ zpGXLoE?s(bGJ6|H7K~^V!5j(1tb-GRuIks!a5_5J<9La@ygX=E8_mkjSrN71`@3tP zVR`k^;Mw!6rJ01C;46kB0A!${7COAeUT;U3<=bcov7Yv^(F7m6G!DiQ-JO-qAcF zPvpzW%Fx!bEc|44Btxahv&f`VgQZ&8(~|{t<78yBx%n#c

;{!0lhr8ucDB;|WE= zYlh#_0b>vF(({6bMsJD9!Z1Jxq%5zQp;X`*N9)^|^&xTC)YLRM^%kypxZqR3D-qa- z1)?bN?o}Pq2KfyU3%f6>_nzb(cwOKz+p^>4YU5w*t*FtUoOL4-^(>3 z?nHtBF`=|@WL0%7nXa44Mp}7Zy~EVN=u6?ga^kVv5s&5jdjdq4CbpM|_;6A~`}!F{)R3P>z#{g9Zv@`=K!L>T#ZByDDO?)m`2u{gS-wmF`p@96ys6wu<4(=-x5eFt0fPeYR3S%QC?ba zk2gFzav|n$t7x(67@kSiK56FN%8(@aO;k-f5k61kS8ZfsH#awkRP>nWfh2~Ugq69M zZU4MW$=N)%%6CoZy{q4|raQ|{bAt_SGNv4@u6aBRtvhSBfy9JmTh4JzN$reI;X>sE zF;`+!?jzA+)whUeocGDL2G-d?V|xFDEud}(3yVEXX5?0HMep9O}MYO0EoYd$er zS^8~>qHFaM8Bx`t9*iOIufg*^^gT6h`MhbhpV&UlTc5D%4=}_=0ecN`*aNI8qlbmQ zHIC_jz?vNxd0?Um)7?7b+c7*b_rAryLh33895v7@Dm{O+<_klMx{l7fij5%hk!?E> zg6IS<7+P)Jw#^Pqn%CFvi|BgEv<=*oXllmD;3G;I;nZFg4+j< zi$J6z=&{}1*ikfvVR!G|HQXGHAwTd3=)1UM8@^9Z>)0>Q)O}hxZudMB12c+Apko7oZkS5pkaf`#^6tU z=&dfka#v8q|4l62k&Hx-4rHQ%t{#1F{l|~H@M>hp2T;#@fef+W+pk3;m{j35rs%lT zhVx16R%9%-cIjbZ$iq;bi!{0BBL5A#LYi1J`faGT$}L2D)>?|Cvu|Ylq5G1Q*t(M^ zJC11U>Cu5KtX(?lD>VhTv-Vx-XsmBt!oC-8aWTS`H(65dMego9oPYQZGcUA61U)7W zY1F9KKw}C6Qm7Vf@KM0mdA7gMYDVfL6ntD9l1;AbfYJ$+uZJ5lN72r(%C_qKDdGpK;lA=L{D3$G>|>~7X|66E1^ zT9p=ieWjJ1UD>B>-1X3*j)@YMBO zps3mR@lX9VR=0n0Oa}2L7F+>;dSAu^$GoWt^aV|cs1qT@fHwwWxfS?*oSLf_E?g)t z4w?L>{O_;ZYV}d8;5k+6BIZ|3kxUzb962zDjqb(`1#nlG6c5i8(vQI(O>T?e{mVdt z+u!x&%a?^N@1-F;7M7OU!aQo)2a>jJZS{{5x*a$RBwdG6(AkNBNI=tu6FUy522NT| zt~d#_Vl6YX>gwmAL;ZCY^S-4{+gUbpa{ioRS5m45 zhBq9gO8tT-A`LUvza9JYPL}?3hH*q8k8l zl8ieyn36yxrSVPo4u@gDhV-(;j|?l%T;BHIm<}>99=;htt>g0cAp7I}`=vq_(X z;oAZWI2jfWfNUjRjIuIbj3ok@L7;n4?(^=px+G1m>DF@B7k_^r@O^SrgN1(S28PE$ zF~F;R54C{^Na&L&3Ev3D%HiE(X21`>+)T>lTnziZHEA&H}!ofxW^pe|enD~cpX6s4K z<|-<>4jqK}fFA58W(xJ11DtGZRDm#aOiRCqe%C5Le`o6Dep9A}M@ZNos{I+^Y{Y=a z6^vuzr9jG^fNv3FNKj`)%f8M3x7;*zFit?7FeyL33qYStoIJ$CK!_3pPX`UY6_*zf zQwyvgXhIH8t_tZZyzc3FfSJ#N{psT%=txc>HXUdX*D(u&!uSQwttCTmHxG{nTtNXL zij2vDwIKpMF*SgJ*s@dlI!gKOICVfDJCq0FRY&hmnjoU<0>R;Ij-tU?3YV&=tOQnI zA(vkec`(NSF^nYp0;gTTq?iXgf*L}GcVLMH6O0;VhfDz20jng@tjSD?Tde9C*2jg2$`-PfjXYUBtxQeUi8VE|C>pu+l6NV7| zMU#_-3@(^Fr3qA3DY@VHL)#DErfG7KJK?)G0i 
z|GrGj_2VERtBftr)b;UtfHotBGJr=<@H^-<_ci6S>;bI2h!7{@*uW~WoS&(VFc8eb zj{%Au(oHbTj_kD!*Hs~WpiDEw$3!rh;M80N@QZn;jYrJLfRNvS=~8&2cNK`^SMhl0 z>t*ESzhKZ}$~NIM>GX*ph9Ztme)8pH>G_X$Ag?$b>Lt_}nb$2Y7Mwa?=L_eheP>Gj z@GuvmVcG{V2aG@@as(zUV&9(~5lYbfKSuxZ)!PICmVk)=mQF*uBpeI)AB^m3V~~@! z2BN4iPwm->fXJ(=!ve*Do+|9y2MZEQQGN5ZeZa$>_Ec5JKsM!z{f?HqI&? zhs<-_e$t6-v^4d!>%hT-d_V+vpkmD4aG; zLD46!{lOAPOh{1@5(tG)7x4ifc;kx~m&L4`n9TlIhf((C3(3g4iI7PUL^`=)<@2=y zHt2pG`0_O*))U4sm@Pr##5j#C1~l@Fi!K4U%;F{qtW!Ivdum)5fV(Pya=VLdwV;R_ z^e4`r!DyKc2zzL`#CVtbMiNB&I2M=GVi24#n*M6|@uX!9baONGPt@MaF%3I7INDKW z$l#?%MGElG0VD`?&FDP8U?}!*mOk!MYeV1?PlwJONr9NmH`%Y&G2mb$Sy0OXpdg)RZX_GJki0=ERf1}hi;2Cl@oFG{V z0$)CV?uU~DYCXU?W`FWeD)O?ztqN8geij!J06HAoeZ%uu^Kq_(q4-ziy=vWIX^w5+Y*5JtHUaSh9rq44 z$kFv2K46bUZGbY6XvgTgL=Y5#c5+Diocp zA(u?RX|VL8YG#g_0rQ1~kH<97uV25SvU1y+{~xXiZve9NTe~~%6oAtWxQX0Gfb312 zUpSZKY99ECRN&l2!vbGl8+f|`cok5J)2~P!$@Bvm_`gkk9s^hP9uaDcf`saChej0e zL)@liJ9Orb8+(ors^b591~fkN*l2n@kQ%BB0GKc_@4%G^Z53j85-#fi!AwRh$u%p$ zU{Jf*z|Dei~Zr9)1$nQD7-bG47&Zc9v|4;Ff#{BhXC>(^4l92 z_@+5(|NqBi1G2YpDkNbXFksl!@mD&cv;oug`n>`tl(Y;pl)q12}w5qmC!U#T1bDj(<`^i;OFMpp}Ll zO7njfz`U}u+c$|YM=rd|+lko@YR<}|pbpxd%G+@TCby8#WDQ3hBAg6{ED7ep<%m-5 z%oT!mm&_;N_+sUY%FFwP4uYt9-qbW6-B68&e(&A~>YcMjaD2cK?&9Q&S9kkF@qe-P z-f=zm?H{*2lC#h-vJ#~T$xay+MH(tnLW#<0B&%(OkdP#VG)O8UA{A+9kc5abQXz_x zG}QfkpV#m4xbHvi^N-(kUap_|em|f0IF8pkxKe<=wcpiJDz4hF;R%FKT~gVBd*|L( zoimGmZZ*wE{?wT>MTzT{fV-T?Rz1zN#o5Y(zPZiXS3-72Q{70eWX>OxOR-$Ev~B0K zy_UV7Mj@1wpYK^!tY~)6t--BMore*IG=qj!D%#I(qy&o?#z&w8q1br`e ziKS(Z?4Nf^F_1jrCMpsh^A9N?1;KzSu4>Lt0R-dV5p@JBZ#;o6eBA-so>Te$cAR#? 
zGfuN##2dDYb-euIYvb3wg6a{z5o#=9CJ|sUm$|@DJY`=NA4A~+KfcH_ ziUOU2?6SEkow(SJ7dF7Za?c_p?>b}(fg{x5=tzH-b#FU=G_Z5GZUci?KFwHmm{VLx zt{TOCj-ut_jPw}@`&|GAsQ-RG7Xwo1iOKe5gr1PTQ03f!0>2w!jvaJ z`8+Npk|NLRwRYCKJAv^ z?iO`jX3vT?|EeY6Y>+Zg^JAf*u|;c8^DJ8QWYGBnSXB{i8sAXTr%97knoon|XOvCq znxpER*t{Y~b@wm3s?!aX0g3I4(@N=IL!4p6tz= z5_=@&%dXWpBfr*lCM^KP zfGB~4(3um2z$oOe8Hvy7bTO4MYGB1>PKPw8ml08hw5eob3jD+dFQIF*8ZV8v5>Cdt#Y+?I2hP zP*aiEy286@B$z_6+Z2k3lcLOQI)_SE{4w382#CRy!qTLV-ZJF1K&ojzM*_a?r#Tc( zDTw5Hg&Q=6LIMY5$-;iX&pw6DpH|veW3sQ`p+mi>=-Ud01avj5V|OMcCr@0J-ep$# zX$&oFp{6A-XA!!J~jcw^(c1sW48l$0-RX2lACgvW)k*`5j+Q)VCH^$?S{JZSlQU97$O z?{g6uhHdqMA{NPFfq{GIKjq})IJ97|XtDc`UI96!>M_@;v&^ z=<*UDh@{CI%krV&qmgK_+1Jwdss!cU&fRGPIJU;G;bdE%J8B;_LcZ-$y-M4L(}m&DpbS*Msg-zPLX{%oIy8QSh-*nP zwRBrP>Y4E_^v+1YZ1`jl!hNWgtF2$?yeKMaxtvk}>_Qnd)W&<#q)9K_dr`{qO~3|9 zru_>%ZfSY)$W_cIe#yx#>jFlz^)xldXpWeTa8jValvspCmlMJ@qO&%Uo!}n=fBY`g z8;j0PeSrKy;8Wr3jD} zxF-lROXRnK>Q(N`xpN=3EBs^n{;HvY!77@3A+vBQiV}4#%TbUQU6O|;l`cV|AzB9|36np&mGeL8`#5mjg z=Vp$!N!9v?;$BP3liZA%ox6WsoUmGm=IKTqho+*&u%i(|niu)%bs{+}rb|?VNPZ~{-~ZCx z1bjzS)@Ukc1I9@tS1Z;^gKwds&Zi@awoREX!qgs?lnkW55sasMgDPi8xA%1ZyvhkJ zHy-JH1Y8v5JT)8T@?6+XomY24&z@DGNCQ#707Se~-_m9K_H1N46x09z2Iz;D=!DG$ zZH_mWWCTNYicZLJ4ZRS=K7t$o``8TvOesV&?39-XVlEsWlr#bd1JNKj6xB?9dQI~> zfwfR8h|(O#)C4prTDI2z^Agy$SjAcj*%=^!2-YJL2-0{lNBHmBckx~UyG9LogFdu= ztaVYEk58-yEx=sfsaP@aIleQ;4DH=j{fmIqSq6p3@4sIWMl}=)%2*ISknPm&SW!f- zBGAcoFeu3BJS_*dOE^z@Qn1tb+=7p zj6&eBCr>UNZISD%YN3F>R|}H#g-ayN|M>L#28T)D-VSG!E><1WW4}f&#l6MFx6UDmw)T32k+!(cS(e9yCV{Jf zHLR!_qbI!Tih((=$b{ql3P}Y0kkf+sFJqPj@NX{7znmze1Xhje^-`s$sXZnPAy1-- zCfSV_l{+Yn>m$*ilk4WqMA~oyi)HQo_wFJacg!LjC8(cecPyj~!UoY|*<_$wcCORd ztGwJWO0)lT|8aN_9?Gxq6vq@#Jc>0I?>~6#lDM;#O6|2nkR?9?MLhMtyh9P%gQPe0~wjK%d_5p?lEmzw8FH$4;tSEj8y8PZnM)g zd{%x!+t2^`+D}i;vsFi0CK?7V#jcnmFwqH{6)}jy3(3uU9kH;WWVz?)N>!f49c9x{ zey=2GlEOe_TvK3Tr{abKjCuI_byN7ZH_he~CQl~q#yESdA+mFx`SJfjLOdm8&xFMu zUN|b8c0hO$IKKkqexjBs~5t%~dTp zH#@HDvzKx<%P0A*tSQ}dVVJ+gW$P(M)|PTMX>qTM(>tkJwyHTv{y+#S(8UY0_uwg1 
z1RS=f?WCkwmOXYV3Kc5K?`x@Ac6II!NGo#8&zN69PRjYU`S&-g^{>hh6+1Eq;Yp(_ zvG~^~uevZNOhj&>7cc+%)sWo0;cd-BrBFRMCk~u3BUwvo0Dh#%$jFyzf47pKrCu;u zJPV^U`kdvwT2R=}ONS6m^8~_z3Wg)OIo3`=qVw#XFJi*HJjQFvyt?&WT2?KqzUsUs zo@1}(S;I3!4|xs~AsbpdHl6!UHxAEZa$}7V7q!T;NL2xTK@QoK)i46J2O&U;D`u$k z?nHPO@0%$D)J!~o)PhV3Ko_pGLVm!pj{&iEoAb% zw!9mqd3lGRVK@Qzv-$u%OD40o(M=iP(4)O3IetLDeu3NHT$dl$W;Um^G;HAje+Vjx zc>!~D_hzZCUf1jOYoG7cpSsAZI?7rSKZab7jzC}^C@KU~0pv-jUf2;mOh`N4Y;{iB zt>sA_{QOK}C^}mQ+_aNpxP-xGdt@Xq7buWo4;PHNN~^%xQ`Kwdi*- z1um}xqEGN2bkC7IGv899V^ZvTkM;BSz=9+l7+$9jT)v2E5o&u4_DvzM5E5-l62;-4 zWv5tG%oP|gz?!(B5K4hX0x3(umxA-ikz(Qi82@p%MV~BUW@go@bnm`@%BX%V#XH=b z+Wv@v>PWAx)YJ9h3c-`H3bJM^{Us{X!P#0|3}i@cYRF(Rm* zF-wmsYsSE)i@;>sa#y@0`Hvqb%6(3|0b(Ay-Il}u;LR!*IDCmDvz#vh}+sag_-32d_xny@EaP+&XEVuI{J6xtQmN3=#heIULW{ylD9GL!=2W z`(Q-io%M|^{7RAf4%0z>mdO$jx>1Wv*>{Qogt!QFl zGv;f#!Q@lX2WowBLsMC774wU)s&{$@+3KkVCE!mXyl`vfI-Z3HxuHt| z95uVNC{%0s$~&MGsrMyw7288Q$-V^SuIH8#QZEs)&M5?W<@qyH{-*PbuYm`wk~epa zTi)5&_`s2uI~0|aAC{&3K%xLOH8gLUt`GpjEb$Zp7J5GYULe&9zs z5E=X!T0o-3Ot*}&FON~LUj{X6NsB41EN=OX z6NV1QqPsT@H;I1#ezxN?UHes~iM{sy^CGFVrFnNy(EYHvmqgg@sqafZg)3=l%7}h# zt6k~ax9m4Oz-{*#O7bZC!$DF|{)0i`wz#H|izc{MErL>HDQ9M8RwVvBNH#nVjAtOg z!0Y=C$P5M0yAP!k0#c^@z3*7FKZ5n3i-Gm1+qAuYZElp2XPS=QT=oC_$+hzGQS1SOQ0R#psj~$91Ml#L%6Wl$_>6T*n=V0z}v+ z5Wv8Z`fPUTVSIkKpP&9J|0DZ;mQTt#adO=LRIlM`YQDmb|L?17QntVDJo@?Py8c7? z8%o}qh#>g&KYssyGBFLk4WOC;TCouw>^VtSSGyzkk;r#MM)eRQdUa(r3ph63N}7h3 z5esl~e(9+)$9xQ6$q&hmy+~1du~)?Wu)xqzBmO&;=$dnJBJczd=&=&)zvRw$h8<8IvgNP-{)02(S=)LChfGKqiG_D7Yigc+etRl?2@Lbg zQED=76WwH9g5eO*G>>16VDKYlHqc{NoSs)KV};`ZI&!Gj*w|02PfhBzPqJg@&PH_! 
z@u|Cw&I}uE5FY%uscI$!>?xZr!?cHNtJxbeaZ=||X5VriA8Z6$u6$#Q;#^49Ii>%% z8QPDNY{JW@vYxFPMB6wOM<)D46!j}qBNF^O^pUhib1Bh;Oif-sI6!;9?*VnWIR zY?uEv)7Smj9Fp1g!8%mESz~9H4ox5T7qP-tj;M3BsquGjRE^ru$*A-vZ{h2=H)xB# zzhfIbnv(Y?#{9C$2mh7>_d-1FmsT*;z$iH_O$W_UC7>uFj-k~11j7ot40?9SlH!26 zu%N)s4PTu!+IElf*p}%0G)-q=$iLRV{q4~9r*rGy*;{5bzlW8n`+>lV8&CKdp4GC z=lD6uUgz$ReRL7mf>}2iB4uiyrz2&5}@82wwM z-JDW#}WRi>%-wg}~W@?O> zyZC+6`s2ovHb|~nb8T%wb)(;reXdn2pI^3HyF7(03F9KH{ip=QQ%7HXHRV!Pz|aCj zNlwDY!Yzo4O7)xiI=BjvLWbsJpXhBtFk2Jj+WGz@#mj9A*3%x(H6YkQ2+s1Fnne_` zLQiPLbRG6MVhIr~FSuuJE1|QQvF)46w+%jZ&^tVR;mWz-(p4KAnzavMDP)csxZ1@> zT0X0&sOTXfSk$}5hE^6TGGe+HzYBydfNl{7K_&WwW&ibN6gKA&LBg@*!gOp|9Tm`% zdzJ$*+A?+^U!djk!w1^3@BkvB1CU6B&p>Yr2R-YjvykudP{a%wu(i#|`6$s$UX?zP zXRYz&ZCi|^%`aNUh-?MQ&9UEV4z@0+Te|c{Lc&teZ&V@|s9g>ZnsNk&i-SFb`i3+G za&edmG8E%bUCC+PCSlm)y*}hVoeyB%!N@L{5d%WHAc~e9JEn>$5Cl$$q65W=D;b=r z&0k|0C-?C4lkO8XLAq1Kr9E>x#w+zI`n?sXK)e-JxBOC*);-5Vd+W4I&UC+j#xm`> z^Q+Y54UwI+wrE#2WO~ry8Z8qhKt&PFU^t+waiZ)$(X<=a(iL@gc4lBPe zd;D(raFf(~SyD5CpPTRaV^Y5R?d3i)Vfim07$X3OASU|Kh=APvaAWF4Hgt{e$;KSU5JgM8C5-5`{os&zlsc@j2RWL2WT7na+@SdA1h&EAT^S0VmYRhewpK=oM*{kJ!+~$e3 z0}WkSaNZc~j7ey{sjnF>c9=s(%m<<;6)F_y{re>A!DR!D`4?yHl9MB@RrD0R@K4o! zq6oR1uSDxdP;d0u$dP9Eo|-Q0I{Q87wvb-&iVKBeR2$x`G3yQ9wMBcU<-I6T?$f{ruO@$;Qc)4u;oA9Rw#|!l{`a9X=mc3 zF5UM81Y{ZgvEW7qU2o5oTUT1Y8EFtS1s~4Yi>WJ z&|&XB=eUuRE$in_ax{6Bwp)&g8Sd_PZ%LXH5CJU0IuV^2<2fdG%O{|VT8BNhSV`t3 zwn%!b%>I*(6$22#p+gnKCK@m=YV0P;<>?0FxeyJx0Wj7vEP(u;`ydktha6z+Q8Np- z3_?gSUG;A@vIYPGcr#$+D%4_NTBSjKW3So)XLAa@OTKE>pFtvuH@B3+?R*HgH1lYcJgUOrJJQ*tb|}ht8&-&f}#y{TOIvBHe{l)R?Vb3Q~Sk zd~<>DdP+Q6Uu+|cnWB?>J*FC=5kf!EN

!M+&q$KJ0iuXE|5MahSDYf z<;UWT6Zr*v4UWVSXv-*2h+Lujmd}U%U`@6nKFw=0UW>+EF1xFOa7l|2Xm+G)MB*^O#5nNbn7M@vN1ZdCw%9XnG z{RHyFg=#i8_v_GtpFZhSio|@CC|9&Cu|egH1LEkpb4P~t82arVD7jFSbK0UGVt}Q# zK()Dw2sRDj7W;> zMn*;>CrwIxmEqcJ_kF1FBvPlS?^FKR?vMh+6b3?Q2uO$X^bih2Rr#wy~t~^5x6LYr4vfUdGNAj%ie* z+LzN+dvpRUmfAnpre9} zf*}pRXZ}>7ASdSoWs$F3-rzxlM6B4>wwYed<{s~c5V*$M=jsE@_X%KX6uV-sdho=} zpXQ@|mYjt?mftJJIl^t5phbnCe8SG_IF+diylT;bQ@#;BmUqvs9^8KPiE8z)VyE$A z|5oXRXsJI?4Kke9>mST~)2A<+=lA!vkl}F)ibCQ4g19|?e_Mt80a_#Fw}<}51I9N= zSEurCzj81CZ_B{#D;F#P9aNvlFaXH=C=89!cHM<-3{cC*iC>8Vp^&90HZlN)Ul zXZks6k?*R9nl-6DaGhy(Hm>~!j8tA{HhfTjN13!BwMlYj>(=>Cs;}Lz5Sn>8B@0{?04hqr-(P_&Pvu- zKFIJJ81t{8#Ips#<3lyNo{5N%y8AH*4Ia&lNFfC65(99x+|p?vdf9mS6e(=qv`GZg z)F#%Vgx2zEmO(%%HQ83NeF)$zVBH*d_vY?f_1p>ZQ}Tn7^+1HINM?RZs%W-3z#in}^aPD(*e0RG<1FKqA`cxrXxIAZCqsU0 zFIld1i2_el_~T)j(U{M(7n1|{9hm^n+;Ff0nIKg_xz+2^mOOQ}Q9LFjpjRpzH>h{% zGW$IRAI|4qP{6<-41SBbHIv8$4n%RMO}N!GlYrUTV7`c}n^#ul8{L-!N<J#efP&%lBYjFbsc)2mE~p9Cx8TxbxQYPrGN4<~<@6go z_%Yhg#HQ^?I>k&eF}da=8Voc6!;i}&I3qnlck^twb!D9f8>Kn_D=@0Kz;6cic{Ld3K?soI>8Oa(cB^bNrt43ghV2mNSs;e~5t z-j`E!PHlL6eBt$f6gteyzUSuzetHKKjke*j$5Huln%%G6^t>H9aY}K&&?pWFUUDn8~>l%47H z))g;@s^xQ)l#L5KwZ6T%G{L?&ru$gy%F=m?g(qU8ZI|1JZgtITUm6lJaPgm~VW(Z1 zTBcfAJ=mD$)a>LCdA)An04JLZI>W>S5;*Gr(U26iAWq%T9&!U>dcXo_I!OK)ml2gm zQ2oR&J}VJU5v#PNuzr~IXfq;jj|K%r8reW$itG-g8iXW#`1Wm~jm_C7Z7g29(T8s< zjUDUH;UPj8Xe9j5v%y zco89Y^2t#J6&^*WMV*O46T!+uW~-pN0Ioe4Bs+ru67EM0?^aQJVrLsgV@hQ&LPSv| z8gdVdtjhjp8jpn`d?1d4<1K@-_35X>7_t8WvqK}e2RaC%X%-bOUw{&X=&QW#XV#T# zZ~D>R9w)K4bB&|S3!R-^hV?tp z0%W{V&hjl3l$aPq0y3W$oOCC=8xok3a*AF2Ll+x1S2hlq6D1vdjF5*> z^om>`@(4-XmP5lUuA}sHLBUWv3+SnR{;I7whUsW&Nn?L(W;!se7hON=`WloSPlrPa z-Q*z@3WYEvjgvbl8#`}Nm^ssb(GK!KoY}__s0fMt)2ZczPqt8Aw{Lz;%sO7 z>;0;8@ECHF#8b+ShK6R%0ZdV=8%LGPU*rgxs6?AUqs-Jd7lHWe%3{A%pQ55QF-_nMe7CRB(M5V}Rc86Y~msh{=rYA4!6p4#n$ zww|}<9}7owB`Pm%uPe(Rk%^@Yd5F|X5a~O9UbJ-LsFQ=1(^E5D-lvjkQpmID=zdV9 zYA|gPv~wQ*bf>2FL*BFskvuIC*9l<*nm~^)pf^3cpD{^jU}rI@d5D#s;<1&auOSoQ 
zsiM8>3vYt(qo;&~d;AN@vXQ7c^G{DY#1F>bum_tNx1eZK$=DED97HsD?Sjzh?nSgh z@2W?Wf=my+Qy7RU3zU^FnAxhTY#3csUyPPJOR2KGa(#4WZF_sgJmp!v@7;aUWc7f& zyBi*{!N%%nMMFbFm50pOevWMdtS08VtJYsIIQr$|$BVsts;Iks9@=mEvfXIrs83j4 z*8%qV)QID{iEAGTEi^K8w;==ws6}gbblCs-wuMgK@zVAVc^x|J>|mff+nf`fvtD%2 zl+MDAhV2nXE24B9NS!f#m>Fbw@u`cx+9~oOc&U|Uv&)WKt z!EjUsA{j!2vFBsvj8jT^TDnPABC~pC4_#f`F!bL- z(#;lp#t1n0UojS$lO~E+hN4~r{|F$`EF#~;5M?|Ng(r>sxXt8E3OEuZ+YD}yq@8?J zI6_c>0pZaNA6k4Tx}sIvTf!d}7YB1uRvfcniSC0@0>={Odhr~&r(JM5aH1XZul|O@ zg+};SO{uloWx7T&;+FBzgc{r&=#>GQP#8A1yGTF;E)&C58A#ca0&GPZ~`}I|)73SOtMtWFST9+s7$zOb1=eopb zK~aV|E9L4bujYGap2RLkHn}e0Pi3m+;lrAd%iI3U4Yb-`x7z>3&#mfb<>bbDCh7aB zKkBO-06~C7M9A41S4Z+7AlmoDx8vTXm6ecgbOXfMKdU8v7iu@j$RU{0Qkzbql{)}H!rux< zg^b@cTLm!yj;Du)4je(ug+i(VjyAXDh>Tl1K&uA+9b*0-fp*jUw+{8F8ob2fZ`RK# zHtK_{6z59IU*0)&Q*DTK(es3a=%6(rP7X~I=Q&%&>%CKUbZk89@ucysM^?S(`Kl%@_O%$Jr0;S0hr&CcP9fbHG#T4L zW5SSTF9ggL(Flx?e4xz>uW=Gw&2@0N8xbnPD+Xu)hkqIrN;P1~w+Os|uo}GG( z240Z4=R9ieZ1x$t0<@Z^nq1wmWQEs@h>?pqU9GOI>`2;F1e;7`D$|;W-~Ef6MWAGi zKo*8+){^jSdLs}>(9USr+-~*9J>?t}-VF$K8Z!~{$3YbZK}dbJi*+{h3^wEf-6|%`XpA$iAuBfAYcu-DYkW zyKdVp$HvV!F1$J5>wDnS-<~4{)*rxR|(p#uyqJxf) z#!jzwFwuhl+h5x=>H4p7r>WWX_5Y=u?*Q_wc=U3&PzC8=~k9v zU;}daNR-jU<__GPll?T`ynZbNgI1S5^mGk2+ypwk!mB|Rb{Cvd%=1H7atin2+)Ewg zGr#ZiaH&3#<$Ek}M8eA2+iEGc#&hPRm*%cLu4J}bR^r2#Zru-E`<|lk^2es7*XKl3 zmYEOitlN31T8wqPljcPWBhx2)LacHkigUG0YqtLreA_A)Vv&=`VZTM?uF*9L;|j8j z+#)Yr*vUE@_#)FG_hjAbm=iL%@0G@l3qbm$Wl?_8O+)Ju5BBXLHX5LOgI942p2Cj8 zz(+MHxC8nd0Bj*9b=iFFX~zu&8gLrJTb*8Q?AiR~(x^9Ew{Ks|#&`HTL971MS-aUNRe7C4xbXOt?B=8PE!A}WH*DN^{7kRBlYTefrVm*pt9DRs?B=+83rAmvZKP~Ie{g@`;rtu>P)@)M zaHZ(UT{`Zj2yiLvz?|0=eg+}!nyej6ntLcUTo^6J-0cI)VpA7L2|SuKg-ePr+S7I# z?6rt57MfOc!u~6cl5`7bN~3-EYWA{^Husj^y#MSwog`Xg_h9qM!`yf4>z;bKsIuzN z$??AXd^W@#xcV}FoO}4oxUtJJWj24<{rOV)r!EuM9>fh1QJ?!y%)-&Q)%1`y7FH!4 zKD^LRgzrG}fISM?3{x2R#D|%~@K8!;O!bExt#AQQvHtA+FCNbKk>%e6k}3Rpz(NRY zMLMAB%-N$u=3Sg0bb7|xcZe(s-lG2JL<9cC&F2w8iMjUwt__L7iycJ10#oGw$!c0APWGEYA$LKv$^9;f$NWx6P% 
zr^D)83BDVFQ8tPWM>Ha{eVU&ndTjr>UXMQ&Rf7mEEBw2r<<}i+uiIPKotoS`>u{eV zpQDWCXnomsNsg%VO)k6Mj63E1SNHk0cJD$%ef`Rw0!$M4k6QdavzK#^9dlbFjE@-e z=0%!?nX9^0$KEEre58)fd6bYp$~mzZwj~;0y z%6z$&m7R@*U%R}WtPKFGGxxd?5!lc&`ODaNN6m{yU)Fo=GJ5)DiSDaGNle2b~O z>A{r^Umqx+pS3xm$GguQ~Nwhzje%dNW*j)j?siK(sz4K*i^-RTY>%fJ$X zr?V4=93(j?e}qf$LubUPREfAYF8`bqYk9~vel0@*MpgfVgPwly`qh9B$L25CQ&Z|| zxXDxq7D3KL_)YR+$IY&Mpz9rjXWSihCd6;c`}A8AnRuSEM{ew}=>uyPK9rQ)d)+>- zx%^m@l<-<$;UX*Y$XiKk{iwYnHv$@=l9^Q^Y>is{1~C?-*7_+Kh@o_;WvdN)$6Dq= z!$Id?JNwU=5hLoK+ih+nJNe$PXA9u1KI|&9LB2Py#0kOzQdyB!3p#kvnTFlc-gj|_ zplpJ-W$t^p@^RIz%{PCHKBUMSr8pB{N#L4?`faU?RhrAb&E6p|^~^o%LC1YkX)RmT z)RJy^9m!vuL!ToiqI0cKd6&Cgec>!7+++D5ERx~ku^JOr29UB#3F4Bxq<+oT5wHH+P!4gVUw@Wh;5-I=Kr(sLQ=DGNc$K*m%_Bo zUDy>!1b5PGxP(~7RB(b}Sn4Y+y)!K>P3!fj#cNJYy~=3@Mm;Df0aY$*^;2bKr#YTE z43N3LAr~irIEsbs4yg+QqyF%X^ym+HOBCW_=ZYyoKo3Mrh>$O}LUXyqWj7bBFRH9@ zSQdK{l>s|M)^^8-uK5k_uacL1l9EzTG50D6-4madnyhZ2-)+^^akN_$3m-8s2@HT6 zn42Yv3r1kG4PPxpZsDze2SCQGg+_~(lo+2oTzK8h_X zsufDRaz;Gp#url~7XXCzC&UP{SG9VTjtlSvnAP0{`J@G3;z-G#KGv9dXs;4S|Fc63=JprMWn-{q7ktND%D|w3acB$Kn?gro|MJQhsQ>* z6+$Ie&0^Tpy83!6zJZulN1;HHI+w!#KscHpm1M5KO`*`5ukjF#*JO^WDKVEILuk>JoVZ$x{>@BXU1(=rCq<%Lr6Z2ijFPkAmJ z;4r$FMU41H!uBnM)lmMB%Pf}1q!l36twLs3MgBJq9*d&d6LDO$0@|p7fglBkj~`Q8=sc)b9cFrtAkdg; z-@92nCmT#w=MJDvV36?%;Qb$2gEk&Lb0$yuyk|OvAzE-v3Yt%RwpPT6g2)kPKQ%`o zGrg_q=R_?`s*b)=TrI|ZQ1*&Y2C&Hb9Od&OvGlN(LxDlH* zLyiZ2c(c*BdECy>($cLl8XC8SV4pD-0sKYGUZkh(MmXFTcu+)s3xyjUCJHBdQZv>e ze`t2a9;jQBUs<7VX9ef=nvBMQ47J@N=%uB|$%kEOwJ5efw?66Vw1rk|a#=~fb3 zL3nb7zd$%x5a%Id7`kIE2M97qaH0nw*0@5WSe;OzlAOkz65vV?f$^m$C8B;@%P{)P z4NLwj?)KQmr)~63u~hSXT9#6)KGFZ+K>>gBm3BQCrWx z+iY`}84HeiJR^!6F_vtdCANRjOv4xCP4%VhxFjV0+r<@We|l1Q{Aug z9ncFZP8{EdN;k}&-I1$^bA6_4;ru-h?jsNrQ^t{rWRo$9WR!v|*WIx|22q7$Vu@Ml zTH-3$J=m=4LGzCuKR$;$@9-@{bW~_E^{q33{P+dDvIlmztxZg3d3$T$GM+kj!Eq&# zip}^cTJ68a&jWN~dD9e6L#KE6mBbfj3y4;hZ33hwdK~*$S2U;KFXKy8sS|naa_eH| zAJ^TmL4BZq)Mv}18WlrQY+F@=8iPOH>F3`KEMaPl^CZc^5~-;xH^hyaGNtkQcbUNC zQ%u(5rwZL0dxuccHJigHk_Z4Glr``rE$(;48 
zHG4RcHK4EHnCFv1_Rov8bVWYSu+W1KR;ncaH9WqT?*{Jttt{ot+qX8Ue}7xA$v-ul z%4eW|mTuL3==;z`^JvwtzhOy!g!G8?PSIB7HXOmv<&OIKdhPiMTo`C2cUFhy)@lS~ zA{n~FSrT2dm=zZsoGm;`n3~ub0uvIUG{@G<8lW*f@E)+;*t3;(43Hs&TPW4$#q->^ zM!X=7Oiky^6LPSo;?_YxOS(;xXOjjP#H|SnX2n_}lV4>Q1FQ%F>Q%7*>z3&x5HJ5) z9FrgJQWx>`=-;ZL!yn&pn@A?(jWx5^R6g9ev1rt*8}rOf7M<==U+EX6jFPX4Zxk@| zr(@q$V^vg!F>VzyB1FZv)DK<@N89+x`r|f~&cIAmj#M@Z!jq+So$^#nI)HPpZ93PY zw+r32&Zm)ezYFy`{BixEy5KBggs|=$JV-Do^4I`UP&4b`|0P|qZ7-vEEq&o zDzo8^Ab8`D6P|c{2BoflTSCjZ$}xDw>Sk@DVj;43!OXfbixY&tgfNQDuJD6qWkfWT zp)O`MiTEXSig&A&4upif_6pa61&l0BDcKp+xvs7bVma%PFunJXlw@gq1Y71JI%F9X z>^4ROMR7qHdZkggEwOd;6{s$Axv1f+?r!-q;*Ba`!*eb~^XKlyXQzs>BWKUG$%rHk zPA+a6^XS?)#PkGcHy_}jGBq)|Pmv%-&>#9PWA;{z2N(4YbAm=U*dM#&yQq6p)23f4 z=CP`V!!|0#dij*-pJC)jaj3vtIC%hx=Ex?l7I?w5Hn% zudZo-yUo~Ue#xc$$f+}zbY8lIES2_+PQ1O~_Pbt$g=r3aJlA&z^b1Ejl9YbnH$VlB zW%s7Jxw#?f{rLXgcH+y$6-0EWA{749!VEK)|1qYL@p} ztWO?u>%i17rfP*x2rWZdOuZt`?c2B6I)g^oH8nMTs;UaU@~4oy1R80PmCr&AswFPWjmRHMoq zxG(VLe4b0ow6j1G#CFO7(|FCl>Sae=`3A@Boqs z64VK3o3dL*M!cc}hLIZAEhHo)hCI?aI(ml2L|^8&gdf<}R_l`!v6fC^Bb&nv5C~zb9x?Ld|b(9H;O! zCverDO?{dU2gs+*JJQ9&gOO@ePv{jUP5Aakxpk!UqMG0p2PG{+7wN9v{=M&!pBa81 z6OFZZUA!A`I%M{ZH9wv$_@W{cu%yQRRoh&UP`V8RafkaEcM@NnZu-$~ca2OzEU*EWf`N0y3v!6xiBMIFN>nu9;t!ej zMu|jL!8rE0m~mhw^;c)VQqOmhD$k10H68Ot#-{1isbHq^S<}eAjHA7$_2QS*Ur%Ql-EmUFSi1`W(w2YP+k8^-DMoF^PoY)wM_2R#hMRDu^sd7wsHP}k~eeP+pcX@ z+fpMLtnT;SB3^xnc}U2d{=>Cx|BW~3H`TS9zW+1(H79r8aeHCba#on8`N|x4G=@X` z_0i}`iKhI`n=|y_BK3?#GQPj0XiPw;PWPA)w1g$ay>1T}K&$xt)JG4H(7W5d=k^+R zPauZrsK%V$-1^>ciAT`rK@#hOfG|oNV?>Y|?Xo;44>uD!_k==AvJXEV8kc%M?Qq@! 
zmYXhgKAnh@mCs%cJrk2*HE+7@F)<$dJb1>o{C%yh%4<8z=+Q8Xx?ISQXtuHtb%`q< zYffiu^BH0l2z@{2A9@$uMHPM}sv6tFiymCDxVB@$!ZV{33=H}(>^m;?=wQo`d1!K;;l4%@t4H29Ok39P?j7jK z!Xavavq;+en~@nn`GgDW{)-n4I;&PLcvd>4Fuho!T|Fazp_cUI^?kPvm^_(D*x$;= zu8|orLeEG)T>pLf2!*UZBlgbyTBB~Z@@uiYtId+Q0||{efgiSoz3QjDGFRr9cKMgl zYi4Z!CM~s#iJUZUdl_4XVgyLx-!52ghSDV2doNim-VgfDK$mw zW@~FXKFEj6Bi!-9R~~#CMs`5KSSp$_&w3;1n7xZGF@d6c%*AKRat9n&a-nh;iKS?5 zMMw=OfPQ$Pm^rH5_Q#$Q!;QU%>ag!nuV00W3%%!679uNKgX{!!)_%>JZNahP_T?2U zq60$=69MP!Fx?g`9V+$#{?>EN?myT1r&kq7=^Pk%|H$AEbq=cnU&c#iAF=8yZPMJ& z9es-@LE)Try%F^WCiF&+MinXEq8N~qom~~y#nQo$Niq{cwAY`0^ww~Z-0$>XU&e15 zmVeIccA&emgzqUCAA*ZBw*@HlQ8zWcEi>`P{*7MsQ)R4n4rCqTq6IQm5y}5eYZ_uP zWK%DP-|@?C{2}vu4D-34JTd;X&$`fR-q<&~Pm|`Z+WUoeCbeBVF?q}E32P1~oq04` zN@JwBwQHgWe*N;r%6ZNL3M9HEk*f~ zcenJ}I`q6(x3o41-@(Id-3?l6N?vSvlbSlmW@WYIsx30w?+z7RbAG$DwJc@Olk@Th z3ijFQ7xlL5=#CkiX$)gjcFzBC?3Qz88;qTHGx;c4C8yy`-nm;h>uH(Avge#c&mxAo z3SNTtc(IAeP7V!9M}Pz|ME=gZuRa|pr!dEP7gQMMTDr-Iy^U;`j0FcPsfv(c6HFJ% zRem}!R9Sg~6^Q^{y35Daj;{>GJC0C?mN`iz-g!xRktlCE#Qa^~Sn7 zRe3wRc0G-Wdd=Ve{8Wr#NX>@}H6VSG|bJ1e&hxrfa$xKFa z;Pv@0GzUm3F0-22&vE>H-(KlYlQ!$UeV)9vUvSgitxu!;H;t)&(a%q2*u{w9!{;>r zogHm%@b=NreOdmQf6j(}C=bZ=UZCc$U)lDlqGAH`NwojG?@Vi~LXSB0p4Gq6|Y`BWG`hpPReL2+s)H!X2r{Qo9(hyS~ckMhgS)Dx~k(=$jzA_ zyUfo#Btpq_#MjR%W^QLD%f4CnbXG*c&ZD-$e|v{qgy{PA_D&$U>*;%C_-?$J;fFDX zOW4~=YO#LQd?f=r_86nE^0qDPGynJw{co>^C#>((Ic)pJFy)o&$DUJ@ll$0o^ql(H zCsI0lJrb`q{E!Zelvkfs{W9I{MYUe#VcT)PB(GTX>$AkzxKDIYW`@I%E{V$`-q*Yx zoscwr>)nd0C%1Ex)1y&N-#|@;NI?Wm2#E__gTUECL;o&q`BO0Us*}@&tqu3LZjzet z`1j)}*ES8`KWA>wxFu5A7YFvpd8Adr7?Y1RHEkKbp+kJHj#+w3vRm)F+n?8e)C;@$ z=4yz^>#bX5vogZ{ftKi@}|!}JARm-`s;Y~h?3Ydo21EZv9ZZ; zt?(Y1V`-^4az~crkKg;&zuDWN?EBb|UJF`=F0Rq5)X`lNb?U&oTSjMBRA^?coU?j$ z#Dv`g2MttWtgtZ8ZhTNF6q489k zy-n&EMbmTVE`C&-xqKm`&_?b3rQL2bHA2r}LGe|l3mtHf`dT|<`kWxu@jI@a+TwN4 zdSTv);nPDeZ!AmDdo_JrnrHRdu~LAXP+7RE+0j7L#9JwotpRk~Z@>Tpj#iPn!l>M0 zTaVu3#*L#NKYw6%s^3HB@%^#ak5H~^b9N|6IC06dOG03;%jX72Xvz*hU2Hp|;-g+1 
zQpo>W#Sb};PE3V`Lm0t(Tk>-5@Cl~RrAz-^$MDsQH4rVPF)lwZN=NkQ)z4>&lj3Q! zw4YfIqbKA((vqb0)TNF`PLq6+rd7RFjxBWFx0h)E7iC}X>^SG&TJK0#DQk7R4eDnPvGbeb-4QjCYEIoa zMnz@E)xm=#yD?y#cm@xV%y>9H8%i695W~WLwCo~ZpPdshR!Mq5+{lmrw#CX{jWswL zaVcuvh5RA|go#j5AK~Ol@t2111S#g>e`INkVF)n#GyA;$vwL?Z`iO(;hg%H%RQ2%E z`Gc=(RV^-fY(J%GaC6(wAJRA8y}9Mre32u!lPqDcUyeEq+09iY^vMvAi0$XZyx2@J zg-8!D{t{;_3%m1v4_V)!j%e>HxphV%s%53yy98MEp&`kZJ9yOXE}d*E{gOz{hh!YE z0xW4QV6n)-A|~otnooP*>lWe!c?>RUm9%L%O|l3ErOORcC4PUh)KIytJ|pHyC~Hhy zJf`-|lBw~=X1DJ@_+XiraA~l={zJ$VuoH%Ub&>8@hFF$Kp9kp~@dwQ(+GIiO;CDB4 z!~UUXoEL@*FQX-@-BkFHmP5g#7xdBb8&Z{4tzTTX!uhDH|}v_ zQAF<|7t|keM0p9DVVNWqVj-NAeQR(I}I`je~LVQV#rndl`Yla+d2xl^_Hi$zgLn21PWnwr`9kw=?0UPH!*a=@;x1W%hkW0eS2$^ZtU3HkA$ z`%qzHWxXxE*kWvGI48dNj7)0qWaUK%UO!B?{cUUeQGfk9kC>sM{=M_hj&{G2oAZyc z)Ik4zB%TOOaQQlpw-r8rg9O+_ZejrM=Qq|hWB(=_euGFm2bLpHXWWFSjh#tL5NtU8%lz zimQEHr#l@tIZDm_)4jdT!Bvv)LYXIYJ$wZbTqbf5OG>a{-A5}SFj>sSMz1A*bR6Gv zjy#)g<%3S!zP(zN@C*iZj=8zH-#3%vN+Pp}JQ=L^_i_?|$B?Wcn3d>2)|BSIua2^k zwO(qYZ17G{bUJxk;UxzxZ#Zv#vQ)^WGdQMESh*GJ=3yM_@y|b#^=Be0KO7KHV1K?` zPe_KTxfiRB_Rq<=X4m!V3aQ%EUgJ`Gyt&=WpnXPB)Y-EKH=NtBR=GGv+X%5u0;htl z&)==(>%cF#c}~9@6=20I9?gNh0ahGlu7(A1Sh)D)vC;p)=)w3BaTY;r(x^Nk#ILK< zC{ab>OYZ)Azqi@7x{+qW)|{N|VdbM6xo?s3SMy6o$rjTUXT$^$9rTeYasw7ETu~JzR=$e5!5n@y zeM6QKQ6PATK2_@Ear0Zn;j@X1v_J=%L#UPhBjzx<5pnJw$a$IM$4CmPZADP3|B9VG zyLE0*E4`ncH(2^h8s~@@xnMr#TmP~33&V=5@6d}wMQN&6%wc?if)FPl>_-}+VYQ99 zoG>Z7>ndeJN#GK^tiHZn@?)+|pf|gZ-^v1bi*FmG-TOLfUfX#7@=yth-KOCg)ipKg zRzcReS_3FQgaipWH~i*>(n@jmR`a zwJ`IThCl1Z`)|V%x~DxFkEb7Xn<@?#?#bxdDI7{V?X7`;;bJ;!paa{CMLlq022g^? 
z+(cA*uj)70HD*fINb8~F(|u+^vH4%RROOSt>cy8~ik&8J+_$$&n8*8?wtxK;0Q@&2 z6w&CTQJU&CgGfR#A6A4ljmkssH;0*Gh5yang^JDW7QbcsV3aAD#l?-+^Zw*^e*M~R z^Ct+1ciU5j2p%>fK4r~7CnFy+_UVBX3Gd z5J6OUoGaJ!F9(h^R@yTwR&HXzFz@p-l)cNm1=&;V0Z@4 zzc(6|aQ2|o)>N78LxnCmGqWqfJM)GFD;X$Y@}F# zveQ$$lP~sbOZN|+OhRw7O- zJBd-y;)38X@Sk#$+Jk!rEtOtst#?&YZ0z^Bj~78^;061k_Mn?fx6XU__Zso>@f6=p z$Cdn^2fH^IZX9K*|9=J_F4)Mg*Ulk5LGts1X~)d}8KAx*RZ^tk*SH;7wsh&+;$f~X zE^dm>uZP9$M<+qJ%VD#Wp^vn*IcPpmS47?|AcUAq>i*p%``;%d7#eCI{2}t$Q;2&# z_*{)H1Q%uJel;Wl94f$*yEd}fP0Q)IbJFInmT@x2uH2vX;q1P>2TIb`HXTpAc{2hy z9Bsjg5^WUZg05j|+)^tm7l343OO%O1lXN#zWvJZ$-YEE2oXm%z`wmO?8M$?w|FbbB73pscFSbJcpAR9u=K%hDQ_#K*i zQg*loa>ErnE zS;;I7bXP)sh_s~}w$=kNQyZrAnSb>42@@%fz8`~7-7$8kK4 z$DlJs>(y0Pw=s54sARmLNOpR!c_w&*Q+1wxlQJMqXr`0i&9XMmA54SciLBY$W^CfU z%AzItMr7VeKroo1WTY^YCt(u%oP2iw%n3V}I1gi4m@W?v({2*8<)mTDv8&6vzI{0A z#`1uG%m-8aQ&M*Nefiq!)V4{7!l}Eq1RgHrh>-;;a-2nsW;uV6bLHwDAuwwIVI6r6 zFFt_>ktrbZ#~H@K&WBz}VuIvNR@BPnPYos!LLP^1P|(S?BxzfY9?qSdchf&>3fY$L z99f;?k%4>l%n-w(dUf|tACbCm{Lbi}-OOr#xA{@ssw{kL;*RQ&Un`g?OU2P`URNYk z+hC6DTUFUa0CwmPb041$qz^&SXbxP$FdRp&*rNfp4`01%T>UJ=E(8$D;F7Z0%LU6d zd#d&BU0`!&)WOj5>UOx2GWDj~1Q}In+g6;h#D)#GrtXYUZ4$=p@bQb)xiYz>3w#GL z8s{?iSf!6{0=9?vT7)g5A)D;sd##a}i=qxJsfA&B8HVc4hR{3B7>UXgEPIk-fVW#V z{<{^I)cyUkOw6L9Qf?i=LO_D-9<2<~Tg0|>s6RuoQ)8}flR<+d6h(d`Y!3r0!8bZw`NCvO_d+rlkJHJ$u;_$8Qn>1D1wA-h+{<9|wnqJNR{nN^KY*cM?U=@7) z+SSr9H^PQcXd6GimFFWKx{5?tB94Jli6Va=829s;9U}{Yb|EdJ`2=yzVs&>){WH;1enXhpk9CVtfqPPFM>85L!?=)l4;!;@BxZW!aca7Ol)GiUk|%4?kzi;b|j zTFd-|{yzAaQ{I#H*D?>dH0>OBWbB5Ii(f0=*xC8O&lP=o>8bTT6}rp?wwE|~Q9fj0 zqy(TKL3U!BPQ&U|`}-TD65rB;hp)Obs7DM&Et^~(_h?n4ZM$9X(E^64YLHjKSUiuV zCwC=UuUP<@#dC}06s z7|fxZMry~soT$3s-fe5|esST@KAygx2Gms8j2gaFXUC3VNuxq7C!Dr7Ak+M z&RDXFKZDt~aFTV76z#?<5xrqxgtuw0S+fl1s0DS@pY2#@)oQDLx4wS7eK26>iyJpL zU`Kb+=jy9BH=?vYI&WmF%1A}Nd_T%!FT}wvz=c+0XeHJ$8p zp9@uEL%sS{eEXK3*g6=21sq_T&{&-{TLI04W`^UVgI0Rnh{eg9_8*EH>blqP$ASm@ zk2uvix4d;&)bZmP<3aU~rcVnzToq=JQ+aLA#RU0)_?&VkW!LWe>Qi%i6v5D4{v77Xf=|+r5m7bbXojW^eB`+3d1>%Q^c)U+q0GyrF-c 
zi>})s79K3v`RZJosh*zImW4;{zhu~s)G^Cf^wlv6!&X*Cfct(l9a&mdYIn<-M=pQv z_Nm3!t{qS-yZ9|DC~Z|>u4#8q&muenVhUzLr&#%}OC96a`wH>lCis*A$M{RDyrL%k zqY*sQpk79kW6oBVhu%F%|2be(n(>$N7Cl@VkJ`-uMx&AE-BA{$MqX;44mPaUen_v! z`9baeIla9w%p7uTWu<7|;Gi7D7zh4BFhGWcbjeXBWNr>UhXYTHtwz7nhvUBiHV&yT zH*Gqt&f*q9t>|}cAgnDR!nYWkIiXwqhRz4~ZS}6+kA#5nirJFT&ma^_=qy<68>%hV zn^#p?<+)tnG17I{Wb0BpH$(M@u+k9)hrA!_xmOP-#gSl__G3Gk=rq#OTJOIry62d& z&%JDPYt#aUbzV;0bR!Kf9AZpHVgNxHvJw;Ih60!)hT+oLr$d941d5qsydnxvb_o?41 zN=yC1!%emAGU#lM26XLGm4$}RN7u^bqHE(B+Fh^Ybc(Eg>p87sFeMIPx-APY&s!Hw z&_?Tn?yzcBAt{cVT6E+d4^&HQ*G^s6IgeGdgF@ardhYF2SZF!?(m@;dti}ZfXJSLN z1OK9Ut)b+hDEKeH>>+L8f(sQ5IgqmGG+|vs1rtj2C%dm#ysV8DjicLwbMzXfo46wa zJZMBddh$f>8_qx*$a13IknvvZz_dckPHO_^cQ{h_KB6d(S+l-79a}kV+TLy;jkM$0 zJG5G4Tw48d#*d|Gy~dW7b;U@+E)@UR#p7F8Ss%(0T^6N4uL{?vR->F0CZ25^Umfv} zy}>eP>qFXhj%uUseVAMD(e1-S9i79fc%48MqvnG=42&w%ch1A=F?!a`jh zQJsU5=;g`M*RL1R!(b`tK$p7Z#^ir=RhNEz90B|pfCQ5l7+^J^mt*R{IhiNjhdGtmnIwW=!90?A9i50d zI~A}{#Lq;9@4-GZtxw!~qqKUI|8hCWaWj7_r@~dK>6L-%f2rgx1dO~ zY?4;#Q_HXu09H;kdIM%mDvjzocyP!5{Zks9>o&G~$dBrxC5!JYkBVXmE+aWAMmdWmZ^Bi37%s!*8 zv@zgWjJ6%iBW%~M&DwUf)g=ojJ`23!rU?+vD3U^+scPZD#PF^c)koCbU4z!Jy+!`mGeTUr*s-7PhGcUpiky`+){Z^xtBeX+$egF3S<$#I;8!e4$rA--m zrkWexECd~}k5gd;6}IH2n;`~Yus<_^2ax0~j@OuLXg>F-a`1ob`c0$fenF z;HwUJNp^M8yMq0OL|3wFH9J`q{i$vp9&ilg??^4?i-- zZ@`ilj*G5Ew||)4siD&L4Sl9{+>0@ajCD{VK*_(HlM~QlaLrlXI@L{ziE)KIKN>_T2K$QCAg2i zedd`70?hOE#d-tSaT*LoFxCA>1LkG?`K^`w{i-+3U+K%t^$5$~d{ye4}ggm_fPxwW7FO%WwjkMO1utP(EX7-82 zQ(I1CaZKf|6K^9FELe~T$&WW9Lm~v|qp@lJ^Q&s41gr5HM$DY~wo4~B>y0~&E|4Sx z#u4Jv;hO0pMmCwuxoVg(i|m$2%D55iCGL=w6W|stXF&&kPrk)&(d3nK^B5O9*5#oC19leH`y8bl<*weioUE zo%yD~pU^;aC}*$9C=1daNdX20izZ>p%+O2a$zGQp&N?Hbm4Ag{ zh2-%{^DCGSDx(TEuSj{lzkaaehjr?Hvj>ws_JRlnkP(PuZuy0(=C}#6CEmf4d^i4MIZ)pD3Lat9kPnVWCKSa5Sx`aL#fqPOBW~Yz!nR>0h{(dFOJnJAMchU$rf}ff0(0}V z1ukAZK8_Lw$cK&wY;|pMJu=g;p5Z)=yT^XkoDI^pPznv=s^ zjhHyYMK3K`J{7(yW{an7etmpm1g<3VR=IjmiwiAUW_NICo4FhvCs-%^#W_!U>f+$rNV))yS}H5MsTx&%`4WrUHVgP`($}cH&3pW_aa2N~Lg~#@2Zq)w 
z2;INhCw1Y_8%NqK{?vZVVTF{k!!TG3nVMYqoY9n+%c7cHSKQ1`JgBD?oWv^5_Qc!? zzXOZ~?3cE0+qP|c5`sHhYz(iq(W(j(b?qD=e&=CJ&mSY_#GWoI=1_(-VvQ zc?_KL9a>$vZsYlFenCKCwzYbVf5V|oYOWZMG=i&rp(49ow|OpoG=d9;a>lmj*F7u> z-}1GxGD9TC^CEJ(_HT3PkH)S(E%iQb^Y)_Nmqm{TKtVDogsuYt^4FI;BN4j)KCTjm zCj1WUY{cSvGv@55W2H6`aX=v!J2Gm=35Egj+Jz>QCr@U>tcl(8{l||=_HMlcQ>H<` zl}G?SuUM4)Yl{=4BQ!vEG;lycN-D>XR^YiE4 z5(<w}c+h>J0p^(G zx(l?`ZC4Yk&UPwPWpQ}_Eq;D)Ml?zRN*bplKfxW27<7;4>fH8G|Ty&J6oWr_4jm5Bhmk zStd*g*7&UZ`ej4^#%{kksQ=XgG6(`?4u-)KjC8lC>bo3okfDdy>|XhmYyLG|6nyrd zNoPlYc<}MntMzyG%+~K}4iN$m^y$=TR1rqVbX(PXwC^IVESy zmxYEFj4WLe8LR45c!cQf15_@7rgh>(`Rd^3I#M3XC4kqw2XvFG{nn&EAMgo*730HC zyYBT?x9nhZ=g@;uiM~^B>>RryP`5#&%ny&;W;Cj(7}hL#>Q&?Kw$tpDF$?CGQie^4 zip^A0Rs9V5St-&(CtZ>P3(fY}*oA)b+3Mrv#l8$~G;jx)cE7m~FLQTX|M26F!9nLH zGv0pHSm|GfE^c!|;+$(Adtir)m4oqsTcG)p<;We+*jYF+*_%d>Qf0gl6$PI4x{>iK zyiHNky`m%W&04hOO-91#m#($1BNtnb@VW}k9AP(hym`}Vc?MFEh~@#iKM^gfo?o-* zO)9Vo#o5TL1t#9Aj(h9(=&{pe)U@qpf+eCI5f9Og%{$nq%zug{+6z82w@Z%p3U9c=P7l!~A#7^Y(YQQ{fjl>@{tRl!rG6%-RIJQ`tZ1q@hMX5DDAR^)885s|Sx9 z0>_qP-QnK%=xCF@FDySkKCo#ZTF<~qmd3AJf!a~#+fuO&f4VatO*62T=pA{BX9>1}65Jdls zr%!+TWiO`^wD`({O3rRPE|J z`L5yePd$~=o3_!^w(Pm!p1sMwgoOQf0zdx#X|VKz?%CI$dfmT?Gjux;tFk*<=L+Im z{HkivXHK^@@2hcnd-XbSm$84H<5L>>`gY#D_`7la27yTl_cDr?t!VRPb8zdBR&gpY zB;UP`=#hD7MBm(-7oCsnJH<3QJ6kyXbl?ob#BVP9Y`<-@6M>9}IBlHZKIe+ffbz8rSUFZgXyf;aTIpS5TfPke0TrYoM_ebJ1zxf zOHsd1&7Z;xf;ur=JNV-E8xAHbolFw*%^C%3T)COG(dPgpJzHDc5DRM@z#zm$id})P z$e@FdQ*h1!I?%tD|0jJTL_s71-)<(K>lEkRZa5*E@Z=dZRls%;fp4-5)!}!NFDoK8 zjvOkawj+-QysY#z{a~6fF{LitfP*ZGZf18u9aI6qr6d>F0={(E_&8biB_#x13;qEm zh}yg3fO+5Fj#|_TG6JghcdBKn4rgzHb=_3kNHV&I1QN zUg>B9Xy&B*>D==n&ACpbQOLJ!*KUvFrWe{>o7fl(nPRESb8Y3^xwBSWZ{5bdCaY|X zOYCv?s9yV=<{jLg4lJJe^=)VG*?0gc4cg;qvUb+216$4djF~o`#gNw!UT?|D-{)3N ziL=iro?B@7>*?tPb^nVO7k2yj(duK#=i(P1dOmw{{OK{Dpo1mHO3p2^h^F z^p?9{S{HmfqX^me`tQ(b_6-nGP_9eJ#=t$dEJx*I;6%|UfgeJBKtCq=FyytJ_Fdm_ z>Y_y_7k9l8-Q?%j$%m$-uSs@3a-)?Fy$Gwb0#6{)#;hky*tLkkkB~-Bl5K_cmFkKchMJF`FS?gyysk98;*c;DROy1+@ 
z&^#+A^VjzSuez9|jv3M`DqaaBgn?GQP2oSG5VVUNKVgDO`}Ph%J3@(LdiQOg%|6e5Z`!u4|NTSoyEtm@CoL=l5D(5%NpAA| zns(i^BO`um*xK;T@PR0wa29}!K1oa>D8f8hZbTJDpkj2!ay{Tlh_%JnxcWWkoa2y@ z0+N=|YgWf2_^i-f3vz}@i0|*}$ZN^eQCqkS_woCx(rbTjqgxgtoGVS|=JuC6Ulav+=qZ#vG#fK%A7mx`mB#cTp zTRtCF&p9(e=#YcTFD!Z9Bsa@)T&l8G7pr#I-1x6mNShqxPyu6FJ}n%Ka_8|hb2I1< z=?$WNYkx;dt6fr(iONmdrt-XgD^)D5H=lc-F#4)l<XHjl*u;m%M zJ%cjEAH&V@r&NYFx&$JE9-wQ-$@;p=EejI= z+29v@Pf_RgEyvSA_Vy~PYiW34S5KJhqJ|pboe>ca32DE7Zpoby(41Y^{c5fE#ZAff zg~uLD%QFy!9NysKkn2bQ(OGVzZ)l~amQA~4?c{Vj^GSCri5`Nb#c@+?_P}n$hAF^=7us+qP5=a^=Xd0RhVqRRe#T1&vA4Ba1}%_*c&Jg7jozOoi5(+ za5~aFF)r@)WUtB<&2lr=aqBT+>A2d))pgp%!74vbO-JV3!*d)ruF>7$KotFBehu#s#82Thm$r2nWX;9On81Sh zU;p$+=N)TMd~XeK^KoGcyjgUXyGw8Q~L3 zHY@i8`fCFr-nP1_3}S<))fLdHAGUsVqAIWUPVBJ4?Ap5SV_)2k@ka)ha5SJxm(f^U zW{lI*O+^Ra|CPo(tn45R2sBvx=i87m-G$-u=l8Jax8L8ycrUwCy{%Ily$gx|SR9+6 z<;*zge$(b^Ess3AXf?2Wd*_$l-Ica4d#iaosa^l|VciEs8^;X=90p5SRZ|>b)vLC+ zQONSaiw-!sVsxP9l<5A(ag_fMkBtc{pM2yh?TyU_CObPZ)27Az(!FPv!N!*p6!T2Y zOnsUsTKk_~*!iE{t_Qy*Ha|L{wN8AUT+<~;CFj3ONU%#yOKbh4os)IxESpem_;~BU zj_)P@Jhd>!>BR3Ja~>kG%^kV7=lV6LClqY;eD6}c$uU~D(5GBUamA;e&VD{$l-Jp= z=%zbvVjBY!?JlE3;iv&r%q&VxO1hBgru?RASKq018)OYAe?Rg5!H5vARoZsZ&x?z- z=D1b9I^TPo`RN4A@u@o7%CwnigGZ&`TU34+aCH25?e-V5lN@^QGHbJ094r(yt8OfM z)fkf(NpO)G_1P>LzTki<-0OojBXMuDvs}N>^5ci~g~ul}+ubj4!kt}xuTEnI^lfiK zqvGv(KiV0;7Mhd1BmnTBS-}ARo-#2IweXAmVyw4oR=!Z5;F3Trw(dtswCuEQQ z55tBH-;GQCY)aSG{e5TPmQeuZcz4&$+4kV-&1RQ=*iWjbnH7wEgX0{>t;+k-Mg|@W za!B*w&}Dx%Sg>G$Qp1Pm+o&0+wF>+A_?lwAaeniL^(Lh%V1Ke>l9{*a$D^dw3vrIF zK5F2yBR%*2==3J)e1q8^_FS7&w6*!3v0Mq|1sjH(HgBkD_V`J;UBA!!^6d>{yl~GwMC7__3vQvMS0(Y(GJsM4^P<~n5ML; z+gBd5@%6|PSvm2Yyu4bMJ^i`k)PzmWdeLJ>j>IO+YoM?tR{!`iG~PeGR_$XL8Cg4C zp2kIUt*b#^-}^`XDoU&#S;t&1M1b>%@g10wf?4C}ep`*d$okwSFC;|jaiCS1Olx3+3+ z^<`_##_Dxai#(pTcN?(qYw4T5t#!M1%~zPzX!^5Xzcp@m(6z6LRd~3js^+X#`)@N+ z4NhwZ?%mq2UqeYqgQoiWi+a?IT{tWzd!1_g@o}~LUESWVK8-NW*MR?)Lj`$mS-WK4 zA9pzJxx%2D_F0EF5|2wn$Bg{?KmiS$4oKwgH`qQ*1 
z)ys59J^ecIJ|BF0-c~xiapA)E3MkjeznkWx_jfX^H?HXg=d6CUsox(hZ}_Ku^xC)f=aliCwg%mpEC@Fp zGvz*-Wil&=I6vtMF0jwn^-6Z>h`~Q{?U%cF<~0LipsynvMO@vf>9-##{RZtg8Lt*KUVwt2bX-LN5~T40!H4?ctSQg*#3-B%oN+0rpFeq^T3zO|8& zE6u3mCHRZdjy7a8W$?A!T+Qy?#bb;`Zsh!_{eAI)fk^}sO-B$T1RIUMn-yD_+)G_D zT{o?IwaKzY{TV5wozZMn(?~sVQ}ptNn+p?0b_QZw@Tr1Hu88)#+xcyhC%xrNE?vdczpnKzfEr=`|J&2HD{D*F99wXFkb2F#(r>FY!kgP4t~HN$Ou)A3CZKpK&KnTp zdm=Q3!d-Q1P`T;tpZ8`j${cAsZri?no$i%?c-^_nXhLza47r||nw&g!%H!DuFENu6 z&hwYETRRLdzN^=JN3Ufu6V4!VyEYI-H>ySFRMtoLvGm_{F>F z*AGp!b!dGpu=8o=f%Zu>E{t?u%*um8yn_D#R5%GlyEUs z;-R47lQ|GF9DWALe;Q)}_&U%1A}lb5{v~@>JQJ%px*}meiiaP_ZKs*-W)zyHU7FKz zfv@Ry_neivVI?p z;@bnBo_#ic{hcSRTRuJAa6*>@F#&$_8|PFiWCYF)Sm@I9y2|-yMfnN4Jsxea&1`02 z;%lbdynX+a>h#^UI&~VgyfrCe_s`?rPf`bVDGW{2jJC?z?VDcju9ZoEX7u%qm)l(4 zyL;cnvFkl*N^W`>WL1`~)X{9ez(-TL*z|{+_t~)uuGW^egEZew`1!o(+r05bJL^QW zyjAJEU*&eA;?FMGeMUS~3fb{XF{92mg^SL$=5I#K5=yer@b3X+3 zl`B^kefyo3rrcqV?c5K|)q2)^3Yq@J6AXnRGTjSOQou<#baT_Bx0p)l%YduOZ*Ige{AYhGEzp{qQQy|#Lf+SpIfR8b z<)mOJ+DctrH?rTgqR7R|HsbIlBX@}obxw!{!jpJMF@xKlxhryKaOW=j8jdJ>`|Q}y zV-5akdFy`r_5P!q?%c9z;o~Jen)Peh@=W`i$Ijg|cWbNlE^^DJai8iR>2e@$@`85e zX6rLe>bE?0Ve7)$I99xQ!a z*TV05A8xJeXQ@^9q9J%MH?%Hj77f6 zyNZ6LbyD14e$2PoPo);A{)J){Cm$Y#6*h?Y<%tFX{^TaG$>K}pDQE7DipyKItKY|< z6GcHI4@J8hs!KK$Py1@sbiYD|e#oqpI7R!+s?=;_v_2rwt1NELm-j37@_-YLwis@m zy`FQk1jRD-4Q`E;-i0{l!=I_T+*zlZ7>ag9L+{ev7ndS6Eq1Y{Trn24< z$Hs3?54>IHMw_TEt!-@;?Mx2T*`)gDU6FHAb=}#T2SWZdev{NcBk)w_n2%=%8Y=ecPUB0Hb;xZ(U$khAX&mSjCKKi~jU@|?*59t( z1e+R+&_eAe-1=3(1AvMbvyeqe)HA$H@%S!%^!Kw zYYj|#kTK{7v#_BE(Tnc$wR=y)9*tD@mb?xGorB#U)V7}BxX?kFkE;H>l(FR3t7~<$ zgRft`d5pxdv{A=))!B1=)rjM2R7Q)-wHZHCuQ*-0R4Y>G zlm!dsHZ21a#~%-N#?_8n(4N32e=w!t4*l!S7dpS_X%FMscXfvi8|E-!#E$r% z=TX9tPQrwEbb`~l8a)f_4)K*HfNR}|0d^1Mou7G}b9QuIMCK8fL?cp&)RrN9@!+5E z)Jtucqq$+ojR{fYx|F6Sdsjp&p6_va{_Q7kuQkiHDmmP8b4vdthm8^}g0=uKS}jf3?$wgj7%gC|x5z&K*5{dN-<7@rXJa zPBhfVg`~5T%M#tRb?bD56uckkC~csgjJh9C=|rS5 zteMI4Z*8;btf$8-wOIP4-lV&6YaQb;h2Mon0t*~$gN=BpBb@Ken>!a_fv`ZT%r=BP zVvZK$v=aTJM)f^$;5(ui$G-%)h+;6s5o 
zWN-uB;`o#l=Cb-72Ax~@$25A#l>0Vc+>l#o+NmJn?e#v!)R<|IcCTK|2c54RH61Te zXzNSUcGuTEJy!8*C9nktz}3gy|0ShyXk3Z02%I+kQ_TL$VTd$=z%4LWqnGaS-lE42 zlm#sIy%Umjj=2nHRLS$~)~cST9Rs-H;$ktVN;+FLW~I&5gwXNh#)%gy;w`6Slvr~|CpcWj?EdAdO}cf@-zzJcTAF?*`lq160i!!{^hpyq&!gL?n^y&1pu zhv!t4%o@_rE>x)Z7@ZSQU(;^YI(#2n|4j4p_BN)uL?M=U^XTVH=V!cy7)GJ|TeHvu zRbdBk#I!crQR%qo>}JLW0F+lzFcqnXwixcdKGt+krxwG#2IatVgreR3pD@d-ql$j( zGiqxwxJChgr^hJ|Jf?rm8@=LqWJ*=ZDB^;~rg=A?b!Gy26kE=p|B2dW(tx87=P^E| z7>4tO8Xj#+Qu6>*!E^yF#@6rEWwZ-`cbzj67^X;q5FiB{i^DEK^^dU1CY|VBb@#=K z7v@8U`c>4*s9IltOY2EY#m{vO)#)<&7OZrVlpbZdh)wBzm!z|24>1byG+Wv>s?h=6 ziB@sW#pUI``QPB~Uw@N&qt(G*sV~f7X39v5Wxdw+JdTu;(h}N+g?LTEUjGJoANKZ* zG~pU!=0|G9b?=$R#@j??ER@Ds(cpFr+VS$8d_OZRM6cz4gxBOE68?CAivCYD+G z;&i*H8FiT#NbiO;se6PXIH}M)s0dy7>pTJ8mk?cn3o!fzlExxS6~+T*bZw(lB)LvV zTmSMG1xgiJj69=OEMu9~f=mZO07P_v8lcK=Vi+D1eO}*kZxEv!0pW1>?k65?%IIfo z*lYxWXFYh(rct8+P9jNvc<>$^c4+*aVuV_EHpoA7mxbqAi;?FkQh$&9)+071K zF_Q)!MH%(~_~-SJjblZ;^gsU`Qhwwf!SVe+|L8^LHcw=({6GIVq-7mA+UWC9cm%v} z7+?JUCXE}XCPt9~HxMs6^eXExBWyKPg-3?)C->GZu_|X^*mlD(_epzcB50>L%CST5 z#MQ{wwEv>-YdNi5VCE;ovy@#28d~ zq|Zfq`~CUpy~{ zG>p<2?&{i+SE$Ko1Nwk74SQJmNtpB;>X-n@OW;qZ-f%O7;cByV>FI)oslPj(xKjoL zPe)(B8IuUs{)i7dLp4cTViQC4spty?kEuj3nDgp+M)+34y_hq6O>0k3MxXKc9^ z?n~1rWI;9;2Cjrp&rUYwlH>e9P|ewEoYWU+KysY8LZOjS8P2c#oj9gD6C;1ec$w?c zri7>J?+p!=oZ{5Uvvcp=yJxcF=cB1`bf81SVojKEMtG1$12(>#n^d{!*LuC7{7?xx z@F`ordaV1R#$i5%2#sBvGum+rGY) z6WlHUOaq&BiBAV2Q{#T&<1zf|`?WBb(t?3EszFyXWufYo-do956y9 z988psv(HdkO9+UhJ-8%#awh(QVadY8id;ljwLhQN;;rz4(_aD#A-ZB_%Z_{leTmH7 z#%m=+1YV{>#(oLPLL6ro*kj=u`J>s{yVS?yu3-e8@>2djq&8{^H&0J1REnr-|3)a> zzrTaEDr^8=E5FG+VT>_Qz+a{@ql53o0iB32Lx$5*T|F0J4TNU>_a{b{*2&Gwn@Y`! 
zG`R`OMm{dTzg0Jf%a0yKf%0$yXEV8#73pA8i92Z%~RE5plq29preJ>r52dJVeQzXNY*PU?!>L5MDL5UBS|v zw_`*h#Q7wyknAO#WZIp`6k`m2B!^1ygky)8I|{8WS>wxSd?Qrl?Nc|Rc*D04F-I36 zi9)FXdM2?ikBhL5d(BpG5KIuJ&YypaFI0n1HKxz>Hrz(HV9O6DV@+Js5R$#5Q)A-F z2P1TCh{2_rseu}q+`f=z9!)!N7)guJXJ*WJ^YigkF>8_kq2=ahQGZ;AP$cH@?7FB* z;c3)Vl!zH1>wxc+o1c#}|1+br*GOWVLDdS~9nf1$d!-1<&u@W~4cG3&i#{n8?P1#W zXT)WUTQaOXBl5#3^58AjMb$d^0QFQ?@Ag8QrYY}sb|sf~+X0KU?c)t-R;aN-x zoY$L9^uFdsS8mFgr3HlIf0h55UI8{ZqmQmH_nhe;<);WvL6m27vk#my0u=f zUXJ#TjyvF~b`Eaa+dA+1^&SN!bV@B^5og*H^}Vx#lvQhEb)`a9XB>NbIHZm6(-V%V zYG`z?E`L6SD2fyOM#*He6Ri-FF74N^U)#ZXjFi}C`S9pu3$c;ue39~$jPiBiJ9p|{ ztiC_75bkm;PI$?GeoX7QmY8vS8usl7w|gaw=`NqmyCk@OEOWz=q1txD@r1)P(Vq7x z*~&TbULoCYETzzJ+VC(wPoE{F4rLqKxNfJciDgmEg5FgKJKo^+t5>!h9fiDeeWp^5 z_QQvVVH)4Py6le41XNP7_ht|OVgmHhlbOklSK>Dx!WZJJ>MZ+IG+wb;vxubw_ngL6 zCV?eg$nDsbCpu!U>39uNcT_udGU%$KGaPzOq=2M)1_mST`3bl=Xpe|8w#vJ4BODfb z9CJv=9}t7P`d@LnKV{#IdO~kbV7JKk&_3qUql20En0Jp(#h=D-<(iwJE$%`gjDNdL zhN#+IS9d#>DSAwSKa#4{tCO7^VwrXBJEu8pC6;vRL1e^Kx?N zyC*3pu9^L!3<)Kd?1a8rn>ISjK0F)+>pCPB(2K2Yr$6EJc9xWmopZ(6oetmQKX>d$ zz+)Mglk@WBXHT!Jn3}+y121j8Jgs1LqhT}!(eg0jVqUGt+%2PkpAGp`~%BBE<>ry;K1+!t}ElU2;NG6AyXCP}(B zpkv#%dSy$`afkuBhEV}^{rEh6iacK2Ma8?tjvMvcnX7p{7B`#IBos8Cy9U5lWYP$8 z#I1qSiJs*Rv=BNi{)@GgP88tcZk*k?j9o2W>oAq^USQg9z%?lb7HnDJ;)9yiVf2Oz z`pVZ4|Ddru-(&T+Md({3$B$Z8{KGlOAt(kKbufVfNVQRrFdAp`p2#Bi1FLa^bX-fn zz*JfXfex9jall@57-;=$aJR)cg*zR(?{;}nA#eici0Zjjp5?Ci?Nb56(;g?iN%Cow<_ zFgs39Sln(n@xgQ)jZVn>%iAjy7PO8;XRW75yv?VROoF0RBUTMo;OR%x2Szp0#)AhJ zW-)-E4KI6iN(bW+P&ylrI_+$X{hrr-;!((t%-a!UlFW(3nwLGiSr!Sn)Uk?X^;&@NU*Fj1P(c`aWZ3{Po%FU6oM7-GeL6m0+mfJdm4^RQND=!(bt4|DDLRWP& zTNukWUel0ohI-q;xDUNllaYvQlP}uw3c&~8s*N$AdJyuKFjW2H<9^Nxc? 
zLfsf#HXr`Sipc?#K-lhy6v0;9~K_UTobZC%Ag3c8Fn*&F8Q_DR9p-5Kk zx&}=_h%(WVqDv7i9os>ou_R>W_&!+tfK`Y%5#(@D2kCvg=$PkBKhmU*7R3wU}1{`y0KD5a9}A^{Dyen_M--jDijGp z-%=4*RB}bDr%dKC@K^pb1+m=PnTH52K%te#K`EJ#@bzV*>qg|U6=+LhSwY#x&A>TI zmOW%ePo0-Uy|bhGpK;pBnG@E*c;IqxXRba#6eXCjTp(EhW)7Bm7JOlh!CS(5Mu(A1 z!34dchd^y?agh7$fh{-&8UIlBanX$F9$_@4qoKy_l%{h`du?cWp9W065P1$1sX8Dj zF$7(oUp^0h1S0P5?@y#l8a7vUU(M+oc(*}0Zgb{r0hL5G!yS5eHk`V!g9p2?oV#ia z4f9&esNiPJ3V?BRo^G&!#SLNY;o2u?#D#QAKmVW2I8shbdK&N#8v@=oL>+oJ&P%HQ zZr3|>=)izox8Ut7fdz40Mb!m9BxF=^_{-k**S{#JMSBRP{D~;oqg>NU>V^u(TznoZ zdyCeBP$hmi>*$zwcc$)#R|gtY7CO>bjOh<1&?0==@dUVx?Wq|1ZD}gMK@OXIP}G_qCd%moENE7u07->g}z1NA=GnCud}5 zAF|AEVHX)ET0E0~!gK5N7o9l)_f zvDO(kBo=5f=PjY3(qHmqge}dqp}4GUR`h9??h)Kki$6Z@Yi<&j^1RbW;DqO14I5vW^%`Yng_CQg3v}`$lhVkQ4YMq zm#`S8Puyd0xX1^78Atj7CyP{huc!Yq9@!Ci30a+Zm~qBPzixF#fy1xndF>xL_c&jI zIt|%NDoqu2RGT!j-n~ybk3G~BoeY%=bR5J)Si_r%DZd02QZBDunR{&fsg1Omqhf|N zV0CVbkB_hL>ss|e>#6EYU=`mQB(K}G1NnNPoN5{y!nvGx61_q3B_mmyW8`58$sNyy zh+d5X5txp8KyYq2yE|FqRp23YGb;Y@gMg|ViBkc1L-yQ#HZi7&!}E4xTn@s6nlPB= z?L}GHHU88h;L2wu?;f1UXBM24oIDjmqm159GdwvO(7f@WcktF0F570Lps-fK&T{A& z(W=tcfpKBb?*Xr$$~Qy3f&MMEFt4+eWN@M-t3Z7Q$0bc|0H;mlCEeboP{)woy^G=qWvuWb zcz7l3f5}GVpRgq+Iy#N`f3S;^$f|@f4Do_{`!ak3WKOW0I9>JZc-<`UFbaiaS}R;1 zF-W_y)5xFE3N|?oD650st1as=oy3b5FV>Ps#jgaS5tXE2US6KJR_Hh5kz|6b5hAk$%Ek(0S+Jy=o6})RuFIy)n~l#dqS+GCT=fr#H z`H;V|p^NVQ_^>FgcieS~7#aco0D2vv>>vk1W(*VgK+^K2SxfRVG7jXvcZzXRj|fA$ zO6$Po;9)tewCpc@i0=^Rg4lYJG3qwykdv+ei!9`Rj&5KR{#@h$>Z&DPlUFwPKWU|y zUOQ}1wws}Vn4o9-_bYVjv`?iS?4O(*TG2u6FG7s4 zADb2UpBz=!pm7I1^3(jZ2#X;}V<4-EbBEXdiaZ@ciriU144_XuU|@ug9pF~G{1!lp zt=eh!@85<$ed^Sbzz+WWbFF>|za7lPn(uh(7Da>Nw&Jhvo#LEFA*g5lZI;F}`K@yk zto8Ewy%L7W!EiCL8%f1Scp3*YyVcK>3>!Jvo#tf4nW;jA;?&BvwFFz{ zh!vJ7b&0^=iTZa*W+WmM`vj5)f@vsyDPL1z}+lMmc z*=Y1}4#12o5KM1qm)tAY@Hi>PM{F%-5|_PeFg+0qN0#4!jU0q|&EJpC0B4XsTL5Pe z%yhfQs+-na7&7gff=> z%Sqts62bE!0AOvRaNtrkpP}aachLMTf6Xh zRAB*eE$pE1iR#(TWT7ToZ5r5-IBs))?WG}>^k5zidsom5hiH=!s)A8ei~Z&oZE{JB z5*0X`w9cs8`8Dnr~=W&Cz+7mXeLsjo}Y3t`M((*4!iNL#!bG 
zWo^2zI>Gdc<28mIbOW)E2=>j)WU=AdM*pr(GbA`TUxcZmdHkIUK4BrlD3y!1J@+cH zl0jM2n|)%Ig~n*&L~(7vUT{XY5xXq8SSbHwh6|&`5S*VKg=e)H_3C~wNPzTH42!vo zJ^HL;=UxRuz)VJk`uPI^6}116Qq|<{zR@dc98(MXvpLi|R9g2Bzx~G;BZx5ECI1>qB5O?}fNWP@=I4jJR;jqb-k*Y~3Hqiv^UxHImbNZLXEKMU>%E>KZVl9=%Ve#X?C{Vs{JyPy#@u-vtT zD?w=w7N5)k9CcEYeV^amACgLGl0qo`uWC`{O*zgiQ6faMt`b*Sh`S{ny*B@DjQ5pq&;O5P9HJqwCZ%gAh*ydPB-k@hfJ#%`I&IXHgn# z0uspmjq1k1N|CnaLZzZ4Pq6RwD_5g!iQzDXfM7JMP`hN=xRm*m#nH}V^IP`rcLo6* zqLdFTn5#VX*FP@DujMGl{N=Y+#Dgh5ANxur%?gx}Ld9Sj-&~%=;jxKod=%*{*qC*- z20p@T4U@;d9^F5}qmcuG8jXkDd**FT?&(oe$9E%%jM|$E)PdKbsZ@=r$8GXx|6r3$ zOz$}d$8FvE-a0(HgG{aA&xWGPWkpd!iqiz%El@;* zAzE;FO$EQp4lKne-e6O( z_Xzy(I&s#It{sF^P0HZeAMDPJv$Iq`+E5q|{7qt5Yi{2Q;%F4f02jb!-842CO)mS8 zGjhX`QI~ouhTa3H59T1_#9vd|J_Fo6G$O)^!+XoChcHT7wr;&)Qm=Rb5vyUJ>c{Ql zC|>mDRtHQ9#%1{&cTNx%ibin5Aw$x&Hhc3%U@=N;GnH`UuJj3Otq)3{&R-gZ@^;2n zXC}1|Y}Db;q~(?y0+zst;aDWerr@7Wzx|e4N-qEYq`!Y4-1h(d(+$Tp_}?Grq}8%v;@{u|xri-@E(zrCg0wb;^GS+xf~@a@2{y%XL9ZQeW;C)ACX`u_cHu5B~i^!?-> zV>`F~`!}&svthIsb5*(V<_O3u%$;1{4Y)VEFZb&I{@wUEr+u7b*Egq5EC%;b3^eg5D~_VeXBP*Cm`@_M3WU`(Aag*8Z>{L z@8Xr1OS_nOPp#ldgA)y7qT24{YgG{HUciwq*AvCEQ{ufJznLz0lH4PgBluD!SF6OF z&^y#IS^TPTr&XWpPz{Ldsuc2kGzAH@m72qPTSpl_;z@`0{N^gG!$q6H+(MjPpkuLm zUr}TUh9nRuldNR1Stiy4_>Q+QPgs&f4&Aosrt2vw1I#*;Jg=)cI5>#cE9Kwq@_kt0 zQ#tz4RA+n$1M;+&VAo4bi0SwS70JhMPc`Uxs-`uu5lCCA3c z%ixX5)B;i&w@V}P40T#x~@$~vvrZ#z$axzX}wV_Sau6D zg8T~jckuZT3Q;f?p}C^3rwR#xHAYe$*On3EeFg2z6+AG>wuOC8!*r99j9^NGR9pyR z=1iY9%@Cro&^m$|3{7f@8#uLJ9g`)cN`m4u%GszkTytfa6yuC3*eP_W!n@h1Ru(*T z<+x_H%&*{A=$M)9VPFVsAIyv&X zpi&g`FL6O^N@W9Lcb)b^Fk+Gr+<8?FI}QD)i-dZ%qu1m-YUc0!R2Lo0>9&fWKTtY9v11OdUL+6vJd1mA+;*ORxBM zzR!9wq>`iX7Md3$hQ4168@rBXM^KZ}^t&%uaN@b#@BBSM+xg z+^0>mq}pdm0w3b*HjMwky{!;#Fr{PKq@A-hm0bK6zrU|fbj(fqaPhEIFpdgpG31xB zFhLofiv!l8rAtBQw<8mJUlk#t(9kCY_U1T5zA3GjXfj!cP0_O`%f{sHSo+~%h_s^Y z6b5yMMn=->{+N`(M1xwj4v~(>0=dozrGXULIqaI4{sIPV?AULpo)&Xiem$muwv3_` z;A2WI{14l8pjZ-+<}qW2jr2D#cb;j^BgjiOF0dT0nFJECSNQVg8&y+}QtloF(J8Ty 
z>D?qrF8)p(jHv?TagU2~u$Czx_2a$th4t69dCl#?h+j17d_@H?vhIL9KRf`gvhaT0 z=+NyTYs+C*AG-zH&vH;=OmLuJCfr@QjkvJhrB7t$&#Al-gp#EIuMR@Ew1U9kLa4oM zp}8{nJC9W4)NmoLGIIs~QBX!|ffAUTYdQot8nGz0gB#9)Ty_k8#ES|})FRwnvCMo( z?iQD}_-QcX2UaKSLybwe2LxH6!^6%~ZrAQQB|9(f72$ucDAUl2%cVpj@GW^f>7D#u zcy1>Nn_e=LZw;V+kimzy#bSC_p!6XE^}@bl3X;#*;!)}%$*6k?XVHUvC3B{Xv5U{Z2BDy7Bi~;+ur*-ks>XRl#Az2$)Dgf2tVZ z41)n9X0u}W3hwgZ@tLLy447G}@_1>?`g5zmTc1)pnaNT7tpUHwt9mJ4=ngUim0`o; z-9U#-nhSmTfcmifUT}`SVA%h_=A42grL!?{@XKXkG8S&8==Epvq)Cc;hMWeVXOjCN z#y3#4-QRXrzHn&n({m9_k1Fm@T#&S)+*JS=_Mn2fEOJmh2G?} zei86O2d)S4XaRk}xWP?+GYmD~t^98B%9LR*y811Yk6bMWU@8zn>A&KcR{)PEgz2?K z3#&l$nF|_bN!VAHgTpqyNdKqtz1hoJHaP#{;X38Ed~rayqqY5Eh(fMCGUU&A)ury+ z6ZT#R+JuUF z)b1+GGNw8zDk@)|P%C~(_lFU}g(YLLd@3Co5&>G5u=MN28lGf4B!``#JbZViao(@0 zDWW;)rg8J4RuEraixz$_`jY>7l6Q&aiu$1CJ!N=7f+5_QI)!zo4o&KcA;9Wb2Z|LO za2Lnu`T;aj(#a(O>lAU5A>^P77cQ*CRZGsNf9@V19p4-XE{owF1=>NE#OfTi3#}}_ zM85FkP{~DGE&c*;kkE-c{xIT|7R|)AK6CIVp*Fa^fe$D|NoK z%8M0{Lp5-8NTbUW7hJTW$lwG2J0;CuTc_UV&!1&NdidVEVOHIrPg{RwwmBpvF5NqC z9wLF_A!BBt%HL$to`jr{Wt#Z7z%!O!(;7EpB0l6vNALU){+19WfK&GI`L{H>Y`uvz892Var z0;PE&g$i0IWC?_ddr4~H)yv@CAY@XNaPU9_|2+Sc<7~i@?&p|l2>h`C?t_(GV>Mgr zwM|x%dXnM^@*|%zy$&7rn~a!f^}yfgeR+DG`}sI z9!M7ySH`{fLx{JL+K*Tv;vptI=8+w0&Uf~{k;j9a&yDY!Qdo=`7knMKA{)^(#JxHF&D)r|EY!1)?GAcQ6Mwq?)j zF9q()ai+5IH*A+~!RPoeW3Zo+x)0SEr4N`Pt0&-~bl55@3zm2b4rxBAM8et;Yy;IL zK(}nwO1HgL{VFrDDE{g>#K7PLn4JVNh>7W}#!0E{(Kfj+V@%{3m02y^tbWKmOoXeV zxk^*T?)96WZW#HdgIdFqwO1Vi0`HWF2mmskdFEO2g^74lQu7@{BUU9m?tXDlV<(0; z_Vuz5?=l|o4e)kyR2VE8$!QwTeiqYS1hNt)Cd86P?9%v;+<8ho>8FH*rrH*ZW!U0x zu@&$lI3kRzKhgK}Zu>0+J@|T|)*&|@DabF5*V1qxRAvqezt*i=fK{!Z#sK4fSx`BMwvE=Nu^%(S zc34wEBd{RrfC(uNppWf`rcnx%+T@-;w`ygCGp@{|>irZEPsq|J=w2ny&W##&^VRu_ z7zcMC*-_>(Xgd{?8V?#cP-J7Ec%D1r$6CHTk5-uNM|Y176Cog?fee1jw>PcER%V=v zilUD+<+$xSmI<0c!j=7AEU4rSV=1JGpSO`;YKqm)3a<1ib^9H|x_0Q$fulhR(vA!) 
zeg9r;FV+Go&~M6ch}ga&gFCtDS7l8Rv@^@;U6)U9?HxYYV8zS9vV;VPSwZ(wize}d zz^~tehYJ}DgTg#InTDwwVyQ$WtM+XP&-D~DZpv2d?^=sjwqE?;5%tyI6PR;bqUNET(Z_J zD7c+>p&9=;n*&Cw6NrRj!3>#>uy>?2x|lV8ey+hdKor^zA@OssUovV%hPS6dJkQo@ zilR&`@G@!GgYJ}YE~OY}c(7aT0J!6CD^5@xfn{nX71alQ@K4ZY(;NFHiXwi4vsPbi zhnPbame7tNJL&w&pKbwtST-5ueu~GBzB5iK5sZ52>suG{w1NVpH zMIY#YHNSzZqBwl`SDq*N8FWqcp$&y5t|}f2)~hh+VrzS_uIu-&jV2RLsZ(fBi5*V) z10wj?V=g8p^04^>>hqINTqaGK!k}bo!Zm+z=Fy|O@=H@vJR?eHoqPbM>X54-f7K8u>or_rHD(&(2L@wp$Vc;B`1QbLjK7AR`(0p{02% z!8_=P%`OdNOD==bTc7}tN@z0+27*w8E~&$OFCQzOfWp77EvXnevp2y*9FAc|Pzeet zL`XF`(5|V^zPsQUVzR1vZ55ag|EWFOT?k!Fy{1D;K{&MS(xv};72}wBK=5ce<8J*0 zYDUSrlf*@J;KUyzF#E9KfY%RWwHE&Som#cDjeT;fWGyXRWc^~=VL)?<$DmOxjzDdi zj+}F#fx$cWHiQE&@ESU~pKan3t%Z05wQ%hqhjVjAP!DdSxJ8Re^S6_}$6)z$ee}h* zfDfs-OoI~ZDJ5lkkaT&khzN?kjL>R8`5Alx84~6|%Q|ef+?e$@R4{J^FCeu;?(m5l zdeN5coikuDRr$}KKLvT6@A)E3}NlB^u^_`{QsId_VNo?TDP+et z3>DbY{A8IM7xy1d2$Luc$FfM8)lCh7MKf}9bMxLQHkrm~#D%;;0%-8)KOnIoB7l{( zB_gDpdk7-)*Vp`e6fiMJR3fs}T7*tyKP($OLhIIXj>vK)M7o;@tr(Jd$Er)nVBvWI zK*g&G+<*-0yriP(s&36@e|hSXB${7TsWeqv=L`_TEo8=)Kx|`g|B8iTqnZpb48WnG zM~}t_%)i|%4rmdofn;>go+&}iO0vw*TMGg&2qngc2o8(BevQ`)Ecx<9bn?u@dPSz& z2llBrRkF85XzMuJWqET%oh;oSGcBMoMbvB5z*ExFK(SHIXLmXax64RZ(5oJOeWpg6 z_TK*%C_C`evca8L-REx}b%LkPndQ`tJ8ZD9I;)Lt|Nab@zF)9G1;Z5{0!3!Ct8I*P z0Ea(Tb#CQt5>beF;uHl#jY}PR^*-IYjSUN%!^tJ;X)cq!M_Bjt(^}fZ42cv`6vFn` z>r(yaGZ+C+r=HOJD$3hQu~CA)ZT92@aW{mzDm~r1PoD(~7e>{OdiDIdh#^2{HQho{ zQ}b6PBhbM|RjTJQgrlbyKW?;Ut~t+K`@orTzJG9rLBcF!DIjM2SU6%j7I8h5vg3&h z|534@1NY5+YxQ*CQyAz@6xrr+iCD*)Ql0Q2{Zvb?(yykbUNN84y-%O*oCQFZ1%aPH z9xiZS=$ZGMT=OT6)vfbBx0c8=A+!&7jA=9GkHf5fhfX7sMn6(tUQxJE7XE-8rK={? 
zTX;VznLj#Y*DeGvYZw_o50CBJtO*l2LT8JUiaCbL*+v{_?X$> zqBDF9?$)c(d;VPN-wzykWVF*4Sc=YIOCk1DAMJM853@NDb`%kiY>n_8`QjQV6 zHl6i>ZPlyCBdH2rhgza2fip{;UEsNY4u##e8^rZ%#xmSJkhsD{(e%1LgTuMT|Vym^9dRr-o zre9h<%XC+TiL0w?NBeCi^5e*Sh&_!Bhk{jtIPC)ZQ`Yvk%hO3{fF;PzojD!UR)A3p zvEpBAJw+6^2sL^NMf z4`=F1{_C}}^(Iy6!&}r}NH?p_4&ms3-rx>B7Zc6{XoNg1*m00U zjc@)fvwm<#b@ll!F811{qZRds4jnLkOj}csiWkVX+BHq4mg9vmScE;Z8w)VW0N>+J+W+$Rr=M-s@ap=0?2H+YtX2djPo~HeHP^Femv3E|%}Aw5 zJC`;ZhD>iY_f;))9Oi5=D<-V*LjL#VJ{o2T_1s5YxI#wpG%<28`7JX*1YAab;4rzw z(W?%0Ya85~-2XE#FNT2L4zWKD;DCa2uB`UvXfgWcv?Qph&*FCwOZ4ZLEWeyhNRZhw zQF0LVfDi?$Ku;4ja<^sqdsueytR8|}(ur8~Q({Z;-yj)Jt3$hX-KF5s!da1~pk0?P zb9zP(W9=2qqQn#;KU4=C!b22Nrn_`qP5XV1GN4if3LwtvCFU|&hjOf(rcf4dut1H# z(i&LXPos}_;bA_|-!EQVo!I>tz`SfW0{0_ocL#9>YYh2lqU$qkKkOKnN1@+H+HP4) zM;knlP6rL>6~GZNJ>2`!^R&qAdnh#x3=%Cpjy6IaxAe&w6%uACPg|xS*wh$(CZ~Jb zU}XTSE1tiH0*>@g)a%)E8=rI#dU9MB!68q|1cp)41WWgRe{HoH5ORUbG~s4?8+C(i zbZcut_h{x2Xs?Vgw4 z>$mdqLa@9dG8?rv%TM)DCAQK%H@18-0oKUYPEsMK|2 z(sacV3SG5#)0_Hn#>rbEcJou6-pzGo=yUE8G|(m z4tB9mXooTpQSR8eGlF=OT>JeMejNl6;dNQ;GXCTPOJE>@i)y>*wGD2D-o?R=qUZ!P zoqWRi^BYdQcAL8)dRX$E05+VUmb^q4F^UAF0pF7hCKWVj9}{>ud#oiFB5j28(gIQ7 z%{O8#^Zzc?;UNqqlE(v4SMV`@zCXI^IiwC1{1!%D6R07CA!3^K@X!^V;McfX#{C%F zJ&$;gR%f2RI-&ysn20PViGp%MX?6A=Ju0yUUab$Q533U-1G&)c%gW|=2sRL?pU*1x z3Y_&PxnWsZtf)N1Z(u5zZaa}u2Z+z#DyEpD3g_F99)TGwI`c*NfcR+`aJ`vuUVsdi z$1Bq5Q5y#!(n0RrsGQ*%A?oFG6Kf6|d5)z1;cawtb)o*qCZ>qAl`d)8nA%{p>kW`z z{0*vlbxlz-8gYTq#Zbggbyj(^8D}+Q&AbrA0IGLMnt?ga91XET&)@ueg)~8MmttKF z$0RdR)&IN-;m$u|;KVfGK=?K`O46-n{YTxuM>E1 z0g@p)+AMGr+1JjbExj8)C^sR0}F z`?hV&*zBZf<~%P%87J!ORX`@v=H7VjDkpF_KyKJXtbP z&TFLC5CJFRjX^744;LLZJgEOS4fdFtQ>jD8KGYd0)FCvjbZM0K@0g64+*@W*Y1D+t znScIZd!Xw~&fz%N<+{2$`q3jifws!Z)@4`VG=Y_Gg1!ADZhwo|Hcgz3Wa<_D9H(s{ zX$8qw5_FRI0BTL3Wm|NNcLQNsJfR2)!(ZCWl zr!+PE4pd})RRkr3cxB`=?wO4YUNXi2ltZ~KB_C-oD90I2dwQ&`E+nY0{1fVRnKzp^ zubTkZM~;Y%5cP9|E5tO!a~b*h8n7N9r?e|`l3k~xb=zA0P)TkT=Mawp&*FhpE#2V7 zk>p4&uPjV&%;Oplq5#l^hRD4ecA@_ z!4V8VDi#h3E8-E$&H>>M@%Qb(16^{rh&?xGCs9{4pqP7ruZCdEXlWT{puDbPnv`O5 
zI0cNvEruzGTOua{%A5cRjloqcgb4gEQZB}f z>P7+y3dqaQF3{8@KpJ=@8GhwrS&_Ry?L<4wBd^iVdMExy#5I}JMPfwS>)x%~7Fr^3 z3E4V;O^;8VD_kB@H(s{KXSbqv@8sMdQqB4}P&HWmc_vPp)bHuC-b^8(Y$n(pLUZ*$ zZvdIOBW|IZ;`(wd8;Y!ef`tV3YC*2Y@9!%_R6{Eq;WwjPK}4NAcOogLq$+O#I24bV z-mooEmfEg+_lW0RTW~o{W2_Tb6&Ue2_YgiMdXav&6t@Kp+gev!`ky-}f6t?M1nNWZ z3@lhC6kk)Qrc@;xt1xp{Wm4+aeX9*Vpg_8ReR-Y3Gyon!Mx3G|KEp+qCknE?ONW~L z2px*paKO3<7DNAldhAQkGbDNe2QqKnY61Q4^vxi69G@5GMZ8p5XVJas=wZV`fZ}Cc zDkm%hxTym8Z20h>G1>p{^kt)o@ak9R+^4m?OL5teBMPkp(g7x}L~by6%Uk=(O(-d$(5|m{REs=uzO3kKAEG(tW?7=4h#EYLgzr)D6K z8h`%s<&&Y~6>_xq=ZHuaggMn%DH)zEkFP0&`TzD8uurtEM}VGwxu!2o1UF{Y=RxP5 zxX^CUkI0)toD92oW>$dB zE!0!JQT9O8R3JEDjNl&NAmJwK`SU|$Y^HH^D51~RpCdH!*<@F)2n0D{xcA>h;BQR- zj^X3H*H_k{w`_q9K-QiMbH=nI*?~i$D^hGFb%Q2oI;d69Np!>RjcSTY&6vwACKb7l5 zFc5z(0v6CO-`D8-sDMneZyLKhUJ!u>ce>Z1-eKhNsr9X2qLZVKNWSNfx%BPf zV`R<@3l>=vLgiwKO)IeNcKf)^@BaMMrlM#jigR$5-Y9(#yc`5Hr`4#s_0@2yT|d`8 z>hFGN_q|Fh8&bK_>r1pbwVzwnA`x9s>*6)?I@ME{TOhILSJ(9cOv`-mAR+GuhlOC{ zWArBhtYmkW5&KYmlm|U?-}mfsMurghTen>~*n~sPWfn$_bJiR}0ph9baZCq7J@t=Z zJdE4i2Eg8dpb;)dnjqi?ECBxLzz>pnb=*4k}9k??!4YGN8e55kD!UI2Oy9PeBl z0%fuV@J(b&j$a}8IV>u_J#Cp8;(1O-^#L|Qg-sqD{dAt^u~7VocZY<87@1pjYrs-n zSuW3ORU4VRuJc*4fUM!hAS5r%%S#wg#DQ=;3pi^6yYss8crCRXg|5QrhuX4c1qsx2 zKnN3F;-qQ=OGy2=i{`?7N#Ccy{5umhF*mWGX~37t3qySt831!bmS zA{(3}iqz!X#2tz25r`sGJ^~nk=Et3n(NFg3B@5Sx0{@2Pu$?=yBUOs;=`~dl#f1rd= z9H5!ZgKi&gX*yBEbk|%hQqS^&)Z>Uq2T{yX93Wk#D3IN_7wCKVgAa^N^?1~dQ<_a> zaFf3&GCCTFchql?@k0@$i=6HML{sIf z9Mp(`;QJFaJ-3ZN8G$Up*Tf<2>{(gyEt{`IJ5T5)w2vN|k`c%uMATtdc6h^@HRao3^3xw`RdXr7JvaB3#lYk=(x zIAEqU`>vgg{|&~_i0VGPy5AKuAs{f3WeC5-d&94B9;zNdN=;%1I40(gi%Cfg(I4dm ze%hc3Unjaw$U5=hMr86pZFcqd??Y!=-%aBYB3(VqnQin?D-GI~*nmmn$49dj)*ql2 zMs*Jjaw@`1EjhyQa{3Xr!_h^uo@!b5tlMbEV6=Gdbm->Yw_<*CKj=y79Z){vi{ zT3_eEX0jZbTm`q`+ySu+;enu7HeeB=md;`-eB`yEgaISl`)aMWCSJO9^X1Ezd6~7= zO`0?*+qDX22hd$QD#}&NenO~w_W$d2Fq&&28q`fSAENzKum3HlKx%9HmAm+H*_@si zK=mPdOaABkjN^Cc&pBRIcvGySklS4Q%}ylZ95LY0#pL9GIfGa8>a(&+B5vRJ3h?Mc 
zq2^wkiinL+&z$*LBejc>sE?;|Q$gMyd4$nZo5H@(he+ra$#0K=bTPbFSLdlAV=W&F z364K6GU-sVS);ZpDow@806aqWnofBeF0*)wZce0T7 z;L8pO&{)Vak79@qM!CQyO&L0;@MDpE5##Jfb4wf4s0LahJ7k3l> z>;c1xNKb&O#M}Vha(Dno$-#@;EB_?I=do zr3{}lcW(X0feRKOXdyFkx#DPIbMroC?phzcIQV)w(MrIx; z4SzmA7T|N{EYPN(#(g_;T&%~`R(Fv4pGgelYa&+g^au}`WIS^jV@$+Gop!#Poef?L z+I~&#t{17aKT;t;3QP#Od&fA~96NSaH5Ppzup<%VBr-jq)h%?h_{&YA1JF2H5CXx> z)fu-WIlKj}GMdwrnWbr7pS1BUd&gXrs6&wgH%AdgHQZ83shG2gLRS;qg1d`&=UI=^ z_Aq+L(9M1)suj{29%H?WqhrB@`v=4h7%ZNqQkEN&8j(>%BYBe7jzkSKDwB2qiLC+o z*7L-JrI1t3=?=a3!*4s@zAg!y_bCI6PE0q>nM%-+7MN^Dd&RFGPm@;Nb&TX(00MS4 zHlbQtSYDVO@WckmFh_z|pZffZspUv)2FFHNCR_ThKRgv?EmsW>6)+poIRgJ+E{Q>N z7TR{MmJyD$CZq4xO)<`z7PvdAeyXAN0rP^te@^PK6RMr6aPacy4B~|s@Q4;|5PiX6h_=_Xfr=1WDhK9ob3X> zb@4`q?D)LMCP%Ls*bOuSYRb^A^NELB&_F@=Fgh!tTh1F5fyMgJwUdmgBMCDAKvbJL zPnW4#_j=}%1b(IJzL#&O;A;|gxh^uTgfJ(jBt)x0w}va24`_P8rnyo_%g@g?+s_5m zE#BE~A{`$2VQzJ>S0joF&fP~v<)*kWEn0LF#a$)>OtH%%XVZ{7XyzZqJ03h8;t(c& zAl>6OVwsA4&qppv__Fv(2$SH(C5-kr(P^#VKVSo|?s@8< z^&>BkJ_HKjP9PLI0m4fIzoz1y86>g~9hV~>pF#R>L()(7)=j@qbw#z=Di^b5C}L3h zGf_!=)1VMJ&!`i=wdq&{)1LRQ=$_E1C0x<1(ktO9-eWC~w7X_ru`etvmxMDj#eE2h zi($sT&?Vmq*n&>^) z(8#~Tf(1n;$2-^EIeI3xg}FvJ;=vr2T0t%u#H=2b=y=Zh;SuXjrTSMD` z^GLRLF{}v+BAyNe7oD|tA8qqVnab%GV@I6zsSH4W;olH{4YJoY?AK~AMv>AUz(!AS zKkaLxr#~3y876uz;IXi>AoU(%Qw{qwY2Er;$3EFdV4|qA;3*3-iH=F24FWlfaC_K7 z8V*xJ1vI(0N7*P{P#m5%y(B9cPOO@Zf+ zuKZBMP~!_;$g&UVoj^`Zl9U)Z6CTIl@Y%;tpD_fSdy@`}Pl|G|gNlj>nh>LJ3R27E zTA$S2$paY)6kHN z>#S+KGjg8sdzeSv$6Gftgb0}?dBSrb-%fO@v)Fyf$E1q_;!+gXg4$oCF|(~}ybD|P z7&FGv@m3VE6O8y|-~({1pA?$}qxrPgHC=3tKlI|6lO4qJOH4(1S#&YdDuP{pR2+5x z(7=;B8oDfB{wL4ph}MR2-CQrYesN4D)lo6}*8Ei$Vi&Yvmev0pA!K{{#Z?6wTfo~B z7TpWm-7!HCm$t{}j!$NOkVjAil}u#j0}99n4(QA}kz+DS;83KRtXQ6}Oyx@IX-yQ1 z<~`K>!^>L#<&Oqx!nR#K_w4Kt^6sOH83|019N48GI{aNWuV+x?Y|EZx$r{G>^0{#n z8H9$HN~SuZ4p0Fqj0#>@2SMB6_C&xST``e$r~Y4zuoTm>{ZE(U)9P@@X1~-{gvAWEh(rF|a+#E-PX>uKX7>k;IyDO3Lg}Dr zx0=|@dtTR>`hhZw9vQ1Bs3#OhH@ONP9>3d%HTU@Ze4wx%fLcNNDhF;K+Fbkj>gG2- z2X)gTY;~(l_PZ0HVy2&LMu8G|n+Myqq&eYz5bY 
z_`;85$KnWO@d1T<=4ZR-E|#u`mz{tGjtsYu_;G|eQI`UFswN~cmr(|T6iRy5u^y(m z{4UBV8TOW_Ly?Cy%JV$aa?80{85t&H9P4M>kt*`W{9e^wxC#0iGVnu?X56Ra0wMt@ z|KJdU-2M?3vK7Deyt!6ouZF6i);sAq=b=jy3VZzIEPT&txu&?S6wqGmdBkUglUMdK zNr$OX?$~*-NY9S9yfS_0WbeW3&06w`c_J}OM$sVq|750z#&j#?nrwukv@ly)~-$5d^9O(2;E5r z3V(cE68;a(vCe~g#U0wWC--`sIrW|vg;hF0YA}CsfKxAgZBJAkT z;Z(6=-@|A-!F~Y6(!m`+-qO_nqN`9`p$Kn{%7ddIGoPfj|3lk6=fs8sA#^Uw5Kk=v zd6gv&V&8*}3*h_|m?S|qX%|lTPHtI3S>H`d>qOoq8sta#A7c!Q62ijb(I;(aQpo&r zzslExmLV}`Jl$+3VYHY%NMC!MJQ=T^U6B{W+@3eD~1@B>)cy+Rgf)ETF)S5vsfu|W@{~LaA+>o;NTd*(U zSpl3YF;>DQjfgi7L&m^cb>qRx(hekJa0gsIL)muFKh_)D(<{U>r8&5Z{ES@#tI-EDK=C>MNXeEo`CV{)#oG3$SvDJCm%49 zNh#ep64pLu=@Fv8o?(#12xXiMWsCJLT?&6D1TAz*H)mqp$l%Y^_z;-`7KI@i8F(yMlXO@k**D>&+^n{ z^;)f98sr~lo`<|eICK5_C=qF=K5^F9gx&k;O_SyWg@YEpg!>a`;f2%+vQOGxD1ucq2y2BZGAi3%BB-0sAvl#GeR2J=$4?&R-kund9=1tq#kvg#lH*UfcvR3-;4R~0Y#2hFEi<~5*CLzed@Ol- zR$WlM5hK*$Q9LJJrCy?90B)~ODBZ*j1y+|mhaAQOGwSM$ z>-LRgfd+DEczxyAMilijw+jtJG!3`bzKVdHmw$@vMcP-@PUJZHm`5R~`6rpu{#4cf3sooUrYaCLRS_B3)o5EbhmPm(y}uX+ z4jn!wbvZbgC@R43+~=Cb*^2dv*eargpjdda{fb(dHu$P*@d&U6!sA=|(x|73v5!8W z9ld1Kq-M(PXP!3B3m2~8cRKwT$6_D$wB3a}meoeo64Wz;n56;t(>U|5DF6HA)Bmo# z9Ly7q1gvV-)y_Caax;KZqGj2ys`A$t#O;emJdcnhuq)cD24da;1WBLYi+3=fS|;y~ z3|(PP`7ZaCVh*Tc(5n383AUO|6>MgEcH~r`l1h1K(n_?(#1cdT?E#xW=h<>Q;k2Vy zv#rgKRu*XO=sL>0zPhpQ_94r{|5}mbs*-GvE|k+6jLzq|V-rSWlr)Ww5s93bT|(0< zAB`6o@aO*i&^659a{6~|>zFoK42clEiCqb-xyL>qufSN=WLilQY$n z;q*WtiV1bmKp`|uqKUx=qH5x_P2SDFu0i}E4m1hFD_L&cipd=@oEE?jv~Cc+mSk<> zH^`V1wS#!WqYL^}8qT6&F@@%cD|ssBZUZQ!w#jmzLippp+D^0+IGY?X;*Oj!{nf`0#K{A*hG z;_XgL`?<|su)yL}`k4n4HQqhz_2zFdr}!?)mm3+bF1%q9#!3j8qQqsW=C2koPynd^ zF3UAa2oH*iS{UK?Nv|`bG9>5f?#4}3u4QIqj2rWJdw)<&DkaJewNHaqGPDoE@GWc) z(h|sRdB;Qs;+cxURKzu?8K&*tjMvAe1twF&{$_FtL9J*^@Zy7BMQlNIP6XtZu}L&5 zs5$>8_SOF&w3h-5ER-VU*zk4~#9{|diGw_BUd$ldh6s0Wp$^^k?Fgr*%@Z5pqPVo* zY9^%{)?K{j4`B}$?AP&TDXLt=vD=9fgb`vmBG-jW5*#|yHA^u}AkA`rLtiOWVeMk( zYYCP&KsOj`fuvM>V6%SY8>X9>FM^=_ZGQ4blPK|UMgTyBaE+)+4oqh&u!zf0UOpb? 
zn-5g)7 zbDbz@sx3llT-__Jo5fD<{o^VNl z{f>)?d*Zweyq-um#_zW+OVM#P~mPghR!(yCiLDf)HeiK>@yph{u`tS z^Kk*r=|^+c{eDEC5F!J7X-?{IU`f<@3`u#*tB)^XvMYw)Q)Cx*8NLgDOXS$>8gOCRa20&%p=kR3$JX7FiT5vA`wb8Li;)zHs8sR_t1OoG;1x02lo)s} zab4+D;ceh2A!9I~yN?JGPdKDTOnGemq@C*zhDr#O=2=XSsVM@(!rDx{>B49(oxBhQ zGQ16oIA&;1d&`l09igfi?lM6hcKL9HjQm$7$P<+ zx;pwSS5+V%{+rszVK3+t5tw9wl8gTxoahrTOmV_!vh)|-?MP-NuF7^Gg3bbr03m{5 zC*KlOgoF_G3;pHR?E_cjCm-ha&j*fbdJ3EBtl?13UofqvD#y!-Zu=SvMBjI^bFX zO2n9m|0c2=h(v}LwEX+ER1)ZzE*6M_m}?W+RxD-UYzu+zd@sFv_3GcM3N}x);WPr` zNoCMXfnDmO zNd}}Mf_aiHo7hI&W?1b`| zrb?F~`#6Ww2f_9JCmS$D+=6Ds345ZNbDD z%wbxZ_dVUL{(R~gpkFY7p!z?%k-{^cGM2gU>VK@7_=#MC8LbK9T@N2UD%37%YTlCR zVTXx^>@CirMieLJZYh_+rpuVg1;p(ILJpDB$=*6$=Wl!=K+7M{>uLQ)u|u-zWIs85 zne?s>!(WzOZDq?ZrZvh1(3C+hWD%gBLcB3~)1utodf%wce};2J$3+=*AEstv;_!LX zrAw>7ohe=zXWg>yQR}b150>u+I?L&CQ+2p|ONtlsQzg$3F zz)ReY9)RpHcuC$Cf`szYsSYn`f>7Ow6XUyg&R4EZ%D<%+;pG)$RqofSt!gvRnq))0 zQ&ao+_SduT>E@$j<5RGCzhOXqT-3=&=|MdLL;ARR)OT&E9fIOh-$mKggG~p1ZCXA3 z5a~5y#Q7CRoquYMX}$RV*xb;+29a;I13tLzC1cHe(&?`|6Dp!7LwAL0@5N-h_CHCFh)3ro^XnE%#nx7OG;vbw| zCBhWw7IHD}FkKKCn2dG>>6u1YC_|9KjmmnpeF#7WvgPB@0ixd_)S$7VfMl1_U@ znMUAIMVd+nDSj0FHtw>+mm!-x-`?C4vhLiUO*uzjo}2qtzi9lPp3}Y*c$ubcp0{#< zZ;$%w{m!4=+(y?NJ9jQGp~t2l$AkNP{G{XCC2V+iH;SMT!}IYkyN6p%`k~sf#qp8p zMeY|ubo!_#{{CUw(!7Ii5P*;bBQVEZ%2PYEXri~BcO_1;EAx{lkr-L<2E^T2C{75l z!s@RF#(5X2NZH7|s#eWu1?&%HQ5-|>JJD~5f*Z7%u}&y6CCYNj{;L??!nwFDeYUCh z)+ABR;)Q`rlW;R|ZZuFQDCk56&SQJFNNe(>NwUa-BBa>?bAyN1*1KJOw&d|d1C6AY z(O*ky^e@`y^i8_tm!GoBee&-g+m}6?_j>s1zYZGi+tUfB+<-Ch2?@cu*XDT_zumnf z(8#OYJGg*tgvD;22S)tr)M|OYQrv6{gE2WR=P&Qy^#^CGvghM?J=66rZo{czq+bO! 
z?`W^=U6|B!#E8Rmr?_g}W86wH&PcIIRt`a}B`P_<+Zyw1e4opEV($so3u;jG_O18i z-V4aq4Avy#T|^the46-9oI)#uzh8(b5Qvo_FS{2WUSL8?P0Na##XpC72B z0JI3(a8J8&|Ngnh?klN<5x8bhfE?^eF0 zPx<+|qeh*!v8X(B^Y4kx{<&a1>e7Y%?icH>cRUdtUHng%F%Ak3JZ(PcZ~SWRb_Qwu ziOR#)j&afY&cEFv=7bq1#*3$6d~&=-Pz4JA3m{#2@KYJ~vZ zoyYMopcPZJF+#{0*z)m9k{IgsI{sn+OE|69&h4bP-{_UiFGyYxl&gqUEZFZJuqv@d!+ z(2BWt-_&2-NY%q5=EQ~Z`*$ZLy`6Iw!i|E(iV_ONmv?=fZ4~WNQPI}u=*0|=G?!33V^O=1&ES(fjxXEhblAERN2>~ z71?X~%Zob+RWq0}<1G=uWvygQ-)<23)J1}sis%h;LbUS0JL@nHBOjxpo)=Mt8hi&8 zu>pRJU`CGa*Y2##J;$sJh(u0(cG!D%f?I1N@g+Vso2E1ZLVh7vgA!iFy`@atAzzj% zmkNoXAX8&m8s-}?mSpt$)|>0AzxPk}Kj^deYOqHq)!rqcM!LKi{SRf+`&B1RR?kcd zt@wt4Tlk_WIf}JCt+jI|j&^I0gPu}BeoFdng}N1^`W-mG{IK)sgHOMQH+8s6ZFs>s zr#-KNXCfOsxX!IXyRhR!RXvLDF~If(XPFf$V8k!-GJE#T;us4)A!HS>57uuyg+Vwz z(|K+ON=N99671li{Fhnj{CJSBRzhYWW${y;P~M16QD7z7M}-gCX>~O^41&O;(ZjxH zGNymfE(Y=2(X76$OU(ScNqkdL>arc;?`O~KR-I>~m)LpoI;*dlLo*1M$w+zO2wob1 zunc91<``k7^!1sUvJs5^o_A1ibcAk%p_K9#3s2$ac{?IAqCXIxHaIwV`~6;ITbT}F z=}HCGeJRyhdlnC!3G_qxu@IAFiR$zmC#_;+HkA$pixEt2v+2QN?2|tB$Q4Dp+&Eva z1NkZIN?X&bI0+y4@yRamDlsAiwewFJHWwO%OHbbqA(R)ZHgjh3pHCgtCpkQbwF}Fi zckiCBrdM#r`o;J5*+1;qe$=R(c}AON8W@K4{9RdM_360#w|VczyV5! z%=m*dM)Q>PCoKm&m4Ao2N|-pFo%}_VOIb+D@u=2MPEgnv za%Gic&ZncH14@bG9|XZeobY43cW>CIFTQYR{;l z*>mQ(ay+uBB^2%z5s>*uqAC{qfaAxzVp5NcpgDxdn>?+w$_f#g0c@pm*>G? 
zP>4-AP;dR#5H}8*ke=PTL3PeRE4&q;lYe7?CSW@QByujXikwFWEl>$`(>j#dqJ!mQ zGAkt90W&dYCjOod*aGg6#lN`XR@IyCebdVAcqZ5CwCw8Dd+QMD5>lHC&ZPeSGD^Bb zav~PG;e>6u@c%>Hfu}XjgoD(JXFv6txE3h>bDfWYex&BgVm>8`oWIEpjx~G6?4^F= znPnUqV&(hBZq~x5&+H6t*R43DN(*r}R9Ahgo16ZsQ0vGKahH1*or^5|M_2vXr>B$5 z4jt_8owX|}H#;EH&e7OhVN*@aW3TUN<4;fStz!N0@uZJEjvm!fPp_+sUl~zovkpN& z<&1xDbt;sQj39ole{+)#0IlO{f4fCjMmi8(hGKEtsslzYVJ^5;Z-vYMb zo9a}uzp_?_lfpOkQ~h;-jcGV%O3!`)5*$i633M=~vPZntqi$9D$ z1aOICn)3B?m3_&<**Hy;Fy0fGBhgFFO!n6wXdELWd$Lgzf4cCAdf&k*r$e6Co?vn7 zAT-#dICcBw>0YcPkO>B$c(8s`LdFPg5;3_HXD!JA&VX!8>@X)*cWdK*;3?6b15foz zwD~Z|w@Cp4Kq>(gRj1g1_kyMQU5*U(O!DGfPMqp@zMDQX=vE9XhqHXegzjI2QEJp99EOw{NdsxdxZ{rcLr514|`e`Z#S*x z#|h4nV1C2`0MV?Gs{H;bWUn$3`r&N^r{3x)xCQMAmr#?0NH!8Mur#`sWmIA;BFBm$>*L&V5Bm zN{QicWzp?H>Sr0VUo!ZJiQ7aHCq!%*C%~xUx5zbx9YbdV?!``0igEmA#{4xlEV2Qh zM5fF#SxsMOAJlG%>eZ!AtF zD3m+CnlK=wa(XLYoFpjW+5QzV)8l?gNr|wm9ax8b`SP$|^_pMfthN80jQ+6(9k?a< zf5-_}26j>>{tO7OM)?kN@s`>(%cbWVKCs*k^rD(*HfR=GPHa|vGISc!GffMpQDHCF3YnE&Dy!`gX7 zB^F%U5NkQWX;S}2y?zSWfH4fdns&>dp~Gd2`Y@!)e~Z~DT%f-Z7I`f{lUEeFhRzeu zQHS65P5s8PP$n!a0BQngQz%0M>jS80t)efM?JJuF+2;%8T7Kg0pu-FBR^kwgd>&lA z!Enq9$i{lbMtDJDMxplM>lGSb>PO)CrjZA*j2|a+DeFJ6?7U=_o{0t{Sm$p7~D za2GfqZn2|(dMA|wce8eVOsNQQ=d>1AJS6!IiBd6%d5rs*Utc#KH4 z_e!X^`26O$au-V3E7z~fAm7>CpEOr!Fho!-mS&*74HyuR2A{t2q2A*y#m4$r2RWrV zHgCS2Ntv|1)V-vg)wBX2z~lILj4lP0_f9rJ^kMmV{4;tQE-{7Jh%J$^sDK5l;stMo z2B(eAJsUfGa|?3T*s8^O;LkA6 z@9Kh7zkGRZ$2nO8#S*C=xy$}_NjW;Rr<<9iG9Yo(x(qy^()B? zvW?d<)XQ}~@6I$fq7nJ)=sSVj-xK6f7|@6^0*UqDQRWyxZKjf56g|_3?!JZz0Lp!k z&{8mVMi@j9BxW+SwS47Zx3c1!WxzRqg&$Y=R4```Ru*M zwKciV!ffWvq!?F6e6M{#A+Yx$SOMWL2_0s%LF6E&An|k+FGN&5CxtZ7DuAYdLxK_S z?0&jQR4_%GT4iMZwGF#CfFja^GZFTkb5Qg8DWQ;#!|IHVRVn=k6XP81?GzCVc#AL? 
zCjQ{_{G&dl(R9teFZHQ`f;lZBGCy|wcns1TkwjAz$vzKM-yCv_amBa!2>4rsummc2 z9>(Us3b(dV@I4<*0OP}A?M^V3Y8xrxel8S#Fh9N)^nt*HgYRVj=b0+*Vd#^X?x=V8 zwXxwz2Ij=-q4%`V$tZbQ&8)ZPZJm9b^}6hgcO|d%bE{7!VZVt!Mkpu_85PoLHfP9q z9C2FuW(wa>^bGpWw*s}y*HC<8&F5XWj#`I@D1+01hEZYzK=P25R!%?fv4U9_wL9Iy zMx1>2GHJ%9I=2;*r`$=dT&7z1xxa6%hGE~5%`0DM**sKvurP|gdl-z!iTbnF+G4_l zcs7cHgU~Dcl4K{I&{#ZO`dZzCm!`0xOn^VeK6a>Brmvd;X<~A+;I394{aVKOQOq1skp4U!}6N4XSMcvLqNj{ECNNoakdWSq`A?71)V&0NN?%tT1Rq zAbXcz`hmeZe)d}qgUBVC9hm(e(V5V#pQ|9LX4;OLEXaWEt$pW4k@FKHkl`CXDpMOe zqpb$Y1cqI{q{7icys-J;nLw0;1@r;?j~$!L?MFA?_&*(rNE#Sda&Q`b1~AFV_i@Qd zo7j+L7mcbOuEuRaHagMJ&@AH_aOSIK%%czDY=$J2dC#vq52mp#IikD~ABDX>_n2Uj z;E7rTEKPaWiTwwnz!y5QlG8NGJS)woxTx`q7y4eQ!EI{78r}A>^2fS|of(dOb07%` zRf%le9P7SDT!ZxH^#8o{3cV+<9J%KGyi6_EJpzXD_rs!JQs+Z0-=RR1*a?v!&H9xC zg?%C{`wn-j?8)tkhJIX69ooM-If5dy@YAQ*`Z?jT;=chG{c(c|od4F_>PQfIhd7!K ztqyEyu8uaOe{f^!iYc;silEwL;?haXwd$$PEt!J9a{{fg1Q*d+Q|XOGwigD;_c*d1qFjTpM~H@h#~$}pq&E;PA?dD z1@nN%9)9iPLH@McE3l?4hL?wyvF=K8jUgElpkUiEH# zRfRDGLc+EO&zytT77u^`ARLJAJw^uI?Z>9P1$~RMELIW+2q6fmmuv2~@Ow{hh+6#0 z-`W>__T6C$K+$t3mXH`WFkg$v&Hn7Q+ef#VN4>+igFs>=<5Rxhuj1Q-d0%SHM;&qZ zB6Kt8?X~pT{aafuERI^)SI=eLY3uSw(=3zD6wiIKq%g#>aF?D#{>;%A(%D$jR7!kq zKgyLF?FHb{9!sP4&z(E>o5L%+DGs65ap6_E2!tMcH7HF?H=t_13MMUpz6VnZdB_ z;oV%1bUS^l%kz}Z8iox(4t1k>9mA@AS1BkSlMLX~CU<_%q$>C)62McDm1ely2tMbL= zc7q068cZ5zY;7AC;{MN$7BTt{pE+h$maP7yTK%!UC#FtdGmO?{q8c16QIiEdjdf2@7CzB@sZX@@gW<;lns9JH*Sj$Eer%jV40wS&G^XJ`j2FPMG#v`zaAX5o41~8&(@v$iFfy>*V0zH z>s-SNhjD=XxMC?b>|%so`Mj6P@u_PU;*-q zA``Q!X5IR(p??uGH#7lRG&mL2)yiKR;me7N0~BZ=-Co6Kg(g|M?c+d^Na{YUK#YA- zN#iK|CjIzCA`;EjOx_RxuFRt|yhjZScz5o3$-VmrGd32uO{kf3N@?DFzZogLD% znvUO7QxKk=l_kK5>8?I)liMSwlpO&Zd{;H`v14|^0EA*J3JP}YO`mIH`4PZOwxKYC z8~uFrkX{XwI(>Q5c;m*l_ZJ$kk==B(;e>2)eP`7glv)Nzus8{d@=G-VFDUQhbLVGP z735oQ+&J_4cdur#_PTC!jt+MFUhZ&s^2mbUUkgw^7Wo!$ejj!jn{5QyP*V+PYH__M zwbfo~grZQ%cFm8siKdx;R!wy}cdo8P%$XY0A*JwMA=!#fa>ph^??OlZIule|u<@qh zd3HIJVZjcRUPTO+;4twj?}T#akzM;L?~CZvuKN?M!$T~rc8B>T8(#73{Q7ld#=thC 
z>Yv1}e%IvU3LTY7b)SCuId`EDW_d$4bKPHZ? zO{-hq?9%}qtDwUkKW^MMcS)&mG%%P~x2Z+Tptax6#zieFTfgt^%WL1NI-fU>vJMayPT&j#`S})IMD|zTd6<*R1%^ z!C}K(dX}_0XWvUb?XSPe^F8W!RS#08ROPeN&H*wm@|Zi&U+kB{!W$unwk;*_I(?pPKsOu8K6p7ht3}m)`^?&V&2n;b$*BsHPe(Lnwxw(A|4gc&>h-#PYPEZp3 zT&N1>`96KJ0B^(nwjnj_|259#u~6j_a|zpk>sHo#_E0qMX&zNPV)e%DMcGlm&Ukv{ z=;*KOZ5^Z4wL_QXGe;o9&Y^6@euUYiU|>;wEPf^Yg8%$~4|n81oVWn=3bUN0irEB<@7 zHKByFx}liib|)l6Df9E^diApt_D=L&`0{c{a)jNvi3co8bL+!2o~Nc<9FF#=UNKtS{~F(<|W6o>qfbXuF;H{?GpRm5OSSSvPx6 z=^6j5q9`x<>im>A8t?hywu4qrhB>jb<^N+f6i^91Xb_Ak=iYt&x@x$0s~gmUzc3f2 zdyA&DljV%uBawno-@<8Zj#0E}qcL%p-h&4ZeRc-~6fa-)RB5kfdO^lD} z3z`5Y%wk=EJ(%feQ|o|Y3dwOMqkEcoqac-8%_bRUpSJ$HM_8uL7?aFK9V|1BGIfT0 zTCZf$0;fnHi>6{Ngxg>HlM`Dyw5elbsmuq7zmh^t>1g>`}c#vwT)B;4YNBkd-g@wi_w?dH+*>W|FCr3fmrr!TS-aTt0=NX zsE`PiP{_N71in2#!Q;4kZxW4Co|9sxK-PiRS=XspR zm>>9i&gEo|F{M$(h5q)h8V%iGPaUSToz$MFF>ic(^SRQ_#dt}raf)*esMv|phmgJD zHYOrFh;AL)43g9z%y{6*xGV?=83#ESzr}j#*^%;qVhP7%bXqqdEC)L1CsMug5HE=V ztvYNmSGCS9T)LF@*E82}>yJOv$zpHsv>CEHL0Xr1yqCvR4|o0OH*Vu$kq~w>2Fmt$ z!{9g~f-f=gpYi5CqGA`0dT`od&`Tt4i|tw1q5YrE`6*{_f!C7#FE6i+V!bs* zYx9Cb!Kl(c2`b&Z$jzC)J%+ioQbr5DmPlzJ(w>N^(|+tk0KCc}%`I$L;U}B}&^1~PRBy2E|H11bMf?XCczSf zidg21u)N`4zdm7I2B|8-ithULQPwAIsjU|hTfBEAoe!&5c(vO=*%Rgw-hC$BU~RCy5hty?tx{<;#{y3j=cz8T>0#B2SN= zrO@1zI3COW+}C$;>v-&_&OvGYN%wNwXQuT8vP{g4h*UO6GFl;W1S+@Wwt|=ZKkXf1 zzJ*g6C1p@Yma7r5IkDyd^TRHmz^r1AUE2x#pMW8tuOt@~%#K-R)8rqVH=U5Ar1)nN zp1OT{I@NdnoL{QK{OQdw>GY%|EwM8iVa44WMa}!UxGWbyZ9mU^Q@}3K)nFEX5J7#y z#RtB-H$;$txzMU%=M%JZz-s3+wT~l1_Q+;bEa4_ z;V!DTz2#aZ(h?GfuaB8NakyJL%3)Mx*ckqAoV`Iq`j%W4i|cApYIFQ)k;6i(s}ZKF zoqAh)W~6SD*N4XFo;!6rjDq?4O%=fnOnx`kR6eG?R2g_CYecclb4qC9z*flih2?17 zq2S=HmuFmEm0X3+hbkYBjWtNpWgcK9Xw?|;Prj$o5g-6&uvHPa^#uzQCQ;8eSD-+F zPtEH)Q%tautm7T`B1#&}5kOVg6XQi)bY`BNurZ3Y-u5sm2lG-(WJ@#$MPgjK<9+aEx9;R_OT6{K0}HIoy^o|n7v)-sj5u%3QbmF z)-S$5@$=4YA9Y#R%7=D0c#XI?(bpDcNclQOhqDwCaYsvOw|AoMQ z!qyen)1s8yS=qbqdR_h}EnI7YZ73v-DDknCGF*IxZh)&p5K1trg_P^czzmV#3FxBo z_Or7o4^D6S1Tm-0FP_kOw6ds#y#}u5NCr?7`RpcsP~J 
zyH5CRHUsK_B)5etd*jCSb^ANn_rVTDcsIZ950&k>@w4Q-cZP13Oy1UN#{C_kJCNPQ zHE;w4kEr{y!^GQdlfag8%D_`$JY(2PWVI8599(pSY7=xCxtT-&_AsLw!YnUTW}Kj2 zpz9~#)OYWOGDrC>B+b`~jS_=mcTsb5&5l3cb6!@Ol7j4r0jGGGF8|f~y_V){rl-w= z6^~DkHHSy~*w;kmK?c~``RTo1n$FxdQtoT08xBWKe-nBtrbKb7-6~Dv;I3U0_keO*CZ#zH zp6+;uj~@q$+#r1KiG`>w00D#qXaa*kSTK*L!^1=6x<|$dN@(B#U}#~U3HJ;$cq)S- zYV78pf*=~4511CVf^>guJcg4XB|$Hvk<(QSM>;Hjv^a!9Xb8Cq>gqRP@AmZBvnNoK z;yqRC=s(EQqETBt3gsvm2es|(f71BZ$rLt}C)YQN(YI~mk754SN>MscdH8dm_fDny zw3LzeEhDqS3LmvI4hh|twWNJT!pGO(u2WMzn&#Hl!4;$Ov9FxVLrpDu45ZV<+@HV@ zszl@CJ1)jQRh;X0t&^GsFzLH>4FlG!BWt@hC8vc{&g@{2zjZu*P{3JI;Cx7kR{g8>f2=biXtgnZ}j&4$C91Wa#Q#mE_dw#;qnzZ5jr$nawNAF^PX3I}Nmy zKOevMms!*48sxTwLe$>A1*MPIEn~eV0W@$Wjf6`ONI#^gULi8JpSs5qfnJ9|%b?-O zv-<3s(r6WRcKEk=GJowEHa4oB(tvxDKdeYFZhNn4!M|gNm}h5y1(6tm5rb!Y>!GEJ z-x%`{hV!PiJ}g2VI9Z7T0nJh8%Dir-HjbNiycf8=z{~4{_&NV<{%MCk1QzwYL7C;p&KVRWD;AP@B{0OT65N_RD zQ#$?{O2CsRU%#uo1b78G!V&@FYr)A~K{v-n8){Wzqj)^|hDQ|02F|lH>bO34SJNiR zQ!>a+zIL92vpLb|FXu5{Ue1?`iSBP3v+O=TX5pzfHWtn*3pwEGp-{&AZ&@6E+)NBi zCqf>+)XH=~W_Vv;0uDS3F)vMRmgL^Tz;?CZk+#}%GSa^~PW1e~3ccKl3d5zS?(YF7L~{+_br-6pTs~uDF;Fc7w;!>6D|V4? 
zeP%%KEJ}V%E$fOyxCeU(f$R*ReFz5Ik&Cji2;)21&6R7Yo_ulopDoDzSEKgw{ib0; z!j#gz%CAmYgm@mKX5D}sxtH|m&wtF`|LsT%9Z2Ns%fc~@TwHbesuw<`y{uN`Dkce5 z{Lrdx@bL87#n<)P0t-#ULma~nMX%}(?&a)v99N?io&3cgu?-+_`Eip0m2I1+w~i$4 z2njiA7k~8CjT;nBoz1ZfQ@4A+G%mPRB?bmk091ju9w<%s*Jj^@qC)(GjmTO%&Zq#B zx1_GA)~ThkRB8uh<-m76J>5QjL6d*8Ks)rw*hKdRw5x7*X?r`Hetj;w#2=~}Zq;~Rlqu9o!c1;-UtEFw74fFwaU#LSkD zv+bm2YZh&hM)<4;K%1=ec%(D|058nS7IWD{6!APfJgD}>mw!I#-F8Jd`!SnL9Kn^u zh4}~LzlFD?)cZ!CWR1I=YZSP1HoprwQ}wA$?7Gf}%FAoL{|yHX^Le%NF06Y3HDZ*w z8A9`Q%IX@+NsUz7E9!-|AmaL*ynfBespS?I*Zg7*QCHNfYpF9RBo!4As&3>uO~NU}*Vh+5ZOPld@I`h~%hJ@xe30jeIdJmO zx5Y4yU~)XtowJ5s2Lda1q(XUFKOw68;D9VeNoE%^P-zfR8*9nL6I|(TM5F< zYloy22YPzYCeWr(8Ro)P`!wK1luNMCCH#vnUTjvqtheMwTYbw_^V5G^x{r!~s>P8I zOM7Eavg;+)odK27Lls&KwAWnAYq6ld6q*OEELrv$e$cY(J~UgDbWR~anx@U;x$N*t z7KJtTr0k8+3kwym^`9nG+<9|dKwP}4;>vfQ-NSQ^y?wqtls?f;1e;(Sr~EiP(g*__ zT$5mM5ZXqJ>OqSJtR9G&Km5@C^Hm_d4{M&dSz0|fY^b_@IfA&zd1Ru`zW>G!nQyF_gWq2Lcb8@Wgr)_PH*oX{x@Qr=d z4&2}22mTOYAOq9(P(V!>>;ZQS`2E{aSw#iik_?Yoj+d1bj5M$dno>e# z-9GxME89^MA>(}NiK7VCA{YOJG@s#HLsk+^--lAP4qoCk&FrQZ{8evLI2&_da=jBTNB5`sB+Xt!%$Slfm$YB`o zMfADA>IsS!7{r9s8>QV7B*wt?2IBsAs_Mn(Lnk@jp{^$|TUy~&%H;OlpL;sAr6-!2 z6iD6}i$qq5-jolh6!2~D))%vBZf0;+HBSp`WK{TYo|Sb^`yu}bH)i28F~YUCop^Mj zEDTE-=sXna((d02R=lUy6kY4V#3Eqq?OSjfzo>}X3XMLh z2S^0@K>LK)7t+@tw8!Ac0(k(JngM(uVzRz}|9(WE)|~yQsjV#!WDvX<>uD7^r2q|C zzrD0gg*t-%{xlRhX=(E(H|M?eg>T)e-EzVKuRbACm2eA!e@sa+n=7x$fLWDqe!l&aqpz8nC{j{F)n~RDMnw(B zCPs26ZcR)iTQnyae(3)o?|<0YOg9#v3(_q{4(%8!gL;2hNXQ)IY@+kJ5AVYHI%X3h zHyF(g);YJJ;8dP#2y&|l3v;l?G0KNev^N?#Cm=!XC3(BQcD*Rp&T*<@`pbAGKCrdHjz`4bNsHV;alip8B`ky<0unkmGpP{6=6z21{_* zTU*#xFhzWH2iT<-fFq)zz%fDCD#o9@^$>+8xwG@% zod=2JG$=UUOvFEUG`KcbN4*xt!8!0sm362y$5Einf@vYj^L~PPwJ* zW9_mF*3R^K_=AS*TJYn^o2!*i+W8X_5?rSXKk8TClf*}z(PY3llDMGIQR4`jhdtGC zgewpxKqynuFlXs)-Z@8@WW$)w1Vk1gpf;5WGC=5<`=H@&gYi@f1J1Er4-g?Ha zWKiIvJpF?8PFWW>dz)~ogfW9E$S<(s+DVvVz@!2m0~=W$e}4NsotiXUliWig?_i?% zh~|A&rM$8xeSu&fW5=5AGcij#d>z7lih!vJa}|t)(YRrZ2umndIO<^td^_qsfqLP; 
z5thyX7V+#5KTSkf?Tso+^!#TDO*V0s5MlzX!bePpBw+bKE}2PpF(@z)G`zYcx9t3+ zuNFRzZfUFA(9z#^peYQ+f%e<8 zqM~+dldn{dhbl{3E>lc`SvaZ7gjq5vX3NS#EIIIVbFAD@&g5&4pWR!l=hrDb78Eub zNBVven}WEs2qn{%E6s5DvFyq_2!ath%s=(4_Cz2XoE8Z86QPBN=#5Soz99`(WlF*f}06P&n_lCiUCCg66Z#}VpoKV25I5l@UdCZti?bJ}(<@Nf z(8Ck4D@jq`6TaZofMwnY;e&$Xfw24mWOwXUWA>f>EWsIMh8hk^yQ`QiEkS0f-6iH# zP169Uk?GieqmIj_wyZheMdKIwv2xK zttCr(p=*FYGXEe~27m1u_SU&o0D4Y|8s$hyFJpexwiG=T#nuu+S;@Tl3Q;-7@2$+&v)%C4GB9X><02NX zGzKNeJA8oRW8~K_m(TviGVVSPJOmm@zoO*R%4sjQXc6Y5^6F|%we;8ek#9)D-7D^u@B1p+az)Cyb2WsWdF@&3 z?n|guID40Md$);4wTT|5+mMb=s7o+-vH<#2MOR2n?+Fz2xYi{ODtpNk8-~Z+E?4E| zJMW@l;y$A_p41dXS!}&aB_@!6xyhC%I{Hp_yYDN6=IC$X5Z=IkZ-wPe3xf&WT)fgKm@{Ep}`4nVBOgz1~G@~gH z-Yb9?L1-ls-Di(g>Ro#3%ijNyaN$Hem{fI}@ByotGOZ z&kqoKPCGkK)ie?CYl)2u1x%^Z;x&Q8v>|V^cHKZyG~BC}{uZRCk~37Nj>1mXT%|SF$(d zxTp|u3UTV7qK*muEl8B)a*Go-5suDC_W!kT)?41VrN=&K*T0`7U8oUtKM5P9l5mxm zKy3&5DNC;3OFW$ei23yL+UjA8&gADj@cX%!JkMP&u5gaX1j?yzM!(Yy4Fk7}o)4v< z9sjVKRE>Wjj+S-j^De<1nH9S0gtjGZ`HTY4_%3>jC+^Y%R;b(nu~ zwBP1CXU79!quJ)Ht*%P)K+YY-O}@ohfAf0t^CX;gvW8V1RN8O}$N)GfIkB%g0mY%3 zfDQyFz+U{bs_C2@i^6Kpx|}JqrDt>&p^nwa%yK`Nzo>?$hoL_%&g1W-&a3m6I5SD% z9HpRXXBqyi_ll|eM6UtEOE$3!-A>1jXujT48?J=)f6_STXQu zdm~Ujg3f-rxBLnA6^o;Vz4-q9a`BQb8a|tt?#ds$b}h+&Y+R4J4Lw`Jvo5>7^EmJJ*+DGP3_nzv5&v)Rd>VCjTLg_YV(Gtznw(o=|zWig~RL zt#@rw97E}T&bz)zWS<@yN?yIq2-vI`xAH7sMi!@2+Gt-Eqx9B#r)zS_tupNcVPb^x z6*IQ(j_VBNN*z|3H=Dkw75lnQ9v+em4y0pR$x6I!+tavsLuup|rehe1JNE3j7cimOYN-6=$KOjx2299bNZt) zr>BkX6I7;@BI|C zF!UAU?cG-=+}~*MMEQ9+WMtS3c&^CPIaPoix%MPwN9;s zpJ#DjrVAKEfdJR))Xi`)Iwg8oQ&B>_+@ocYnMK+k$e1j9O{Y*fU{uW%gU(+|ZiKPw zF7|sEx$Yw3szJz=(9Ssx(%qWs4*61>!?*MFjA_#@4^mAexDESQ1QXZPKf#T8Zwb=yzXZ@rz{jSGjej|%hirJDk#+DZpKYs^sN3=hgP~yyhiJKyi^Ips`)4?95WTJ?DCMca%7scQM zhvmH5+a{T94&hjYR~#5|&Z?|C8)}|GNr#5fu6#iupIMFRwbNRP@@C(4b8`Y&5(WPi z%7T@et+ms#Yty2b1zvN|W@I3LzxLv+oB>V^$L6{aS;|l~zR7m&-KvKKbeS0ZC!gr* z)^f$h>|#CP|8z7fE2>K_n~@?bH`gKKpftan6AIPO2UJw_Un$Bzh>j$`*^{aiBVc5x 
zPj3Hfsi2TnGc^XmcG?VyRnVSE)+6R9u-7D8q=%cLWqcU4-BI)F_e!1T4s*A%7s^oe_cB_rmoqntn>d>FHicO+A}Q z?S-5;sgi$pt*uR;3Ld6yIonX<4b~MpM0AyDcokqNv+S5GDmo5G=#g>$$N!}BJ(Cl? z=!qeZ%kkoI2^i%JOb!FV9WAh16&sFnt4?`%K@y?|=qFG-5@Bo4B6O^{q@?1oF8hDG zXB26PNv*Va2}{~=JBy&7MPK6g}4Et8Nu z^^2Y9#-XYB`T$)puCAJ`1~XRAHy>kU!jF)^k)iOw(T#eNuwg-GMp%$T3_w^46Q*fe zT5|C1!8hG>dcW#YTAI9TpFjHx6udw(^gLer?os$Ow=h!csHxA-BMgHDeHZ0Q7(u_|11;3q9t!?t(eE%Uso8IF2CjhmdZ*&;8du~ zEHQkeJa8kq)#Ad%i7?* zwM$w)ch^Us#Q1Qls3bkTGOPNk=%Jnoqi;x7Evx62Q7W79-fM~}w2AEi1PJdz)fF?Z4V2%~RtLNZzK`?_mqI9(;qk9jb&n7gGyYRW+@X>^uEJrAxBxcdh%(pJLKX1RR?O*e8Zb%W&JFkZj8$KZR5`p&2gR-)&I zE|ZT~7Py!Z)#8n~uOg&V5!p3V?6S>ZCw<1FfX?bVHa!vZO4w?Tm1uk0UL?*&^rhC* z*@l`N2O@xgoLR|NWZ_)O$E|9!;v-aD8faVS z)uqp(ns9J%xC%4_{38wg8#ohTicGZZP$?mpTOG5uZU8Ej>Rg49^L$nJ z!QTaP9Qba_u3gj@xo)o1KuL$7D@Zl)e2BO|Xc~qlCX|q>fVV?{_d;3aOFU}6xkc+J#V10bujR<4e)7fR3>_25cB~@f}wuQuJ+GqmjepApU`B;fX8){Cesc4FN3W=?W|05c`E_fFIrsPPf~v~XHdEh{<8-5xG#_nKv5P>PF*r12 z>SDpf%zQ~pi!cztxrBi!tK$^D4s0+b!8(;jNh-1N9d&P)jlYo|Ca3Sp?~`a=Rm!dQ z^(E(a<_5Y*U@IXp;$>eN8Zgl#(oC@)T8wXOTV7Z=o2&aIG0{_{K82rxAU0zVD!tTk z2`Q`B%*-772Dk!Jc>xNz&Xjaz=@`~r(G3U=nhQ1J90_7`>Mx2M->EP->?<&k!gk^WQGpFCY|? 
z#q3uwM6$kn;LwCv(rUXeHkg*d6EnS_U_Q&%)s^}E{?G$S^CO{}>?hB>GVv@gKKJT| zo?c}{?*c@s1h*6lm{x6`lmT8MECtE%xD~M^7ur-rwjG6e2&e>Dm*V`d2#3RG&rJTZ z!RlueE5Fk2-79!`R@xN^T)q--QLe*C_pP!#;{~5(Tx4`KF)hXb<*SjM&zm~@9FF;i zuxG@e0`q~RxbY!KRz`3Qs;|z#g9eu`r?gt|3k&BhRbrsNlOt6Y#{YyC8?CAz3IlB0 zlc!EK-96#Fn=l+lX;-!+nh+nqSF#JgVI6*e#AtGT{Pfnl{{BP^wcuep#gok+Sc}~R zzp|?d9|>`FR8%MBOP3Dc5Ayti^t9p;J?bcw@C$QZ9q7{_Cj3`#V~}hPFop0U8c%em z8jLFn1h*@91WIlqoD74!<>-*Aj0}Is@{4f85gCP0GN1w<13mW5*ruTn=I^7itipz@ zECLjTUlIhB_hxI?zU$0v&$T_t&cLn3bnaZz1I`6I2`qhx;O9^`^mn-0xgR{KpozGp zn{dA%e62tnK5E&PYIts6KA#J|JszQ>YKxY^{%%97q49wWdnsyjj818{w$|q8YjK)= zovSLFjVj=F-ht83BROLU7B=@3{=kPwT!R$@@wdNz^XAVF>-^5RzqkHJ+t3rY%=5*_ zUdkwzRrBSs_v~K&RBAdC`y$eV@v?yK(FjCMmg?DQGx&nlLsZog%}E`P*U8&Y*ghP+{#5dno6loIA+0$xvk!>7J*uUa;FkIJ<>>M- zf@1LtBt6$xY`1J+3k8<~eF)J&yRQ7*DA;LTdByw3x7Tn(!tmeA`|jb_bPAd{nRg+J zi737h0XL69`+UbGb)#2Q{%U+$@yA7nCSK;dJ0^)eeHQdArHHP2Rq;b3^+5s7z&WLT0zzb7)(T=N*XA7s21VHj-3wO-5*>nZ5YoH z!RCg>Om=3rwyN+Q@@^F9;*^m&h=_VO-?Hb@hbdH<7|6d@G)_c@b4-@$U*6OD(%qSy zipkOuzY3QjT!4w;8e#Foc5nWjRm(5y1_8|UgZq0bUc9(Zd^E^&L(|jEpm*aHo~&|P zst|B4$sqUqr*UeQ6?;Kq5I4ZG+S9)a{`kN zt8;#F5pjV~lqf++VDIwK)D*0DH+kU!5j-6;^LYgYGKddBBtv@jI}Dgu?({;1051eH+Zi=Y z+U_5Qt@$RWNfzD0cj9naZ1!05c{i5eJQ=nc6Lu)DA0AI>30_-9`U}?n2P$k&o_wn! 
zvN2^n-JxQNftMU+`ADuHn1N@=Kh@RMA!KMpV5%N6+N`LUkbnRVs0x8X!|O)Jb^vlC zB9@78=_3A6vp848z0K z-=25r8dT{H9#jPw2?P|c3ZtVm6brU-~dOo zo<(qSLmzMvdUqmM9nKgp4xIS^?4omzq~F#DM_)I8H(VFhkwZN78$+J z#M*;WZ`;y32i$)$CLK9(LT9yT%W*QT;K%px5dGY@(s?*DGgFQvv#HznTwNtm;}Pa- zdbaK>Lw<0gGsOTO=>`(heQ%hd5<^?-@&2Kj8RxW#=VMDzuGfP z83`now7=}KdD75Czq8JB!)Hd3vz$#qL7~{TC;Hbf3kbopZKu%QQ%ld2prc2XPV+yW zlITsf61j_%b?DwFqv-j=JA83)f_n)U_yL_BTuT1%x&ov120B2@LVFHnuNZx|U*P>Z zwFKO&H81P)J8{$ojGpangPUtZ0er&dCu5(Izw_9<$MJP*U`xq6RsMx;HdZGJjm<1w zkdxF9?i_`dB>iG+ExJgO@FEGX&2>dv=DRp_1O%73d8s~h@7qcJh~4wPA3KxT#&fU0 z%9%W)DOw?2ogFiVHC(D^4WjmC#MJ%(mC_g;Z*dP~|!kkV(>H{Qu}kh@or6FU29aDWdON)6(c_DzHzUuHFh*!s16Cr zYHS!B7@G0MSk@vg%pM0%_#@1|tom+Z9phYm5*dkr8+UKNN~pkbG8#|0I?pncXW3WY z!dm+AZjZ?Q;9#)}%EG2_IL8b%R>u8q@ERJzV~DdQMrimE+%yt$IFC-R&y(8z*n6#o zZBO-z+zu5m`Te^$CiqO_%X4}R?=UF%aEV#pJ=|E7v$uL>JIl%Aa*o!oXLKG;Jq!wJ z63_PNWWIcP-=Yumj(-u=&U0zCC+h2uI&y!gSH>y955p1(m+Xf)KD@(9uK5)ljeAPo zG!j)K8pzp(zyotDpVT3|)TH-9!fN4RVIcOG=TP1A(=rwwt+!@8*l8hOa=FS_Iy+8) zk~n(-oB|d!y0&ShwA-eSLS07ZkL2TEUw^Js(n@v=?Cj629?kcto_*j?_hl8N z(Zl{34jP;l{@KGVpJd|y|<+Q`t-rKWBDE>W*V;PyY8R(8#FGX8sY%Qg2o;Dw6oNtkRq?M!m ziKGLGVM$w9d?lNDOJR49ME2Ot$<}j=kL}yX0!g1dF>U3XUt*MiJJp5ZkYL}9HQzeE zllt;k8Y!+j?^x2+2lqg~)Mh`BHA27&o7czxr8PZ3?}Bw385Okvq4PjQeA>Skqs)VS zwcB&Q`2+ZzOhS@KSM(@P#HlbqwRot6k}!of`lDT+e8*bZKy9sW=U)*)??2626i2v2(qT@rk_*dIs1)~^d4+(5 zmDLA+el+Cy)2*@L9FBaKF1>5N#GfQ&=>I3jkSD86EpCU`2G8}{4E|$^US3I8ohP{y zWqf^;e?3b{(a5v4HT`;Mi00Ov+~YYD-;2_U3u{Vy5LEON!w*oyC*ykVNUf|pwMtyR z>lD$OJ34wsH(IVG@f5#+^3}RPJH44}HuqwS40+a{KK1u&dwPtseC;=l_uU!cax-(v zXQu+)ENuXw;{N&r;O+PGt#v=bVxkmNSq~zW92~@1wz07Z`{;kKS9Ih<75qIz zX4{e*2dRfTbr-7+gx16h(F}+ zUAYI=-?jMjm6YVF<0LQfJ2Np2|7mQ@5){&TaL%ek!=k9jr#7Q~_s_eH3>JNcy=$ST zg|&7T8z${LB^6l8cgWalt4gE5)<)j+#Lb&0?_Shqln`fBP&X{T4?O(GgMfffvFaCS zC|sMO5SA)bvCBuZEnyqzwx>wqm6j(^M>SZvM4WZ8QEqd%91*Oc?ehk5Ar`Q+O`RX_6BdMl+<@IU$^OH}$1S)-=AUB_{I6 zpj*A1T#)`UsJB)0rcctxR$8o*zfPZMR;ERh9xu-dEnmv6lXVsqhLnny>}7|dSBRSxpDG3zxFGg(tKNL`le+#KQB+?6BTYtSI*7w(J>~voqCGneIvJD9S+)0rg 
z9IRnT&Hkv%*!#{D(IEf9cnxteAV|SyE&3g}V}v{$pGlY+;t{<92c+L~IUH{Zd!Oh&}`*zY@k4Ap&(@5FX z)S*3EY*kXrZ*b*EVC4mF31tA91OfiYk-blva(V#T0>!4nh@}7CALuSFTC^sUo!(j( z1+Mu1{ri!0Sv~lS;j^Jh@+1Lcr_=IdX#VAGh0PaNhIS39>M+pyUb7QUZpzB)Rpurk zGHvmb#1=^2m30#4V>pQk(`X{=3or*UGX!T9_hqlD$1Ko+!jh6SJf8TI929~Zb6&Mq zIm5-c2|_H`zlSgx-xebwK}OhrqgTcX-0}+wf@{EL<-f^nd`SBO)9Y2#K+~eWza5d6 z;v23hiEb8n;HBvo{K@Mp(L$VhIw!)mx-{pWkK==X10!8PFGS3xQRD|>W`WfN=UPm) zzF z`rmi!uPZ6&w9(*$jsuq`ej141-npqIH}u1Z69^nyWj}DB8PF&4u7*_wbFiVHnV_Sg zIb)Uen!41MUNP2sVmIn8NwTDzJYylZd;eAgV zpz%LS(M=4_&W0c!5`q!HLv+AXfV$c#o9Zz5!jqul?u@c#IPK{T?kJsxT%a9nM2`+A z^h${kV&QR7fn@BDuLfoX@MkB;CU6b0o!a^AL_%HrAJP+c(+>=Di7Zb1zd;9wM}~gJ zD7cea8m#MS*$aJv#R(v;B7jM70j`vfoU($cd|3Ea** zfrVm@0*DYmK-tnOTN@h$A*JIr>bc4wA2h3EgplFkoWl`i*kWL4sEouhs}dF(vfn4{ zcBrdULK265c;K6)G7CGd5K!HLO7jOgUOwG}pMt*ND~>X&x@iUl*eh1Ynf(B>1YCJY zyh{>Afj@P0bQsQcVLA_arDo<4gVKw|?yLW-P48{hY|)d2Wocv&rG)xK{F^B&aO8oz zf}5V<RY2AbK764p0+@R{X!sa+<^RAzC<%@7USfo?AJ=bHUJm#e zm#mL9W;qP9$vi|Swtc(&$%@h0G{|?rt$1(0HuC}2hG3qWTrv&~3VQ$f^XEg^V)p$9 z(BDITWi9<2j8bv&lK#KecTdR6TP!TBK2mzJb2|+i+ZWWXB9hO9uiR%g@j1^PwXrgL zlw!{w@Mg;KIv+K|hM0mN&Mr-zukqYqvC-0z)6)?@8l~)uF@k{esIq{dppu#zu@;8M zib{BjK1CnX>Hb>zaekUf(%_1O%%_@gjmJ$j5*!SwOt--iVS zQy}#XmJx5Tg`!71uGURPIRBRgoK)3bzPYZE^8hy>=D_gm&A#;%{5sTYi0UTa)@?q%F4;e0$et>9aIu=Gbp1;UIKPQLNhU*(#bnn3ia z>weASwMCgs-4Ff&QT+ZgNLOu1*LijC zzyW!2GxhzuR`cIuC=S30?5tV;yNX?xM=%BluVUcq*r8LbrFOm#AIf2veYbCf$C({t zJ6MOY0+Qf;{u|U2n}UumA8coWgS6yHSU)~^@a~97UhT*6h@*4A$kk=w8Z4(w0i{`} z)pJ=bgR1s&)Kk{tW@0>6?o`>@0tgN`1Nhv!_a+Jm7q@?}%zQ~V{slZViFm_$`SNAx zD*B}aU4m6@t$VS3aiD%%^#p$MtYGYO=gCv{*PEa2%KFRprP-%!AdgLot9%5gd@5km z&Pk(Qa#iI<9L|`0T+-Fu=^s{xP(e4r_=Orp(`*rWO);Fp9s3fGg!kIvG(C%F24Nsg z5*>=jrRA`(@gUFdzP=MUU$v*akA#acVa4g`by;<4wZjhU*s=17yW|ZG4H!>GB=&E+ zd9!tS&@n###ZC2CxL=P@M3}@r1y8VCY_QIru`hyoB^UH0ilg3nUUeJ zB&Dtt@TumM(-;G>gDsn)pP9Mc{z5(JhZej!(9kD|7iP7?AgP>*uyN)0R zsSs|+Zmm1vx>ZH$@Z_Du5Ar}zhlaX;2mKcb_ z1WWhelYJIqWfxJC6{0I;Dzk^hhjo5cJ^MN62Vuo0_s``A&$}iM^ngSI{a-b1azT^0;+ 
zv60TMBq&V?0t``_4u9AULX+pYqQm?z(MA9clX z%yg~h&{rU%?F=q$fN!Xwjm!3Zj*5)bHW<8yA~1?$E<*8rQ`1z|%^Yw7@Qu?EQ-fXt zGC{kFd4r1=IWSu4b&TSNrd zW1=c?MPBiVp8PJ`?FwXWFuj@crzpfI6cY`-C5v1^B_$e0GGpUg0EJGJJFd_v=}f(P zH2m_p);*AW}a3?Bh4)D#e*};W{^RKr&_fs7ycUMyP%ddX;CE;vZ8!;DTV>z~c z>bd8`0WW6NlC(eynD^ zb0D8BBj=;*+*LWbL#MaMq0r3JHNVVR{_LyOP`i4ki`y6ketQ|8^d=mTOa|Yvp3PU@ z(OQGyRqZ!rls)ox`=K{fT~k!)$=ge9TAc>p zY63e*%vNqy;dVwn2G?W7wdI?Z-`lDR{kX!#PO?xQkgW{WOvzz!=cj8BN~*jfzb^Y{ zFJemP+q+EujP;hEygSJqccbUK_%aQTGr6kT4^;oSr;3V-_>UZs&(F_3rQt9+$6f5Z z|Mr{chU1J+gEiu;&h42djoxO^^}US5g=*KM;MZX{A>{ zPZK!v{H>#v?!KkB)Nl39?3;HN8b3m+OLE${l4{MWTis8i!(?ZmaN?VE>&x5ZIT+V~OLqu^&F%4!? zHt6gQ2@Es?mFGiU-4MVSSU6$slIgVd@_nI{)Lm<}SAwimF`SVFcI3VbeLr??ySMB1 z85gcCz4x9xZ+Qo6-k!aBu$Ie3D16t#h2R;F`VWbd=}*8wT&yk(Eh+Y z**y2xok@?`x945l^=Z4ANQf!N)ew#CEu^wBGcT{K=Xjr?zDiN+{$|@R>MO=?kM8d# z@7_jD>P?$CQpG%w?dY$4h@pRgR{786Goo)vS4i}?>vJ-6W(0NJpg%f4ymb8hJ{C_h zHnNMfiVqLo7b;*&>Y%DPxyki*Yn#)D5;5Tu^`9yxy3N6J1~HR}3Le4ugyJ zi4pRSjIakX0}%_1i+1E?0TfV1W1Vn$;^)tIA~o`ZQ|^(=Gt6Skx49#%U~-L7c?)XZ zPfblJ;LH*eS?DS|Ru5Q}T+mFAHF`hB_e?NojZ z7zw*!E0uzBi=dGaU7^}XW?m+Z7eZ;*T68EMezR`hmU+Rvrla?N0sPQIOFOc|3H&5Dso4M$!6HZRh@ zB=2bhZ!-JHRV)s>NRYgu2`v0rXM2t|J@kIi)+Rem$WfYmjrW5K4NwbRZEEJCD-BqQ3_9^X-Y z$!rfgCFt`6kqHO_`+bRi7jK|tQU~qr^V4s(c=x*7*VHEl;>BPbgnvPpH4sP*3YLq& zOCckWoRl)`gm2v#tFMob$;zD9c@x`vkr!BEBO>USgs=F+s*8BYsBQ?+EJ6_S)IMf+ z&W}Ay^LexhJ+MUe*|jUEzX_1nzG{gH7|{p-tn=_@7<*=~rFpCBz_GLDJe6Crd-uH< z`4BL0p{ar6`i7hKwU2TKgQe&ieu^EYo`T*LzRN@wJIIa5&VC0b0O3Obd#;+U2K$o=7Xf{=#{PlCHc{%>aLU>ninDe8VlGf9>BHOn( zq(hm=Lr2+sw4UCd-!5gc?z~M-b4OCKj&J^Q)r#xDA?eW2ZI{ZgD?6O$Hw$`jm-+8! 
z5=E1BX}_lWIp2AWysDGc$6ru$USDqfA-#Ejs;;F(f&aPs_T0Sg7V@Ot&4(KI@m6nq ztsBX^euOh#^1x8qothO%y*S0k57fv-@&)H7>sF2Si27`w+3SM+Stvd|MN)Zd;grPA zJE<(emPQfL_SC4Epm-!=(+G1>;D29G^b#RR7!X&_E=U>SEZYw+N3msXD5-#X6NoTU z&_&HYkdq+$q8XIVi;xYVw0<)y>F9X1J>AjmukOB+4ri{ZD{+0{k4SBV zArUUcBGg|}xeI-7hW-wO{uzB@cl(!v$9Ir5`ha)S!z&7=vmgLas!K2I070A+)jvYr zrr5mxiN%MuAMdau9a!HW&G(w$mVPH}geM~I|7d&jupa-l{WtSegpeW%Ng0yLn4w7` zlq5-mM9L89OA@})sLY8FrKFNFBtvB=A#*ekGDPNCneEqQt!F>KWB;-DUwa+TaXf30 zKA-!(-}g0~=Xsr%mtFH$z?8IrXK&qVtH72Dm|I#Ab7IMq4o_aq2>qkGGNIakRQ2%r zW;zk^9hP)j(=;#IR3~#*xbpN1Yu{Zr)k*J~8y$0|CVchq_@=EM_&yw!7Ix{Fna-*^ z9jb3ivX39%#(fxius%~o%@DB8 z2)dV`(!PCSsrSN7?Ua1XvG8fv?z~t5yz^(LD4#P#@4&{57CVR5we=|zjf`Pp>O_h(JsR&lSU%jAu1m*;I#1SEG~sYPxJ2{wsP&%21<5 z&6_u$sM3fPIy**8?Y0rq3tU+F;ejg;pOP&wsPz}T|D7}UP8;@nq2hg+l=apnFuuuz z+_1@ar@90_i5_C|LtWj%mZbw?F|C(AbU>za$}KQ3Sol|+zsyZ-VdE`s@HnQb%~xh7 z8*ZV3WzEqSbRtcWInpA_Hqn%Cl zxAb~#sG`_wX-3-w(9ndqxwFd_A;T{#F)sK#nJU_OqYJM?5c9Px%a`Y zuNB6%x4!sqa1&Lx;d`Dv*7~=@ADhf4=X0-ygn|2vv9?aT75w_2iP?QYOkXfaNg|u9=KiO8EgxLJtv1k zuat+ED6q@ScAz#`{do^#PoOWWWEc+rp!0Ker11-t5kou7?hrd9rPfG0GBFE?pY=Yz zOHL2ayEO&b9XkaZE2`d@x#caBUYwKp-{XvXg=A|lJ{cF7OxOPC@nevL`noaJjw(C< z{c*73(9W0b=9>I_*(adFZ(gILFLl4{(?n@Zap&dwD+=FP{7Qk#p>SWWL-AxJ}Bd(A3m99g=icv zN@qS}RvJL&LRs0dOEqVDMcVnan+yVpFOXtul&^0M}htu z@Xo()ZbilIV?`*R5T$8qYd7ApFwxE{fBfj*osYUWJ3H$@5g$Dok+WleJquf&ayx#^ z#&Wm(F@kb%ksU)OY_@a6a4s@&k51&HKg;s+%M#TYzlc(sT<&Nak+cvsaN4mVgvXJI zZd1ngGvLvoJ1eO$i?w=bUV7wI^IhY^xEC2^E^t-=EGLRIPMX{GoNw-czp5vKnP`*6 z5lQ3kg&!z6HgDRj>+$#br}vT#Q+3Z0`S&Iz6)J1(bx21gaQ4E58tv<#Xi5T;IyuE9 z%<$%&phaI4J@##Lyb(yLHRnRce(L~i0xW*}R7SLUiT#I9pAbD)JWNhMaG<-fvBU5S zJ(ehmiIb60yJ$<549+p;IxP@W7HPy)yIOO>r~F@uPdDL%9-N*+$T={_1s<(zwB{%a z7=^1H9}L4n3v4c^Dw5Vr1e(d;3COZb0?~8B>8?xn%FR*=l}! 
z3tLY9Y(`1BD>smL-#s=jaCG|?o2|SP4;+X>6ikp1_Y^cGn+``-9~@)o4YZ=_uAoy|A`0gUrpT1$Wmec&L%^hiVg*m5slfT0ci;0_-9_0a>GQQyG8 z`@L2xya9HNpYq0pP*~uoVlgVcH1XC8{>~ukiId;)GS&2Ftg|~f#G&Lf@q^RvkZ!?z zkhk};(j}Z1sBg?i7z47aFv3$oqD9))?Pb(fs*Tl_L~iZ z(($i{Yz z&eXnlW+~l?v+2j?v6hhYG&yhqyRl@AkGx7|W=y5(ft17e?8NhN5mr|o?(YAxln~-V zR1ySwaKLNwo9`U%(N$M(K7%q$J*iWw!G&MvFWCFy^*VFB-Dp1P68=`>V3X0B|98Q& zyN}P>L>*=X-&5QB0V|?vSH+<%;rd))SaF#XW?6wH+{4&!Y=}+2^1PKozQY3|0rI%L z-xkfEB2xdxy9%&ly2z@qw7h{q-xtHFnDHwhh220>llJW^IXHc&#?puN%+{Z0B51{r zHv`2x4djrLc-F`2O8|`ZA8WW`+KZfM(4ay3Oa+P^3V z3`{X>MpRMz_P#8MXVm@k>pPoAYXlVft~6AErh39)jag>7`U5({CX5mfqR!1Oo`I$T zrzRPeMK#CXr~dcCe1oPQ=rto3af3y)N%*Od$B$=Vk6}W_xeAzpLA4w0E7-tRat9)a zudsT98B`DiLtKJ{V<<0=GJ!FCa=^>f6os!$y0Q9gU(k9W@$4xGTK%}9KNW^ zW$_1a899VyiPss;VK!^rc7qTqPO$Uyyrfb9iKW|->%*V0Z$-vgg?g9Tmr;Bki}e1? zLCxfhQZ65!e}4$?G#rS3@WJ(rTs?gFa8+1Xn0xe(A3vUegz-_b+nm7)cg&%I$38Gh zAh17T8wo!?We40DeG%O|`!njrSX*!hSue{_QIr2D*$lzy9g!h%kLYMG8m(xeOuLjD z@acI;a*JF0Z1ys5B>N0sy;>U^+X3CQteueAEzobWKC=K_7<`ibDKH!#cnxH}dq!2=CU)kL1a z9w6KbjfaIbQBsp-3sRa0@lsqId42egDBj!w{x9uczj-687bzmKXlRawfZuf!%C=t8 zCzF1Ee!evUE3ytG`l8?8yZb*ZcU}3E!TK587O{J#MLWa>cUoiL#f!T$!OSJ>#92V! 
z_u)uRkR=5YpE)|S{0KA;nQ|NEc$DvC6YK}+HZ)0i$eM3Z0n(tK0W!J6kR0Fbba8?I z0P*YtpbLRvE*NkR`4Io*wzq%AVbwD@9pHQRBsC>Ikz35J3Dvg+cV`|7HS9fRz7n&~ z%h^C6W<0!qYmhuh!vKNuob`~fBH}^8!3;*~hp$CX*{w&BBt!Q!ig@F-pxM+2`fLl& zSCJnyR0eNwQAOOHg%3PCkH(`Ks5tD|fa#+d+*}txD9bj67IKp zl>5c_^~LZC=|v{$s6|_fP$y=>@Nbw-TFAc9EyQp=6uxpArs2|(fZ99_Muv{r|^TT#vipI*H3!kE+&VN?R`P9X&!JfXcorSGRwo?X&6cvBK z;Y6KCq31l=Qp*70Bozx=c-}pddx`-QwqPskd2UU!V&?pO23295AM-cb4Igvs8L-?G zjb+a@duVD>O<7Q46%VXhz87-hSoBu11KQ5T<>0Vy z)tie607^BLxdBH$VxirOowyTObn^FKk(y)Of9PfACSq9h$(pJ<_-X||81K#)LT^RZA42pNHdOcGkpI1y*4-?1h$lvKcMta4 zXMkGHBE8hOw!CL*1MC<7{m7tZFMdH}V)*RGyxvab5&288}%sC)}< z&w8XLYg5zJ2Hv-IJjwo{Z&)g08^7#jujKvvt$5*gW3%Sl!@!`#_btV=E*pHO248zb zwNf13y4w~GP5Lp`8SAxThq_Gl76fu8t#N7C!ihDb8?t2$aVk-7MCcnlMK*mr*P@>e z`Kg#kg62QE9DvRKc!?u^k65N@}C7Ug_q?K%zY(^ON2T&)xaz(hTB8vH@^ikAD3oaO4;} zasVONBdoy5!;X`ZBhqdz(MH@Z4F<+M1%WuL|DE8}q$j7iWs1*Y*|H%BW@UJeO-Tx} z*_l8Gd)ws9<;yMTYKadVyN9eD;eT(xa;4i3HN7FXo z86Er2SflwFGl>f2=FUc8SRi&BHiqBBF!K(tjBl#L}B}VHvf*nlx%;>+C$v+PXPi zB^b|*jA;W4YFVX%cRQ}1p21hKH0g}0n`SElR}>mgfIqd5znqTEy~L_gCCywEJE8iA zs8|7=T<5i<5|%MdoR9fLvJ$hs_LVDW^t|0oHtky*9Nd7wJ#c9Y8ylPc7)4YYb{V#G z14F)(CQcm5zW{iY+32GSs^UrM!O3}%@@3*p#)3SO3P{*$rt)uJP_6J1DGdq3hZdQT zY@#Nj0n@Le<{w%8&Hr`9P33*99%~#do^s{lMWip`sQVmt$gW=67nGKOn-Ql@oqANk zS2XE^aav#|3S?!Udqc=QA)9(wfZ)eO%Wx~FW_0B$CS2Gx;Q)?lV-q+EmX!u3IW;J^-` zjl70th~HT^n|^F1lnm4LVo5OI%2_l|d$LX*KkmjF4{8GeV{k&d+ll=K7cX5(KvZ?; z^zbpARq-R{xEtqQV?!{-*gdL}RTo=u-H%Bf&p4{M5QzKnr;FXzPjqy&LK2$&t_m&O zdpF+WeMgU~FiBhR4^Fn|mlRi9nZYz$qoNioT}H~8m&O?IB7vMcHJj$9w6+Sof#^D{ zw!(ujKxdnIvQ>-b%^{VU7;Tm9L>pn2J(zCMd9uRj62kxjSbE-Y;2}CT;#&jd!TU+< z9+Y8R#KWgg&kOZfnImWrua(P}sIgeap)uldW2moWx`yr#NeVT}kx?~by@gIwq&Ji& zG>nqQ5lXL;S(2($x(c*evU3eptV}EuWxO?uSt3P^02DdK#^yXw>cpW^rl8X|mLtFJ z@_@7A`^2ViU=}JTEShBDGTy7Nt4{AGd2_A{G5ZhZ_mYVB+oyRys^Uf9aI`+N=b=N# zpFGr1wp0f<|Aaf(rcE=N-iK_bBaC1EaSYwtqgTZ7}l+-YgmX=>^n3JnShQfl0SHwt`G(Pt3S z&btF1+`r!$1e{|>ia&Z(3`3~eBOFicF?}h z!|zWe-U=p7W4Q`1x9h_`MY&hfb}fCjeYWgslO{&=RcOLWhZ)E6NpuxP&l;B=`)7W` 
zUE?p9)ZHuKrm*@JfO4(a{)ncYQmkMFxc_W=Fv*_+xQI_M%0#v6Q!CZxKOby0--HY+ z!-7I?6WndW$~rcKP)f*@mqaS5HK5uj-u00^(H0U1o}Jd(=*Q=aoExO zpKcyou74Q1mP3!5wk&9;%YVZ44^&Y=1Kb0LAX+)$%cm{KvjD7RB4B=WP5Qhb`%C?m zLR>OaP>HesX(!n+!6E${h2>PHG^I5}_^#j`dtNQDrBbxLqxv>JLd4uM00O8KKv%~d zH=Mfww*=JYg=}}=#InxrGLPDJ@3}O)RPT)Xb7##e+tbc|C@&AYv6M33ynekD6J*(Z zMi~y0v*_8Cj$*q-v=YDU28|WnsVsh*2*JHLxJR z_YX(?e)s2hDAk`o^C^RL}%*ioi=8$8*9! zld??L2|zP)n?d3;{iDiHwNhW!a?`Z1@hKieMKOsM2!X~S78G&v0JUovkV{wh2bgdy z0Xe!R_d%WO)2nlvf@dtHI8DzR$ewlSg3)?QMc|!%(qZa>e81!Iy|w2!fzIcDd!}x@ z;K3CZ1AdchA5= z;dJ?h`!}7B)3qDxHIjg`v}V^52~WTP=Pz7HiBn*B$&xY*8~jxht3~LGHxu!MZNS1+ zI;5Xxe-&zd9`$i(0@?XV+GR;Z$3>%wbc!(rYW3kJsZnTBk%6c_$T@bb=-fH}5K-+6 zz(koE{)1bP+s*(?qS~>vm5tmY4~~hEC4A`c1>&SG6#4rq%)3Db=3iWbw+)qp zyphz82VC;h=+eD?auJ?0nV}Y?Gx^bKG8&8^*_Dse($`m3&S*vrs^nhH6_KY@KNT#&iM(DCKNX=1nh5iBbrMp*05(%B)+r z4l_i`IeEv0FQAIi#&o0fZ9j_h|1hXVOS{PxGV;WeqM%xc9GDB4gP%A}6BydDQ8isp zWXqvoN$7jLrqDRrU41@}r@ViUO9(|RZwMLuG2@;tnN$@Ql*r5-vRL=> zrI98RraX9om5hI*F(q_)RUXS2r7&>70M=z-Dy*ZcJA)p~sCWi>yRfJT#%?{oojMxE z%G+=M7)hGNmoNXdQKV@gUY;seg$Z1fmzVeGTT|2FQ&d6_uoS5E)zd2g#UT;X&%dv$ zYqDqhseek(6BG-di*c+e)Y@-Jff0&{NhEx^mO+i`C(>CyAB;%d1J#@ zf-ZF&K1Js#T73Qdo*cTMArl3}Wu$P55fAuPvaTSsx|Vo6(bm>o$D|5$8+Jg+Pv|Q$ zkik&}yE+_L-sR*6~_~wDHkei!Zwo#|{ z?L#r);CXO4f-$e_GejDzosaUx48wFyJMJ0JI3%>qlC3wr0 zE!nDah*{Z!0dLXn3fcbhr8S|0)7E{+kdt=@9E37jwV|F6rU)+ht@L$VzS`m-uvV6jg9&|FXXj0(g%ohQJqu=`d8Yq4KY+ zRLA9x0~mDbrnRG}%7#K*JI@0TbDF;ObWi z0uv0%+oD5X;$0Ms_l5U+PcQv21E`b2`*Cut%Q@aY?&k+0Y6%(?mt`+{jMUkTKwqnC zR|T{w9Bj7x)B|apn>CmzqmS2yzT2>6i#fi(7b$&c_`sbo1yP3AxC&EpaL7S;4Z*-^ z1U?bGZgPkjUAQW~oKn)%-kDu*W7ToK1gim9^(FpIK)CmBsN0k*<^feh^HitoUUAZU z2eF8(u03J|0N67I4A-&Ehc;XWVeudZI>+Bgg=;g^!42>2Q=yY#z!Zee1CCrnpD)M& zvuF!)KQx9Y^3DoCF*$&;lPvLft<$w#-54)h@*toI(^Ax-ORg4IN~;N>ttMuP|Jfw# zK67Rvk8tVJm??^7brg5&zPu_QKtDvVLntD|)PeTvy3dKD#@46aB)n(dH3mF&IN&%6 zJ5V-6>(Z&CJ6c$Mu{D<+fJ-dMyO?WB11tumL{w&u+?N=9tu6nB`s|tIYAo 
z%6YI1bSJj)4i529dYERsW5`z&JTm+B29Ukj!wlYz$cNU>J7ER#;9ztT4piEted8W&9IY`YYq0c4%fBrW8yE&0Y3?X@vF|+#3@UY)cg%Uz2-`-i z98O_EERb1hCUTbI`&93s-dZCIsOeR-%A0oXTz+*PX`eNA6DLj-4gp;9JgZ+ZTzo>- zrqQN-{m%qDN`w^G(<@-+ZYghe@w`L>xKiOc&c|dLulGN%Qy8M(UC?x}hM|`-h>3!* zY<$D?b(|pCK7weBhbvoVVL+L5+s#y&+9EUZ3qms4YeH}+Vd@d!M>lW65(U0H?+lA} z{7Q|%+N3#Zs#gUS1F{bh_AfpR<;ohwujYD#s0xKt*WwHo(LNizwiHleoX#GyeD+yR2f@W3M~5rSy}_hb$s zk=%8@vhvgm<0o5HIW{D8J13`QrN&fpRGx|&u?Q(Q#l%<;Zn!7A6TH)?-wBX_rcJ5Z zz)wd)94l6>#7YaniHIn{CG=-rRHih(xkvK}#SHP5GRp$21udJ*`+)~>Qi&;7jMjfb zuSr)0&4k{QZI?XwmhwLGFFHQk!2VrcxbVoxM7}8avNBa%$J!N5#KV_O*hXs$1C}i- zp&$+(h*>T~g$>{B#m&88nx`|8lGacJ$#4*i-*KV_zz*+nQ>yPPSA=wCvaAGrmN`a2 za^M5O(L@%&7omg|6c~jKfY__vYmpzVWhRrkW&y*9B|sxU9h>&qa8)Fbyl2M<7~;wd z=psG^sPlkt2Y~DIZ-k(>RWWnF$3YbGlPMrbcDEp78%H;6kZ?3K zG_+0Y*8d_%h6e0Of$rh)WXRJz}^T%;a_$!(DtCktF*V>Pn}KD7_D{=|F`Idf%h^0 zA2OT&e?L5`16}<7Lrp#`kZMenzp%W0osW-%YeDZZGP9h??p}mcGcq^Nj{aIhPfw2q z1f{WbC1exXOe9SeuLLa%<&z-qbaq0A)5l6piZePKmiXq@dymH+MJWfH4pDJ;?50if z_5recB2EDGC}DmY&=tSY=%7p8m(&CL7b9$b3p;ZvDhikfQYVp$H7wY-26~Yl!>gzj z3YcD(;RKHfHXr-Z^HIIl(>K`kTlvy`bSZSW^ga~m(Iz`rHqwPjDbgrBy`7>TUp7yo z>{t%}V6>b>7_l7Z>k->6=;$f1V8J%=HbKmdC63&|vWp;;L?)n)p~eW{>j{8)2jLD} zaBl*gwCDT^fc^;_zJ=h&yixFRVL%@nL9RYx?hel~Rz#uPHhxB&mkGI;Dvv_j4Wung zYe|!awVs>rHKMr`=xA%->BXz+HBbv2)aKH>CnzgsvLqhZsui8=&wbZ(Fv1|Mi zzdXnRux_MTakcMgvdmDj5cda|?E~_G1}L4+;8rfBg=jVuL~#KB))CsG$(oQ)lsPjg z8R0mdpw>qaaSec&IgTgft4A=0p>+{*cPpfESA!P9NRC*$c@1&%LlP2Br-rZ;8JvKK;z0NT@1jtZUR-5() z_6qSTQ4NDJL-Q=2{>M+BC<80*-jzjcm?5A%v#?cs_1md=CKj=5S*4df5#J*R6mm3< z#7N!{BlGg;^zL0vCnh#UH@zSqEO3OiOXZbPY1a-X)hHH?^34(xwNM%(`zoN3;Hc8Y z!3r3g`%TWMOAEYo`?dd!ze? z=04q!=?z-`RbSYK?%f-%;-&udG}e{eC(Q5CUth|(JqXGysJZq98q6`?-pLJ2+Oib>e^}*Uis5d}a zF|nZ2HHBCS;8jr7cUz_MvB$t@tlH!ah0B#!;N83HxTFf$*LBSvQKvAT#FMc6^ZS?B zm(#BAWrj-R8C2YnfMB#a0r-u|E=fF#q?e*V1C--PqADKr^X};1Y^sqPgMeOl9ty+S=lY? 
zvGZ>QW9o^fC18KNyaHBBM8yX9+spJ!4s5V{7heBmQ#&WSbvx1Sb!E>sf7MOWfi=qXwK-} zzCks7sH+nJF`J8`*AM?5j+s8~Qe!U2n&MyP;JHdzuwdE!%64DbW{C*0fJb;JC#Qz{ zlSF1X)NgxrUpE*sp<3ymwu&N*H%atgl!*1hqoUY%u#PGGk7B)qm98iyN2)G;u|cR~ z>SrRkOw z&6I>PtRc%5St+ueE`67DjX(uqXwhzo3Ti^UP`%V1HOyPf;?iLsFLsmNG=a5qfqkz@ zRYK7)HQT!#?RsN3>J=GcLs7Jv$_8M6+h9L!;0y7`RaDhq1$gBLyit^|VpIQx`#f`y zWpXu*!<4{3Sb#xEwt?Jq$}rA#rRN`?(qPSnTf8Tnk&X?w5LG^BKg z6=W8x>mLk|h?~`TEQkw!sMRXE40gGxDrf{$x0YIjqXF{^CmhS^YKf+T!EMmv$Dr#^ zWqGp%LFOad(IFJA?{b}1>>4Uo!D=DDqtyr6O`Pbadj0sqCyJWy0Th^65kZG&DA<)u z;OW6U_Mv_|s#^-L3h4ufq9eK{U>%}dO8~#nFVA1TEG9X#MR5d=Nmh1XY@;BXQN%#j zW4%czNl?lJN^)cK3ikaRTKeI!chPuQIEn~CmJuUOP;4H|nmh1#sUtq^#(NU^68Y*1 zDx`Bl1102hC&uP1nPkUrNT@mS{C0|GD7mN6` z-~;f2(9V2zf9#a#s0ulRY_dLwa+k1mN7=O3Mne@8Q1Y+^ZvmpKw*lb-)H>OxqOuZk zU=+v`LmUdYQLL9`GKi=tS3xwyo0O#yja*$_LDnG>k{=Ba(v|zFAUr$&XxNX}NBHyv zSRsNu@~{Q3$KL##Ipq?upZ>~K-$4n7lCm2w5D-onFHvheo~X!zGc)^S?LuMW%(5K(7O7tO+FNiX0 zOVg)#u#$tEiKg-RsZ$NkEuEU}fbNu;tH)5BqAd&cL#|Gv8v@wE%`g3UrrvK@NSG$d zv8^%pdgNf9rAQ?s<5>FVSLEVp;Ag3~Q z|9<_spFV%i)pa`Xaxg|5eG7W@$XP?>ExuK;v9XWxxOVh8KF)pwo&Ix=kR6cLNx=(y z6DQ70r`fz!Wi8SQ)kUeVcKGP}P3n;ooc1N)+g)RZer7qRwz-xGdF(!8#%kUx(Jbdy z;n6Pm6CpY>Ir-^ipF6j1ZANLgO(H1KTzu8HKhvD!%iKh=-Q;O2DT+c9-O3+3tzHt; zaAB{bW7QhSmXYOlF=;6`y$%Lkd3xO$^h=^H(H7L(lTJpC6u;1b=soh_o9|tdb1aMQ z-i?ELTMzO0i6DFgn%eAaeb5Tw23z#YJ%cp!DuNL-ER2eFW_APp+$Df=;o&77bJJ)w zRCr&gD~f<~6cd_oJdh@s{ZQthkxee}J}Nk(&Ii;Bb$6!LAI6pqG_?_rW%YTDFw2ajq^f3-BVyVc9Jl*{5Eh-6H7MUI)Q_Y+|h zx>#?Mf0+MR4q|;}$H4VmFpKc6N_gersRuf>8Iu9eNG@Ej&*my>+81RR>h%1wi=L3_ zoD*I|ixxmkp=eA13z9BVKul`p#Ih%(J<5L(hA*djU;^Z0nm$e*vrs$87PbSj@XkH{i-u zxnbX8VIHvEaFu~gmb5{&4W{|nYZ>kRI`#fW)qY@QwjQla7tMnas>lr(Aiye>fC-^o zGPzQTmy6bk>B9H`=F5&cQM@p#CdjAEN%6W0emQZE+25@k@F1<2K9tl3hIC}buVo0! 
zMU5ci00;mVaOX@SMDkpjK3nFl}4_eO@}#sT1IY1PpT>;V^NX0E;Rmlo8_)XIShHxmy~ znDE71Cm(rO6>fbZB--f6;lmZ6E>s)H021#Svu7tV@-giK)pNo76}&^tFLXCBXu||b zVvE4>r)JP!BSYuDbo2KI?li%eAh_UHAkC5|ey%flA@^CGW#iT#1_ z`nrx;g%1+7YvZ;ReooAf7<0Q&)h0??Tg~To;lT(sAH&E#wV};*bS*|Ud{N?l$Fs8T z!t%M@yBg0NRg>Shuji~KHQU{5xBJEWU3-})KorAmk#!aRIIRZSN}GLMVFZXA(2a>B|$&4bityCunZIl$DdSvie;B{mh~hotPP) zy<**s4q(naDrHqwD;~?`H~n_i^mEB|)IQY>J_6R*_t$s-m*ttFPr)H8aQ?HN;Km4h z8P-An_I&QJ?c*%@pH$3`O{SSqqdX(L&t%&H1yM|8rbGH*C~GbPn7Zl64!v5nY^mL? zTLbuXdYdqK@B9fVcQ;;s3RyP;cRvij`c5OKZsfisBxq8#)b%L`Lh3ejsLZ8-cofb( zf1x_HG%L+s7u5SaFaC7*g^O6eA|U{wXYV6HNZRCmNwyPkTKX3nlCnDg5_J;A$#zJ6 zcGNs`cU0eS`R}?Zv9CDyivkS9KMOpKcvH{4o{VWhbSj6Kp2b(|D?G=SAl(U_Bg`Sh z8t@0KoMtUrc+i>GO+J40(j|8;hd6xdO0D#m zB0jR$6|n~VRLsH2<)2pJ%+_oF1N9BOcPWez$zuT1;f$;S-L*r4@W=;_Z~M?C^S6Kh z&RZbcEueO6byUO>3~cYkkHw#5nv->-@J9nP0ooDpd6hfI!b38O^RoN+G_*T59ep6KjY;sFb$M-(k5|lSe{H zHUqzYXgsodGTY?-H8s=HDveC&qeut9yY&5fO3G4GYY` zgG;F-Y?>jlSv;o)@rkZKj9>agBOUq`RcdT{gG?L#t-bE#=u;y!LhKH0i2J;R5*|2I zwh!OWo!Sn7{syCQ^Q-c&0*4j9xz3a)FS!&8d_Zi+3CK%$ZBEWZx~rqoe7eu7$m4-^ zeR&jVrRUlZBRNzCc@946T&t>@+SW0Fw+31#vwd`L+dHYc97wZFm%OaQhqVR`I)3jr zt+iCk!0}W2Ugp%cC&KJ%evl^@t!9*VbaFWo6KbLNK^<0kH%QL7WF2zOyBk_ zpJ}dJ%2HX5lW#quocRjU3}E#M>iHAD5WR}%NKlq5yN2dm$(OZi z-}5gpyx*R(H+z{z?4_B6$@06mr)M~<9jBe1H6>^G#!EBtxL$)^a(L}*`&X_YQrmC` zrmPtY?Ra_c0k!CTNOCb%nq z&ef6wQr;yftp2bR{mnt=6RY#`f?|dqh*|jIu?6s7m-hW!oX77Onc?7p@_*Vkoq1jC zot>9gUDqSc2p~!C0`4;<)x&N&$TS?a!_!vPFHy~?k(HoYKAWhFJl+qo(Hzp5s;TrJ zS1LH*-+y_VacwIsEZlMjSzPfj9VI0tfc=vzlG?Fe@H&a(Lv1Z?N%2O-+m<_mfgTU~ zVN9oXvu>is4zW^ZLLxmqJwGhbgXBHoyn9BJ-L&((iG0`=KgY52VoCyjy3cI~-otDK zqN0HyLZ#QwFE>5tdU~DZ-d-J|({0v-h24VLC#%U9otRz)%#4_-^!xJO^KOcSoj1Ms z&B{*GdNAfk+W06672t%Ir?*W$1N-F#Opx)WC}E-w0d+5KyGGh0eLp?3AxCvI^qTl) zp<^B7HN%hJ09pk(;%2eV7Z;aFk2>tzQmmEIxN+m#nXbDT-hzj+)$6texjpai$l+ zE+pIa0?u)xG2qab6$j>`C5GrI)iL{O3pX}8)&Aw($Io|^1TH6vm#VdfF2<$EMy6E( z4gg#lG__6nQ95Q!_f6I{#%5+_f=+MHG}~aEoc&P)zj|_Vd%%fTfjaALQq>mPL~anH zFzWXyuV||7U82dw(LVl~16gju%yd~(1gLAdUSqn??oY%vuk5={K39S_G=Ka0wX;W5 
z0LOgC4s{NH=|AI6IA3FS4(NFgJq#DOnUhOoqjEf1Yue8C&XJ-kSEA5m@AtHdB3bJ1~Liefk1a6D?P8N2r`&_)Uh))88em4IicgiIl`|Jz2%xfNeO% z3k5(OK5!l1soUmD-u^ZlE`DNuIiTayhb1RX zNQkr!(tB}^sI>x~BzPy{rhduNTq zEJ_evn*nWGr;fQ-d9LxA*O7tq+Hq&$qNalE2sc8dM-If7@xb33sBC6duqe3&P%h$* zSBZX9qq6%z09Yy>8SP;G3yJ)@M*c;=;z|CF4eHn5KP~5N(5oT8rya+aRkjQYO#mBT zgp~rC>iaqVEFd3_bc04Jr^J3rggFDWZ5OA`ApbJhxLFKtzrJ_pwk)Oj_3=NMdGW|v z3LQp}Qw=@uQ*80#J}}XBu$I>>6DF^d9^POPOW=fQ!{1O zI^zKYo{*An-q9t~BkxTwO5PC}t{{ChB*cl5!x>aqTWtp|xV^u%h{m|XPhhiz)rNZu zKNcDOK>x`l^pZ^+-qBx-AkRamJ=egFRl}DW>>Lz3hos#0u2VNt6W2|&`yK9v2*?hvLz27T8!W~F47YA;3?bY=Fb zXQ$ss0KtAlT0$P}@EcRn@mCYpfa>h4%sQmHL;D?JRDtWxe3!7rOnbBJc%r4}amC~t z6P)PB`@=>AwVIX`y6{KsyuPmaR|vtMO|~o=7MamTLt~w3!v6mEwamR+Qb*%ca+5*m z3%{e(PE^mEHOs+6<(|3jv$xYPUTqZ7(AoL7zp9_#qh~gFL3g?wdHf%x`Bja3_I&@j zVMzVC^AZwAJ}pi6ecS2c%ya!;mA2h^`}5+9*Fsk*9ADhba((-CvD=$jTArJr^vU+J z8=Fw~ZW&}@o`wR)?umJtxp|tJpWi2^`%9*nSC8UuTiGO74mqGzx^`USocaHTG`je7 z^`x&=U+QN&`L>y`bJazwvt#lWn)xaQE$?dV;A_`PL*>D%hYJ026|Tnpd+OA~3SE^f z|8(D-N!_vJ?FD_=WZ=h70;e-70E) zH@j`8hi@uFo*bLgEw}F8;0@LpC{t>`ELI`H{?C6pm(|3NjdlCtt$F^~w1I>EQCwCF zrlZrRvxAP?E$#5NZEO8s)UW$z#6+F0+Lqy)@_!zF=s0WEx!=E(Ti4sS&-Cb|zF|iV zRR^|wd-bRK2p8Y*%;~|C(;JR9?$%&gU8u#{hE3{RhWyj}$Ej!~!}-0VGPbqhL~|JGHv(TEXWU-Z;0@t)sz(8-buitDVCO=sq~o>q*C z@*cm^@qg}3>((2?MlW{s>FCtQ%W(BAosCUTr`u=E+p(d|&}B!Jd-c(6v$L%BO^Db2 zLx&z39`n*nx)mBS%0(mhd)mV_TkT#BTBp%x(0UCG)8aP=rnW2J<`8PGH7GtW(5&X# zxp%Gh-??zx{%fkP_nbf8|NDWh&BL-C670?{INj_^T;s;3%@0^M&~rFbv%AHQPMy}2 z4;kFdKcVj9n;g?>L(Ng)Nh>~BoWHQq+P&K9?4A4BExyyd$$#cRS?~wjoU~rn%9+ty{Aer*4RQ^{{KW;+J=So?pnA zHsxzT84_&qHbdluNN@Muh2SKYu3k+BQuI7_`RO7S^p0Y9fe?wD{Y8EQ!W;4nB2em- zObv8oR$X!&r&Mp3y@r^RiWCZjKi_`vVr=$_wiNL=h|$R)EiI#H z1c*{SADvO+osG)14U5Gr_a=}-Xah&*xZ;mTOe43(&zWYCRErkp__8lD#_Z{23ADst z9H$!sc@=HS*s+%le&7Ky3PlgtPeEzvY8x6rEABUv%jaR+0RuEUgti~^nic;&Naski2cF2-|}xA;~q8|yo6-n>Tt zCWSW*Ul)2dIAqJcuS-W=s{80U%KTE9L)Gj(>wWh1AUXp#D=H}oTZJeHRLVM83vB^H ztRook^RWQep~U>ZhC-544iQ<~OogBEIm z2f_gg)WQdlX`?hu;!rk>sfwLiJ%XeHaEx``vi;viXE=0=y$tFMAyv-}E{pE~&MG6- 
zP=g4LDDRk2gKEHthN0Mvlpo5<%4cY##Ul;AL}XO>?SpB_q$Up>27D6BIDs;4bz_jg z4a@eS5WSA5!j#Ivzh$~ZdJj~(V&`IS-*A;(YBgX23vcTInF@9%y%pRI7cW|p3I30) z%&QmD(PPT+ckvdNL**rlMZI=Fs2BNc>n5$#y6_vEI2xU)ijwp9QFGNA?yC z@JJ6Irp}xNC-V|h4&UJ7{OeL!e#jVrSh(?&;YKh%mg3JcS`CZoLP5Fs1a6b5)<%6e zX;j|(-%a|9hqQdfJ18EKXncAd(L$ajaum_`1Ml_a<_qD6;3Es~3hITaNH3s1X5bqk zF@Sn^Cp!a_u{U|=Gr7l_HD|tmd4Jc(v{gx9JHN_f?N+=^o~$?OP1)tKUOgvOcF19{ zPDJ^@0Zd{}zipwtd1G9h)3a*eOFk4-F3IQHoV`mi-ZC@T6pv%*Lh%V|u)B`)&6MaAs$tJHudm z6qV#6%5f2IaoQLEkQ<>k1JK0jJ#d&YtgzSp-FM4;`IwnbP2#QE|yjh9{J0{yFHMQ1gpB849`U zG=egh$V*vXml_eS;QOkqk9br87y#Mxs@<1G<%hucg)pOewE@Kt-3HP+4>T#vZD+ym z5dKHw762UfjBZ9-N5@Kn)453t9XD^0V&Mwf6ZQ;K+WkSK7Bc@$S95U7z8?*}E-1hVHgm*`Dm^n2T z2UTDkphx#PReL;*94- znrkJ1cZtx5T(iZx2b_oSVKZi!qF)mAp#Wb9Y;M=={(J(|fvWWy=qMeZtf`S9XSRii zfszcuK==y@*{fFsUEe~6kni2UFE4@!3VFX70(s$ZfOy{~q9N&sBYrvQJgEXS5=0LX z;jRJRZf>if_-KM9A3t>>0yD+RrEL2-`tG!Zi5s?UyT~Itxc20_jsrqL7Z?<~0^jzc z>huUKm>Rm#drKEXj&(8__aQgj$_P>aXguM-X`aS+Mv=p>6enh+nhLl}&Lk>ce7Xqq zn4MPpR{r_Tv=K~B`0m|ZdQON}>Flskjh!Py#MN{A_7{iUtpUEZ!4hY%^BI6|(&-gJ zgeDOkG;g~0BM6mjXXAJ1GY%v#31cg;61O;=!@`EOYjGL{G>0~{yV>3Hcm3G2$8 z9ZV1uFxaIfqu6MfL{Fz-vg~>Ld*X$TAWN2LQ=dEqjR(5Ax!<;%D3{fYygxf$Ffugk z{b^U~7-`~x))rApz)M_y&AIS+To}Q4^NV9opFOL8Zf9rg%$T!Jt}H*k&_ktpBXnSi z1-f<~UclWl2_<#}Oqh7qo?zY1isuDs@O!36q4o?h5rk>W z?c+k>nVN-zE%Ba!anDcQ%^2HssGzGL9eU#`Z}l9wfCRaENM>~OjdlBuO~gEB)+}wL zAYg!P)YOWJq_E)~<*H$0%1f()(0?+fUyr-SogrTAqc{|AHHh2i-s8t2eVk3+Q%IN-Me)QMqwqTs}!T3J{6RdYyxtTaR^4mgaaAk0dQtex0k^+ zQJ%oXNm7{N63_?}7kugUVPBO%=wrVr zKbSOaLp|Vr(^PxJc@tmMxa2Nv&BF%X)Qc<+e=Y)KSi;g}O*@?^BIq4JvFzkSkq zt?ieskKM9GHaY=0YXoXeslL+Q*@TN!GoE`9 z(c}b#quVJ1JW9JK@7s5H)>SoTjV!5>0?>b7PTqBYtwVao=TAgy#+}Wg{TyPf69#(U z+Gm>KJ#Uf26Sj>~^KIY0owP<#!lJO5$ZyeMOqxAWOPQFP4=c3;p^em!rj8UIR@jk?pv1>6{>p{nY5P{Y}LCL^7%VI=xp7(Zr= zJ5o=XpXP83U@4OI?x`{NpAgA;bzK%aSQLxBKlRD+<=+mwtm^_#OtlcPcCE}Fm-Mk5 z-V%uyYQzchqcSWd<6|x^584KePj2~&n?K0+p#tJK=9OkahkG43ek$VsC>-kFr*dxSiSW_I~Z| zP1EuhK(YEdjYutL5+92TEM&DaD@i)q5hgo#_eX@uKHIrp6{gPv?yLH9xfCCj%+ zoakA@c-<+2ZM(Vot9usa 
z-qOkt!al-6-$GW#OjB@5O3M7x1R>loc4pYs1w8NA^W(TZxSWVU6RC};oT*xSeY$tx zqJs!(&Y!mcE(;i+dfPqE_fdVWK8e@m}g@l+tCzgJQ9&V zh*j6q#0Ahf&iTu?!)mamo?mz~`9tZ-9dq7atokB0)ygh}mdXIC*P`)i}2B!yg7W_Zj^66Jrx4x+!GvvUc z6F>9Q9|LrP&kobS`pSjY?ixq%1veZCQLiKK7zPpZ)ZVW7;w%Xurw@U6jx)Yp-+gs{ z=A;lW!ws7ojNRIF-fU#>RmTR*yK7%F$k6%piq6ogP+Jpc1;XAEni&FGgM`^~O3n?E zG%Ch@ll0rAs*#g^)V)jLSq#PfWby6c;N!rnmpE z_oYbOr+O~RENabJj-8#9Y4<1bHb?Ab!R{HtXZPC3IF^-L6x<&Gr zPKfIU4ITPQOLjZMQV?UGEy`p8druBYF8WcK;)jaYjJ-_P+=+_+d;a*R6H_04^kgIB z*Xp)jx2;B)*x`qdqcYpejPPp_>peWnA&*A)jRi6|&qxFYqq7HFl-L;!+IWEuZHQ-y z7ww$*K5+JExC}Bo4_`hAh(T7q$=p75ZGye-c9?0uZ#m%+5s%>1MrvJoUsM#>-km&m;p@~O0_ys9=L3oY>bf3 zAH+o8(9m2r<|BuFO4_8ZT8aJ8twC4J-~M9&u5GBPn$L?tf=rEBh%)^GCFNk0A)J4) z_JuWn`D3xUw>~pRK;_7r#(R6G9Sj`FV9Iub$h4x8k~Rn^rmJLsUiu#`cuYVBJfc{q z61mxEp2Z-1%>#$lsU20l^qOsCfo*@b;qDqahT%a@Wj9_Z@BV;lBP!=L)k{44042#iYV_O2D5Y3I|Sl>J8zm}mBT&%i05B&NxoCz8Dav8Mz zx3e503KN*K@l^%wQ{C5uglvqBP0?$&tdnkZRvh#)4}*^B91?t5v1h!>(kOvan)t3> z;S8rVt=oBQ-7gleE=^Au)c{OFAm_#zG^WZ+IhVOxvJVd!4|B5q1UhlcE|SX*N?-yq)o#XT7<f7UNtlK zHUzm6U>L2c0orq(nTGP?)!c)8a?@YVJVs|4*XnABrnhhqyU z;=v@XD{6oP=~Pis$|JURSU%XI4=pC?_t^l!OK1G9pjzc2o< z!?K%>QicH0O(vd+vWlKfsFURdXb3-2A#Qzf;ecJLG>-O!7ICpq`&tgiExZarTfcl|OCpFwkDcP>WTyBVBM+5V^ zQ%)I{S{t9|->6gDe_f5uih~qdw|;+f^n~kEOT7%KOWyNxZbD>FfbVf^rE=;AI8{I3 zMR8qYdSaZVW$1iQwTHZ%(ggsGMqEpiM{gA~Dejs&P#76XjI^kjsj;P#=53R<@%!JHwC#yRL7U9Kngv+c;*$q&~fSBZdJIw(VI33}XqRhfn*L0dE{FETFW` zMz(YFocv`s`r8xBzoD|;Nq%dIB334&e0^`!rLGOP2W*>(gPyo>Na6zcqj0|j>AsZt z)csTMVz>L-+#U~PZcqwxOCv&E^8^!ZGx_Y3D9pnzniVcB89@2f0T{`BZT(} z;j8wil;hCLxFlC%clY4r^wJq#URGoWG02{8u%DqtMdl!jdDp?>8#VM8$SfgtC}J{& zZ3jqfNoq)Y6&0NPrUEO%HT8-rx9tj=d!rA!L+Dlo_EU z$x2I#vLaDdSuG+BBO%EyC6#1FAsJE1iiTM-ODGwoq~HBI=X3l0zQ5b;`}ybGKF(3^ z_iH?_>w2s!xcpPx>(77I9i2gT33@Uq`5;D=m{%k5{K#qo9v11-C7|h|5P#+EAF(z&F$5!GrCnkTSHg?aEaPFi<4<_>G5ZSM@aR!n zQehGQ&=w(cQzSbd*;<*88g&<&Vu<%76?&?7O@f$xQe4d=!P07lEK-**r6N*fWspgY`b7K2n=HX|^X z;O)t+T$0lXIQ%z;lFlPm%njS8%kd>B1L{+VG`WC;jdbeNN#cJqGZ8HA=guLtaKutU 
z#9Q^4Zs=1?4tDv=xP>tH0N3LzIT2`C9?)(6CfqR^0t?xRdWw+)*FaECN zag=Oh-Y+?n#Z$|sDlPyI@>U)?ZH8l9Vw;h=xa@8lqsMS0wpa?u%th|N)yQ6D`3D%E z*m1xfTQ!ZjASpXMk&GscGoms&Rc5}vw=9_eSkCIy(Ct%B5jMUerZQc9%{gb6J~Kt=^+o0#)CN97uKrOgl89{y69jI_)<+seD%*U}xM;$dnZn`myh( z59|s+@UF11S&M*HYW5TN4`xEsUh!=4QH9Q(V_wJ_;j_g4aCS;8Frz&<&BVfJi&FHj zQf@fr1{1#yrdJQ}UmQF53?%=FBmEXo9TS(nYOrABn;W+xiz2!BQ~_+t2cAnT)q4P} zwQE@Nm_ZvTJ4d(*vVxI&U5_8RU)!NMfT>Zq=^4^gxi*nyPZZYgj2Uus12C^d`=e)d zeXth}m#*b4)C334R^)|y;~ezBcENZ@&;M7g-tXP2^?Gp4xK-sSWN5odZv=of%zsvHgH-Hb&e^PGa^H!m;mm0_v2rD^^u-GH6$KeVz?eXdbBFOU&5R#caSFjn{vtvmW-&G@$NlaxL9bu zm02Pf*C^A153)%~ZsHKcLLISvj+Uy~91G>K7BLn|x|U2>aA)nCPw-W+f`A0qH6%DN z>I)*5wJ7HtMRh7b~2+YJ2%Fxq2#)9*x1<<{cM*Fdvs;UhgUZ*RK&2vf= zE}4;8^lR`SJZCifo#lRu^PN{N@PU(N`imC_F*$?=-_??rino2i^5y95MX8QZlB36n z!3n-dhLaEzxhr(ayl9_l_Nw?J4MXQHU+D3j;>zad19GOpNn_$)jA;1%^A30a(>pMj zOP9m;5$dO8P;-!@AqCY`1Y+QCu$izw$LRZr3#zKsX<#nc61WH!rycC!#lUh1c(Pp_ z$6wgPG0WFYUi0O^>B&A88c}$Zp|ULE;wuJ_!kY~CROs=&6>REza?K>LQW$QDz>wMl zK6?~PX5fQQy_i2GGKt&?_he7j?7*?pKwuKN%@fD3f2G$|MP(Wbe3*B6DmlPufw@G& zEL0pai@Eh2s8LqXV2-Ab-Z=LZ4zKV00s<}Ui{2HzeLJ2XpV|341}6F)l3v1Ja>aU6t~1mnqm!qSS=E{ns+|DPW+{%E8N?H>|C>^|pBrd!Teg0g@VO znN)cHe*DW@Ga^6ri|=(mGxKq-{mecXlO%OVDR3b{vABRwuYS%bwT;sSlne3p>o;$F z;QQfWaJ1`Ma4jsrB z?eFVA_j%KLGEY%twQF!P(hoBSS!}pRXg0!|ueOU+D?D}|I1q>+zT#oLd30;giwH=w z%fFLkmj{Ro+U9)crR4kY@dOlIumF}JC+Y|Ycj>t_4Ovso?CEL$SlJf=IyA08`!LsM zrz{pMmaaHmv<^(_KlE~2s22+qoPyRHgpF z4$UrE{Neq3(I8L^XFf=Z9VoscX|Z;k4z%cQ615vH5tY%=%!nxt7DIyihnVa&aqz7X zrak++7uGjsqW%vKt2)q~o?N4rpl!>}kx$G1r@W81$^zkJ!t!!iIzVgAWswqw__8|8#l2sJ zBN!fZ&3Ht{M~YW#{QL?VC>a%!qr}j`C!%Y;&lyRE%b9`gsw0_%tPU7d^>CYnwt&qU z<%agVaxC`iDt)-WIsN_!5qEIDVSBu2a(Y|F+g`!OvP%;GsIK3Pqg;B6>Tw$$>4`av zw3HwFoZO*G#mhBf<^0aQrWtzkEQoUnKxAoJA?Q>=<(0m{#wtnsi}5P+T;F!B_;|>y zo4?G;oO&F#PTK>k_dL~rg@B^QY!qMi2QPz`xDmrEF z6~DEw)(jjyZO}9dJpwTvKZ%`mT#;Mv$d4oI+A%64sU62uF?a4bxGQE-h7bH;#;jX& zN>q-qFd;LQE83>deA8~zCh>pZj-KSYbmo}`<|aLdfbo-qw;peLdD-4i*TlL2*6E-k zn^yh9oWnhaUsx9>pjb7$c4Srx7hq%#MmFt$ANKaY4o0>!KBN4mQU1Ff) 
zvraft;gLz9?zPJfjTd;vCjLZHlB8OK?2unT@v3j(7zZn5wcqYk*=mKDXBN47GfM8Q zhni(Dx%Y{%7=E>$197e7~0QR@_nVu8+ZKP}ONpchG&@4~_{4KOFz%{1?ZsE=EK zV?^X06f|yq{UiV+j)I|mHN1J#rLiP-CK+uV1Fo?6)F9a_Q&tp^QxDijwj3fe%5$bJ zbfwE?dGsE!)I94*dMJZD-Ej`!_zz=#5$L%C50MD3V6??D$d4A)2Ll_k{bm6P2!NGc zT7(*s1kf*fMfC-N*x{_?BVYQ07!YC>5!y7~7S=%Y89Z2_?PJ}zTUIeKaLNpv;%5gt zN625#K;t!o!bm=tkOs^fe0DL?7o^mzE<5#U4}<<5Ktc0OGh-~6NTw~`Cf9+rAaep& zkn0N6OGPhsFIG+T2T7p?xkIDowx;Z@*^(`8%(^@+<)jc4Z*K*BBZ4EZzrPm~+j$ql z=J%t1q39l>K&QysZbo}SZIiA*x;kWvNDk&fCvuVs{>~LNV%$SY7QI1#yusX@lj+`d zsMMRbY#9mWI^FG%YT{-pwrJ%gxr1O6G=i<@y`uRin7X*e*_dd;Y5bHebf_vt)8PKJ zo=k)s**xl4ZoNmg|4ZSl!XMAk~XBK!$>fyM3aYHd4K&( zjSwuMkB~cw=+Yz#=eBr1WGdk{-G)&Yg_c=Nd)g}8{K;rlc|i)P8fC$r2U`PLqwbv7SoS?{2)j=Gg^~G}_7>=41&MIIK7r zRSdEjX?g{8qUMQp0e;#K-GfXyOo3fTtk@_-J4P0hvcqN$E`YDm8{(Nmxq$O|-@wfp z@bTy=s+dq>Pub0hbn17wjyL3JrjNlA8+ZC0w=d3MZ6bc50!ilzfGAcROdOE>Cpe%W zhS)VZO%mkr{JG4i@DjcR8$DdWPfrkV^wpCY+b8bqDskHKJOb@P5@D8A;t#_)#8`qF zj4VyE|6ipzu_}lEket&dAVAUlYjKWlKW~FRWy+L$<_GIAoT31f2mC{_neBV9dx@VA z5rX8RRhHS`v1$f)$s0;J>pSg`|D{Wnp3X|X2Kp$o;6-yPkDkheae2>aE2;+h^>o_I zb+QW?=wp{n)lK~pJ&!NXhe|dbrHcnX zjkW4XAOSq&dI6C10CQn_Ab>p0#5sgSLU#Fec5?=p2-?ot#o>o(~tvxyI-GH*Je9LCMV}zHVROnmM$&TPA^9QxlQXD zYZvNauFx2I(?a9MNH;fMv+gxVy|1S=RJo&q;7?L__@Yuh{)4)BcSe89++_R(r!pG7 z@>MVTv}*>+E_OWtQy!>jMU7p?z41j0o$#NPWvJ1zH-c>u{&HsCcod)D5)t;PUTxaX zYrB38p#!qKMC;NkeXq#?uaP+MTPV+;c)`0e;Z zwY0M;>gW2J>8#t8GUv%MrQlDTlEg;c#wf%EW{o%l`{^E`St1m`yBUos4b*ZqkOdb= zwrE0~csYIP*x${XC7g!TQ1Ms6PeDiRCf1!Vh^ZGD=T)otLva=cLxv7)ZDR8Jmvgw6 zTbjS0d!3x{oSVqugfoX2fwZhJZJvNZAdHC(Ti$O{Zhq>CO-GMh1tvgxhx#B7D6gnR zqOPE;C#7X7N60LA28^C>e7>x{16?U;+#G3&Srb%(b1Vi8_f9%BMj<%*F5k|0(xjg1 z>P@NA9FNp}k|cJ4XfY&781DW1t?1y8agO5wB;Rx3+Zr0wY|`uIXcb0Jh$@f}14*DE zvg|dGk&M}_Oh7Hc;T2D|k&%&4(r%MJBVvF5p1ZU~o8w=I4H z1O(R8dGw{heK@B&E+Z9=>l{(|k|_n3_CxQGjjuudgTpq>ZjGpK-Q|9&s+RiP6M7T| zv2!G*En|PYlpC2K&~Qg<`kBoEhr@kC}JJsBet;335tJy z`M`$K4kJd5e|7oH%3kZ-vC5YZiWp~@w07^0iLLm z9XAj4UJ-i2y({M;{y<)B3_m4&QIu~b; zFYMKBy$aJ0C9@v-c?l1w|IB|I5;MI+?rvorHv1dy7l#y!B+dyyj;)Q 
zhDRJs8f>e4z6vIjT*QB8eco`%D(4NlMG_YMOv&@kxeGH#NF4%*n4NFa1ArIR;df4t z1&)sX_;?5>jo@C47%?J8_w6vPHp;_0I5p`zwVhRx%CPe#u&h^qJf3{HFp53QcZ}YJ z$(9p(CX@V&5M=M38DQ^0z>Tx`A(aKf(%0oEQgddWYLy7%R$|W-77_8c64y?~RT?+) zz`Rd&@R`2N=btkd+G`ibKAsnn<$La|d-rVt0SpdpJG-r1U-i%|X70E7j!)Zqcx-W= zbNPB+yuqEL{>Ba`bA3`Dhu^9_-<0aa+8~h#G33%lg50!We4h_mczH^g#D8*%&t1B7 zGX2f9>0g3R4k>xcksy6C`!ZOjNYbo~ft-ff6iLQeLt5$X{U#(`NKP^czkdCCJu*wR z%krHPsi^?MgNbWNON-3x`Zg)aduE36P#-_%CKhb<1C9iLj(Z-bRdFxkUrjv?46oZD zwx_;Iw~4p9^ziA^q#xYHOo>zoQ}{P>Vhiu>FWY0Dqiaz>hT3v zrbm9_!Dir84b7rYy}Z=!z#DIPR>1drT*;B^Fz{rx z&M3Uqf8w$Mh7+3A4~TvDawI0mg7p&}eNUP}SYmU`%VSu(<+;-aXOm3+)t^#d*j{LR z=u3Am^(!g5{ZjmB{op;BP(rrd%WZG_etqA8t@9H6HrCr^G8FH}tJ#scPRT?y&~>RH z8RFGN*LjyGnss4!u3z5>EGjN?$(Z?DnOe7!IoS)wH9F#53xg&d#@|ZMGCaB6FLrk# z_J1ZNp(5%xXiQjS7Q+;ZY1Z&n26xRP<5)Bo)Gw}Ogrvx!bKzmFhc@m$6G62Ms zO}LuHq*tF0k1~CJ-d01&OJyr^wbvg%`mq@>*UmF|aXV0NplT7optO(_RAg6jzTqok za9Bu7LOPOIA%MBZLmvIm&Vxr`pUQYx-yfdO44o7JV@2(YB!(&`GELs<+bEyNGN7~z zy?_=HGnUwW55AocyN?!BSXZ#eO@^x3Kh1{>8gz{Qkh~Mg(c{dR)%To+*+^=auo^VX zq%#@9?W{xS=lX?^=m?n$!IvpGBA}jVcRisQm_CDHOif+*wI4rRWfujL>m>fx5522? 
zoyJ~pwoQz^qP*jEy^bAktt~|F>9gR_p#gtXI;wSgUzb=p^nmx_#lwfI57?4g*Wu2u z>e3Zu?bUa{6QAnN`4TRg9=j~F#V#rn!aPn3_7HDoKqh3 zAeQd$CmfkYr+XV9?;Z&S;+>RKdf8;o^aTUx#}CZDvUa&Rg@(Dj%z$>QqHX^9m_W^v zC-mk1dO8zSlxg+&QnKt`wrfeDxQaG zk~B6%DPT@n2xByW{Hey%uk1S491KsQJVn|HM<)+I2g(#&O^j^$mE|h_R{ckK6%?#^ zA86X$Mrp?3&&!frC#iRX$L)E}X<5tFADh<=v>!O)V5RcA#SfZh&kUPlJpJN@Enip>Yw_L;;CY=1XX0gO_TDuv|9 zp4gq`D{$-kF!{l*kl9ph*{LM1`M67iGw^qS$xJNu_B4CVdt@NL<>ab7zBUXUkIo z4vP$220ne!;>+R=n(ZQ@U0i&|YXmnM1^yt}(Q@2^@As1FdaZRhq zLf5GV|LhNI)@|xQ`<>88Tg1S@Jb?Fsu=M!dUt4R3Jz7*UX^Q?1qpE>de|GrucNFq~ z6!)3068q^4!8*tYB(Y)Q_mg+E9)}yfO*t?1c~-Fkq#EO{0r1d2zP?H;3`&fc2rvu4 zG%-q25J=tV6=K3Fzz9wbCZ>OUoK|vJ5EbP9M$uBEk;BD_*^?eS4&h?@QRDoB>j(QG zEA{2*2BW7*B=^bTQN=v;6c&G%9FjQCgeDFTh;&j{R(6Ez|5aD{>9O9)F~i z)C^`#tigAYhu*u_l4Mv4#EeDXZnMIcJU#r5KlOgtOb6v)*y40-Mvu(*v3BUAqgp3X zWl*LMtT+;S;6ULvqduq5T zYJWVjd9rxsafO!40u25AGZ$KZPmX%OV9_F<@grA8m}WJJ8yjBnU2pX)oH8dS4q0k z)r7}=F7hqmtkbrOHx2llO?BzOiH8LYRC={mj-xAy1 z;b)IAV~#Kq0KchsIFPRmqd@z?J^xfu8Mlpwgit>T{#}W849N!Q@ndLS6jKaqQ7-j~ z%xR1vi4nSg?G!o~z)|p8uQvw2IkL~=592;{V!6EI=R7k2HmncecqNga;~Mbc3Ov3~ zmCXj6BRk>b!De`c_wxSlCc>-l?Uz`5(W>!-`q#e1w*X;s(mZ-+r*Cy_sHc44a{w7U zqrW+67zfu6JdIem9mNRa6i;Wty0@*+a_BocszdoQzw0c`8z2^g$&=$ASrO)K7IpOC z;loqcthrb)c@TXA0ZI}-sylcR8Mc>m`UbV=$;w!b3x0R+voSLE_sh@AZu}lFwC3y3 zo-JE8-0X3yarnK@IsJO-J4}o4_(7n*I1$KTh@&Ad@eQKezA^`d)gIJKz=mf!f5qk6K-t z*DY|HbKHfxZv`p4jtsnDr97r<>+T)2+i&>gaW$tp%}`(O#SMMqMk+&1iuFcsaIFt} zwSUal(06tH-k#Z~tVA$CzH@)%Jcv9b@o&oBY*Y(CPB}rZcf#sIbRfe>W9b){oBXZu zuCA_8^V@2K#6EtgiD!zn0u-3S{Qc9X`c7Q`A!P5~GhVeD03#zS{kCNsmmS(HNX1(R zz-C!`b~%_eUq%{HsCmh1?%sX6T_5eOZ7!#SM<*c#hh&@n?#xy{KS_@j!vua_n?5nU zRjc*!>2~ZGz3uhPjxRoTJ8G~1!V^{PGmc(RGs36zNHPn?3(v61V%-c z2J_iB102Bua0WdAPlS7z3jPr5sW9y*N(SN&?Bml1{1+JkC5rkDu0jMx;4>a3HM=dw zIebq-i8XX+;!B^MV@A*_$?!(%cjAWZ<4K)v-aNM4KHOo7<u(MkH0b1kWe+1%^^KSPa4hm_ ztna_3W{qRPY-4Y4i-?HVuh#nNF5h^4T++%0SoYq3ePwugjpwx3RkFTPwK`XcC|g^t#d?d{9lBYgr}Lb_|yHzii?SfQOJBQ=ROGBWl}UlyZaN8 
zMTpuZn!w}D_YN^$0JS{qCxD1;x;KQZXT=GGQ%9Mqug)8+?Xe~gUnQ_@PjCZLvVZ)r2>A$$=JUNof&Z9Kd zw@iX3UVnB31QO2fZ@euLzGzg8BvOl3H7^7QnHWzLX@!YwPX(de1^(xo%z>^%^>bEi-6pN(#X zesA(LZ_KK#yZ5yDU>!v-uRk+Ru4aMZx1h!UEpbgkk(W4U_zokWkShS7Vojm9_v5K# z4ZSQ)YTCT{%2zm~=iUK+XW|Bb9IuM#;|_1!jVK8i9|QriMHmUHH2r$<-(wVK zOVy-Es}^yM+lbv9(IvCr;D@zNsTs%7i7|TbJe`8jp}sJfu!sM1S`Zir&RTDayQZdQ z{JqN3B`XO)0HG}|OijI}wZ~W!v+ktD9-EIDJ;i;E!4ZO1@@D9AitX%`^-h}SDpx-2 z0g)~M9~GoMV2035E!vt3?t z$THE!czM3#>#CK-*|+?U=v~34s8jst#@40znciN;{;wS$g!M9TsVn)!FgMc<@U7Tm&`&v}RU-=Fk63MYP+vrC@zg_9(c%%k_<2wZST z+ozJIj{t!=muYP?_Cv%PLQOJ!Q~1Zzb5mgXyOxvo>gd%Ls{YPoYF)@9!@vngP-k;^ zU@x;;lu0BQ-ey#S9Fvd{7|4SJHeg`$iHhpO+uMsqk9&tRjhl4Zb;9{r;a%}gMr_=l zp;*NF2jV^m^~NFv=qp$)fH^I#@H?m_I3HX#)b(wY^Pd4$rjkz+g50J9bFy0quzb@en|Zbf-0#Gh|Nd%*Fwr$j6l{Vz(%& zjSJs+tVhqyr<1Dgj!8IU>VJlouG3H4dDc7HnF-&6w8-aEuC}#YmspY+s}9i19J?pTqpxng@bsLR{eTfkh0=i z#G|Z^uM2h_@T_V6CHQZxN4QH(zWuyn)1ErlU(T+Y_Vn3}a}EY0-}x&znCCrteEn{- zua|xeayiuFh=rM|A<>+kcDiDjcoJzcgQBjJidWsd%SqS8BdYSOSlt^ z&LAj9-c}mN2+nmGU0qA_c2I5(3|V4;`sV28)B4tfA%!BX;%1Q@$ZI6Q{qsGKX7s|1 znuaL&A!*J~Ub4CYKpK&cTZ3Ye$wmv z_nrf%t+4CfIMK#`%&UV}xp(D%n?Ikq@7C=JgFY|q#j`5?b-Q*qaSBdkCZmz*MT>>L zz4=B?-_25KO0M0#TmEEvdG1UHbN8uJj!kS+q_eZDLfYO9KS@*D-R@~hmqqv5 zwm;MV<_p^u)-zAHzq5O{>Q%3I#_N7s_H?T%DBBU$$0A+V$TxhSTnmn1k#ez*55~H! zRX4;7K_n|e#0Gf`Li7bOZVU<_6sO!!?9adwQ)W*7#^5i^i@0;4l*RgS)cV-C;|lK% zTya%s{=GkYi6kC^#x9H~cI!a{E1t@}9fm?dyl5lWv&|OeN*c@@qCS`jxGu?YKy?qK zQJ$0DURgO%=+HNdR8`xErcBlYAnCx0E!q+!DwB}7!Dk4hO8{~x?pe5J>QslNC(88W ztwI$B?4@WkTMMQ`g$hVa?!abpHx)=B;IgJNDhPusvxY2ZG3Wu z@B>lCF9MLPRxt@#^=8Z5ahux?4&3Qg?rx!>-}OUTs;_@`^ifpzZ54V(j(l*`-b$%z z7VBKYfEHGPYT|I7###8A*1^pO-^m);s0}T;2gwz~ricBGe}d03WUheQ15CV4S0vdf3tT< zZDVW8%eQ<(;Y6SNNSc8TivM1>++@^CeZHktW8Ue1jr7;5=I`(<_Rnd}97<`tbH!&? 
zI`}(mM!8do`9)>1bU@N$4cV5sLe$0A+*&o7&K|Lqq!WKC&b!wUm6d-j_&D6?+tnD4V+fB3NV;K75ZL++>S zeXhY`?E}~RX2tP(E&%(4-|)j;Ty-t;w^Hz-mMC=-GQWBc9@Msjt+sYUwTcV{>m*a- ztd#Ck58iAu_WIs_KDTroceT~@+*NR3&z@aYS6#Z@JGj zyI7FGzvtwW)fp02;(j&AcN1wwFTIuiX*k#izExw=*bbzxf>N(QB;XXYM`^u>$;_!` zGXsw08t8A@RlUej{oc}tdsbT7f?-IHC6RP!40z+D;btCTwE7REIc{w`up);zkA__I zThE`{6poM44h(GQ_V}@J@SZ)#%$`S2pS3wP(y(p7vrE%YuKZcyvpiMFa8K`cXU?@t zbE^KS*l@9-p-Lm*9LNfxlPP$#hN!A<;*(J6rv1>>-jU+B_+DFva-s3kvjcdwmfGN;J~B# zzB-H)U}r@60>d$WxS#mJlb5;1YnSmB-OCXX=USJ>m>Vp=yS6~V%)731{5tnrv6c?E zmvLRT;;WLi-yjeWF;N^7XpYLx;@E9}(`Qm2%@xQDoh%<7^_zD~#cR_9)%91e#eVoO zwA6jZhr5K@U`t{8vkL97bjgxQNcB-CRI8LK&Re{;?w_}ihsayiKYZLadXLbAAO(R2 z1BdRPb7ji}G)9UW4%Sv|GhO_qq*+*)X>{u!Wm=JyojNsbJS%{1loDr$cM6602fkUsoe{g*FQLPIflR}FfZv<$D z_$IBMXuRamNb{3*{lB#NZSS;u#z3?4$XKu+DIsIuicx#D5%O}`d^mX6un%}srDz~# zdbesW2`0vjs4EN`wbgQf1qst*;q3|sb^iSs{@;vo<3 zk&!hgPM-A0$;h!#KUi;`i{45D|5Jgwi&TTM=-y=^4>~+REM%+K!7>GeT7qg@gqZs( zPrAn2^m%#ksJGqOaWfW1o_YPJ@aOSeZObdYS2)L9y0q)T-_gScuOIklbyT|tM|>|8 z#IEyRwd$ODu4cs)AOWA_$B#e#Y}*Dhd4$1?VAJstxy|eL=^hIRIJ04BT6&8emr8Sf z5C5k}Nb~02s)<3!$)ru73e);3`93g=6+1;+`U-}6^mImXxnryX=TD71X}n*{L7P=b zRSClumt5Xu(kMLK;M2Qf`qe6l#?B(5q^S{TS@L0I1tjO_$n_Py$){}2#ab0MZk+3d zN3BnIx{vVcil^nu>qC1RF3oRVSyKITkVn$4H@0S~RsGuR>G%F!WLaFqSPfTez6lY4 z2ftQg!@xL}xvT-G?Qy2J>M?#zB21gMa_%2!!Tg93VEa;sZp+DQ>B*TW25|c1!Iy1z zykouj$sIe+*c>>lsXogtrFn9)PkqbT14c(G4{4jDQRZef!AXnpl~9Lz;C2E$%v-eR zL|M;((TfF56YW)o`}E^6S2efVXb$*nx%bn_+~z+|rtfT)Vm4|`dz%d}w*>_~(pgnz z7yh9xOUrkHfkN;^7W01R7^bn1Fcmr(S+gfrLg~C{-}j!4yI`aLW9jR0o%ECoys+_Z zxi~p${-mXOO;wGYpEs@irj^>Xvt>z%Pl~B|+72g;_a_>GVN*zjEzX2MeH`WB zxzv}4y<1(DH=VJqZWZpi=NFx~C0nQUsMwB`bsR?ri>-XG`Bl6msdvZKFP{kk-?F#B zeTI6&?8ppb{XsF)x_Jg!N5)EZf;&TlzMs!HrSO)*`e{NYQ@^iW^PADLN$aLdu4r8~ zJ!^h>f#2K&w=WtwWrdSX0v`?k8JF3>M;v~pZgmfmt^VL z$*X)JnM!|*#k>9xAi2Ls(3HL-q8LCbe^nN?82jR)e&|4hn#fZFieJvM&LCR7XyUt~ zemV}QxV{t1HrjRdlJxY9S^Yz*BuH=Oc!T~dg4Oc|ea4QU#)7$q9 z+@WOfGu|fT)#-m1;Yixs^cvl5qbD+jN+-Z!OsN#14(jS=O`0@8Zvoo@zxDquQ*zg6 
zehot&SyNrp9Ii|>h*MG-GjHA*_e;l~FI~{GW7o*l>0fP$^cV2X zntj;98W=?Tm>*(wuT}}kPhvV@WKV)i6@~YJp#WlSaJyyxkgMEFYY(_4alJ(HX2@kx)Rl{!4G#-ey*p@ch*XPxz%H z3BnR7HTo?HIKTufF~V2_AxqVm6FRa^_u5gl&@4RLEX%12zH^q4T>1$z@h2x9t~oz$ zUe8n3^~pp2;_&k27Uim9M}5;91$M@z6k+R8){NVW0k^E;ET5!VYD{UP{csv3J{`iserrxk3 zMe(yTWoI)G);ym8XA6>Gb8*>ki#vqn&bqiv0G+h_{7W_oxwm%h>2)tY-lo|$PZaX6 zysOTXm(~4k;^l=6*Pt%v!+qa-A@<%%E&pb|>3!nF&8xFtRGWW9d<{%@>C)Onm9_A~ zvO)$tCzx;`3ZCp(f%WN&cbwhJg%RMx2KT$$Z>8uCk#(>}D&aaO2Z&5$*4&gC=;spPqE z+5eyvg?bX*DHtbWzFSyR03w83VkIVWkg2iDS|R}XcW^rLQ$5(P2T*ZUeLIgCC-wUs zR<=p>xw6)`y^31&IXCN8U2Z-KH}KrIss8BF7LS0>5sRP&bz)GRc z&;&}-Cbt7esi*|y0;{W^wCT9>NM8<7PCh0H6~H`W)Wm1vovqGgc%lsWh`=F=!$mSg z&~cpu-y^+4Mh$uPc=Z1-uHi_vkfU%`&_Pv5$bWJS^&ZwCB4Tc}Yq(b#5cvKk^TM@s zZPbqs8qz-EctT2J^0hY9n|iSMgV>YZtlhA*%c=;W_uz)M&~fpe%NjKlH@J8VVXRaC zRJ7o+l{uT91BJc+VuPQvk1Ke6ojMgon>hJ_ZCObDuzkla5vG5-jMUt*v|oFJw7Qog z=hb(pS`nU$&`ha&>H0Mmal2QM8WJ_1pes>7ardK9TwhFryi{2I%3FneWUzGutQ;oLn+J`0k znP2X!D`yr59V^d|F@NT{b*q8$i`sQLN0ugrJ$9|URIs7N!McaupFm7$!jYEY*m{5f z!cC!&?BXXLDmX=g8XXj$jmqEUmwn+7Ya?%(@6HM(JV zYqz`sBi6V3Rn}4UTttkO8}Af*e5XOKK~bZY_dz{EX=^hM#V=q#eilrl5^@2n2==o0 z=INxBx0d9V6*xI`%jGBiXKtAiS9VP?qN+5%g~s*oA6r~Z z%eQYa!*kRq&!f5Lr*>X$j;s@~k0&Wj?Wo61>=xUZR!;xx>l7VE?P9yNN)9pU!$U~zMpGB}C5gB!eg|wd^C@fITXPpB@|4BN0Xue}^&4Ngff~$I z2h|?p<-)6m3Y8WhKVxJvodNKxS9OV+#QmazyS;B{?l{2xMp5^N#syHwX>`QFLNB03 zt3t)x1mQ25KO*vQ4_-cb8T22d=?$PNKb=+r5C4dnXO(whWF`|saMht!3UamvoSTNu8xOIFK*DVQEiu38}fi|xF*L<)nk7q5phd-BaBP7=+PmF zq5VRwcgcIcDx%Hk^OgV~v>X2}C_5Ts6$j0H^TmrRcNPz{OIFR9Qd!b#KqPVv*(1v0 z85S11I#jy!ba(pUsGmD_SO?_!9ZqZ6Z2`PE^z^cu@8_v!+r9hCUtk|XRob>C4kwxb zVUVqWj9^~IuAad3p<#dE7?9mGCV*S+M`H{bMVT6fV!^W;gd9|~71Or8~gN->Ty2%FGq_N~}hw_KEw?+K^?Y1QgoWQ%wkMrr!6D9a6#DI=I3X#HnVjYZEtSI-iuRG@cvY_C|#*G`B zsLbrU=PYV3RGmLR+89rXQ!?{Tuv6OGFSY8Q149&41~zVdGqd>WdV}s4?oGL|Y3Pts zb5cy%^g~(u2yDF}z!;J5ftjQg6IY537eP*MTs#ToiTK?&Jq+L#%Qrr|pA+Sb%PU=C zeyWI5U9sWFs7s}r!2EX~Ib!!k!mi)J+2`NCB6pQZ6CcN~Yh^M2QRi7%r2*y{sKxo7 
zyjLQ;Mf)j1#sF10S?9kNu57kCW=dlf6^9p1V=a8T{VKE>)8$~rbJd4k@a5nHBckNJ z)7Ui(PTZsA|7j7FxhO<8nJ%%pGNX_7B{)N&eRPLjwLi$}&zfJ}84t(axajP>B>!s< zxK-59x((tel=xAJalrY++|Vp}_D*P1TvsUKW*LTg$&*@llMMI4U6zL4~W4 zRm`T(Frg4p4AL|N#h;FiGv3U}ee?9b_Eik4b0*!OMV56MBE{tQC@F0`swjGYHp5cg z9hNDzZ9fMaF8TZY7sLT1j}DHnID<4-l5cnv1jb@QL}^zUkTX2)A76u$lXh%V%+$br z;rh>%DO+??nHalIP>S)QN9EF@N@>iIN55^1JyYf|?XZ?oLNA&?9geB{GSMOnUkQH5 zfS4E`%eaD~&$C(_I_%u6<+4+gzqM(m@cc<_zEZnxmbGS!W)(0+Li!A3`|y%a;r~T1 zP`_qgeqLcOqh)MNlz`ftCj`v9_K%@BZft9G?SD4)&x)hhn>Ed^3l*%84r!uts(;2U zboRiQkzYscJ$P_3^-j`CaR(uvD}8)ZbWR2aKBryYO@eklK?fhS_ou0Cl3NRwv1|!u z3y{~MOUrMSJn`9>zPLQ3&6DRb%M?b{6(|C~W;foKx2*H68Kj$sE% zZ`rm++a@;lFR&H2a&j)@`71_6M}G&rmW(QVcj6_3DsFc;_hVBUP8d%5(j4c>#>V5o z=oc&)|9|+=&pkPkW3}EI%P5upJyK&7iJ{C3dtkHlOkccSqNp;^KezVps2^YY|2T7P z-HfMee=WOxGgP(uI2|D_tC9^M_iBvOfKg^}ulCM-3au@V0PRyKof(1Okzqg^Z zj{>epa?e)@JJn*~K%XAR%=%rh-%$H=@$gnD7tQX(l4KCgbADm5$<>Dmgu8qFJ1#eQKxRXkvZatDIbqUA2jvmjb=e4L9*z z`LXq>+09yXXgZ_cdKPx?b_iWF5Tv z^WhIUQO_Z%@q6h)6g$Vv&1|1uH+K5dv*R77E?(IC zW!A?B&9=Q#9lA}sb)2E9sJuTQ7L?+~9C?`T=J4~GKt>|a=BSor0|2ejW}8wOEVz(^ zL2eiFD>x1w!45JJ$HavlCMJS=I1jx5Z?a6vSC1U~y)QI$Zc@Qy#%lVwkiOUQ(39c}F+0j{s#%r~CWe#60M zPn))wUg)j$<4c@l=3wo1c8eE$U%q^7aP(#i4F}S>8Gu(6KkA9Vg?Q!HR5<}tXmKI; z8LFDl=zxoT4Yoa2(!keX#+_M8>%Dlh-|v||W*#`qW#2H&F_JNVIKDz68kQ-cGo_OM z%z3g3?Wx3o@cnb5G)7tMHL$vWXiTfO>7mZ0MPCQz)Z9PN$ocs=%{KS+R%34e#_yI0 zdCB7DC?{wxoDku9VlrFHhUQ50?0s44?bekqd@Gwb1`V}25$au8(MwVb>2!({}{0CG#iKe&ruqpqZ4QPt#ZbsNeYim z{~q*HQC8N-O9&k8#$Z==<@J1L=UR*Ey=*bCSRa%cJ;CnCqmDzz*qN3mo>z=Ge(~bX zviZXn-6yB?<=T~O0+_pec|uvwF+)=8{*J=TOuUq}$3Xw$b-UupII)X&`iy_!9Z#Uw zr}Vq`m2A9z7SA|q`Z#&DL6F6$Q8%7?mqZ0FwU|1!=t|j^bFF8bIkAWl&>|+b?8!OH zBYmm;CjgI$irT_>cI{8aWW=N*{oys${QbLbQ3H6!B{!~JGuW7wt#Dv?er0EWi!{&B z6uv2&R8p2-s|L3$d3v^2%H&b{`GbZiSACxzUQ(9AWtYhKIY+MfXPlQP5KRAZ-xRDu zKH1mrV{RVa#{RHwVnY0O<1Z8c(~3a;Q65edp9;*ehU}GOX2?ykU?Bv(}Tg_T%}7Cn}yVRTOi_s5QnJ zS^bYzz!ChgY1Oj*lTAxCf(}Q!eS33TSu=nz!jT#9dW8DtyDd})^Yk6~PgZD)XZ0;}_UcC-Sik80?Hr4q 
zv8%N}C<*Wmj8Im6*p-jQh|36Zyez!1nmG^R#TcvSsV7NAzwVxt^qS`=dLhrwnm&5&TLx$;D(ZKg68X#Q z&OBmpMF2SGKm#+`3OwCo1t*|ExO%T93Czf>)ca4U5fE^<*Ut@_efk=ic1fv>Cptp9 zhg|KFZT=fqj77IkbMG3j$Jb_gG*CMj4bxDNTTtO-RuwX%Y#22?#KgF=_G9;=I=YC* zc7Cm27lmJ%7hutVM+=n#eSGE+M=ehyC_AjzRcV#$EKONbE#adhRz~oKjUF=niPic} zbe{~{C-;@@veAs3qTIRJQ12beH~#!)SEO}eP zyg(A`6uVEfZ?or*2Q2t56W@@kO%3|H`1Yt+CJC8>x`TpjVcetI&~Bb}eld)e89XCaJq!8euZKHjLDzJ32q)nJXN^_obP+;JE+EEshy(ucXf%jLUn9V0?r4qeN zgPE(HYM?u;#GBz@x96~?-!hcDSDH=#wm5#rN`MfJ|jp?G?ck?nH*3DcoiNm!+2P06@BaZh} zRfBwh_IMjvS>5b+T}KNw0&(cz1gn4#lq^$9ZEN_v{;=dho~JnDfZpxDw2m`tR_M`S z(V{^P&2!GJaaNs92w!XIVc62iqIIHTB$EMKAn_c)?l*A-q6YH-uD(2=fu~?>-0n^i zLTns{vmU1r5{uNm$%~QL@u4FDzU3=O8mxQ;oN!E1u^uhqdFUmgZ5|Bo)!^#YzhCUV zk3QI{`mn7W3G`SjaMGrWf^~d?mV~p-I@L>09JF+99;&uc8Yo!ACIDLpTR)K|cS-h} zx~HScst!D;+myPx?0$i{s-l9=eue(O)F*TdjJ2BV?QFT$ZRuo7%S7EDL=T!xusyDs zI9wrZntyMrUj?NPzSq3Xc;8$hGC5#OS$==5uVD{YKUn?Xx5tr(P7_zGICFQ+vuEdu zOnR@qs&n!~yF>JK6OHRsYR?)Do6^DvwgyKSOz2JREX)Z4UjUE7sFkoW7LHo4G`8)47-n+LF5Dfbz zrez=w3-d^&=EVhbe*JAX#A}L?Bie77++aEScJXA51L~-VY114BbrL3x(&L^GxN#>I z{;aI5{Uh!$m(j+q#P@q}%)_EvtCn=x^{N|!!KR8~_6>igF=R13w|Y4G`}VsPXjs6g zx1aw>1Q2_@$yfUJLf0PrE@fStz58w^q-MGdOZ}tzaPG9j8hdl=RAJrvAl8MJc`4U$ z1rd9D?uLTM9Vg0l-M5Hm&tspI$fv@`9XD)-M%&pfQtTdrRN@YkX{B!|;^ZC3e;JzV z(aJxD-L>6q>5|4jo3?62lXhY2KM(Ge^qrknAO4p-bs;r#uL>!AfCoDU$Mcs_ zX%}2Pwd%ZM;;NQz7JL8LMnCyREgOu-5D=m>4rBqE<&65ZBXGBxz4F{88*XUMUDYM7 zU0SiL?w%d3RYnuF=W_T^ISiVRY@j6bJS`h~TUm90mwGxU)^i??&4h%qR*eqx6yQB4 z{3c-KtZf}$zwc8w7NUa0LxA4GCvn-WjpFDYj zU|jpbQ-t{(3Xn8s_-EahM|uE}T-G)DGKiPgwdB z&&hh8d}Tw+9$#K|-g45yn<=D%%gZWt>#~n_NZS#ifRt>OSd^)v5NvWduZyv(CQRKtQ>4WieVEm=IciWBfs+;p*&gwoJQf9{-+w?A= z`XLN-8Yclb{Ux_U8xF7KdKbxy0jWv2!HK+@SH>GT3d4!e>N}XGJ!`Yl!r}Lb&f5Q~ z%+Aysmn&N_xHywjY%DlCIQr3aa^d^(J`&EvtPe9)rwZEN9Wa2Wpt15C0!=A5dySkMB!tclA zyuYb)a&oBo8MVHBtQ2Vuo2lmD7WkVhlMrEh+Me{w0@l{l_g@^dZ&7vt83z zirJ7mkt$=ox=z|%?`iUCHOc_68*C@V9)mmyP!nKENm#h#@c^5?cORXo-gnQ3PTn`p zu%Smq!0dz5%L~yly(ult;`pvVt6#(Q1{D<2aO{q=lUCd)KIcYk22mJxmV=dlvi(~& 
zr_eJVVawUWm!VT1`O7UqpQav%ID)O!vK-n7C4I&EsvtN@H7mqGALQzAm{4h|=r5 zJJXByPr^LOlbVit5PlYF6%djn=-#77y{NKF(AwEGSnu+wc;5BZh7;AoO-BdXAS}9d z{$XZ7$T+LnM?KnU^nV|)f5!29o>7h50s?M2v7;A@7ROjuUDu2xXp}=-C8w|cR4TTJV@K?&$_o|^sC?C5WlmgfE4PDAt6eXV$opymuoJouxMSb#7v8d&hjejk=_J( z!2TND7Jqx8sj1+2?W|;p3Jd|ux({KCi-CG^Y`j4@jI^g7>w4r&1TaL*boKq}4jnt@ zum2Nk*Jk9%^4lHH_0VzLWop$(8?VIhNwKbCo?(S64kFf`zM&(O4b4F1;il?hDay|N zc5!K|!T~78gz5%A3ai=OOu2KA-mgwr>IeO`Bk>j2@sLanBuK?bj&)afqdL z^3I&AKn}2@;LSE7ZMj}3k+PP$T|MjBa-pTu7kvR z&abZ$mpxxNd0J}%$EHWx0~?P1wLdxG+_BR!i{`nxx-R4)`1~^Y$->_lO|3633%h%c z>|1T3ByEn~gP}}2O#EAiD6=K5|5~zHb3zu&NZI=WVlC?opaxxH%}x4TaPD81nv`U1 zJ+skIqmYjJN|yQ`cp$McKOymtO+Pm!&__aHd6n2rmaN%g30a)@>=K@MEqmIdg>LF! zr+r!Kj@RW_Osce+y6|#R5?NPS3`Ofm`xUTSQe4Tx8cniHU7&BGM)>o@hSf}0+T`@k zSZ%rD$!yIy`Rf@PV8T(L7asI-BJThtXYH^}be`JbY3XqG$shk8-rhT|=l<{i4q4eF z8b(ObP*D=T-hdb=g#Uc=Z=;HJfB(>*H=NS54+PlpvdKV z&cdx7yUhrU{@Y$^`^o5bMT}U(rgZ9;02s-(En-rFA;lz9ay!%U{kFT7abcG0iKA0a zd6>>NEq14d5~7Cm45F#9**G2eY`%J@+sTU)CroHY+iEe-VEQq>?3m{)_hAGbW2J`-642DlI^1$&$?a7_X_@$OLeq&cz7+uXhiZr z>Bzq}B7wl}f_CPfjP!I(=7@X9lo>BWU`0{U^R_o70beD9 zx6!a(^sYB?^K!xJICOLCFc$6Ul$0>a)_cU6ZKz`Qik13SQNik`}Bg@NxE-;pVrXL({sLO#}~J|PFOqp!fxlU4#|lT;icW> zBo80>H-S{4PADn+HgON}r*AaVq{=1W= zo?gn&Ohscca6(L}89H>RwTg^^)? 
zu~!#$^r{`xIYGZoaqU}$;r&Dxm(_oNe_Aur_*w13(6W1J z$(tId?|gn&R?l~#{h@OzoBQF(?&n%V_$O+bA^xupOw`-#@2`ijpfb^Lo$65oct*aI|4qqRIT=ghT1uS>!sAMNVWHm-+v*OVa!3JMaF zfAwBmcJQ*ql$IG^z1^kmcyvpAo}K@&-w2zOB1{W+AD#KN$wB`HwpmoKZ+NrWClzD$ zk`Lw#a{4~!oNdBr4UK$+SlkDmi(9_7w6s-vPI)GE=Z;1F2Xs(|9B03e`sH=cc)4z8 z&+@FaM8$)Vh4+uf#ia~?_inVgY|;Xa&K)=S=Z-5YnEPFRe8H1--YNP453cETAJV;R z*OaP+^T(IY-G8)i(&-tcq0LV`KV>v!nwF0Da9xuT8f`Si_)xI&jb=rOq;2e#GlP-1uvP+I*#<$qS}rXe`%hd@}Om{VVZ_3-?Zosf;^$ zvi+nbb_g?CtoZLz~H1MHyF%H>)R>K7bda&(x&na$NAGKvmo?QWkd zy4YNXTls-WtV&PHRoLP(X>I8BMN-}BJ;EfPUd_F-?0MaY`3`o8p?9PXbhR&+X>Q)S z@#8}_oE$%WUUDZL!&`OHmMhuD+=F)(YKmB^%Y+1g0=uUKb8qvT9fR)Pc>6}VVVgo_ z_qFTSXSW4KL=>1fx(xaC?MPzp9XpEd#GW6e5w+K<=U^4{@6`cm`XjC9t;y)pSbEHR z^M{ke^_JegZ)JOvp^2O4cJ5sDzAB}@zGrJ|!4|s#*H%60KR`W9J+-MRNwUND)+TPgaK6sPq2JsSp;i~1=MdSr*TG?au~g<wy- zyLR5$=g;H?2D&$_jGXrDeB`EUtg-mo>+@H}SdKoM;CyM?GmYWx^FpGMKc{%4+^o4Y z!usJi_0b2EyKLI@ex-$ug^H>w*nFCER>i{5ervQnm3q0oSUh5B0fh-2Oum)5!S?i5vjk1`iDFn?I9bn%S#xxX}_+_POSdzE=i_?Ax`m$?UrbFBkrQ}4E2~TT96G#Q%9HYrleB$YT%6gB;m%G>Ohu|7 zx=;}bO93l}YaYt_U8$&Kv_D^hsNgIo`8j99)i*tR_UXuPN15)prkslFE|#{J20WHp60+`<=Gs z)26-3R-G4bCS9j;^jN*;{ZsiKo)@lPKU!)L_UfmNhhF!t+qB%woX@GN*KfQWFlp15 zK6hg@%)1<&*zxbrq30f^x3-Zc@q`}D0Q)3EYFXpH&03(nPrKkPW8!6VW} z9#vK@t{QfBPCqR(+DytI#yMrYPS1YWSE2pqdGm&C*}tST;?0)tAHt<;$rvT|>*{E5@^f@43WSl(h~ajcR!i< z?ft`I#cuKBkHNYK{N>?t`G}j2^M>uzbhIy6=|;2mN$r}gs(B?D{T}h?7TP|V$i?2V zS4bbi-X>4*+1htKjRV1lm+Tv)J$qC_V4fkX3qL6+z}$$$_$7+Qe^f_>W?$~u(L3?9 zi^i5ke}7qxN%-_8#{2fpprFRv50Z}jx{=}6XH!7^_mYdQ-VtW*rw8?U`5{0r`&s0& zA)h|~o%B)3I83-*A^*?W=w)BgZp(D`4`|=s-Sqt)>4(R{_?|?-)_YZ*T>ReJ&TB>P z!U^VC>u&T&uGpEQc4$;0!Mx;Ccqi;RD$Ug#gpR*Dbvg z`2On#>T$$lH_OUGK*3L@-%*nh`}=-sU%z49w#k)zHK_RBcwl>3*2nSRANJKY0_>+) z1Xc|s-V@nCaP2>E@*tbra~CR*LCpx@iSx$v=+ptVGC_6MY`ZR#Og($e>FV`s80>3TTp$aDu@6m;ey zO&Lv@?w}=~8!wmj9WrD;a?PTYo=vW3}OhrC)x1etp9+FmH#V@qdHbH;D{DneNn?*rafQOc%k5BAA7dIe>Gb zJFJhiTWS?~=jV8O_kS}NY4OK@_gLD0c37jY`!fHw!LgOahdnM|KCxoH|6!vMZ+hP? 
ziIhHmMO)AO_};yP7JdG5Kjr5859;$jGSp&j+iDp0;Oky{C_`zy#7uW12Uy4Lh*%GD zY{goPBI7jMo_Yx4j^H&uh%Au(iaSptBO}+qetS+v-mglm~7DX=!QU782u11Prud1;zBN^o9#reMgxdGqrTwHx(#~9}gme$s01LwJ!ukIQikJ>`M36T%Xe%t$mWL)2gSA~LH^f+AO zWJ`WIv*-;Nj{U`j?eJkNDhwMb5`d_!g$bM;f$9JG<+91AfI!79PoKCS)zE&r-A6wL z4H%F@ba6)@;@A}pkGvhC<4j8ek~;qF^9O2>_@P4v8^_~odU+{-hyb4#!(j;ijr`wF zETm67slKwZg}@61$tHh%!-3LlRhRbiK3H(qEaS?mA2u%*czAfXZ@l0vf5CTp`Re#{ z=bmrjDwhyRj$mVM^fuxZ2;&wYl2H4IoND-;omp9v1RT$IObckD`Mr0+dC~!`Pjc2a zl38J`etNDy>=xv{xV8Gz>0dv~rquWESsYaydH0NYlc7WL)2E? zKO1uG*MV8D?AO~>PRi0>2l`B`l`OLq1!nh#2-#a4FukVy3PnzNDp9oFVm={8hBgXS z8p7Kvj(rDf#NSfq6CCDs>Cz`jw})=-ng!pVJMNBXY#cNHYR5oN;O}C9WLjWF0n9Nk zRsky*TwUGFUPJT?unvpJj{^KwCiKQ)1}JjaR=X#7GI}~*vC!P|$4{!;U*^G^Rad`c z#`yLOjmX$-7A0F4$Smw6g7*W9ZYP4Vd35Nwj}nqdqXgp(uXSd{!!KUyg{7qem{mUR zchg?}>xrF1USa_|4N`^uhYZD1gYVTrKqi#=BISr=!VOLd?ss*{m%4AecJky=c-6qU zF*}mxzXSyi$>Gqw%ev?waOzhXi4A2qKa2q6L-Il_L zZv`VlE|Sm=3nu)!a6zYl(Ig-Y4szsp<2ztv*y`dpCFl+TDv-z3O^NCHcwwlgmu+OM z$=rjcHbeD>M+G*w97xz$Ur(#U#8&_H`8_i#p0u{4+)2G+y>4`Bnr`s+?!BV^{z370+Suru%a<)HJQXt8 zgql_41Tn<+U#AmteA+1ChvFA9ezs;#zNb(bM7eAzJRQ3coDN=+lN5|Ah%7Lz@yx5; z!wH4G&Z@yDBC>GkJCqObUzb)s>HlnFa?8(AUFCXkpNPRxdG@gRQbonZXplxsoEQy$ zdFtsQcv*p?<>fuQax?nC0RVX(MX6G;dCWoQHmd)Q6>pA*k6Sg~Q_1|T*Hfjeu6xHu z2F^#oc904*`u5zYhzRaVz-Pdf&ujzv%I*LB6N{b2*f_&uyG?SX$aF>Z_9Z5Ija@>0 zc7^5AiW!?XuW#IXa(Kowzc;1cFQM1HkefLV|Q1w%G%Ex`z9sDbK zh8jZ5Y;m!&rR5T)qxbKZ-^~_5v>af1n}4;c$Bbj*FZ$#SHqr;O1J8|W`x7%+eZKXG z5%rg1Z5KQpNlDg+)&T*jTzzNbFk$TdR-tB(Y}#PMmz*CJ5}w$28PmYYb-q;k5w{~Z z8LV-ezx;%Y8odnqmvrdWz5nj7l13Lsr#Lz~qAOd1som7WBTg%q{4Hs+g7VsJx-1yE z3FV9$6k;@vX*c9?)Pa_^wmU52wM&mP=nudY`C5ESI@AQ&_L}gODyl2)HJli$yjN-e z{_<1#$3A2!rYmiLKw45N_|=;Y{!*DLt~;oDID&+F2=5GOskuNg`=<3OSlLR|@v!eP z=Q&7;#4sH-weDG_zFSalzLNQ%I!b-}_HM3UmR2oav`Qmt&#vIex|qIwb4uonksr3S znja;IbNIZFHh@PGMo!V{>QmW`py=6x6_s zNASDqZ9X=NS{olL+1&42q$-Ej*MFMfx%o!oE{ZU>@=I>sDJKU{dL{XN%kPoq`UYP@ z8!vcz5Y&+3?HHL={?KO3K%cW*s=m`@V=uIP`!-%B0ckAsX*1BH79c#&3?7aVdWQ>C 
zAjovC!i{l-ieS!GIgRH+7k}PHHGE0uSB=FP8J`S7_8+VZNf@iTaf8d%lx}j$Kv#{m z*+Y|T=O@2e0WsS0ml^?BIhW&HymNtH%J@H^)F=?tsc=H?8ax=>T>Ghvm^34j z*IDesUZWIu;=*eqx9TTPo=nI~z(qa2@Z#X1y=H|JeI94G_|=!M+dFo1*G;iouw5>C z*wPskU;oX$)+v_JP*oK&7a=Nu{UegrrNrR6lyTU~33+%W7!-eF@q#f<>06=W@rfjT z-~32NS~}(F=CxP6zM2_ti`g6&Sqik}JMmcKqe<%$fBl;L(uHP_n zJbl3B$kJkp5`a-rPe5L_UUioi-{)U9efsMk91QoNxt16A8Wb9CyzafC^1btWtRB64 z)xA0DwMNI_{t$&njy&z5ob*?n76Ea{FccgDMA!ZD<@~aJs(d3SrusT)aYV!#vIH-CR&*Co*(o(9eGOSiE9dy0oh!$FScU%UGk7wl!E zz~sNbeE4wU_CyvMb1&c-yu-1%OYSC~zqb^9Se|`O1F`qB3v8-%Dk`*3pBeuqt=8S} zbCV1V}Ym5l9+PMh8k#&3+`gnYiF%I`-Njm6c;DJ@|f=GTyvCg#nX z#Pvl`#N{p*oR=Q(9|TZYveEKvtV(uvTv*-g9-Z21&p$nHU39$9-#e=>pq)Mt8fuaJMq8%4uq$&X zS7iEniky*qBq8^6n3-HzdnJ+3LZ6xyBHyFN?xBpS*+0%%3t!D{OG`*NI>ozOp-R2F z==~3mV#(kRl(h)dY?O`Pt?UGhBD^}Hc+&UN6hi?xSd|9YB-wfnPr#exspOwHjUe3q z{jJr_-ffs2CYw5|CdG~$eOQuj;UeyZ@k&!FZcA`sfNoN+qaW|f>1=W0+)CQ z+4JF1cex69;_Z+Xvf*JcQ9p?3+eaqubkSJTvS`BL!z$iBu8%dVF6TUacx9J6M%O*_Zdq>}wrlU+1m6`VJ_^Pu z9$#*E)mwb!V(*?q{;A8a?>nNUrTKYK%KTM(2Y3gbiXLwB-hcm`)zaudMJxg_#=8)s z*o50~#}p$FDl93v^YkgNxr8^h(!@5SZ&2=3N5|+E*?B^~;976u*Qk|oyLUlG*XZ$) z%GzP!^^)@!&2?F=`acRY&1Dsem>X{_Z%VUC*@ON(FK-9eC7#Gt6Dy78K-8v?8_|^g zx9iw(>gn&bDCo}FZ6^WmAo?KDu(%B_QW5_t(r0yRH{BvKIfKDL!nMNzD2o2n$@$+t zd{X=J$Mu$sY=1?y#J~+Aa0bnYcvT?II0<&Gah^T9i%9ECN_+N_zbMKv-Wt)xWw8kf z*|l4LZR#;;-LH~fQ>ydh+!NayUKjI8c)or#Sj?i0D$In{`FG?yAS+4>H0OdNR>1Ck zQ6>JT((aIuvq)+VeoyxNvO(|jx~VygUEe=AW$!BQ6sM zs2C+R$5xj4!Q5!C=zs*tFKRpyKqWE|Jc=UbPppM?>LMkDKlkY&`-bvidP8#~JE_~* z>1FwNB@X(tCMj%u``lovXps|jZ0=5%kJ^el7>n(*P#dLV}%)M?ZinxzVsPp z8W#^D$&TB4>8l!P;JyNbVT1e08 z*MJ~zE@U-f0NxAC>8v|%&KyN$yYd?WN~NXKCJuPO%;mKrMo4F+P@0NZ8bVyv)Edty zyPszfuykHV$4&mTLPgbMrr7yot}QXIN=bF?u<6&OqZz&*o)jOnKH&ee)b-`<-DcMe z&(~wJEhmvPEG!JSiQ=eHZx5EnF>)92Ef$M&4?p>Be09K6H9x|=fCRft~0XGYShBNoAnZ|Of9!(a8K># zpnoKLbnbcd_PtBGH*Odl((To$XaA-Tf$~22qZ6e}Cp?dR^Jng{yEXl{JzsF*Vs9_6 zlS`l4wgw@Z!l`TmYDDF?^=R|b+R64;y8QUIJ!Ep2>G-zRf=8wDC+-&>tSBStn 
zS$g2DH!-)qywhCV{iy!vB|ic<+1IU+ei=TzRd@F6*|Ea+IN+vFSoA#WRZj@bf;8lgk}C&T1a|X1|ra?#y@1UN3LE4c9sKyywV? zj>UKOLf_0NKl;2>Nl6KqDpSUQ$uO_}(bl*wd2CW~Q+iEiq12mS(0UJ53yZ^(V&~5N zbYo}F+-KLzlVgs~o2+BAf6njb=??F<7yp%#{A!51GT_G-*TTN7*?$viCXXEXwdx4R1pm9c^voUc8-`p;l0S z!dw5mQHS3@Z|#%y9$z#$;FUdtDbl}5-DNf?;=OLw0pCY-AEH)yAiIxLN3$b~n+|r9 zOr4$_)zEc`M;wSy85dalXeWmRHp$5K5jU4F>Fj2(uFpcxy*6_U71|q)n0ad3ho`<* zJO?gpI@3k+uL_|e%9@&?N^_!Rmb~rUYsdb<8thKhBk*_w9vzZ7q-j|CIxAq(wU!-6 zjeXx*v{kKHlaW|{-umRez?^F$HD<|@C3BB9&x#AHdEgmzXkj1i0aD#wJU47^ejJ$K zsd^$&YUSZcDsFp&I_xX|6F6({0Q)_`Ik$XV*22QJyVlAa~=BoNZEA?eza5iF<{YX($rS3%|;LmYdm>H^^9TzAPI3 zwPBux#B!aY*Col6W7na6tB?9s^>z_k)h(>Xea-2Z(=&bE(TE|lCG^{c z$XyNJD)n5uyzlDFT&?>Wsl(czEA({i7BM`e_fNSC+a%v^?GW`u&hnOyrcnrZQ^gqPu%zQ)GKragWeriNPVP+=X_3%_V5$%SyU)6rJ?&I;&e(n7RI4sK?GI3!K%^t0T z9As|{d6yHF^Tc$Ew2qg2kGtttYV_S_es)~CwI;_jCoSiX>59w&yK|13KE2VYF|%}C zea^b)0~dC;*qa~MH)d=Xu2U6?O+|FL=+%z`*m;*3=w3*0#0Bbef3ali z0_j@ib{PeGEEH01whnpj9USyFR9a!MrkuURrmbEc*ZOu?-P7Ulkv(3izq%X>_KHlJ z7Lq8D++oiF=}Ybl$D~v&-g3v^bnEAw%Bi02Gd~P4L1+`A2du#VbJuP5wkife~bbHa>Vd}j}L<(Vs zvyxs$kt8AZVDhz;UFSyW%?p!h3fzJ7MVUqLI#~Bv?3TZOL`y4@)1;;EP^_&=-vW5~ zG;P-BmwPW8n^{}zwacDrkt`rn{H9_^3c*yOFc#E12Ky8K>Wl4PzEoWIK}BIqlRVQ_ zx1U`-tar6ikGZa28AqV6KM2%-;h)f6V40b=QV9tOszd!YE6lIop^)`1FwyCH=kAg& zMz(4LJ7~1G+ZdlD`Dt6)0q31Q2h?|dcbe#;mon0~TS}z2eoFE3q3#~5dZqL|BsFN7 z?H~OEkq>9J({T4_mbutx_4IWC>N9O6=Z{cb_adsPRP(v?!qF)w*LSa)+4?GOMW=R6 zQBNj!t9sJb68E`r=DME&y<4Zm)=2+8y2DF4K-!^e56NE#YW57h_A;}zea`}^I`4%O zQsTb3x~qCAM1A;@q><`7GPL7S2_*@)(8yv{XWxzO<{XH0*l8N@w&Op}6W>IBnR<0> zzDs83IU3F%FM3vIADH^aG~n`%l!G4jDspr6KhswUv;guPP)T@;c_gGEr&`(r@c1`n zydgAgUC(W?L$v7hnc5?KNt@!+pE2xIc!W5v5+ZMIcx^V^!(%Y<+)8(&ZL;phw>J#% zCV+5L03!+Gx&cKH6_vT}g9)Q^lKMX)Vvdl55JKY&^_wNGc-rj^rviXkoAs6iHmn(Z z6rhPfP$go+FGSwVTo%5gx;mq~%=0o}?YlLD5s3v^+EYO9y?*`ry$$a4EC~HDGns$r zHMUEultY{RuEfM4Q*EE`aDMW7p8Iy6NzXfb&Ae)#IZQREE@M!wUjHhc*52NmUc0_} zTztG>PyEm*>8k?|B<;uwjl5+YJ1HgQ_11WAovu|n-?jT^4ZLig8uG{dTmG5&;CIfK zX4xM9K1?+tY0TI9KJI%PA|DS^^;!}tU6$1@-!wn{i9u<m1E 
zdi$yk`umpeS%l_EaBJ(`S7ihrjM~4zD&_T5_d8YZD{8+wwW@9XQ(*gC{@rmC1qnaP zKMq$Dy3dLn(E~JGl92vYcNcv?Zo(Wqu&vZ5wRi5s&2zL14+{fBM1Mc|^~xgkegg;2 zTDEL7OM2XTD>=0=#RCx$|72%puc78EfnYg(wzAZ2L7ddBds-1~pK~kIY_qhx;Oh}W zcgp;+>#bHEX6`Lvo-_ChcWWX#&9SuXi^?_6Se7t2-?X&F?6E)Lv|97z?QnI|H2O542mU2#?d$m*Mf*(Fp zuby);w({TTwRL=f$-Y86BW6G?yfkjqpYOo-*4c! zueBMoDo>H22!~Dl%Dk(fqXD!s<1$GR;lP9FLov9f`= z(qhmR*!}n-wT;|Rp8oq9busdA` z{WBxE5<^sSR*2ERd9wtpxvuzN5;J)~;}e@BmvshlwTcmM~ipDRYH2+;S7w-pqbZ~JgMSepbz_p}0ZK=YvGkY&EIwbq=&+O&B28L|A8Jn}vHT_0? zQ^)0{@8^fi4zq4(4D8VdH-en?(C(c&gemGseXojEQUT5rs3wKj8MkYF?)A=}<@Pa1 z(O9N?C)^~u-d50|Hy)*~(+VDsS^Ycs3IHJ@VQW4w#5$b+&6w1J60g2q1{Dlcf_x%j@_PLQ? z-ap(6Y(uuY02y0>K;Q??a5lVN89jCNQxI`6Od0{Wn1R3*^NZ-*wQtL}K`i*_n)3x< zbhna`N($!QdA1n6Phj>Hf7e18_b_n^w|Kw%Quc67GvP{e{e8*5^VCkO?EKJa8i!Qv z=AY_>!LTyNu6fCHuMba_x=of0R2A12qGR+C+o-TPLa=TR7s4F=e&9sI|2%(}Sj<>B z$&dl88_k}L&RW*q@EFr<3&iwN#x-*(nGw|l<2fv$OH&tCC-HnD_EhA0-3Zvogb#bS z*08C~t16!Nx80hO!o+|*g!A>*8X7n$42ijliVB0;vOY{*yDjzvrzO4Wt%aJ*R8V>u zExBG1P!(tQuG|6D2f8@SEVo!1DwQx*p?4R__S^pbkDKdf{=fdIN8b|{rT*`K)qY#7 z;qd;p|NXgt|G4&S>(?uDj-jd%k)c=vVk;X&wkZ+sQ;XYGC zpItu!Ng+BAAtq3X%==0JjhH3Ma42FdF{F*geCGRPCyozJt~m!gcIqS`e?k>;%R`_E zq7e_w+lL8J#@=d7`vP>udUzK90K-o1o)!a~Q6C`>m`>3^>hbQ3RY#sH5rG9h=>xDy z7V^8=>@BI?FLJ>NQUTExZag;K_zS8kDvMA_aNzS<2umNg-_43O7tJ!&R2h#YsoQ)1SgS>L(nYc8$-=z_E0*hJr(`T6;eI|t?F z;As_u*zt2BMXz}!J5YNQuWZ7mR!!JS5pd0~2nq^XbKNER4V4Pgtls#+ksi!>wuZGT zT5egK|2xQQK+kv7M!W&vmg}m-*c2jnW3*0}yVeLd>kqX&NAjc=sN)NweJK_}HAo>ut!*pCg;B`TT6Vy~y z>-bCvT?Gb+dwX943zU4>7nIaSvd@bGh#O$Nl4^!Y!~_GBicQU)BfQ{2lh$=lbzI?O z`}JDY6C zDyBGZO&rC4-vO+8>VRn2jagWM?gspnOpWEeV=4*ZJu^#T761bGzhD?S)r8TI6Q-Cy z4|vA&lZ6lEJBSob@%}^@Q&Eu=zF%q&CMRE?DL5jbts(0|jGW_^VAbTZQ4{9;s2+og zONn4j;S)|l^0Au?I(WJqq(bB)_kf}uIeGFB7NwYD0Ld%dE#ns7&A|hYI-0j5ek%Vf zk?5GZN9Y8|PsW!Av1R~HeY>oztUAryq8qG_=nTY~K+htGFobagC~YHwibb{Q%e`5O zN=kD1`4?Sp`H2?d%9VeW=R7v(vlF%nh?PjY$+yp3PKm@!pD9-uoQBC$2xY{`rn4Sf z5dH{F9Dh|?stSrs%zxUlWeex}9XNFn9+sQC4Q>e}s+3j9Ikp6m5IjW;&(i8&xx_0d 
za9tI37RYeU0JIKaM~|AJMSzOP+-<3`G+jv7MCKD7I!X)?SwZpp>e%{cxlA4+t5A$! zwT!<_8W!K4cqWapNOZYO(c`|x(03L^5{J$GE6c2yM7E8}9BSek!|a%_Foe$v7%!u} zv^U6ub6OwbQ)?&a<7N!^%&m^_s+}pw&L((IPD(~qIX5|;} zHGCD%M?6w|ox-F=oZMlqFjCe=p)?TwJ!-6%A3g-5?qFDjh;X2q!CfHIhs1amtoY&@ zAY;kJzh0VZLXR12Vj@SUpmyBwr7=^=_zh*8oDEloifh#Lpqkd<=Ej;VT}Gsm3&w1e)Iy-P;9W$XTIDq2dYK>Gzk3PT|cUt`aa?BcHg^rXS zFu(*CRmd0Ki0SfF$D+G;BY-{?14yR|Q#7uC?j6%(&;*M;At6!Pa=}+%s+Q$!ZD$EX zX^k%ZI^mZU#Ep%vfBo77xU2$u+AxF9cgI z;n9SgnqrLz{8C}6Va%GgG&J1y1y;)#EkcxuDXK_zHp#n=a{7K3zrax%6VzW+QW8m? ze#d37GU97Ny2YW4oz15VvrUNPa2Xyfqc!bGhR@m4_m-ZfDKqBcjNs$OahxoJsi{DK z7Du%(T2<9_L+gv3aqKT+Ru%f6gI_>fOkyzJ@^{}S_S$gWMf`4dj!L+ei<6U@ahPHN zHF`zB_ZnsA^wowH4PPrrd5^>0KAX!U65UoDAJlmR1aTa9J>tvR8$P5~o<{0w%Uu_S zPx>@G(rjLG1eDonaFu0bkX67VDqeS4)$~;PTt`^Egte8<>y4=+BiBwmM%LZvcW4SD z&}nMOcJK6#(<~fm(Xf%q%HmBwNndknEdxDF>C!A$uDtLCftM+-`v$j%BFSB|S;yzJ zH-7VqPf4*fF*A#xAaf!K&#sIt(c)U+@wadrc0(UY+gc*9pF=;IU_VE;#)GrN+!A2wQ z+`c`VWS1}ks8=-2S3WmV47bvT_ZvNF()^0$sTG9boo3O`CTcc{AzpEaLoEnPh>kdR zOg%Cx%9LG`q)pj5T3LD4#4UfWt!3+(PWk;M+}gomblmCFvvK-Y0Dg@$4m%B7ZAuno zIBF-gW!iGfpm{9;GU9mZ4;+5>=oBR0=rV>ZDb@X?%VgnPKcRLoG*m%skRO~N;u9a6 z9N@TT-@XWPU=@rlOHvNSz6d{ZMAhI+`Sc1pAPuqM0N*RPaN>A^#9+)7`%{c#0q;<=EA3}@S8?OJ^k@6V({OLcxf0~MQBDORYc04hT zmL?`eJv=V$*t5$uX*x6WD9_CfX||(ug6|_SFq?oFQkDg4E-mGV%HbIBn<(#`~!!fbirZ(1s9;Ovr$;m zrL}zdB9hrjS3iWqpnBK>o%Z(oo12VfcC>;1_~)62|wl zPng%h6Y55TBQkp)R*&o0BmDf~w=7RLQa{iuQ`o%WVwMmRQA`J-4x;EJN8=60V18}w zP^t`ma5}|-_$=IKII+d_2oChw#>T@&j0j@U)?pb(Vku0hoFhD&M-D^TsaWSvQk#em z3|d2A&ZYeVUFrp(Ca}EzQlb>aIUt6A(3m1b_)Hl@zp*1cyq{q9arpr%h-RO?o{kJ$ zr0)?ZN`odwly~nGl(8x%c7Q-(S^C7rPU7Z6G$1$=wxXi(B37>8`3ZfIzco?Zl{+9>wU>GJfot|JH37!0^rimYe_?z@(q(jG0 z{PZ3)XdjX)b~x?ikPUB#aXF)%UCc-d@CCw8$V^hAaOP>r#@qrm6eG{Y^x|Ga#F@(U z;(h?p5W-!1^j5)sAIdoqF@xaJ z?C{u*NV~X%FY;dyeU2=z?#-KgW+hjpdJa@5))Ip09XnVO%hoFI-?QiZ?<;eiBlfM8i_2VstFYILGK^o# z&M`7HY{v;E7BmO-sYG;P1=MhN8Bg1-;CnVL6^(N{eir=^r=t$YdOQZR`K z5^2_GE@`bQ9fopW6%us7I2E3*Ony2&pVao3J5ub~Oq{J^j1ZftBEe3BI(^rkJ((xe 
z9$EC!O7takTQg?!k9aMYIp9JZ^=vlwBhzPxP>JAJ5mCLoOf-9k2+@}Sh7zkec=BuV zil}eG83=;BndA#41%({H8#D*(#KPj@gv*o3g8_NGNwp^EA%q&7Q>5PoQJjmlzE#U5 zo|>L+Qy6ywI2v}f_+f+LRroLviVup5dfdC|C^Ge;;~%Z#Vjcp= zyj$ZNZedIoB8Z=QPrMqqp8w{gfV~s4-v+MZC@+zlo*0=C7K`Uc^Nc4A0CR(c7S+ z;G-)wWdHrM@x;aPeL)L$k?GlDaFkx<&fbC4{&;n$?EJCw3pqEn{|SyBk(&1Yy$MDr z07NCXYrH5$Wo0Xx5$-fPDXmPucxx4wS{u5f_g8wcpaB*!lNku~!PnP=j)eKLJ?IC- z0Qfq5^4+P-8Q{yma5 zZYypn!nYzKe#vCu-HAPkb(<5G=Gc(-oKM8Gcv(VO&YnHn;zvn4R%Ih+FhxS)p){0m zkraIo;x#pl`?7PyJR+pCEKUYtxP5w6nAf!++WiuY9)lT|ti6?X<;rO~9EJqld%Uxu za8cln5Y9pVweF2uuuC-aW9Pb%DL}ZNNPer`^tlHg545LT#>Gy$X`b}hSI8+z)`0Kp zt`HJ}+FY(|pp48e09S@;m;NBYXD%B}%+BSMpn|wBfSfte|0EA85C#(5X2Pfg z;yI-*F_@`O@z!#6`sZJUP#|NQLeLoYm=uD8{JHSc9nwtk_Mt0AkE*%h%Th6pf-*gi z@(cGE_PFPDb#A{3xPD{kdk{oLthhJB5Zn6xR_#r}im#zM_fLzEP0sm}Fd@%}eLR1<9nB-*ElLlh#o@ zBr6XMdJ6u_O`A3e;7nXzOu2Wb)so-@?@lH;NCUNm4j|^tK*`= z(E|3Amu@M%Qbgu#x;chciIQ22wczgMNP0>B><_9~85VUy| z$A@%056-*D5FQDouDhsAg1={*bUGK^Y!z?K(fR1zq>Co5>WHPeEJQXTr86y|p^;Hu z4-c(iBdaJ;Mduze( zL5~#o>RA7?J#{~SNdTc3ln>+ZIKKJEGA^3M{TKd^r;f1Fv-}cD4`D2NAoJGAYVkzk zp3fdG+ukXG9gdw))FfZUy@&cjN-4AIfL^;K6hJ{3fu?Gy(0v*q>Prow2Sk4`P5~ z<1*C- zg`neFad*@d5fTlWO}FWKH#QSDlIWEwW)5q*P7AI|T&2CEA+uNDmR`nT1*N60u_+E` z-;?~^%o{zTsJY?!w$4KCjkSfTwOV2p);^gvHEz$|{;8~=WD##IvVB~aEZGTi5T+=~ zLk^$Y)Q=f_Qwn39qD8Npf}Yp1F$8i2;$UuM)JbSJso%yqDy~?u;xOVx&_Gy;7Mj}2 zjQoY@_N{SywDxcEQHl6&5U_15G1P21_wK>02vxNCl;7GUY>4VnH{#C9tG#@a%pd5y z>}Iu-ttR#i{a_y7QkELuxNra!iCb3*&U7wjj897t#8ZU6Y-}{+R^!mwK24lHV8DF% zSZfu6P&E(dqBqvbobW)J_;@8*2WhE}(0h3l+ngjk{X=M2xE6lZ{{20aVqU9Hh45!+ zXv_)&=u=~esbO-ZyqgDq((FelY|G`#myNI58s3_D_WmW_g7}PXAqII+&}K!>1m`oG zex+GR)5nkI7)l`CrY2X`fwp=7{rveecM@ohEP+r!YP3XoEeL}5Mn%~H$w2HV$zEh{ zVU!w7ah=zCgZlA=wTeF`QvqbJnU6-``t^X+H|0D&niM_~?~VVFi4j}}x8j}CW+P63 zj?k-yp)>}HddpMN+566UeT~OH|CLX#s#E&qd1Y`Wu`gvKv-x`*&D8HV#e|=@EBrrb zjbdU}y`1JU&^lwI?F+7a&`^xhd|_cD6lw2ODI@7r&hXNdev|f>`Pv7{2sst3&#kx1 zV(-5jg%$MkWX}Eb`kndtB1H-AeeW9QNFC9|=dHJ=5@S_CSjgpb@qZyyQ1QEcS^F3~-`C@kN`v=9pL3NBbB^DNcGohz 
zmCxd2JUrkiCy3ZnW4qRgXU+gP%KKB6@jNdo@^GQh`Si(~7Y+Ew;Y=s=U&=0hw3O39 zp9DU?_73X@(j*lqrKIDT$B&*QD9z|mz#A3Wy@$HfcBSw&RzYRW$0cgtXYgUGqdqL>(0oxIfW0!MF zl#ismW$jZ1Kp9mC1JPvW8!{esVc6CA<*S8p5rUu&K=H+mpH1WZ4FlD#(ILy^o7>zy z3yp=i8`EUft3N{xHuNXjlif()ZmVFDO?VSbD-|Tlz58c@KQf)G~J?AU)OlZcqlgt zclJpxeTdbt0|(j_T<~P*VB&q&uk(%8m)%&vSdFh<8e}Jpf`~OJmp`gt4LM@rx+Z)Zp{$@a@_qCN1ns#0Ygr2F;9`1o;qkb2D+plw;i$d7 z*i_;2AyC>?-T4e^sT7cYT-?nhtv6$h|F^W~$Pc4=1{u#cO+jkW3;} z(8$4nq-DC`I+>8xQ0-g986R>{E^@_@iy1M9L0{IFSyJI^t_OvWptMIG)Xk^vrbr) z1J#Iomz3qVe)5b=+$DiIIwTy;E zXunul?-(mX+vEz#so!$DA9qyW!GnWgE=0#rl?KkxU4HyNnoLm^urA>g5Kxw1-?(1H zo&rGdviZFt+DdRO05hVq`{(A9E)eaQ|6*u*Zv6S-8NS|$p3@(2R-jXDT)Se=vR8CG zA1&mYM2&H7ZfQAz)=&`KEL}??!-4hvs0PIRJMbc8BAfsIUXQKsCdQ5l!0|xM(*Uw@ zgc*m~Qw|eniM<>v_yD{Cxnu#&B(t2$?H0`fbwI6qtq}Zk;;@rM4^DSUJ)HurNnaqS z9;oa%cqcxSCiX{;ZnDpfZe6HV)&xq<SE|J+p?%gytg=pcaXsLYw+c1LQI~TdTuMRIc zt~J?;H!5bLAOn8H{%d^Phviu3BZIJyHSUi32QxNv@?ZgjI9m=fJ90IEbPMZ|ueSKhRY(T~(F_6y-R2e&FFDa+ zTtD;3ry|z@qCk{NVn__tvweztSFjvEKfh-FKA&!;kx>ezBPE6j7>(F;M3=BJxkGGD zfr?xTJ#Xe3_;fKvWo5~?R*LSI@7@_R2v86wfK7KefPq7!SQ@>koaj@z$ro`EQ=s-8 zH0W-`Xse}^1Pa}(xM#;MuH#9#p%3H6?oXjJ$TM4J zM-EB>&;Q=LwZGUaEU)IlaXlm?wn`Y!Ftkpl^T$GWhsENtw)1(CstLY@4q*hJA&0V< zUXm>TbL7$q{5f{>nvw^95O$%-9f;qKc==YIiiiuqH%9Wuewag^KDLzCC|}+Ax-@jc zOlGQa-TlhNfddAN1+<6Z+E+jRe9F~F(i^TnlZiPL6B7a?$_kxAz<|IViLZ4Ux6O?0 zE#__s7175nf8AIy5&}oW>@V+JPc~{BuU8~|aw7^zao-SwCx^D0m9GZ0eF?O2V{ttB z@|wpteKJO-Q5>ZqxIS7SuG*Y&W%_PwQ&k8#hCjhR+zD6=6sVXdy)PU|`msd__`xW| zcwHW+4B6?&GOrJmmTrOdlD};Fl*C^aDxOgv}yo(_$y7vQ*Ew0N>H)1QMfPMpSq}gNti$Ji;B0$9frF$ih^YhQ< zDR#jZvU&3jt#u#Y4~(6Ab_97s*mDyrH;$x2fUaMN3EfQ2s**K}4PTh`tE$@0xP`<0 z+}nPx=>6)EHaZXfo4vjJ;)Q3f{5cwK_-sm-tS6^kXU^;h4?%rJBj`LwR$WC!>`B0m z@J|Z9yTOlu<|$^QwU>F{zhum9PE)QGax)V9oQ$)kd`jqubR8~6e#YCcKRj=oC24`2 zg`0(i@1j4J(cKHZ2dCVzx#t(`Haex%cCfnf_WFlDZ$amG?wpnSmHIX?&zx$C2AKln zC4;;^Oge_-!_Y8Sr#8lBSb$>^+LGFdpWA0eqdDWlb4-x)@a0T>Jyt2VUmqTpy$E){ zv}1>beaHWi5b+=73VrVX`CtF2ur##GcANhlUC}=`RsNSIDX`P-|Nl?_|A?046sNgL 
z|33<*_J%>Mga3oST*XKCdG>zCBmz+S8;9*HhQ8ohrEB8p3+WJz2J8t63_3gmt~`nV z6q@_~%Z37il;Esk^$A!DmV`Y3FqBV6LLE8J(lQVT8I)V##r%V1{Ath=7C2=V;)JtA ztgzJ-jm6A!){%(OZR!oeL-B~1=sEwc>2(*$uSQ5|0G*Lp$k*1VO((^drU*#pbm=|uXqU9x-X3iH#zV=B96-g{{wCG`{TC74k5JfF1rhz?gYTEoXfBub4 zdMsrE@N%l1`~BKUuP=xh6b+VU$GbEw1w*8`ImLZ3IpD{kEWN*7|1rII_+4f4P?7l~ z$`W8FVL3)uC-5rLa`gg!KpQu@!e79Z27#;}p+KW^@h6v(zF0uILcYdrAX)*A0Tg#` z6o+WYNbL|~_(UWR22wO17s(6<&=zy*+hM7%r*v^gsLXA$gKNiSbDIaW^ljKbQLKo; z@yNTze;?Vedf0y_4EFukEB9bW;BE!Gq|5g8zQ(>}rQNMj&)sqq7DpgY6kKlq`M(I~ z05>}>M5)zt+R7BnJQn8_7Vf4CL#rns8)K3oC`s~Amy-Yci5Dt#CWv;yY{itSP73Ih zDddHX|CD^Qh!R3=gXpP!EUG7N@SZ@SAwvu#4r#i!2Otv!59K-s?j>~^-!oh;5~3*M zM`!^y18j)iLDWqXkCj$OnUxoax|EwoshFNUcx4VJtMH3~%vDfM38v=q<#F6+zy>0Q zh^})x$J{+rW;BK!JZOYjgMyIN|Kj!QJSsLGk-(qp;OsHve2DCKS;6tH`CNT<<<~^3jtghr_j-L0$;M3OMx3>+0vMkfP8f z+ou|TEiV@iDg?SrLXM&NyH6Gc8=dyzg`iX^ValiXn&;?PL_anG4t!3u_6dwLV)!_k zFp4X*ldsVbhzX8#AY;8AYYGIDZh(O%_hz+x`qbA=9zLoIWhzbFYbveRA{vb5LF{}m zGaMKaykkFqf7$IJoNgR~EF7Jl?n$;p*No(JlwjH_tny;-F~DPWEf@A5`}F<0Zd;PI z3L!xqa@Yj=nt}5du}$-$G;&Y`h(3*$tn579eddY10$fM#6x=3p?_4ry5OCb1F zs6s)aY=Ye&VJ!ms<$t&yCYH(a`NoomMK;F`Hy|1sTY0K_3LUe=D`ARjezmSS9`MaU zu)3UaB>}RM>lMVnA03^Cts7~wMK`6ZtILa?z>HXgBHRznxG2RYf>z{w2EJ(_4olGD z+&L5#jIKMxeq-$5Zp-*rqR*nxT#RyA{0o*|Icx$1sgPi|@sq@iaJ;6it$u>3qW!g> zv!hR!l{-N}CjV)5ZFc#%37?ig!T584p%O$BAm-6ZY{Hnni*@vzF1 z=Ieg@SX&m@v0E#$3jkeLPKCVPgZVaM)RGt+#g>BH8-YQXa#cK3fStPmdUy9**vm`c zfn97qYOI06}Q1(075^fe-!oi8LS%sH5{zB`Jh5!0;i-8!@P;H z47|+#6w7zGD8#Y^nh@er0!nk=7I8<0^dppm1E8JIeez500DUlVYo3Kg4$3(3G9brC zgAa1uJB>;VeENF(Ko`npj&Pi4;cKd&K7K4Aq@x@O>4aUt9lL{F2-QCD!Y4e4w7a)Z zOzovL0HCO+U3R6W-yDf=T6u?+PE*b&0Nkv5ElyPtL%0kqLb+kMCctVbNu8-FddFXu2R?SkHBc>xqRJVK#_w`o6`c7Fzc(| zR^o2N1PA#L=aFwE@&<08f1MX^Edl^3yfX z;UPkuE^rjT5ltcEi21~F`599!Dy@T)*zFfFWQsvwT=wU;Z??V%JP4p`G3wQo(awLeeimFfCdkNQ8C{15#2`r9ZATJjT2_3e1y~up$_e zb_JRBKHLc{&H*}tg9DhCh)qoFj*gfEc|TA+k63IcrQN0|#}Pxc%eix>BhcKbC1$`J zqIK7D`W@9rDnV9Y7aZA~zvWzJ&-~vEuFGUgwQweX7UNYN9rw&VTgUE_bhoQPE&`o( 
z2Ti2xl+F5KiY**pD7N3nZWy$8hG{zE0XUmMHe!H+zk|BvQeI? zj^U3zai2z*cqBa?_K%-o09QT>rJmXpG=G>riaDhQn6%_&?Ml#*(4|st(H^@wU8rop zrO=h5k?YT<+dEMY2}3aNu1?&PLA@sTmaXadSBgU(uOt1g_@u(Z5d2hk#ghk$r)QtA zsE!!8m*#L@m2{W){jeD@$4X?Y1LqH$Fd;qfLxPg+)FFh5tGh9ZPDJ-mK?>52)v_<6 z`#)xu8*f+M-A9vAPLg|FV^Jh@m#o_%0mo9O3yh4rB#%YJUH5GB&lOM^5H<3Eo8UMt zP?@fto~9fD{@i52`L>Lb<97%pu>?muGL)rPObmD-Mydk~#+|gg3d6G3&c(kYuhwV5 zc*7B|P#iIvJrM2ho9MdV%ZbRxi!fYY|0p}OI2{D!7-zzs8Dj%Gg)rZR-7D}qlKo{r zcdqWTYSk)%k-$XB>MV~~sZ_GbiXtgVlx!J^%p%$S?#Fq3 z{`mg;yKcAh_PIXSNxk2%*K-`lV;u{dhGV?vMv(fyt~Ri4nW?AU>r90~juPj<0Vo3% zOc*iJ`bKqRp$Ua1FpcQxAx8u)LQ6SrTcm%cMfVKr+`-$QAHFTq{!y+)>Tk~Q!dGbby3j3jY*Gp z)sGp}KujT&6ms;6I;ya+Y5x^^Gq%YU!73#*W+h8o$RubU<1kGnPFc~V(e_B^120v@ zb9At}*h}F_8xXdO)p>nj8YQ;|T!4-D6)`sv5`^pp!U6jU}TO zxZLWOL7ArRPb+=TM=D+NxFA3O75sELorSou(M@*YVz94@#6U*1C}jr95fZ9P88TQ z20yO5kVQY?FJN=qAfrT>uz_=bKPQyK7wC9R?A{-OpXqDq!+wK5&>nK;#?KmE@*QV1wIb~=GR+DI z5=vQ-mD%e=F50Uxjz9Hk0SC6r48PPPH4b&@!zVi&`E{h?rfYv!s-z0#X7eL=nf+3l`KMWCE)(!V#K|fvuLWp(YRkuM~7$9Nd1`Jta9hBoHIyJXpV%1 z!BSX6w35%On#yH{x%-AXe14O%KRBf}-M?`1^@THj?(4YdXwdmXKB?2fOycP7Q03=w zUD0D7WEJHMSe{_$&iAGtmmZ$xGsk0vgMM?DfCZCBPIlPWywugy$VI<-o1OzIR{x})0@vr4PL3q3s@f#A|uAALeMc0QCq}j zunjP|<1It&xY0^oN@;4TjZCE>J2MwQUlHyHIN`#ZupXs5gneeF7WNh7s4soZBD*O1cfXUyvv{Eui8Y#RNLXp!Gbb2Yo2-nuB z&qRA$+*$yp@PlN*$u49Wvw17{B(7+l$g4QH{tFDGRCWL8yY_%l^5ozW*g4druRenX$UTEBXe}f@ zbgKlcfsY|}5a zb(jl4Vqk#`;wRrib#fi#WGK*>Uk68&k>Di@J1g7A>)igEH=uR_(#jC_S=BP)mi|w+ zHQAUfcqSyN~Sk?fBzQS0z$2xKhf2W_YGf{G<9oT?_W5QgvF z%Bt%G+(gkzvp}wq0{@ShStBlcdY!lDTg(CCKn+{sW7e{f|D|s7va4fZ?1uuIy&Ys3m`eJ}E)2A)EJ=Y0($}dU76yod=EI6sP1W;veb^H2 z%S7JeoB1uePxKJIEkbW%txZt@J3m^)z%!AEKnB%Y5iW>=ML88Uj<3QwIpOvUqs)a- zTtf{4Fc4Pljy7n{5?!BEU?r}D=B-){S~jWP&xYJ$=1Z3j7U*MY*lW^h+&Lnc9}Khr zcTcOm<^67o1^cn-GKft;G{|Jsj)U`V#)3*>@CkcVCvE!5J-}v?AL^cbFlQ{kAzY*6{cGcClR7MR zd3-*%ZZJ(bpF@ay!xbl-A4zbLq-Rnz^R?FWO`&tVro8}`6F7E)iAk29bhd>&RltqY zH`h;ee@P9(`S?6)rso8JVNOGJ4UI$3w2m(jfeS5gPksGxYyeNH7u`DC6&_W@;6fzD 
zd)g=7qx^UKGw*-n@VL@TM^13~TZ9~}Kb^G2&+S5ewr_7rf1)~X^hzXb$Z}dAw!ul; zi<_4fvpcP|AS|FOhIlnC>Q7H8+2v8oC73gJaj7ZezTMqTxm(_WW3T&p=UFS{C&-{f z=#CZ-DUV`(O0U9$+I^w-n!aPzI6~O*Cwngk+7t7f^?swSJWbGipiPZ|X?P2OUtxdW zcN>#n_kw|{D5Icgt>SOZ-MwRnXsh2nJeeq27APj`!Lu|;*iI>M*h2m1^EcG^f(vuc zJ&j5gbdrHZNbs}<4Jw}dH=`>ZJ@-2ee~#obaI(yX`&<{hllWmz(C3_6xo^qZrxdTgOsuNe z4&Bq6GLMdPJbvuh0f6B!N9V>{l;7l7L$s?FuaBXNM+_(qZuSI{PY}tYe)(1E zS3uwGX&&&KS_9TD>*2s|ENXqUMX*WHfo|J3T;akN3O97>22OKI^qfN|8$DUH&#K&A zlI;ios#?_RTU@-vCodfuq$lET& z-&h{m_gT!18+#8-blK*g*@x>FaTlDC8K<80eNNWxd-oO~ov6QJ=|C;5!?qXNwfMK& z!_#vS{eX{S5p|8Aj`5@SQ*n{YP1fXqWf6^oc1z#~C*(bf@K^eP*uc{d@qR6$?7e}Q zU)lLsu<@S&wbijI7bZ94V+LUKNzQUv){d zrzR=4ypI7zCRY$Egp{icuj2Ll_Z`92EXHuN0I?2eejVWl3jgge7-A6<8k*i}cInZ* zFp*&()`gUJq=qg*WWr2G(lyG<4Jwbdz8biV^asH2iM+y;uqsM(bcI_^YCPCyLXSanM&6-8?6UhC&v>IoeaUJsFJDyE#mII6Sf0rY)SmG=nj8s zo3&{pms3g9h@nIKFZ<*YpL$nZ#Z-zoyE1n7RVY@aPvBN=MylqSemHrMQ+xr;I|=y# znR0`pF*vnH|IRi2j5$u=nhM?@Z3`j`z6!M`e4|}2%kJKNUG)6Un#6VIG>2T^2qc1d zZvWUPje>SFEm|$}jW1{@vAS>v7)-lij8=c$OtmZPl?)jC954BmiA)`xHLJp6Y7^5h z&ozM-{{si!E42t75bsCbyxpOZSEBKR6(fcZ=Xmt~ z(!43CJU_GyxI+Rv(B}vlCaA>XAyd028E}Eg9v}ljsv-7S%%Itd)LXzuxL6A|b?8H} zCY?Q78w-)oCodWXi}<2Wo{?+>s^L7|6{<~}J<{xc?dalY==%94%NjzMMhIF%x0OA(*Zr1;ZE?K{x`RrKrTAg5zUc zIEeb#@FkOe&ptf`t;oYiT~J>`8+`fx-3x;fb^d=KCB7k`EUehg0Vv94b{G06d53Qc z{vK@K%m>UthV_AUV~o0bZqX~JlU}W1s4tKJ;M>!`TVlax9JjZ!34%oCT66U%(I=|X6e%Cz{EpWTI!8< zikUlYiEmJx>D8RW{~j;>(*4i(&JP}&PT7?@_VzB%^94tz9rJAY?PE(vM+OVu$t1I( z@w~@D37UPUHrYEO`}NH)fptx16`JHnl@uTzpdYa^rT&ttl+%~-5%(Bb@&wOZr5 z_g7FF5;GxVVg2xkIQKC>e@%P1|6mU^P`@&x>Ve%H1l$HWCz?h^MhLe;LkIiC)MP&3 z#Whqlv3TZcX(j*B#iD0E7HUdfV%K)hldU^z=jc+jYj34sICS)UiBqY1$Dzi31(L4qdD= zl5BJe>+r?VaMVMoat{=O?K%I``z2H7K5S6;r=i-ah0e1_E^5%Ebx={f`iQAZd=WFXWZrXIk z?w2zkoNPMXf1IOMhZY%6D+;p@k6&1+w8yad&z}oFUn;BA+23c#+4EV9qfh-dc((5f z=dW*0G+6o1u!~XO{TGtPo_X7(Nml!bYj!^_|2S}GR!}o_W%(UYfFF+THLy zsc(*6=yJ<9v-acTF+3GCw-`iG1{VPK21zOu8dTbh2h&uWk?Vu!crlne+_eHP0K^|* ztTM%3moD=Sw<3}NJ?3nQJ!egJLOXP(F$3Je5DXrUNiY2hiEu2iE`WAG%wEXLb6!#J 
z2~vRwLB@-TLzyccDLEda0_xG9T#Zy_l8W=}=){uOFC~bD{ui*7Ca;=h0%TOkow)+V z@D@^Uo_C}3_Vqm>MM)5gi>gLSOj}106^e%X*|UAnv5?aM22JH4;D8_O?`;0%xzxYh zlM0BaHu zibEXLTX%4J)OZA4jssHV?6#<1boXvJ{XIX-{#6Dgh4`6)hEgLJI`2;^Ip(%5HOa#K zrsK^`EgoE$yflR1-^jDa`aA!edcy3?lqOB`EE&CZ!lY9t?WsLu*w6dqm;#RMG=-cu{{sQY(tvWkC^{*_~tn+)n^yeNG72gYm`-eC<#mAQ$ zUHs|QoY;t{Q>S+9j5=RdR`2Ie#aXvMX!Yq6S@*Xvwz^tFVs_JO4KV>?FcU6Bq!KkX zbr(+?+B=3bfE9}OtTem#mN5myNX3P!$kA;C@3y^p6HBohtG&mDf z46Vk+ocI6gKI~n#T*8j{ve8ig)}}|$#x>emi9Lc2XeWKsv5e5! zq}14UxXv{H&wB@Tuis$Y*w6ER*WUQ*<<@)c=oV`;())DuSi1C0?9H8v_btBt`N?^| z&EoTC92eF>B;p#dSJV~s%-O6Jm686ZY*$Yuw228!GACN}3x6O_x2TQUhhkbt5spH> zsW1Y!b{VvUyg?-O1A18`J=D$o#kHuMc6vYE1Dipkyq>x8R2B@Avx?MlkdT^^l*zr& z3iD*Le}Si&r8#mn+LSFdH8owdY*{acEML#JyVI}g35fB9YxzOIpRLd2(ZGn>4DY2o zk14m@H+y|lP2KeQUS3{sX2)S%Z_FP!D)JvSdNJ*sePQyo&87eVpi{l8W=$tyH#Rm_ zz7sG}vsh~jthjV_ZM!$HM)t5_5r?Wc-@zxCPa5(mN#E z#^)@aSCQev4D6GlLzjYF{z2X)F^$5mfYEjG^7%9dPdJT{R6lUw=T>~xf)(d!zd!za zgDkC&&-{S0Z|?$szS?HGXSnA5hvyCUefXde6r@^JI-`QX zAN%Cu+aJO8fdM6}*XMq!4CE6;(7iO!ZrVl!{UGaluMPg4I8fy61n|j97id)+PQI_l ziu(Zs^*RV8BDBD_E2({HHt*q-Kri#~m|MeoPz()SKRwC7@vh=yXl&Lyy05bd$N{eM zas)&@k;j6+RzWZ=d~-|1h{1zk!xLOxy91Qa)C~K>p0q#e5(~xe=H;~^GjkLw{z2MR zQ5t;8ICO-vjDc+U3KmcXJxfd8Etz<+E$T$GfL_c?OFDHoVi|xt$2o zd5vOT?6YXu8f1PQ>Y=Z~;tp^0U1N)cWgv*yFxG3FVuudq=>JefH{^i#q%e{X(>Nfp z`&43J*3(Nx=l#{GddX;CGCTbCt+aa+o7M%*7(d(@c@dY7efsphY&?EVv(~M}V!vU~ zy5A5jTT~4;fqLCJ=4J|}ki!R~e_r+Ba;n<558aNB=XmLO_s+w!6A$G(rZ(HZs=@QX zww(tP!Ps_;il_dIq>PNvQ;%A7xK`JB$cE$Y=D$9LYtDR{q-{Id|L^&vZMqw01hm^x za&(&dV8^~AmRb*uqdr>rGAg^*j; zFMeogj6Ui04v|EC%wzDFmY4j8J09HvYPb98g#|RS9gu82x>);c@}T)$oOJX*NI8r) zQJiC-Sd)&r3CLzIzXa5sKTTiMEib$@RFrAdt~!bacnJZWQy z#T0xJ22fI4pfvv@y+NKWwsn&`U08TTt8{G&*!-AGjDty%0hu)?Q3)OS6SA!mlLTn?pyWEL`+1IQgB?0SFq&MOr7_&ZxRS z5s`f+iztw90(|1d(M@X#O5RYp@5WihNmi}8B)Q(znEJX}ZN+=tKp~l_lT=9URtvni zO)X%dx?W+})~<6D)!Ih)dw4I-)Ra=d@|(Z*jLU}HfPf>XdB149{aLt@>pi%zOpku1$A}S~Nmvt1i>6~~-32#r zUV-{y2ntO7q8OYpX#SHJPL|G7oF$fl7bkLA(L&NPqhh;4p$I_%-mpQ0FFvJ5TQ|EJ zMNhjB4+(2qQmRs_mI 
zPDY~E2d`3AQJMW?>v}Wi?#~@>rrS1N6sV)rf)6YjQwoJ9Q#Av7d@w)jprKK{$j=XV zQ~kuclBqY#eRb@U)>tO=p!xJEInqu|4f#h-aq&(XFUl@Udp%>5mjU}8x)pTXi6Bb9 z_|@96E03Q&n+O*DBKY!kr>lo8g<11V+12`erZ&bL5-}IQ;+{&aY%)=|D zTz|5vS#?{jI>bBxjMjp5c>b_*^n>$t;8 zO-+61p?jgf?!LQs-T&j`=fAf)_9`emHf+>CtR$um~_~BoM*|A%9xlK*ExN*X0Exi`nc`c`O`Q~jS-&> zFfb_RC%If$8pi)|pFjULhbI{XIV@qD=baQ3Y^odT#>JE)F|6aZ#51pt=kNPv`sBbII_f_Y1l4uPb*cW^v4n`kr}8^PgWX zKm&7baYY*mN;CEzrxj5|X-&A087=P?FD2c+eVZYb1{*{)V>;?aA`2((&hy@Fu=gjy z+z~GvNBLF{dSBvk(DBZ`b6~BpcZ)K(1C;7^A8o^5C@Si82R4MBlvdAFS@M9Pxv0c2wQ5UE7Vd>Y%N}@Am!lWEk05KHr>yw{SgKG|`mth6r9gP}s^8NC-)& z4>`|~$w~aPP+dfsOb`6)C&R^~qoZF0R~?Ki$;zzD`t-E_vZG^m=VeX{o+~G3mmEnrzPO?}!r9)FZ-+Xc%(Fdx zq~XHC#XtmJz;U7-O-kzY=xkv0kL9!G&NZ{F+O8hA;oqk2t-f-4*!IgD(pn;*#Ik|& zOohsb^0^n;HM@tD4qW|9kNv;+i>rcMVtjj8ttJ^R6G-uqOi6s+6(CRnNlwonKMs14 zy?cAq{q+aBzc@B3_sZH13;ohY2SjgMRq^l9>F|l;1rDTh6cJr|6f%x4CS&Q6HLI38Mnq zYmcIY$O>6%6qFp-L_*|(#dShvi4RCVq7ZN+`)FuP_c-F1W+{N1;R<~o*N{$Mxi{Ia z-GCHS4ScSF=RiIV#G-!4F2Pc>-KI|7A+|-9L~*>!p#AJ#`}TbnrH<$y;{EsQz>A$J4|eI- zNpwjl43)Sr_8KXm~lKXKGk_ z`uOAl&(-FB{QdK%2eq8)jVR+Y6m)^w- z>wdTOxv!Lbu5FVWuYH&Q^{bkM<9$s;#J=8>nAH4w#{4HX$L39KU3vd04yC$Uo0N@+ z9T5?2&o2j~vrV?sH2o$1kfBRI4C>hLRQ7W*=MW=5aMep_AHU~a5+@!#_oI*&=&Zx*TJ>fX~dLFb|B{Z$6JV-iAcdg$vo zriwYra*4HG3sgU-x}a^xxw;;(EpC5p^C-YI-jZl|K{e2I(|%C&-9q@cAgSeE-kcnk zB3t69YdbHc`;p1%6Sy1DkE3vc$~)A|Rh=(0$otNf?RJ05X8BbX$K8&%LqU<(?C{~k z;I!Y5KUAbE`1ZbZTfpC6k>71JH#LK3qkt7u(#+J@Y&<~APktj4`}8Zv@2b0mdW&b2 zMI&xb&SFc=2CBfA+*y`?mUqQFj2mUe-*!1Zm($23{-UTBz5ZJKa<32r61e%aVKR>G zZuRJ)C-|*;30ExamHvEfduvx?wbdG3N)O)tkbm>0aeNU?!5o|XTY5p6qr!;AIyUd| zU0VsCdV7rzF`Yhrf7aK!y19>C>kb{5RyRXf=x&tY~LF}N3pNO zy2FaTe(Tm23WNCjVHC*#uvV7;6cw*u8~EHA-GJQseIZ|i%?fnxEPUt#u=FFgEz?is_rD~~*bXzxQpqMbYfnGCUAy3wV=w$4Z zY}QiV2vtk~KYj~(L=LBHv{cph2K#(7y1>xR>L%uu)+cXRRsS}pe;GUW;vmRju;C2= z>Zn1k0tQ0^!x3jwO0|le4yYo@BPih9e0(g?m?074IAR> zgB%@3V+-LXJ>R$O!{>RaYrbjomxV=_wPsYE9V%jZsCE&3NOlH5p-83x_8C($>t?u2 z@KmM?CC5Z`RBx#z#Hwd*Zksm6ps(&;l|f?<6&lSMVai=&IC5T+jr!V{P803yOy1uA 
z7*_EqPNT|)*ciybF@pF}dqB-K&2YKhRBe@*?u!1UzevZIKsBIwKZN=5n~2~ynY=dEg6gcX;Ip5b(SGOI` znYjy1Gh0{iL&}6ekGdgF*Vfc;=#4{6DlmP0MeZ6d z9~t=#M)nF^lH+etO{D|a*w)zCho(DjRXFq*JaAwfd55^XLI5|{@Z9ouLkMlIbNt?9F4R||D@hNfRX$;f~L#1To8)4PF3 z=$plSTJ{}RmKdnSagdjH7MJ@2p0IIkvPo7*n6*70dwO0Is=a)5=iz2Z~ZePBXiz;K)Pko>jB3K(D z5XQPJI&CUAH%^=;i*nJJ0`i-JQnglX+Cj&@|4s*%wc$wUy%#)OFukt zd;9obs$6gsElRq$?@3p$Nl5kl`(CeVf$f|VENg+jxI;ij`_eIdb#M9IQ(uK^K8LIz z;3N7{htwPNK%S=_G2CJRFn8rRi%aXt46i@>P9~=Kw zu(E5PRHq2MnCs1MF8*2VN!p7^n5qHutD3MLDtP^pM*}k69+HVGdp6&Cc3TINmkA~b zb`qdF<;>i1n3!8SbXyL6_~!Or0Gy{iFSJnFP!T@RYW#RlRJ0;*dvb9_dx{)hxcpr4 zZ&IPAHYNr|a#(Dv3IHa+TuMoL&be=kvoFtis-`|re($adFvnWcR;i9a%vjF8wqNbC9-$I7}fMD zASV`Q;*Pu#EhNuHMMcFTpI9LTT8*^7p4oKmYdqs9xU5)gM3U~nq>pvA%KkY;RiGR~ z*a?iaB1SGr>ku_-sT1>VVU@(lB$32a{tcivDBR>5a$I>J;Io*gT~FE}WSC+7e!Ep94JopVM|uyO z$JQ6;+IBPC=+dtudPYF+;^WR!rYH~BsO!r?cEmn83*A=2lx-$LEYTZ=c~s$h5^j@I z-i;d+wdPucrhMt2$jS4~Oy@J!ffwX-$=`tX_Ym?|!6Rt6NL6`ToZOx&6y>oEC1}Tf zjhzm@l>`@vX@gxmt6-yeMf_gW201VZJ8pWw2o0b@)|q_NhZBVNU5pH~!#}CO2#eVp$ol)Ir*pj3iTE((fg&pg>-?)e zUWo^_oMG{9e_$kz8_--;ZfpZl?hi$qXCw z!KIZ$7}Fp^cs@B*1Exi^HOCR6SKd#{M`p1R?G(Oytdj$P<5UOF%>|U$x3VvnB4FML z3Mc@`#X>&D7m}5h%LO3DY@pg9{( zWx;~acH_RDTio>MjF4xSUd-+l$!8I}IZH|+szBqB#HtLK*$AYn2i)8mfkP1=UA&`r zmWK3IW#uoM)d$V40_{kCAYZLj??g&b>N;Oa6}g(g{Jj}pExB@7QgO!o3^po6ZXqfL zKpQ+Ve;y2)0W1rZXu0y7;&p2}TFN_g6vfAGfdhg=OE@Oh^B=N=2%n(<5kn>G-L2;0 zriF@%nr58(^zsR`xf?uxsx$&O6&opkN1`tmy}hF@cpBQ)*$}j|+g`dQvs+N`iHVfu zPty)W-w8;^_L1pIP1u$MYIlp%!3K4uU;fcoir-mUeI=!9FnzI<>g*Q?dH zm=;#hP`tqnJKxxk<+Ve{CUVEK6FcqA1m3X83|P%AwgE#A#I+Weqm&lRyf}I)@1N(vdS|&8qG(!xERENdb$*1M#%$#jqh6TX{O5#3sDN$kziDnda8yJYy${&is z?AwDj+O0k_R-%Bb4iSWgP)QU(l0LkO)}JbXQ(DFWfDMRqjqVZHXDnr~XsNn4SRx+* zD2CER02h%B!37~d?cc&$xiPyK^c$8SW63*3Fkg%Pp0j2x{`<>MBFLjiZrr|OhcR~p z?LPB&+k!%ZO`%A|nDl~EBzKz>Z*ZC}SP#$DMYV0G*r0iHo6CRhr3F^p?kZ(AQY?g< zw@mfYPzs$yg(6#s6%0V~r{#YAp9fBARxM$938c$QWePwX#j8+P6pz@ss>oJ2a|tJSW4WJ3pOYVv|07B*(Kj;Du8aiT}WX9NF!&oA>{Z8LU!&v99Sj z>M#WU7i;cMHP^CW*t4;Yr)L+afk`K#L 
ztr_D+=KL65n_#Q5buO=p1$EQMyg)3YC9)+RoW+?<2??HSOV6QRp>9NTW%+U4rcj*e zFja7d0lDhM@0PF%QJg5VSjoboY!3z{xI-ub{hnmt;3<;GIgV&mzojHNHxLPcXgFCC zTwA~6)*M1WVtn$gZS7G4p&%uX7M%cv@nyzZ#ciF{bW6lPz^OJuWlmKi%JA-HYXu&L z-sv{n1{~7pM_EJ3%HSM_aTZ|#*L7CXgxAJPhEH4T&h;9H#Mk^yK`)IpmZ&GQ?ipH@ zrcISOSws*|TibR9X-fzsdiQ=9S&xFkKE*MN8c=aa0QMpa9;*aV=*Mos`NG9(J>>bAm5<9@X2~`!p*RbkQZy$OEGV{7~;NJKzt0142MKqLL z7OOtgwm9KX-C;o=2PV_su(JbQ5W6!kuwK!?({DDBJr4 zQF-{>DV9Ri-#0WH>=H2V6763-mM7!2j}uOdKNxL76NQ9fD@R*TKXAm zL^kac?!6;XYT%w9H^535O8MS>WsGvv#%3>)jK&}P&aFZTNefwHZ}DoK?X1w3GzW;8 zHpa)hhZdF@#hjf1-v)JhoiEhiz`#7EgamrdX@sH0J~i!U^@Ky)w_Ht0q~&55ibG03 z$?{+9Dwcc2Wji(ZZ3zu+PCLb$+WFRi!MJW3hkHfm!?$K}1MuEwqOBn;ch$b==nw#g z_K%nH?OuHN;8NYo?w}c__$QZF@1`9@)&n7Pt)QU5X1+**sE61s*zqHoCkLHtchH@q z`T*VfhUeq0ttG$`w?kmRsn?%9`xjbk>`ZL;7UU?Q0Mz;<$_k8{QZi`ngD=5)t95#Z zN4$72;KAvZPoBEn>eSz$!}k=P|FTa{dKXm{r#j#3eyq{UjAxfuuVm2BHpZWEesB+rf z{rQ-cU4ZeJNSZSC22Bnj1NrSwrtZ78c*SyRde-t{PJf;#jDti;3R6naV{^&hFTMsO zmU$uZ9NpBccXMSMr8-BT&)Ci|WioL20chEx-flT$*F0L!w0`~PzwbL#(*@omW-7Uf zX#<6}JJ}p}l5Bfs`}K*=2T`UjnFM<)tO;L`V+5KUxC3I$VRqZSToeOQ!OQ1oF?50} z86E4L(bLvf5GrZ}Mq$s4iP@Q7&XG-VLgA*_qeoW%VI7%NwD73r^m!ryK~Q7G^5t2v z9JcS<$KW~-V!x5yZKHFj;=0hEb-N2)pb2+0;^aA@TL(gl2q=1g>H-zr!m|#nM$ugZ z#6fn`$Y-!&tU8Ze-Gx(Jf|%IFRmjl4(6CU49STounw!5axaHOChF%3h`vVm7E%WCV zdk^A|c8Rt2TArEjp`YDo3(^`o9*Z+F#f17_FZF&-%4jYLN)66VLI=izV+h;9V-PkL;4fXGJ;lu^~kVLR0TWGyvoT$cPZ1(Ics0hK5*<`xHe&_NrOEjTee+Nxk3Z% zNm8K6uZVo`Lu2|}o;I-@EeR^BKs78cjub8g-p2n^C<{13Mu||)d}?Mv9-^R582*V} zBk8Du_|R{({(BnnQ$kYo`gR=Y9BVr%%n;*mot{36FG5fD^8I@$a^+6vvT2ChMsOWY zMPo%5|6q81Elthq3bq$vtSROx%h%~${t?9Ml9Ok8Ix{vGL0U6`X#%@~?66K$X9kCk zM>dhm{){C$Vw5&l7&yfh1wrLRp@-7)AkT_BbJ4#)D*%_KK1#f7CZWI-6`+d$7axU$ zB+DxD_WtbzQz>-%vFfpHa@p_CaqeTh@0y(&`|aEM({o-h(5>PD5HW_~nUvSm{)Bg$ z;&o+hB~Z<8_;7>Tq8-;b%Y7IbS5%DQ*;B63%`0~6##6XK&w(Cl^pf2Yp1|@P_`p?k zfY9|D<|^SFqxu_zK^lct0`zOb(zb8Pjh$j*wpV>y|HP$};g5GygN#}imW22rjS%)1 zIJ2{F#A8wckYcC|U!u(m9OCQ5=P96wETlbZeKH({H@KuK6bQ%PJIspgaG6qx-=Ij; 
zxAW?J-)xELu-v}#%zuHO4P&i004c+fC(^@c>IC(yn%OhrW+#3;jG3fxadPbee3Z%c z^f3@kR0*P&0C)}!3-bh3NO)9g&8P-G#6Ykft|fQv!-Q%?-_6kImPNJeMrc#3rI)5p zr0x+B9~?Kr82#8={g zBuZB#&%`58M1IYkH4XVCmxbUWN7Jc`eR%j$z&HhM-XTbUsCjr0{ zyv0=>X6WY4uOOilsa+BVzX|U_`@pq&j~|Vy>S=WK<@@j53cC&UYmt==FLLFY*`b>0 zms-?RUj6GF>2q4U=gMO`RfHk3ZPdH4^+4((OEA#X{-ddhC+KUzeGsYEpcLE0it?N7GR5SQ- z+B0s;VLW|yp?K{^ew)i|^}Z#|jnb6wr?#uT`ub*%yz|Qkcn?~*f8&?MUvITZv4|H( z;(-GPLVU(db8!(39*~$tKEXW_+rph9S;(BLWfc{(Jd|obKi|)zU`0#+!=mmJH>u6+ z*)yZc;GY`=wj=8N_Utx_r`+7!GJGT9O1hd`yvY;7|M&u&p4E@le^$~w&IWg& zGAUT$MO6oR+f*ipgL&f+J~H~nz~-T=XvHw|9+0gR{rxscqAGF8tIbXkucb&9x^h%h-txt@poPOn)pfae(Up9ykJoN&?w*@uo3l&X$S6zw-`9yu zg734}eF<5ew8o&=_wJOG1=RSke6_4&gj30YN8ro#Jr>~b@*NDvf0=HSebVj&9;=aL zDrWVxuKb&HW4ZN~q=J1TCZB5HkbGeA zEHLCxeIf$p@3Ybrw}@ogp(&+-Xr*@y=OX>~?Bb*%)8`R4J!oC_RW}|qw5Gb6X@n`> zh2PuX@pE%_*p00pStpX#=*G+$5EAu#s|I3Jqid3YGn;ER_-;I z!yAOJl0#^pHl-%cd$&tJy_32wNVA|qwC@yA@Z>yxeBo5cl(`bWPbp{c&*7%5LDm7$E3T33f z;qa6}`&0Y28`jmxW$2;Wrpr%HDQa?R-qhUEJp%_KUvtXrre&>tzRGB_+I{AAY?k7a z1B*OxnIJF3Bg_dm>s?d~Jw|3+zgZN&J+`;k-EI;~K~wjJyc29Uhl}H9Dk_>A6RAM@ zf>?JGMk+OL9@=p|I5_6wag;;seDH~;)atDgv5@__Cjk}dq``fSLkLo2T2K-ZTnReap_w|n3K5y~;a$s!j z({t^14K|qIp1I1|`q8SY;HqU)PAlGCI5%~n@8%#VjIZ~yU_S{mY|qU&DEGn;CN{Fh zGjDA?y=PcMJeS|zeaZB%2$AqdabU-FpJMXT6ES3ak|eC!_g7R*@5GVBjYtR1G6DaW zq4TVUha{BI)=Lgn-S5mgLbo5^nR?tZQpcL$Lagrnmwzd{@x3xadpF$>#p88GpuHv0 zMjAX=0fhl4jvdqfsID4Ubmenq_U;8OUj3dOYc%bf$;Z;d2XVzd!E2u%E{N|szE7VW z|1S8YjIDT@G0DvB?EK*DkT^%(0f!5Ao2OO2YEb0m)#LP!(iy=+!^|FR^6CvsAj`GzT||pFU^>u+XaZ%{LR(BX@^fwY1TUUM&kg#;Xf+W zI_V$szP&82%YBeUHjXLx3V4A~^a3%%QPD7P@Pb@MC;MwgwwSO-p{CikKG7(`{fEHd6_$_=&7q$nJ|Uq2IaB1NF}~{>eQ1fzIqXl zEW|w3XL0(By5r9maPIZA3ed^yEyLy&xDU}_r*Bpaq@5@#SN>D^?oVIvV#&%hDmm|eY3#(hPa}%< zO-fLFWLR4`aQ2`c(@IMxdoa$wYKD)+3W`d8a4U1?%cW2 z6Si*b^6`OsR7twto;SCw+dVxpX-ucu#&I?qwcn>Y-(T|fOZBNn{vBv4#E2ufCC3Tl zu?hkA3#d~psU2i;_~vQbYaSWc_SrRKN1UVSl0moj^gddkHU64kz>of?76hh$AEY<2 zSr4=a0Ye71w{;3O`SasfR-9>%rwqN>(7L_fgM!_RX3cVQNOrxc__p|6mF1O^BUdNS 
z2>tzW(Yu`BIGZ*puyyDsWVR9{o5-(3P(q1K2f)C#^jnAOuZvMxxK#Oe<*U1cVJzfI zfw<9g*ft<^G7_4J_s%YuK@`eO4tWMN`j0K2YGU%Tyyx)YyI>=}($-dC)_qyEMv;jq zVfNM4n>ed)0M;d8B0~?4Fe_@!nl<_nn#=xOT&Ul$!FtQt%g^<{J8j%SUUx;=5128R!1he(KO;#?OEWyxGfd=H%pgG-|z@`|nA3rBcvsxmDHK9NKFXXcTYq}Oia zS=oJz=&f$6#1VrM{VEmvvE#>Iz<}$OOb>xWHx~pA{D2+`>ueZc2?B9d)JPD?lD4GK(ho^)u$Gc~jKPp1~wJ-zh%hvLbm$4>_@ zdv^16R&d9AjscP2u*}Wa0y%1PdDRL55P;ViNbAKa1CXzdySbN(r^C(53!W(@zw`&z4JT=U2I& zuZs&uqzEx6Y4{SA$sZQC59E1NMo7lB&1=b8;N*u4sYi|n4;-x;e}Lqt1u;*0hbq^n z$ASGbwd(zDSCn=O^6ychG==BD$Du?Iicb^j$TX$o?g8V5u=ahYEim80=Yt**$s58Y zq(0kkde|SaZw?tOFgqL@`%2I>2p}#X?O`dwX6dDiAz5%0NxU4k`Dop5A_Yb+PZeyPAj=v$>rgV&6f-Ct2zri2p@fJ9EzX zqoYc^K z%@o~#ov@1Qk^T|~0xv1vsMw);;Jx)N8=|x#1QdMogDYMtp(ub)!=Yp7s}NINVFI}H z9>B4Tl}VYYHj%ptQ1SZo$?8>=)b&4c>k2zXnRX4o4>ge#MD$Dy)pJMKw+a*5k`;Sw z)|kb?rInFHA&XFy>Id+&*(cjw_uk8Im#I|pv`G(q_w+(-%fv5Jlu^bBRFAb!by%uI z>PVg>7$gvub8gPvcS}fyZZ&FDreaC|++2k-rC+t4O}nIXH;|-}XZr2i$NB!cH}T+m z)5|yCzV8#zBI|jk_twH<`)14OF6Sm5b2_*5qpAdl0xLBMF(6HWVq<$uj5rTuQ~)gd z%xkawM_o2U8YW>es2ao%K*P=a8T9O<3G?F12a@7E<@krGY0B?DSZZ$R(mSerS+i-U z#%k9gEKObW=Vk{|QmuXstt?rm7O3QxylBx&)zzwhPyM-kr1oA?yH(#~fAqGFy>NEf z;CJsH7W&O^@$nc<(+O%%R4C&yAi`>h))GPinRb6 zK7Gf>$uCQUr{hsS^?t~GkP+~AIO^&H^mZ(pFl=UbK)Ppsp0|u!!o=!`)mXzem|x;d zLy+J{7o7mDo{4N8cQNjonFKUS;~`;|cA2+lv4Ta}1_p-dq6wcsTq4X4u|fSl#-fHA zPZASk)CDaseyPasZaat3&dab993u9qPDRHI;T7nozX0P%yrx`Rw7^ojqGZ(j=r&3b zV=iC{snSRhs`(-+kJ?2w@*kP%+;dOKs^f@BqMB^(xOt=J<1;-v45(A2*E#ev;P2^$ zHlGtso7&EryuON1Q=4X{m$x)}SijxuI=%ql-_{+~wR}swlpf?;((z&vJJUb#-zut& zpLc6tgoTE>A*laGkSTVxdH{%0GQ|?X!~RT^KJoI5OWlIj1-lQMb3c23;IHH_Gy3*z zu`DdQPg8_hb|oqH?b;R2x}IY(H{JPUk5PYm*?H&taWxXf+?Es&m0#)e(3l~(k~D9u zT2^2QlJG-E(026qzNXSgl&TcWfK#DpqsHe6zc2Z;h8@YwA+0N$vcVc5rhJE^1 ztiWU9Yb8xR+Ydza72t9rU_OOkhQ_}(ih3$g#=vJA5oD?Aj^gPOCc;i_%ufR}q)T8) zop^9Vsav;yPOltI4jx0Odwj_|yJwdt-Xp8empSXwBGD~EBOL-}B~cD}B%+;u5w9~v zz!?f_1~!bj=%0iha>iC1uRV$BfD7pcIxqfrr^@IlW4Wae&y*84FJaJcPjV5yBJ!7- z6r&pREP)RqdYpCG6izH&d+;~xxFj6n2{~-mYkRTJjyFj~ooanoDO{Rfe8btfxxu{! 
zm)m95{XLuXj(C?&gL2Px7;mJdWnyr8P=!Z_O^Fkm_M10IH7st-+jZ~K1Uhq^Urlmf)v@#k};;xACmVKW8&6%S~0_=eYZFX#SI8t(O*1H6&nfh;R zkA{SX##~$x)jZa>+-!K&SFJTW_1gBCyyqm{fL- zL!C6?y%TEfi{cwLy{7j0pZzMPbRgoMU$=Q})5eV@mU~?FKTqy<%$oiB>s~sCrViMp zJ|b%=quF(>_k&)9i6rM}A8rk_U!0m#kbEw9M5x1e>vnc%Z&>4-JsoWhE}^NXm_#+o zyl}JHy}%V96zEuw+otFIv5x#bs|;dzJ)}Jta^v}v(j4RMb@J`Ix-LQrY7WE=1q79O zV)i|)|HT8i#(fIJnn*dn!O!-alC~yoDd(N$<>$r4_5n;xM|`wtw51StN>B2l&yc8ZX8f925IdsZy}-h*?51&~oU z(i{?%J0|_&3wuZE;XR`Q_dzMZ^ja=?4g=-R_CnV8EiP#To-SI_h11GgMtgdBDW5B# zj%(A+aO~3WCrj$#=OE`suSPK-6a}tWjJR_dFsmU0NMGf7)pno?a7{Dj9Gd z{ec1#pi4;mZyCJVe$k!(b@R>m6M6>9tCxnx`a5^-gpXX??5~iitM=R9TA?;%zS74| zGiRD@T2P+XW799~m#^G%TE#7`EbD$SvgZBf`zoI^_3kP>eUdk!(bw5zGk5?p5StkJ zHh_uBqUS(mAMo1jd;HuE?b}B*nAhQd@e-DQ0(4uDQ5M5gREqFbjQ%uNO`USmVu{AX zW5z=hGcrsX&90pvW7PHX?+H)P{ba7Xt#gGN3M%fw;>Rz8{Wq7td2_JD=lJ>bDMQHa z;{)$9G+(!u8fqbB9EwB6PBwwV_@6Rwkxuu^e-1^bfn*BW-|Kw$nKRe>z&PaOZ|ge3 z0-@+WpU#SU9AsJouadf`xc8k1!ckcfJlcfqoV{)YP9Xlxjp51@&*x10Z!CrW`Pvq8 z4Q%P*rs&x*&9V3(ymUJSh0&IlmZ1|;r)380mFQ(is5rZ11h`GibV^<|0;>Jl{&P0H zAl|*~$wzXgq?eM(-O9EVO&#SMSJDfl5jo6VTDYi*BC>UsJ^|=7iZ>2$4yil zy29?%wv36FuWj##J>Zmg{q#$YFB6lBX+7i6Y(Zg0J}`37!Ly%=+)qTFY$;X{OfF-2 zC%Yb5Zd`hfs|N9X7Ql{K;fs^v=?4e~F~>!B9P${}04AgDzF`ebQi8|tzIM&}venL= z8h6V=TD1!rxAOZ&WrM3%`>#6ZRQSXt>G$Q-Z9h?>-=VMx_j4jv0YA3p@Zpc{%z4YT zEB%n<=&%)b5fyFfKhFrZRaje;ISpS1k(lES&?Y0g2%|@MCjx=k*tilO*u4RYUj9nF zSlA51$hh|1`gy(UcCp*n_eV9Rx1oW=Da9v_Od6oZur|sslCNHISvG0vcQvRnPXe&o zZS10fU`*t`@i;@2jMOO8zs2s(Ks0<3;SBR(U1nAoA6$6tM)t##Cf0GzCxbfQ=8F8W zXwi;=^Y@GoofX9Jr%j+)FiGg*IFOsJ!B-THk-amNPrUUS20>YmAHP588=8}6!=@!GwMnrfhN|D5JFx>_ z)Re~gICL8E{h3EW-QO9r_pc4l=G=qHroi4mXGE~C)9O{L#O{u^ULtXLq>}jt#|#K) zj7=CM7D?XP35oHzXf2-CVZC30-LzHaXHqWcQ3rsyih`3noE0R&O;UMV8nRGSkk5ikrwf8VzS0{D2^_7_iPGo&>6-IN6~7c zM)_;)*KQ)F&mfVDqKr2j{Bg{|w>R_K&+BMtcr8EKxS`v(kviFLD;?Th*PhW$$04Ze zi1jZRC6YBeY|F*O7H>cFHB)H-ZsuhdzCEM%TVLl_2Pj%Cl7k5vu--j5j9yC;?I@i{ zm^`E&S&i+LL#Ly&X-)K6d?liZg@1ng@MJHfyt7yvWU$beZRs)Zm6q*t7KkN}t99lA z+7UdO@<7XeoYamu%!z+5y)V5!&?+sU 
z`uvauo8Ed?+}_r4#PCH8pgIgz0Z5*iH#1|pUaLQSCYw$1FmCUsXdc4K5XRZkbRVmL z2|6eWIaCZ)9Ep6?M5@}BQ-MOZl!X;`Lb1hf& z7XAN^t}l(pdVAk(qC%1(k<21ON{T3%GL@N(AtIHrB9e%N$ULM7nTbph5+yQjNit*% zB|``)!@1Vp=l|lI&pG?mZ|`66Jm0mxYu)#CUjrv^;Dh@#Z6T0e!b^vXGdz@&t+g#B z*dx*O)&55vt;ZQ!d8w(}qPY&zzgp2(&)FAc8C@brF_7i~D1$gwnu9>Qj~#5)5#gs9qy2L}$6 zGWU7Gb%!A!ndEvkOK=2%A0xPaP(rgNv_n>QpvaLG#(kS<*#=NXDVXp>3!RG?H0PA z885!0@-Ywh>o2Ciq`~gpR_@}0&XGUyUgFIQEsTM;7kjo{zh0xYGWp^Z!W3FIj*k9Q z{vrez_1XOFQV0PEwsT<1RYFw3)D%+%MA}v(D+P5#RDk&NA#NF&$7rXX!?^*bF+`pT z-dhw`6nK?SHGPiK^Qqyf1n+GDPmfSfPptf6AZjCYU&xh6)g9O2sKvG?02fx7Pyhrp zdA@?_t?;AjNIzp+`)2x)`~TR!ZOylE{C(ip#L`cx=g)@{WUVWu> z1_tRvHZ?VViw)a0Z{K+WNO#g6ePAK~TZaH*8{yi5Wky^9fGNrsN>H?6=lUn7gK%#J zega4V@U+m(*gP3gdQGiqYNS5`1p>J5B)<*XIxV&_gCV9=JVa_5W*iV0ETdDPr%qUCaJ_^$Gzp1@n1V!N#~ zDNX80%P}Y3j1QmRpJ0A*D~H$qN1aXTVn4dGnC`PJIo^;nd^i8x zDf0KP6Y60HP00zUEm=({)Io=Tiry=)=$^LWQ>-Pix3`T$^Nk@4KiUjpZI(8A79-RP z&8(zJ0~wrZ(mX@ga>FxA#Zu4mfolv5ej>`6bj!XZZJN(56>&V`tV9FjIQvxv zErk4;kbRis7Mu&+2m}4pKUDS1KX66G9MBErJQaDSa?1OX z?t2*ME{KLpaeD*9!+j5RJ>B}~xT#~Law@&2_K!|x%H@zfi*F+j9t1D1))zJWyEEQ? 
zeggimejFG}Q~0jai|2>@idw800SLh!!$LPWYGC+55ANB!5Imu|vE4D_()A&GxL-;+ zB*fOYG%;Sw!i&cJEF@$|%FD+-O<{+ofWKa*?ktA=I)xWbE8XAajf)b`ZwQ0K5nSt{ z6}D}Cm2yg~1K`)8t0JS!4SW@RybZ)I>uy3#-a+@u*bFH*&E?g8FU?9Rg;POArmn7G zA6)*H#LhdkD^fBNCQnDDPQ%F@DkKTu5&x*R+(n$eJ_s&5XUAI6wM{L-h1 zxI)&;m!DiR5VMGRENpa5T7--9=x;NPj)#XqGakH+LQy#~l@Gael{qChb8;1fC#dZY zszz#Ia^($$BzhL&=L@kdiT0cGf++@?QK!Ep8DhD5YF4LYTAQ@#S;zII;b>BlscBo1KaFAjg!-8OcwOlQINZw z&YF&J+ZPZLa?6_EQddxm=h}Rh+m7bPLaQrZ8N}iRV{p(T8?NT{OAmwPweo2Noj(Wv zkvlU)!H2^W0u7?tLtxrZxqzR_70VdEp((G2scpgyRBiX@?=4Iw-9CI|X5G5m>r@m@ z>Z&o{3u2Hv_y9nv#}(V&FV&p}Hy4K#XIAfW z#1sN|M2Qcj^%QlCpy14`%kQ5SfSndiRNI$?%f33#T$XVeodK!OMU?5oM_mOS>w`1P zp4xHOx4uY;a-$&lIS6rpZO2zcJ1ZdHCjO(8iUT?OK`Nr$MxSi?caeHz0gBT`u7n`s6m-Qb@3186Y&7#JC$oz=cEm zRE}ozpSd^ME68+nrC9mz z7vLfhm}(WQ7I^@LCV|tDH}SR1E>%WG(I_7VXV)Xb*(=TLvc7dZoX&5sqUhY9s?2x< z%SZ$cDDQV~?fH_ZlGzcsmBrO{rtosyz0#NE)aST(6^<5VW^xOujlWwM(zNy0%j-m% zay?OMX!Yjk3Sv5TWPfA&&81mUU#X{O_-VS9?^a?8f%57V-T8=60Xx8aXqc>}YEn9= zGJoIO5@vb9hF{yI#DQ6ut|gKA(63Ys0|u2q)xu&H7IA>rM;!@i-y%xNA1*r z;{nWmxMabg7_M7bhKrG)yZIBhwMHwsYAHPBoVGUZELJ+sU;8g#R{eXAcPjlK+Djlj z6OV!+FVh4JKQQLRb$Y~B&x?f zE9x+gTKay2b6Y7IdS28{c&498zW!7ram+znG$J4)$5+7uO*`2*u-IRTGLVmU6Z?$= zh3I&xYUlV`y1KsnIP`sV@4>SN5BafN z>FXbUE?gS}`^ySd?F!8@{ew5QylX*AE6 z|H8EOK_~%>P6^F$;=^MM00z41eZf-|Deggo!Z2lhY( zF|A$3wJ>i(_LBDZyzKVxXQ8GCUW%-anX;)vQL06M>X~C>8mn71zHPJ>Pp+)%DopOR zTybyfmQS9*TXruvm=EdO08P*X-hVbYRHB#Q&!-k08Xf%(m@&Z83qUhbhrM{s+0`8X z&$Gw*M`kbf=zq+Z>fb(H;WFJF<2K{A*Z(jF${hV{#q@SGP|))2bG)vBat9&iZ~QLU z+n+&swd?rJqhM9a?u~GQ2$gul(=#v-`4(oFD0C;w9yJb_XXsBRd?R9~4DHkgmsSN; zW$8drcp+nH#gPqg9Fv?Gh$V12GQ(XH1MUl8G=ozHZ`l@z$(7OL1YKUkXpFQop%gRR zNu5FOQ-lnB-O=yXaD92H*JL+3O11v!yI_tpCnMa%@H2@(j<6l!GEQ<@5ki@r_lg*C z-M>$Rist{)+K;If#hk=Y1zkUx%7aXhf2mUh+Oee?4JsAs-Qeee5kh7;cXGAMuU~<0 ztrC_KIwT|?-6p62*l2gId_;Fsy=v{p`kKFcK$k$7`& zM8KHy<^4G(TUS?;Zw`IjH3lRL|8dmcJS0LtVLgyydH-Iu8JkD>k^Ar82tZ>FzN9S{ep_OBE-1%P7 zvqQB*aq0k=8}Sv1l9@~hr1LR-P{0qtIPC4{j`#gQbcp2y*f(uZ!h$nTq&E06R+vme zWjO~Ipf1Y-8U0B)JmTTrc)s^WoLD;5Kc6MP1E}c6z 
zU9oOjHXxlBupO`r_*c{eaK)Y31UEcDwnR<>9`@V+r`n!HF{6PBD~S`1G>n?*XZ8@? zIf$5$3yV%BsW38NPH2fq=GE^X{D|KR0BHCV)uBJx1;<^y$^U;<{_~L2aE5}07Uw@$ z#4v<=h^A5to)2Sv3&FePu71Du(JoeetNSmyE7DEPwX?TY0_P9#1%jf`l8?y_;ggff z2n&P61yLkv5aIYE84uzqeF*J65u{YOi4hw$yd9~}9y~|D6+|eo(!5oV(vJj4D`235 zw(Z7*YmrguwI2!1VNl|Kdp|We<5*=Lz#Jr3w=g$5`x|pjFvA@PCISS86f`i5!{dyF zw;@&l4lqofw@!Uu1NY-Lj|r-VBn8y8D18uH9}dV&3xzJ^$g?J! zS=euByr<>F>C?J7u>nLU1q?Re>OhOH^$xNR+~ar~F)#iKTnZG*L;|>7y!al3mH>oF zpcLL$umU9Oub&fa$`x6=Lb>uU#n7pax1ZW|EYCLn^Fb)zz5>?6@Q8z$E+9InE!mGt zW)|Er5@z(Dr#mr62aL&e@L)lazZs?l+i;(p$LB_fd;ml?WM98G^n4FvLKw%*A(#bO z*YcPMVR^UPWa9YT=q3+`u*(=toY(ocyPN~n@}NVXG}^e@wUz!g z8NGL(-v%U4{~C655(|-iZw37q_pG@2YZ)88v=JKo?l_!{r6u7I2n7) zcP$Q&@w4d%FvrWCB2iLJgeU@(5D~cRG4Z`icN1HJn5lp|Xbq(7KD$h}zy~9U>>%^9 z3tet)Y9fyl+!lx+m9SXw&%As65Vi%e;6Qb7d~SV~u4duhuj$1$*4A*z2#Sd@xR~e> z5EF?9Y2o1T0Om0NQI~Lj0)T*p;4Ucs-^dV~OPrC!s|Xx2k15Y=T|uDxiP_uFX+3F^ zRlk2PS>h6Zd&Qigk!ZKM)W~u=69W*gK=@(4Of>64zzGw+c(|CW90MYL2&S|iCZir?h&k) zgHOZw`S`*jBdzk~A$hogcy*doMLcAf9}{vav0u2-iz*7(=9^09UAqW2NcBiTU1i6^ zU5zm@-T8chx8)>rV7``PR2+zbof!I9e}yP~EaC|O83r-rCg+aPqRjVMKWeD9iO0)l zP?Mp;0zHTw{#3Xa2;sL^7Sj(f_d%l794N&hO-$3ib>{WUXHtFto@^#^;v)wZtFUm6 zha!pBj2+A|@CyjV^6IU8+oag{5BfN8N8!D6o*O;wzBm;Oi!S<=N*xAmd^CzC)S`%#nEj&i$X# zd(bk%f#9^EwYOpK!JUPjdpfC{?UD4pkDZ-){yT*gN*Y&eivfjveH1N@b3Vtk95E?l z^j{oa?SW}?D<&bbFb2TWk~m5h5aPXEU4jsRQJ4*bCx?0H5ahzV`^F!Tm^YMGoj*gc zNH;9ssVVqwF;WTNm?{r1Q*03u0^#v&l2MQki}w)29nk-NpY?Brcp9`Lyfdqs@$old zZ$xC@Al}KLl>_G~Qg!6~B5FY@wiry5gMQywJ#m2-^^3fY3ly#m67<;lQ>4$w?F*G+i*A!A5vsVvyH)VusZLJO(bATyf&? 
zfc^okEdl1<8wwM#(cI@(fwjRn(J)jT!d6zw>m=VsSFq!Jr)a%C>L(z?(lLb(;eLiF zp9GcPqo0JiF4)-9KFE?)(eOX98u}+I8Z_PztnFrg{&elGD%=y88IJ)?9l)`yp?*p9 znhT;&Vq7_FC@GpxM;#Zu| z5PhV~H#arqmsiESrdCCFhReGK$(Y1DQzc9wU~KG`aVkb;z-tgj`yM;<21GDHwfZ+$ zAtwMNE%8+TxbV{W#Oe6Jt;=dLz75kWTAyS*0C5M(RZ^kN1KAa#d&t)l@G^r40mq0O zv&x4kuPp8DLzGg)3iR-WMN$QhJAn>DmX#;VrnSbQs(PRrL1U^BwCujw27b1&3SDWM zJMu*E1 zx4N=;&Uvo#^(jZJ1)A5Bm=oQP_Hc7^{Pj(Xu;8F9hkFXSe1M>UjE91sSj1q80N_&* z9#p71QgOt#-n{+k)1tNyYDJ6?58RVdQD#K6k_hN(1@7~97{wz?))J&=fb8vfRZvbe z^GLRKMY|vC8{c>Al`eBm^ zYdlP6<^U~1aB~Wb1lTX_Mn6GVkSKTvR}mC>ghU-ARSbGfwG3_`c)$VtlLmkQ-W%{VM*m5Y2jEH)li@Fv*ofDF*vjCgp<1b| zke-=nX-#Gm+psXj6mm(yS986{IWwyUz=(KnVd(l6hYgsXG-wg=jb`Ck_YcHI>qrHT zp9Uuc+&dBEQNjm;cPO+~4%2AN?A&wA-i$O3^*dDgbro7n8V%-O|ITgHsT}D^HLqV% zTs(qv%pA$+2ut}ADiNA@H>>@7sq+XG&TKO6)-9`wp^@V&1>*;E;%y+Ph{F=0 zU%|P#!stt_QMpd^ym&y!> z8HW_Q912uwc!5ErBBt$tIbg}%2%<#L)2UzNBEvxX^M?<=+{4TqY&Ls)4=s*1Cq=3{ z@TzXbrA=J>(X$g9DbBaA&@2bwyacUb7&Rzy@+S>Q?+#;<17D2}Tq7NEsn)Uu+8u4qt}VTWR40CGc<>?Pf_y zZs&*R6eFPDbzb1uzV|jN>7(j!rgi4zbV9_tXfqDWSI1p}=zF-}cvczi; zt-$+w{e8I~Lco8(v9oPN$F8{4?6kVC`RAG81CCFAe`z`Ml;X#H@)&}gP9~*?L2FO) z!B7+vhj}!xbohpth2ah+xzMDe#T;M8%D3n9=bym5$OTF`h)|~?B{+lb1$gpBNYcyC zKLB7R;h5(Uu0b=kcmNXM@3E`uMTZKg!p!O&z=cvcBmq^#?3(3~GURil3&kl0WG59D zv|_j*#IZzDg~CT9a@R?VE!&vpxpdO|;-7=`{Ps-tTWTLLdB6!tg4FTuM_1e(Z|3^` zZ#DGIB)u*GHApNh=e5zRxsBb#qjY~D7fR8b+=F0pV{?EhEa^0@r?3y ze;Bkrk|eslZ9}@lbwMhzPdGCSG@cgs=QS_*7(7vy-spFoO#N9frbahAShcoF?6JU8 zf?1jzelnuI#8K(ox*Q3MU>IYkZkQXWtf&ag(a6%i(OOCiMn8mV;ac2U6R2aDGbrIl z`Gxy*n7XnuTGQvB#|@%X+Z9&&fepG0V(JF4<^3Vg6N0M045ErCc&F+BNn_nGKSP;CHZ{+w5^=cu z6Fm)xHQcZ^5lp&%B}4!8z2mFPg|W&pgyIG`4|jsK;R5ah33kv-%OuMXJDb)u_NC)$kAx5RdkXkD%9;3Ag2FKBth$} z&A-M60i(|j=w87!T7q;cv8<{jH&=Nhc32QPBL*fW>)1`uwQ6f~a>^_`IbM^0W|%`;6|F8L^_t7gN0?^pUhix zr!R7(`u=PDDT!_twe@*mP!%)pc7NbadrJ+*2Q8+MPtk#j+kN~NmpEM)qSD46P!M32 zH-@7E+`R2DuP7)S{D;f7NB^e1HJ|ulUb9<)id0%a-x2#`&M+CC?0zk*ksVytpdhv9 ziz-0cw;}rE8@MfDk~vyh$x|19nhVqxFl`Os9>cM<48A-jcLnwQH+Xpi!@(~DQbsK0 
z4hb2IjRy98);^u6V5pG$sn9~6cKVgj#{d_X>VA&E=PW*NsT{nB}lYihJTh60rGs>Br5q%@NDKk^`nMtPX)vKoA5f)=-l9vDa1h%L#@`x6{$gsNqEPA-^46v^71A? z`M(9q9ZX1yH2aN9)U9C3%l&rt>G4Q<>!_-=SBjdUEUdvS!*%l4B|hc~6`Q*=?b8hj zfcqdird)mF0e8$tQaw#>$68dl&zo%$tn!NRecSjJuUg=K4^C+|OH1=1h0vELR4VpE z`#}t5&<5e664G`$xHVu1Q?y+ziC&3jl~x*GrlpRvhldYLS0R@Gs@>n?*&mM6Zw;u4 zzz+MAD@94011y9}KZmYTFR`#ERe>c{llBLGQ|t+^$k{>B z#m>UZ9>{fA4J&~f2VFSCI@}{Fn$e*L z+u&NDU5gzg4ygfN88P8mX;Ywzyp|9?=5YR8#Zpyv-Q?vc_co>fgL?r@w#QG+oI* zZt$a1bh2>Y3^!a^A?qB)>w*D136X(_0NoR%@d%0<%q@ruAM_OSCMBvCWo|P&k2o}Z z@9o+_P5t{DPkm!Va(12|s8mc;wK|C@B&0KX((52&a>$ZApG;zRKhb!$pfw;S0}avvqmkjM7o8c z_$L!OxXOiK-QivP2CxU=?0}jekDA2-Vl3jCX=76t&ZqVpXB)gE)H3fMLGgv7xg8dx z55+HPcc*J4(?AgR8(INF zZmAtS-jN#+NY4k{8S^*}iY@U?wq(yb4lZL&7Wz?;hd{i^>VBOX78L zbSkT>zlAo85QG7s!~OOi{qEsE^W{(;MkOtl$Hn~)b342*^36Ete*X@pZ2(jb;!9pY zY66V7z;|lJWkSNT_efpk1UnfLjyY0IU^)T{BGHBt4?qn3{rb+CJaRm{_U8xNw%No1 z*zh+&oqPB8?awVDj$jh&<@zha{!u`vt=Khln8h8KKK;yOX9LzLrq?2Z$k#lTBfCZss@ZLs`;Ya=(WwDJrouQdJm*Pa0^5 zd|uc2%v1>dAK)x7Ll~}L2i=D9mn=WP4d5V}(5OIC1L{2BSZC;_Y`P8poT>ga{c2mu~6bP%Sg0-FfF9EFh_P6T3X-rgR8IU&5ukZzWQXq?u|+0ysW ziR1K%jnl(Lm+m2Qu3}5P#Frl$kU7(#*&=8$78id9rf!1sF=wB3cG+sIR3b&6l?Hk zKtBXvX%?UM$Re8*3RG!nX(H)BS^zKw`9FEkO`%F1Bv^sHK}s7I)`FKZRoh!1G2AKu z8{jCQ_SZx$k`t6D^a2EC_%YA-avF!zZujKzd3TW66v2Rohzn_9&5o5DAvYf$8nV2q zAwvbbBzN~e@;)};8VE23+Dx^jw^w%;Cwm4nnhG1^rvs>k$DvT$rCuP`pely}En5c> zC^iNaKtk}NAz`)<-*^roE_lXRIAOroqWcQ_Hba$+qAb zh-xyY93^qJaJir~R?OAkeVGIj{?}0ecp*K6Z8oan(1?h+w<~b2bRG-2`}wmogjBe( zulApU??)Jqp;38uEkM9W*7U|RJ9mR4Y2^r0)%rH zL&&I%HxCn6hzlf`;5cxQI(P(x1z`T;@c9gWb_%dNx@0^l3i~5qfuow0?kNANLb&@F zB>sd)H{>CjSw0%{t##N@&<6{&dG%k~a6l^EZmGO&-EmY30Th0`*1O8IxSnra}%SE&b5k+WKUfGoQN= zZ=g5m2AKLR>UuZ(2+W)<2f&5F0DvJXP&EM%r~s&@7B5Y-x(~}4V#@L6vcMLe-UccJ zQc{1pYMeX)7PK4$HYB+9g3Xj6UM3VF`EE0okmIT2dO)YG;ay{)!I}a-F6q^jlSJ2k zv#0jHxB-Cw&5yhjx61!61hBYgj%H3+;0Qw*N%s6f>ksK59PEH>1B*($5P^XYJqGbN z$LFbuh?nJxEocXUZ_A{?F-JLXj;b4%Z`ALyFaaY>?&|Qh)NWR>2EQ}UMvV$kLIot@ z#4Z9glQRm#G#RJk{d%4m(!HJGZk(wZX;d*Z1Trr$-w2ZUyoNg2!e|ei(SK 
zsyF*lQBHIh2cb@)210pjdFB+N#i^Ii{3n+I)*R_UDLMy+3Xs4?PNV3DRA?vy_E*xt z6oSkg(i=qf-J>u5T5KdtxIJLM6Jss}$h2oHKr0sl3oRT9cnRP#0Kfz$=Qq83niIcN z^dEShf17p>Cu~nuH+xXvE5ts07@e`n<2V+U7~lhhfoa`&-4l~Yfx|90%H4@wkHhVK zOG_Fi9uO0X#GEK_{fdDBQYF2<@rB1zd#&!H+4Fd+t*yOCye)k?-r z5n!kqt`Kdu0hzFB%hN!o$)te9cw$maK!t6r}5nrdd3+vL-hF*o- zXWUoKB*dD>%&_tM_kj^UMg2y>C5R1eyz%3R9Y4;=-w*>3OMZ#P!{^|}iQX4Q=KrWu z3P;bsxUqID^8ij|Yz6#ZYDTzVh5n_dR6s+*`lFDN`=F(Ee6Ex#Xw6QHJ4eBG!-M?oUpsL-dR5 zU7|@GLji(1V_K4M@f=R~+Q!BoNin9{;mUI|Zkp1|913~<|7dR6f_P!ozxP!l`EX&x zqB=$$Hqzrzl{)L0vSJi=}JqHAk$It4mcm5w z02uxVjDF+)F&DHDbs%IYB-Z=pf&uAgQ=rFAfL+Xf4p}OA-nr9VQT8qSkNK$ARdMml z0;i9vGA7{1;c(oFd!7tVf*d2^}cO%xyFjn?NOV(yMMjkrgl_TJl(qbkje&JA;B*qZyd z5xos@!^fVIur2LkqC0X`SnY{w}=W{yyjgWM5xQyNzo+CnJP zN4aZGjE(>!3AI;h1ic&E!rk@@T7H&PDI+7Jou4&2AXy@@ztH>r z>%<@}9V>mq*?ZrX2gA!ZO0RFj@!}L|HOH%Z4_s0F`h~OW{jd~b7B!m)DGG2A5k_DI zZdtvYZYP67|GihmSqGm=IM9Ob>G$4yFgxUp)9SRqOPS>s~|JxVf6qwEU!*q>_BUIK`?1WOP2glQD7-kE2^K+(%mXA)23 zAOamQO-DS%>-X=!$I>>63_mTcU*g!ZQ-G6C4{$Xd&T3}pF z#0x`1O3GVMgVEAD4paL2%df3!77ON_6Qy~)9Jln-k*mJQ(c)sFa6o}F9t(Qp>)t

h~@o`nme*i4S9C z;cPHG_(?dd{rd<-T3YgE6_ZE5aZ4E$`}!KWzx=5bsmjIY<|Qm=|Lk{fm*M5%DbrF< zW5&&&dxFxg9E{uYX5{NV>-Fnv=H53mIX8-kyeng27TL1Y;qjOIF7RTNT4hzAfPC&aUwY+UZbc=-$*T~TA%<*x7;_vCde2Xbu9`MvYpM8#YUyh6g4$( zi!hI_5&Q7nS0^UoPLhR@g6~f%`#wg8n6AQF_s!d>M7d8ej!x`Uf7hvG*fGyYfsK%961)=LO+Z_Ne{L<0m#6PL|u*=;~s_*0-mf zOjV_0qTo^5a;i{^=l87I^UAKm!CC9)XR2=1+-az%8{-B;x+`MuR)5#7gGX8SoV?Pb|7f$0&QT{h#uFJGLu@p=GzB}&R0w}Z{dA+t ziq!IQ;ctdp3vowN6K@HAdR{>9HQE?cTje@+`@webEW1-;0BjRoDu6bezu=4`!t4kj zid6eAJ9RT>9eIV0n^h#t?fLZ8D!-WN1U*1bP(`Gr8(d~AC8e(WZYYT4GjjqRJ?-gJ zVxt3}N-_e^)Xk)aQ8#!jBvO;K8<^=J-#{fZZRoD=ZU4Gl{u1gN`)}NET*z?~s#r>M z3>Pt_ui`k$$(A)cV)6I-=K{r}*EUkh4!^58IniTqRC{S3qtxSj(!lK;Gbm{xeT|R* z{OL~Hz2ly5TUXp}Y~B3xr29PmiS4r)k6KHv)VAJ{&N56-v-C^D|pT{-d-R2Sa|fzEEGO-)FB}U zoS*)jPc5(yC^&=hKhcnJc>dZE{lHEejr?7p3S$G+Ji5734^DmWF0$hb&z{xFd~&|d zBTsQ_K*3$0u^4ofmK`4GJf~QZwoBED&x{@?2a;w8kb_YX#cKgsMWu?b%(TGOBxnoxMezr{IWgigR(c?u?dZT@hgTs-#G-a-m zFjc59TlvE+?6znhc5vDEI@Erm(xC4X_h;P9$0GtOeBTBs;fuAjFe$6=U##D%c<9g! zn?r|c>^Af-;ud%;%9r;GMA?O~pML$R@=@lZST}-uu5(@lvtzWnA&$+(cfgXVO0KNF&&d$OxkIN8&5pD<*B0?gwxEI#7`IbAn*?Z}$V3SGplSz=V zZyC;4TDwlLkAHm6D_tRR=z?z2MV zm(lrpOZ~J(^@h(yD(Ytwq~;wvLVXnsIS*>9zxv!Ync}s^3v=`c_h+ioVK}BYDrLTc zA;9+e&oC%P(sbQI@_u_k1Ij4DC|}>rzSMS|&`-a9Gu(Gjt?^XV9^_n`&(40dvPrDz z?cF1A*3N7%gIp_)mpBZH!7i*p4}y3{+>w-#d)<^4nS`ojQV-aB4BWZ$fkw z2w~0-8ZIqy08S@VK-}~xBN$y0-{%VO0*TcXMhACNmc?zwfA%=Q_|{^VsG;EyYR2Hl zdRqf56~I9TqZ-@p!r8R{@F$|_2ZU?`6bWrj`9D1^&3d8X+S_z=_SNuexdXz4|9%bl zD)*!Jz~mDK{c|8rAPbc0UwQrs5E}pEUhxzER{n)KR6&B6ZxOQazpN>TzKZua6>dQL zPl)13L;800rA=RJrcS`k*?`T*k5lAz=Ef`2UusV)YQ6uqu?E})OnF|l9Rv@ZC{_>K zF_I4m5(H)nV`Ym`VO+i(ptQ(m>S|ZrTKRDNX23x?7`~#R)dIk}y871vDuZojE*%G$ z2b~6<&K=OXFniyAMIP8+T-lC)`Ql04*+U^5ywW~g!TVy2idcr!5k1+eS(Eyv~stZTR%*!!}cXI&lho;n`VdV!Jc=#y>M7W8GJW*Ko@q&wk66EhLaH2cc8w00yId zmjM6MgX==vCyR@t__--uw}tb1l^p01&m?aEfGdLZk+Vw@rv}KjTXLnLoR$y{6 zH@N{YF$gXKq2WSn0%84am-$6Ua#;Y9RirwxS)GWx(*2q+g{Y|L&zC)E>kU=O2Xedi zr?blKduoguf-DJH00=mMZ5@4mCFEZa00BIjAwOobC#KhSMO^wH`Z_|tGNa7)=s3X6 
zg?Pt8YQgTcjF^TwjRg1cOr6JlNjZ3Pxj{#vOy4oCy6kd z+$fT~05gzk;54M>$LJ=;%sl{mq(ib=2Z@i-M~`io+_knh!Dzn$%=tCPntO{RHC$kLZiLPYmN0HV1c&{4S~Sok-z+m+2jv@=8YY)^0urZPrWzp_U(1Y|9syq9)nQ^X2k3H8Wl4z0Wg=n0ZzDJn!tJy{R#<~&&#$@LC=9{11HF(Z6nzkq4L); zY8aBbRk2)KE+LRA>lhDBJCkU|&kgW_!j6l@z6uMh|mtQf#XCI8=T;isx&^LCq14jwf@cHm=)kY9XM8$|KK#G4Y;!hiX`E|s0&_>sYm+8(hZpsK>) z;`M@E20dm11givG+HD?}m6e4_P%XGPS%OUL>~zdAfiCsDNjaei|7!e*WCo6kbAS?g2Sw=_*%5^_wV0l-D?`o8x|d{1h_T04%2P2EXR(OS4%^RhvCSZ zuC6kr1WfjI_4L}bl;?y4mv1VohBN6>Q%g#jm~$$eYD{YS@@3?Jk$Yv%ef{0KOMe@w zE}k#DDcNZ2yz2S^=@G-zc?jx+T;$?%38A4w>x^MOYg1uqX)Go+6M0*M`KOtR-gX?W zX!)J(Q@$IN0Tj7LxeE?eFF-JpeSAbCMm1ufr*p05pFi!z3WE@I;S3F1z`n9#{nV{j$}T`GU3Huo(1c* z2)K>CefJJXMO2lkMWq%E<(q4d$5P=A$ibn3>D;B8pK;?|de^?QSO1s$jp+>St*NQb zn?UNGKRB7o zP%)erAP^=ixr2jJC#ph|lj%pBc6cnU2I%VQI)bu&?43~-{un3^_}$pU2z|^5+pFUE zb&K1LJ$#1>1Mz;3J_{Uh;09I)#vw=;4*L@0_0O)I$vRCxPC|r`NojuN%IT}(8Aqm* zB8r)I($Y4YUOjmTc3DLE$;8BTxaL5y?z7H!NU3L$c00gH9U*19!^~_U_`Q*1j3tia z+v_*NG)}Rr=FaC#-K-x}R$N?Mu#qf(Cw8a$HP~6W?QEw6d-WK1)@16KUlGpp-O2Pm zFvDznxohM1#Xg~Fv29zu8)HUXq^|kYE8n{pYM{OrB3uizAQA`Nyi(7qC_fBC2TpPE zBK@OjA3q)(8lJ9wBQKD%rU2B9)OW&#bJ2m;8+T_wzYgzJS7B==D}TG5l9QJwMAaYv z(F|IO0>R^YY(0^Md)r2_}#XFHVhbosu<*tYKwm zZ@~qrj<<{mX6{mgrK_DX2kS((Yl4(lnQJPZj1cMIt)&rW@L2KHaa%<^e5ks(+cu9` zR6n`xY)9F46HZ&`9mw#SkXwlV*KLeQNTS!=T>0QK7?*l_ddazKep!Ns>puRlMqg+o zQT?Ux)OE|RQ?##M%{JI-(oOV>as=J?6+8dL)5DO9sh17%QOqgGlY{IkBPZu8*elX= zO^1#=>Z}cVPs1T%cK3<(4i=X8I8676ih^#CfO;P_a6n+-%gkj7dvVZO^!0Pyi>mdE zyq_E=Uze3t6|EtxV9jwc0@>aLl@WGBPsR@+LC#~wr{b>deP!`L-1XWfA0tr5LkaH! 
z?x;vUg7X?Nk+bJ?yz(?*Zpk{kDD4^aVVCWU=@27br-}~#KaacyNs@`i@RGy z|CRNYQ|?*%l4~noQ1`bz-P*;Jf#F8f-13bsw#C=$ZF_dQFGeqhU7w`dF3p*DTaa;f zDV#U2gqA~OO-TP#{9TYw;Sq4Q)R7CWVPly3fj8azv7t@l#P=I3^T*DwB@ZONfs0oo zMzzH0a_jc(B>9anQ@p&^cDH0d-E@7rB?VGTIr(E>e$H=LFP>7<4Luz!s_onwp^YQP zwy+KeBN1RAVBFim<9O|g35`VV&Ub!~Mw*`UK6>=%VYc0d^_Mx>9^~#4$!H$09D2ec@qvF8jj=3cbogRQ64xX^saqw z`oSR~I^8)qk#({rA`8i`UJx3g1L$L_2Axd8Il0zTZhs|bm$eW19nGeu)q7M6Ro3_Q z^+ABsDbp5=4E;u5iz;JY~Nb3g+H`b6v@n=>v3g(?|Y;is>j)IZaj7}M7r)CQr5C53UA%P1yJEO5^Veb?)0#GC z>26H{WE)=p@H}Np7jJ1f(fqLnr?mt$N8dLST@w_<$<*^gUNx{H=GW`>U12-w(<-iC zFSs#XAIABpd#&&vJBv;ciq+MiB_bTq#VlRZQgc}M|!$$K1^+KuP zyvM{En*DxVJzHhAT5h9p^cEZ2o{@8o`4#;i-)xj}jh2}UbiCv9vcBb0vi>x$^zvwl z?AA@fkFx}iURST_nBN6b6r*$5Ky>L3qgG7f>;)9=$vX0DX(iP)3T-@gOiV}=4;H*| zG*y~+#vUEx7xD`UIqiDx&PiI9h?p@mdJN*KpFgRFh97*W{Oi(e%2vIn#WbBbY$ILt$Ee6%Sx4ji z5|O%AhwUXLcOI4tY^c7%1Cl~uAXRrsu8depM}yBl4N-8Cy3ZjR-!1XUxwL@Z+1aYK z;?pO(-PC8!le49d1@t>qS6Jd+5{(jZdW@f&34@&muc7mmci;Rw56O4XlI_Jm0 zT5TFoy{@Oko0nkt2&CE+4q)#CHa#v8f2wA?xeEyBAID3Ntc`8HoLqUQd`3^N`(UqAXS_IG!t=d=D> zACjUPqk3q|#oc8)ZN;IZW%a+?D?GlQ&#_W-J>?Uc{4)1j@>Uy-UZ?C>8s#~0&EuJd z^uX57W@8nCoGiARO+;7S#6s6!UH5EExr|<5U2mR_sEp&Fon)S?JyR&HTU+TTSIVF3 zpB?$c#94Lcn7NZFh^z2UG0J@h^(s7uBZEx}piYziJ5d-LA0#{H6HTfIWCWssg|q>- z@|UhIAIFJ+L7tu8EkQw_)A3fRGlTxt+sirEyq281GQ-NtD|{Zt3LKL&dc`(7XWd$1 zZT&>QF|Bf8!Q4(Tex8<1z&~Wi;Q38Qe5%bkk++BY9?4rsQg~ zK%O&$8O;`0aU&xmNA_nweTsk~=H#}^7uu4SxSjn6qAxm8{QyK^2;(|<>s6yDy2aYHvaIY8aO>} zFj4f{eqf+uIqis}J=agZf|}o~begSYlmqm1)DvY7ZA=?dug>?H8Z>%raNBL@^z28k z-~jW$F)QntLsop}W@6@e2e^MOb^LMNnDC_J!+tKI*So{`H?*wZ@p_Zp88+RR!ms0! 
z+3SyR((HOa&(HEOAS+9dOnE^i+b1lHNq&$G?uWL&&tMMgbdBlcD1~rLLBT;1`G@&% z81Gg{XrN#*O26S?3U-c;nb{fAPMviUYkEvwQgXm0?Q(;0aCUY@x0L5g>Fg7Bj1%2_ zd9klnUtXTNWWgnr9vvNgWY5kmtV1xjA_Pb)u$Lh>g5hT=c#N!k*`+V{4iCF{lfCD9 zc$1G5Ix9Q_{gdeq*Ab`rF20Rr28|_i<1UGt zo^Als-2T|lsg8qe8}y!0MLip7YJ|FB4s@)UiNe+bi#L=YV-glJ3i4)Caqod>G{NwY zgmbOL3`GC=b{&E=lpBdw9z98L!!Rmf6z0j=nFn@RMSNir`pWdN%;l0`cE_)A3Toz> zWW}HLJU{0<|Ea3F864lU)^yXaK+u*d%Nh*JE#%pPvp^JxHT8zbK#=G^FNbf9W^oGoRuYE(&>~S`_Ea_liBlM|8`qFRoWrC=C#Pfzx(YhxC>?vZ@Zx$t&#oK0q% zXnFDkudhqkG!$yE_nNHatm@EKD?YnD2R47(NJII_q|)q8g{u3`vs)_e=iGkvASa{J zFK2(k2Io+w+O_gO(?$bMR<+GcvFjfx=zsgP-fM%#x#k}BvpzY#JsVrOg+lAR7ekpk zQtL-+P8u|hZ|~h);dr>J`1R)c!0#d1Bk|O~Hs^#H9O>Z{8g4bMoZ9|XZ|GHh;8$g< z_Cfx24g)s^&eb=#&=knYERL6X*GsIE35y!aH0W>c%f9Be-tgqDuj4oDU;AXYPG{VF zR-hXoXD3;53wfgau!M$P@_G105S3VdKFLt~Z(Y%A69JsZbY% za09T1IP~<%`tl}h6@Nu&cCFJmv2B}S&EMa*)6*UA@s_TD=}QCS-zfl6=rc%4460Hb zdP5Kg!K3eCIqQa#mAZ|BAfp{@5Crd;IQ?sNZl;{#;e1V`{D)ojHo0yUw)wvGD2QnQ z7nBaMfHjC3^`pZ|B_HBWt#U?Z)9eyLa3|*Efe6$R0D5$vcrByB0@nj7`s{x#i&%)(^|= zww+$yekaL#k*wYa(KRYqfiR(>9w zvvwor#7>$qNyzIzO9_oys?szuQPDHlh0u8|TAy^4^YC);$yBf3v7B zSy>m09o>+3Ze?v)v-ky7Ze~dsWYGX10ev2lig0pqNx=(6z)@1NyUy6}URqj;jE?SW z&wKByLQj*dUC&HY^5KKfbAZS{tsmT5`O z#W$aNMFN9@-ZwUqD5hr9$oRbw>25|+=(O+VzWacu!L#te)X8zUpaEWN56h{JC_+CQ z&ikb;tqA}PDAbmwF`0K?Dv4}$lJ_Ec z0|1v}tl?AXd7iDgNE%%xv#8nI&d$YWbFzKOjGsV#gyF;tSA1+Nj%NiJ-d1)hhbvfH z^TCjOoj@<+0T%Uzc=JEm`yQ^_ru~vv$U#qb@0zja3cF!hPT$|pdkpu9m%Fpy-gtoK z#=Pjzm)C6ygp|QX{zqr(5Tjf;}Vyq)-^p%n@i1cXE*zMX*G9Q&2+V)xTkges3`dF7^KrxyY&#E+8t(58Ib0)ehUS!@>bz zxB?}=aq3tQ7C}B&|AoMzS1A4PFJilBUL+VYXaPi$S9ZF3RUy4y3-8Pi~`*cr0LjL1MI}Sq+r}*2FdP_JRSdo$o(*pb_-&Y(sN%YEI6?r$csf ztAg1WM1j__JCYFk{QUd?47|t?`>I6pdMWP=hAdO+6(Xt=nlA*}UYtr^H~TpTDf3Dx zW%ErBL#rADAqIwH*x}FbfDVIgOy->+pFt>cYV`2|w{s-=0AmOAO(|tT&N|q71@O-S zm}lJdBIBiLZ`N?`&eVJf-L5|{4+U8XbUb-GyQ~>`^Nv2MQ?929CJWm9zI}~)Kip|@ zcCwXc`QdpAT$`8HvKE}$sWi{6bmX0@7xGCp@Cud<=bd;oVShQ#o;u|Z54`Q4kBU%n zo@mEqjqirDq1p7&j}Mx=p;;j>4!EV%07{Q(f5x)~_+1pc;B*MwRbCZ4v1Jgz>^r9vOh+YxUM~(U{iNL 
zz{|cp491m7PA2K?VA{4ovt;y>}ED*2=`y=_dsR;rMFMJUFQ>6$$ut zAQtOR^=;%;!zuy)1EV=m)aK1V8Fn^^*QS~hfxE?h4o@U}ok>TW&>MI>87N}_$$sOkubKXP{@Q-W8Z0cKbN`h=XvhuINtZ2 z$9*L7a!d0x|ig5eUwWR>$G!HS{2{e}#=>edLh1xEbS)?xYJ_o~Tu zR@gN#r%b1eMI#Px**KXYclh^+)V znqM1^^ma4i6u*b`cgvP7B9b9Ph8is%tbo;}6&9V_?AZXX9#Q*MIDs(G#43#6hAMGe ze(KYmt7vL$5b`W6z%Ht)GWY}iLJf#OV7IB7Zk>44W~)|>!NNHqH}^OA9w?3aF6x>; zAFf%$$e`LTG-Tk*GV*BZs#Rw)+`1U_7;;(r4)R|b6t6FITOdAFz<#j9&UReda#?%P zi7-wLc{>&2HF~lki03!Xo{~R*t=*wT^mu+7;lFOSV@N9m&!WAjRPA}%cdF*6LFgp} ztC6mlNS;_8avuY9?=;Vk8;c7VkrysrTMSeqqoOwQbY^w28#n4;>@RrZV}>ud`HS7* z&zk3lV-UU}5&=H|K)2YSMj=#kn|KwKfjs_#)-1ZwfRE4~VG%EJ= zh~{QdfnYvxYDt2j-e|eaa!5L-)3Hs<=4;1l%Kw>dy;txbe>OCC?)pD}Jbu8DfB##4 z#qHPZ0lN9iT109?ZXBqiV;(7(l}E@$Y;m+rO*NA3P@%TfDQR+CPo2b+Z%PL0B;Xr_ zE^4<@>A_sNG7i(5%E~nxH(G?hOG!9#B#ic{x_ko2zr{XVta_>4_9Z2?BuzkTp)fQ| zo3@#CK!Zi*EzF%;HozsQ0T+s}ppF$IDe}-%3>-StsIbgHUq97}&H!bP+h#P-Co!8f|KyJadqzhaYycx?j+RJnyaV$Bib#cat zN1m>0Jkt#cja|Cf&YANIx^?s&HV?*-YH(TSob=QpW_}#rxpP$A<%N;j-Jk3}_2tp~ zMGCVk-08UyD#jy5+zx0waMNwTfC13mt7>ZLok`-sBlpQ#tFf5~hvp^DHBsxevHX>!A6}N(NgtJa~ z0D;#>PvF~{4H!7^(N^6vfSoOn6LiAh3BVPvL5~-+`@3nAXIE1mR+2a)_|s#aAcDW2 z=?#9jel6V|O!j+Ri#Ls6{nJNr!PHQRH6fY^%F~u((E~FAV;E9N^Ys#R`%p0k?!`Ctl#}xNskp+Q`Gj?<{eLA-M;))#qH5DD$>DS<40DFud zKmJv1YC1`0i=ve)uty~Wlt~Q~0`Q01fm>Z|sBIZOHnrppg!)WI^%4%q6r532hH04g zind1aOYS*Z9*9)fG!J<>Q!#*1VjGE#2z@ej;u{|FQpbfyX|bQh@E1-m4&{SH_Xrzh z&r-X@jEwDs?x>g;caxMEI-k!reKoy@`4>8OA=_|#k})yKyFi{e)T?F;k$s9HSPMUF z)M`Th0^|4kdozrwCCHgj1Y<(G4Ev|^qd!(mcd?srBY>#0kP&34Y@=vxN{!DIo`eI` z>|_VI1^EHgm#PuZYihn9%h4vI5k$Mkc`h0)24j)ih?*ATRE}RB?F3?N@S98hYhc+# zDHqU}V)pfJ(%LG8uCA^yHZThda?Qk$@Rf*Mu~H_n6PgbhxJ9zv{Mo9Vvsds~KT%wZ zTr?}I6EWB^w5BiEzY=joZU)708|dy}8N%eixQWOmv-ep4bhP(#(ufBi$_o_DB%@A{ zl4ZCn`-N71$fJ$j5H`6L8_Q*L4wdcRA%e-K#*b%l1@W>~z4ebiSneo3J|aFoK2zh| zd)LgJKTEHodiXM}7pioqV(1H?*Q8b84;@2{62(P?w}*?VGd!amyLSEV_sAT5*LhUL zGChh^fka1vpP#&jwz|20K(i2oM7kvG!8dqbW_p(!iGd2YCIexCwf3PMv(XuVu&on0 zH4A6r*s)45`C+3}J$jVS1;sE5iI=7e$Ke7lBoOe3P0xyx+p-a>_{=Am2rmBX{4bIN 
z@MyZ5ZUDZkVmJ;PYdy8@Th{1wJ3I5>;GEt2&bGkBW-%?=mfwB%hN-k}*8|OB=1Y&tNiYDo|$^U3N71EEnlN+LsgBJ>&56_H_U-_XsdW;kz*^tl=cA9|P;Hn-4c5|q|a;`Y-vT>QjZiN_tm+!5+P;KHl4E` z7i+k~p1ivUE8g0G?~2VnOG{=kV`d@i5NZ4MLFu{XxE-;`dt&zFQEYV@ph;$A_FD1E z%;l>J1rHM@D6+qE&E!=(uKMgMu>^f154f7s1|jW_!-t>X$K}qi`?xvCGvhdt*-Kn) z#p@L%Jr*ZU=s@!FiH?_;Kf$@fmv#+Xbo#VuW+Zx00U1NF5&EDEFC$8bg8g4c zs1!MwNbf;OFSqDm@&^r4oO!bS&m|rbYvWd}TD9w8bY#=u7j(}(JUrHC*)zM2+34SO zFnOgU$BB@A2!?{WQ=ldcE?wBaS6{Oz3{LvIe_lS@cy(j$>pS3SNgT`r4Gm|quXh;0 z!x`AGpJk-B6%$d5Y0hb)W1q|)LkXYBV7jw(bai)WTSbzF4rcG~>Y$FIF(D^Hwi@_NIL_;}wV1$1b&M}N5E zBHsoVGMR4Vq|iJ^{%E|aJ_ik^UJa+6-A_LqfA>d=4iio5U#(Z1Wq#}=nmOBd#KhdK zNS}J64caE%@SdY)oeFreg$rAxryAO|5t&Ycf{?u2{p{{xuty#Y6k| zn}-tClhOQie+$hk)*$qqTMb%myA>;P4IsI>Grq0x9g$*gDDhSMn8W==K?4R4-ht)v z7%3`(Ql`cXv)D7-cQ4$WzE+XiGiS`0NOcwV?!oexR%Cs}E)k>qtt2oMzAa0e+y6eR z!j9UwOV6G|+G~uNXB_bOXT?fHam83b=+rDd^>o$Bb(XIHSSPX?capcthlX4BLGLmt zlr7gc&r$d4ZzHFNVxbvUUH$m6I6Oyjw$t_+YQ;m3!dt{=auY(oqyu|Ht#x#CBEi`D zhW|LS_)|E&`yCij=&qzjv2oZ;wqT9W&EP|UCkgeh3|!DkLn>aFiCW(Z!TU}Ui2ib! 
zOj33lF_?KW&LGyHAd%Xa8fx}!?iC4*@j*rUlp7O;Oz4hSv+yiK z3=I%H*U%B(iECXl;b;>1?68!J7cWl2u_qG6Wio5s8uui<%L?B5)G?ovvYqmqbKmap z@`^i8HPPy-s%p)Ts4lnToYnjJgS^w1F8lJ1)6aad4y7!sn|yZf;NxswcieI9%+1&F z(q}k24*j+up^NKS*%f5%PIy9FXlVOhCOMX|SFT=-rkkH6&j5U-7$7W(7IcqWC%o?V zris5?*5k5=EBlW|DJZLtPh{=%<3ZZp%kfW?RRtnF3BL%1BO{I+zkM^OepJc6tSq}w zUb$zHdRf5eIbIwl#iT{aWa8WuPufb|Bse3Rc_`%AL%L=rm29}fARZFiGG$F4JvFE+ z0;*E@5p5Ytf+f*fq|wqbZy{5m?Cp2^t7eh(VLBFoy*=+3>x?8$)WQ`E0fT*Z)cHeD z>Wr7={6svVk;h5oqYhD`=rjQi;BZh+2l*{5xd}gnV`=d`t6HoP;d*0CHyvD)d>pP~ zp(nRC1x^aE>N8)4**knFE{#_VOy+s}5hp57g#&FpSvl4k z&HlJ|Zx8gue(CEgKgh&gQdyt+)2HBADg%DV`{PK)N<+$VYTA&xv=ijuTCi3~tP!y} z<&FVnAkk`;b;z4KhpJ5pyG@vU_miHFi`0f~BL_dpk>rFjIg^^J@=KQeym^ADqo-~r z)27IVGQ-+mTDWd!#Kw)SSo}sl{%iJln*a(}MZyfz95Y4?^|3d(z?M}h^1g%r+JI`~ zQKN2g!|Kng9J<3KX@l0CfV1a_3Ka;im6G${b~yM@D=z<47Ioq9SLgS$l0CmtJ!bCQ za;MB3=Yqa%D=|~waLv@ccGU7q{m1)yj%lt5x)@E0#9n9v@LnbqV4X0o&4xD;5$Poecv$J%o8RW+PGbqcVB0?{nlZrYqV?ms%2`Dk!s$9}t5 zp^OY(YC1BP+}zx-<=?{%6!<2XFx>29xaTnkqUnUtP-ERQpL$cWlXN(E=VWK!es*MX zYDuZzi$^bBOj*1*Wtx_1wwGnNdaHWhuyUM@DVM77Kva7CArQ4EK%WKO22oT}o}1N)K5S{aG$_b$XQtMi$v55z%%LcJ zQe7=GTnO)OEC&?Xumpu3>_@N!?WBDo4gjRx)H^k>EN<4Hz8CVC(4z#ZaZpisg6T!i z+>RW|`RP#+E)D}?&*CdjqV+dl3U~+$ws4jG^AS4w`fvH0NfrZ+oLI{p0$1llLKoi= zEJF)70V7|4B^L7a`3Yh5h`AdF8o7M>gL-v`7$9pKoi=kp3dpzcB3RF`EApfcV?M1R z<=Rf2r{3x{?TEKjycrxYBGaX2ue&-uGU>ZIO{>U`PhaW*DN?Gnm+=Oa>%wp-%>KtH zQoJbdvPgW-HW<;7JFwvFs-UN?dTK|@3uDFkg`!+4h7G_G)FHWVE)QD4)WsDn$-=kH zFUn`-k68X;FiB*Vzwe}#u3iV*Z!IYq6~U*k`cl&$KljTN=9l=EaCgdm5!uiH&SE+j zLZ9Mp|k=LOrz+!LKwyNBaQEAw@|mv<`umj7^M!_$SvZ3RMu)+z0hXKQ_i@t;9f_ zV)!J6+TAfr0(hYSe=P+cd_ytf-sGAYJ1|l0;>*F1C;@DiP>2B5yrMd(eERgeB9*ae zWa>nsfrADyBeHw$*B7S;&pm#GH|C*H78U5*1#d!LZMe(5hjA&O#ptZxXo z+%a{wLitR|BbTJqp}(i5pFOT{Kjr{(7<+rY2hsiW_0^3o_2n>1f9PZR3V~dYaVCG? 
zKP@vHHwo0wsHM#dfa_gF2yy5U;>JFH#9RdD|VsXE+M)#Ao5dimmZ^suS18xEP^ z7dA6y?C?K6pV=zZLt^odtA+@RC_m*Wq3==ZQD7XW0^rJ7NVlZ$!?e+ZfUi7HMxC7r z-+v13e&MUj@7`4`D(W~Z-O&bu4No7PgVf=j!O*0g{6LXfXTQLdW+3m$W&Yth`}><) z4bk8<;g#kiP+)e$Q$}i2d1=r^stPu8FV^O;ZYDixF(ktW;5_{O5ijPKZ}Rr?!iUE4 z_rGdCOv~G{{JHAZl*;tZKV!0p$t;6UH#77Uj2|;A8L%RQV5#53Qe3T*i|-VsmcKgI zkYT*2=vWZia~sFj`Og#Yb^b!10cvS~+)?!Q1amawDJ4}DxmK%60*IKf7y7Ljo2EGW zrJ-r&HnuS55#aBhBlV~dh$?qKov=HUgsTBavft-7uAD%0ksR}~?B{#u>@%*P2@vF! zl-@Twn0gY=j~_~y;6 zjf{*a#E)-^TaCCCW90VNLX{m^(Dj8?Sd;Cl6~p+J7s#KF>S6!8Wx75rywymvK@$ zCC*9}AW5Jf&Ipq)&k`+NyZ!;eu1Akp^|{M%3xUvu_>?ap0V6bwbv!f`J=V>_^qy6z zPu6TOL(yevo32(ag$Qw1o#G>HqFzJ6z>kj&7T36L?#Vxp~05kTNb z#9%;)2Y`+_d#cir%)uF=C3NS6uLk)i&R4t?8NdNyxB}pqj@!8rcjD>0p&5bS)r%fU z4Fv`0T0&CN2AU@F?@69wmv+qKV3re7?7L|+u~Ce??yNz)1cL~}-DXVLmh;5#(Ynl4 zCDtAgT1Mbs!V@Ct^TPrYS_Uhf4v`>=VDawle#&Ke_v=ujnKOD~q)gMnN>_^XLSxl#-NdI-oWuE4 zF7`k|!Wsl7qE8ji4xTJ4oGE}9?&t7oIZMIX5>76NPxyhFK-|Qn=CDpa{JIf~cB!y{ zuVvU3ONF}PZnnE9xllB20@&nR8T>&xA4$J}LelA-?@b0d5VrqeX>X!Z`llDXI;unp zS#YFGpZmsDBXWTv=(=H3s;Q;5&uq>SsP^=!0%>JaD{+akM(=d!5qY8Ika&tANgc^3 zP*PO_#ut7}pOsp}h(rkMAa6V}YeF$Oo;H51Y;IZ8h?)Qs&9U@rtPf$rz@i_kq}t&X zhE1{NFy5M%x3c|`ttYHWSW#J2etKO6d_f+4o;(uXQNe`^Pe@)_&~!kx`u4r>Z177O z4Kf@kFz_v)=FTnVonl+ZhLIM(Av?MqbDYjM<%^skm?rifF=97QMTr1Afvpst|A5>; zEM-w*LmqhrHqs&>;|0m7bbxTO%<+5F9Bzy@wC_u`$oIa@u&zaAvTyc%lRu;+A>RmnhE`GFBCH#FUtDo zs63zn1in&0#RS}+F{N4^{Fw@PB(A%A z`6xJWTiBwMExF*|vMblHpkLP!FuhQ2>&cYrZ#&B{3x&uiEMOz_2FMS8gCG&W&2BCx5*#q&D>W|MJ(mBxT7zWRV zcTJf41e-2=89Gu%ldQ;_pb~z}P?3HL#jl%tg&w?CH>Up8I7-V42*qST{IlDKIuqsS zKs^Yp8-ib=QZUDq;5}47a(7f}A}ndTr@k3RNGHegYA2R6jWq{26vO<@|~O{ zHY_;vBWDBRCQ!i2wXJ(OQ9g*BQ(5!{aTZ_!g*9YwzSGF*%&=gfwRJnM5+s7Z!GmAP zi))AuV%xD`<@>JOe-ggCI+u-O0!1jq+H&2-9N#~$LSQ5|gm2T;HCZCDu2%+e?v zvKn~Ap|`oSw>hDXI3JWUa?~g!YGL}tpaL?u4!vIavzfCawFx43IpzQmXgMJ()JPoZ zaO}is5YygPodD=qhen?-+k}LSBrU?`L51v0z1gF!Tem`WJ{%N|l|4&mV&n#gw8bRz zh25=S`6OB$(6&Mk>d(*z5(PfS@XwnqhwSo**k6+3SVnu1XP^j;3nMas2O(~j;$~1% 
z$=BSEFCRwCN|n=!Z&ts0lk<;u7}2ab{T9*v;yF&gpK2JiAt(5CZ%!AP;Yi0m-zkAf zBU3K61Z1@QeILg*6J~z~H`CEk?$9AZ{p(5g+XyH)WQBz#hBpi6jwyL?=a_?cngOAO zb-P^~qwIwgp4POBF>RN0Q0;)OV{Q=#3MHr(AB6j6)mKu4zX0H%Rer~kk$I=WLO9-G zeg0UYAt7ChY7A_dh7yt0r~gb71zDB~OW7Q6H~vv+Vmx4UF*cA(2k z_9Q0uA!=}#fq@Dr$Amsan?y!H`hl1SwY-``@zxh2kdjAl6 z_`kS8Ysd0u<fe81I-cY1`@k9V?U<{@AVJ`Han}pZjO~ z@5@m3*$og)!8!kci~8D1A16*K$zDLbo(|oDaE9i}BESFt=52@C>`a&^$XrsS+@ z+Uy5Wo%~hE9V`c)oE+$Y@G`yvCwhU3IYK-3>h(bT<6vD~Nu}^O-o9hUd3uDfouqq< zCi2zsV|i>evGwCE(|UaP@bm6gS2sY7Vg1pG#@)6691vEZD^2m7oSawG;vz}K_qB7s z$$voTw0bd@lmOj}sEg$?K?KP|^%tIRgVeJ4vGbot(_R8j08lSNkH}}#$3@J&(T@Bc zxZa#_B~P9W!%y^Ro)OSU4usvrqht2in(%O&sEXluar5Lb@IOcPaS1^Xsus8wl{GaX z<>&g$gi#7ZKxfb(92@6F70q9_(}skHhDKj)?G2om&|^1{fYKu(omz|S^!^f$8s@%n zSQzsBXCfouJc3+v{ZYYY=9neQ15(qgJbp=s9-NF#gJR=HrqCD{@#T1*E~Vw;$s8~s z&&RX^`(kJODCn--Wh@BEGWWfa(MEcx`H-qOx6@fYh$I*q#8qlw|9}8VeF#KS!PC6< znRLSw*2c$&tFKc?$RM>?6P4(MC%8~s$y8S~u@mr*rV(2$t@QHp9;;WcCZ)fV{;959K-EgZ%vJHL#oNti z(n=--O%;<+*iFIZ{f4(1vK3zHs>aVtl;{X=h=}MQvxT@+Np(b{uBRq^1&G6we;bi$ z^5@TeYDXNLDhc@6Ul`k2Rc=Kq6b_3}&cdO9-T+!D zH~OmU7DHDd6CWUM4Fl-M$VG{=aTs{xShR+f%qUVVNckEX@#E^!F4FP%e*?oh2L;dXv$G`4~ZzdMybWHFU@jSGtNq)TZQkB+9>td9Q}MxVm0I zsO=XJu#hGkG>lE+s=)wM5FQ{0p6q(cD+Ygh&YKJMfdAsWryiOi^Y^O5R#{nn4_NA2 zd;1XFVUe0MZD}0m)^r@r8j{p4mD$_P7+mLn! 
z^BQFm(WOhjP`?WlkFtBKV|stAdgx%irjLJHJqns|ZP1i5+wv<#MU_C2!)Zm2KQu#3 z8-)TwL#J+rJtse zxpSL#?UHCcU_R>4JsEesgd~p;Q}<{2xpV7)N=sr=%Idhx$5_u?yl`P76Aa|BbF8L) zWxvy{h64s<4v8@>%{#5C@*C*Zgn|)d5uhgYI=sCz}hJQRq@j020TB6=+ znb;uX+wR)}Lz87>y8&^Go9$42cS(Gjej2Nb)!Pg^;4Lf$qG!GiOY9%;WRq!!AF|Ex&`Xq~%1LH#26VvU2L)mDJHZjQ{@zK=$V?+$64GG57H@nk{Mmu}dd9*$129e@Fao=$6)Jgvi$y3z)sRsF}aO=)k})A=l7?cgjw zF1(7ZNPw5Ln(~f{r_g)u-A>tFM6IxMe(%rAsii%9>L(?oHqEWZo6QKXO4zll8@-ro zzw8nU9Q93U%{v)=2f+|Jl|@UH+~G{I4g|KI{OzqFUTJogyvhtk^)#K zAuOVTGY6+RV~8cW{kGyq2KS_Lcme%*t-p(|8BRwN`3< z0g!7Sp5&dqd3kvR-oErXw_lvAtD7F``Wh5Ck`t^fMirX_o>T2)cd{0C8IoCx>36_W%)>j?(Yj@%Co1Z9ntHBksX-${{-X zL5>G(2Y6(^uAO=G3r}ky^dmk6hz%SgrPQCamkvIElurShT}u{G7IJi~tp-Vr%o++h zK04ZZX7wlKHYc=0S>+m9)GTBX0ReIK*6V{AebD^{k=jsFg={A}IcAPloql;ne5!1o z?lANF7^v_4axuW{`{P&YMtm#)Z<`?*Ck{B6x>H>Ew z_g(R%WpPJ(cPUl<6NCmPL#ne>pe%<*JYwAB}Vb&UafBeLWYI?MX zT+O9%gjZ2PK?{njy&+G#kNH&DLJFgpqP8vHGt~F|@P{j=A4m>k7AVX{yS&ut!_Pq$ zfC9QQ&YdBJC}>aYd_}zqft{=n@FJx4&;e+$PoQPVsMDZ5{>m*^KLNd~CtV0iIp%YJ z52vZx)o|a9(1N4IH0;~PgIsD`_vzR-4Ga9J#XM8?;wI*7e1FTyED?e?u2MIHQi5Jo z#E{#3-uaR=)pEBQY5qnBf9Qsqr&W<+*2#J=l?17WK(-jqPji}>JTr;2+ zJNR-VtrFr*HB2Nt85@D+&W330t=uL?8-gYWr3{R|+^@&Q_~kE7hJI8t`YAdUqU^i6 zvIuTRjI=T@jJq*lFP+-Zo(V=J%Bl3!3C%}l_d0w%yD?oDrVREGvNA>HuG7}D6fvf& z8|$AAJ%9h_Z}XjQ9`l@?SPaphwnTeOJl2>Lq}_DG84xmV+N@qBUrlSjl$2SKf|Bf| zWyV`2-js20PEJl~Z=GbRar1|=0>_VKvE)iVyr5_JrE%CN4Ba(Hz^%1cObhBoAUSApSBTfIsgc+ZAvS| z?fqn_&eN;eqmfvFO7P$S@Vj5xf2)S%)u`8LRifd`(TA%7%7W-kPWWbFdX4iPNi=R# zU}9Cvto=R)c%MG};O(qDox5PdRcc3Wx)G21>*-~b=B&{7p303gn?cI`bvgM?2;0OR zMtR0*RqpSGP^!_d*8yXrQhtrsOa`fJV zecNm2I&}VTxz5^z?mw(O_`eJIf0RDo1V0Yi{~vizn~gKS>4N@|2MvjAYQ2Bc^UK$c zZJ4N}q||q*h5gb+&P!d5T>@PBKc!)M!xTgG42KLeoS_(QtZ!hfFc_q#XRN1JaOL9K z|M&%O-$m|=L;ml-khOF5Xnvu$e1qSj#l{oe?OX%ReS>@!x%(_J_YGE3=#MlCRw#z) z57HZ?7rflso`2o>-~W1+Ye1m8uaA=Lc*`M*p~Le=jC#(G2gr|uz5R^GeY@K6|NDxE z87oE_8|ZTbTybecdLX}J^dG6#4 v=D4An6I_Ero!xz0m9l>g`k&uw^X-=Z-`~2>KgyclQL?g_GXAK!)4Klyx671^ literal 264502 
zcmY(r2RxT;*gme3q$wn$6jC8sl@XOqc0$PBBYRU)vZ5q~$jD4cR%MkH6)B=rBqK9h zw*PTG@9*>Zzpv-%Q`Yx;-`91W=XspRah!Lkx~lv(8fF>_3W{xt3No4$6dRsWP^_P) zrovaQp7)`}UzAp7RnAgS6x`mjY_bub?>AG>RH2~o;i8}jxIsa&h%W^UQc$?_Q&0>W zQ&5P-Qcy5DCA`u&g>O)qo|BiMSR?->S7b)xE1R7ZF1q4x)&Ks~zdM$We{6D7RFU1( zPf5jioW(BOiXE3B?xUsarfK52-^s<%(#GCmzniy{#eOdbCkhIM&|K|vdEfcB{urT` z2vk-|^Jjhg++*Xb(MfS>(a`CsH#`%jB9XH_TN@9=%>4V8G_xg1(Ngi1&A)GjDj6OH z!?MpqFSzR4cW_nyd*A=LPM}h^N5MTe-G;BpRpV)*gQCUUmx6*P`4;(g6F1IYu;0~@ z=WDo`D%mt~=V!l?#zJAC*sgL4!@E3pD6ZwIq*=#2XP|o-KXFmVp0eclRI#F9w?pC2 z7}q}Q^RIpnig8_jUlUixbn8-e?#rV1pz*yORC8NuqO{CAHvZ#OIdtLM>y}@m^jts1 zt{8EeUD3H-cs(;9V2EaQjS&wZP9o6KT9PldWA|@Z_PBANJ=Sxy)=>a8SRx3QmB+a!JQ_V$dP9{o-70;g^y*zS#01U^mWE` zb6E1egW2MyFSFYWX0u8I^>w2J8DoMT^TynmDY?#N%Y5EfA%)%S!(JMy*nCk5iVWQu zxjAY2q;tL82{R7;XQ5UGJgO2 z^X`bc-GozN&H46K+J}!FM)-9#V3#@8|q`^j**2 z-?grzr=-kE2iWrI(*Dr8dDzub@^al^W7!-K%}BFpN3I0tOMTW`<=%gveesz$`1qM~ zyR({Ug)@`D!aLEu0$w{;<9sZH5-Y{7GpF6uEje*G?H}u3t~}O9J%4jrGZ)2}BIssM zEY4h|kbS5le1q-9=Z~?QrBBxk9b$@VeSbROz^gl1x7RKYu}x-Q)R?%K6o$y!X=bA8 zfpBWVpPl%ed@vTgc!IwIC=_MRYI${zCVFboTQLSFZS)x0s}WDby|>_+ZUIZgL1wzg zTwGlDzuJvfJgC(*RlddSJAQO7@9VR08k*FAz|Y_L-v9NxYX9})wvhO&fM(b2@}5>6 z1Bu_h#rbyoj*lx*p53{V_J4mIu^eQWrI7hQA2d>Ky0ZWKD`5vyYq;6Tho;m^&+&a6 za<>i-n}q!LQGQ$|_`-iboOWXu8{5al*^zLcJ^%eMyW_U&srv##LK>EC>*aDCKOU2o z#%V6dOG^EKH1z15#2_Jj9upm=w}Aw{G2f z=Y2o^@kqi~&coxx+}vF3!-vy-jdX&7g3n4y>g(&NLS+J_QqJURKi14vP*8Z?(z4Un z*H??1K`lvAHe2iW;9y8pl!itUcho=XVm%cX7r`S(j_hG(HnpXh&LE> zWIuV5@ufjzL4m;H;v&W27$tg|je>Do(=|g=QVys+I`!tmho`UoEiEh##3;F65&xfi zJjn3LC$% z=B3;FBz}*6E6md3#+QS`!?#ex#l>+RJZN(Ha&r`iJZ@O{@((N#OC;NAReO8h<96+} zt*vwt5)ycWz-!mm;m-P(7pCv==tSxj+fL35otKi5QqR^Zy8QFinGK3EeHJ%v+~D{8 zYjC-%JTxwjsp;WKtF|l+teorY@CD0{kD9*Z=tMP|<85LONZ)*BQbRR5Ha65*oZ}&W zdU;|)lK&hNA0HpDVfh{%9i0>QUthOos@ZfF9eMNS&GO<%er;{-um1kJ4{<`1lanTI zBlrHOx|$TcgUw}m-uda%r-R=L5B2o)u%7a|^YkfiXQ|`IG{xvUO6juxuVlJ5wAmiV zZ46z5ET6VZgq8)*WksSbsMPE^9)4CBsZS@h3m5a`s`Yz z|I+WVu}f{4Xa75&eN`MG^hb8ENz&sDkF;fY>)mAm-z? 
z)Dg_bcQK#5%v)stO*2d5@lB_J4@?J6p49nTb+zi(*w|4lo33?RRutCYp#Ap}6?N{I z>NnRn1=)UivU%)#DWhKL<)d2phL^G~7G_M;F&a^``OieD=Nn4AX>3%a-n^L>Vfe~( zoX>B0@~sqZGHCZvJ!-6X^#h|TM~bXJNm+kN4I(dIoE zCM73#b3b*UQ&08AH#IfV)Z4Z(G$)E5dhzNN&80FA1{~!Z*Ej8;DJdzD8tW_$djH_q z784T_ss48{K{y}Oc#Gcp2)fbH(Hk*5IyBgHsU%5%N;5Mvlc~N&DRc9*XI_)0DVpg@ z)!z!O!WXqd%MT$WHcLuMR(~xp3zEORZ}YdcvdiIMy&8rxz$@@c+d#f|*RMI{lRnXF&Gf3uTitHuck`ubJ3 zXJ)81Fg!dwSu^{F+1tp?7jm>2?jP0X>g(&HpFOg3;kAOYGCh|{JZFNW|EsKEcJ?Xl z#-=8Q_Pk3xACgX2Wd%>Y+28(bw*aoNA1m_IsOnLoxTm1cjP>R2N)^PO>!0Sj*4Ai; z?j;U_93C6I13M6yb#GyO?>w!>1uO{!}qVR+<6>2i%gmmL{It7-TLvvVArl) z*6mN3#63oho8pfroW8mbNvXvCTlJF*IZhrP1304;NO!b*_cnb_KlkmW>md?!OaI2L zCVssc{aSE6lEMZF}kfyEALVn)oYxR$+@|DS^ks=-ZIs5z}G0(remNX&du7W~B zWOe@i{mC<&5)x8=0r|t~z6inXupc@Uyt4FU+ zv_3vQh>Z0uuiWjFRa9sx*WuP&TwROJY;YUR&CL`qUc4|iGn0;H# z)n)orj7?3elh0n?BV@Vt^xCqJ`*2%|#$!1||HgoxnqZ-;tA5&f28!zHJ5}O^-@Jdn zC17ssduye?FHR@&5E8?+*RRhY`ky_2Uft2b7%G$cQ%6;GJ8}Wn(W4P>Z?kbwOKGrG zHux$d6t1o=973Gby?z~_ovSyA#5RHBzl)LaEh6BsP9EzvW?^Hb_6Bpo>YAEwrH*XK zBUXQ!6G@#yywfo<$_H=Xn{}z|MAh=t7F39~Cl|u8-@p6%YEk!4y!Z3*MI!!w|N3-%ovuRaDdN?bX2$tK<+;1pOChOg^Q``ptgP@tgZ* zMM_OWg97$MI~iwse{^FJql259Fn3HCK+ES`{gbcUN9tQrq>EEolLe}$Z7)rGbu}y; zx_|%vLKly{KW&uG*<5WQ^M;-6`9^A`mw$RK&Ky};obCO1dQE00B??7zeFV$UmmEf# zpun@&sRXdAoVzwhu$&y2pX|MH^X4uVmU9(8vtf%)GBSpnv$fvg1nL%AZZo>Fa8N@- z19kHug5a@s4wHz}ewGvV$@-<20lPk7!d!(u# z-UD0_6&1CWS(qA_qW(79M;y>~U(3CPq;GD0Q(Jpub!CZQrOKth;yBDhT@~Uej!P6;Zw+nQsMLYU-nV1{kRd+h=o}*(jbix?r6lR}1onN8u=P$=RaIfT_FH)ERLyMNsw)-RA^H1fW@l}> z%0<6@`!?ELm6Uxx^ZAt(pQpLG;ful{<-x$M!F-oqwID^$%*;5sxKuYZP&+z00`_I; z6+hAD-zq#bInt5;7Hf#wX<~2x;q|o*By1?c8INV4!psbQ+5jA`pg_H2->H@>UGLwA z7nr>@MUGZVPsM5ZkRVF8Y15`B+PUvgKtgWce*5-qW|mq+gRcy*`ysVQoTw530Tj$9 z>=lsNXoA)sJ9Z3*4gsDF{HvGw+@=5BGSJVSJ$vrlzh94?EI#N92u2I+(%wi3RL1pK zE@Hj#wXFX44}`p@%)GxDw#D%XGC}*V{5U~bx&`cH zTexFrP)boo8&F&u8yfZ>JQ#$Fj=F!JD&X;Bj)8#zvO>3RZAE{gTjp^TYfg9Ua+GOM zZthW?JOdM?pwXXY4~p$NT;?ZChCA|a$HXvrj(1rA43TdjFbW*LpI{cYrU< 
z@xuAKWm9~pjR5;WDrA4)-{gwFI!|@aVhLwQI^NdT-$Wj?$zMVtcAofkHt_oO;v`3` zID%hSSJ!%^!ssT_+jXjKzZ;_uJ=HH&wW>ak;_<26+Z90Q9H89%^vBcL1}~q_J-W0L zMeDN-lclwF)|mC3;Oo~npd2c2-u3%Q{qFwJoahVtqZEp04b04Xr4G5u$gCdK*4A#z z(YZ}BlW8qA8WB=LW4JH)^jHB{WLE665M(0Z4y3qsuh>1*7R}PjmlBt}>%th^B^|A; zL$EXySUA)By{;iuQ|vOxI%4j_&s_SI#5_hra0S4kl$$o4&(S_A7r|V&3>l)sR_l#?Vc`4)tS&~LhR>y~bX&v72@r^d)Py88O9 zC?h^IgV)g0RjsXxlM;n*pfzeOanQ#%)&)dFY`b>t8u~F>dit~qZ*eiPDP*l-nLFrh zGjn&`Q{V)+p$?K-XT6UdrKWR}BJPAq!1TskA!5;OtTSe+|2^aE&@OMa!xx^=ld{R9 zlV^^1`TFhKr$Q?=M0%t$ml=Mm(*mpKHr%G@`}2BzLuo-l4>qEM=5WkW{rmYU#Nq)GuFxem7-!4bSJm16wm$pt$N1SC00?F8#Tc@ zgzVaR@7}$u`@-n}37LSyk^h)O)Sk0*9X=egSKPzt*Xwoow)MB?2T|>lfiw=&(*R>M zqW=LFtpD)gmZ1qEd%l z-?C+kgPt-zpO~8ixxfGn#(VT=@p_}s&`^VNuPBuNJ;2;!a|Q+mgB|%v&60AFtl2A7 zGVCd2**d(lt-SlbNc5XnSjdidRUo~Emdg(g4SjlMBE5dY21PZsw@7WSuC8P5Mr9rj z$VW=(;6Vrgwv7qKDAgftf`P~&0MFso%a@-r)z}hOfOS-@rZ)Zpx;`>o!|1K1sHkZ2 z?YV`pL+3rVtIJU{Gj3N_W_8f<%}9YbN#k@`nse|N{dyLcF!Z&61HF~s+RDGDm&)dX zwvSAs(|%@JM^m{nr{7s*BW-HR4)Wy}>TjFaQ}+DGL9{MZR8%j@%6k9)edPG!F~ zb}loLjx^r`A%r7Zwj$LmWv<6SY(QH)-BwLaO#&xRc2u^vw$^lX==_|@#A%E zZAGswjxDMXQUHl162;Z_yGSVF4xpdJ9ZI!H!<_?JUu=<@(-S}Jro1*fYTxJLUFrKO zL_v|AhbJ5zqnVYJJkpcb`~=}yQczAi-9f%wym;~0XtDr!47}R>Oskrrg2FH0TJ+Wx zSvzuN*u|dK#Uf1N5g!s27;ZVry<7ldoI14s8Low6rwf+s(|(OjVk) zprDP%ZQEMnvQQNxOSl+>53+b4ZA9%UC@E=I)xFPsK`GO)y`doli7$M;8M`AZDkA|G zVGKOF~NX{TMI zE7og5z9FY5GBT2|V94l%R{Q>)urCEy6BG(S3-H9o#Z{=_@wU07J?)xGOTNx~`bUFb32-8*vr*A(8|xwn2>qTq%YC0SkFJvbh3>gs~9 zTvydgl$4Zc(B`#T8ug1NgQL3<9NarRtVDPZpICNl`^et6x9O0QqrLx~I_cQO2|^KN zV$km|DlWS2tyt?HKYyN9(C-IBa_Y+R9&vGT#C}oLG;r8oL^z7z<;$0kojQp_)`N3| zYH%K%;QEyVQ4gluS+x<<15FP};}X96=(~R$NC8&0A+(?2sH?quNB zLZh-Bumd?w!}zv3z#J#~Pe5g3W8>F1Hq%WV)yet7D&`h~4F>*m2@DKG*0T7VrYIvT z+cP+*pru8BRKMhd&g~=YnykP$R`U}*MX2JhUn#^Wr8`KwElgP;1VeA#GGCsb1XK9o z!w0XKL0Vwq*6ccZ+9TVk1J8JQiGWS91ha?#Il9GAjJC2#CZKIAbp4|`wp3{Kaey>5 zS?bq7mH7GlGfDcr`08^mO*ch5N7R2&7~B(K);12``BCP1n{;C1;~)DPqt`Xn*3^d5 zq5rD?^eGC>4RQ~MtwabtljFgIB7k2THf?vp6|cW=@{8?p-?>pj!-y3}WB1IE$M8uo 
z0?h@G*yrD06@V=nGx;=KpuFuMNNTVqMu(ktF)=kFAzo8TXLmFl1RY}V(j~}ydZ1@) ztOl0=kX4GGW_rW20Ra4*`j3nVI!|9p|H7|L0?p6?)Ci#l1MN?Ya6Ihv}E6 z-XFuUB>fSx-NB5pPbA7qosX=^S+1Q zm?Brdl#7@57C!s@!r3fw@dUQD4+mnjvp5{nG3G71 zd0&Ge!gfx4cdYtELZKu!q@k=Yx8E9dyyP{#;Mtubo_|@KyN5-MzIRW? z%galH?QTfO?z21jPo8`W21r`EX8nUC3E%tvOLL!4BN8pp30DJm{Q6~#m&V0VZrm7J z5~fffwUZKr#l+;~5vP|*>89`RA5F{17!Gk+pMc*+n*9CuZzK{6ZVb_W6Y!s5)+E#~ zxjcs2?vaZF!u!E7qxOsUzbM(u-27MMnK=s zo)P4&B6<`}H41N7o<|Pfkxi2K?r|_?)uk$0l$kZmAmR8{|{eC(wQ~{yhew1blV5 zy6h`Xb%XJ1`PY@wrPS2&b!W@9vb6x5*SET_Z2{uIHt;2gdq!?yOrOR-_eE~!592nx1Z|3CiWU3`y=~+c)e~APV%IEm*ecIv1*+F!cgCir1Id%rw zT4KWdwEH8r^G(**wk(d9Ye+9H*$>pE#00vNW3Ywo!OgMqoPAk4JB_>aAnRCi} z>ITTT2ZKvq&gGjb`RYC;H8eK9`SwlJNd9g|PvX8Psgz%!jTcV9D`K3hv~@c5&ss5f9dP*XF& zs3W2oniN-dkSvTPxo<7{4;LRD6BCq_WTdfUV~eu6!7H~=Bsla2v(5J?FK{z_FLgX5 zD0mkEIt?-)v0qeFRP479s&C)Gz}=Xb*=h$aPELZWNbrH{33Xe9vQmvINKHe-J)E*^ z$7fg`^7ye?0k_)MLaQB0=~EHnPlHm_50fg6x(H?9F1oED-nEk@N|u&f=kD_!0HJs6 zSbnN#+*%YOJykpB-01vSj%WQNBjFJdD&$H}b^pV9K?CZt*^=|&gYuQt1uLB3u+mb` z%6{~*K(KrD3B6WSRK$2$2zP+6{tSX+SKNt&2kV9h1~MGjE`C-&Oi$u1IXT%>$ayCn zT@Bz56a(2tXI-@fkvCw{2-8wj5e^LuEEWPypl{ILd(Dk9CC6O?q4GzT z*(%6?94eXCQ{5;v$puPr@v>ds5t$hoSl#s2SIKETfBwi{zRZWlLso+ROs~k|{Jay0 z8Bei?%nS^5sP{uZ%TD4j>2XL12uub=AK4X}){=EgnTrE`2GThYp$|<>Acw`ud=`XI1}skA4+UeRQf35*yST$69RxA)&|EO*hczwe2oTi;L{k zQk9`!@5(Gp^@9#NcA~yqGrP9Ez2r6Z;CDh8Ls(1J&apwi3X~4XVE=>nipt7$eDST*Z3XO<3c<9h3gfK&kRL|EOxd+$pi z4^?R{Db_+^)LnT!J2O*sgm+x97`w#+k?cWVpLz2mi5z!zMqb_5P?!XSg=JFIap|90 zTR8xsp|T-E()`RTC~yw-m_r+M&W)Q@-QKcqx}~Ed6Yl~ZpuE*?8Au82388I4k|^O~ z*AW^BulN>{#i984yLSVXxqP#uE?Zb!u;<#ax~*<(-@M~tKDx*?*4DD-1Tj$eNCbNN z2SLo?sz6%Q0AZk^ICS8^*|*wQM(?48Lab6jGq*L@4kfX=ruMr^=^`Q`I~f`OHP6Mx z$G-zQQ#^Ohtgj&|AVW3@awb}u|C)0>J@yCh-R)!0A)GpO%J9V{u-{5x64(+3Fu?}DTX#d1s28Siv6-v`%MAqKkp=?Dv#b!1SkhA z!HbtK5xg&aW^F@NIp`OjO-xOhf+j;cSRel_k5qi@G$CVfH68gzhj@6l;2nPd{tdM! 
z<$R{9tb`kA+Ej4s_m1joFNvKxm5_90F%%rACOQUS@)ur{>?i|N!qFn=5E&UCr0#na z(!len!q;=JMgxIMp#QL1(tr=y8jq!y&vVxCIU7J$fs|`Gw*cpuE=-U5| zfe=V%PyjWuwd{YsigB!5xQUh73AqXMiF(@~h{Q3^3JOxtGIVwz7f{SEE%lC#sUT>f zS^dJM5+MnYh$J$%pAQ>?>rMyJO{E&)HEYn}AlVk+2m#63xzyh|)CKR$Yn zetiR+i0;N1;8cNg7m{MQ&pbQZVaG2fx#}r_mWPgI<#X$oJZ-*}hIR}R%x_YveE)CGOw7$SwXmqe`Z;@g9viNqrELsGHwi*1sC{g4VPU8(4~a<-bPdj5&E13N zh;V`v#^AsZ#V*ZWuCOtKT2WP%OHy*5zW(`%E^BNh$p`Vj zj`fu0>@?BqCx6RAJ88vXRS@7fawPO;nddZgD8^a!sVJ))&{Y*$A%{@~(Efmb|C73} z1V0CW)Y77!_^7=u2rB63&!3%~oxN=wkkoa6!cA?<6(rGBN!i*yIdQJPTfLllb~&QB z*gbSN7l;KWNL>f@WE5r6_`wi`Cr(@sX?Q4CbU~)elXdUjI^#PE1<8H0+y9SCWnur} z`?T8s%eE3#t)Qgz8)UiIIMbdzzxw;rb9opkuK|YLiH}!;7Bc0)f$Tf|=Ob2jufrBe zbauEkQD{RzTe0!+niW3o1+I#;2f(8I{~re<(drOx5WTDRv9sA)z}j)p8vxp6cTy68 z0y@eTH{J2>s*fNWzZIBmf`|idgAicgWR!>fg6 z>lG6Ui-LoL0}AU?hhDC!%d{N-dlANO4Gvo88CERjOX@$-I%3fAj~wXuJ}VAfS+V}b!lAIe^^qX>q*TM^}_@JLM{bd-4I{> z2iF1F5eOb)i0zhybpYXD>PX{`j0bw)@ygQ>1!pkuG3a&uo{u9b)z;VFjf%Pkei%Ta z^I--`Cz2kJ6KE|YFIuY+nXsd1=Ff6Po0yqBp1uMocAM?AT#CA}qa*PY?FyxAF<67& zMk-g^3||Tg3i61^^A_(XXE^`rajgj920NkMI96 z)Sa8&Aev)=tx?Z#=HR8+15Jm4N1O6jb>sJI{N9iM<$|(s22~M%(lIY_KWa^9V^!%j&1`cZE@a;z z=uEcVw<97fa8eA)JhovOK*^yTl4@pdX-QCs$9PwSUT*#O?++0$J$-#M9%|(w4ORQa z#UGLyODY%lg(ueCJGN{wgCGk=6wa~A%F3;*B6rXQQaJtnA$p(pVl~L?g2KYb5I4cK zYLU&`i;Xk`+pQcgteCmBWOQ|<4ozxEL_{457|!yiGEYZnKN`4IXf}|hpyJ$o_>iUh z>e>p+g$8#|&yFYat2)5Gd&E6>K^+CuGZ_|`ZbY@~g{XyQt*U(vQta8YX9>Ly)IWuG z1AX>eEIF}+VC&wtwoXSRZUMohov~@3q+i3e4V(6yzH$ne*4k`OS}|a4%~x(VP-5Pr z{s6C8eTWqR>b--r7As(eZrcPu>^9OK0kYBwoJgvq8oOfvdZ8)8H$1mE2Zr&sREihJ zcL|VFKVP|QP;#K6p>89^DgHl_8~lP#@Ae$DnJ5~BJ0{MbKc(@BiA`{2c!TEx z0Y@5o@LJcQlG5$oT~C-fBwQ#3rhp*eC2@k%UHN2W!s2mq;oW%)li3Tu1wrT|Wuy2d z$pJB<8i84%2W@XE6Oy6o(FNCcuTebj)X z`wtvg2OjQ{2ijHGEyTG@mF(LQ zSt`ij1q;IJ;>a;gO-(mVd(xp|HDg6xxsW}menGD61=Yg|{13g=%*5mkLN^R$h1f9Q z@qyz>Z`5~YN1LtJ@}AMrUuM| zYeL(y{2w9#pqNHk=%vprH;z_1+={3M(+hu+TxSL~;8D6M!~UkD;{li^tcEqN0OaK% zVc~lKYP)AU4o=0Bjvagv_Kr|!P>hp7D-o8Hh$+BulYlF($UwV~8Q#*&PP1vvD6(n2 
ziCP1;tPXMoGCR?0wX)6tKxISvMKLf%JRuuJ;9VF0jjN%t9Q>TV1$F>{Mv?*n_PXbf z6N(10ym`x(_nr~&`(3~wpR2mEQWPhERA`FUj4%?>htxpo(!qSyW89E!+x+GHqbF&ya>FuGVBL8la+9CUGTqq?4 z9ID^s#;vTnZI6${A}GPa*5TXFN=vPfI_Br+DSizNHU8_q`Vo~3d^qV~Al63Txf76{ zeh_p*CyNvyl(U-~C$K2;F?8RvIIbXWApZ&hGo$la=j7_znsI{y-mc%dB2STvrL(oB zR{nMW(Qn62EH5t?%_>8s50bhDHbGdJ{_54MV>OSJ;{-)SMV-ewkHfEfvrqBY!`N8* z7k+SJkunEU-e;)cILME4QwZckrOrS;gb5d>DlHU=1;pqcF}K5Hi(r{RV}BZ|7MJExcSP;;t+3iqd74ip}#u6WQcwX z7aJ;54QL6yw9q52wDhFQ%Y;ok_JwTEQ7Cxx_H8gZ>NxcR=q?g1yi1gj-iya29CLoW zawkX0!6zO2fQp!!%dYv9sPB+^$N`0$(a`E5e)_)_vaoQ2@Ih7BI`n*pkZUL@DXkqC zV6*_E`vSCRftxO1uc*i99@qu~2L8RW{lX2|EelGQM+K4sG|H6eZ_teTzZ)WVfC({$ z0x&c79mj|f$P(K@2|1_H^W#qMrSi7BI@~ctLWegAw+JIB=oPdrtDTVnuxqy8K`H@G zCALw$BI|3|esKH{;HIqg#wz!z+QYyRO~^KTPI>WLyu0`5$Qgn=02r|C0^ndkkhtYBLe7PK zvKHzDu_WTM(6!x-jaBgTI|Wz_cLmT&rB7}N}h7>`wHZ(Mx;fn63yYKY~AejIW_)OqNZx z(ElKS>pD9i>%nvPp(%!EW*)-jclNFaoP(+JU%JgW<1xe9mX`B2%Y~B+3(rvP2rKZl zv$L{wMKY=QFvxcl0YU_TWZwka%trJextFYLZAs0-$FU!lupVArnPr62s&n+A=QZec zU2{l<27;Rbvk|JYM)+(>EwggjA*70E9*Ekh_)P85rQ zBb-RU`>9bv1YHM|H9Y%{zi|Dt9dgwHe*THm4w>pX`#CtSgU2oK7}G;}4Seu`@qC7| z$(z89lvGq$)(4Do`)GQB6-Ix&j7ET-FSflvwPT;bpogJ$k_ANTT)pBh9xH}qH8z!* zqAM$FRS)86>^h_>eAG+%3|>*{mr6b08^3IT4&uLh`{Ko~;o*10uqkLx4b&J=e{nCY zo2X1?uLIT{`=LsVB+%%ZeevGy8R38A12t{1&j*hg^gH725Z6*hMn=9Hd^%XgJ#L{_ zh`Mv)>Wb7$E#Mer-oBiBYY4W!D3zW}=ghVKu+Vn3AUr-l{r_{C2E zCNNgKxVk1;<}qdpT@D>hxsyKD9{ElAkA%Pf%6coeX&7AdId_`a+P*uKT~&%ILN5+v(2}q)4ZnP5Pb$>hW(pom-##pXqI7x z7-W8Kt+xDUcBj;8W!6$hcS*|WFMMt)Upk51t29m1ENdarzvcbers z95cav<)VQBE8!CP`PC0K@x!13yZqip*B~jBw42)R4EP=t0tB9L?D_yMfpCH-fT)qg z`TiV4C7@);ty_Id<5igeHDXJDd4) zebh0%!=Pr#_M&)cq9=zx{YgaFJ99zk0D=2q+iGZPI*1?-x>?yU5|xnfVWOu7Jfa1T zp0Lel$~a{?D0J9SVqsZ&2EYs#n+tCBNeF}+E^h8HtgI=*4v-DTVZaSnbYAEu2rxtZ z5^loE>JPxHXXJir(9HafU8ho0Qwg?$KvWAiGqg4>Zd`&+*%d>v^7%WAM~+YjzDB1I zCul(<7#>JpZxV(z)7KpFw(Tl*!d5ka2ascLy>#5jZAhX4*OfHh$yg zju}pr4-99{)(T5XV!LwX3h{+MlG@vjE{2%8U=pYW454Ux@DIP{0?-=CDq1ZMd(%4y z8DQO5Nq;dkcPM2iyMxAidftHbR8Ur~?dp0&i~~PjK0ZIaD0%ncmbkbHzW}c$=81^_ 
zv$y+2nV#kz5TI2}4D_Df?--dZwQ+vzWW)yDe8Zk4U_jH~wV{xAVGnIafD+3;!2eUd z;xGVwfr;_JuW$nG2Az)#jX4xiL}*9*V;o^gS(T9q2$1aR+6N*UC3-4dp{sJ6 z>mR9wsd%$}r&qtQY8Nwm`~_usoc7NPkv%82KDltcYL-8Y0V$S<@Mt18fbmQmd~o~r z4#I$d6frliMZ*(%`!<51@>J9VwD#|y^AlK%o{BK?=sVTH`k*6}$6@&U_irct9H0n4 z^5@;?Xdq=8!d&3sz+oqm)MEq4gN|%U_S|V15FfwI^q0`tU<=Bx~ZGao}DcXZSkigp{8KLg0s*-&x{t z4$Yu@Aj8B}eWId(0?XyMbZ+YeO97`hJV=;0z~D@33vU`IygZ{SRa_OnfItsvs_N=K z!IkUxr(Xrc8W9BG8{iNUq62gS_DJFQaUV)AG&IbY9ERX@kNZLxJV5F=h-ZWpt31aI zApy--w>x7bCKau&M$yIU^^G38L?Y3pGx_j0!G);y=MVVLDI^Ft}^Q_?!q>H8gs#Biu0H zvx32a* zwuC8MfhQ4DM5oc25t93q&x}IZho3(a(f+`vVuB8w%7J$KKx$jTUo5G#vmZlRveU1?YzQIF$1si|cjgQQ zk|BN|7(WXm7;)7g0?@@}Wd$eQM*g7M&Z>Y{p0e?SZ7mV7Dy!wAyqp|q3y0ftbKG@5 zCY@HpOb9kh8pc=XF%_Q#ai*x+!2#+5qLvw*>nJb&^etUDd8o; zi;Itwt`5t%mF2`On6r0c4ijMw?fpm#RTU^i37?rWd=H?hzJ+gmVs_S|@$Ny)^DxZn z{L`Z!es-Vl(tnnR_cj3S#IGeU|F&b-Ta+J84i0ISI^RdAC{T)(ffR@UjtCfleF|2# zGvD|1`2T9&8^JOBXVclU&+Oysd;BKO&`FGa)#z<$&)$%JPWpQZzwZezpX!C_iEma5 zkMDMMF+!?jrETUu{3)~492tgkn~W^_4kU|#N4i%2|$U|jv#q!Zf= zJvQ$ZqilL8ZP@L(3a7I|>x@k%mj&!l^_6Tu^Xxi|;o z?`PO*q44cFd6^Y{d1918mL5s73hyHx12Tnz&wHVFV>}hijybXmY;{=d;IcAtH0Gaa z7`(l_CP39B=jZ49k3V#*e1oG2H;4e{Dja0e(CC`3^zlp$e&T?b5-Vh-T4@34;twpO z2hhTUFvReL6mAw=<|i9xE_TfX_`*1k9?d;lnC-A`A@x@efFopfbM(saGr2%D2d4f2<{NXPf6A7_R)vCI(RBMwZ^Qkj z0MwxqiiVFLHG(jX=vQzvnJrEWz|lJ?&+7Mf!q8a>o@{+b)0Pfc=@Y@bxlRwLfHl66 znQ2wo_W7>Tx~;{(v_Ise*C%zEu#!a;q&C= zDNOT}*?)Bk&e18P-rimVKL(-?6izlWCYcVwse~|{*j8~h;}{bQ%QR+Ai2WLJR`JO2 z$cP;S-P!ZPw z3n3qrBg{^jRIkBl5xssdQ?|3Ae(CDk(jm3#*35~qQm4t^Iq9rx`ovTF+&r||7aAHW z(UaG+2X}3Ti9-3&sW>>yi4_Ez7+i=SzO(fCo`G2yrvsc~Fx*ygNJXg7U0q$^YR~A~ z#sB+vYK`7CS}C1qL@1M0C;!EiDxyeztMo66R6UE#1|fmlX{Frv!|%FqQGh(q60cvs z-gPpXXR1fN&{72yTY9wMSJWQ(lfSR;_^6+c$vj}m8wj2rc)@~$gT2&fkKBWi3p_Ki z+oBfMfVF~Ra{0##0O5N5@8ecU{&KKw75#GtLUy3~9L_H!WQL}fc!^-^fiSX#wwAXG zi1HbFX^aG99aVYhJ=dyvBhyM!6^?aq3F=7VhTQQFpm<2~kk-}70jFR`n zuXUQ)T8Bp7!$*w6h;eg6MfDkU*_hF}G_s*@V-D^~qs{2O-TADVVKjPRB6VN79K^k` 
z3kgM|&!pYC^EJReEM+`1R4Zx%ToMu$4`_q|+ci?8HgcCMz6h8hBd*_QW{ZOfKJk4}k7`}l|vOTJ@w zCCKK17zUZ?t*eaw@=6C2MAm&TmHyMryh+Q54Fl+Dyh{sjR+c8FKgKh+mNyRcnP**M zd30<`TkG|+*M%d?f`hq=FRH1zE_?eNU0h^NNynVVi0|}Z%ev-AUlMf8%v6fpYpC{| z(v3cG%4xr_W8T~E@jH1H5-+evl?#uYVUJRWy1MtGcT^zeV~oMT5EnCwIa16^6r;fX zU>@|Xzdv9+z#NmWdbVRa=f~S^+qSKJQ7(mTa-iF=7iSvIqRB*_)Ua}vDVbu!zp=As1}|<4x#i7!812$tW!R&bf5V2uJAq#F-@-TnyBYabH42 zA=cy}(ttp#Zf=f*9eN#snutxpIbK~`?Q{p-SUds)^NSW&I2sNE{TFqdfz1xIHIG2| z&@}zxIaF!Eyu7>*n3xcr>Y^Cwu>0(3M1OCeumVT&%xGNwfe(*OWg`YPBbZ;&)j z6V34AMns>N*v=YoZ9{jn?BNGB<5hO4_Y1A;Y#r9n2oB`i>8ObPEwW*jcU!xn()suF z?J`qYmA%7BS8YGX<>np`XQ8KQs$$@N9mWvf;BfXH_r`yJ-|RSjrqGi4?d(YY@kk-- z&pA4|F0VUIPUj}jMXiCjr2>1GibS!|(iW2m8q`8(F@i zZd|PN69se10euvbX$y{qgDd8O~$ImjkB z4rJZ}t#-Hnze^haDr@mld$r?~>p}L-59j2sUZ}bdwHAFpLs2JDpto)$Hr6_!__@XL z>^w|-zzOqvz_P9_`_*hV@lYfA&5U;^L7&)nx?P54({?{^mBEp__bAV2SSJkTT{<3V zVbYe130me9PCGl5dqw**TRWNMBHKcg6U8%~B{yt1ZmK9M%B;bCvUK^=;Gd5j0qdAG zp1i$%JiF~W)ymZcdA2t-!9%~z&j}m^riP8l1!3yv;18t<6=pLoX&IxG|9NAa{(BTi zml+Z#DZd!5#@GlWZVc|%IH=J`lmrt&soq6TFM|yQgFwvn_CH=kp$rfQ6alG-7(8bK zTTD1PxU3-Pi#5ZW^QlB}QC(53dHcA4Y#HZ2vIf@h&(*;>7?HwVgn;sJJXV=MpqXf0r{W5z{CA*Z6k-=-SV(yhqo%Gv|_4@**NqlwyQuW@=Y&*AXHaYBpGqK9pya8o4rF5WC=t zOn~#Dg~}9=K@ZxfQaKRts6nQ0T3&O8B4GozS?+se3Y)%N?bmOM&yJppd zmZu{F8Jq1<63Qe_o-Eb*`v9Ew+Lzd>#bKUz4~~7ru^_@LnOuWtypd)PLkT%JXv>NE z1-1cDj}GJqu9l4Zh(gZB!(xy(CxI?Cvo#L_(#fFDA%jvxHbvw@MJa5Vtgo%*_nI)q z%q3Ckp)~r78x4pjs|U|wZIE|}*BI^MRR0^#k#-d~U8AT51(lfzi)-w0%F#3npxlUp z1n!p!IY1uj8b}Guu@HwFMmOe13qQK01}BP3D}ShR)z!%j451$EHtN+n#~JVQ`LX=* zllRJJx^&Vl&WZTxMo;lj{gHhnzmsx1t4IL2YFte}UR5vnIG&n-DSbK!Qm;Tbv-<{x zlEg<(R97G*a1OP20Ei-5hN)aSOx45H0hpWy1WZoq(hu^`29UUQKYp+x3BuqPfYDL> z^lwxY%*v3-Jute47kOm*)+jd)g*jJ-sGa_@yri~i*1d5>e?Picz~ZKkRX@WB7XUpX z&qH;=qhiQxH9+|TN4B8NEI(hm2IHD(p*KvsR&XCZdK7YoELSv{a0XkcY;2sY8vd!= z-B8xe2244K=0YqJ>y{_|JZI5NdSIII(A1r8;d)WZKBTr`!x zlkyrAnhDfh^3_^0iPH?f)qyPp0=jiSkFBhUd5yJ#6hg=J$LE&RnR8lg9a`OMYRB(5ZewV0g!(YOQKzi?9EwBj% zk*u&`4gDlrOUuaI@K{3jg>&*DqKQO3o(beWX9UZeZ+|_IBF0?En*lA3xn-A 
z4)_?e@nrf0T26l7&v0wK!UpsBt|!Ncj~JckCv>~$^B{#I3kX`gqo+K(|3BZdcCr{e zi1G0OqxLFdqd8ME*>V?4O{7&MtmSUX9X=wm=aqYFn(Z=+m`7U{OZeWSGE=Y~u}b*R zLw<)>31r-xckliq&-(ji;T8bK+j!#jZjz$l3IwztCm+>v3e2iu~;tJ!9|MBZckXc^j@ES1FnE8R#{QCL}$%%=HaJwU5 z5;>{ zvao`U2^`B-*!N)G##}$(z%6XhHI6^{!o1}@I|KUK^li2tNexZ#HTeS z4rd%1Q9>H!US*)X216gXo*OS;ijtYcmkod(rHK5I>PqkjIm zzdyqPjM2(}R{Yl_el@6gQa#)F`w11J*5?*R4))v(%ag$w?QO3qro?i~LnG-|SNUx| zzn6_y5`^UfZkPi}oc@R^GJOv_D7q?4<~>ni(&Uao`oJJk090!X3bDzEMZsqZa}?6@ zO-#nx1pNRjMh3|+=8wJ9zznyBdG;P>Sd?M&%hJle+q?u|00w~yO*MI<3?2m2RftSZ z0u9nQvL#NeQ74y-Oj;KC8p?#26ldUm>o5A4W!+3o-O|W?K3fY@jwC}NLLoExgqB|D z+JtB$y_HwN+9Dcped;)fFneAheBiJ$(*YUNm*sO${0CwX`kl0E-bzDugfe=A-w!e_qRjBR{@-PfhVnj za)vc7-TX{PUDafV1x!%9#34F>w^M%7vzCDE(_hrRgA5pAiim8 z+7^Ita0X37Tr?213=Iu$?GZ?=&0x#Q{2qmj_%E<91F2E->nKYxq&Adn}&Nj2poN&hfB=l~NmDWJp%!-ttN z0-gZz2=w69>pf;zu7Eb3XzGwRjt%#OEvukm zK&^X>R4caK4vakPGi00_xHWL&mRQJ5K!-ANa)MWuy%0Fa+2k2O(7HbXIi*UnVyX^_ z4>tdW=|3cC>zOCp~!s_N_grU@fE_#1dIbOmPT9f zh2L;PW3p?5m6cWEI0+jJOptk04`|-pu<65Zil-B71-VCDWMJOJnqy!RMU+@f{;nfa z;4oV)#P~kFZw{|4VvszK4Y>o?0?WG@DJx*s5t2kp!E8HPxNR^P!S@yg3oqQ%7h!XU ze?7%>bME9_1cc(vZ#V~Oeiovy|BSw{{Fxhss-?`CT z5Qp68Rb)qar-k1c`b9eltS!XuRBTaaoGbWOAQ&Z3k{DY|2C=bV+k|055lmTv8*X3t zP0}0broTQr0D4g#20-hU{bVK$nZX-V-sqFlaWsn@zTXGZZh;RI-IkVVL}CJKMD^cu zH_DMV$vio%Nab$Y1F{X~Eg2D9dv$jj<{E0}wzuVlt;_AeLhU7r3dXhuayF2`T$B>b ztFgiHL692}u^=VjVQ1*(c|JTmxdV$w^f&-3n~uDnzKK+2P+*Z1YSG0{GpwRyyW*JR z;6biE8kNmG&hmdIPoKC(U?!6=S;(b$Wht&`0SIH-coYYQx`^QuPv}9wZv<_C$<2>& z6%on1ecbg`kU}UCCdd;$;C`I|%}jZ=g%4H)CJ=XND0wp@9k-#YQ*GVz8s-!-j*7%< z43(NZAqIB>3#1%Av-$BtfK1yBOY?j}{1W#j2rTNYTg`FO(62#@Spbq1g&7BMCk>4^ z&;pX5G>#lO3XnetIqm3}l%JRgA@|Fd7-R*;*#nhAcvfH|9C<_qLpX?ac$u3Mpp%M> zL9}?7i(5goVh-sX+5{q_;Q*Q;;(-&0m~mxk>?DLb{ojxa{$QJ-eFfv0AQk-Sx@lhO z){)vC9wn-roI7CeA$bIB6?e>&*B@aEIMHp72dnXZ`t*s|qA;lh^R^n^8BeG>0NxBW z4|S9rB@76HS%D+HDfHCQkzERI!lU0ahnQyBwYXzST3!JJtsQZ1?qt5t`>7$Q>Q~OE zo|Xdl+A(1c?Ao9An7c*9?cB9X^Vilag-_rG@!S-b%3v>T?cJDe&A>%qYy!r$ 
zH^dGIfZ+w&5V1>Ki9iZwK&OBy0DSDHPL*fvP?V7g-%kzt3R8DU`EBHhkr1y!BKL~h z+Pwjaqr-(^7T`C26$}P?qg$Aft*N<=!INDMIrN(#7?6e?mt+o2CoMD66%Xj3z#&(~ z_zm!HKSsTX$pIGVP^5j!53$6eMrK+u9bh^>TLpjQrcIjw(58-J$(ivu5(r=P|1*IK zIA(ZOpx~V_It`(j6HL$HC4P*@875 z@8=z@i_F=7<}%;^$J2R$^}PQ7-yRvEfvi+gkx0qNC>3p*G9sErqDWC@QiK+gA}LZ4 zNuoqF6dB20(Xc~EH2#nKe1F&fT-UkIG5U=6`+nc|>-Ai(+JH{2e5n^{qF5OZ$~Ex8(PS?53#%L6tpgv4 ziaz>b{*+BUwp|1I71!@=kH6|qpF9zn4OHWa6+ai8@cQv}BkE(f+GcaKfY-V7s`-ym2H`Z}IU7Y&L|;An*r0wZ6Qx=Tku z^6)l2kSolc7Oah}1#xTKO2}MEHY&WJpf93ZNl%Y(`<8#(j$%qLVGiMk>g(|=0>H5f zDjyvKd5dRPJ`aK}VybxT#2y1@%f7geQE-bl;&soo``7Mv)zkrzu%{gUl-Gbl9 z+Gz^1vX1=E^Er28ueJ)ib6Hu1m?`Hh7;*Qj|Db&P@QB@3DVr4U{ziI1TP6kH%Uu#l zGMpM>AttkwgyJ2|YTAt8Uxq^48ylPYE-L2Gq5b$guy@dEPzJ7gxfQ$!cy9i?MnhJL z7wS9E-&cP_G&Zz-JHDvxd$97$39H{KBi|CT^R+?x*Xezl2z|=R8rRl&=FD#m+a2@d z<=eU5eWbO>!uVGH*Th-h-fT4*{2^c_#U0fr#O{ys)$wRrDc-(}K5M2tlQs@f2DOin zqPqNNNB;X@MKhsEMA#gmTs=N1K|^dwLO(^Hgtb5rE`IFdqJLpOWJT5mMT-1b>hJBG z_ZW_?bh z5Qz~&YA%wU$Y{eHPB0xbVF4R8B3Ad1N}XGZP*@xh`{UzVEgEAZBe#RaIo{xB!4|Aq zC&;#W6tfjVIn6dT-B>jt+GGT<$i_Nu^V>TJYbaNn?q1xGpq++@Qe?Iwfc1hSq&^!v zVL~#CoDHTXEU7+kbk8H@@`_1BWYV7| zWAsTiU>PV^wv(VVp5l<_^9gYr@ZB7N`S*P~rPx%goHTKs)T%}2JJsI4f8kL0Y_0Ih znZegywZ>@XBI{PVmx&g)6sRh|%c=`(!I$Ky;)(1VlM6S|@mOJz^!hM~h zuWIE3-DMCfJO5#G{{_Nh{q9|Z)z;|7fqH$oZpe?L(Dw*|DGDc^RUyUTo7DB#M%c$* z>I2z%@xP~`i?u)*amla26`S~m9CfQFYg24YFdabj8EURivbx1zo#nQY^uuupY4Yen!ro(uGc1jM5zq>xitEvO>6e1Q>B*Nh{+u3`_;nlI! 
zntT!o_B1M-5k|;@fJivRqy`O2{i1!E9A*oPK##4yuRgoZtv(vO!`)`Ry1HJO8ffes5~j9ej&g8H$J9MT3#^>HxADc8_QDMUQ}s3-dXd**i6V0 z0!PgI>6CW%Y@6MgaGgSK%zKa{`hYFTW2y<&6Rieo?_JVSUm9*jhRg(fUKo9jF-uzhJ{ z+47xyRz^ofMIC&mreUSf(nlKhF-gkKTY%nIZ^YVIBR6>8np#rx;BkQyB z8cl{sg2Ahs6rjP`B5XZypjcw|p}S$v68Ja;^QVbve%qE%=!w8bXwXO!7})p+>Tbw* zG|-#?D=v%J0=Z;_O_ouefS%mz`XPhUj;ExxAt3i&Do9}%t$=70@z8Dq5D-loWvNio z)aSQ3&Qkd2LTOI?O$lvA3XfJh{>Dip<$P7^^!eHrN2hUZd!ogo7QljN@+-IYSmB<} zD<=lX?d*B~;py@isp31a6|l65wy9nAM^*1hT&4o7-Ej%MU=$}DkFW6HUTyum0(;|; zvyU3h6VFOcn|4^bs=L%Nr{+Kz!HfP6tKs%tGTu0Ek8<`uC1+I6Fe+Xq_H(ART-L0 zz$Ea@&Hyy6vk0R@15F6|Y<3y6)uPsKC%VEIGv(9McjwANvcE+c$aC>AF4cGPlquGj z!GwPnoU2_}ItnNeRR_c)4^xBRmU_Vwr3r^kVr|*8_M5iQ2H@xw>JPLR3DCcAT9ldd zAI|)D?0aKl9WSoC;afW@XBu7DxCa+k2)scgC{39%2y|}gvSlH=cfU(rI=JSN&xd~O z?6@_y1D9*A|2z-uW!)o!6_)XP&mMzR(=G{q7pi+Jpcvs}XjX-r>5F$yE&uTdLew%= znG?`;p?nj(1MdB$EFL;-*N!wT#7cjRyKQ8w^zGKjdnr=R z&ewMJJsh82C?EMPkA&Vsz;?okGjw|3Z9W1q2PeTmAEqxj2#zY+eozKDE`W7%scatJ z2vp_b2{i`KkTBtY`G;|q#DEEl!%ib|XRB#*m~D1q1F4v1K#hXmKq1WcpCZ^YTHrYZ z0`Q-k@J{4;jX+<-$u$S!V{ZHT)zLz%iy~TN9I+*T>C|{dr1Y>>g^qz@@DkfSJ#ivS zjJ8JHrT{LaMs*DcwegS$hXp z@%f+0MsT^`@qa>U&d~w(55;}|!=%M8o?ue|OB84TjTJhyrh|b-1cQv~$00yJ16nSH z)Pbjb-Ny$vZrlhjs}Z8R%jT3?z=4a6j~Y~0sy?)S#%&j|*8~ebOdciD5wlj<15^nY ziXd^EzPJbF*SIwd*8HF1s|ai1KukCYJ?O|*EXbi*1gBIIy{8COWW$GIE2yZ*JayN4 znx>|_n#8tkS}EDe$`Yg1SN9w6M+QLZeNF$x*LGVqL2L*VV9JoptJ5|$pG$5$Jx}@f zYk+UOQ6mZo*D0sB2(f5T`z!+9Q?4jJA*X zB&tgRfstbe>;tHZ(RI#|OMO)L0~Y;X3OMQo&IRF2q?r`m0k{vF%DeP0uMrC#J0>5e z;u)VV;QO6D2X5mv*_t-}xHXcy<>cXUi_4gGF%+%>SY6ieU8^1&il`G<%WT#*D+gH_ zVA95Ma?0AynV|$Enu+}d8R0dsbgGOMifO`uOPnWVUxxFoek{Ub=#tlI>3F`Y4dqRv zJLu?yt?~bKz(`$7y%(gw=*Y6FA1-$;A2{&lL&~y}S8&Fn89%#tI%2;R`?EuN)`S#p zvd48IjzDug7LSS+=3sL+PH@l1p(d3u9b_3<9GR#G+Su+1vEW$c9yVJj81hQrZ^BCwU6EguZGvYT+zAx7{`Yv+S{j}9{1idZn9yJ#N!QrFGw}iZjF?J zerZ-X)Gy?H+ErDmWu&T`uQn!GslrdP7Ueib<^rgG*^Y7JsE`#WnaDt;la)EIFxyRke4x`xy3%%QT71=1Q 
zAs)QmyfO$2WeS_yI9Nn{ut;70$*-t*)Al8+)gc(uIN(45L9={<-Q~;sTEU_576$=#%Lu3q0Tl)jtUJW?^B?omoXkr~b>A`dHPzrffyehu^#=dbK;An6PG9 z*p0)H4Q=b<;>4&9W z1ZZLcAkIbYSW}BVK6AbZGMgUafYVs)imtr#<)*oaUSHlR6_oSR*Cv2o^Rj8t?V6Lqk=PwuEFlS5PNEB@9b?$fHT zsV!)2xpyI@cAaB-JsL-c+<8as(vR7WQc-?;LBY_#Nm~f>43keZUPkADeOrLi6obWJ zM(m$JHNs_uvV@~g`qx;)SK|;W1%*KR2 z<c4675AhM7b;`Yuvl=Y6#BjcXYv9?=?ihbh!pkP%V#dwbh*Iz_d>U@&= zN`>)Z%^CqR0!VW}We{UStzRE-*xvbs@_hdvCzA71A|s7f8Y|p4>$cTsxXiKTw)RwL znl3YPDULLRJ{Wv+xtj#L_4gk?#DEZCzod4}yRa%ujFtfy5?S@^3kd&2Xlr?SZ?KV7 zKj$ihyGO6BSvzUPqqGEmfHF*T(;qL~&gPhqNB~kJoyD)SfIayx0yTw>r!nj(4Dd8T z^f!jomeZe3cKE2E;`wR`ssc2%I)AC%(x-ZfW! z&DUQ{mN9#L#;a}Gkx3noZtiV=RzGX`{{AzjfUtuO6cc}YF3PexKmksWbv|~@TZ2_5 z2WfuMrU+P^X2^WjN8fPC!$1$?5w8G7s4loTnn(_TG4gY)(OpC@T)Wl|=$r2# z0$7i3`q3TWubr0mI)d)WuxB5AvQh>wF!#g=I*>sh12!zU3n)4ZuMVq@b#4m`bGJ&Le|Xo2s{x1oA5HPq`q0o0 z#*|#maq#2Eho0G0G6*vwyzSn+ynw6I&QUsxgjZL<#1Z;c^9&5C?~q?t$~B>|7dk)= z9^pgfu{81W%CN{I3*7+Fuv-UAQv`wm)sk)XJlo-fbAnZh&=2tmA6QJR9v*oMzkuCb z$+^`XhqDC5CITz4$z>jId5u_8mb%O-orqvg6No$j^iLOJqD+!RKg4;AK(_U0)K@SX z@tpX4+)@^$IunEWkE~aHRR;_SwpEM9ZB+*PBq&4lQh%D75Rh)!zWqkm5l6xgbou`M zaFpN7ndZG#*hD4w)>SpGD3jK@U&+4QzsH(W3N@_(b5gWg0t{aD=%n|C{fd)n%F!i8 zj^i6yb)k{k3JSxz`N_#blPPj~(9R+}K#Iz^gY(zoKOl8I=~MFHK?EJlZjbyC=+etG zD0%`hkxD9W8E&Nu(wJblTbBA;aq*o~Od)~(tdLm1vmeb082)+!$_5dJM$N+kD^B`V z{;P31fI8i*thBw-LM!3kwtCg7J5)dt<^hdFOdxt_dX-HUai~ma?vYKsSFnYiqFaH7 z5h^oom8kcSxqNGEEJlb8gn_uBA*E}+<-#k;Yvwl?RP5Pv%*SJZPVd7yk8=-3L==Fm zt~=akbB2-ezO*enL1<9(wm7Gj(z}Q{LkX1_s?u! zNJtgmA7ZvQ%a(EsocahfsUWQg8N>EZ=l}@}Jrora`#)MAk~`IJg|b})ba9i9Kjw10RexhBh&SsyF? 
z(?F)yhon!ta*uRaG0%nh63J7MxUmO&K~4)v6Kw(yh7gxf@^K=*GR^sM+sysp?Cj2F zyn0RNbsyawXBiqMFq1{1e2em|zfZ@TFHX;o9h<-3^q`hzjm84WH)r#;?s!gLGNS0X zJOU+(PwWEU^|Rjbk|I&jv9;CEU|J_?maLSuf)J+uMXjW48NY}APY43&A|Z8vN1>sF zU-7ZW=HZi1592+;Z@4qG@F9jJ7b-;eMJc~ zcJs7q!*6X++7#f;wHvLYmDYMgE4H@Etvjv%_s8;(u%Oq0+YNU+4i39>C#W*D>hHR+ z8}qk;%L%DDuP-KPfl^xVzH7X_1yGDxP;hLhdrskfqbCmBc>a4^1iX%r3sBGPxZdOM z_?%0Z-v1mg)#QZ%`wfUNvX?0I_LNgRALd|zZf+u-=%slVRQonf zr#gYxJ;X~fTIvopyioZFS{0G3*2*XQVQtd;gqSJgLwJ){p5`CVQFe}g8v_|JM4Nuz zpT(IyE7oxUa$|rQUZ4wo*GUem`WJS72L(I~>?7KeH4$&84kx*TuEv`SNquXG&<;D0 zNK+?|nM{3BrR4GF@xet9?6^de6D1p};bgXOe@|r**Zs%t>^9i+`ga>5b=HD z)WoVK%+O+=B9UKchp~i+juEgBnB|1khpr82)+*BCcC*-+E?^+KNAg@~;ws|v(Ow9m zL-hYt=m5r{d++?5CW3Z26=~e5Y`$*K>HW&>|3W5(vYK-QC{`0u3xG^}_rGUhTn=Bj zpoKc{8Z4YB+vZAMnZW^$=yF39B1(+3LMo4eZr;NiGEE1od&I#&zy}q91B+=c!d1<8 zR79o}Fh&?B1Sm~!C^Q?w z*3BR0XYBoI?$vy$XylK^$GH~9TOXybtJtcy>=em!mZa1i=8c6xpXimAi}{HMBO^QJ zH+P7e?BN{NV`lnJ^H{A9msacF(opJjB(81o=h!1XEsrj@+R@fDr>u0F;w-J+pH3L> zKm7OYmjfD-n_E+w+Tx~w5$g{7U}_n>tDCvR+8a9yPk;ZySLD!eBOHaCi8mMvnIM%z z3nB9HV19^UBDgOIvzh4n;UAD~ky~rQmuIcJqA271ymBg7(W-&HK|L<4(DzR4X(hRUN4SsuxqAC>(mJ0qV~4V%82*-rPuTZpL66JyWFo2? 
zM7rVBhf5h%*GZ)Fu#TSnJ7$vf;F%g7O)a;Q7N$Rb-f4f|D+)t1r-je*+E@LBTaER{ z%W5zlD|a|hzoEbXWRdR#_I4aVmb$|ms0>Beb)FEkH%oYNp(^4MV4=E%pryyB8M{My zhK=?OHHV1sDU}i8J%9_C+r98$mA6b)kkQ1ey%#Pm#J6mOUEY8Yi{xFDIU)g$=JgPB zfG9rS0lWp&Je|OyzyolryxizNH#bx7VqLMyOyxeCv+&!8_45Ekx-fp)>fPhru9ZqO zH*4uW^y zs!5vLrQBP2*z}!ArrNz$9<|=tNq&sQ`0~HMu0QBEdbCzb{ulBbmudHQcU)e1 zKwa*n*4YgMl2aCM$(tp$`s zxx?OSbk1wuySHxz#RVr$wr%**Py+?M|1$e7+*i(;pB{D**VCt6OUToVg!X0a7zAhS zLWCkrmjIu#Nmj=AI$_QJ=R*w@!8hPC!w;7AxL`{dA)=>;&(xrzvCaHSf39Y@RlXY; zHIkN=clD#^7@9CzvJ1IP`Ux}obmF;Y7hBN52jv^uxp7U;jeMBGLBol)Rd!0S_CBmD z{hB*0z&<0UUzmTmJLNYg3f2qW_1Wt`4P_8}++pFUJ{t*m$CrU`|Bnv5UQ+TtPra+k znD-OY=n_A_%8R8F6j5GZ{!!Ey;a*|pU_Z`j&SIgmf&GM5`WU@p-l9c|n%?f#5F?71 zuQZb839RP_M!6+G>SR3d?Vf2FfO7DYPuM#Wo>T4O0Ee|{EAYQkP*RSuQ7b6|jk)2Q z4$2-I=tTGkP5xH0lLihPyG~8|!GM8VM%in1y}WYZ!rX3kFn^wIR-m5rE0E zqMjoDZdC(1Qd6ifB;6`Lu^b*e$4ghOYB_u6yZs0?HHncUPi9}14A&taTmcpz{H~MU z2Dg74t|GHeXuC+qW7R?0)RC21Km$RO`TpUgoXeM&XGuy3!WPj4NsEP1H`+=#DKY5Y zq}#;^fDDRlJL>n-=*Q<1l!UyfdTrB`mu_6%Kg|tyMVLN|c{r&`bfsbt3m6&tBI0KZ z793QMyagUC=<6U)9o~dSu%GSSAw0dOxHWWor_*IN+p`ObAbufyI;`bU zKsSoyF^N{E9Uwo*QDT;@NN3E-ssTFIc#%wduBzHSXJPC7XA|ln7d>=;}Bz6xG z@kj5DsK*?{?AGnuCx~5w(h$MUkVfU~G~d2`dxLxS1bdl~VWX=h9_I$?St0p?WfM&% zJ6mXecn7313aY9frce1f`DppSDpFjQzeXF1LMKRoqK?Z+!GT4~d$e!#>SRFW?YOo% z&`xS!kQjd{%ETq%`5h`aO3b#83CT7E=?m_i+*dTxWcIzEi`7s0)FT|DsZDy)(uet) zk*;VLJY=EHHxJRG+`Z+MT614q&@z91!r*FAgM z@=w|*44$^^+17`et}()bdt%h>#h5A5=J8yB_)&U&>GA7;n1@}`weL2d-*jR z6yi%Z4caU*&COZ!scHR`phfQhhC#U#OP1%b2NPX$wfr?7EGz#xWvD1=p@RgwfqKt{ zM*F}(|cxUjUggLQJEgB9Qq2nkvcj-1{+E2t_{=uEU0rXta!FK@}Zp1V-Vncv~yqnpiIu1fHX0% zYSpQ>b-Q-miwd~7oLtb16$gv#)l{s1^MmyhY}eL1dim@qc%bp)h;@wFyL;yGJ+Wuy z%QxcdZ`mBSB7Xne<)@WTtaLxMrn&XcOsU~C#s=ftjBTQn<>u$N?k=`DbHuYuE^B^f z`Zm{J>goxzB*WyqVl?laiBoQA^fNO%wtDfz4RfqB|K5GIIwgg~tUW_gLrRmA5`2G; zc8yeaIvDZ8!YvmhFh*zfP^vc3tDznks=MJDv|*39fXz>}zrxpwu#`u|!-#H&CvPq< zKUk7q403{aVn1pLi;)9_;8P^v!YMc5mZ5s!Mr|j8DL12Y#MZ6XO$VJIqo671!@^;5 zG#S_K-~XOv*aF8Z=EA%O!(P=IQ)Q*^0iKClG4SNST#_8Q^e$BeL7bDwgv!uOh-5Z? 
zdP7y6#Fc%fO8p=Tw{6zDTdP1W&$%VnI|z+4FdUr84m_TL!<8?IZcH>)Q6CJX@j`1{N3N z(b%q+AJl;KM}x`)zAec1Sv6w36H3+e*1hP=X;(TNL!c?TM>JjrKi<{W8u8E>tD9xl z%G^6w&Dv8zqf09$Mr^>?;V_y(pTKYCuFr!V62T{cQ;y4b=-y>x#gTs$k&yJJVO zu(`!VQ}9Ivg}tg2S(@iu>RC_a8hu9lU*8>;5g5vL&U`VdD=48+3>sRoB-1 z_t)Y5Yuj4oMqaenxU;6Z<^8*N+8=VJZrpbAWYLEJR1r(+EOzaActSFJWz6OwAqJ7V z7iFDRp8V;nHURWpM{CP^>*3~sy6g%_*9z-!er^+RzMq!v15DpDbx>pdY5N^St z!-qxk0_7cJcpf{vChsY2^F~8nM{qGHVZp=aK*>{T!Hhc5Jx-iB@eKoxIL6^6M67aV zCUl1)zhJ%3$kkuWHxqY!F>x;4m0bB)k=z7G}7t^lCg#PfkU8MkFK(scZyiLH#+t$}$ZHAU2}UI{-n$F3 zb@tgx*LG7iOqzyg{ZljX8EnV+Pa<`$)A957p>ezcNvM&;h++yNNOJ+EP!hj?o?X9U zQW}#q{x4@w+fj@TmhVO@KNlqkQgIW?M0O-@l@MrSjB9#iaqVirn=k3zXLGC{soZwc z8>t$0lX=tXJJp_Bo8+pfK0o*Ab!HBR?24aqGUSBp&wJgvZ`HNcpE~u5Ghu`Je|;}E z;JsSA{yWZ%+cVJm+oZ8$x2(RpRc4fy$xT;R!(_RY%a(0NoB_GcpjM)QLw|=*3^_tm{q{fCM8qblm80nj;j;xPgL#EP=3!LzBHD-r z39`RBEB1P9HQ*BAgXh^?Nyk35IDZ<2@G>&3sR&VIiX`Lc9*3)CRljrEin5GkPe3qe z;=z=9NY&6xh*6**{^x;3`4KT@2YhDr=;v0Zck4)k0=jdsefiAC9HUON)HH%m8XIHjUVYfRJR0PU@tT{(lYVm3UTBrO z`(IWLeP5OE{Y*pPSVzaZ0f}AO&RPfj?ituTH#1#MTVnkw_0daib~A5y+CRx=^WT)B z;l4hb@|Jn6eK0ZS@86~cFPaBQ#MY?Wnov624A3JHLCE_4S{I;N z;eY|%DTLDHpr~Nd_29*gt>4>=L@=NQM4~Ea;YCSCx>O%DU#x>Qo9A{3Eg3X|WMh1| z)tRLBgpVXZI5e6l9JxfF{o_O$TaU z(!{bs2;~fb7pqfLx~(n$*q-S!ccMQ-HcC#bFr1@y!+Q11Vx)rM%&Qp(=&}T0hbxb4 zYNmsygdInCB_{E)gtsRqzFYCw{OQxDVxFl;6=fN-%?Kj~&j}lDw`;H|f;$zTPM|E{ zkWo!;59@n{A3Bxd(6T@}eGZMh#NIySdM#I5woH5OZ5`WB-|LRfxpJ{+d%5@TJz?6f zel6bY5w&h4+s}|5R=YZwn^5Y?jl7&O`;yu3w>As!P2Ad++_Kf+$B&W^v2PcB{WRi& z=gY&0<-~~B^8aN<0oE5jeTvrf5Ht&gkzg-H7O6;?Dlyzk1p<~Y#>$s`COH3j`Ve|m+wB@u3x_?zvHu14NVVTypUwN zz~ea^$wK0`V?2!SgfdVwvV3D^`?dPV_uCvT6d)G zi1+Ked8{(`h~UtoKN!_;bGlGi?lV|=oPsKX2~aE7o@Z@{WK)@!x-n~y=8;bd>BsWPjAG{ilRnjUDGRW_?U81t!3jTRw=s1+$TT!J8rn1Hf;r1pa}X9H9hpG zb6aZ*o8RF*R!~;Xq~lQv2*)mE*EKrsKXgH(F7LL~DpcCE{rWXFHL)g$krswe-ZdP= zQ3(IovxdFjfo&03u*lgio9(}$WxcPj&_*Wzl5D6dxh-9FIB8t#BHGlz?A_OWe56B{ z&6nFVXk>HkXwXKRIdiT7bCyo2&sDslRVqrL{QS06 
zHLEZeGFz|-FxNbP)TaUVDK^*xx6!|rzewIzNFM>*E`oZfWaE$>v#@Gb1~Fd6*Izlk^mviMx8sPj0AY;rUL%K|$ZS@QL@` zg(Zl8#CJvY+!nuR%j)7g+cp|Mk1F1tU^g?>uGZGhGgwJfg0k5^O(MLy+o=VaE{{~+ zAFiO4>xhk69?NSs>E|io+Nr|?C z!_-&(EP5d8-{_|_3GhTq({N9pKY6kk!RJ+7(pO8a z7p&NX=jAV7wuE~%Y_=L3J9*rIo)nGun5|S{zIM%;1T=l1(e^$I$_B?-L=95!ZX6+) zA*%4?i3_AcLKHlI)jqk^ylJlQgU!QD}D){FkxcdWcqqYH1{>i*@9i`7u6 zGmhGp85^g-go-AUN5TJlbn)KqBYxIu%1>`mD!nFM`1k9x&i{UgoG?6apx{@wUHYKs zIxD+Pn^ti8wAJa4LMZ`%8bB2@N zhUvW8zv5)6t_@~DBTo3+5qC>UbVFE&>~sdCgCipY@yTe$7umfUW8P zn{-9<`d_oUN{)#S$hIDNXna>?3+d4J2h4cHVavaTx;4LB=NJk?8x_WCV(^Htn=o z4&JF9`U%Xky>dJjxsamwVrlDnq?-sqD}LLf9x)atZ!8%*mSDcjRm?6Z&b&Kq!To?s zj9@T`Pn=RSRHU2}fg=Vl-2C!r#^VGN$-`fS^g!*Y@G3s_fTq{Ed^AqTBh-+kK7c2to@6w zw=deKarH&1L~d^Bg?vq!8zlb%q1}M`2l6>obk5@BJ7{jAKd>$CBXqa)O48{u%YOP# z<@rJ$oqb#*^0s2AQDPM(g8*NV^1UM&&qVmW%BU&W7=+D`eCAolb@d&@KsrsTugJ&S z=veA=8+yY+6_k~kTKG3~3y+R21e60oOyUr6wOh0O`V0)+LQBt655#L%njsNqQOw*! zs@hQ9ug;+&+!*2mVjsTgrA?gzA#q=~1sjNL6~(up=|#z9wn%pT z_@LpL)+gV763`a1*#T4iwJ4?v_0I;k{2Q(ro!wG5{zZ+d>6ZS7TjHnqfBj09mv zl_f}An1fOZGZ2y*)-G!qGwod@EJBFG6tA@RuG4*}O<`r+um!u8tzxtRYoNXJ*809e z9KiSJhNuRqi^$<7{wJ9>L40FQX8?U6)TP=@0;%BIT%d!|_JOu%91PDBOBDWLecuzU4T}ah!eKK9*=Xegfak}Ee zM8@*6+bMuf=jQ6RLzRj=p(d!m<1t>ZQFoc+ym=*Ut&P*^1B$^Ucc5VSQdydMY96(%!1-Jw*`%|e`A-oAhl9VNV}IfOOiY^byr2n!COX{zPTY; zXteGVQcFe`de6c=^MpiOlDvjcObI*=qAP+pE?{es6oWmVg(PTnf)5uUu@=L%OVh(} zKPWIG=|OjU&FEp2grxnG{In4z193l^#r5>V>!xh$bk$!^q*3im6K{&!lgXBQ0c*SY9|2ZG5G zKM$GyfbbOf1mP^B(JS!Um!+@4m*P)0knw=L<_2{g59ts1DYCvra4sDumeXDsw$LlF zjx-K@{-AfyeaYRSRd+U~HZyX^gtZUipFelrlt}}x{&~A|(xdV-8(y5=bK4wcMhui} z6;lmJn1hcJrgcoq-axT*!j_<`pwgHUgAiDFVWU{P>K&b~*?|rNV>Tk6K1gvQ8a{aP ztv~{l|J5wxLPEe1R3a^u2eLsGy^Jav7ZBK}<9%}t@7C6TtHr-U0}ZkVVNsoOGXC^? 
z*IT;sbOJ)%VDISHnTHcnYyT=pZ@{k%UDP%seo{V|KzkI|L3I81p9trI11IViBC`MGVtpiyd{Ob(rf=v9F zRf^Ct@C&h0Sx$@@Hl`^AB>_C{sF2JXTDC|X$0?pksu&bMkt z&2VxWPQxplo!FdNeC`iU5c=x_JAf8PzL@oJGb52H9aIG@ZuFBstM2N^5K`i z2>QY-`-lBhY?fWNSH)iq(exe3o5xw$pQ$sWswBl9tFe%-a?GOl)Vig{T?TPDh_g#66(4+6ZLQt$GzH^+Je}@Y&juL{i!s@KT|TNbHq5t6 z$M-%8apie|J9~9`k|Q-{Q9qZN!9BOO?|AKj)l5&%uN})&U2A2+jBfDi%yQ{1-1i{L z+L_nb)PZJQLiJt}8#HyQ^W?a_+qQL>hFA(;ib#Lw3>LFBXP&G`s9X-}th{w08S<-E znbXmP6yGT?uca`gY>B!QXE!Q$V_e+s1@^!Er$T!tLl7^$o-ngd|Hc{p?UUc~b&pKn z98XTr+O|F{gG}m8rjqTpOGO|MSVi%OL||Z;(RQm8t?t#!vtYCW9*>Jhyvd<%_O(f} zKo|xc^DaBA?HmKgWpsmW1mTK=QR2B5<)e)Iae8;|A)7%~J14kM9*F2D3QzZ}`QzI^ z<*~_gwB$((DTYE3kY-J5?kfQUG8_Ef1rKW>sv+=%+U(&>d!u+Q9P(!1+d1*pCo94a z9u&?NPTb)zlDzK^D6kjV;&yP4ps+tATzt7?>^Gt&$0NhWhBRhWYV;8}OkQPWsf@KF zrM}1u7svYi`Es;*Z?hgdj`!=%N+h@hIgGU_1~{+Qu|z&FJRdLQ~spt zGH`l?H;lSV3~7+)(WBn@TUJKK5~$APL!oq4=NKwZD3(ugz*pC~qBo;T@y$9hFzjA( zuNR&jbf-^0N~sevv0z(=KTU3b{=DmY`I4qgFk36TRq{bkKZeEL!E?W6-8unzqg}%U zcxH;{=In7a>+Amh{<1)MRgYf1qBA>A?;Dd_f&#GC zrr`lYG1!G9o|FF$*n$JPKf91}Yum33X7>&sGp1Z?f{U6C1-3|7fjc_*FK5W$!6GG} zFOqcZ*cKFg4OCtS=Bb`IG470As>lmQm!#HswdBw;Es4hqG1^2;F;cu)ynA->mUize zO(#a4S*Ut?uHm^zV4lAKWCTjGne>i4W8CBJY2v187srV4Via$DC9aD zFJkmv$o#n083CG+@}Ub?m|EX^0VJPpx6xABk<*SU29wS;mMJJbZPqMDN0Iy_fXuN2 z?k&$s{tbN<<*>zfzTD?mi(Xuo5R}0GQmBI4TBYn?JkB;NAJyROga8HopV`-+90_1m zePm>Uh*pvdD{}5zcK7Z`28Axn{NSk= z&+0_lB2gS=s40vG6;!l-qf4W9@hxF1IP6eBh48W1mffw*wwv}vzC8!{x6+3_Nv694~0rF)DmAd!Eu=bF_p zbo5i_etpyYU&wnzjZLSW?D9X)x|sJyO-%IJ z36Ga7*8iIYUGuE`yci87;-0uuksI?D%6|NQ0(d|qr~LY8TX9~&LE!sz1rh(t8{OFQVJ{M<^H<-0^*?d1&I?_fs@-<#kQ+|8zK?X&4k$e|0;FqyF&=)+H#Mwv&;qP`K-v z=Qz)Tn|S*El=^O?D7e0T&iwYH?ArrVuPHY!2X%gUs=eYY#~X!7)%UWmI3;I1!~Thf z3s;P(jm^U$zHD0W*kD+duPR2z1Lp(RyVCI=nY!_S@FEc2AP9)e%<(Xm1A2CWTAabD zMF|Grmu%3uW8-iQ%NMx|)_n%56UD&ui|O;`q{@Wtk_UQHB`&ynb=j+r(pYc1c?J_}u#E zp{3C3T-39(o}P<+;$M?o3t`By6VzDn95Uz7U%MCB|2p zr`U+5A2_~AG7V8GD34JoaI709#`6Nox10Tbu!9bfqD^`38ex^)QM>DOeqVlG# 
zwZ5&|U~tm0j5#V%gLJ4+-c$`=s{gk!cxNxeQ&%iTN}ETU@A!Bf$InKIyBeQdvEO84v^RHdfB#0$gqt6I_V1U!_~ri0+{4Q&w+)$cb5a+{ zovPM*2J8M=vSxT=s7aDA(hmqT5?AHIe06-I&LC&t2BJBESjC{z+bFRw!%<#stk0X} zdV7SjvKX$w@S3tC45GlpWZMcd5t`>hX>T-&vWQc-;3uBIy9fVCd_j}PG){h9q zBu~ctd?*hX-AZBhwi~+$j>u&&tig2$wmNV`4N?AznnX_-9( z+8;Qep!%SASdU4CkNa=h^d{PV$L%$l^K0LC8@5TlI{flF!v{|ve0#e)BQS6OtiUG2 zOKp=U+I(K8@_PsI86wYOJVS#7WsXDgw)7)J{14I`#=l~!@4rxEZuL@lcxC%RIIhR* zC1bP{5lB4g7-&j=7RJVOFVr_v3Te20UCZNMyI20E27N8#)JCOgHexjtBnlHq!hzoD z2hD3F+fnJ5!cd|-1hD#)aPTGEfKd07WdcW(asK@2_ohGh{~Ohsx9*W;glF0nzd?D9 zxy_{aY%{o@R`vUf!o?eR%y0TVJgYJKXjRo)TPF`ySB+GgzOM!i9Jq8*%ipE>^Rup% znQl^1HtREne(`NCh2NfmOC7d%sm+Yb^Vh2@R~QFwHgVr|`K8~|(-Zu>zx%IUFKd!+ z~Zi+=Ewi7aXOJzYXBvRTgD{l zq7GFgq<|*LbF+?*ewcVO?y73zw)-lhZ3j*07CYNn>Bw}AtNkOK-6Ar}BB3&fe_IUM z%uLdBhbKF!H5cax3cDv6PY}kc_OdE&O5J~dk$T-0A{F%DQ221?-7WvBUiupDdH-@P zb2PoqdYIa%4ZkvYVmt3T37x9Y+KbkcS#nEpZZxyxgmsHhrgq#qkLMTFIiWWmiw=J~SiW&`J-O*#E?_?^Sd z70D^gcX4*kAjw(G4;raZ>)93@G_A!Z?TSo~9tnqD zee7lRH1`q`Z8A4~7dDav`Z%DfatsSrdJaifm`M9`%RJ zV{e|59j$V8wV9bRlaXy^PX`?THtMi(UN7$ZS?OCh;|lc@06D48nFr~K!Oqzml^6LnT0D2w^5N^TJ+3a zy1{&U+vo#NpD^-pX;WM4$Q?WC;-wsAvsUbGS+%Ngy@iGE(WFg%=4bWGJpSQ>{&&Je zAN0`GloU`LK_ZlQP5^=+9Alw{0NI6c0-Ryr9@EU*k+)zY7=auR6m`kp=H4g6+@tf= zl{>F?8>c#mRDu2Gsbbn-a!cDS!-|>-#lQ$Y(%D!}^EWAr}G-@jmD3I+rM82OX zcW)o?uMqzpJ$f`Pqts?TD2Rw4M#7W(x%Z~7J-hsAunbqNyg%1@VrM6Hb%o*WTmPDx zE$J=SY3jzqqYkyU7Fdr|ncbYPa(Lv#0T);1^tI|I)e+hC5BbQEuO|&k-E;fPldFrG z?;IMgzt^*NZgWo=X=yJCp02-*{Fc`GuUz>ism^Ckdfa|5seZ={dzAlvuea<psY&+iQdRABGRCuy-Mwuc`rh@cy&blpJikr$OwO|6g38p4<P`*$2H9BdYo?4&hu)D-lG7HgdfVajB~9R?f70LUL!J%Ty>3WGq$CaJ+wjO%Z{6%N(-kAw~A`k!mt)uJxRjJ?KKi7tB z-qmSB+~knY>2BpS9-q;_zs_6gmWSG8y|CMfvmE0yPW$_Z?oTM~qvJn)c7#D-pwRh= zjfQHsW4c`#K>V=xm!eI#1?)O9wa3WwgYLQMcMQMi{YrP<#XT9R_QO7`?Llp(NCQpN zYlEoT^RHelc=_A4hwJt( zOMmA&PR%iiHoGNsf^aCWR&prxDk&>dq_<%|we^v5arQLD!X!LJ)Xp+5qfCO7u5WI; z{-L5k&GyO!-Ct2TL8ndoS8dJo&+7Jcz+~yK$qmn+>{9Y)Zg>CJ+A9o7W(D__JP>7E zdn@@^yV~1*m$~0QH0aU4Sz(B7D8+8TX~E`Dyo!Std~cRT`SwDO>Ek3)cV`-0vGbpy 
ze&R8+HqSWRc>2wY#60W!cOt|6R6u6LWR*d5PxsBoC1%Wd*emmb^qI?x1N{_x|N7}M zywjh*p&Jt2j>@c&9+7vg^5fgD7u95X4z#*IH9F-pw~vg4|TksD!?4@tN!tW0e||_4BiXQxo&Uy7cTbbV9bZ%JU0-6BN~K zKQs*3;4iV_$^PMQpUVtT8I`KpXc2gIf6!WQ!3O(lJ5Hq2((At%7rVkQ{NTt=Jw}fFzVY?6s$}hDUG@yq z7@gAc4^&ZMZsDl2y9R!rRaL0o;GRiV9H{nUGgVs^X(6}ZQ4jSzzQ=5w&*UL=ZcDMS zA|i5p=pDlX`>0Y~@}c~H%o}-Ndfd#Rp?06^!UVIz zgi47|ZFi^lm{ferc*o>`Z+eE=X+FSI!rO^lZZ>EPRXF;BU@E|I6j}724RiF}DY9^n zVP-3UsQY?(nV#PDj&V=iwFGx5CdINeJ?9T)Djls;_ujpWn5;qXLlA#n*N?f5lEdqI z>MlC8VOc_!wy{KK=KJh32%8ZzE0P3EQP{6InqB?Y^)2PP%KC)<=7GVXD1c2 zr1)@7|9vfvwu#I$z*g`2B*RH^bZXQAVtu~>ZXUNwMR)y5YVMiOu1dyH&o6oDH3v4i zjJp*RsrV@8Qn&iJ;HF+{eEWyQ4DCu$&lziUvLZ&eH{>mlwFptCFcp&8E$utTUpi-J zH=yV7oA;XTl<4=^!9r@!)_zGFYM9JtB01xeAF=^iHV6v%x2amfL4j)1`f#O&#!_wz!RZO z73?#yPD1(1aYK&K{h1y!p%cK^PT%|F_4CvQhr4m3mfsHw-o0Vi;>G8#S`3+M*3#@3 zJ~(At>?m(Z&=^5%LOza@3u`C3e=+cqmX)kFVK6JXd$$gJJ^g)chqrl~5y!zAqF*Sc zGsIIK>UqFygwR$q|LahZZ(lgLNnmRtx(HF%H+tR7CD*qF$NxE{a?6TUb~g8i1S?P` zAfz;IX7G~lOA}659P(P$B(*`cbO#Ke2>b@~T{`~&95c9Z3aP=A`z|aXW^gp{dF*_= zN9~asc~j|Dna1nqfpvFx`=1^%Dr;oIbe2d2l;`2S7XN z&7R#2$On$4;%dM~NB<8ac)YB?`prfALyt=m+xOqUlYt5(znp*x7`^b)e)ix@heptL zk!Z>^*%?F*y?=am*xjj(r0V?BKmv2MzF}b4oWe4>?A9;04-^AG;Yx9EiuP0CkP<0x zht>bVL4)rz(sc+5dZwM0pz#tDQN`@4nE2@$8vF1`G01YB)8i$d?B?$Cysr@6>rY-| z=<38dyK2J2zqRvTqFJ-YF4YBEcVX^^`xkD5hzmNKDbu16Ye#SO=XG1_EChC^ysvzo zU$g7_5|PRQ0z7~SA+m7pwt@1C(azvTA6FWEYp1!XU$vjzoZ)7-o%vxaR^ zKgmiAUF6{qer|lQojP~EpQ|ZTx2fd)FNJ|t7JE;b?T=JEE&bxo8GD<%jkgD4pSiR@ zd%5kTkKcf(&l7-w5K3T~ROCE3hs3o9s!+B#GH>*XK$mNuA5iuPl%)bkdZvNF^tR^l z5JjUS=UEy@{Gl{}C9NSv264*N?c~QUo~%a ziGHx3@Ek|SS5nc=DW|mVG``;b_hd!#tu=`*dT?~&ngB;>-%>J^4w8~F4+X@CNn2$Z zTA*gEkZ-i~_yF(9(;>?{Ei2mPyeNBe;KyJ3dQ;aLI7jxD40g=(REkcU-Lp$a?mK&< zIGQ;6h4mQvSS)EWQlr7Qg_veeR>Y%e$vgs^5d$pZ#2PBMU;Qi#%@I7{aub{HP#FwxQ(N1 zsL5g_W0CXDmL$S(OP%ER^caU;F1SQbFKfa@!pkr;lfSgizze?Jf8$r zKW5Bc$`(yZnT>}Zj`7&TfCJEx-3}{;$%Kw>3JOw)VrG)zmsbno2EGbhV0PR+Yw6KH z8~%J8b?(b?L!HyZqeikLzz&G$55#sHg~FSLU_}gt5)K4JP{>>vdvkSR{#L{;Ky=48 
zUAt{nb2`KGNygSwr&I`Ib3NY^aN3Kt#Obr4xaF&|*8&HJXqDT=olnkvp4g|yzd@O; zJh&{QsTV${Y08-1-tsL1-u%7ys@U*c7SQ?T_r`0SOAqil;Kdf6sYA&!T`Uqpj0Dnz zkpTsi5T3OfYFvHpmECxW&($&Khcbi^PB2iz(qI0%E9vE@V3t0v89Ea~ZqKEPv31Et#38yt94Js>Gj}vupCw>my!=4-q>&v>NTpv{a3i z=KTGl^ZtdyhOa#b$VYYRf3P(CSnZkD>j8nW@A1@+gqE=@~H4xLvqZka(+hyTaid-(O-zy1Fa$~q&JT_`ls zAbUh8MU<4$QrRneL{3s7X)0MokwRn>iBMUU#3>CVL}W!qzx#2XpWpZU`xAbj+jU*H z>-?P5`~7-7$8kK?VPIgaxrljUM`ri&YPRfcBap)Thpb_SVGp0vF-ouq9zf`zEPj=4rJnmL`O&A)vb4L6>3ZJ0mQq>6q|21QfB$=U zP4ei2w->fm?_TU#_hsYp8CmaY|2dPoIPOT&+&Ow-t#1#OfWp`n?-zPbSAm%32$ne^ z!g3;QkV(|!jVzcdnOW{uJGfkOxxs0=_RXIWsi%%wtTJ+ni}7y&mcCevz0vOVm=Hr( zw~hTtW`=M%9LOpTcMq7!xjL!)W#e|61Mc1o_UN|swVk`~#c66mE^uF4CuuEs8dUdG zy>n%)4y5cII;y#jj-MoE7^wx)`H5tE(urQDU7;ztm6sPvUNc`Aq;|QbdHu$%L4V&L z)^-VloPyq^&$<^lX>Ee-pV#FLTsIi06$UIgAEC3A;}8c%v*9!b%6wOZS`QhSf%@>2 zlUG{Bwc>!|P;(2{teBrb2$_GcxJ?3>UfA4$!;#4(;Nt|lcKIT+L zPEL+txlNvujDX^fB0vDP^J(<34MUL!%JgoEz00F^eP5f!&B^JiRF`__kn)H|qY_JB z-OZZ6GPLr|&aRou4YU=HPw5@H+5+gnJa=$sEiDy4+^sB58CQ>Kd=DX6<%~;4gVw(P zVz&$auB5-#z!J6MXrizB!T8iwRP?l3D=b|H#*EoK%}ZTLNy*eX0hWm%azFV_Fq%u@ zxM(z9KTIkZxnK(8YY418@;Z0s^Wp)4JGRu8w)M(ByQpn=tbL+d>9_pu?pw(b?fR}` zi;s_{J-GOFuaUEf3T8v+J1?@3V&LBo4=X^mB6|WJTI2xY_o5WyS00Futsf4b*x}0W zx@)QK@%?@!pRl;Q`(swygkd`@x*Dc=*)q}w1ybVT{q#2_DRdQzORd@Y_M;O4A-vlZ6JKO@6LHxNcqE{@ z>o`Li+?!;=Q$aaiy0q7(0LxLE$MzvjqMPBNB{Snvs!LS@R7U>%W^v?xj%qIm*yi_# zsm#39Y=rBI6-TB^zvuMqWzqXLZ?f63&TMG-`!e*F`bD+huXi(^xV>=1rp@y{92k*l zaA4t#a%0DdufMJl2Oo4k*_>T5lk#b4HpN6K*|?NG{bp>Sxla4}RmUwc^=RQ-k$Nob zSzA+ctujj2i_~b==QoB7*t&`oZ7T$=ChSA$CfvouF8y#`}=x+ektcgn(=W5eHyOd z?9BQ;H^1+tD|ki2E6qK}n6}lV3;SGO)!bm@E^Ts}7e@E(TEnhPzZvavVitiyj0rBi z^J!YiY_s@vPlnYi8@$Q%&rCR_Y;IoeGQd5vrPtT|?u!@u^_{obqRX(L$ndk~x9)Z= zd5S)09eSz6(vZ5fW`-@i?`=*P7kAt0_RU0(rHc_8wd&OA4mGwv3VsCT)JEIrpi#u3 z#_ns|WZ0Wx<3rD8n|btUhon|CcCNMEC=#lZ?u?$Jm#d^bsHowXa~=Qm02HO%yKVWx zOFtrDfU~Gu;hxxLkW zQTf?s^Qdd4ot@ux(4bf{+&-eOsG*mSUGSo;ECa#$vI)j1>Kh~X?sn0>vHj5a!!tQ| zS&EfgYkownKK0y9m6{k{8 
z1h9Y@k$moci`i@R{CBhF{v9hWE`N|d$k55|(|YTK+q?fcdn`3Gr{bO`6NGxY-|G~Y z61CFu2xAHah3duBJ9I&_3ldvm`%Q7Zx=-f&VAQ^F*;`L9G4&NKDMP9$P495*%$i!w z;lZc7j}(RSQHe9tBca)!t8LZnWa>Fc$%vs~vJo}RDwseSRbf_p|* zGC?pP#~$=C3vT z*gFM}))r4Ip*!d=Vo3preuP6@j$|1=#~~-?f*BW=8Al#WN_qTJ=SAtAmUg7}`ky(o zJYiAV_B+2-sj_LAtly7tGqV^Qc*N@4o5MbH8pMp*#hDBpYLHTryxqomrdbEEStC`> z8)9ow(z|iv)YWOXy*~c>9RKL-_B#hheCW_e zBo%U-?%VKZHEy|0h{p-*tE3G+`a1XXZI2brKY#kvpDutX(|D8==f2LEq94hs31IQ1 zEqL0G;ahf-pTE8LbJrHT!_I4mXR|0ia{jzpXqkzzX(FjM+qx_1K_QgC?K>xI^`{o? zW5zV4Cxy$xkE_K^s{X7~`xuk1l+OGDu}2Pj__e3waxW|&Tn zp^;M;Rg3X^bZ^#N4^nveW*KN*C>Zx{78SjP%$Q_$8RIUtV)>Z_GJdPuU*}ES66ee{ z>%h}|Q+oAkjOst2dTFYC(CH$fAcrzW?B3UKm;>NMqu2Qjv( zV!079#^ER9=YSHb*RHVWxZfuB?A+oZk=!F7IIFN38@%(#9XtR~n#Yk%l}G6(Ku_^9 z{yy<4w6SH+pZ}efwlwB$hA8Le7Bw%nwA>nfP=9jJEFZIWWgF@hU0p-pPh5W@r`Q0I zICmlPJ26BQ78ZWl*z0IxLn~DR?3vRy>~zRE9VIV?Z^!+zJHVaud-` z@uI=z1pql*W9eFZd-uS{J(CI)yer=3UM!s(1?I_XB)t&#zb!dAIqA9Bt;O*|{=h80os`@0u?t4q zw6NL#*NjV<50kvAI)CIf2@^--m&I#(%`6B&Bl713ewRoq=fk>V?v(ruHp@2LFEM=Z zJBZq+%nv{p$@~)o+1Dbm(@VkffqA|SwKYj1_b8g!*O(-N<8D}^i?a|xa zkcb?U&(F*Wsk4b4;?c_a#l^nP-(iy3E&=oV2 z5d1M#@?9Y_#lA#;PRk|&Kn%;wqPr$z7MS57UQdKm?%1$tN+O**Leg>D63Y_t#Q5{) z^0E=A2OpoGsGH(rRo63|7Tg%qv;^b92};LL3oQPu24&p+xYYL(G_aBF_?)|s$2qmf zDQjo9KFYao@5FrkQh-{)>u`z&1OkP}N#GtHP!=4JOlo&sSJN2Q=YJXod)M`=a?}wu zXJ-D5miGY!n~aiT3FZK&^(Wa(MnVDsiyMp1&)6+P0UB}1goQ3ZKN$Uk&y}ocP8O9| zR#++H;ht8nUd`&2BXghcH)m-MQOI%-%6OvMlC*d8@71^7o@rqbCZl<6vR-1p?jhW1 zhF=3RCjMHxcCE{*^2Wg9-0dwe1>d-TKZKG59w8I|wdB6=4pi2aNZ(&sIcDAE#w|Mr zsrEIRnsIzaxAr4i8ylytJs0?S=P~$qYY0u~iOat}SOweIkq_wpc!xivb6ej1E`t^(?K@qri z)tiJ}NKIu@xX|ykU=0?-Jpw4$p2|U{x($K3uI`ce*d01eRW`mUC{MfZR+e$fDn-Xi zm1}0*=ey>Tt|DU|xJ@Zfr8>ZkKxEh^PMS}oDdZX-t*|j3)`red-Yfe?&(QEC71Il> zzcP<0W$Kw`5El60nwR+2K3l8NikBvL6k3{nb7EXhj0w}BcZ2X22);*FJ_FcUmNE}^ z?FVP;X4e8vB~7*6^Ma*`8~1eliF0?_3`gJD&crh?JJ=;mhs6$Z1nVhZ%j%8{YMFSj z2|)qA2bOLL==dc4X`ppsWRkt*^rOiJW()KiMZ_-mPwj(sE&fEJ4aCDbD^BsKB&}Y()y%;VPjaW>7b;`pdV43n8PbA@ 
zNLyS^^aPW-@nVa=fBBm?66j;y{ZPl?(c*{%15EO%x@7Z1CeXpq<$T>E-FDMMmZx%L z78j8tGF%-lUJwQ&qXu2?l-N2Vp2a0L>B0|CM7#>^fm#&5zIZ#i|GIYh`J+X^e&tNjHqK~NGYuGr7GE5X-iq*z5=gyIuy#vYLAWghnODNMh zGJe?kXtWE?ocV4;{iX{`%hxX-jeQ)8k!%#sNTBQ3}~>q|Lv}X_|^aZ zu)(MJ?GSEE@%hbpaNwF^=fgMNDKJt4+J4vS7X<|cRNouqUJ~6Yk(b^L?;R^P>Ufe& zBphho8l??qO}IF8QNSiePj3zMc=Pn%8hrkK&`$T`w{Hkzer}v~mJDMxc8CSo2ISTk?+v>iv%~F^4{PIETZq18nuM1|m85c#j)-FpvG3p6%*k$Rp z)%BLW?N`&9%fyI|9jA3YZbg|R&1dQQpDjV738JZPx1y_6{)}6zD4jkesbydh-5L^7 zxk+WHZta!a;Vv#eoZhfS33NEJb7*2hLJdHjjDL`;4Yj_LN9WqGo02wzXHGY=%YM8D z(52wpwNDm?Hu(O1$?W__H-ic9B5Q0Nbo78-KpHfMXZW6M0Z)bQ; z!;?GGtrNmtIDSyXVoTYn>7>!SsNvBtoz`vJ-lCnP>0bA}JOFoi2TbaD?ILq@9%Fpe zv9dT^@N9A(M8gf{lNk``R0>6+g)H+Z-+H)fXTMd@`N@ldnsiD@Y1cWJ-aL$EN7=ao#? z>A7>pHBA-ZIWQdNVMJB6+oiJ=_CYMb~oOoCnR_o-}jrw zyr`Sco~1i1kgIN6?<;O>@Cgx>+sF4*t$g{a*@vc^8qA-+`BU^>qq9%WO?ap^c!NuX zVT0&CBb=TNT&`N7m6>_KeWY=DMP<(&vC+DJ<`!vuh-lKXta!kZeZ6LM>GfBqPF=cN z>W*A9WLZ^{)~#E$Q;s#?{hh~$-vPOIU|is zP1{(7*__HtU;M+waJp{h2z2dnT_aOfx87MU1oXs%H)o`$u*X zfe7~5oqxNu`J`oL$9AuEcaM9{>QXW4dwKhvWApzxy;TSWNkGHxmgd|YK@Ye;)iYCU zWD+v*CEgEmnpK<3(bu-8v2ID@)fL62z}QqTv1;D_79-f| zTZ4j3g5RK2Aes1N?e9(hDr(T+OV&c2&)`05hTOQ>%yMMH#jeqN2WYF|*{s=PK36B= z>Wxux3PZm}IF%e$ng67lTG3yfH>)}w>+t1u1Iw*3XQswwG*N19V0mf9^Hr9Wfx+tU z7cW)ZupxB7*}0Qv7mU6Uw0f0t#iPQ>x}m@8HeR~)@t?61+J1=cx8TMfd>LzJTpGCti!_xg_?@pRk$cTCcY_+T*Q#t(eiXV2F8r1zAL0`m+wfrM9h zgm+`KP#5$E%vqN3R<<^bcWw9N;vERoQ436tuYB_U{5t3JHfQJcJe3GRx0UZN%BHL9 ze@k${whI{=&c0`+8c{EBkCA|Efkry_Sa40v#|`x>KXjjm8bKNm77gX5Qmb$~5!Ep` z3@j{Wda4)~+m?>m%v-;nnJF`M(cJW8!P31HTx5w!7M{Rs;(@@oX$7w-;#q2@+V#ip z*FU!|t<9@ij7QU>3muI6ZVgBpnRzTB~eMcg>62D4_x*=eXLTiP6MlM|DcmTI_fP|uXuWXG*G z&+30^N1Q#oz;b(>zkh19w`X9fi;G3;W6t)<3pF$>ex#jJ%`Tk3Aa40Q_qfw%)U7|f z|M3#S0oPL8ozTG!)e$@-4gfkWxSkdTrZIsAO_{y{76oxVriB~Ajifb^}ebyfW&}P$(w)(fe@DA3DHB=uDEckd2U2Y zgb1T8EhPto54i2vQ|c+MG0}D*85E%}zsG_$Znoo>NCTs(M&-LEKRm7oG|e&nksi$joP!rF?GRR-MT^I`5K5cPPk zn<|^$&2K$!vXibFsJMUfd#w%1-Hr6Eb!T3!@1=HOOH2QRQwN4SwVAM2_4w||M>FP~ 
z*xG`ii=YLI)NfBcbFrp=kpE=$84Y_L&OhV2!bHC_T|R~CGe<|#n-t_Jbn3rTf~6!* zkEM*G!=Xo{GDH9$*fjUFZ_nd1HhJaVi0;(a+Ed}%r?7E_wEc^sR)85K4pAJt~e}s=MFJi=3$Dnm3sH*6_XDByqT8@WEhU|KuOiJk1 zSN;CC<+J~BxD%1{=CD^zbFe<>px|Xo%L}U1ydv4fJs7o9#!Y>>wq&h$Qo*=wF1?@U zHB_{GRs@<-QnbX=vs%<-#U~dMY2TOYghe4a#4=VnaNNxn?*3V|rW9F3v|CAVbRhf_I5YR*?g`W-)L zv13%~&^KSZCFNr%nlXud1-+KENq2Vl-G;)b?X;Ck9m|AYZPRl+MRf*B9D?`P9C@E# zW%)SsYJFz3apt8L<5$b*SmCR0-BO~PlxRAkDmj-KDLx8!=%tm5XbL3BNU~~xgM4O} zY1<$NL9ThuR(-*vE|JYPY^YaK9jkNyZKo5@uPLZB?>ArhAm?Gl6Wf@c?ffm46rI;5 zNS!nmTJfPDw=GD2DH9RWY`%iK7P+hq254j2;76r>c40|Qnz@(Sc5c~HQLTS}(LmE*k0sh=bfQi6 zJ&o#&CL?pK4XA6OF=VHTg>|?f08EnfX&kM4t5A*M|mzRMfZ*6hZT!watsN!1mDN zO!HVU9Hyc(A%6@of=*6Yv%L48Khq*+#SCiuS0}#-LpR2lF7izK{d3X8{j2V@3-y|m zaIy6V=TR7K^xhYayWv;z_P~KjTjyUgF{#+7V=-r+UNhYT|umO%qFEzdSARL|aLssFT@@BbOCVKne2=yJvP$uYNfQ{u!Ib-dt{ zrIfDu>VCl6!mNS*4RN_>*dKFrnp82sI%Z5TIW&+%oq>q@bnp+zsQU`pAMbs9kHP3y zOS*NlIA4Euj^fCG5*O#b?OMpN_eEEJY$3bIAL6aJ?>EO+H5DiAotmqyFspuU?dUxi zw2!d+g)j|sOxtra#xswuijaMx$}D2rK>%4 zQ9L^7V(UoX^!k-g9h>h_`sH_Na_U&6UJ4r%F1v*E+0|>SN!z01S>ch9lY1nnd3x%b zZuoT)3DUk-U*@#E8GI=&MI+8}TmQ){sz?ByKJJ-~zN`st(BHUqqOE~utP!-*=h6+C z+TO+ejoaIgLTQX+pT1l%`qZC$(Z-u@`Yb8$xsPXSYn1xp=**G5ZoRvfo10twDWdti z#upo|G-!HW>3f^+4Ry1B4sUL#w8t+rVx6V^Ms>gLA4fEO`etLBj9D3;Wl5@M?`U_o$3`!bW%#?dVwUc7#Gb;f8n(+A3{I-U=II_a;E z%@=)ty%TF5Ts~KI6UWel>-9mk*`=Ha{=fdrQMgHD>&R`J+~pA+*Uc z-C+{acf^QUPcGP779^&}^lLeIh`~imQWtai*ODE&zU~>catGNtGAINQhM;0diq;WCO3s<0a_sbkOn@q)vMGFJvkcr@ zz3*E{^X&MwZ50_EX5{krg+i)w|5dv--OyGu@YC)-e^w(8zp-UnUCwV_V7f6zd7f9_ zE*Cd1Xw#}$+2IW1E+;oHsGaUQOyT?cy4BUo{^$)~usKt6b85HJpg(IiIChPwdz4YT z@^L>UrCU#9ZJLS06i$^mx8^3DU_k^ob)5nVgX7(DRo0IMrsa&zT*68V+jk;yO ztE%2=N7O7RI(Om*B^NU095|1=oTF&4Yg13?YPgQKmQ!X=#!fXe8X_6BOY<<(FqUC(v5A5P87O9lqgL z|C}OmGIW%UqobF8hyT(Ag%u*L>1t2s3dd&6M!9TUUNuT%g{y1%D;J`1p^fN~IuJbO z#c-=HZwt5mFYpdPLBG!Awu7qvUjMJ`wSazN^yZw*rxE>&om+3275+1wQbwQFxc{?Le~tHj9bc69=C8a_}DGeq`8H2a|_*dD>4F<+deI> z^6j)LebKYKW=pO0%DRQlM~V#%U_#r8X{ zlpG({DroM+r|bNFDCf?5t^;=N-Fuhv>-@4es$iKF 
zaOj>G_AdXat4jFCH5m~3!J9vlZSCq>8dM#&eS3R=fB2ekcz2T2L<>z-BVnJl>rx&( z=rn%+S+}ep{qNr$mfiRCTqq;SS5y~^%7f}Wj=K6X@^mrT6)b@@4@g_u_uIRsT}&o< zUWsb6G26im-;Ib57>N^}BG`NBl&0wq9YUM-AK)C}?X#>=@aU0?7xw`^GXlhX#ix~^ z6@VmSJjQcR+TGsx)Wm#ZlG-D>o6iD666Ys*;;b+sZ|LI^Y5rC|7{$>tg+X% z0Q2eHTPk-yGbr=S!@$6m^H;=f+B4Fr?Nv+N=#3eFs(wD}l)I+o_YUq)Qkt#mVz^!} z+8||CYwJZ?2aGIEzHRhN#eMm^ZFYt#T_bKESN1xi`{nHYub=l^DT$h)Y+h`)`BTyU zjx~FIH`ZVJGpK3*?RKx`&KS1xee6Gl&C8>vH;*$MIHSJ4YWU2}KFSss<~Oiu{B6e7 zg=bm^m7gp6>9V=v>$mYEbxXbtSFl)}vh~O8rHw{6aCN!b`$Rppfd8h0BW*=QHF z!NTdVlD1k&hv-X9r*$a~+doFPOw-M-w0?Wp^?%1;;u@nF3gU423@G{u=+2a^3fPnC ziyNkPn0_H_xX&l=E5Cda_jGnjoJ1A<11btU-4h)?FDn(xk8DUPaql2o{g#M#Ew?oT zP;cL|<3C+1)x(nvr{lAL^)p%jYZWn!nGidxfN)-e)$_=EeO&_iPfT}bPp|as+o2kl z>yGN4Bz-rp6OoQAh!9aeAUF|?T#&j!yFPBilRP84`7F)Sv2i<~Yn+((^5ciAV+3M{ zH52{WIorfT5!+mIi*XV;$tZZ>(>Dy~80lGAQ^ z(SN42%6a6m`R^A>h9jK%zi-_uc~@)e*G)EOozh*{SD`#=s+RjTLxmL$m$!bDKh(VM z0X+-%PMcC4T`YB{^mcV!`PnniVROS3U+OMQIDDLGnk=XY&+ zR8(KRMYWyJi^Da3CnI|5YN4W)7FStWned+i+_e1`mE55-7f`bL;?;lvP1jN_1_>xG zubAJ-B(lYS*j#%PPvw0@zgLdg-tHP)8fW9-#`7jH64c!&e1{DWMLAQa?Ej?WncBLog=>tHrFV0)!)8;w6fxh#s?Qq zTs~U&@bZn-2b_)%*WbA_BFt{o@7BFeEx*t@z`660fYi}--6C#{Q0{*3ug|OQUJQ)R z@!XuSIOD6YVQa@{vC9wt6R2Z5`gU^It(ZX5(@9U8j7#sHH9IIUefh;dv#tftzk2TZ z(%&ZnpDwW9T>o|0!h$Q)PU`(w1t$%2{{ZJ$T>mMl?)G7TsTV(-j}Nd!N+CgpzC!{o zL63nEV>aA%et+KVQ3vyOLIhdoSoRpdu5r4H>u!6;W|aNNY1Ci5XwBfcAQHA{9lw+* z87P8#Cl?~ys~$A^bPC|LjBI=Ve!p4VM(?zAtO<{b=*UasN$ zqBOXkKk4=%zATB3OlzL#;ks<#v#-U9s-?cWMx=TFj2ZBG(NTvb?qsrvKIYDXD<


`2lQ2jCemdP{K%09Eq3eD6W>JP6v zsBVyXy#CpC#gA{l)>pOO8s%Y~7k8v;!^*fmr+11lY0sYR){~PKS|u^wP6v+x^m5R` zW?eLQQnm|~glos(;(#7N3fKY?AGe?j=Na;cFTWTHlfl$2G;remq%{_adi*7y3F02u zyLZN%o40=NB-Gk5xx4;RHI?N94ir@Z8+Stq2G)fY)du@^!R8izq_lB-5{NUX+TJq; zax5z|v$IM_^0L=uxf>M+8C)Ec(8IuB8|f%~BCTI;>Z7pe5{{8*^&%$&G{N9v)2#Da z&WKsJQR48c?-nM1Gn?l<;_&!yWE;%7@3<_)T6D#9$DH0l3v+yz5_Hc>Oik9w+D#`7 z4sVkX+ekNTa?qy?CULQ?$3;wrVuo1h3M8zkWRf@gY-p6Df0U7;*lMwpNXlsZCC6^K zKv?iEz6932s#-P>G=kvcezyG_%y4O=%g6&iR2q0sJFau&%iQCWlG+VjM=tmPo^!Es z{*_BB6H+4_RW-jVeD1c*+w1K}8iUaX^IqalpWVpSY~?(hx{^+@a|FX@{FN zHAiYkgy|5wE#tKr!+A~Rdn=;ybZaiIUb9A`a1hDKaGisrCokLfVQ)^qq16G9Q*Qj% z`(|AIdrtBB8g2WF$vVynEJA$vjiE`cwa1niSZajjm<_Ft%;|R%y%?ZribMFQ?i2HW z8g=R7hX_dt^~J^Irf%-WUqU-pMJ`xZtb$B5#i6t-s%;7ls{RcJUO4Fc7nYQ0!aI#W zJd(7wKJBq}IZPT{nq~C>q z<-biVrTXE8L`5&oMj=pVY59ZRYXD#m0DEk967@u69^_fHI3^DcN&nqyMSOQ8n$M{rLbpGYBv5&i(sUGN_#xvelD__C3V7h&l<{SfTu`Bv?8-hOVAJo9S>o;l%7)Y@9IS zvU2rJ)JJm>D=I4HRecp$6d5A{a=ElTTp6=0&20f$P|}-nIEmGjpGk$;8^9hlJ$dTc zw+HtD(*tdi5xFuI1-M_7Nd!&$+;xk&wFLuUT;A-*r{}bRJ(3Zqz}h$=2(1VY)oU|w zWwi}Q6<4Q8wmw)k`LaAA&wX}=hjPMPJ`Dt1bfX)4`N>GkoshTED~t@&K~uAuJ|T%e{O?z~^Bq-!)0K4tP?GQl zJ~MGIACyiWlA(x@fj&~Eo!fy6Nt7xTkyET^?6w)5n3A-tnL*(8Nz^j3s5fp}#xl{C z>1f{Gat%a1p8*z-@vn3t*yY46*yvEJ-K6CAmeepYuBY&wr94j_6-(5YjPBCv@k|;h2e=Y-#|U z=#hnOvS0wQ3P?~G%9gfShXV{P3JwXVVoiAG;b;yYA?TL{f`SH7JEkBO$izh7^XPsc zoV{~-079%|vS$Nvq|rve0m^W0+6bGc$PB>Qkih>r;$z%%_A`YV?j>kH{YxT+96N7$`b=Ls-ZX7#5(HJQw zmpa^lM-G)c=0 z2?=?!)alD#q_*QKY7M){b){c-HoI6{&@?TOjZ44-uxEJJKH&YeZY^d^f!xp}dVe_2 zu@2xamjIuW;fr~v?}0pV^+>KR1SekFBMx3a^&WBYW8Zi9&^`lU8C*+r9q(xbar;Qm zL;*qdw7U+;BT82Q{-*M!`puvHZHIZg7E+4C?#l!J`qk+3(A$Hje?s`o3q<1~Q)6+Z z#Xh@EHS*v2orsYYcR=(bf`7Rt0wFuI70j=`AO)9q$cn=p7JTm9xtLD&8puFNE!^su z+}9`#<1K(I=T*sM4Mat#NS9*GV`<8)#JhDH%$;o#KLL1TV&y|Hb%S(kAytpvXEv;% zVGBAr^hb9+TM`PzpTG+jkIIgEq3GS8wi+71ahqLENF+6%E#iWqgb?$PE(|C{s}Y-I zv@$%4(*wMgJ#Ejt9Nx0mqku5alR1IE{PjlSOJ~c@L>kJz{I|jkMQFDT4DEn}{s4y2 z^SeP>5&Vjl6;X{bVk9Jol4dExws}M3Fw5{DzGDMiqSr64{>){^DTjQBc*7=$L3CXQZh}Lz;q)0qwGc 
zv(^8+T0b&jk{tV2etjCpt3-Li$&O~t8q=o&CRz`CM1HMwg$X@m6`nI#u@WSiD4p#w zJHK92(vlQ9nU_FV3G+bkorzgdLQf>(AKeVeCc^)7csuB;Q^Mo!o`jN5fq>;ZSSDnlR&%)OQ9E4$ zBt$C7S3u>dMOQH?Nyps0=%G+5jOCE1WD1Ck*=CbYxC|Nk>*L6RrQb!^47v$!H*`&6pu(h&V1)Nf@inDE*YD%Wex>XS`>w<) zpvgbxI1X{tsu#iLOcFnYq)^y2YHW98I3#osP&4@FyEHHT76F?#_YuxwYdrfy*bAWH zYW;2KHFj`vxRb!Nn-<(uV541XQQOL&lv`6 zlT9yv$Fqi`tA?R$N86!rd84|t-^Gypz*?CHg)z6 zUOh?Q&@e&-IU+`yGatz?aek(I@fg@ag4Mj#jAe$w1i2+1;b_u=UgP-z>|gp-8|X zJvwN|jwl2NhX_x#TZ7~M5IIhILx#*)^ZrcHA!JUYtjCQb?fOEumYRc|>UaU&$-+)hFBj#&rZzU?Hm zAPIaHtATsR*L{1sD1ifG9CMXJsfiK@-yA0^Kwc*$?0-*(tXP)m1b#s{o4n_OC#Nh~ zegoHiPknfD<`~I4oP?I?3*(H&(q}|a66Bs%LB|;en&$ECuVEYhTxJV!D#-f6PP>!d zxR<(yhBc~-U9ix-JNI8aJnT@ImvKzaguFrTc!csp5^&eN0VR!)IDG$rfN}CzQPCRt zBKQAyi}D)&eLS=F`}Sb{@b9evICheCtS>))!=E3As#}I>lj9INYEn0*^?X{ zqrQ-!V=b=e2)Lre--x%OzHpI52iIWivN};zI$2{7^QYuHzCn8+-%*xp5;E;cK0cpj zc%94%>U4eTmP3h&GLcAPgO@D1R#+H`;!LWE|K&CiImUm%l-ZGJIeuVSvDL_DXBY6n z&_VwF-^AMKkaGX3@-l&y|erc?on``J|M0Ps#AD;#di=DGQypCG;{3hXi{kMGxgd*m9Caiob|%k8ZV7Az5*-b*jA`B z@I}iE9=uQrnKM=a$pFSFkcfx1hcQX}vVA|%eTw6o&m_?*;uT;+-{L0Q8W`9GT%oo0 z=rS~}IKEf2{U4J?jmj^6(S94f^NY(HqWn0-qmo}p`Y>RQjBgLqImr+n%63aWBmKL# za0qp8lc^5r4>P#TR}mE;Bfy3F0y&f*Gu9?G|EBQpHq=fxxKlY>@=<-EdT>qjMcKj( zzF_up9ypx*F6ttiNAc*xM1&#ZVR$2~f-%X3{Ep!?jPz5auBhUXr@|RYfZPE1ZVGcA ztAuUw@%&jTM0vy$5&-3#8oy|-jUI4mCD#F6R!3e4N-&}=Oj+NAS%}@qWnGQs+@j;b z^ynfe;;D>Ga`2(kVKfs1d(0?mh=AD1BTOS@PeBk|LIb2l=02sEXihi;T-j9b_q~0R zPvE-ZUYD?Ac}Xl9!bMh3aejwH4sgwc*9u_D-~>HBrOA+h?%5tDk^3)S9?USqf5-KZ z;BxKSMk;njYuVr~Lpz~ObPR5m27GN9vq@n`26>}05MCq}T(3&}L;>Q3FahCGkp7#p zT09s`B5n^vLk&f@ank1JVtPvf`PYOXXx~kraL_^jgZ{S81AK1iydW$jkLH$CngIyl z{@uFoRHttd*OusTb-w>?=RrOnNDUHj7SLwQ*c3}tPXCLC@6srNaG2;~wP(+si3=R8 zNn8+Yb5e9|Z+#=TpL{rw19X)+hg9fRFeFLE9RBw+d^H?lJBT8td<2iqei;wK#(-th ztl2a0Zve9G-Z6z)R}^s<7)U}jPr`rsix)GeYPD)L96XF4cWT;4?cw-U@xC2*-ZRR_ z=T!DyW8}>6B5g<_fwT};0>@v*qeoG!PWq!eH&fwjsZ&`E4LyFbN$#H>3PgkzIyy^l zh%TB(MK=nUUtG!5W-@(M0-G)3%2-n7+ot3&?t$|2Wwl1<&NJQ(utB26cO<6=Lv+hCXpkbDb@3fl`<#-? 
z7v}tgb44tQfq|dNq&F|0CB5HXml0p~^HXTJ^lE8ZgXdU`d6nlzcCWXIsLVSAH(@9n z^AhPo<5*Ii7;XIh`zoL$`M?U4cbpPws~)h-$t2yNV9^9(qffRna3)Qi(A`MSa}NE^ z|N8wQ7>V>v$b~r^TWAgKLqttimly-d$agM6+D(pOw6Bl&~A#IwgZ zT2y*6a4Vg`gFAu6v3;MOTXdL;o60pW)qUjf;(F?e*q0qJnPEGav*wm88Sd=Z9W!_i z^on3F?8l2Xb--g2INQ0)-BFs-trVQ{8n>*}EPWM(? z+u1cDdZb#q#kJMI*Z}#U*IO9?O%wvK5?sTB*=F8LSW@HJRZ&TeZC{m7VZ+vFR=b6rO*$6ErC;zR}x8fitjXDbh0P4|7gd`ga=Co zQ~As?dck4K2q)|yB1Gn43p>vbv<(MGk^mLP$ux+wDfzV?UEeLYK0Cw1?Y(^^o)~%p zqP&3N7_uTM=Fnbv66~`wGK-GD=44WreIw`1oA<9lJ86`2%BgM!DPG%ktjPWS1h^?P z-6-~RDO=~Oy781uBbUCJ z^vS(1f0QXVXh1++#OcbrFDJs}xY_$%tH0CaLfrqr3&sqga%x($?u8BQk{o`O*8kSx zXHnIC;zNI;E5x-qq4e7-R(r+nnBy!OLdU0IRBHq@#(_ytzo`sUqjXldF=ipR_%GzW z*gxd&V*;I1=axPA%Gjbdrv+OUdk!g)z_~|UuU^p$4NE@Te^!Qqf}eubs1Xjy28ar8 zQivfg>%;6=V1*fZw&r+KN9%B!CVFdZ@&6x zu}b8CONX`(2-Arxc=KKUH7kId3!=gw9Bp?A`tH8sr{?Od5itXgynr1Lv~XaTmq$Qc z+AxJqz)T%0alGP#g(UA!YaZt(?g zAW@il5^Z9l0gn|7B6~8dn@Z+gM~_x%m9${9K`wnKt#R&$=*JvEIf zHShs8zPhLWu#43JXCoscWBT@G?}J23rbOMh?1+Ydi#$nr^MFj==nK4`EMoH!P1J_< z13K2he4!2ZcT>PIYF3!4=-_gCZ$LUwn;UL@4mc(Q3MIx|C9{N|!B?!Cev-;^j6cnvkp8-$(P)b5A2hsR>BhlT@y0DI zH1sakTl#E=&5g5TlBTY%?g3z2uSe@4yLdo0SB(q}ZM^2;^k*pWOFqHMh^*>wouYKM zWjn0LqC{vTn5RULAXBg@+1RCI2|IxPq{aUk$W)EXS|uoAp%h7k@MR%$ks z7k$^$515i_oTM$UM_{B%X-&KF+qux}^dozmNCOm(A6opY6YEakN}RaX5n1{MKKThI zGJ6Vnv@xI+DA^>#9%(Q~97f@pJ^r20i2xZ;Z+6M$0nC_7T$N7YMq4lAW~r2*;bIyd zZGb(}wrS&>hRACP7vY96U3%@#-MjSn+~$Ilfdb0SDuIwrdRutBZ8~(AdukNc)1Ab$ zQ|OC-nd-ya7l<|!ciE(2Iws~4ia__>lj7n9bLtEnj|v&a-M#x}2sakf5dWj6Q3!`V z`K{8jrOcS3+nvbNHx|ep)wTp!GU951X>FR)2%uydG)ByO1nEWUY^B-@e_ABf3o(h@ zHxgeoe*-rKmH6E{@%7rRxUr!wmTQQ_44e0phxo8nIhEJw@*+l@zx z@XyYvJ61TCyOI%#_RQv>JDd2f^A!9AY$O@TMGdSr=2%e`Jkn)n!@NzCQ0j^Mluq6D zgF90Jv6NFs;M)<@j~;^&+$)SfMAM`a9*ZI3s##50G z9eTy)me^^N%$!?uCbIwOjIgZFrMRP^Wu}Zt#druk8k^{UoWcxy*#F4J^+SXm{L>y_ zaE;s=xkU7xr;K9jaGsMTMN${l6Gubrj+INRgLv<;=`XaqbrYpJ8x*zzKY{pXOcj}l zICSbUjcL=KQu508`%f)0{Kh$S*6LbXS`X(g=c9O8o=uJFiiJgoT?3UNKG3w~AZQ1G zm2v3zmj506`%3|plFUP4ur7(3;&uYV+AA~QIGRC1w~s%3ke-;5Q-;H0{nP?G*h;$w 
z_jY>zll(ioevi;qJu>CkR7*z4NGnTJnBlau+xeYtbLM1Ut^c!)!Pf*7A_j>4#tPQw zbWLQst%Q__FP~xtJtwU(taCH=B3nQrf1$T{!?3Mmz$J6PjM%lhKbs98!&CMA2K){M z11UBHkl^`babwfSO8vK|{}M6wOJX!ziuc=*ys-nQv~l&ZVW$f9r3wK4%jwbzBg7_B z)*w4GA+60MEu(E=r(aYQscC7~F%v=?3zb0Q!TaQDRh%&4zQ@+Z80+WoJqhsjUQH%l58@PMR5LF?tW`YL)|#64#7UfYZbG6O7c((-e+ewbg1!6b%={;WbZUW;-Q{)5 z7-U#rEKmOQbbQ8^^8L8nKq{XTx!!{gg`h`9Nf{2`9tT4u1tQ;CR?N83qqoGU*N&hJ zc)>_y`UKHt;zN{CD3MD5z$rW_PZj6+Ix zF5P5vEFB2VM0QJgqYy+5=jvzc5CO^nyj z7ZdF4>`wm1fi0~GeWmnAoDMN2r&u04xZN6fM9IYQL=zc5>qXE?u%W~^Ubq8g@R{suUq^pGz#TVF(y;Dpv8CUVX556+UHK7R=tkv<4(}^VY(dHk_ghH zc?8w#qS7l;h>3xY%~rbND(=yIS|woxjc0CoGk|wsjNYE3sX!YyfFg+3tn9SP+O~~RcQsB~Qaegv`XfR|oXS9F` z1nke=JzEnfhtnPBZWv-SNo&HZs~_16ffPgKHknLzT0XGz+Hdd5H`M>sLgZu>Cb+?e z^jb~fVbR$eOw=ftdq$oPyXNkfPCFRU{<7~1bLSKjx=D4$ZT15g0sSml(NFt7OERbN zwx$wTw=1~z~mo>fG zQ_~mOQXna}^aoifl~Cq*$H75pZVY-Hk(fP{3=r!a#Rdi63+rbQ=yLSy2H~>jFhWM9#1IKk8Hq3RGds$W0 zwC5lC&?#fA=3qu~@@VqgHZCps+N94u>t|En*$y4pbI{4?-Mi^6?7JllF*crfuCOTg zpL^pHCcXLA7_{)wf`re9g8uwPntIKrPx>`0zaP1}B<`01M;CvS%Kh1ux3%L4Ro2&U zMl6Yv%dms-@yJJsvaIztnpSrr-pPWZjgNL%Q_aT4Mu$Jww(TyZ<0pL`40}XuK)(lS zqu!=X8)<)@udM^=23j5Ru`bCmS)f)>AyiE2OP7A$m4B0sOeO1Y5qE&3n+csi6Xk-@ zck}-Ju3JAO-OBRt3Vxm1?C&nh|NW0wSi5g+8;xl7-~Z(3NN3gV^#l zAKA*fko)HX{?_KccQMh!#m9XJ=^@{%H(xPo1Q5Y10Q95O*1S15OEYb zRa{e-9~50??ipuuVeObUeC^G$e2NZ}##JFESRFqobJ!W0*b<{J9oCDDG;p} zzq~0_8y%uUic@Pp=0-q?Nqfb?i^$QQt^hC(hi?Mxnz)B>okLTL?40ZC6$_eQ49a~? 
zLQ$FROM}W7fnd(PSXvsOsByfa(A=#4`#$~xVHf8l9g7TX;1HlFN5Rsd3x9zVdNFMb z-J=wEoB-O$fzfX3O7XSA5qZA>pj;{(MDwMXS)5 zeb27=#)6oI;Udc?O%rrt6H&d-Ho%C$P)n$&z==Nqh9vA6nU8|_B|?VQ1yXlBpk9Go zmk+0~;a824plMW11BBipI$tUjzFQv5D{bB_?q)R25?Ut94>DEa6$H9tH^-QGHG}jx z0B{HO-{j2HV;@HLh%n{h#ZyAgFTK*4r^9c&0cE>)y8FuSn&4X2(CL_whd?)n1|aq-fnGUG;G9^DM2)OFeq1h|z%mM5@=c*J{IIwll8 z9Kd3nkpVSNAd1Cm55+m_*Y_Hkltf;5swl!1JiSTtJeHzErW&xtWNfU&pK%@;<8~4Y z4mI!-xFzmLap8kiy!rW{X~ky7*HUHhk1`c@&)&UMj|167MMJ!i$^5@T;Cvw|CF#kz zU?hDJ()R(CkBzPElgq13`Saj*3epOmo>Opy_KzKOkh@yGH_A{t>pMum4jcH)1fF3{ zq2ZDcx(13(WMVp-jD)8O;M|ui6)bpfex^39m`;&KRT)pm%cLL$U|Ws`7_t4;#*+BM z7ZmFlZNtgM5Ww}q2bHpin!8U(}~cfEQKiIOZs1wp!m@g{wMKujbtxVN-Vo;j0s z`*t8YmDi6{nA2 z{elTiR-u=wOem&{aa*v!7U=@bXyNj)^!B$&hr}=6m2{o8?u_&yTx&i+QbA$21k@nYp8TsQoknfMHBdZ6(qBz?d5 z^HJCZ1_T6LQQ7#h*)-ZmNLISvCOkjT2<72i4#HUBK*$ACJLXZeL2BgxC=7{MTvk-2`!xv2=ytjv&n{bNq zV}bmZy?(HfGsAuE+#KqC*5oT1RqP$*FJIn3Q1I`Wv!hc3H+me|lko*XW1Buors8@2 z_HEGN!-uQS;uiBjwRh>NM^+GwbOX29vjc$>y96)h<3_SWP;XvW(pr~+C};pcC(4bB z4`&DzljPo{Nt27qC-N|}9#yHQuV~HXLwUSBBr@BxZm6fl0QqJM7ap%3+|W)}&Ecqn zRV}ng{T(`uTz;+48UOjS_We1eZC@poZoj!Bsi=Hk^>qc zQOCXuI<~;1Z{Gw9|7I8WvtR@(kZV^=!OLTMDL{Q>!3bdo~&Uys7NZfk%Va`_--R&Q8*U2c^WezA4=6PwUHKCi62 z#YV5Gt(C+W`OPe%*Dfzm4v7F5xyNRXT!gH5X`DqfAp;o@&PkEWLVfeoE~06-H3i!E zL0w}#pJ^%{66bDK93FaKuh zLC4OJ2k-)*P(c*unwsM&8=f%11eIlTfSO8Pw&J0T%V@J@A_LVNQ;j=5vpL!&VlqJR z4f2W5Av{~acDQ(T6E`t1&<$MU_o+a#c~+az9uNEVQL#(HMs90K_@n9^&35TyZoc95 z&-NWUkk8l%hI%}g5)e~SZP!-+4{L88)${)De`lU$$}D3^hEk>sAsb1iGzmpQrZyRh zWXN3XOqD{3LX!%i5{X2iM1vtoqJah^N_9W3z0X`_`=Y7^X=lk9JP@m8H{Ti<8 zIbD{M`J-Rn+!VQmJT#Jx>HUWdwYaVj(BQ#&>zSaPF^1hx;V>A!#CkzP_$87r8*$pB z&l7r^%6^c?V6%$=RnS6(>Z`LjTli$}-@l*4?}pcmN$Tu8@ZcAXyr~8=X5_KeB$Zc` zVg$qrUk<3Jz^pIYK_(WnW-=0A^N%t?fOwiVbmW}fvpGl+SO^?IX;Ea^=p4UKU|4iW zP!$x~OR-%7KFW!1tZow7&m&y>IaLhs$cC!Y#W~%_0sKI=ZR+xle?@?@KG#+sliDK( z4=MyK2tIeNj{-F3V=nmVut7j%1WTKCdfP5}FLL2@=0X0OT!ZE9w@nGUqXD__)oN16S~PFXgsI??Kb;3zj3eIe zuiO1^uV;QJQgSagP0sIvPoGTtb*#7xPap`MmUnAfBi))W+ITho0=1>e9?9LEsP&|7 
zq4xbpkD6n=n`ASp&P(4lvp)h5XvZh;wKD-fC@`R)mabUQbkcuZkfc?Lq5vSB5=hD` zid~dbQu+y+g`O7!itK?GfjP}x?mV{9;u}1msQ=9t21wQmGM4wisY!#@kl_6fxV~kE z4Mmbf*loN8bPIMaBA~_F#_R7+uI;jky_(~rd-Q56ioD??Mm)&QzLmYKK6c+pL)H82 zpwN!(Et`8AUR-!y6pDa;(4LZubn5hJnaKd6yC7mo171Pqk#id65tFLAb7V+r5vW0j zL6ud_s^wgNWfr~Y88OU{Df}qk+t5R2HJRvAe*2@n&clZfRhsH4-*no=8f|`ipM9Wh zpO1AL7l&l%ZfTbWSJGvAS2pIBFXqAypo-F=z2rz<4!732`w0Eq-jQ$HKlfJl_VKwY zk*I3dS5 zcqQ_L<9LEe6m~s3T~>sQh^I8;o9^d3D7Ct3qOQcTal0~eVdAuO?A!CcI3s9f3Z-Z1 zO2kS$rDpn5ACzXH;#*vyb#&V7deA`U-s*Wu=vflvG?NJBr%%O+lc*m2jNXi80A#KQ z;*585OXPJc*iuj~KuFWbthk+zo9S;>L`Z(V<`r!@6HVQTizcMo%Ihqa} z9g1M0?)^htT9(5%cX0}*ui*IFJds#m?Hzv{&)Hb$(I7)kd6ce1Swlqd<+A%KC+AZ* z(6JQyS5f75c~g8*pBj)OJc)11Z*-ft&41Bwu6lZ7#kgEdn!A;p|IUd?q(pN)eYyw0 z%*(2zc8w<)m{Fe%ltEQstFq8Z|BcDsvILJEsxX??pqUj)d%rzI+)EQn1Zq!;e%Y(E z%o$(p6w-&RwXo>UvVl7Ca%=;&Dp$QjuJd$w1xa|ts1NIkoj-h$)+srpoJW7yywb*I zFEkQ|dSb(GlqTG}54n%glgL~V@XLQE|8g=pojG)<1xv?$NOIri1W4!d1~CGqB3aYs)G-yV*SRqbUWTL#5kn$v{%Yszg`S$TSOhl$k_g&CY?{PQQWEl2h2EpgPlJx6xfg*NILNAs^TL!8LKpEEop0{*JGq&!lEev{srwe?F(P)oJso#0GeH1lF{Ty{m8n z)DDb|{fdnCmYvr;&MZZ`Q?C^1xZ>j#Xgz7k4FbPK9Wj9j2Seez@H1V{rqXxfND}<- z>eX$9xjShpDSi@zZClL=Hy#(tJkX`k{ED~B7$bBZc!O$Q)I=Mnx)vH$@dedX%>IWz+A8-oFSxv43u)%O-?8kl%JlTVPgh2eYyQ-22D&S&~ zIH+(9xK6v!7d0J7MW>L%YR#9w0(nWnJE!cJOYR*UO+Qq8oReZBr0t}18!%>!`2%&m zXkoMQb}aa?g~xbtj^AtwK|v73mJZz6CFJd0#h5-PFB;8iVH#x2RbT`YnzPeB`iv|d ztZTgVu8X-|g@8^ro4{QinEGEhU7w#fdtCOGBJ19|*G%0(l8aQAfo zw`^06Po;55lY<982cF|~$c+rLzk(~Bcg09K1KMe~jAQNgUZs*!pk&MW{WU*dMyZ4E zNi3%{yzE-I`=`|2pry-b)lfXoc%bfLd6M{uyu5j#TBaxlXxT6Btgkb^vPfdi6f_Z! zL%f3ufSpc|ZH{=Nu*SSaLdWQx;d4pkhxP)F_{2mUYrq~U0ZAAE&vk@!CaS+NZc6JT z7v~P+PVW-?4|vE`DlqxV5;l$8ScV*fsx%cr&Qc~EzrEw!1M52a4SDf#s-VEHjV* z7d`*9l+W#whmiuZF!!<|KNcp|mAaYd51tPj?%&wkw}iL9SC9n}CutZmqFgWDM@_JMD8@}qg zMcBKhik5r1>#hI(JN)M4FZJp^o3^I;rUVvnNI+fhr`9RHa#zvS1nrn|w-8})X-bT! 
z{O!l>X46jK5cUmO7C7fWs*iV*^28yClTbxvANNc;=ika5z2cmGL|VY=LS~{|V;*Hu zn^wajp#p+1n|W=>-$IYHXyH#W7o!pXO%>D(>zjx7(5^fW+8kT<~Ql51A`a+OuYQdPHEs`$?$n)jPVK+Co zJEzZIW8O0#x$xmStZ zPo~BR>lEkpq2#0-A@aSw6Qs zG?kY})#%>qx`e3!;b>H*R-8W>wdVabu0!>W+WjfHwk;X?BE211`4}?Mk1mP)ep5|% zP|QCQXN1DO0mE8Ge+_NOqAiJSks7x2{{H2wCq1E!-~iIqI{RmN3^2X~wla|^(QfD5 z(DNCqdv}wSFG&`|#)V8Rrj+`0W%nGQT!soRx>((jTJZYcxpn?;d~}11>ccp&&|1%! zGe;wGMJixD{+ArjJ_vI_Ha$yD)X(5T?bLE=v|;5C(89SF>n8n}6=(!OSK6V03@>1w zY{t`S$}IFk^~rwu;R(N&+S;05Y}m`0)soM7j7Hhi;k2RZ1w=`L64}LW$lpI~`8SiF z>SywKLItB1S58GLfXYs4qJ>#z0sKkK3&qwRymwR$Ka;JJtoE%rr{}Mh)$4c{47D<9 z0zzf}hqqxMi;=DQ%Z;J^UE9RpU2UVaAo_08WSgt@xqgnPjOrPYID$aUp2RkA3%KQ& zAqHh`2(7s|y4b}zrY&6N`f{#U><*d_->5od$YBHr;Dhwr1KEx6mPN6TUCNCiqK+V< zkBquTSBpq#BLh1iJn6&=z|BBBJxWIC+JY5hNfaj`w)q3fJu=22R!MZ{H(e5 zA z(+gNdj`<|CA-7X6YLnyQSdI?r(S8d+=bd;6)_~ja|wk z8ZH6(;np0xsqkExYRiO;Tg`X(^)}p3!_7IH>CmxBp7(?qq3IyT_KTG)9ODLcay{_u zROdu>Qe7|JIX$TPF6GSsLlBX00tV-Q^$?By4Lkh*@=xlO&gL1bN5VPu)Un^ofcY!$iQR5s+@$WR!gpQTJMTaQi6-?djtE00ZH))3BZn`^^avGfZ663(G zh>OU=Y6I7NoV9Zef)XXf=Fg_9Fx)MJnU)w|NF*tL0f%KW0^eyVU^^Fi8?aqT`J+ZK zh;jxBLviqwex4kbgruYc`0LMNt)Tgg=0niZ<7-m@O%OhwBfT=mX0oF?VE;;RbeVbh zAJ`&OacRbPfpSEmPZVFM1Zs@ktWH@~ZFi&3_()2T;SQyXxa=TIKM(85loY?wbd1gq_?mUkbV(Jhk}6GUPf6gcsA~1-TC_-bYczFC= z!o6fPIgCkI*)o+s(@jk~BL@`5#^E%z5gH4cTLoYf@q7c3lKMeGvViIuSn47l9h_T> zqZ3@{83%;h(-H4(ZP>%8n3%XztcH^b47s_f5T)+TX zQZ$9ck_~6t#b@3hOiWvjM)|FahRLLIt{tg-P{7Zubm-GORTyfp?qdw^69+lvB2z8o z)TJWZh~7=0IIuSKujp$TOpJ@g=O)SH7VRD zjjXDsX1k*+{&GgQN^kAw{IeV9Qvxbqo+Z$pKA%FgX7srMka3V-2O=UM|J1LqTVqIL z*ll>fphB$8! 
z{ao&$7K}$+FpsfgEjF3&v*$m3I8xKEEFQNwRQ@kGt>(y)1(ei?WvPAk0MNgF?IVa; z=gz8QL-nHDb?D#%ipW{_Z_U<(OOc~XM**CIaG{|zE3$Q+obov=_>xl(n54ON$y_=M zcQ7au5qSr6Rr~hc4TT2Qaf$#OUZ~K8R2GzU`lNTQSg`^$O#x9EQcU=zq3+wua7{pF z?FDbA2&~qoDn7&PKKX zzW$?VEoD40f0f|?hr;jPaf$xDUbiN+-@rM^>m3e!eM2ai8dx{)yj4(;b3{a8>&s~Q znhL{1D4y}oW?hCYihuFq1Y(d5+>F#aY+x*LduTXb$eTFFsXd2Zehd40e=BVyW#}7F z&~93D*>XBXGv`sKmitQGI`=^W01pQcJeeXHC{@yKQhzApFwO&QLOarM$(l88_&KpX z#oRs1jVmKL$Rk395-9X}b~Y1r^?@;1EazDK64ArKiFN)VRbL@E$$w5ZG0sreN}t?d z48A-*>Pjj!&f7zr>cB43ghYjqqkR{K2rtNvKNYfAi9`{GKX9z3)J+_|{OSULLTMZU z9GGdi7+5oU>vvTaeoRu$uV0AF<0){Ie{&4KQ~A%9PP1?D^?UQ>3(-CR4hB~HQdwb} zx961N4xB<^2qhFi7X=2v?8QU8*3#G~@aiCt5p0U-cS@;0ZXsXcQB;pTqt^|GpZ=UV z1DVLe`3-|OH@Ivk`-CGHQt>mi&NOa%3=gQ-8^v>?7rl+XX7~O%4rOoyPR8oAEaMs!SiqIsGW%OUKX01yJFnUN?as@`tGSJ00d4)>+5Y{mX1Uw7{YI@*hk(&X z+Q*_!LTiI2UM{_yvIPEtF622IBPUg{pI3^M&AHpZx0zV`3OJw8qTo!=qLh#+gsnQebaQ->12c*5BaPbpA)&%B_Tpo3=q*DIGUxl zS9IP!?{nc&-Ye&g^Saax)O^5>&|M!m=y0NGIahDOFa``KQR_62G%5xd!;wz%&gPFE zgTG}aMHj|A#UbAUGge1a*z+t^w6r>K=?MdXL0#l9jBT3aKleTI5inu!INHPKcX#dt zDUh_5(WAQn@B?97{#S4M^vM&svsnTwxfU=uG*sZHP5Y;3P>7JB0kEon;9j)VSj6p{)`X2GY+9 zR(E%(s5XbVkP8e%lrzhZ-MaYrRv|qb#1nrJO=?$hYKj;+3HvwWgPm6^+Dy$;y>xU$)Yr zVh^$Osw;#tq&t=ToZi)z?x#ocKZ!|6p976g4^;$05s;sUdWpRzKr}Z4v5PY;EV|Q) zZbu0tKr{PfJ=de0HR{HfcR@mZPVM3U+*mIp?%v5k_kR7}y!7?;4@p`ScQa0WI)1d> zuDkULrlATucdobBIuDFNxwLHcYJ<6R(;Cf59@r+B2GH{SvZ}JX5j!V7In|(QCvL*s zo4P-i3FzRs={YwYBhMV_m;(UwVy4A<>b0)4r+lZEz{ju*l%Kl~=+AMuEUO zogZPgvwk3ly-zjCu+b$T)sjrXS$>rubLyt)Wa$88J%qa1#W5;c4~oab@ME~TL^_E2 zw1OC;lc(;v;fJSw6c;l}-K*I{esV({vIzShXke1h$Uhibd>wDR0aBuzO6N4n5r?g!HG>(?)tNEUY{A97yQx(f1b`0KhO=Euj!zmLpP zfYQuKZ#Y@ah=?A$x3}gMm#wk2eFTNFsmPuB^RV)S>S$}mh!3QIpILjY`5c^=!sgIh z@SPoP9!_VgBBfgbtj47&Vh?`)LmE+vyGz_blF`jI(pke&l2a)YSJKMFd=oltkyHcP?&v4g%6A-x^#LzuN_lTZqZwco|cPIQdtle5X0GnI-I^Kp3_REGjdn* zumjDz-=_Ou3+LlIlUal~?_0xUOM?KYKz)-@yMt&3bC`2r!w=N^&Px^w7faGN=+D;D z)aBOJ%oqv3qgbY`t*IIR`gKgU&Kf-EHlhsn5JRveL zLT!}3`Sm!<6)P@Y$@V^IlB^5c$lk{~qs2sxkBG>o4IsvhShK2Q@KYa0ZQUta( 
zQ7!O_G_|y>$=oChRD0_;8=4pd_x`LK2~~t67m81yUc^u`m6iT1^=$Xw{X}no!j+y$ zVNgmK6%}`jaCHE!w0AM)5+?z<*P+oRm$??cTg`O`4m?0#AQko6oM29P>IK-Fjzp7x z(V<;WAPJ1xKW806N7A7n52kQj6KVOLx?h1Vc|Gw>0JN9O@{K=HfyuaC{EYi3^Jurl z2{?V2SN{`_Bce7FlrZmC^^SvqftwF~HoH7`Hy=OmL%Q$eJw3}Sh9?oGGvmax;p>-l zY}$7G6FMiX!XIkNKCdsc)V#lc^NL69nq0hyP}yw07-CJ z1*&QA{fit^f$yGO4ULF6D)whz-)AC(q*n>Dx_f8Xd(Gj)mzKZ32m7o|EK$y+S?y_d zIE?6*@=5Fn;H%KTmuHw+iK=k}&m@O~ICl(zAJyiYV5Y#P7P?_O;;jks-FBe!jCYvnw$$H0Nh zh79==adq42{cYN)CI2{@!l20JEnOa;$Sa)ZST-RjfAzYdO6#|o$80lfy}hc}kJ2_9 zKgE+)IPlu47ay)&K6I#E_m(rgbZa+kw09Y{c**By2EVF5c@&q0LKtI2JVpU9k5df> zb}!H@Tb<2w75GSKYaUNF(tnd+}P-T5jJf`UtQl%!Vq@GCw=Zn*SPA= z`C?H~#FjAQ)`R*sc>C7XVXucl_O8W27TeaM6W6|*uZVO|7 zzUVzMdQtJ^>3=<#AL!Z{e(ciDV=Hawy{Qcam1X53Rq_)3GerRHuV|KecA_Z)A^DE$ zbfW#!>GMo7&oRQTrnSaG`V$!+#8*us!iSDjt}nLwqi4@#o{(fXyBnSb?1wYBWN~-U ziN`#v3!G^HNRseJi)e1w=;`?qzu4HCDdT=>SUf+zhk8Kn6p`O@Cw3d}sL2urm?j4A z0aa);)C|))@=#uE`xviBsRIwvF{nbX1&HCIyF22H!*^CCsgoW+uk;T`outT9@oQ>o zN;DN56nNAwkQFp4f!vRZM@%54ZtbXz$-eVNg@6_@-q7Y~ZvVCZ-Hw`d((J>uO9~n= zD!q;-iaEOGj;e+d6(I-KiWR0GY90XhP(p&^ZSFNcdeJ`!!P#LN0q3{!5m`}~-osx0 zbAk=Xl2PT>ML34Jt=rg5YDgzveEkJK)JEo`LahhU$N)6u^;dT|i$9)^eseQP`%4Lh+SL%8U-1zXRhLXaMtmc1f#;or}1ZCFM%l(srb z7C-F1EQyL1+{sEWLtFeEr?NTCe&f>oX{$0{s?{XXcud7~lc$*XW^N4br@1O4NEZ#D~W6FVlaJS6(y0WNJ_tN>H`=L+| z-Q5EOfPf2OhI?%Sx(OLYCWB7{>$)iya4celB&uW2rRsE3-Yp;*sE`yWya-S;!Bj|S zsjI8`n*#FOoAkovh;gZ7gqO9uIZvE#{6NC50{-&9=rt=OgLeiL!Gg92M^_cuvF>lW_ljH$tcXzFjfR z`||5W#+!mhRK2)UpRLrXQ{k33zrrsaGT}%DOI16eo$Rt>AJSPMGcFh3-y?l>RK`#@ z!0UaNlydAwa3pQ?S8YQExZ~qCmFPuvkBWWf*jH$jxc=<4j(SUCSGlfFb{gYk8oah9 zcXmJhF8e7?_?h?kK%|cuMDD7si&C8KuD9v>kFgaM@4pNQeC<0?ae`A___da4H--jW z)N{Nwq6dsqtiu%ht0&d}SZR5mG;%6Eb2x2oqkjDmK`Ga6b-VI=s`K}60Wn*?Cc{nm-LfvFjvT{%MFEsqd-k-CPjO2Zu)`2z-RaL;hn26j(Rwu{HUIqA zHjVwqJN2^mxJrd#&m>eq^YE#Kro5HzfCN)%*nuY^h^E3fBpCc%Y>*U5l8P(JDlW&9 zg^OOjeyz3QNmtZz=R?+Skwd3i zt|TajIwk_Li{2)c^R(ND!e&pN==v{DQGbxsgPJm8+vbYdD)D0bgyRPP&*^`6z_qLI zpLARGv*KZ!U!Qt(9o=?Et_c59crCH$ zj0vI27X7c7P*UH75p6@1Pw23##dv$6zhhS;xn&^iZF1 
z_AN_5LPO}CgG)j?Z~j?&_sZ(1_1^+UG+GdO{^YO_qZz$2J%FAP=tLt3D>e?dZ*+3E>b`T#a?rl71b`C9K} z1yDtW_mSI{&T4M7{c`8g3mhB3>5@J8w7zV#Xqq{=EW%99w%+9S!8t2o{L&p@lwg^X zQ&qI0hQEL5%~`6p@P2=#=>63PK#2xVMzT(Y_AFZc{J$n!Mg^YMFaC4z$ zncvO;?`Jh`Yo<+|vFa{_)7@^bFU|_w^xAW1@0>fXN`WWeKY5Zgu`_?@pI&}(_p67j zet9%A24pr+r`7W5&pS8ZE*v)M#A@cr(5gSilQIb&WOL1Radk^24buWAlL$mdSljkO z%S$eOZYztB4PrwgvCw;DTil0H6iOHv)f*Ima^TC`==n$veCKlo^8;UeoxoJfUG&0s zIn^RgU{%`w_Iv-}`SZvJ)!U;N&0fDg;?Bh5eVPXrdXEd`NWvcB3^W>U!$i*euUI&AtNGd*Lv@oZZAc zj4YqP1!*f1t*!bTlyZb=3Jhsm?2BL*$A2YpP#Pawx==8n3QV=4vPP`Ghbuq{A zsI}RXz{8e3G`yevYv;8&En3XI)HeHd_X?Ol%vgTXq%wrVu(Ejf5c4JRIk85&{c9G- zsw&O1xU1~i@(RStoq+s@aWQMg71$5IX3?2J2iGu|2^$WoHS?{8SIXv^_P(%%Fh*!r zWuOoYe80hiTX`5%TC}llkr1Meuale!31ApK`ZLE2#{KR@16hPUgy{o%>xdqfVn~vt z2q6=z3N<3P?gP-t7Wx?vX+oc}p@2Nn)*Tz*+AXfTyaSj#$_GQ=@zL21yU^YfhS^6& zCGcg)nN}`y?~Kp-vw+rL+~hR2#J*8Ah<^wXk3Sr!h??{}4lB-nm-^sAV>U4C8dm@i zA`(E_a9vxL<)(3G*}L)08a50VdEQ><%(!N~#_HFbs&#sDdbA?Q(!ToK zPMbgKGZes8g)I8|;19~<)dCZJ$ErNN-n7~MLmkuCM8}rYbZq4=lR+LmVjNUM#2F8J!QQH__ zo;!K;o?v5jQg=Ume0*X%n;2NQ5ey_}7~ePk-aY^9J4uL=epOd9Q4MiG!jmVVhMP=# zcM3trP<#bEMV(&}v$1}&a63D8)HMpP09B=AI>PPB5f2(d$#v*iurE|a;f;0b)~i+p zrMRTqjzGQ3%Sf3T+VA#jFxFtBAepMY;JR&l8rQ6^-lve}(D{l24Crw1@Czcgz6LyG zeUkvMnU7LEqP3YJ++(0EpHOSEH5H|Ys~nG|Xbpe97_N7j|H<4%Akiq{8F zf5?YH5^0?T?GuG5k#@(WFRnc|A^B)#R2G!WgLTf;KepPQrd5a-G-b`?E$kNrp{_ZZ z!6t*ni^MrbM!|B*E*Z^OSonz2j6maIFCvrOE}Djj7ZnbtfVXf7)m%&seGJ{!z#&7v z_@6bId{>+YwzjsBZ)Cy@8*w^TreUV2fnFraj3*R09SARJy*%f{4T`^Hh7K|*in{7@|3Kg? 
zlV;6smDNmAyFl76p&KIdf8Ioc3x&paBdjZyjAvr{1R7aDo+`426czU37ryw?$)ClNK=%`5ie5inAyOJhawFu~(c4#UxYJ)Eue+OfdxQ`YV@k)u^ zcV=F74nZ({040{lGUa2tG-*XBmCv2imaAo~4fI+Oxp64E{E@za33cKc<ONo_NcDe=87kH?q!?ugs@9MolpDc+Wj9$iUiJ zr;(f?(c#jv$7EsMD--Y^MXD`1Gf3M8SNH%~G(o=EfIv9(<}?6KAlbIwUm`eO#3m4C zarV``Yd03!>~B?T{Ma1>0ev!d%2;uV*08r zmBTwO((6X@B4f=aV5u20y#H3tL^zQ`V(C~svbqH4!bYhzuTpd3L_e+uD` z>m8*uS~Y7t8vqRvJL`7pws-KTgZ-Na7El{O8X|}W>RBH<@w3vP!Xt+c4SEi; z_Jm}k69B=&3}Pv^s}=YRO$53{%c^ZsQIQdWpr@o}OamiL)05{V%Zz~mbCA3}eAqkv zwoa!u{VXgJx>&`3d^|U*_(a@|3;jo|QT;m6=cvNpaVDG2_)k}CuIP2cyM0Hq<+;Fl z|Kp}4nUU!3zks5_D?j{?1*Pd$7(Lj|{sRUC7;Z9WzTIPSuH)5{NJAQG(Qv}ex-FRB z;Amzj1TNVW_W+qlB#mcahS-yR(M+^h^Y)kE5N^UYa4QIH06I(RFnz;m~`&g_&}o+R+S}!*)eL>xSxVdlB1yH zpHU7NK&GQ8VXTQOat6F5cy`;X*85F?H$4caM)chdn}zU{(pqzJYzqo%N%O%~E8m(@ zi%>=L1zV4uUc=a~^ux9oeNT-wDZZ`S!n^hBs6Aibgnp|B_^TGsi*I{7$)Qg zWvm9*sMOlntw)a(xDiQEmpKIVkhInUy7BmilYRSCk6>)B^BId4y+tmBxQcsmKArf$ zcLQn{2nvIAsvOm<5teL(E3RH%W|7Vv_FW|_&W{6-sJaEg4H);*7yAmf;tsuf(MJLV z?dNV6xzh=v=|f)#Fv_1{k+^Tqp$!mJ0E~X{iXuPwioQsFxnwuM5+mE04cbU`sgb#W z-~q{P4JT-t3We}77aMS2F-_;>^a87lQ#SMeLR<`#zu>ru>VOshacrMa<65!c)79|6J&T9zC-#SJKvCz%TLSU_ z&5;0i@^gyYFJw14Pg3r~@$(2(S1|LGB3|ZKP{fOWfX(|2I!4A35YLs0GcRPYIV|-8 z@sL6d;9~bT4m;L&I<`+_~QS_oKt=78M zg_(x*j@|MsJG(1kgMQKcqI0jWoy?y0prV#t?HK*ZkFHRjgfB0!Gi&t9H0EP zx4f}jIb+nmD*PgRWbn_83~HokM-e>hW-k}qo~kDhopBT>hTT~30K%@y>$cx>w)W$c z6iLV+>Cl5DDV#cHS5`K%TnEw5Cjqx3tgLsL{~}!_MRM1aM6X z7E*-sxg{8ESRa|?bCn%&mICTzvD3C~+jKQeg|A`$@XN%3Q?n`t9x*|nGxti!B{KcR z$0?EgXV0F!bNai%T(wTP(ob-P2A7dtCRFw>exB?q_- zQJiRuY-Omn6SpW=25o$PwB_gMv7sCV-0d4mGKX^G0u?PoktN0vnITws{T)yURk;M& zy^{8 zx+EW>X%O3upP%0qt@C^)RuT8gTw<94RZ!!RlXZgPV*B<#{i{X~9V+zzk1%HBVBXh9 zzFEPJX_FiyiEn45tpC#c`(vWAk>XGhpw|e(xX)z}WBdn7uFyzz-@xxES46alv1QY$ zOPr`eGSH?F3zCdZM0w098&*8WjgX&m>d@MVUY)+8!<*n1id%Rj*vx10|Jrmjr+0`f z9HJ7at)-_HZ%^!&Fhj1*7n0zRntBO(zuL~VyeGT>wMWymu*sfWoIQGwOu=)NwMThX zXf3mY#NhPRe@OGdBJYhV7(4jPHoRV*`+}Rhw}Zz?PXU#P8?K8JM{H|u5V0}>W5VM< 
z(Q~XVsIy!zD(>lb3|KC7dwIT%lRF)kGW3#or{BD`YN-VLnOieQzm0@}!jcyl3@(c8cGFFp1a`h%%-=FG&EFm$TgEs?dVP7F<>*^%5TZcvfO#(YN1 zBKeHd+?84Gy^(Hn(|@CNU&bs@5HF#;ap6cI2rw#BeE1%hvi58uIFeTqWE$3VV(oyZ zdz&^jaJuLk+BI(nk5%w~mYN7}>CjN!$v6S##e&&|lmoy^8?> zOJLbn+}oRUH7crrG^`asO%U0=VUQ6mNj4VFvHL)MbO1siK($4~F@Q^TxaKrx(uL;{ zJ8HMVZOeSoM09pCg#!k=0E}vDakudphdCre%G^y#Wl$N+INx^`O&8cnKZ*kSH%VOs znfbrGW)pBMTqwIZ(F3U%YS^jhl*H<5>L7D>z{@4>iJtFAT`!f}V^G|pXpp#1d7h{= zxa<&AN@Ame7~AE`lSs*ZEFnbHQf}4;F7L)~JP`G7kVI>7P@Zpx<+T+RD%JC)?5~7g zh_OV5t{@1YVVTUA#>f0JEJL_o(Ux!*p|4@cT8?$iZO>6WGoFC>)*e3m)V1rROS8*; z%xb$R`@=Oz+!xVz5@St&eE=yqwFo<3#BLw})7ZeADud>oG3-qq5ZZ%`l3bs;x}02# zC;Od`yEgqu_rSs1KT~l~42rXxISea75NI$lITh%kz?J+5G}sC<8%XBHfWJnD84tRO z^Iaee_&pAfZuCtu4h32Jrn?6?A=|SJ=`>d5#Ee8*Hpnb%4j}?mee6pK3l+%-PZm4r zVelT@Ws*6H7U7_JMR*@9>LhxnP`Td%Z=y9&CFa3QlXvSXiFo)?WV=xw`NbxO^cnBI zvNWQ5w{A%5StL|RKQ=~g0Bke!)-T3wWmpPNmnFkWesIW6L zz51vdY%<+9eX#oY@^i&8|A!}UMX~6+F!eG6sE>PX3FePbW)QMKwyx(m zhG-@|eLBmlsE=1lipju26Vgg02u0Z^4>g@0l>Vyh2>&~$s||6uOISR2ULXhr)lWH< z*hbh`@pPe@XKlAXn_+eeT{FpG=P0*TN(Z}|B!4h9ODHL#3B_+rn1i5;k|(h^(i7O| z8pC0Cne|gD;#M3pcC19Rl;*6ric_2HsKO?1JgF?G%B6g4N^-IcCqRZ`dh@}wc9#2r z4JG9WJ-YwGR$s|%gN&k>$A@gA<$XoLkC^2<6S7*b;VENeS%U71aElhad%7Uo?1;X+ zgiMneLsKb)r+`(=3u^s1^+3JG_Cd|V_rg&BEX|ooPoC(z&1G(~>x~VuY;Kl&BGb7b zs$Ao2_jv)neuFz|AUFJC(<3-h-~d{CHg}n;Iw8Yj5PdcJH9_OhiZN49qHE=U@#9fr zK+ZWXcNNx129B^jX!5ldE%raNZt1}0?o9dRXg2cBHl5ziav*IF0X#mE6-oaIsH@M8 zdk}xD9CWn1=v65E5E}hcc&4SxT)XiZ8R|5K95ynG42`WVjw^m$JjgXQsPj(GF2hEQ z=!Aj{h-8doJIiA`ZuL<}*RK&U1QdDM=-XWy=e=S8>Q#;~0c8LJiM3=4X;V3Aqe%mf z8g`>)d&?NIx~a(@P%aD&j^meK8EVo-U5T)$RNqt@T4vfJ!Hx0uDHG8z6B+ndq{?K+8FcY-c!C^>EkmX@bJ;9J z&A=xxm_>giVRjVj4~Xf>ecHY;mcTh@#T~#=cP}Mn=1uq2z+0$+#q9||!11{UF9H>S z%8lEDGz9Pi*S_Gc+iyaze|qhytF6pYpX@lCcvDgb&au$Xw+!fWdjQ2Aq!-mk0+;K9 zpSmvZ(9_{@X5EH#J=dwR|J$vfnzZJt!VZY;f>DNGM^aaCP0B1yE_ni*GWyT7PQs{< zgN$`_F?Lp%X-6fcsdRs$-Q+81lnmh*KETP3Hl(Lc!_CE$_O%Rr*V_LK?q@OBL0=c$ z2xh5cy<{kxD_jN=(7-!(vjB)m3uKp6Iv?C?&m=fEkR`9t?;NK%ggwKZ&H2rO0lMO} 
z-u|^^3n)-oJsdB;h?$zYGODMOc_??}V@@poNC4k``IcFip-9D5$AUWlExddZ5s?hw zIZeS+WH+Brfmp%Qb3bpE+SU3<9MHw~%_iy|+%nYs3RHK~sV}uIG*=YXD3CZLPm!(|@G`4PqLO%3Ck;A&$j>`5$Ds1-KC@l_Wg%?GzlSQM zQ?q8x;9ewmjb0Bq*KxF=)QO4g$i>0y_+XUS9|{Tr%kB?2xr|~nx@tpZR#(|tn!MA=bO#1UA<~9ACEA~M0GAPf6NfLwgJD3Z@2R> zNJNj20C=wzjo)&6Lcf*n6aR^d|_9HdlVIg5&V#r}xNiGU*zDUka zZbgIIc-(F6x^~sK2{^p*^Q4#e+xePuanc_RU{unPX^r&X4EUN8>xB&9Dg!|n`uzld z9)6%988=i^Re_fRPgg4KT`AL0fF8xh(r5FMoA_mDOvMAo$tTlPWS$1+uJh=dFG<0d zkuzm^fF`y;1ko{r>0-mR++F%K2XAF+coDHjDT$1>D}ULM@B|3Y3An5>5x&b)K?dh}E4lhzy-ao2eZjA%;RdwX*x1oTat2 zj547P>@~efo+$lAvuOABRD(uMrQ8M(LD{EKu^q`3j(>WD09uU1(s3Z$Oxz>-K^cwe zq)7@KZTT?cf!mhk=jTft$m$G**}i)&RMkaTqOx5f4 z2BEz9^%pn+t>c#H;1H4r!NZYMY(6RlHl3&Z9zH6aI{R;ErFp?paWe3sHZ__deidLw zqQ47cSLtr;h?Z0a27RUjSd}4%hbCBQHA&&8%(rb_|6E zlL{VgjC(iV`2<3@@7(GVtsukKMC6Jl6<_%Pl2%c1%9LK=?72miyPo5=-N8&3PBPAl zk*+xoz-w&gO&4}CvID$yJy5vOgh!;Qi=8{fY}PHu_Ful{ja;}xW&9;6g8-82>y3+J zV8BtMQuj5Q>#x@YW;mMZA3-T!HvY=q4ZVZ$=2nCZQYjs+;_;sdWq>}R_Dm`Rx+MjUrsdA)!8 z!0d{Zn&EatRSINj76tWb46@a{+r=kFaUbK0n z068XfF3nH}01Wow3y88m(o;2_xN2bRN9 zJbRFed+~|#4Sp>OuQBeGNkhe~vB)@6hJ-MI`vTm6yK^0VWA?R`0O-H^g1P`6QW>Bm zaBpp-JX+lry)mosio(n9@KBWHgEHGYn%?|7x3khrs|FxzAOXw%CMi&Rpu`ii03{U_scqr2 z5^iwoHRnp{!j<2!z(n`9$Zo}MJ{KH-glKb+O8LcoSd&gq{95_3{no5MOxsjIE| zO@IM7n8pCpV~=gbNL^U?%jk_VBNP8~{|mtfG(tsg7MwK4(Vp)OC>?jS?B_rtn6lx2 zY0HIdBCgLOY+9Hp*4g?;4blKQX|_RW2wePKpA|58LI&^5&uCt3(#yadb zy1l=-#!-B$C>CDs`Axl!-*^&=8;X+`i+^<)Vjj=x1AZoh3FQoIv&0wj<+9rb035*i zXapP~=|Lqkvt@xhBo5mc1CLj+&JNlv#?*-Kkt5`J*vxJ^_5-**@!8!r-_55QBI4Jw zJ@w!T&PDhBxZ!IS$VA-6;B&xRX_&U@$!hiP?AiMPzo&t7SS{!Z{p^2<&*mb`!1SjH zVTzcB%W7M(IZ^$yy{7%Sy6Cp!yeek`Zl3Zp4Jxx+%wb4@5JQv)L2)a=2!x?$S9860 zKWelKOOaT|Wf}#!Z7Tm}3}H$AmX_s_Qa~H#`nqfQIRG)}DI9g!@^*n6T}AH5*};1B zr@ep6JQ&caqh49AiEdtb9|xF(L@}82@#CWCtO#BD0372$=M}hZ7?v;tZvpU*+-+DT z!1<6J$(i!w$0iP1<1fF_&ukGZo)GD#tOZ9FHhpf~h^GA0kTytUIGOPtgF`(H?E`hEFg_e*I^&roU+)R`U zsJ;?!S9mwg&-4>kq|!mME{aIg6+PsQc5Jq;eDtvAlhX zmHiFHx%j4p7r|L+S*we6Vz~l)WEzCAx{O)@%)f+g 
zo{RoFO2BV)fJR6Kk%>;_lniRpm2(CA^Px(^1Z~6&xEsu>zH?@XDw9jK8MmZlByrU~ zC-a;(OZFR>$*_+1{#;AOij57JQdpC!2K%(%xVRZ=v93+~yFxY#Yfpv*1)Q8itOQ|e zfQIr=n4u2P%bg`5C}g%3#dg!{wqo^ai}T+qDR>y2_@a&PO%fnvcDK-x%&r5Ol3}kv z>;#R>b%i`n689M_jU+!pRV?Fw4=YcwxVbVM&uRW_*2p&8oVlMR+qV9)v}m>+c>5=Raj08K|&AfW|)(0b*+4 z5r-0F28xW7sl|bFb-Bj#%gVwRUV4pDmKNsJh*ceEWI+gksOJZ-xo#OiSsICwnq!HU zW_Nb%uaVHL?)*>b=kw;G6rz;S?-If-Q51C+=OMuWjWHlO_tx@1v%$eZ zbW!=c;#2^+l`foePXs)|>%wEx!=v06`!bNL7TpFyQ3ypzAK{cM z4xXN?F{<IMrs0|oB8$dxxeA)CHLpUcqQN01X8%whR#*%s<}h~(M{ixBcqH_ zqZdfr3M7+&-}2DX&kP?KM#fOH=cMLQzvw!>D70t-o0=LQ6|$fQ-gSzT7Y+~uw9Khx`Tea|_SZJ19Z$BMXinNK9gHHX^ORWDR& zj}3A7RMt9eweB)(l40vmE5)a$_k@@A`uTOfXLg@O?JSi5O$9i%-eS_uH=aY64eQT8 zpTDMlax3>ESKK_mbFJMD?mE&VaPy8f-3jQ3a2|IZ#NV?;fj7|`G+iQc69kP2_eO_qGrzqRw7k?M(O^!s@aTJ$d+y!3~mZ}CE^JZ407l_@l&m4&}C`$GLq z)e!l_!F*)*Th0q(^yz*ME4R5@*HYc%xW?T@GR0`1k@8-%bZ*BMii(1&7p(A`IXSHv zUPC}5ELdYPGpXHN^Y^~26sUICz?iN_w@<#Zc+afU@y}+zo~#$LZ+{!mqxUHqd8<+; z@vgnK%U`X0U#5}sskr5o11^CDZebVIZ|h?4$QOuw91+7F+I|+M8@)5!q=dxt zFQ<$y_wiF4=a3M;J3aI4iGlZBl{6i$G+g+`z3A2VbukV%hg4V|XgkCC(yMEM8#|Mc zeRS+42G6wK_&MQi^o@YPL4GCIbSonwj~$yA88`GPQ-mTT2bn6zh9*BEE$p0=1GASx zLj(3~uPkl@xWtSR@8+NOnf=2M52m0mzS%H>J^J~S1!NjK9L~ixv@7oof+(r{`&U(OYa}FD10@4Td3*a zubTeGdQ1A;ik+!FduOf6<}G2N=T^)L+H=n6UhUK^Lx+&(i&p)4u*bm(`vzY4mz$@e zzGm~d18AbKdu~b#TeWTb2?#?{8pe`*)8mnWfeUuS1bu~3qtg2rS}G41n4+^G0smt8 z&$?Wvceh*Iw7X|!HRJQ9V@aF6#huv9DtEk%*4K^BrJtW|R1a>wK>6cG{Vta}53L(L zRw4b7&2!KGF6F@pI0>>W==pb$g2FC^8GlVS{{F;wXs}hQggr84mtl2sjf1Ix-f9D8 z0y8wuld&s%XnQ(z>J(6R@nnw_&)p|p)Q|P88n?YG!XhG5w%AUrGq+V;U%&yiQpx5mZp(>k- zItV_7W*l4{i-ZhlpZT(#{rdIiq_ec_Ke6|UCr2}9*Vlhi8hi2L`%kKEKCOM#Z?pB} zm|UVQT1t-gWPo!70O7p}Z_zd`D?DChOD!Dgw`0+TbU@;W8A z3p4exNqOeEqB1|F z?JJ|?o{pztHZM(8#!tPZ&)K`Crs>~*=uG_mvR5V&1cX}2U_Ev93br6`{RrnGJHguh z?C@pNyww%fe;e>Nw%OM=UG7d`sQ44(>W+K64EiZy7oJ-{(;Oq8-R9KL4DzXz_P_ef=zX zCQ{iFOf@NL_2e(F@(os8JZZG!%c=d1nt4LoD*!Jz+m3hm7I57(=JJJAOB-}lMNN9$ z`Lo7d{m*|cJhX|63w&8Q*eu}QnagUc!lDix+Ts3m{$}%8hK9DUuMgt%CwMlyZkb~D 
zK%x*qCg3v<>G{XYq?g?qF%v;y@zIk#W|{4Sp2*;zc$^Uod}KsKLMsy>6u5mN3xzru zGGtTGckpWB23b!AZ>zMJ71)tGaQJEik|63BZKm zm5L2gAJ_5b&yoKt?O*ue&ecgF8N_%!gBnF@)*0+-d9brf-Go`845@OBirO3$-kWS# zxNSdz?3bP|v%PAp8*c3OSJ-|W&o^v~jZu63Ec03s@Z@RlKW`uDuWIw|M^XLhdF#?9 z8N8aXaoTH_W%nAc-?D%DzCHHQozmubH}v&2iVSGhY5q_S!I9qH-m*u?v!Wq{BKDKk@oG6G$Owd8#X^C-=$&*MpEg#)zxU;qi^3^{# zD3GXnHFsv4%x>pv1XO_Ppa2a9b{WL?j?8z49_>pN`+tf3HKG!>0NJCk`rvl4=fNZU z4s{GvD10|z)q~uJ(_+d>!UMZ8gVwdnl(Yks*A*nYwFkaqdaM`}dyVY`Sx4B(|E2y< ze(9422v2?d5b50Bq-(>H+gq-pJ^hU4j>J&OodpZaNK_}>Q+f-OHz>{tR_;R-ZeUE4 zbm~v>L4dPotl8i^(aE&s`9=)`jNd6vY^jDRU#{^h)T*#HVtGX*{PHnvXi~aNLI?MkdZArE95=S`+5I)KhNiW zyYKk@uIoI%-(!yc@l;K)Xbrxj`8`|luLQ5(=|3Zp-Wx?&x5MO1ejZ@tM^So7vOTeQ zlSLo+-A-2{fSz`P^&eo=0LD*X6|8k1@i-ExJ=k0PYoz51a}uOGoUnBCV9;w^mC@oc zS7mcqh9F!5nh*ptr1-g40wUSwqQNo2CM`yHV>4IiGV!-V{Q@_7AF2F z%ZWPuGTUDtw;T8|won9yMI0WFNOb*3^_wT`-Qy8u_XLND_d8E2xy0^~<_nlQZgI#> z=H2xJTK)dM_g=Z*FLg_JKRT-Q<(m3Dgxh4B#qv&u0SM1nffN%j1PS8CjZC_q@T`*m z2R;iTbOp7&D)LtvKpB?OU&F&nzp4jDDKio<8-m~^_#Z}6V(l2hOhL5XDD1%ENm4KX zy*-t76UTPLl_QFXFxc`>SzAw89PN1-$<3fL+M)C8#517*5A&kI!KQFw3}Pp{^AC(u zn%~;qk*nNsXB!K<{q?oFs?BiO?3IcyS5_1M`K2iDc0gIi5sLzHMv-1mJT4>!lhljB z06GZ25s7Xj;i35Ss{sOmd;vZ=knoWw1Ju@Ob07o5UBJ(L@UT06HeGWNtnoOF8uU17;tW zhi2DQ^UQNJzCJj-lz#49c$C?$vN?ueN&eo>! 
z?;a-@p9VD}dh<*7G!pGd<~;gMnBiszjDiqqYN!DbUHTTV6A3#)I6jG*fJuhLI3j{p z-y@)uWG9L_PVL7FRI1y~FZl&)1ERplFb^!d8QBCxUjsG{FCXL#aG)9SX=i0;|03y& zZ6DzjAyN$Rcyn&~D)O=$il@C>Mve)GXxR843NsVB-l07z$;hbUlC*<)f0{^?u;OT@ zcFKGYyNYdr=9$-eIf=Yn&6x*M4VI>4Rm{4e)Fq)*_+g|WN=Ae-@!=cpc84>B)R}_p zfWwB^{V`8RBxxO#EW*c37)^Zq@)-6D+&Z3arjEDEOS0pJDqL8&;=>2DJg_`JuKJ*_ zY=wQhBQClJTUA}+{{EGAEqD^s+Z@$xdjh$GAaDC0hzWLFIQ|RTK%x_Z9*$(|L2f}@ z94Jr*sf;m-M9P3)#M(gWB3&%8?-8Mc_F9=Qi6KH*G0qWVC_Vs5qu8WOB}B0Q zvsZYcc#r94IQS1VnB6^l@zapwhv1L||B>fb;e7G)(beaocoKr=CRT_4gx8iy7Mge5L_kq zTGal|4E%{ygMzoZ|9{~?5+?^?m-Xrztu+)tU~n%frOw0y>Vk|Sqv3KtIc&qFQw5$q ze;zvT(z~Ivgm=5xeR@Zo)WyDzgApfPOKv(%f!J?)Yo==W$mrePj(=;-G=ks`PMx|t z;r_o$68xCEBDot!aL15hMnoB6J5YqTLK=JI0ghk}sgNuqOU-{MoubZ5V<;%0&<>izGs zV>;( zr=~XTXIgv2&hvbX(N>+*`7?$!_mW3gTNo>gvV0I9f6Nud@jkGf30>B#V(mw|Lkr7NPpp>@2^XY&{=e(P z*}3$|-8I~M#LNEeS*}iy_Hvo5P>#h(*KnmnA?ZaI5>?tKXKZz$=EkgX^0q~ss(1_{%%@-OmJX3^8KwbpC zonJW0iKr9SkINSNK`kw74Nv|qO0Mj7uzu2(_)?NPVe!ZJk>y>|k@&fYpb5(PHG_;< zP4y4tr1$r~f{T(ABahdnvx7fLR71m_}G2;${C!P%h4V$)oT!mGe6y40#x|eVK-N_v`>hJSJRMX{2v81{Br7Ay-iTHc>LfL|z za-EIo_&(yypm6Oo%}Sl%Q~acb?hC3Qo;}fI2ObrVaUGPy(tol7C90()C0{ycTn4#} zsb@|)W$s9Cuf_+zuW<=;@@gt^L06fv;MuF#*fd=)ymzl9QdlT>f_GGyljD8anu@vc z>auFW&xhOYnK#F?G%jCS7(8ok;Tb!Wd-LMVDnEaMAmiQlr?_ujt)UxvC*^ovrJ4JU zV9V=(W2}0&3vM=VabGjki=|gJd}k-t?-$L$^>)c7>#m3cPNw%Ee$Whe+>kQ2=29i3=AAuJNv~`PN7QovRZ`2*CvfH!)NupRqSz z(9poi8`!0f?im^hi6XkVSQG)lxg8X5Yql7-^2rB$1L9!vVo0W=^*APm{rx7nBg*PQi|w5&evDHM$7O|&2;HaL z<=OZl-=^F^OWJ&>-zU~wU0s&eZPh-;w|FsD_dbtWZ`qx+Pj7mLDIRR4_7dSg8}I49 zYEN}r{N}!%D=lMlq2b{sXc@$Miu7_a3ZmTuIFJWh=MaNDDG4Ys3N*?n8!)Ns($Lmc zaCMc$Na=c4t>>eh0+mevjf)MHa`|~TyLdWpM@3cLa5j3$%ql9%uzmPO4l|>#ugR?| zonl!u0|T6)jg8W~YEz=3&U0KURG0hDF8B9D31UvMPqXy$u7XTW|0fq zgx*e6>~0=_r409z-uO#C%vHaj&T(}rTTSBFDWy}#L-ugU;!;7(!ms8HZ30_=$c$6* z8`Cg}0dfW=Ul@u}d#y8ZEG(Ek*d*PMx6)^#t7=j{_AZ9S?^mhBvh}0^wUCx_0=+h18WN<;!&5k5 z<6QC*P%PDS$~qA2WM@L$j$vnKr6(l4!2p4@Q&1oI3fmR#Y@0iGpq1DK`{HEl)h9)# zQWy%qO5LxOu+E`zHMuAK`Z0D^r}7V?k}l;IpnQls?8LRPD*@*X@bCY~q?Lh5-+^ 
zx~^m`uDz7n`A2=QzaQxPtLb{-Mp8Hledg56e=k4sQOH~mHwbcT;7tG>2SGRIv19LQ zJD@|pe=e~iYs+H5%0G#l9f#hEg{x5mc4_0id9#wIFeD^XymYP9=jix1^;G{PP4-@O zCj;}#`#~gyS+KECa@x=T=IuBab>Du>Q$&Ql?{8S$)do4)gqjeqBQ0?cb=hpJ^Jm{p z4m^$gATjCF*U1x>+4pl>m`eV72{9jc%?HMRUSjDQ0JvcSTaTE8M4U!41qDTPe7q;V zJP>bk1_mz<9FhZXGh*&1 z{^N@mS#TFG@TGK8ibp-h1lb+&0y5t;ZN8a3;v8)ND7+&sG5QDOWyh;bprWM#(z|VK zhBPOxU^BwRw=o(p0qz-A(=Z@*xRMdndBy&CnF684r4fKY!X62P0GJS?9AP#}-5wsX z2*XruM3P>9g)bIqr@rdEuAYBO-`KpwS|-&BPbFVrr{g7--Mf2|6FuL$NEp{u%2h^7 zr((s*La-rxTck{29dSp&Ym09Iqg^9TGLq$tz4}$BE59wFXZE4l=^_w zL2#rIRAX8&s?Xr9K8oZo7~xLiY-1B$X%KFdyq966>b~5-eEfLtehS%-PsS1))bNO5 zTLGW?8S@dA_(V`{48d2p{AA%z$G1c}qEI%ZDRvV71Aj9Sh>~P`b8~LOPCh$E2{Z`$ zOyb91Tku$`#w1J}Z!mkjD^#CORf*R2pP_AM&GPf}d*hH}n88H1RA48&bhGR4$La8z z8uo11D%j1;Rj6G5CO;Y7v&YMGazfzUedQ3(6UmA@)l+S1=wdtv`zffCVtvY0To(L( zux)4cvBhBux!7fa@pPh&;L*;K60aP9v{h?|&(I95cCS%Zwbdx|iE16%saDWB zS1@GhhPTaRI5D0=@KJPh(O8_X?{K@vdPI7>beq3OH06+qsMC9ar!sQv-HhV{=Rq%w zxL4r$CLP4^t0R8@QnIPB7PJPGP+whxz+-3ShC@z^5Nq~kRqrKqdORKm_(R}et%MUH zGb`%@uCcvR&QtKnWn(Ny`;s|)1{;&bK&2@_`Z*EC{l;;i+zWoa29 zy(yP9wczb^-{y~;Qq;UL{af-?z{iok6qc*ybB zk@CuaA$xgL!#U3DCiNvm+_+*XASnbxCh4!inM4k$b<8YIAyKQr2X2mxBzje!5=l^Z z>QB1O^AThO`T23C*TNr9JXWxU4Br-q!-%vuVdnzTGb_Hn3R!d97haQsTdc$QykfA- zB0s#NCw#eTTXjq>eyUOC%9|ZD?4I}dM_U!mD3EG!FH!LYDq?79 zXov}o8#)af^+?gB0d@hk9#LE_VwA&aXZq>Mfn$7)Jr}BgYkmPTwH$6UcQrK3mr^f* zZt~~fIrd_)Bj-I&DauoF=DMy{G-+)6)tp#GBT)Tui(YQ=+PH`iT}GFiuCOBq^s_@` z)WNrd^ONc>)IGgpWB$p>+)(6PL6AM^LO{~(zlE~3-tlq6TjO28oFJkimGRylN)r?a z6(HUt7>G_FB@EU+*g3S7nihXpWPF7L96B%p;bZs6zjkrp+&9R}F*l{mdnwWQ4PSdj zxs{Kax;Mu*t?BP%ne2<3?r$(^W$|F)THF0K(TTx1)hYn zv&?)2^4QIQB#!2DYHnr{@HzD_#c-9>_B?oSy7i;h>iqK~t+&UWIe7}-X>!@i%gZ_y z7phB3%LZ@FTGR7l^ql!#m_y;pyLOaM?Sb>Suy(Si@91b_`@g%aEWVr#rEcvHnqM-y8S*aqt_K4M#>n#7fewi1o{0Z*htq$OO5_-FsI^!X_{aEdJKuQWdFyg(&}^mbv(z>yq&BZ zwiNvXT7QoD`tP!NlTpf(nk*7s6P&`e@T{12;H}E!n%~%$?I5~)vm57y>_?TJiRSdp zwc9z4=j`L+TJ(o;pIE`@t59fN)z{!j8T%*LfT`qUXV^#r)S;5Af0`xdS}q`=;WXpX 
zFcq)JNEQrO#6tnm?iih*)>d_O_K=}gqGfvBerT#-k01NWr5o*~?6(pVsU!5Qtc*Vf z`0rv!KYW%hye?3h?jBo27atcl?ed=8JZ&>vWk0{h^)1f)NwgoOxKz75y`__nB~WGa z#w|VxUj`ezy88o?w8lPuxEFlmFfcN#CNOlaoSJ0Yb#U9-P@})d#>uiyO!3EwuaQc>F-69oqt~UKf0(12621a>Z&TAFy6Cx`xB?;_NWYf ze%wElB4#V6N$EHcwq~%YbK_9D+h*_#Q2Kk9f7c1Y_i2$)@)38lg`Cu(oWClDax_O_ zax*4Mep@D$GxXf2mj6Io;HCeY=@Ny90zM|~nK=eVbvwJ5p80X!dc^_C$8D*!A9i=U zIs9@7P5jI&z^LKYRoE)rtgq`68x;I77&j19@(-)JOlb;F_x+OZ8;;(8RE|MDG|9aE zT>i|t=h{b49HTH!jF&6kp>a2;Qw_Vz3CMHtB8R8H4_3U7{s9s^w&qa5DLeS=#D&zNx&ot_Bhk$t{20E(dcEu_@)+nmap|R7 z+ryb*h2zJ0uhwVwwQ%=Ry$a=so%-pPSEz5~#pSmsEc9lvFWjnVK3^irzxFi4j6RPqJg-+R@@?CQ3JcHEbQ-0;N)xfTv zYLx!oNcFks@x1FQ91eGlGDwF1!i&`3+Mz{d{X_m?Rf#X@9{XyS{Zbo+G$c-GI;Jyq%p)M&M9(W3kix_Z^RRUp(3y_xyS0fTy>gYjU9G zSxmf(V}e0CLmq3pe5L5+J&(K;TR(8%s+X5+woXcK!*O}8c;6a+>N@>P#g2^)E}7qT z->eVVGC!H2`$iZlKcpUZIu#&Oa8nPo`Zg{l&XB&&*~I4c-MEk;XS)x^4d@S>2c)&m zw~ram-RM;Ucb=5?>^ZN{x`ii-!PRNb!YbPQsHRFxlHZ^ z=vr6Y%5*H#Gxu-a{K0#>tfkxXtstka&*x078ymvxiu!xyGniuCoJ`_F4%ek{GvbKf za7b-0TI+H<`5Ms3tkkQK7OKymk?T5oJNC-ppiXW?IF*cm%DG%9z8J>y|)GjciGKZ!K8`b**AeIf;{++UR5_+xe#GO6@o4SJ!#UZ6vg zVnafj-5T=^H8+j5$>dqGXS)=faTaN73)4@GP`bf-_}qK7lfFvQR#uCv4o@X?u<>*w zVW@YmecyfO?CQsqdAWyQ6`O4NH>VTx=H& zD>b7;d-|`Jm-XNynkiNhx{lxD#xthsY4Iz0ISy9SYXx>*9`90kv4+#-oqM+FuF1_u z6A<@mj^314=i=o(T@WB>3pMcSzh@Iq<2h(`bS84x7#O@_xg5ehe=T(PF4_bI8tN%< z?xe&gRe@Efnx~1;-%pP6lAYbrYw`+y`I@}EM-}HrIeB((-lTa(gFi@<38XSG@P9+K zN0j!S#ePCuh#qVI7P_wN=i4?vzRPd4fjxpF_+jL`V-(aEgfE{uz{$DeTL&8~J5=)W zSu2=K5hNUU!z!uScjegCt2&{@Z-shaU$)@&9Ql`F%E}VatQM8v=`8QG5#jrQ~85zr2oh5GG^kf=o9X}$%P`ajb#e`jU zkA4mvrCIh{Pa!FX^y?QG)VaOT`Qi)mEF>hvuI?AAhAj)#)ag-I16oocqrDmCORDhb z8Rv8;HnD4U+3l$*F6QkHI+u6_NEQ4oAt?`IMYemsg@Mcw)9?xg8q>3UE=xM8H0me&w(!i{UFlnF0RWtfNj^n=wBkcBOcRCDTe)BlszGx65Q( z?R5sHeX4sm{Bf32e7xHj95#}0z$~J6jha0Ado9G=w4!0E3)*N-Id=EENlj zZYtPQ5pW5Q{+kI!PA4TK+28#B)(ak;ANx~)2G0N=X@(}B1bQRu648M|pLq62-04q< zFe?8my;$r0@Zm0IX2(m}r4eDitUrEzq|~`w6NjkWK}67AhGGC;t~&5L%)Q7)J-C1W zUFbg$KKve2_4c>5$P#6Szr+6c>PsJ=yY^$TS$W-`W*f^Yuf1|pK%f<%tgWC0NjmAm 
zod<7)XoW#C@+AUF7Fnxe%HS}~c}HL6pOHR(&5Uys|IR?(dpS^6$nLB&Y(K_oeBxHu z-&&{FuNiiggH>fEf3y#25j(!DJ}|f23eVmOiWjxDVuG55TANxQA}LSXPXX=k$aqH z<%5ez<1CDmq(u|*hzd|EByV$VWp*bXl$E*mOQ1y}punDm&Ltam7a}hoYlvOul>w%X z@7uvGt_HMVYGG+Ag>!<&b!#a}@A zT<2PKNMIu3*8cM&A}ncZaMp2r|4?iZD@|9R%FGi409N98sNAI{`2jycAkG`$IwWcb zAYL$hf_SPOd%c*LQ8hvGO@R+Ud;ARWfsT_4wx>E!JQFDqvX<$ew7%p`MzLve>=Xp( zX966m_)n86wmTn879{)=ab{#QBDxHcGWq`i4@od0zqb9DcOC!R4{fHlc8Ef$6Z3Fq z`BlA;2tc0nzaXnR7!fC5GA@-#2(l!6^@*MR@IiL{<{a9jN zJmXVSAAwGOiFFR9gZD@;COkhUCnw?+kT<1*V+n-JLAXb-Rb=6LDVm5*PNpfAX&K0V z_bxSm1M43C(Ihz5?yGdW9@G{z?UJrBM7&BJ9&S=q0~-|J#a^&vkc41@BT6A8-$8Y+ z0L2zMNl7WPmhB{*A{dGtcHUxo$$R*mPuTqMFt5#DI?7XkKmf;J{@#K^7VbhAqmIbS z%j5Z?K@c_0=;@i6So~}xT^$fGPFR;q9nf@MPbwN68*^zlIzdO$19AQiy>xj@e$CwJsO^bFa`z&S_D?OyO$d4uuiR=w1`q7(cc|({k1&9rNJE{N$wlBji@fQj#R^OeW?>L?JYc8342BN>g-K=Y0v2M{hbemD)_R4EhXHI}Ojjzr zHCC1~zQCOJ6OywB>lM}M0r+k=Is*BfqmvWMpgQnCWSjy^O`dq&&G7v@GIUPRGpts! zSrsd!)lC#&y+^}zm$YbJ^r>RLfyHHiz4+{_hFN|hmV72O7OiidJ(0a|L4B&!fI?W5 z@ng_pGNkcE^PGGCudbh*5MfA=svMR35}7T-gtW*pXuI0NBr zKd+%-qWffhOiy@(ULgY%ab&D?c(>>z;ulM5jL^Z|Ku%R0jsSF@4BXsH3yb7ggpO`3 zNyG?vV6YG;4=$i7s2XQseuceS4I>@En6O|idih?6ZJ>f)d)yHOjRsU%1R9>u$6V|c z(fiIk(Da*r0N}ER@RxUI?Kac`1H;C+hlG5i8)DL)ODqX%{fg^ewFur%x+MRmrbs6Ljh>C{ev$ciaiDdnPQnfPoD~sysMGYK6 z+>ERGAFpWHwqx}k-r_&EeNObhXoM-vt+`FZ-C@(eYZzhU;8UV3* z1qFTG*w~rGMTN**64wE)l@8Hac%1Luaw@-T0e@BDII|UQZ;0&EZtFu54wB_6D(ZIr zVNu$<4HNJc93necN}Wr7?oYv7JcarVVJN7I!nMFqUPtRXFvp@_qMnjiH~rFd;; zlPmenWJJTjSF4^<_3!shb)&4*GyUAmQhJ9zrwBm(hN*@bPmNp80_6B4GzSw*5L9;v z09A!B#?t9?+Q)n#QaENdp*MABv7ioO8tPz}M{59(+o7z?;X13bJ&B4tx8s@m)1Zd6 z^HW8qLFf%`-Us_U=r6js{Gr(!xx8?4ksX$2Od3<8DXo7ECc}w281fS~RtDaRjas^M z&QheVnZygBaSuNRQqc@6+@aLR!vM)Y2p{|pa6t^UKQ+BqTzh{G4_=k)-KA#zdszMF zXcwD?hTNf^MaOS%H;X&CKM9nX@d<&Kg!DT=r-?Z*7Qd&r8<1PrMlpjzhz`J4MO8H= zmDv@3c}oY{?k7P9ifQ*YEuEiP2mS%q$`tyh@T}BBKW@skF&gZBZf^Q{|Dx#Vb;82g zR1;xZ%Ao0RIXP(S>$4YZ#DA9*731(Gnc>9$E3Q-U98Sm(IH}Cu);1t{lXP<2uPz@K zbJFmql4ZmB|mTvz~7Rkl8%KC4lIhALA0L(U%fQ)c=@)U=<* 
zS#hL*z?Y)WNyDI~hKts5uC)>Bf43JBd69z+YXivAE?z3q)Y$O+ctvg4RGYr6pqTZ# zb5V3QZ$5r3AIK9AcGlISQngf4nv?!^6w&Z1JG2(OI~K(0xM+Q)??!8R zYqjFySlORo-8-FmI}Hrivj6o2FINDt12AO*`*AyWAXbROcteAO>2L`V9vvdQ!^rk| zISH>M(D@O=)|Y2gcde;J9H|-S;^FBdg_Jl!iOY>aXbowpu$!L1-eOpId%au}^Mwn| zJWPW!`TsBrJwrrC4Towl50Pt;aZ_@(UKsI)Q{op%STs?Xf%U=s%gxLC3rnB_KPCy> zTo9j5B$cMbzW<&*RlP4(TtRLVgd+6XX^gp6OflAKC;8=~x8C9iZsCI;=leVJT~;M? z;UCz>Z#+A0^%;{BH$Q(BSZ2gH5CT)o`m+7;Jv3Xm4;~D%&g&dbY37zwxSDxGiz-d8 zr2ORCqR_R+`>F#oa7YbVmQ2}Av+tJX3w8*7NcnH+@HQ6Pgcc?H-kx*aaKF zU@%L&ieddafIro4qwu4|vZjHK&k)w-G9Xy3ALUvj3(n@4y-NCN{b z`?`1;z1(1nkevT^g{)r4k4J_|ISHGvv6(iExK&90)+QvpRqRwVPo^=c6;AJdR4J!h zy8F<%P^y&vd9bf86S}+S49!`N*97M*CDbh!Z!(Qn2@cAA4r0em)vmCcH2+RG2qY2`?`Ebjp zYyWr+Tw8AhZM@ni*ZXse&21s)M&am#pL+U zNpZ9V)CI#HaY?sje^E=ADYZN2CyHb*rdrniGF$tld1Dm#a2(z!hPRtKIyz>t*K~ee z!URuRma%m#6uRLIJ%$Jb5;?HXVX0*74KW%n_!Wnu!1cZxC#lT$zccdL1$VA|V>E=UsF-jk{xW z9%@))jHGaPKhLP?A1wd<{mbhBR|XCT1!Op&fOtQI`bkMiRLgj-&tZ5cgdA9N#Jb?x zfCR`Sp_6qo6g&15V34{;?}K>~-o2wog){apm#$L0ePpOcs0~>K;XK4b{;$1FzgQ8k z1?I(qk$=UKF+Oc?Hcw2HQ`nu*p&uM%X9AhYPCr?()9lR-UzE9!#kHhl)jr|rix)4L zVdjP0i$u5Mc397DY6YN0i#?^2T92Pk{}5yUD@R#J`affb-;}|QVe=z*^&#ahqhDLN zZ&{vkqI<_y72dwdOv@x1#~mJM(*HT}_bMLa;xq{v(hu-Idm1>?BH2zQGi9Z_j}wj# zKe(?XX6uQ6=C|k09ohSVJ~TNxoWVTh4GrUk3`-UOtFM`Nmr4PM!FSl>T~p5Ee=hML zUao5|owm^UCB*_jVApvcP|TU5m-Tlq&Hh@qnlOi&C}fa*!ZKs!RF4=o(o={V?&Qg` zAG*22LcNfUSro`OH>o*@bY02s(EmPC4-*2K(9B;DV*al=b;pH5tRMMjxl zW$=S7U5l&@&)|QMjSM^RGuW1V<;X2TpJV>W(W_Ull8+91>J#|Oa2J)(u~B=hZAT)Z zr2Q)oBTbK`-iPcO7b1{Hl+yCXe&anw_4L$X(u- zCS`a!ahF-}O2?MyTR)z5j(6Ay96!LqcDlQiX4gRmZRH#|JnZViW^1ACB5^dZ({WKj zr&{YhFi7?AJ{vb09N34x!KHXHOp_l>>#@?t8g2k`O3KPLQmuQ!l_tF2JkhdLfH4fC zdFhCWnx(++pWFAjTF+WMed@FAebZZ>0)1>J({poaxywTXANp%#UwZUW`aEWQxrv)w z@q|aIPx6J1>582^l2RVgR}?4x$|P!PGTo&f?cp`L4}Z|a-8g@4<}D| z<~((V4F&RGsC=)0m_5#S$o1B?u?O+dr}8Ou1KjSveQQ!%Tg!83fMUB>IDbmQEjneb zkf^8t#J=)`QKd1?Pk4dSIaBRL-7|7=+Ofn))1FmWfhQ&SVF>JTSUFR= 
z!+_U7VtkZJhmH2vFR$}p8g9tG5;s%O`{5zz&MP7^7w#u}F;dCYS>o`q5q?3ZrBnc+AYs8>Ggj{w@n+R!?lU!7DJWKQyXMo^FNVbZ}H`Zi0f>_0xEyKAYUQX(Nk4SR?}pn zyI7R#V=r{{UJ@6x*!ibs8RzucGGi7uM&|!I5U0WVJeG^dEU_0(WBF-D&~Zg6FoL5v zN!bq`TmnjjBHuD|+NG^B=e)g}G7p{-B-}WFX)bfTwD$?jj3pl<`Qp$%iXB^AT#kNU zo1wlvrfu_C+D5j2SH6F68l7?uDUQ-kNiZ(xnZ~%;8xyUe<(dd;D>l@seWdbEfC7cl z^mLzFXSSlFq@29WcYw^!8XCxbrwvfhW#U7W)rVS@mhXH5-uYrlmoAA> zhaJ|ctEGz+-bGhg`gg9X;NwS~$6rU}mPJ^n3py9^fsO5zq)1fr7d>{9LPmzY9|Eu4 zDcb(?|Na$SxBB(54|GV@DMl9VP&Fo|qh&1_f`6C#Lu7|<-!7i1|2to@I&1kQI`r&w zmA#7#22n&{gBWZq`WYRGTf==nOJ0+cP(g>B=b3(&aROo(uyd1fwftPxR zg)t!9SClAP@4ecmz^mh@oMd-x`1|ZfrZBO(&o(ZgTlL77;~8;pObjd%a?oX%^Wy_6}=FPKyEJQLfWCm z3Lje8{6Dk1F5cfo7cJSMmbj~YVEAiO&8Zn{+uricxHj$TZ1*i}l&6h3gmeL0huh;k z^qKJ-G5xEvflAi9Xx4!Umyh|Rq&}2yZ!x{_cdTceKeQxG-^=T$guZFvZqYNvua=hj zx@`x9_|CMvo9w(;T&E^^K=Q|(-!yX~0gpU|lDNvnM@G0qzsUQ0w(Fb3Ubi~`L^oAP zc-MH-4KKRgtUPZ=9maWnbVc9ud3v$Th~g19clpfs;)2Z3@KB1$!chUW#Uvsy&B~hI zZb7Z6P`*4_@!U$q;ZjZ9SyPVU)jL(UP@TC4H=7WLDZqh5n>d9`t5-_3HU%3V^nZ=h z=(_>E8%P3V4;VXtk5Ep~Furz8=fLAnb#>H$S}@F}oHgd>5w6Piph$W|Qr++Zv^MSV z?RURmux;y(rgJ>edf>PWj!`>u~Zf@bw2g)@veV@%#M(6ibrN?86ny?XRWB^dBEU`C z)@f9d?r>r6@pHOHI5lD}4Sn{VWfKXmZCZ^-t_&L62DBb1WH9-&&kj}aW}I4{Z|zd? 
z?f+*p1?aqV>i%LX7Zq~$P;ci2r9N9+Fx^q7@Qi1BP6`-pyXZF?QdOqip()$)^ z@JM6*PC37|5eONHM-~=NNdrs!rMPGpYrr!o@yT(47y^#qWiO+hCs*U2JRfVwC@T-J zeUWp0{aBlCyyMlY>>v4#y(`og2~B4 zdd);#a;@#%R&1sPxc$yAP8_mQG#mZU&8I!B!1BH>D~G|u(z3kJ<)9^JH!|4V{{8Dy zo4M>05Rd?O`j*po&mB&M?2}sfs?XmabTx4*7bCg8|Cy=N8O#GvNb(7u9=&3nuB94Z z{Wy>*#p$48Ih&QHLpJ9^B-jUlktms?Md&23NB|QE6E=)!D}hoR(5s*F4(1%cWLVwiBxBn_c$B1fsh`f62 z65d3=db2Yh78*@tqdNZ1sx7f_JaSip{8Dcr3s?tOxk-&_2v{Nb7q!MYQ#c;hTK`ie=2$zY(d=)`Oo0*Ge(;t zPWF!g+Ead)lJyd8Pb#(Fhi}WO!+*NsQA20^Vf2TVF1?hLSNr|Eyz}&?3T~m=#IN0#-6?v2KkEm}3WkH4|u4kEid+aZCCFisBNp&&E4k`;>3o&1bj+yab|6=+zwexX3gZn{stE#p!nSU)aCX)PZ0 zyvBB&)Q@#5$JQu$a`>QYt+8vAs)k#k<&0NxCjv4&U)!oaSY*&J)}wzM}>I@Bnq4>Rv8j zC&YbDWQXtGy`x$N1WFP(i6j)i{idy3h2gLz&l`M`2@~1Z0+qZG2DQ!I{dk^eR-L4`U@ zoyF8p_q)vhHip*VuH)9!w8`T5JLbusYK+Y|Q{R}Gv48#p!8qUVPqwbXw#YUAfW%US zjtM#L3?*H8WC+5~4(%)h8c)bV4mW_X<5Z7Jp%tceWD3S&fsvD^%&5MYEf=}pH=bt+ z3qCPo;>f#Q@NxVG*m4XGvmOveqs)~K%Mj@d#GX&I{DR{bOY}@nz7n6S?Tf|;iFj7h z2E6a?W@@?A8zSKip_)6xa^v7keg>iFD^e5yaAOZfF1)*30s@B+Uk*r##9m^6$jTSp zxT#!40@GWCs;vm<1t=Iu0X$^9B;go&by)1R^o-Bu%vZ%cektto*&vZwm16<<>THDX z#90MBS9+!(QtzHTp#diWyE&o#zzQNHy9USQK#hixW6Yy{?mqn^i7wqUj>6fz${O(% z_xg>SVr3|vspTaDHB4GPw?hWr11$dvF*2LprC)u0nYp>5mG7QC zd)Aa`d>TKxU7cuHcsM@jyMwmI1*x9w&Uac(#w_L0S0gPg-MadMb0;%CL(uDZg@)(; zZgsZ8_k{KhRip`}`*)DwY9u6j^K~F`lssE|_PEtqFfuUM;5KNvF*nf2{-7w?TpH_e ze$D+iYbT)6!6$?mh-j|bjU{=Ap~cyn2=ll+!p`Ggf{WT@0&5vxi`6mu4L#Ro?(@oeRN*0tIMk{ zLsQ#A6hew(bBd+8uDMj z?N6e|AX0{+XhYj62SIbJg=8Kpc=i<2*brHHe=A5}!wZZVS7*RY_bnr`))OBCqCGwYIbiPGUM>j*ef$*t!_v)mX`nA$v~YRHbsKDfB(MZT@BQnykUBV@EFtQSeiNh z{7(S_e0XQ^>p$c9&yS|Jk8P()X-I2|&=FH?d3sKT&ORx_eH(4!xuB?^4_5-Eyn}A9 zcfXFP*~4(#1F&%^{CxT-!?W(I(ddNr$A!QWY@Ku=JNa4^aV8Lb4*gw@z(b(UK&8{t z)8B43l@syp2(ZD$3pCD@i#sfmQa?`?k2m=}ZjL9{iyK}To5kT z-7Bm_0@+wt$bkn3*zM*kc%D3S?DvEV*uOTEV9EP?={%@rG zJ-z(qtB>iEJ9lDv8gfq3vXw4B*2&HedNW!<+HuJ8b*(}rAlMdHE%;%a<{Qfg_)nZD zXgZoSoj9v*_Sz1c6{yC5v{b46$p?U|Jx4GW-nRV)oCvN>zEH!bl$^{yFK}XMH(uDC zUK#>Uf6%}OXe~`Pw*~k)k_;wJ=)1sf136KDKgGQ=i*~=-q=W>DTY5!|=U&_$Y>@Y5 
z^^JdnGgeslvn?0w&s%XI4pb92V4cj+o`0xitK5CMXeSYE*V+XA+@ zFKxzNVsxafjb_)b3ZSl--!@eT?*8D`d`K_HN^lpS@?F-~nc>$uib(6gN_#~`g_Yjk zp{sK71;)Lm4M0|>BU(RmiufSygjmVc)oIs(t40bhI+?~)3=^YIRSrg-^a#iK+%D$8 zUgNO~5zP=LA-MmQaR+<^S?}Hz4;3b3YRE6h`Be|u^4XN9Pd9qL{aa35&hI#`r+(YW zxmI;zE7-iVaWc?dtaPOcvaH8(z)lER9v;Cm2@m>IzocMog^u^=85SV>MzU zLAxl~z?qQJVlQBbcFW7vk3(HP7xM&ZGJtO*C47`8@K^|BSYKi& z=;*e5&EYk;U{-lI?qKme>&j9h{UYZAQik^v${NueEU3RC{2E@Th0x|-pJ*ARrFWcp zx_@j{<;^QR!;dgc0`EKdg>>?!>3L;jSS8d=cKPIs1zoxPd3R1a`L(gAeIt`T6rFkm+mbW7L24MIQz4GfZKH)Cvgje8ZOQ)W}?sk3a~RUh$Kb6Z@zVqiM3u|`Dm zc{wXUX9B8ploS_}s8Vp9P+e$=N1kz)ej$D3%=KkHnA~ivt&{%uNr;Q%M}I=qf$0ih zJ@NGfC_G3=7`PSk)j-ua#`TXvsVYR@%#fOy+ieE>0??n7r%7T z)vMl-k>9)Qlf+(y{}rZZdJ6y4!7IxgA_%dm8@uuDCqnD2{~l{eVG4ZsP|@B&p1t7l zOw%>3svkK_%wk_?j~|aTIHO^g56AS4&r!S|yQDCZnSI`_+|Xxa8fH;L_BbS0r{DAs z47>)qvEF4)tNxEO5Zuv;xAEQ@alrywn^_88S02)RLgPV+djZtj-V;sDAKG;kaVjNT zxx^X#kW&6$ntqO);zQx<=hH?lAIsTZLa>sFUT!L2$KAhz6+<9Lx^DjQ5%2d7Sj+s& zmb?!<*wWivBJt^kaIpT=C+<|QDurQJR_jB9XTml;ROJw%lwZ%p1hJqG)RwHD3ikvn zVz4^E&i>%g+4rqlj0vHV&h!l2l~v=vd=KBK*?gzwpsMT!C<9y4cx`E*`n*@-lBJjJ zU8I&8p42kS&kT&pntm6ozcb0+~+MxI;YBQ@^TXPDZa|CrB zpZ|R?qx8q8o(~J!bc*cB$uGoWdWY48_qwE-*&oLzEOO+Sg}) zbxFHrq650{IhPZKV?Q9mCcVn)S07tj=U&!wwNTu?t5Maixz3Ff99?hL50!UX$(30N z4&RKV&TIRQNmqJ$%i2s+f;!YYP?sw=!=W;7QFdU*$p{nl!ub)p+D=AAgVb@w`L!-?*+ zFGdsXLY_^IyiIqj-q%Pt8fCqosT_pG(JAy6ZP+*#(u;0 z;#>B^yi3lM?a}+6)jwT-=T2*vT+M0OfRH;3@-azCK8Te^cHWn<$(G$*Vg=(H61cCK zcjS)GNrc>`M0KS0^?N1nCHJq2Ru68-zWe>u=UeFo8y7dkWSU#q++>;BXuUKup-n4}OOt}pliMFl zQS)sw+BAAFbb8~dhZc`jx0(p#8H-v7ocuz??3MX5eS`RVorub_ z>$ErCdEb1>!E?Z|ap2ZKtF{U))#NLRgr(EbjOUpacQ0&t zxPnVt9y%oKCE$mePxwPz7-V30Nw1i^kIb!uw zlE@vPnXZSXY+`t&Shb)(*`MGx|iR9#ii4LJ71Ty8kU6)PH%XIPY} zdEYvsWYeaNS7=mvy(}a)dmad_I!;SvXW_}NH-QNC{MiW^uFLYL_IbW-+vhXnHz&GP zY%AX>u2)h5-QDXQ*KFB@1oM_CGHGKL#}#$&yWJJPdvc0Jc3Sd*HMje|MYiE>e-yZ0 zHqcOwbk>awA7m}8CtG2Q<$~xJyPiU`P60Za`Bbj zvP;^A?(?hbc;7QK-&tBx+j!_e=r~v9{qp;DjkPl~(^dxV3bS{IK8V#YUomWmWX2vy 
zDupSxZy(5Rl->wCtuzuK`X7JJ%hPF2?{eSka)xR;_}Rl)t`&@Jq%sB)_u0=9lW1tJ zBf0!~X{B)%!?%%w9{<`B0H3<!9c+jZFcpUD6}o1=-s(9ET~> zx(iM2^k{gxj5vA@G*K+>jbS#t70*-b`eMhFEZ3tP+S(s4>eN*I&;Qe)sU7K{sEgZN z&R)P4LvJLfyFN;AO=5ct*ZuQ9YP)nK3`)BLDe8V@zrUW&$LYAAKI{9bin{988#W4b zH}<}ncR9cpS|^y7!F)Z?Iax6a%yJ(LY71lWA(l#XUE?czpiYX@OS` z{10d*>Z@tB^#bb_3htO!GEjqM|~Ts$fX_!Iw~edNU>Jx|CG zCFs~)=rD=-!xjK1=1|8DG$)t8(OP9F6bxD96SH}PG_TF(l|L@y>c2X)tvJ+B9A$z7sb}A&J>>`O& zNLIrrtL#;T1~MX)R4U4f25AsUs8k5O@w=bi@9X+rx8Fb4f0x_sbNjq$yq@FnI3MSE zoX2sTixgYUPECKA+WAottN#LCpFxwXIz$8|&$% zD0Ei0P8>0+I?!ilW$Ty*F?P#59;WTvuAe<-^2zRrJ+uFDd3iCVzWY~~6OOZguRrx> z0$Z?6C6ytA=xiJOJTy|SuR zG0m6VxYTEMJrTuFR3zu-YAFDQ4IMVDVuoE$UEPj6g5dGSOw)m{uuZdrYIp1fk?+JXM*IP{Un*Z*>N>!_!%!V<9xdjwypd9K)GfR*$08ypz{{EqmmC;^+nUiv z(qj2#zsrCgjJAQn6N5%MVbdB3e0Ox<%CSlxk88AYA6(V9k9|_>{tK1bb=dqAGB6fv z0q4(`jsExohm?QY|7F34XO(Z&$~wXv9(WKcy|Jul3^MU4|5nYJ63j2KNnm@XVAW&O z++k&>f9kG@e_?ke&8OMNW=iLqbYB-4x%t)wkMKcF)7w3HzVx2w&{lC_?L(urTg1NI zUNhxTSjQ$`T)#Z2udAB8J$}>{pU#o%J=L}^>^|Ue$4?47TbSRw{j2u9eL;g&tE$^{ zXtVxDVc?MU2UQ1F!f9>>YsjtNo?(h7pcbaA#?m9Ym)k?6b@jZ5|9k9!Q zVPh}mK)4)*Vu&g8$kY7c!G9<>52dTw8D>R5VAbI?x(hH0Ll&kVp4xjV-w#)mDYlLe z)-~FF=+MlB>Q0@jHpAYe&XRedbYEQEN&=WJR7ALi zWgr}f9G=qw@J-Q(t&5+bA+DzIog!|v)({j5=9H&oRm7+n^Ko|@73!<`5DWO z!-gHB+hDodYT5GT<0ehIT{;L7ItY>d(xu~$<$0eTWO{1t(z+^j>)l|~-m!w4xl>vo zUvQHe$=5eEZf=wod{0q>$}Obm<|ij7vt){npIGW|F=K_#B)4jt?r>YEOCaYkI%?01 zsm=}8&R#XIu}0tLYwPy5{rci~8~u~Tlf%9i|0!AdW@q&DY4<&#^RrhlowqQZ{?1qN z*d%#3mo8l&pYYE-E&v-Pkf4A7qxc1P{v{X#0pWT^-|P33MH8}0S3iIGGUs6B%Ih;` z&dfSEoIH=_HwK;OEBBk3PIWqDn#GR9F=9(`T6%=FzsDoD?Uarb1Hq0pS^sLeEKdDY zYt^de$&B+rU?0D(g(!;`dms0unWszEcMF`gto^k$a|O!+r33!Wt4$79p!%j`)~?%} zNGB(wl(Mq2?mc?wP-(xpogcQmlkLV1XDW-3+Q5)AH8qOOyE~D4;dL`;b;kTJGJ76` z2{ie}*^imJ-dXgCp^Udk{_GVu$18nnePD|(AKlL04rw;-Yluad-p-Zyl!fR%&kt#N z2vP%0f$;J(<8|Cje&o##z)31I^UMB-IOu?}l__nq@>g$r8xYfGrjT^-C$MsR)@ARI zBBupGAjSrEiCaH7-F&MaI%Z*GCv3Js%o?_8&FydsO(&;?{iF!BfO8sEP4f7Vx8 z{$@6Z*aCQ|3nri6@4i={R%p~UQUbyT%_!8=GDrvuRB**hADj-GSaiWJLq>|S8-NQ5 
za)&89XF6JN0eH0nUD1yUM$blZkHt{b_3NpHh4D9!blYRu7$aA0Rypq6w5{kkp)9mb z)7v}N2k4EltdneP4v(_NZBhnA{=JwMor@g`&uiU@^{9SM)&N%72{I$_LN%TRBQj@} z>g;u)=sAc-vk>=a<)GbrhJRkEURRF=1c^78%_J2RwC5O>xvx<4N6nr2bYzvHO!aGq zn28?XAi91iii~!y+xFJ38|`oq7Q_>yMZB~6y&9GNzFzTr9Zscmk7P@Uc>_3o22+*j zRqTL)=ij(J9JdChQyGJ`82+ISQysw2kjObq=s#fF+c_}M0YPcMje>$8Jqd`P8Q5uIy=Q12qBDG88S;RdR}3w9-Q6S1!u_?*Mw#n0+h(G@$4vgO zu2ByktN;F6{%^0#=|sJMzx4O7TH7pVborlex9DsbR^D)^Ltrbl9EWJ#?VFKOv63|c zDA0Uzt?7qo?Cr-tk2y7JY6}FIhW-0ri2Czk)x-%C=#*O>e@nYQ_*tpmGDgk%T;m(9 zBTV@hi!kM~n5i9{GE=ICYqoAZ+wH>Xcc*k=s5FJP0jUhqPmZTA7LWY((j`L}f^I{7 zE^hEDFZr(9;l%W5zi$U!AAKqVk#EHfRld28div;;=v8^qg|D+}`Rofv&R*Ow!l~bg z{T1;U&%@Kc%^i|*v3%?g&0Z(YUMx>4D@baT(dhoK(0^*f@-=TC>(MprN1#`H@v5WS#+e3{d?>7REI#wA+OHV>Z~Wab^cT@c#)d%K9?4DwYpI<4VebCxQ|&_|kqyxL77t$^}N zKu%3etc&d?X`IMpUc7igyW3#da|3|FI{3BN0kG2ezdxnI8AXWI@#y7Hu9g+V-bkc% zeAAF`Jn+qyA%V&tg|XZ)P^_n2u+-T|3l!Sg6PO0!04hct2` zXC~p~Du$AD`t11vUYV9P)4SamEp~A18tDCZV6=nYKE+a#V}_LpvsX~6zQ{wi5I+pI+X#v(b7P! z6fp~eL*!TcXS#G~#_=NR8J-3ww@68Fx*0DL$^QMvz#b$LfmT1HLc}3&9#2$|N|DeB zGlo0GEh?C+{8CX^XyC)c6u)UwD_Bxncte<7xa23ki`$J-h9L8oRKe`<39|oZ?l@>X+07J3A^%>6Fxt#aQ|FZ{ z?YJ$;8nDx$R*-p8uoW;xd&c-X^ujc@=LRVyiQPRq6=HqTHL!k8lKMPKC;c5XFA6@l zrxkQS?znB+olwt~FEkwYup7=Tbd6WCqVO(5i23eD$?+t35*s-Hg|>%`7-G-su(QWQ zCZ8CNW0xevetz=tGNlmZpr3kYxgR8a^yJX0wimP>E+he!5p#*3H&|*g9Jm;QKlTG{ zuJOFKqX}hJ>yBAz_Rt%UI$T-*!-c~{Mi0Fnw)JJYy^;DGa=J&6l3&u_tMY|At!n)EsNB{D{PhK&t4 z%p0w^%bnFRUNc;n)&d1y(DXS8`T57tF0`i^4xmqK+K%o&fz%Lu3ToO+Sob2@!Y)kw z$RJGmKo#06=7&ra=e;c{h#iC{r?4k*GZR!xdNOn^cn8TZU*0lLP2~sB*VOxa%2m=E z!foLNreo+l-ofWsjBoZ)nn?tYW3*@}qn@LN;abGdOy+*0P|+Y!1#{hw{{n<3nHz}r z|Gb3@leuNAJY{gK%gzz?$^4kXwt%}&ft7|{OC|?EnRs1XY>E@IO#9}k+4JBz`$ahS zG;CTo$`iRs5vz-RfT+z`FDc2%4KQ9KQ4w%zYs=ZVb^Cqy{<}?N&+-dz9|N+CUWM0wYI6I(Qgg zyyBeSgiL-u$p`J`*6-jcyFsj@wRAl&S)uEsqh^Hv-f>@=-MDds$>?dH+ZUL(j7anD z0uKakpR5Jo8sV}1XX?l_v$MQ@7U5-@cB0cnLdl%dSLg%96qevdXwn20K$Rc@E8e86 zW})WMqY<(gh_$G$g?j{#fA8VL<`gp#mMsej=P=05V8!NR2(jIt`HRvo4@@Sy%3wvl zdl3$BZ3jAUJNr8lDs#hBLh{{XiauD2{E}c4!f~aI6 
z13;M`VL7#hDRU9iXz&Li%Qok)KfpeM*!g-0)3=A{%1;L&t@^ z=4!bz%9-gK;Ms*Dot6uwixkMYq=$HvznV{)6d*JG7V<33<=Rj!>#;BU+pgezO+#)P z3JZPIAYkyP*H_Ff#werDC$E}(w;&#Ock)rIY+r^WD^C&B}&;l{gDJtL8s6R-Lr=OW{C}c@= z?uXw$b=>Z-4=)HJZ0_WAAw<`H($S$Fmt_hnOxiijR@z?$hh8gS@`MSNT_+qoNx5l! z!@V*@vWtZr*gn75@Y|}Mj_=(ap+Z_g+`1tCrw_@}$59nOz8}mN2*0YzLZ2qrAiA*Y z7f+rX>A^b);?=ad20=zL=IJchaE@1f3Ii*dr~pIFgl$2f=KAR6NB5C{#x`gu#n z!(oIlgJOR3(|E5xN~`kNwP(ajd1{MJvs5(=Mgjs@vnr|9&!Le>>Ucy*#%XwYsg(TY zO_`Gx1_6|_sAtYR)MsVU-Ty9Q7Bdd}kVkeIGGrDzMfTCn)ukcqzgO2T&zKnM6P}^k zjYM0l@8PX#U3>BTIf_`4L7DuNpC5(`JI6ka13?VQc^cedCr}M1`M5W01f7q zl5K%6aayy-IF3hTwWKyMh3kL}Y5c_Fl0j2L$(*x(eH~Y>fQbS0y-CDEm6rmpU203L zmH8%&Bfx1El2I%xiMXUrb`ICXyJCcn&Q^{J5+jV{;be2O&xDio* zNWC@k*6J9wYSgF^t7dsrO9*ht;TQQWEr|CTK{X%+B|pJ+d$4Plk}F|B;1l%Ch3YY# ztd)kxTQ;At0BoB~oH$WV3^KfPPlr=SlEjRLn?+1R`d=;79PLLJfeacB@KmdC_K3w@ zSdX36Z$&|@aR;Fb+MH)BkJ9AHlVN`-m-I)GnO58JhS&G+-^EZgsiexeA? z+P2rwXfSf#ypEs%I8w;Pu>n>64D;v{Y{}WNZ(nl?;z)zx7mmMyYC{Yyfi*&`V__k^ zG&C~mGI;ReQ)74SJ$iK6?#@o#c9@PC^OSTLd|Lr8gp)7~h*edx-c3`YtUE!XT#uO0 z=P{2f5=}7&F!hhjQsf_GfED}M4tFI^_(wE3nkaS>m1x5Q%|T@O#%;THjb)76@#Aw6 z#ulWe`m$^&4k98|agvh3Q&dH~Vf4_Dp8QGLMQn<>f_~c$^kPT{-e*M|2v+6!HuTQQ z%F=+9F3ABilQ<|fe!lwnaXe&1pq{@Aek13Y&?u5hIA!2CPsW<7j(RI-eaXnz@3K}Q zoWI-F?fAUAe(FN@JAByYb!|;mRi6nDsOK@5K(M*sc?G}K-acfMT9^{ML`ctsN%8RE zi)4d`hI;hjtmo7SPOJoLDB>Lst^ITbQ9V?<3+q(bb}|)`L4Hgrz%SH$IPi7l&H4lP z9u{i#^w-t!y%kke7n59X^cyt{?jvoCPKJi+gb1RBDO?y%<&{Ul5>ICQF>?0*ARCg(uf42CN((Nx8W<6Dur1R$0dh_~OFAf*Cs#Rhxc{ zZtc&XLM~;j2LwLM1xA) z$M+|Ti~YxDCw|q_e!Y`%w!&$0cAPoEqM8xVCHp21H(bvwrbP_mlO&jNAatNf+5W-h z3dT4Ob|MF`=wUmiM4dQs!Y&~(C8Z$R~F zbt~{lRlBzyMjy}QJCj41V)6#72IGjVK2CWtN9I)I_OQz-k1$X7$9arUBOX5_huAAn zlQo4`^@i5&#oaR&ft(2!m}HF-bi*@8y#sv@<$pN-^ye4nJs3;CPm)p_STXkJhX@}} z)H$z^dM{+~u#_<%^wU^$N-d8>fIhunee+04!+eKNc`DR|RGs341?(&%kqJ0Lu;*aR zccwIHOZ~Ti&lb1U)$gBdV)q@HQ*AvzCY(U`ho~e~w%Fl=#L0PMS4hiIMkS+!o<}OU z9zCQ+%kCFIuJT4}l0sRdAY^AG?N;bV zwGJ#GBiX;z4GK$%MOZwXX7>b!L|6gugn-~RP&7HxZ# 
zJe!sg<-;wFEbPvU{PjDDlVtW}RmJzaow)n#*|R8mISnPxVCsN)u9G-FXT?p_uz2Es{On#@}2LpE!nr5*7i8FQU8 zb>l*zOLo=2xn|*0UXv8TD1A<5ECC5x1i181;h$d}zJ2?){!7MKF%jH9Ka(HZKXYTm z;Cahy+1ZnBGqKi26|vpdw?7urnAafvWoMh_q~ zs~7u7|6Y*-`Tqiv7Lqa9crVlJgUUwV=wUr=kBza$-V-Mt!?Tz`Pmo(OqR*gFUxAVn zD>-WFMq9NRL%f^?gC;{7h8!TEasas`%wkU=bZSsXc30=&!wb}gh|BZ6w$>Wp*aS%8 zT|cM1=TZ6pHhNl^iZu}Bn)rW-cOmT@Jw-zfXxOeNb(MgU_z$8ME=#V63eZ9i7O<(P zsP0H`a>A*Pd#iB|Wz;>vkM?&e2bYjN4N^X4hHW`=qNL}TmOB?|IejWZt%2ez4n{^zuKy72%TkwgPoYW#HWE9WHzhr|u(Cy_PIzOKorlC5!ZadsQ(cRHcJ-Pq19*iM-_ z4)gJ>WSLSZ0NvGvR+6Mt{u;BrMlq$Ei}RiSD{>nw9WVnwSZt4 zghMgTq>3K6=94Dy1IvtgC6mdjfpilA*eR+6EEsM08bIz%X{pCctE4nR*!cp>+b6hV zC3uhXnl&kWGan%$L6T1Cj*xrvT(#=P&5CwNrn9V^P)0-r)5+YuR;)NkQ(ONO7e0bV z_!rXyc^z7q+JP8>zE)Caa|+pMz9}9DP_`L%w=U@^R|mQ}h)V@|!v&4h_m$H8fkTJ3 zWhwGP!e(7(PxYeOy*K5vCp9I6prZXPf2Li=rHmO4lj)I?IFLmN?oXw_`ol)n2VhLI zeq@y!AO$scI_&2pkYCun|-T4tYW$0ya+mpig^5S+5k|pe+IghAALRG z6*$dRsu=UyOY;)yiuD<}BG*c{jc%WlVl`$r?O6J!z~Kq{Nt!vOtKq>Ey?whne;reX z$HTQ)k`dB?wp2RKy~fywUG)%KCKMSa*#}fB=KUf zKWm@rORW3Zi*Dm*_F(!53R_mvitj4CyK}+*^q_Q%PEZ6~TlNADRqROa1sF`@wikhd z1iDgqaD`%n!9*o2*O@$bF6}1- zf(8IKBaMxXy=$IO)idaY6M^MREZWST=a=(PY;UgKCMt6WhqatPXY5bko6ylJ1uFd^ zXqhTCzo;jOFNU5!R|DIX=H0mRdaM`NHN@?G_{s4gnbyz^2UdC?8Ns6)5y_Pl7rHQX zSKKIxO3*q;^H1>xsqA>`GM0)XXDP3Qx(K|d8mLK~i5Hm9Jf{oeT4Dqm4L(fbe}`W$ z0x7gtq+drwkr4mqm#SK&Kj_&RK9M>mpzWYU82rl789GJ>$fyueCX@Vc$RAGlky9M8 z4zoT830nch!2veGkzq!($O#3>=Ek{YEvjK1h0eDN>?)>!v>IpG2tODj2{|nhV=;}a0`b94AA}C8hqjBN#&-ay7=tA>@ zrC^o43(&2alu)fji>;g}R4z1ssz_-?c1wR`byBsAwEtOC@tEf#X{taS9LbWPet%UD zenH(kKs2Ia=k>N7>b{%BNh177uln1byb)=ALke`Oe*zA9VCbbXf1r*|eU$htBhgYw zcZ}2H?`&zPjktI2!#-6?5T-tWVt47?dl&d0hGi^LJF4S;OP_9)$1~G&EWWyoY)K`1 zUxJ+vkCdSs888BX7aq&IpEdInNwE%2@^afee7XeX_yO^8BlwFX76g(rTl%9@TL3e>U@9r6tKjzZ0Nbp8nm*k{ zFjNJ|y=I(Z;X{A-WqBc|$$90H$09+WR)+xFmTV#C7#Q1VHYv$2uU4jfvV`7ehBXJo z?0?sX!$r(M04#uHLdkl7qHJwWw69B_k~*|w-@es1Jq2o9l9Bkz`ZilBBqU@>UJPS- zz8y>$c5P_rKkyr1nfMsaDc^f0W65(KBM@w)!ME_6yT4-CN+(<#zeV{%hDyuj0~!Bc 
z`j4{hL=TUKyl<-8=SY${S*0M9drWbLX+gkvN0S4_m>l?=C=O1BPTT5GbJL09*v=_i zMUGQgSU7K`SOnnsE>u|RR;|ltKCB)ruEQL_(my6mt6#sq=IuX3Mg~#dCwUE<8$TQk z`9Olv1&aeK;g|7CITdt}u2T|=0*0ZSJVAu8?|e!~fq)Wr+MHhmNt2^ikpM&wQSgA?$`Ymxp&DnNbFg{pSdz$YlaDXpLZCT{-j?GH zLRQ@TnakKpVEA?M{*t~Akpvmv&GQr;FG&Afu=;CL?#RoNt~ZoRE<&Bl!Cal}DTTHx zAZ`m&L!Fia1wkce#bv17wEW#BYU#7CfHtMFY;2;#EhZbNnL85vsmI8AHvr4q|+$&SP$#wdz2#n7vBX^i-V5K{ki`K<;AEOBAfrA#GD?ZtzrGY z`7zpiGR^)kjF|ubZ?~`u`b(|(`w`>(P4puFhd-kow#Ur-|K!{JKmKsd$1J8rt^2HHlJzcbTO-O2eX6d5F|TMr$2F>G!Hh6bWyXG2-K<>>seYhfBp z#i0;no;zA+SFi0MY;Obve5j?{s^85uyKfJ%bnE;(o!?yrw>^pwGI;g-kpMx2M^Gfz zToT6!YJ{F-e)K2>BjW%up~w{rif3mhpXF2~P;tTH73W@lqlC|I^ryD)4`tX{zz6OK zg5g+FYJrX+YjF7bVqxenGsc)4iL!Jwh_)~>0KjP1DiCvpEkhu{NAw+J`y^3BbvW`? zy(69lb?P-Va4RPxD7DCcxW}x$X8f;jZ8)Wia5tc3SA6{9fK~4%km>xOy%c`Saqk}k zDictiEoYCf-Oh({>v+%jFx-Rw6cu}40MHLkkG$&M=rP3~#TGkWSo9-)fA1_g8y3CX z#Y&pmLV5wTlEZ?>L-{D&9tCF(ZSaXydcEloHWa>-cS!H;JwQ=M>w$CESDH3d)*Okg z5O2H~4#<3Q0|CLIe++eK>32I4xe`jvJC7bU9WY?P?LWWQ{l>8r`%o}5uY;V2lz?m< zOD)n}+Bj57vhr9?&(G>AfontKpd}w5^_(kwIMlD;v3H(7*JQ$UlEWE@WEXSbi~$w! 
z;-^Gd_XH&vF`@&EFYMtT=JWPSiowUAo=dQ(7r#KeyjSpfqE zIBc9Ml&lR;lKLQQpYP~6o->6f>+=n(ek~3(1?WZ`B%##r3Ur*HgYN8iZ+y8Ai;5|N zBUXx_VSv+vSu5haA?qs8_I(44u1zY&ss5SG{e+1=WTAvFz{L|ZBkAS{gNBa!P*5Ae zjUfw=fINr5!vl@|UbF)U>hj7rV|Y#PnOjL>E7?3g{>^BDaa$93qQcW5bzPhlZY4)A zPocoG2BXhoO}E0aHiJEyaX4a(%T@qn0edlvX^cTaI!R$f|f`zC7^?G&Z}3S zUQu`(z)W_HM2P=rBO)?@VDsl!g%`EFAfgHdYd`j+Y0xXk8u-Nl-UZ14{gR7O!*q4; zMQva8wLZ|Lz1rzFZ{D!J)|0Ht^a`4#Rrzj(Vt|ljNKP4K;~Kcx|Luk-=D9P!(*zhy zUZl{`B(}418@?joE`-GN4~-6C+3+hwhKQ{bUcW83hkT}j?aTaCcEtm}Rb?crg~m>I z7MuJCi=ir|&5GSek3L{BikS41E*Ftg1_lO3L5`8TObKY0&X0=oCv=3a)J^gVA-z}g zd>CdhC4bk1DnrOG+vjKq53&xOAslOzbK9 zF)#LdRaBJ7z7e^EY#S<=w^S$M$C%g_&@FBHTu(7DA~D}*cWIh-n7KH`$^cHb^UNDA zw8*+oI6m|cs1sNd2)CycM+&3{v{jA)bqS*g{7QfNg$3k*^0cWYcTq1LCw(>G3CZu1 zMmbQS!ljeq0wl;Q-#|j(`n~YaZ%Hc)*H^2+vRlly5u(V=4m;keccxaOjkNOJgVyv$ z?hdJlO(FxhPd699pfB`1v4P&%c2F}CdHDhohax2>uU}<$6FrNX|i<*A5B{IvaC6EPTj1Dqmey!C#0#E7HM4j!;wgU zh3!Qs_bKD30vZ}QW9YGC$6myGTp2Y__BSvX5e)BzDf#*P%!sriW~7f>MUz0RrQ}^O zgE^u3Jg98g0;p|N?>xqUz`+qE<2(ri|Il2Cxpk{~ixwu383b0Zh^uL@AY>@m9Moxs z@JXaG0}vyS63t<0Ok-{_dL8VXWzL&a6%|L)mV?OwaE$?VN4UC-9=EQUzi;SUZs8I( z094C=#}AWRZlxO-`7eU5gZJ1%%9=N940o~{Jr zfz*K`iT7}h&nI|WXouaJBNr|rBXSfv(*8ny%IQZ-;w--HbeSP|)M5H`XV&3x zI?8WLN>%}6NHCychL)H_KbNybvmBeEZah@wk%VddkKcOzsqG*q^6J5~x_a}{!fU5y zDxOZX@z=z{fErlR8&Ef6vNo6^ub~BqLz*yVFilgZK*B|cq|7k#(E-(;w2`;JPV%i6 z#bg{H4E!)WT-_#W*EtDmkwhO_B?d|)HB7vb=^jL^x!9(W zBFXTW($a0dzKuGz?46$Kz$p(%F31bbs|@Fdl!F{Zl6A~X*8bcOC7d;!UIDNJR274OIL z^;*nFE;?`U*x7$$P*Bj(^oz!Nky&Vz)cK*a6t&Vr{!YXH=+Oye;H4R-8XZ4=oUx@Kh#yi~G#aPG4r|PZ2bo~Iia&Y2n`AdqX zw`c$zhBQXJk-R+#ItF(wruWabOgUrTzJBqW>JOLBBk$>PA+SkF4q?W?&=5ns-Qb9= zuepJzu+k(t&s)0GBY(!XFJIoWPZm%nug6WAq{zm{!z{e5=~R3d!wr5@S*eUmaJuVX zzamYs$YRL>%SLPCeh>a}dZ^{(QyF=mk4BW(lD7hnCV^-$UTHohdJmhcew}e9YMeQP==a74|mEZIA0? 
z9V0EWemG<=;=STKMZSUkL@|iv-G=m4WuHFD1f#61AjS}g6N{M4OO4vS`;9OQytW0f zqQlvcdP7({Pvd4=6cbAkp#Y=BbXxib@)ic*Vald^hguNA#X55IZ50le0fp=FwDzpH z*Wed;k_fSA1@yt3ufJ2C8DE$g?O^5VRu*hx?rEeznvH(vg8t9jG!}&SEv|N)a02RL z_&z1&4w5i<8l(Ry%sOWouC&>ou@K(%CvnRf)&Uo0o8HX$VKHvEWlOf`n1HtTdE_#W z3;OE_$n=a&9DGhPRnj zDCYnyj#QeA1W^d0G9UOn#vKp1T8KAVY`Dqg?7wh~qYbP?Jw0((TNWvYzhDg1 z3dK-Q!uJ|iH@m+jF*UUjsku;-LB<$>HJ{d`NV@1e!S`Nn_Jkvwbnxnm!uHglxY8v7 z{HN26kcoZ(s4^NP5lSCQV9Fpx=!$pm)~7rhb1E2UmAaC6i`G(%d>}W7;1`NJj#KqI z!s+20Z4=v&7B?S!t5<=zA7e_(ThyIyIGkFY-IgefqKSVYQbf|L4Q!WOD z2cc>9fP!@Hg9;(@2n{cCRm<<~_Lzm&<%p&W2OZR&T7m>^FMFjL7{H#&ocw{7cM0jD z@F;O}vukTJb7n^5%)VqYDKmyqPvlS z)Y1ByKU@i4Heh%bOds8pxgoe0qd=cG33MB?oDe9n|9cYgn>6=70( z`VK?{ObYvc;%hY(XV+oF+7Y6N6Qn4$WDAFV_*oMZvhfM9Xr0_A5X1!m0hvk6$!UfD z4jdIEo_4P~kN#g&6Gw_X!3SuBLWrh&Hzn-;br2ppZPX_e9S7e0se=Zao2MaPeCD+R zEKjL$Vo3~;HsSCp^Kp3LM z=dTpo`{!TtK`@<2H{*TI#SXNka>{t6L`+}c3c4?<^P&P*yu9X5QbEUe$+I&m;Bi^` z#~Mun04JP?M43F|vo+(ec`h{FHt*PxJ~7FNdYQ*X|5=H&tMt74fxPRQ)-Vn@GZUx< zS7;tR%3h~Dm(8fR>;&u;X^f#rMx-bmKUkO%wS%nMaQ{9Uf?oVuyg|7`LjVn?4|D)B zxua6^arIzP5m7tg9$4>g-MgGBqAL@`kzW6L=Fw#32h|2(QAu({WoxxI^a&gZxx`%g(QjsnnJ^ZyUVoow8J0h?u#nN)DYXILROkj~0+(<;q~_ zpKe`ZaZ!9zR@Sq@pW3aMWQ(yqN}R`Tlsw@Tw`o{X=%slJ(U6SLmiJ9gAHof>5~ z%4B4N`lGrho^8=xPfzjOM$1uayIob=JF?@Z-D8Xgm%7&0q#g_kQnR_&)u5vK?WRwk z9_1a%J2t4~_^J)v=$e9Zo66wP>b`M zKa|yJ%-Mi0PWRCZo6Y8-MNprX@g~XD%?kz8zVM1? z8BHjF>Ak-G@F9Gf&Eh3X{4jyIcA?e@ZZ({U(GcZK7JpTD?=js8rjmR*Xv*?)wQCmi zX&wLjr#=PLvvDi@cka~b*DsJqmjUuZ0_Z^*C9_>QAhK(so2h4132uYLjsm}Y@POWA zgwVJK$si^nHZ$U0FD_=?5pZ}>gt}W%JT+os4W{n3SM!z*KEpSO5&@_-{}c}2b&zB zj1sCoX;>j3L&NDFx+c!=-kW!qOlAxlHcTokY9_a3&zAI)=ol4cK)HVUN*S+^eOZLg z-cnCe6Ob0-=vU8UCMK&tS614gXLQr($Nn!S7mPg>GaPjklu0PqyUjys)5wQ%OS_>~|uWsabDh@*L=-Z1GXXhc?4R8+K1NoAW~rVaoS zPiY%XDeK%VGf{$4>&T0V*){j}`qs35m(LF6fPrNea$ega!r8?|w27p8r=FH6d2c3G z3cvPPOPh|*^|(>)Nju1#S-i|d7F^Bq#QuBJ(&lfAveqp~xqCOy&TsLeMS>s8Lxvj~ ze5L2`;q6(pWO53@g;VJY0z%i!-W5vng}VlowClAo=I5=aajKm?QeQvFyoHi!sOkn! 
zsswW)C_S?c(dE~{=yh8tEmzHZ*}Zq~5>a~b`G5ez;V8RCiAPo%Hc_8)o+orDV*15` zuQP;tL&GCr@aLObje|#qS!Un-QQ6Vb&pY!>{1y&)r0tZ74i!hTHUYp=r;xF>f&BX; zpp1i2_@j849|3@Pzq4DfUQNk*${c|Ojl1q!I;);#3e8MdTMLf0+&@Jx=w7kQu{ja0 z{tyOeL%BXJ_~Oq;p0{>1T@x-H5_tapOg`Y)y)}Q`+O>mOz{L#0i|L%+kzdUdKCg6v zbq(kmcG)UNel?EDvs-QaP*>ypeZ8%TA7s=gf!R9u&#bwYnPF}Kg*G)Wk+A)j6dYg( z#r#?t%7a+9f~lag$a&GSO`GxT<>;EaL~M}O7?>HecO5u*&>2068yPX1jQr zn~|0ZyvCq!AF{jg|6;aEL?7{GxWkv;XoFHn;3o4nfM2THHI1-2pS{#%+9A#zYt4*3~lx-oeD!&=e1BH>X8HivQ^cL&X(`U@g&EdP9%I$E^ z=stN8{uQAdFF)HWb$qY`k=^dYhquSad-*?X5@eoCUstvm8wb9Y_2OGxxW^PNEiL>P zuKqI9KgFTWf%^K*o6%$k4nc+<@f;0^q*O$$QxQD0ADFk8+Jnh_T&HV8a#=m2ZFcy% zC@cEi)PHZPP=X0|+fNUCVR0_0=VUtvmJ#b+9v7J-Wid_JvD-stb6_}wPK zQi8RwAAY|eE6J~bI$b%Uz<%fXXunSP%OdG_o| z_JTAhw|j}f`Bq;(ZV1b`wFuTJs5xWARMfT(qv{b3my}(wdnk3%($fJ|vdqU6rr$F% zFgSO`XQ-m;jkLF4J~kr`M(f?2ZP?8r*8uKl2}@W;F^gT7pkV)Ue zB1~!*7$N1|?dQ##H^tFuxGK)u6OZjwE|k{UZQqllliOZgnj{mw$PxGVo*O_MUz7Zc zQ+gpHUiwN?X3RL}>BZ}y5)HaGEv8+VrOjd$54Ft5kKuc!jaoj_Ikp`;PV2-Q)*czL zwL=6G(;88bFHa9vn$<;L|Kss%=Ce<{nQVOGS?(go(xeSN8P`XVPn7vg0ia;);};y< ziF_JpLJ{8BkYzzqp=+%; z05q63W82jNk_x*DtcDcZ#M#+^4?V@?G-@DNH&lfG_dASir~v$tP? zzo8bm?b}P+WiA~~hvY>H%TIiQX9iEt9B5x)8=Bv;Jf(*%gtTW5OiUb_U%wtzr_-&X zHM6jU=>X0n)L$5slJNC8w39>U-v@a2$&=3kPa9AzB58`wxWN~q2!^QjOb~2wa=C?NNcQqiM0qywg3$xQHMI|}oy)|F1p&n;T~Qj@wOhAU5C!4Ya$4=k>;tDuH2O$*Gis)k zpM;XkHiPCtXg^mKM^+;vCqVT+$gc(pbrMo1qY^?Yg7Xr#JlCT(B8=KR>uG%Yn;LmD zZF%ydb?-eh)4Hr+=Bdr8S6#KM2EwqvU}b((TRRKdOqi>-75{~Wh{v(=Tl(5Ay-jAB zp1w9kzn%4-!(&66{f?Vnn{>3T*(9Iw2ks=*n=mmU6(W4X9fd&idyk;Dp9@eP8Ld@S z{rcTR#U@+MO!WSO3Z>OrYvsMo| z)nKtfpJrP9$erI~E?kqGoDl2rBwF9{-2M)O>ZTuH_|ue&?R9kSBy4#S)hV{Jp=I`w zA=-5#BVSc5d|mVIVM1Qpw(VZW&$M#PW?0{ov%8}HYQ0~ek8jYhl2e4anQ9>b6BTJ`#qGt;xo=zYPjLbGMpofw&CVyoH?78zC) zcE#p1u1#?6cf_cmwv7A zFfn~+cclNHG23guZW*=?pRxf3UUSb>j6h%Tzy91C-EnZ_WCaJK)1e8Q>N%HbefiXV z;Hd+>N0&Tu{x$gMh~B54-2QW0?csy;EibNkuP(T;$~$vTy?RSB&79r4P4;QfWb3(i z-^w;m`Adk}R)()?tE1Vb zJ6@Gt_x>|>jMDsA-}3!UH3F`l>a5(luBmC4jprOFfB*Yq(Mr8t_TVu*!N4^i>W>*? 
z{AKm8wVj#`a_hyYzSxVCPk4=Z79X$XakNhZRM=)&`LV5SuBshP`jdV5=B*}=KJ4#l zeACI}#^*zSu9k&28$H^2P0ORts$8B&H<{x;N;5sbpo3b+fe)`NN=WaWl-gm$!7D%K z_UM_Ab6suz{0QGdV5I;3r$DV;w#`P{UjOI^DUHxl2@F&WIBu!5W=-O^JX^eKMooF2 z-4m+Y;MHmtOA0+(+1i$GP~KL5_)kTuq<4>74bKT3Uz}*xu;J!sHnz%zo{dm|&wKR2 z%h1T%E~&gC?)UG+?`_-ZU-Z&Y_c;8oZT>1!MA+MpNe2o3>Ms@lyOyDshFbeZ)666( z5&6*aD|XNY>9*3IVfQoJEFgXP@%>15@rLKb0lZw2d$qLrOXuPGO`P&J#{X_+vc0O> z`{#t_%@ZzNz3#iCm0fPX;HF>hsdsW2yzJns2W!_Fr|&n8D~&TSQ2Q{maNXpAbFMqN zDYeZH*rUGwTk94YS&?sagOsM`k0r|wM5>ME*MzP}`!)*>mPhLc0NZ z1zaJ6YNX7KI*Sh5XaoSXXHt`r>XpX5J!E=(2{DlKkW)7UV=9}gOM;*E8Xaco@}p{@ z7!8rQaxM#L5{xo{n;wz9sGN$P{0km&aP?Zet@s9xJed^oA?M<*gb{j+5k8=>1L75l zG`-N=<=WcO{?L9YqgUG)F72m9q6+#(AqJ-7f91-RHIxAFDD!{;hE`tNZk9iWYXI(t z>ABiwj9!pA)&OP`CftSIPTtvL#dybQdO?uN1^ES~1^a6yLB8^NImxV8ka5D@4KdvT za>|q}T?*H6Qt19HP)$jR z?wl5uu>_^8Nv{L*;mqJ9mGb9a1IlMWz=6jM^H z3xKc5;Hv28E}c6Wiz>Kcc`+;Ng;{R5?SL-lFk|!+{w&racF`JkR_=>K;MddiOhYat822@1I^v= z!<)kVDWrao5V`=I|09Et5c&^L1>1Wmx11wq;$qYnx zr=5PaX#~g_y*rpOmW~k=)fmurpmc4d)2uOKertY3h=;mqo3O@TbXPp!^mU=mFvg{dao5t{WGDy++kWb6w^VgF6@?yobW*+7q zJ(@bQW02A>B$*kkzZa~#2Od^1#f8npvzN&TbYDbwC;fT|J%aN_-w>jq#Cya9kYq5i<-a3&%_9oUmz%#*Qqbd30AOW663rA~f_ zhV9aiZlBiw1Nq2)BuxhLgam-=)&}!7KDq{SAyqFq@fd-6kdN_JLg}6$cfNwiR{BO{ zU^6iZgP712Cc1BK)ZmVrS$xzdGI7hEpO5e2i}7B$n|zZix*~T z4!+QATYUgB0BO{PBHW-rKV=<(wzUXeC)6v(M;n7dNLowF9-1BBN;M_M3Y1GgbOJg$ z^w`W8wKUlILWGA?r9E)qj9X~gc7kZDvw8(>q)o-j6H6jwFn+MrWt5PpZy^~9K?DD% z5*nysU$nOSz2AZZBI4R4`lH*JIcC+#P!&Mo5@uC&tncC(Hp57Z$YTBra5)3ahd z2nUkW%c7t~CABEa;eBnUiHQQi2()i2@Co|-`FJ#W#2g9)IP zGh*Ehk>kmHZqQ^kHMKje8aH-_wQ+0-nf}nia8?K5M#E z(^?aCHY-l(*;o@Xp!3e%yCUNT>Y!WZe#~`7mc#R#8k3G~o_cDlb9DX{w{AZ+IRFm@0>(&&Py@tdj_Vx8&f^}9Q5F&VZEv`EGsXl+LUi@Ix_-#Ags;J} zMzb0m6QfzLUOfN~XbbLZ*NQ$?zzQN;pF_$XrWq9@SVQx%RXSBVgQEa)x%Juteh^Kx zU{#&>m`S^Xo=mr{T^o8EkD{3js!8AJGpZSxJ^l!9Z1Etd_A*oNX&$A&n1?TaGlDtJ=V549Kv>k9~|2wMhL*fWkEI=lqa#^9`jmri39vLo0n(M|ek zWIShOQJm5S&HdpxZ{9|#?@4RzCwAU8X1WEsjRu%9~`;4s}$obFPlT;^q+)%Un^|twvT+8=%HaWs9lolE$ 
z4fw85G2TUYResKMAPAudnYjQl5~IF)#$b8;$z*%O+F1nh56i9(8g(EBUqVOUzMQqq zk9kHI4-(0n2GNh9G|=1dsGHW3Cv=Q!#B}YSvwjDg_`>9+zK0K+AU4!$=}- z@Rq!o4?f#CaZpMm z3F4OqJnByYERIUirX!=Q^wK7PQu{!AqHDHs%7&-3S4o`zHcV`U)y~=#x)Wm=eINCg zzVZO4oxr6XdxX4u`*vRCWF2ump@ej7)dRZPN*5RHu3g8ho?-eWaag--KqNSxk|I%N z|Eu+a&hGxoa=d`nU*6YIjQAqFU*44rauXVXjm>tfxRm^8Nm0YbxPG(_s*Q8~*iy6w zJhoiRC%K8qJB)Wd_1diBx_x`U*`Nj}XN8->ivxTRi5!cMn7>%O6(ScVm8 z+UtPOD*-?K*csVWD|qxp=gc8ngpzmKe`xDAZ5%)TY~O&&m?IY?!oMmm+49b8|NJomJV&=H_gqoO9KMQ(RvO_>%U z6zRFp4p_b7fCABi5Zp1yoB<@=WMLAZ2Cu@BB?wUGFhIo8RxO2X5gi>p$JAQLeCUyY zxTW4VT4T$H0}M7$jNZV%y@gIM+C_*yhk5p8o0WyT)c>%phQ`d6B$&;U9ZjwZixwPH zYuFj&`PRx7UH9S2o(Nj(j_wV%Eh4hFo0e<%j}a#wbo`@^J=AO-wt*uK7Vmuj-g8Jw zEo=X!l7SJ)goo(FOAb7j*4!@Yy zhCS62S6Xr@0Uq&+5qz9vL+)3lE_{3vb}(0-UXtPLSkMzk=34Q5sT zm>rEI>O=m_p!@R*hX-e0O&q17+G<1m!{6td+umB^U%@yrraU0#1|ya6N$^#8MxjQ! zutGf0TX4N2B#GK>rbA0Wj_Y0RhlL;QfUZV=w`D#l9#(Ct?M{W9c>%o)?n*bfNPw+n zO@p!;1p`4tSGMh4GWDe}*@Ynk>3i^wMJ_jvBJ`|+35#F^d)nN6P>C?Zpa~dXGUkjdGLk`Qm|fkYKpuQMg}IC;}3k>EDmdogB132WaiZ zEw|o6_SD{E;Prbw_|PG!Thf2Wgo(g9rkmDO4SI#7mV#Un%OuICy>@%eEzVf#l~9Xq zfV6xI3p3AgR-MIN0t4?v&OB++xXi)AqJEdT8y6qXsCmD&Q7LT~xig+VC>rYj5-9j5IbsENL@GG>R{nzr z%0cF?zUOghKl3ljixyN29R(BRkTgV~(=LB&+A@H|0Q7d?sa1g%p-F3LBF|Y{+J$g@ z@;n2CbOd&VC?l82zlRRF8&p2V!9kDLBZe_0RDU-*|I8-Ck#q^*omWH|g?USKQM>4% z$R48_Jb%%;U~Q8Nwwtaj1{e#z7=BwCpKu{0zKQ>dnD`JCm>{~16Oe2QsuiR+W6Ccs znuBQW*%24+{i>$D3?=yv71QdSZ57A}$O3eG_fE~rJ4&)Dl7O6x0K>B*x?0Id zjXku6^d2-`iZyA~kZ*RFl{v>m5Jfg`Q&?ag{VT?Sw-!#5#7(^+HGdX zI-%=jD6Ii)EKD;!ePfP8L?Qs{U=2mlN5#l-8?HICS=e9KNJ;~a>~>Ug&ST#pneD*3 z=Y(9obmF2#XAU;#95Nt$^$glI6*u*q=Fi_^+be$Y;hZwdtKG!625dF+#W(O-DdNNd zmq+gbPXt>*ZHfSU3S80;E8EWBf3J(W?z?M5!>|^E-C&H&xNSS7es(Jj`#w}D$#?E- z#6Qrnvxm5if(ny2Tv}mkq`mLH1auZkF(*N8uB`$>dc5TxAytk6pF>P#4;2tujX08d zs>Wrze9Z27`^URBG#f-Qb+yR?wtFx=3XaeG=_f4re8A1Zmz^VVkD~YNm7Y}Iv>7k0 zDw!mmV%pI%+=Dml19>qv!lEo{1RsJ1pY&a`V>_2!Mp)Rr8cI@IRAI? zGIB6kHe`CSP>A3Ql=Y`k0NQIzFqN$XvLbCOS|j3|QqyhJAj{K}SkAfsGPovkSM0%! 
zuj((vvYG@MQgOu2MV#Q!3qmHgeuS6_3J!atETHDwhpZ;73QefS0H&ekRpWrPpxX9b^nfXpE$Lklv^b7CBZq~lBpV97 zjmub*mqig3pkSOJxT68X8hEPh)k<=}1Bk9{+NXMkeLF)#450Pc%;M zB7i7&_kuA-qqQS3uLK%;(vv4@VU}s|)U|u*`DL1u;7DP8=mhAzD773Sq-R8iDG!8} zrWBZbq)?JY?Ob@av&9acEl`ZboH-Y&*AfGiyo)Zi%X#{=8IN&!3#(oSC>%JWmOYOe zRTB3vOR_%c9q%uZ%)2)8rJignefx>Q9y3=+S8DGe(8GL>WI*u_*uBZHpQ`^Sr=?? zkSCr6K&a{GLWfMa?;Mv|HBU?(S_3;GK-~dX`29LvHz68ujMKMq3-tQu04qpXh}2kL zhb`Nay3;&V8_O1;QX&-Oz*A_EIY@BWCGAX1PBwj+Go)895$u1?7~7$A-Fr9ZvRos` zg?x0+@)yA!`}AoBxK*1wrAar72vAyh`XguQeKD|XZD;f^?-ZSwAGB4`p1`<#|MGfo zs^HiXm7jLBnIfW#8uV%1c1YKOkx4$IaxP?5M21`HiC#S1DuVXRD-bzx{~4@z52goR zLwy!MYAeVQjZZmJ9kVAC9x!`5d`WiXi)l62E%bsg0$B-qDh)a4ag*57S0io{`bqoX zgBh@h@lFEWVN*H!_qllQm)(*jGm4Fm_}$yL@z$+%$JD%AgtdocaStSpo;F!J_!k+C z=+ioMn9OS8t>(pgG^dsZHH1e2h}43U90jLG^}_+tJ?;4c&fJ)ofz%LRJ=X>?mv)um zs*$Iyr-W+6u(jZ(+=Vmzn3eK%n04GV>xS!Uj2Gx2VjAl6s*eIlhL|=lf?2AmsTl>h zh676gZZWJ5X11LGY{7#k1p{oX5OX~Bjdp;rz)AbhQ2;9NLSVk{wY47~%%XgVi~BXK z^P_*83Pz@;DffUsCLnxhwrwj3G1`Iz^cE~IVOLSzP|0gz$(TARFW zZ}~%IDI+w*k_?M4G;1H$MXv2)r19=C-lJP{GgCvS1)>(lr&o-hua-U$5D0ArQi^>C z4+_$Y4o)1Dcz}Z=w4?WvO`)1dkuwRLN0cYf@a%VJN)u_1%_zVunHjBPF!WP<(E$qTp_;^$}fm2eHS;~sQ>8Cg%l6E}Kdf3gAT zwbr(6Kg85D%4N6>gJWV7%I4YMtdOHv;(Ss9I zt!~clx=&o_|!x^Ks|$=vbC#u(_P#p~;wjF;BcI7vg32M1D>N_8!i zrgP`WXi05yy{&n?b3BSQgsFFLnrYC59F8;cBnmaK;ELyb)9<@OA{#e`Enu*!>(ynA zcj+IT$-9@$0plA$Mx((Z6Nh0WW5akgod|Mp&p%|&AV$dGS-4dsd_m9K{Ou5)PQe`l zuX`{)EwpCdJ9ac7;C01S284*KfzgxU0iEsT>%g<=+r~IJGN>FDD4WNBPpPEoSGLStb@sL?9^< zh1JdEdX#eQ;IzPL5-JDT8yp6c)z*KXVt6}dqfY3G`&oMc6rLW>1LlnLzrzXS@Rd0N z1XHQP)`N$_y^|Rsa)7fRzj|u1)N4isHH{!$fCfUScPxH6Z0JzIr&tJ{A?+gGFZuqi z4NALLEB$WlKX70bYHvt|j2AcgE9Z(ld$WPRS^g^)ix`X}xf5$45tIBUFpwoqU#UCk zk#I$#t;=S~0yymiNr>4?B}CUxFR!ceK=79!QgVq{^|S_N26??cHj)N*uC z(It^;AQ_n2E;=CiuPKN&UR`8bOXp6H$P!yt)DXY|OF_T{K0I;+0+HCS@IHVXpW=;# zJNn$>nse7=(F-4B`78wE_sO##fU*GKG{wq{UWfo!J4&rw+ft{CbC-)tVqxLj+M66i zC~rB$Sa;%2g$D*pgpvf-76YVOF({fnj~AnQ=Q8bReEc5rdcdqi^O&2XzbuF|X5Rey zqk-87RKeNSOeuN`2VdY-DrJ#oGe+v}x;aJ&TntbVs0Y!lKwvBrc{ma$g2Kp@1IZOg 
zL-}p=Sssze7=xGHUH51N7;q(Fxe6`f$1fH=d-Zw;cO1f05muzQhKb4BojVHTZp2nn z&(8Y#O(-jPD!X6~-kv=6ZTHYLog5g$J1{j7d8(YlBpZ@Ni}LFQe6 z082pi#=sPtTSyIi!t+~e+7T=>(U3|_^s-ft8Fw(~@ih+q>s!aR&CAQviqkdcpj2x5 zG3iVlU&f^{96?lEh(U!g2iYj_zyT3ei}4e5^8b&n|BmbV-~T^wD{l33*{yGQs zdOe?yalhZzT>@xu!#vLKW%RiXwOC!{Gjsm##1B=t-0)LMUbWab@hOT8vjJd$j3pom z>%i1LjzrbT-Yp1STR^2lC)!n}J7{$D;GmOhn%WcKj8VV(L`E1E+)>?R1eM5?-fkfH zS)16Fr`TR9B>QsV{NPuM0i^+blN*1>KH2RA_^ke)x^)ONL&|2Ts9bBz14cv!CQq8K z-IABobn@0@G>ngr0Aw_K@h|^>)Z(~pLd#}ER6u#q=!wZT9lb|aB)qXJb8C!B6|7?eA1f7?!O?S zns!j$`Zlsa=nfyG?r4VgN#=-P&-+}v-ssGNh`q6A#<=e1kdX)&7)pd z;6%<6*Q`sW1u187H?hU1J$rf5{{pIfs)Y*Laaw{7uu`C+GWkZ5ngHwD66!_I^$~Rw zIG^wzpaP!!^4#aflP7QgvSQ<`Ewxnm^7e==X^Xp!gW@5ZayFSBU_TVILxDclghaHd z_&*x;;{l}p2(TcXEf>|w*AKQeY}~wc>)tdXs{L&&I1_=h(%1!?Fx@Y$9<}ySqzcUy ztsBc?Ph#=Yt>3?Wn}^HgSJ#M)o?>|VUk|i<&z@&5Q1pqda69e|)(s@HnDixy6IN4f z_tf?%7#rYQ5jl~-lkhdiH3wNPmtj=Y`jW%KS}s{J1;UmiOlTkYV*xqsy zuw{6%k6p=VvYaG=gdWi~{=yBnpa1e8ERgen*oaH_;I&)B@Ch-hfwPK@hB7YC<1$KZ zxhix~$(xu>UsOj6^Ep^)>9}5gG9P7XpU4rnKQknqvX(QAjlnZ!9vpnap8wi3(Oh%V z12e;n&0Gz6jPQ)=4yzQpMStz~;|66pU0vO9XB%%HmCral#CSt`9L>iL@@vFMOm{R8 zq1t>xxX1&wUT;$}3Y18AB6jNQxPcfiU>oG)p!I|%0lXSCH(632IPr->0_qhT4l+@; zT1oAZQu5~YsXE?}a?)sj{F5PZb!BL4 z$E>jE@?gJ1hi*epG5(=DxxARc9Jcj6UY{1f|c_ z+O^ve9W9p`h&?Qdtga1hB&5AVhp;(a#PR%pwEU+2HlZw33FYq5qv19CHqF!S0r&l2 zW1@1)&AE_2jpE}|LCPPEG&f9t^kBO69qi(l8Z20F-saOo3apo1?_^}`0uWux6{Br# zIOy2n6DM|ChPxHDw6tv0tj(=EN839OT6#`Pa?w+w`6`tEl#KF^0Z|6 zN!B4o7wzg9(Y=9?!S=Z(qpC+ca}ZIwa)^{@>{*V45xVB%gB|R;!^e(Iq76rMqWe<3 zd6Z@Y2S(p>Wat9w0hioi?fgOy1A0T9N4lqAHj|Y~7dR#_qqwLD%<4dBXlQi;t8FR<+wqRB#~u}U?qS=IWr)yc4CPBm=*VWk7DmDFN21Ibmy5w zBbD!W33PhAwtxP*a*P7JO>4xb)#KlV*WFgsiBdXz?I_uHqN)I~hospQyk^(fPBerB z&C!86y-JNx-V?TZR)3ovg4{g^E0hw~-{1P$Bbrop%(8;yn+}^RZL*FW=%iyB)jFa# z4FPP|#Knu>JI+25?t8EFouYRmZ^P-I{_-~snNxYAGbca2lJ)iGy~-=ws;XMQo91Ah zXlgv>`pdF+HI9*7d(QbcCObKOZue>9YbDT{W{)o%q4cegiiT}66eb(5$R?koKH=fZ zLbm&9dHe2iHyfAR|CCSCi^aK*RH#b46z&%(-WckX+qPcO|A70mg=O>LBgAVv({1leZ;OaI?J9K)po~T 
zBx6D;6vg^pp>7^*DV$oHHpGQqu?b!}CFE(6M{cjS|LCc=b9ZwCNZ8R=UrUI0;EUUo zYs;6HY#Fp6tNnomk@VV+fu4~(Rl0ZDEU^^?RS1Gl*~n=ixDEu^ac(})uSzSA8Wa&U z;OfFl)L3lW@>0|K#ZVRrrqAhF+R9JFI42To3ZF@NkM!>Nv6Tg(KIX8w1uc$g2)S<^ zrcAltEcqJcv?G}SDg}= z{P;SugI=eu3um&pKybv2MgM*FUV!t;chf)o9aQ-7qY2Rkv}SbA>@up6QACmR{~~Z; zd?}g{{+KqpruPDicGHB_m-GiNO*waIv?=(1a-1TVE_|7@L|h9ukj$+t8Mr(w z@!Vj$yC5|*2#Cx7Gv9N2Pj@!?M7MMIx9O4UCpayc{8n{{(L_blD4=uOxKZT|!|= zRS}!dji?~{J@gzx&wO1@u3<1fOqS*|=mP;t2?6qo&pa7x?KGS71YDO=$kWy48 z0SvIedXT&|xyBLj!-jSrHR?zB2AitMoEa!J9!J~)l>lZ5N-$<-2BS`LFEr?MaS@Zz)eyAv zP7+$tC(?7VVP{=l+?tUleb5LI$pe)7Yt~>x+fI8sG)ztXHYB7ig~rnhA9n9rUzUFw zhh{m)AP?sbJfopR*O4`fOac3NEOCo6_6syfI@L#y%nR;JjJA zP=o=KVwBW34FhfuV!W4l-62Idkwl3AiG*``hrau-ScFeJd~c*uQcF^T2+ziQ3!hKF z%dR>#ZtEFPW~wn_yY*yGN;DnqF=Vtz0VO$W@L*xL=p;T>dAuWth;J01Xo@ch3U3KC z74??bERj4&&BxS3URWcbS|ET*_kWh30)O+UubRpGkTQsRMWSJ4{s2#vVb78(1gq_x z?4!N5GY^8>#hah>4FSGn@G2@QN?30LqQBE3kEY%W>Yg7d zf9emADrA-S*2pMDgq`i$9Zt8I5opmD7$!kat~h0zzRXX9QKJmRmvl=mOE7aTJ_@cjv^pa10JLKd{F~*`w5gMN@3M*b z4L~5zPq*_CC5Ft1piZJ3ZljZ59uwf4d{LPvuEx2Q&c=3i8P+DvsECAk*S?$ zc|RqiB9RAk@aiYlo(19ulVrz451Qm1y%vL?MEFpo2m%ZrAGJT4`2+vZ?$WL>-KJr; zZfBG7v^mzPupo#+%EnnIOeZ!K+zO@ZJO+~N?WE&NPcEQ}$Devv8S#O4g+5UEnNg&K z_*7O_rrQ)mS%w2(q8BpnKhJuo593N7@Zkh!;Ae=X2cZ}!DH!S^uob|x1H7JL{Gs+r zf9?m6oBh`lGoBGh(Ss}_;+p0;IsG!xbBnaUJp#jBe7<3{O~J7yu8aJpQ_@L3H@GvG zGg`TlECmL0{kgzQsJOD z>o_jpIN}f(0R=$oxIsD~49v+JZIR;PPNVOdUcxOX0E%mZHAvh8-VV1-8gd?FdG2RT z_wnIga3b-^L5rQvI-<=Z|I8af=bOU9?h*HL8xh!{{qSuB)62k}W`|TK8fo_E+qe0u zRb{q|B6nrArbw1?MVz}RCuteYK{2{5J8P|PAg3kQ+A=0-(wItAh?VweEX^VzBMfnC zSPPUR_Ql%!PMmn5vF}D?<7-_I<~MD9E2DXP#phR)wBypXMV8A^d`Qkpt}WDqlG;c3 zoO%40>lt^ZTbDGVaI-CLuBNIg{J;hCvoY>w+Wt8aLw&Ob@f7(&{?3O@dbz&tV_5^C zC~0|s-f&zUM=U*lz`wSaa_{$~i?WW*6;D8(7c&noTE#$b*xiB)N zD#z}ZzK)9DBq^mi6fv+>>9rSb#Dun+BtQu&M7zuxAfuPy7gBn*gS9fbacpcV5&VKn z4Pz`;C-uK0O0e7%FUihQwKNJ&V=Lo7Jb3(g25dA)j8J}bXSm76QSSjND=8=_AU%_$ zv2j+DP9#kax$^1#`!ue|O!ySRQxLIpudzm&D=KdO^67f^KFb!@u6?Vo`IhphMVrW^ 
zB;}PiYnJ%SwuK02An%Z*Nh&hso-H(8&QK6B3GG|f#FmgIePGS6Qh;%_r39Vb%pQ_a zcW}cJ5XQ#hk40xh@hpv{r0#wC)bB!beQ!3Pgt~K|5k0HEd`7nhCtB_As%rY=!qiQp z{LEGSP8P8lutvy47Qj9}_yr0a^72JrK-ouO)ark;lfjUQH49B&Fy-^8D2PU}Bp{vS zh%*NLjg5~llCjO;FPyX6bOYWxE@(8t$8cI3BjeO)JJ07&>*I~5n3zcT4flGWmA1Tf z{&q3&fcl~r#-B$3tV-CEh$CI}8W|g%`HS#wiI(MRl^7$D)-`L^jOGLWK#K%-#2_T~ zAa-9a`eiOU@^RaB=`tXvm%cF{LT*kNe=;j&vIv8RnKXBope~ywD-GP&6gDuLXgII? ziWi#eLzM>y?nyiM#qIG&RkK(8&Nh zvmF1HSU^I%kOj+}dhnB?wWA4b5ps&x>bP*>-;AS-*L(i@_3J8kMNOSuTm_&?JmI%r zzC4h~7^n(*vw2{V2@{!$zI{&8xh3WsPhF>7dTg!Pr>UF!8-oB#tG(-<4_TVqM*olf zuGltRqY6s2%Gbl=h(;dGbFgT zLy^$k8d>maZqt(b^Y3DD)IU9M+%me6Ga@JXD1#UBV#~v0NSRp>+LY5c$KB6ce|6gQn@RmnzD}>TNH;gfDN|UfR~DA z;R98Xu(wp3kDE&%ukX)0Q`D|&EuSaeh&WdZ)XX=kD&e^F3tKCxS)$HwiUA>-D z_bUAOowe7x0Q189WOj)$OT71N>e^ug?D{q*1RD)G(B7 zJIQH6h$SXj!?f``nj-x%Y#NzRRo|>>*_ub z=_F+!Hf^_Pf)GL#iMW&OWaX0wR)NIPo6sP4<|tfMcFkmR)-UPBUH(o^Dj+ z@ejlza4In+_3tXgr_cHty|s7cUZk3H+s01PQv@UV-Q$N7sK=#u-tE8 zoE$C6bmLOzGd*dH?+L2F3-?T?G-p z#6}%w2nh6zpaNJpB)^?-)|XD<7oROHEtq`M{Q*P$Bo{e*-Cg4|8^!4OKe$xJSzsuW zabXfkh$>2?7W`h(@?=sJoEYOe2+k{K&+&gydIG(te>P~j*657fLDEYz#;2IVCk><> z{^tZ>-4K)rh#e%-w*&J~I40#M1E}hwm(k5`8l5N(DJI2p*Gk_lIrSLda0klB2uN`5 zO0d&^$j=2=%&9q8(n>W^a5Kq>g@RljDXkk^I_4Da(04Fs#V8si_ZN0hyfN-df0Bv8 zdW+dDl#)@*4<#k{%Zu-ml+qJ0Kd0tf>7E;74gL-b4*pd=dsg4z;RwTy2iz%3x%2T! 
z;n#r>T&jPM(HrYkbL=P{f>@XS>%a6}vFfE-T|LlDY=Y|fY5viRMvb|1`O(pKD$}ik za=+3{32{Z0mW)GJ@H37s&ci^20dNFEAUS!OlK>5V0Ix`%qKswkd&6eO5&zWdE`{&= zJgB+{zh4aM*@g55&;lH8wD6YDEB(djdGu2KvbvTUNSa@ z2n;w}akY$ITV#?jD$3c}&?$X~vWCV!u7Rr&z19{;af*veG6Kw0Yhj*&DBplAB5jg< z1ez4ezJ+w7K#d?^KAki|r}lZr=a5zH;$UGG{2uGTd!!yxBK#YLaq77!_*pM9XU4)p znPaVqmvsTv*-$b^PmcZAdR9@4B(F=#4d=+wfp*m<{fjiO3v0veLN++chbA)i0QtcX z#@rZAn83{@;TJM@iyLkoG7_u>woAI$pr4>Z=j*&7F-e|2!eD?Qu|j?}3Z50xwpGdW zpzdSrcI~uDKY84;#my}(8@3rfyhrP+xH?kz*M%-?N^X1$l@|>gd*$UctNJitz>(u8 z{#jm`cUo_Y+BSg>;RW9?`IdP~(0sCVz7(W%1G1w4^8uwGDVMNeKd#v5`a$JCCY!c> z?|9MS_W1`?gUIL35nc~oqXL}@n1nuJ9>@#IUD_ujC#Q%FJ%;o8xYTuBOQ#a~4TT~2 z`jlhyeyyLMT>83yon4PX)v3)qdu|?SrTzS-T}GGYif!Y!|K4MzcrjVA>}E4xRJOrx7u%qMZK0PyR_-%*iV8Ho$Ol)dY+8JILB_{Lq zdT{!pe?SvM2E==|wutHWY|ZL1cI?=1<>haI<#j&CI*sUr%*c4Us$a7-gtFweK}XBH z1OOql|9D&Q<@C|jJt?Z?Y=0x2UtzzWWE5@Fjt5EAgm78F3l>jKix1}q5uv@C z?*f_*Y|`WKKN%Qzu$h5VFhvKT)OJl<#EJB35f5#`TS(wP?o%M!AaqGG2?*d@TEDLq z;fX;O&HWxuJKf}tLr6qdpQ@+510#(Vb<&zRk%)-8D)k$+G0J8)*DCz38Tb05<;ryG z0d8kXTZBCVSCgp$BGN#bITk?Lm$McDB&|Xh+uztYef_NE2kovjv0k=n)u?q7Gs9;T zsIVESE+BriplV57_XjsB>FYXksdD`>FgFRzJank=$C|)FHk#Z2TG=H1YSdKa#x0e6 zpl|hKeD`|p>l~TWXxQ)_gOj)I>pbwEf2Qj{O}HAQR04y@K5e#R)w8@dM3<77PpIyK zrAtkyB%pvAQ|XBjW=vq2H@7{?0Y`QZ+kE-my>eP|10w}PNp19Ubq?9I$j}f>%qjN3 zr3eg%gvR*IhX;@zO5D)&5!8Rs2{Po0w@qiiw4|;9=KzXM^azpn>fbZf+Rtx0jC_ah7n>lC z;M5?UeYi;Gk@dNx#cgaa>o7di?NO~nhk`|v6RXq?{Ru0ZZAsuuXVws;igtoEg<`}j zNv9TyyFkeuc;_c|(I==io2gF`wu$c8#a_wRwD?MsHAg|NftCKYPyUeEOoJ z_w=E}#eZE+l?6UKbamnQ(|v=`j>CNrQ(UV*i(-is{Jb;Hh!J^phpEM^!}UY0Y`*#* z!!L|jn3rAZ^kB)N7I#y+xvk#nD6$NUE|3lyTGQ9v$V6!e6(^pMU)k2*-Or+L7Lx?c zs}ELV&J>O<#vZKBee`Pies!=emIVu@+l%@2ydZgr!VN$yL_S&G)6$)y1`;DdrS2r5 z3$ndzqthsY!4hEsF?vUchXx7!Nylc-u|YvbU(rr7KeD`IE(VD)>*$!Cq;}OK9FRaN= zy0YNgYec@d;iA)09`Rs7su!S0LBhfY;c?*7wqIm-IHL3&nr`SgAS+{^s)nGXU|z$i z^(X{sSQZxEN?o}4`n>t`w*>_qs;gV!uyK9|_J@$XlnOq&M@@SN$GU#D`dQt_Y~I68 zfxolTCoR=bRaIEF%+P7AM{@V>)pZp$t~UT4dL8N;72R!MfO-0SLdhWhKfhWJn-}jp zK^a}bLF6N0sb|*xh_V01*Po%QjI6NoqfYcF9-TmmK 
zVWjQ<^uNh$;1BxxYl#dfGKV{OAe|_d|z!j`H2V-%vMb zYW2t3sV2&r3D%3=P44yha%lH7D~C24HYk(zCp+T?u2V3t0^^jtUt%h_sl^H?T z>@$&m-KU!WxPGL0(}?v(`)K4id}f*dQ3e;6JYDQkNh$LdRQ`o%=&Hv7YCe?(0ouof6oOkZ-#v|NF-ei&~Hv;r-gg6JV*t` z_491mszQdMa)~f&A($c%#VQUMN_lC{ph6mTjlcSZ~O!o=7oeU@-*a)0)+1Y|(h^ht1C^4Ue^kUtkAH z=p{|N(@KBzT$mSa2PWQoM@`ww4k)1q0MsNRE0pN{*SvPwmL$;3tP?KWS3?ELBZ(7>+Quk1g1+rAiU zqwQ@o#oEPe+2aOkgVoMQea&m}DMd}qFe29UZqJr|^;+rPF9+rjf4^97L02oVtX`(N z6U8pa4Y!N*uAC?+n?7N*XdGQly!~+w1P)nlSMDwm(F170HXy*<$%aJ8x(8tg`I!`r zPJZtOYhQPb_#ZOKSLlLK2`nO7H>FWO%gYoJg4)t0;xWXYCHD9^8>J}}L*U@SO5FMk zlH!cDj#+A>PV!`WPL7Sd-WEu{AVrYL@*#Ljm@3+F_6D<4CvCekZgzy zYD?d2LsQ>*vpRQDoaX3OSHIOJy#25;=ShRqtDhv--yFvXoD~HQ?s|Qew*|E0-pHV? z1Lh;Hh9HT1JYE!6sX!5gTX6fJ_nO2N#TB`E^X7f7!#7&;KmD`A$`~n4OSJ*>P?SJpBu>lSuQmyE*FlihEQ8ZdRWZsgiEty+0)&eqoM?sUrZP7ocSsHN#pFqFjBhjujQeb0_x zssgR_-0D^Dt_Z=F3rkGbE*=Ri<)S18Mp)UaVuzA`A&3rLjNgYdTlRC`TJITeuwg^C ziVZFVl4fDSBK#_+`e_}52b2NiKvUOtV#p1;WPqpgP$8&;x?$tZ{aN98&@$jJL#wLtML$0FH29R2tDd;BU6ZBPH{853@8Mx14D5$!TLuzi3~~J71MsL8?l20^;wg{Zmg;awkWUh|#q^YiVK8WP@w2wHsEu zs&{#wWWCt57m<$#2hpq;su@n*8_UE<_qnH=P5Ll`3SWX?Ak`%K2kKh_Y7jJ#b2*yQ zG^b``H^qGu!&WWLt1K=1-|*~^W!9_Os)X;b3XUB8>SAz!N%VwhwWqIreVa6nI{$a- zJ=21o?XGnLOlAq_i-8XB526&xOK)PP*rMip+x4&C_$G%_Eb>+$i^ry*fkv8n6O%x%e4m zBprD#99HZnx!Fph?nKzM92t_-Z{-J!QYgka;=g4jv{K%5Nv<_owHp}T&u^PG@Wa#I zZw#^`RO46nAA9QFJ+Bd0?)WHN+pMLPy3A#biqlMo&65j))UK)HHc3mG$k6Qb^&r*)rd=IwCTMMI-(D(4B& zPh@o`s@;KGhJuHxuGGu%nHa5>Ra9!zeJk1?=pPbRwxbTyw2T&I8Cp`HvP!-7e%e=8 z8?2t%y4Ac7)2?ouzN<<6s=*oA%@kUs9SDY5+-mO(B9Sgs#F!O1%z$+$E+-CSW z5b%o+`UbKGLO(2f@-satA*5&}#d+zAOT7V2b8BKFT8CK9dR4pXL~d1y=Tb}WXmyQ& zmHX_I?XO&ukEofG{VPne1mwWkom85JGR~ZK6!s{%XNrOq=|0Yrb`Bx zqvqnWU0KcXm<#xe5dtmB?*02El-GP}-p_)qEhEAl9ELcbDouRpy)etkptVbpNqu~w z@+MtIJu2Ddk?Q>qU$H{%2^v zJpHmS6{pPZHfBTf=?n{TdG>s&S($m=1kHwH#ypuHZPwS{l%(uTM7$@^E_ir&yj>7x zvQJOc*N^jtsWQ=aNyD?N?(RJvbC2rymM5)q&zD&-Z3;ug!qrq;I}J-Rq8 z_(WK?=l5B&4}JoS2KX%7-L|K)N<;C)@jVg}vu__ZHQ4^Rf3$~t;Nl4D{;8vHu5M^J zL7}HsuRZKT|{4|bjrvH1^YKN9B6?!mP!$A3y 
zN_hpT|#DK0Q0J)s36p9!FG*it1iY zDR&;>JfLr*W5-&aAMtjkMbUK9Rz48QQH;Z% z8D31qpx>1kMu_7ZH*XR|iSWawtUJ{sw=7UX^XAPZq^HBo?B+@P4}_p-6u8*P?G)x<8}%ac5G+4>zY= zkc&;jx9_}-m$q#A#&ye<4*FfDo!vP&rG1y&(d)b(H1zYCpjF90dBnmYfB|3=t>}g6 z(BVHMzMg& zNqb73MeO?SC5fE+b>Aj%ebWFgWacKEr}^^82LU4wAO<>b(U;(hp7=f9dKZ25C@sI>+UcS6tlV zwo{`#uZ@#QyJEpZnFx(4)eg361?5J%yy^F-zM&=w0DeHbQKeVZNqQgx<10uMng6fb zuvlESF#PXj4#4Z1`dZVPubUCx{+|29eNX3DEPMWKu62Q5PwxWbu4hbhBY=EGT1%ln zhF4%4G&9_asR6?486vVRUFx=FFeQW7H$leV!JU#ybC8KWJ9tyiuz>^Lv}oO0B1IV( z;tyRbmSq{pTR3UgvCz<&F%ha{ymN&ZD_HwEVYl$&ki~;a>Rd~-8sc++%z>XE1#cm` z0%6f4wFsqxq*^)IsUEvu41`jSx#Cl;W!7lLmT&e9KDTx2{E+1HHuuseg-!TcFn36V z_NAzsH@tkZ);-!zw)UUNBaiZyCA1Atkzb;W4DZaQ_Z?-LG1yr;xD9hP7IL@XvlQ8# zjN}I9T%BY${EO~RoOKNN65p_f#zS7LjA3a!uAQ-y7M!?*@*-@Isk+o1FYsHjJO#M0 zPw_a-w-bvs5-c{+dPzt3 z`|ZKM$UQWmOGC+#RUOos>F5ah@l8lnFLGPdC6gHA^7*qVJp-KKfuh*7zne#P|I-_t zApv98runwpmE|BpI7Z>F*|33}wn9uKV1Y91kU9j1DvG!5R@xzaUmUc+a1JZmEa%do z>Kf7hSjGCQ8_Y}!60WE}b@^lGG1sce*5Ccc+Fy6gozpMjO3D_y0UNJxuC)$MN?}Jm zuj{$;Rh;7_CP~1h6TEW>8T+ys=?s-i1J43BantOQ1CLCFHVr-0X?d?&TerUMz$9=|SJ0l>$ zV*Wgsv4nf>q-p>LZ1eF+*DHA^qFesv@|Cu;%ur+njBKf-cFlg*!P30m@z*xLYn{4j z-~OEtq`Q{=`E8v$Ia`erflW#N#bt$*!9WVn^k0_9a0`EfL68A(2dW@OyAYq zAoYred&YzImTrw3HL)%{)n$I*x0JZX6DGXTkG8V8Id9gn@QfS?KAp|4sFz8XMRH%r zSYvp(Z?D!pQmHimla99pv(Uu^YcST;t5+97hq}HFB_E`BZv_NRQDuFK?WZ(XQvb3o zcW$or2eTNcn@L|MMWcZ1;DYkzal^JYmF(fkEqc4_m5nAmhKZrb5iDaMIr*x zG&RSG)NXjQZI^}x{-v#Hs9gD@a>mA=i{>vH(SOR2lWGH$ZR}oN@riRRvtJn<%?G7D zBdIF-`{)?U@L4zv`LHrkK)^C234Q?qdoVT+J&7k}dbm|aak+NQYyzrjjP{+qU(d`r z36aAUw-{p?&mq_LL*ZmiQc}oPC61MUs_m7RmnS(_GVmDx1nFeCdD^Nq&uinX|I0H) zGB~q#BXS?qjaU}ra`G|Io|~cd=C|$wuXE>gH(iR|HEQ`$=Wd^0#>J_qshrJPu>}re z+O*8ZFP>&^Cj^&uAgm)ZTyd$~T5{I?!kqP+I@sZ!fg`tgSCH$I`m7!4YOyNL-Nba& zLh-ZL?#ca|wi<#FulwNGn0W@+#wBa2_ew41~u?uq6K;y~bssdXgLUr7SqxVz1JMbX%HR6O9p{&8ZcV}R{ft0vx4N6V z1xTPD!@bVG&U$AQq_Pl5RsH-0$F#c0{AQjOPrtml?HhjOO7}5?{j#DR!l)3)_gaTx z>*lCwK*ohcXBrSyOsI(D14hqF-%2`KjoUQI9L-U!-|D17sl>nrsT&x0_XC>O`1OuX 
zy|55Ur0%#Fj_tE<<-4nsU-A;A8~}ZD$$ezy(oLdRxoOa5{qRc*d9w65Nhd8@v_L`A z1;d&Yr$khqwjJ30XH9|Uk5j4fz`%Pg%1??CqK{dMgcrU2UEr4AY<%FhTjNvvMxUe8qb9`qQDvfTZWKIBCO54d z%TDJkIye8$PSULyI5p}2%R-%fH1QP4@z4s(ziQV8X&!5-*~K*E;L~?s2J|&;+N`CL zde5GzpN`!3ZZ_TG;N!8$%tH)xUFCju`fHJw?~77Qe~4=jK@cft{G!J>2HOudT?P<6 zN5eAcDIPGS&qQEJ846(sy%O|qJnpadB3=K_7@QqvJ>G&il`C33drnBVAr^ddFS}w7 zAV>iYQE*ZvOafjdZ6G24%=%zdw1T-=<{nF6}-LE{!Z_F(BEYzg0FIfJ8jjM9JpL@TGO;?QL#q4vs^6SR`44<1BK>u90r(hO`^Qho{oVaF)uDKCn#=ivUZ3{cy^7(Qyfh=F-? zr+@g>>(KGO@kjghQ+c|M@CfDatJk(Y^?XwWY4%=uhn@t@zt?`gf_3obp*lJth!x2j znq6xP2Te$lq>52&$w1VXgmejqP?TY5hGSIpb2OC_x#Y&^Z4eMoI3TAgRXp!hjnwGK zcqZ1`Qi5Sx;EhTeIh{Eg<822|SBE9utG5gskpkB*A((W&9Y#8zvgs0$xL{eog-N@z z+5j*BTt0DX$+->RF4}*az5T|HwGViI23*tB^q%{o&Dk5ypC55bFX@c>+%86I%ZD8u z^jc;b`~>}=|7N7yYv%?8VV+KC#O=l>xP^a`p^`T;6*nuOoT21}ZkCvvu3;VB5W%Bu zmJVz&Mqz%faE>&fglc2aRD#KoI7Kvd?E63z{m@+cl`SrWd{VP&OfDihU;PrhD{mSg zY`R$M3AU7*nwahJ^vOtg8%dEMCSi`f%gj0CW+zf1@~9)(AkRXksJqYgbea&T?2))) z`|Bj{lZQR(y_)^=HS%m~s%OuDvx7|@Ds^+%)W5&qr1aajJsUP|3h%p>v3_C^m5_f( zClXuRq78vRlu2JO^3sA(e@jXc;j^Y(LLlf8BSL&4oyjT~-PO64?I7)VGEc*2Xps_= zL8d46II)0AakXfXcS_1QiUR1@ojAzWf^aPy_QElxhALFjyKn;eV4lW>b&gVE^XZ^r zC(gyjprU+sSj)S%;@Y}zId+f3{2pf#(x#O%&SlPd+@4W(SN3z~invU4S9G<2Mb;Q~c(izSylZl1d})3VXJ_syf1Y9|$&gm}KX zk)V^cHYF^xr@yH<7zpevVkKg2ka?Pz!n{$&v7)$~#R7{n2g$w|w|eL8~ff;E!VU9`AZ<8h)J7Z(@{_5=@d?RrX9- zZxTYyyt*OeqmhK%W;{>f_~>ssk14%*@O- z;ZnylD#3FcG^hJI<5oPb@2-nRk#=m{rB-WAGE!XHb&s-`h!B!hpH-Qlw#k$mK%$|_ z%FNlheEvMu)#!!ka^WVJn9&85<(9>u7P#WHHtvq7W}L;|Nx+jDDC%F(cL@798*t3Z#Vm*=E?j1mR&A zC1(7w@chs?;72efduWVg;>a=(6@L}g}bUEW*9XGeDU$bC%Bmw4b+!zbH;G*^k z&m*gp?J`cyu6PYIt9Dt2XU)DfQ`?{OpiQ@KTXffdYj!JhT1LjN@+k%l253!Ep4m3s z-O9;WbdcmNkjcX()q`mj4LA!CxBbj8&ZT4-Rd?m&&%Y?3A_hX(O}(;pCZRZ79_Z?< zDni<;Jh}VyX?e`fiwOxX2{rvl`w2pRaIef}<@p=-7T@(ex^()heWx`wC0ZE|Qhw|H zsLegCHr%{t+ucPm(@l08S%zxXi?EKPND#GOU42akzI90SWuoA|oc&!f&v6ott$tTj zbgTHja<{>KfBimoyn{iNYg((T0S@N^0?zduvg*vi1*2EY+Bc_{a0Exwn*-y#j4L|zHCEr$ zy!85)mB^^^{FR-&sLz8D>Hb;8)8)1%YcCzVuy)IVQnK8+l4nE~7Z@9L-HA~QeV3E_ 
z%0%0NMl)PXy~6qihLF#VW5V#F-gGbP;0rWTk~A+9WkILhl_nV##gH*5VFYf=$2*K1 zW@$0^(fvrb+B15FpNtRpu`Wx0XtwKmrw+FoXE17eY+LkEsC=a*kWmm750@2v2#s6+ z$FD_JT>d9UMg7=gfd@TyBh?iQV#CXIYhxVEyuH>4bgx&ST+D}U+ z-~1c-s`M^q;bO)9)!M32vwOw|bR>gGRpm)Hx`KBAYh3t-QyZ&=XWL(09XN5beT0?g zj?qr-yNqg}x8<|*vHLxG7+5SwnR#MoV2`P%`o?!}S2EMs$(PJW*3Caybki8Y1tUB4 z(4qaqw^Wzd-}va0Q1v=th|!XS>T;t3Lq688AvQ7t35;}EZp`UqpFWzFUqupYqy3|} zIQ4V(5Rb86p7$85oLpyVqSNlQwTgRp5(q){z%dpR)acS@pF`0n$z|6$-{5RO%i5X= z#;);Oub*sSxZ%Ovt*Mb0e<#&DTur*O?$M_=p1a5T0|ZFK1sSTI1j)c> z4xOf#{^D}mGutx9n)f!3x}$x##n%#fuEjCPaQpPY9In$dCb zR$*p(o2`bp<$Ks!6oB)$mXH9*doB;#lKcFt@fL9#)~}E9t4)db!SsgMONpFKHeDe>FIhZ6ZIq7d7wD@x|LO19pw}&6=~miwzP!H@C6$ zUNv>|X3u`nvzvXm70LPQn;fJdL8a$yxSh_6yI3jV*GlI$6MTMmzKefV3U;=A_Aj-T zn&xeIatI~C>*6b>(d;@n>?gS6`ST8txRdsOLs2YP))f;!%YKz#u}ogN{3cJwAUJXy*E%}Xyd~iF5>%C->D!MGe^lgjkjkg6{c7K@$8*SD?HPx_QKjt0%k(ptainzBHlRY%Rm>0@^T=w~@4O!L_ z)VvaW%>>-&j1>~ci3ncDvSr`$h1eONC%ca!4rw9Ne)4jm817LqLdvJ&l|U2nj1IL~ zqIIbc2oCDqeZft~8nlTZwfyj?n~&!2-X|ICY{$Np8#fSH_?9b=yP|Oe&@( z^rdzrRh?<7KJS9UL+#-snM!@F5CyL1Ke6W{l8!*DP=A#I%(9nMinm zYb9i**IygjhxdRz`-ut|uV@f=|29PRsG-cH8f19Q>=X|@3d%)f7IqltS-RED?S_Nq zL`8)MCwkV6S+4dfWMF7UJ5vXTt`zt^wYAay$W?-CQ2HVW-NMhG?-2`!L8B0{Th8(g zQ|@YhqU|Wn8%L~o3b8_QPTAV0KRWR)WDqz867zBON5lh&QF6L=bcuo#ATZB*+Z(JZ^?e(9tc5{OH)6jjV@Edz5+Hwxd zRq$>|Su~f2&zzY?8ysKuPo|pMyuWQWnKyL}ZbCjn;g>IUv+PoEoiduPe9WnIxDU)F z8I=EEUl%CaPa*Hd)i*ZwPTqxwDQTC95g}VtxPZkm5xbhl5`9c$W>iHU_%xN%LJV)z zgM?K5Yadlo3Z-dRa*skuMq2#x5w|S6M*+QkLs$9~OzU*_>vHhrDE*XQ5@1X)r0+B1yvG3+mGcGn+BDGbLARc^8j9Hi1|!0xeg z_?DNLnN8)acRH7#ps5rt@sU;d%BarPkbN{s*J0wsG{>?>%?y?3#OJ*j_Warmlg1u3 zbDqq0??lV`hOSV~=h~CF#&vaVmwXRl$FU}yUtJBn5@!><$K`X@awVz}5RFmdL6BVm1H(x*3#=hmH z8e^wzFdxaarDH%kTqjMrftKMJUcRsOgs8PsQaUgx(YkB<1Y47@^;(l#E8kOh#oz!7 zC3zv`#ZB6_b(Tb#(~r82Ycp-yg2lHEduE=EGwjYp*3UQkaaf=cvA*AMFqU3g(v4pe za%F(ei=$+TV#vXg*+4R~-n?03K0(GhQj#-pJJ|i<-Qt3#wjHXW$-Q#HN= zB2&ASZ0gtEF#0nhm2*#z^c%AJ+;Qii!5k+t;1V&Yfpwhj%n48`aUS)Xe?Q9Klx}VP 
zTdD5h&0{x1yU#FIR&Ay^YW0B;I>Xl;S$6bS^n;y8SN4CnbMcGVvx`pnU(1%d0^4xK==6Xfl?@?=2r`z{E_o%k&ZAi_752N|7oBpFEdf)$Uht?wFs{Nr}D2*eU z=hDzljLdeZ>AV)H8ppHE*V-AKB@Yj30Xf#jnHLWiR{sRy{K7-ww` zTk{i~ZEeEW+wJxBy&k^d6tVv>e6Li>PCS@;e0Vdp!AI|etzBDmGTiW1$`+>Eh#&xk zKms#?}sa5TIp# zym1|oD=@CH^HLi@lP-K3+@aOZ@D8m?t*iZeUEP;qY%tId9~Ix?u~q&pyP}xnqYV5I zpN&O)Z4IUSb{?mZJl+ z$LyLN+qP{>`urAz>$U`lc8^&t5Y&cw^)Od09P`QO3SwdUox zbA>Ta(H@QY<($+Bq)$Q!DdZq2_jJy`c%kMh3z+hd@uxp_9+)|OIz6%v?#qi+`lg5M z%Yz&z{d7-Wa=_lPYM9d2)HzJ&5Ow~1SM}FNdQFxAr-Iv%t{Lw8#ZAv?^ML>ohryPD zJ{YV#cy{oXwX<#Sg_J*iHf0u98i)Ab{QEO{OP&Tza@<_qYDx8q7fI!}6*3-r$B($$ zxJR*Pz~98-VB>gs^ln+ik{aR!CR3^M7Fk_u*)&98J;&X@Pa*E&%q>#`E%2I6H(ol$ z%zv(oD#37*`|`nJ&oNYsvMaL}rig8Q^Z*m_l_ohj3H3|KDN!`QFyyE=nFmgg`(VRL zCnww9i65QlrRKN>1TdP$AEh2f&-IKbD$CPpnlDlmHfUT;kDns=!J#gdD zKTO#7eM|0^0|6&ayv`cg4@dOrX|B)OG5uogz+by|Ennq0BqJwssAEyN&RGxb+}Wo` zSCra2Zl7AbrOtVpE<|SqS3E<)CIWqeV8WD27X!$@eCjvr!)up=RC{|HN9$i)l9}PM zb?(N7`hU9E#7u1#pY}Ml=cEB0hH9Srw)({sgrl=k95yHn8mOAzsA~JYvaO5GN7)s} zcqjnbi;bLGWZS8}7tjq6Y2BLAQNq_K0A$FJuC64nu^!E|>>G{R>Tt%N(x72Cz_#Nq7O{gX5(F{LlE?3)jQr$B*swmrz#a=jSKnt55d) z+vvr4k^<}ElqV-D*hWj{GMbk*8EY)Mo15rYKAr$Sw3b{%8sb(NA@`1$7SGuomwbxg zWooI1Xz;e!wRqU1rW@+aj;(p4$VzJ34RgD4Xgey2tLzP^fdf%pN_~n*EB5Tc_1!+@StZ4c?`tCY9a87k6`?!$p}n#Q{(b zT-2bv=nO{Ht@TyvlA{R6!*8SC;Fk+~?eD+H?P9^e2M_1hZJd4P>|d?By2W~((JQ!p z@7~Gm$R2&0YilUKdObg@? 
z5slQTsNwW1f9`mo8d6D$r;iknolQIsnv3oNay{-$KctbEVR<=(t)Z|)Rd z`YSF@!FoYd6VEY|sdXsghH6K!%P@miaL5TdjL_u)?y0G$i)c(>*p5Qb$PBFeA0A|8 zre$X**woG+k`Xii@{;>H4<_z!e=lPE1?-5{?QgNq1&tF?6j-q+*NGDp>I&yN=>XtT zSLQ7Lu~-7?kYr@P%~0?x`JOeyeCP$s!;R?#@3F)7Uh<`S1%m61z=l3v^hX=M9FN<% zXyqD^O~7@Fpu47#{&$9J51hPgS?uE0@hke-h2|wjCz~$|?~?NC?Vd}T4GQ1)NpF$w zE=jVBOZPnD$_-E-*78IN48KZz@MB)cyf zysf<4aK*|L&HSkXJU%B*H5lemeI$!5_D zdL;~WS*_1LU@VRA7@2Fp43}9&S3s-@U*AWn_AUUkHI*Ny(*ZeO|2@n0V8&?{ZY6h zA@9HhpTa8zg(Vl?XwSdwS=*sy%RSm%`}?;$!B<61FpN@ErrL>E6fRQcUpyLH@jxRk zw)EAtc4YEJrYSfB)KlK%Ec-gs;7*b0?7O#bpZAS=iY}Kd*Lz}VK_)9aIKWPK3Vf*X;GEo@uVP%P+0Co>s8lr+98S*aGii> z2L_A2T-$6A(XTo>2{~^oS~cCZ&tJYyo?J4wE7-p+KLm z=aGs|^iKRs=1sEBa$DCtzcG7I|D43=x)&G2wx#Y^WU%$V+E3TwxR7&VFlI(|%wWJ5O9ddlc+HaQ+YDCyAt{d5K*hRdz? zJADq(_v`RO(`z)dc0&j{J2Fo}P6j!4qV=Y%OssN*WuMx-(;|+q z8LOd3kcfN?`q9BJq?Yo#@+7MAkC?cZ49RI~YU(|5$@C1Rh5*Tu$Hm9ROD^1mWZe_=fU&6 z`-bec+0(cl5xL^|HKCh7d>c?2)SXEd?qT$*DsY%Vr`^iosXZ|dj!m2WG}+jqo%b0z zFB^E-KQta^YWg$|(by3e*W6>7>5io7+1UkZVR{~WHnpjEQTe03_Uwu9KDBv6RqsAu zQ+)bS>VVsK3hn(}#y@`NQMT$@mSP*Js+Q8))#E>I?7Pluv3G;()8z|4IIS&zn(VTr z_=UsKko$LoYii6VYDB7>X)nFfXRm_Ip8Fn|ncm}+bQ3xZ*)zCu-rSni>wcX}@*iNJ z^04RLeFf{y9JBL(FSuxtVsUiekYyiMc}I>i!X0fu>atmvM!01h?s(!v8zsGc*RG#f zU!aQ!nRw4D)-U?&D%ob3t?4c=m(X~$y|k`S{fZ$iGa7sZgu@`GJaB=~2Fx1YV(F+i zM>$KC5{dfl()-)%><|8G4`tMDPDPI6nqx^9;(va>{yd|(-q67Q&*aYigG}3ZY4>(i z&z{ZxTKFR24IpxMZFeRu{HAFW?Tz@>jtfzCQ!sJd+i<8#0~}K?M-40{cAv2qF7+Ld zP*thiXe2>JW0UN-)^TU3i>jucO{co^=cD4j@7ZHszBl^r#6eQB8?TNx{H|P6d$+*B zyurID`O+T!n@|MUTs{njn=F6PPSKdZ{S zpPbtKs&=fw&G{>=CfL-j8nOHL++j5@s%(l*Z29G8n?-Jh9i6M9uzM*>i}>L@Df6+ifvfRgtwKuA6~x z(go=^)lHknn8!FBOqb5Gzq!R|T}`cAZi`f#%f3;6y`-t~G^}if^+CHV{jyH{kch*@{I?tauzX^SHv ze_JQaPF(e|+2rr{YG1!2@3mp|PLK2{mveV{@%GX2()FXILw=epT}u0>w%OsZvE^Ay zaglK@_P%C5XidhVz7W{ zs;_$IV0BMhFGk_Hq|}mWL#mQgmQUXs9lU#%{@-sK+}F0cHEfjmreJl-;i}U^;Ym!n zBC0kyDP?(SBng5s!0=y8fsvgic+6XnpTB3p3#1sZ)8A=m2GC9%KX+-hi}`|pY>)d= zAHVKPjxQ_wSQ~@ag=(S8woc6pR3lY3wEy`nMryiSfz*mctL{#iw$d{t?698BSp$W4 
zZ-yQEt2HeyWQ=FcgygS9U#qIT3vORAG+Xn&#;K<-#B7{qvoyCLg#vsXbGvl$gD%09 zyCMj}q&H77c)#U@{ok|kdmJsVUUk`~7(P|&eEQ0PGY(3d@5?X0u%h#jA*XBVUM@KB z2bI*?*E@!$jN5lUU2BTmTL1j$u3FQM@2U5Z3E4V*s_a<(IAOZM0&C3H4>34hP?`ho z2IBIGO)Naa5_5)g^5bl~3|^4X0Scz=`GBR7tJcVl7;$OgI{xtpg^E9a0?S^VOJ7z1wNv)%VL=-=^*?9r?HY3WC(KWKzDYo;&P%u8iaOuyZLh!ILof zy{jqTpr-cmZKAfzPLH%+={d0jp@7c>kKN*Jzxz>RZGNx7z_Q3QkMq^q?;8EA ziA>^1{i{wRrtM_FHD6}v%*cyY>a`D3RqmG8)yJG)ziy2qgxFQk9RPFM(}8EEH2<`o zJ6poNh`Cn?cdKz7?xz8ybostVKddP4Mk7#+%d@Mr4cZ~ z)OqBV(PFl!q_H+=Djm$$N^}~9>mIwgXs&tl$b9aP*okb4CzC?Y8L!j?Noki=B-1K; zw=il#yIDqRfhXk47Cf3Qge+KNcadv!oO&keZ}{$Qzk=)}HPs_y)S}Y@G7Mj}mak}T zJ!LbZO!DlyK>dD`osJqVc##l&#o_GUVnv6$wOdzjdHrXtUhfXFP*A1;)g*>{CR?xR z6!>@D+V$3FqQ);ynK5Eum(iBpx>X+Aprt={-@cynEPeJ|e06j|>CEm=r1*}I)+j)* z&`R)l0yoIO+Bz<0JBB){izb3RjT&_c(zNU*NlYFbG&GOqk9da*2YE*@!%s7=(5roK zYp*`^n$_8w8gP22X1RIuEmUjOyZj?U?_wJeP zgyyAZgDg^B%C2ow@20Rq#ni&=$N{yXfj!%K?E`Q`?IZH5@7!sNYQ0+T_=yuCpnHq7 z6-AO!S=pthdkbkUg>Uel=`N0mOO_RSWgk77KB(rcx<=eki~MWfnxdSCjPu*q{o%KF zYt@{J&_(>Y-M#sP#gQ6^Zby%Lf0=d9?Yh`xdOn#OFFs{;?zUVmWmfb>5$A|y=^|<%RMgE08U0jE2LW(m zflbpLH1BCiLD?bbp~sRM6IGd)M@~kI%8+R=hzg$$$~2j1 z|K-%T5iVc5`s#=N79bF55l!43q!P?T!23jeGQLP&g}yo(8uDZiHXU)AGtlN<LXsF(^;IEpgJ{nMD>Od#V^SZcHN2>?W~a!0DO z;mSMDFKqZLfxJGU0;vO@P-+fAAsNp)gRf58rdJRqby#HHS!H*0(@9QgZy ztz${17KpsCz((;cViy@>G^1`WnfrJ3ZSQ!wU`OQJIXxcr6CS2uF(IJX_&iS#kEOD~?Hm7Dvrvdvu*u`kYyhslXt^NM0pjETW ziw;5v<(Rz}Zx}|b*K{`g2@>v6d}B!=vmv)bgVV=_Cnv^nVxq^@Fs|q*CBW!|=i$W< z)NC5sssHFl@sHIIV$Ca!F`GRrzV4cUK;T&XphFdMe-U-Y91m;`lUP##8?G6V_yGz3 zkO4usJwaMwvE6->C)6OL@cN{g5w8yX#1K@@*4DNQ>9ZI$F@84Un!(2hHkp&W;2yB4uLHA8gsra#PM)AceRW$V1CIhaD z29*6uh|s$CP%`7pL;^DDy;`Asn+}UY>|&L1HN>P8uf5u=~40 zHV{V&IA9@<(!HWU-L#u~Y2M;x^|#L)%eiGfV&<8ocYa@fdQ|&OR*pZNb`Yi>2qKr* zO-7JqR|EAX{QlA%UIYryFb;7M`OGw7g@MyaK@-C#=Ubt}YJLmJQIpPE64|&7*Q&>h zp>t1#Gqs)BP1WiiJqrfBALA)k^X|HqmiNAp#CBiX+&HojmHXX?)0{|o7&+zU1zZPWtZO)PY)p!LyLQI^j0l3t*aZ%`hM8S`!0fvBY6c4afn 
zB9B$2kF>pLvF6iwKMU){YnmdHz2=>EQ1`p)9`kPE<1-Ic-Q#!f_E5drcaCq=x5^zVVoSt_2SsoMvy?c(naoU-*7q^vyS;9_FV6Q}<+8#kJY+_i})#wBN=8X-%e4 zjNLA;K~pd9d@tk1LS?9NGV(ni;ZK7wlBP;WZoI^rHESql00P_%y%-Ka`x#7DOWa&L z&)@L4r<<9*hBcd0pDtBjU%2X&si}9<#A(ykE>=^Wva3(Uxk*}may$+={A@kGt;VpU zdbC6M{pZHZVnF_o9d9!1lo2(5j#bbHe+M%%czyzSTllOKcAe)87YsC#srezgN7NnVI+U<)4Igpg!Ve zrekx`?Y(f)pmFmp4zI4+{>!7T_U^e;H*R=G4(5macvsr_=FOVyL$|ix-S?C(zy5o( z{xjM2`YF(jRv(FeU=F_(aA6TZ@|VWa51C68#254MNNMPY)obCT&f_gTf0Uh1_jNeF zCu*^(N`4``qUl|rBq(Q8RI%^SD+K4Jf% ziznv{UfTxS*T>9K4KDQ$c@8W7^}$zP4^@5jPJ5*SvLs9$AWU@a(uFvRph*yG!c4L# zpy^JISZsqmworREJh)uVdN%-hkr=_42FPvHfSy+__S=0X@74YH?ca7~2tKi9UFny#t3e?MB& z*}@_%=+a?n&u7~DfvFZl?+nWMF+^InKxap3cJ#NzRxilTJZis|!#pia$OI)(IEMfI z{p#;y1ib3$JDpOmx!Q?Q?7?y&5^89$bFuTe;9=U&!17O1-H^+{jXvIs$43mzsuGeh z)CkEcahH342^i0%H;&RV4la@y>||ZKWOXjyE#o;q(>gJ^uEba0EUw#x3D?|AZ@ImO zkk)r>mq{O9&RgYkB{9Z$WehoUCukQjEO_~J)%%3+2ZAD>Mfrop(f0kdTZ9Kt7_wX{Baaou^5>k%6pd#Kyy_d@{PYgc|vGE}p) zZP!>@*xtMU+R4_o({^pXGayExIN*858&hbJ1)ZUx{Nf1KZR(IjQHcnPh^QzLUCcS3 zm6dfU(n{+L@nH}O)eDTL$oy)Szu+0@dg5kLw}Yc2qS-H-H;%}1fvGKO$KDgx8?umA zX6zggoxXGG$s5QkU>P3~4=lIU#N8v}xTD&r^Xf zPnu+Rm;auQd(!v!mp=$Sxv;e3NwQd+blqQOeEimS!GiBT8_b=$@7O2p*PHU4TOKR8 z*YI-4`njE@#Z;FBtB(i+R&z=Ub{{9A@Bk&fSI_ro(cIaIh5HSxMij`)zgLGg{)=M| zd%;~)z1DW6NK%CCpZ^dmJb~RRniZJkT^tT{eKUkUObq)dhDt)`j!{tXKJ7W-n)%N9 ztsT?1O=^6aUOpsWE2sFhW5NdYPhe zOsSw&`kU8>lHNLY?6WsoNu{^`{Pr$87ctnSFi_RP!h7qRkTwzKb62{*X$hc3+G=rT z5@Zq-h1CYEB?dDbuIq%^6ys4v26G-SuZZ;1L;I}Uge4|-y=;g5x#yg%?p!insp@C7 zh(zNXEU7K>y{zZ=>e@AU+qSksS%tZnaLLD|1w@#k@S~2$K!?-PhQ-ZDz1AeLAh>cV z29kH4JlP9!;#QW-R=5w{@&2BHllGHjz&H-MNHa_7mCw`Su&~8_`#wioHTSpNdo9M- z3|iCS#kn;SdgTA+__nd@evVGP`c;YPIG6iN|LTs;OWz%;g+>Xx=ZR^XPRZ8R{nZbp zloY&g8RvI1b|Q)%hqkLr)ctD}mX})8e81dFsMGTD>Jrk;h?b_3awax_@YpbPNx~K{|xq`CXr$CYb~CM##zv6rbM#mC2U!P2fC z_9PWLfxzR@Xf0+wI@QjFMlECCB;!aG>_|aTI{*1|^uqFiCi|pySKDm6vtP$q&{)IE znQ|Ydc!E5%arf?ry3PkvrmPv`fAsgZr&?Majr6z0R$=1&fOglhdd!HcU%GWR{3tUxbL zh>JNsBD*$R&EG3de6@wZ`toX^pf)MrIn#gKkY(F)?6YD+|<6D+d 
zBc*57*wg;k#owV7l^f?jU!a`T2V2n-X7)c0)jrb+{Q0G8Z$0Uj?ohX6$pX{ec z_AMOHuVYjBt>cpdKYf#A4`+kiZO?U`-S_b$UJj?2u?RO2j>uK+qO@pUZo|t2; z9RIDR?&gHQ`2%~pm)3RfGvDNM-R_UYv%KpX%MX9+`l9WN)pCiOoMomp6~0;9=cz_; ztkLSP9c&CXE@35>wa39?To<5lO9_N;jy z)YPOPS7xsoxwgGhH!#ej8b`)`ATV;p_blAU4>(czaw)(Zs8O-)-;om*WNyy+`dn2;S4A3WMask=mjk;$*Vjgl z0{*Nh&Y!N|N!QTf!Q0E;afk2wy?vZBRO>Avj zmdt1~aa~-j#MgJhDUQ2|u|4*w$ULf{+{S;RoY$^E5b@>8@0awvnq7CiTSK1oH)UC= znp&4GJ@)}PcWw867{n#$(PAhlmHz1SgK^rfkx5Iei{e82e!jSRsNwfm!}+PDEER?g z>4CMuQt6=|_6+=3WS;)$)ur=>)v2@FRV|pU$5|~bVi}_iQw#p$qiC2_w{FG^Ce7Ji zoWjm;e7u_tq7yd405~inKVUv(d+m#r^q%GFYiAd$KJ0YNw3C;Q&Y8({Mm`)YN#k6GbX0H3Yt}Gjs z|GS@!l`GZWTb}Kq6CaXYiw!Kr=j*1X`wu4+dcD<)+-?;%f8wO$vBtapet7n2Sp9={E?V~owj))&2=++Qr(BaN< zI;zhSjEj;E3Ea`KPvzrjp=If{ZE23Y3tO^eLIRd8($vtX<}AjNOJqWm0TO)VNJYo{ z`TL{ms!d0hl_qRkDxoeR59uK>c;xjzvYDE)dpD}io;RxH`r7MqSx$%d_TS#tK{7@u zGisxE<>*I!(=I>IDb~sy*dhLIyJQ)&J=3=Pb}Th+<8wVY)U#uJm6D>CQ{w3B9hTeQ z$_!8*@T{Ycq|z$SC&q7c8%wHRuZuqet_&fOW`jA^rZJw+d7f>V;Cn6qdt%UdC#MEz zluvzaq(c%HYb#Ra37G?nK@7UN_I0kdzgCWq2LqSoFAX2lY(voV!AWtBMQ*S7(NKjRH=4C?Zv zt%|9*=%KPjqJ_r~lIWgGi|nREDi3XIag^Ko+2&)%MgSWmG0GT#}B=psb|h-)XFbdaL`F((nrb4j+HlkJ6XyFb+{sYnwn;DY zxt}NLzESOZOn$+~jOkK2vQM3UX{0VHpT00Jx2bNB#zC7&Qt#TFmV71EIwG^d?XaX# zc2-m4^#|@AHnJXV*UOA)Gk;@zqQ_9jcg2e;)t0D!@vZ!Bm1!l_{(76rwv`JibKMg<>}VJz{1hiMn*^HioZ(NyzW-3 z`)Kbm-xl410Kd8;v&HV0T%<2fxvDf?s8?`e6SlB~!EQt1adiDpSIp3E(UkgB{6?Di z$z35N^s~Yc)Asojtv3fnDCB*U+?LpP^0p+2hK|Z=F}DBe5(Z`qJHe>q7nO z9!nBER0EdC_>Z5xIcdA9~8zg_h*!szziPlbMl;o|l+1UjH*JID#o)o^{`5*O>;9&s}z7jIm8{dQ=nW>34` z9$P$mhhKiSOJ?J^kBT3^=Vm^Xo`^TWEby8-Izi>?{4hw^3Anwc=uw*PA>C^HJAYMUBaX zPWW*n)D=l=h@l69g`L1zuC(GB@D%kr=@T2ImUVCUUJE=*@J#~)0|lLsZW^uQQcU+j z(aLO(o*u#+Xe^jDwUh#aJLre*$K<4a?}H3&4-2cIo;w96QM9=Hlk^p@j3~!>^q=8A zWj1N=9{;s}{X-2!of9d6DqSpVsmoMx4&uP3rbpz z2UIF7$@T84t&mk@?y0b+?UhaM3=VJd&<}d^`cuIk&9n0(lK1W&csXg!H0!Tx25SVX zT_3ol_Mw8E-MVfoCt1txmOLaebV+Gxwmw)e){?qnCPSUY?dy-G24%9fFvr#X7{-^xHl+d9IHzjGO zNXQPH?{>^M^X58v;_YLF`=_&g!l<+#=HCKn*h$?zk+I_2v$kfU*a 
zDb2m(H#6n<0$v>E3a7;UEZPkD_#3>tiPF$Jm6qbPE_n0iWzRD2@5YK)%VltvCqo(Q znfUvUWq!+_=Gespsx7WVIRk4~x31|J@Y_aOHv1Fq%pXphm~cwZN$v5J^5#E#FNy2e zW-v%bgL16Tgy{<^`z)zV?VO$-AYm8W$A8tKp%DrXnl|=I4+)gCTkA8?+C-yzTwCSK zvI9(ZJv0pK8(1b|r_=H}^5z(YGPkVmcKS0U4F+nskFm5$$s2sR?&#J*^@F}9d@X7` z;AA^GGd{p3DI&W5YQCIHWo}N`q|7X*T?*+pdu~heH){9H$uKW^k#0)UC+^1{eK;lGie>9@nC7MzbOhoo!fjd$!xRW z!;v%6K^vFV9E%%ge$J|7z_&6+uEJreEZ2-Ej0c!)bnDY+e(B81Cq=}|zI{uR^qzHr zZ6$0sA#-GVnO&U{sls#s9o85vcdXDs1uk6ZgNgsykaHOw7r!*rAChr=w%yh<(XmXmDjx7lhfBKrgwE-Y#!J)RgS(3e#_`jc#5Mo0a}Kd`iKjN_}Z^sHb(d-iOd zKWQ~ud(@y+b#>OIGtZXd7WVnNRA)tnnK+ivHR4x%`S|bq_cz436v_>m8IW^hSYo8g zInVMjRPrC)VkH?Y)OwVqZvoO>?741*YJHN8Tp|>0l*aq3N*RvQTe!M5=zfb(5xh<) z5|F?n*40chJ~u1)_n5$)GeaL=YKZNZe%V8{mvPGxImdX5Oj&~kzs^t0jCZndUK==k z@{nzbWokBda9+$*=?f9K7aGm&52>529?eQEC`jq|B&oD<2af%SRbbSQ z$sfB1TQGRfreqtWWtwuv%eUR+4fI-hE&+NYMDyfVCBDpc`+tNkJyX=7 zL++!)p6CKM3aYtKfEZY8+7slS{RZO{w(EwH{#r-dY?&B*q(TDw3`!il;nYjJ$0mzN^95`S%#hT}Sc8)csVN(kejAy@E1$apfx-$vSFg(N`>3-AE z)%Dcgc`>$XPU`6&H*eiyR#XF1T!NV{6Y0rJXdU(VC<>tZXcXfg`wqvRi7|gMUcT90 z2aQ*ji_?>63W5Z}3dug)xn)akM^%OH?UdR8lnv3;ysy}1o|BV>rRCl|R?^=(pgj`s z+nY9r$%({HjlrI8#poOB%&yM0rI-rM$j}voePM{=a4C`jmh6%EDau)gxk|bypDM^JNU4Ejl68Vim9wHKBoty%^g}7) z%$$60Tjf!a2E{UAA$yJM;;0-CBSn^iT}QD4q}3f+LvD;WjZ0b#8D(*r%r_Mi>#k3F zy^Xu6-ld&%n_y$*fB*H-vFrc-zlm2T4@>&L|COPnkxkm!n`w^ZVe$yh2P=G3UwiW#2Nm#>CE?mSg$LE8y!zSw@^2 zFz?X53G@uj7NnF@)~$P}$C1M(4FBBH5{IG}A`DVE!E$H%2~3#V!=V|Lo))y#s!f3EMa>xxB3!C(TUXANwaN zGE(eTVR*RW)ls2|u{*{}VdBpvEn-_qzNt1iVsVG*pBOFz+E-Rqsyk;Pa6+$nvbbS+ zX-Yp5S~89fG~{T)(nM>PZq57mULJ)lOLUU0g|qElhA86O2pDmITV(Fr!Gjv+YMv-@ z6O%^b%SbA?ifp2}3Xp=#(FbB!ZwG#TypHc?#mp`q%@RsTI^;ewu>s< zNRh_FXdYSW47hY`3~?b+ng~lS@u9C?9na-VH)r&traO4Dgs^{xuKaf!>OQE_m{yKr z#7E8k_UG63UK>F^oPs`0f!^wh{VVjGX=g2)NrbAZt(}6I0et2vPR+beBKA@IL4-nr zBR*>By;u~h++7?R#HmD_^C8jkGe!Ca5PlVyR_Mi=`Ie<_Wykn5FyOY~PctXqdRPzP zA&Ljlc`~OMKWw>p5jw~iUg)UhkCefgLb#Xu=y@$6`kGOmNJgWP8xZs{c#f;cTHxe4 z2@?u2HJI4FJ}a)wnR2$*lLegy7~MOxB4-C;bPEd$Ay#I-@QH~PZ7X)CBiY4#GtOpn 
z?li&3dK5d6`xFY&ZQhy%kPtS3Czv7;93{nA4{n3Va~A1CFJHc-7=Of(!*{Yyy!mm% z-`{t}eYi5u#iaP*!`-Y?%)2)@+7U5mBmx&uKw`WHG<;uZ!|*W7N*O*R{uPiDP0RpP zOq2Lmlp#YIq6wdVR^8%7i*A7Si^$@YetvfrYE$~>F?b=T$ShLD^rw&^FzcHR5F9!6WCZL7^_KS)^UY??jK+N}?&8Y;^MJYo z%T66W-1lr++Ux+-fSU#qpZBPkMq-aKoRE}RW~QbQ>gwvJ2%NH3GW1W|8xb*(=VuCGHJ!yY z8cO1Hsye#CS(`CL+^M>A4# zMM5^$)%EP&qer!HcbjgHQPZi?#>Rg;ALki!MqL|y7|Duf%4HeJm{AS%c-Bb84n;h} zAszc0+9L0c#Nq}nzz|ntA=vMP?Tv~WJpT8W;-a<#f|SKCXPQ`qSHIf*qvPizLEJ+_ zG`OXy+{XTrcbs^f(3jK(4KlGOK{ENPI8f#?#E*j+W!<~lEyG@2LnDfr>oAbgnFX*G z#4F?+ola229$4x_U6w4Fjz~WO&hfIKP8>ILTxrN6trMfMC)VV2p2<(XDt-}F#91oU zB5K&TKcBYu+5;Tus5EM682mO>*9?1CMt5^z0Er{rl;8XlwqzW%uIb{@Ja6pROGD@R!p^g08AlH6>JtRIc2a`FKdJ!k2s%e5<&+bn) zaO|IgmFn;l5dRb#?rc)h411&m;&*dJOlMyfG3NCQx+B&$hgq~R?q`|0@>J*Y4dZ}r zrYnU{{r3qeB-HTXI*PHWwi>pm>YAEZMI9O%PKJ5`t;@f?*TA=r;TD4yz3uZ|N!83x##et4kVqxRq)to={I+5y!B zBrio`FS+|Eic0MVV6-gYc4`dvGp=|8Xw~pN6*niC5xvu!7nepsD*-E{&I{AJO?G2_ zaNGW|9PJdKBDRPtOdmlT%_sp)3NL4>UXao3|#2!5iI~$ zB0_FMcCsv$=PW3+m}9abwz4`TAO=w+(|Rsugh~H@Y>+r zh9OZ=-67$H^&fvwbm|8Wn$hc1+Wq7D{!1l(5Ib>`(5=dDI-auPSfwnUr-tM{oMGW} z?@>L9ZbxucB2`$pf|Iow$6TExXO2Z^CVL1%0X~7`kw-);a1XeBo_IUz>aXD?^CyJI zGS3J0QF{^woKccwyll%KWR>mWPDkc>>(#3Pm{4A@*M!qyh?|TE`t*X9cG{gVkKfFx#JK@bX8I+5Si%zb~M$5U;^SWOS26feIXvi zu!spH7se*M=Bl@EcQO~}qc25Xz+??J@h1KO-yYdxUm|J+odgeHNxNP28-s@pYe!V) z$#I`Rsoau(N2;vCzW}2MVh`>V7Rm!Y!t3LOvw1}`Ls^LJ=1tZt4cNmbI8<<-W<1@-0stnW?ia{>o(|y7Ec#hv>h8`mq$rntdJloD($I>+td;kjgy;)-CM;;;j#3cOD+ zIXBW2L1uRQqT|QjLP7#?Xt3tpbgEh!fV|H#($afkVy3a2lUt5MM&e!*l`Ksk3_>9h zU{^t8Tnnp2BJ7hH2No4nWmA*qJ8?3E-}(IU<8^*7cbf<(aC38uHWGoR&?Ujv{=1&~ zj}L&czBH`<$@0;-O+8m^T?@m4POS{vZJN0}Ovgx|6m=>O7{8uec)k8ts3_-P*v_hg zGYxgjdesq6n8B1rlTJQo8a1r4UZDvB!Ul6n{H=i2|!&*E|^1iL4744DedlsLf-N# zD+>JJs?G{QC=G{-ZkK6d{ zf@{>d&ZUvL(k%lZm;bxUw>7i`z|3(ud_^dXL9$s#!Y_&kgBWAC&XtsrFHVkJk`FbC zTL>Le+x%*0>chOxKZuboZRCti&M;!%MUd>k;q(l;yg({AnK9fVQr6$mZa7U7nuY4s zZ)$2bvv=7@h99p>=~D12ELxO21#E{1DT$gJnL$6i1VfymL1N(l#uh&`S3KCUvpNi(uq_#UgDOhl9_hl!YLp$v!dYgI5q^Wq0sWslnPY| 
zcYPnazR72bfe;{fZTmD>59JFSJ@`E#=Q^uL7d+cmpn_R^92KySfg@ikCZ;sOWKh;> zPFlsGLxrYMXg^?38zH*q0tCYnUzemFPUr{RbSbTUsgF9yPKq|d z-5Z0%77QQz_&vT z^pn!Y$8H$_y^7Ul7}BSurNy_jzsy-qqlGh>NZSEh`DfKUU;wcZzz-5EL+k*6D#e`y zk$5+7Q}t^9TE�PI&jx&X{>50l4q?@h}$Z5wdDgZYS~ z-3*S8?|q-DgtmV>fh~`tHc&7;Ru>^_$5Gf&m~ZhRGVEt^e9*iI9|2%}I@-mD$0l9* z8?$>6mtpmH=)qPmck?4?2-zTR$(?!acvnh&d^D-^CrZ9_%=3(Xf0;*5PJ!j9zcn?e zlRHp~#?71dh5RMl#IMuPn810U0q6@VI$BN}d|RZm3Tqi?9Y6{4@{RYF zgd_-t2v$w>b8Z)&0tP?GddUQlTd1m`!IUq|e0GTY($~m zbiCNdRG~By-?8~oHOGLVvI?4xF+a2*Oy4I>TBrV1xKjF{+|OrWIcZnYe2%|(SWyV2w1{< z*CIrD0G53hbinmqZ1(cUgSh&I;rl;!U|CYC-d6X^okPZmgycCv zMLVD-Vw$2+&{$vFtX}_MEZWuG3A%1HxLnD-%*zQm3qX`JLPk1zQl^ri`_|6^|fr3;-*XnfK zv;W4-+W16PpgastI0*#ah13vOLLo2BBaVhaSbuVt0l1!xkH5Z9n}0*uJtf(?IDXDx;1&QGtEKR- zy~DIRWo&^-7af)a?hjf*%7O_S39^$&Hx>gy2DiH@&TP6C32_Di?vW~3Mu4hKO)c%? z?81{t_%aZJc3@zudYQdU3tn1xFgg=713V>qfNLeseUIhRPDb|50X~$|Ji}>B!5Etf z3jMbUB*1eN@Y5L`QNz!(wqDw|GZ!={bxDHuJ9zM_QkgUt&~2825$Ao+VO<`#R9}l5 zdBgLJ#8var6$tC;`bSTn_QZ;b&%}FUN%uJxN~g~{$vrhgO62cx&WqUx&ctsZM(c() zAlqCRw$=*@9mP(b$0g2JNU2KE*Z6x(Kn0`++`T+*H2(wGCML!vcgvMA@jl;@#Jw$0 z#>mJJi!*{SjaXze*B)NS9++MxetCHB-n)c{#}T|@zy_jw@=UPBnUnCvpSCJzt+ov4 z#0wV(vH*;$v(4O#sOZB!qybC|9@E{sI=kF6>^*(-?(sig{x#F*c?0qTap2&&nL4P% zzLk?UA#2n@3|bl*c5v@JF0SGmwGNcAZ2lCDP^+)UoArm|kW2pIu!p58RHtuK)hi^(?5e>K{k0I8a=({2 zc}Yr2iuZH>7jqwO34R;`x05X0sjmXA@BejzHij9KfB?}xh2I&+e&Yp|Y>t}GCdBQc z09LnS?)|aj$M-`6L*XR+b&)c$GK5;ht>+dBkFE*|E=hf&iKvF!#c!mV>ZYh@8`l|t zS6G*umTSged-w9{ArQY90h-~rlvN<#ZAsPm@#8DTu2*+9>esIyVj-&AQw6^#1bi8W zQVfY2#nPt4ME7XhESiq)J$hKi?P_d_1qEksQ|6nTDVA)WTnu$A?fm&$_wI$kw~;iB z{0@N;c*i{Yx2J~(o>IQiMk3oDWdo5%VFUJx%kwAZxO+ozm~@P zRAGsF^6?-C3LdU@+z_GsegxC)PX7hq>FjePi5=gT-hoAE9oHFZJ7;^>{{1azz&O=5 zzRO<2>$7j#RQcc^i2VBXO?u$)+AMYAX6_Uh7muAfaaXmLhQ_FzzwoA`{U7jks(*iY za<)70)5w6647FC$eLc_gobTTB@ZrOHemgkn^DF^dGJ0prjb;CYr`vM1pbZ5EmP`+= zXEc`ZrON)_n1cvVI-LiQ01p_g%YN*e*$tF{MwNN%Xf5-ZGj+4K`XR*?@S;l#yBMs> zx$i9sGpi*VdBR+l(e}9)3rXY?u?q=V$C>&vH$Hd%{NC4jWAYX^5C;x=LZuOMr@y4&o_I 
z``n{fuk$_)Me02VGeI3P>*x&Jwti4lh_Dt?teEt6Y9F za_Z3P2(T0=y{(iL5Gn#DJ?CC8A002z@O0zx-k;ZuFdQJ^d+^0}aI zl$9MoP=Si|4le5ZN4s{T4Ch=TN#Q=6z=U_@Jb#Q!r_)f0=R+{U7LyzNmIhXk7;Wa} zio6xk%u?#pkX5n_u3)QgvdB{m+C}UfE-kRvJQ?<}6aBg4h2q|hbB)fHy^^-{z7nU6 zK;HSJNO&UvN~*wOP6!<)aEu5e6OV?eE+l(Ac{W0Bvy4~cdUu_rE?d@qexJrIE)ld4WJqT<1>~o zkJkyi$sNPj4o5(>k3S~NZh8Ejje%`MxFp{!j{hM3Qv%ed_;F9}%uyx^B2+}ef3VLn zHB9e-zaHL=8tT3`mrHsZ#H3I9FIZXPNt3)K89zthm-N_FW+KL-q??PtiOJY-UbgHG zrVhEF#HRG?B*ZQP<6!747!3%_AJX8htqgAa9o7K5OE+MC3;fIvIjy^dJ{$E1FBz!G zRAE+hc`gPKe1}h*ej}L~Q74OYmEKicm;k;d!$PM_o}5PQ$Xc2zz9BO(?h#J>MN5}j zpB!$zKqo?d=Aw}DrI^E%-uwC;i6C8#8TA>9cRXK9gqMKPKnz;MOqRMsNO(wyz*8rL zK^p7=7{AsVhtJS-$a=Ite8kwX`)Tw_6U@_qL8(E6H3X}CF=!VXkS3x7<)fK990CSu2ECTYM$0x5E5_q_G(-*%=!Ca%^= zH{%*mqTt*@#1xDm7HAZj4;p&K@d2YLs6a!dkzp}IxJ?DZOk4=bIFEX3z613XF*e{i z&X>Xql9DsFIHl_44XvxK6~PhoeTt}}&=O)v`!}Ke64>20$O1xMR4J?+gs@!#%eqB0 zWK(S}f!8q3y2V|=9azIy=rx)c-66`|pDX~!5e;jqz4~yT&9b9wWy7c55`m<|nNvV; zyM7dl$}yQ1Xg|>~%(#KU@SyF?AYl1nA7Ml(=EcAh2xXbmPC`=UR?>8p0XwNmx%(Z9 z=s%D&Q{Z-n4=3`}QnO!)&m5BsCpd|5FVyxn9nh1*FJ{IuDWfuH*FZfZMk)k9fcz+z4LA zsuFL2KQAJO8QA;3J$q(33m|#R6lm!{Dy;M>8f5xb8Kw|@*4JV;7UikB%?t+o*KkrkpMLhm?XvEI{x!vuFq4$nivS7c^+umjYBtQ zu*pi7q6lthAP?a<OZEZg2 z{HKEFDCC%R@+?Z@vrgh0wb>dEUoIL3q-K?Ck0|N^>dVUj}6(4?T^uyz{ z-a5N)*7BSBIBGYZ-!nK%7r2^I2J9p;=M2L*vW={NJ3kV_0n;M^}INS}noTwut84j^m+A|Pm znnLv}fnplqON`rfbRc+6!=&t7dirajYV6eux4tbf1_V^xtCwh=G&NN@aX^7Z_8e)L z{3z2%$Sg8UX?)k+mHyU_LzqoS0_G;5Tt~fW%31EvR8SvapM33;nA*Ju+IXvzrv|O;=CXzq(hhGj_IxMt-{>r3W-1ho)c-@96i+8 zTfy4!U7>rFW)#6c$irfUsjokn71*Rky;5I!-IzH+WlqtE8Lsza@ zvy{U8-Sdo48WLx1#r+g5;tq=Ic5CX&R=z+pA5Cy=aFw+$y^rcj31-zIx)BlOHFn&% zS6@E~j2NT`&_1;>rTu$nPHq9XGB55GH8dnp3E+b)a=FRMW4M1RHTT~L+9eK>FSY-6fUjBquX8yVet)Y7tMVTZR{J$V|Zh=8sTDwT!5_*-NUz-X+ zP1e>%+t^}A1T>V=Xg(xs8doIG0O}wt+BP>F0Yai;Vz%8_7$037y>~Cs7q+Dp-Cop- z=R`w(j_;T4wbOXzGztUpv;_wTmTjx-BK1UPCYF#;Rk@p)pzi~B#jLg$ltBRS?%Z6e zoR|z_MYOQ+{*~bF5>p>4NbPLNStvNg)aby0_QuFodUKsqH_#zSX~Rq6hY-IgBD5)G zo##{Zqug$i_mQN%{nP12pOF&&c*liWGB$_4O|mj6|<-vd>C8yWSZ0 
zEaH+Bj#Ug67Oq&an_d836wT&Fn53_HSYNuqfo3JoB_j-slj4%uoeum1MxFRt4I%P& z(XFkINps*p<1ywzpk?4jxh^n2*4Q%UJnt$l$8gUIHzGDIH|W{M!Xe8mAxm%l^5u(q zA+WP>8Ip)pq2IniXw@)v^%Klm#R!7hmBSs|PeE1|WFL~~!5^5c)zeM!{rMo^jX0?; zNcHYy8adtEoNtK@Y&jV~PNgv>D|ZWVE~>7C;j78TWWNJ0cEL&RzZn)ELY7or7L>r3 znO}$j740fBz3H=N32l;a5vK4Ke}&#ua4fk_1XGdYo#Cx`oz&|cltyodAu6b_OalOo zV<9KKzN#5-1&IMFy={hROKhw%fvcu`LWp`O`-`!e5ETB8A4Pk@N!H97nukL?Fq*#4 zn=VXO#A6T;^Ww!fsVh0X#J3jnO#vW5#S#!E_6qXXDR^jg4k?zm+mhh~ljH~6L=;{k z|5Dge^U+OGORKA@BscPsfk>uS@tk^t!QqD;E@hWkG8dveC65G{AMe;oKPEgp{FA>; zp2gb0K${=Qmg*drZ5S(B8fu5kU(8C#iy3hpD8aQnQxhVS)Ix|_91zt!Ji13=dL?>6 z&PyTP5?+tu1Q0$PysGn@NwOUjuTuyLHvw_5{@0X}*txIr%^RWOW|Zgh>2)px0V!~! zqk-@BIUN~Xz$6zQag6v`nxfL@o8s!63s;EhGwI*b5{rdZeEoYMtKR3s%OhBzNi?6T zmLGA&m~M~Dp$1&2Rva66HVV!ec0PxX91%n8)GN($kWOmAcovexz*+gvj>W?x!PZsv zU#x;Sn>+L!yN5%yq{XGni%rAHbfA)|{y>b9Hlb~7Q1k`$#56Jk<%+n)FPg!T2|mW$ zq$|c+s82us8!LxWrwIQm27V$H9@3rYvoK5=33x3q1NvW4DbkdRRJh@G>Cym~Or?dQ z4qcoeIUejVxXX)9w3I>EVbHKkKw80ariM20G+iS#zJT00Q&~UD=$!;^%zG3~4ZBuI z{W*1%4Gj;oUq?Y&`SYiBh3F@BoVTu8ENt`zg^U-o(93IAOiV9^j2;!)(L7Q%%G}bJ zlMlMexsR^|?H0z`qOoNC_K7w%jZC0y1t^D__pBt9R}Z6BCwWuqoVdg3FP&Q`px|Nyev^gy z10o`ete%Bjd8z(rHRpst2E^%3>mId#|F9m`6O#on!LHk*vV(&7w8fE~&@O=-1*;pj zZ`9$#_b5Zcx{u`^V?Xx>grrLqc6NyU>bPoD=}3f`3Ox}CvJ&}IPYDo{H41-7oaMF` zz4{42o_b4o!h>tH+oEth?WsQ8y?Wm6GQ8mpTb|6Dnz)SV4u=oin-J2GIK%$aJ4s8x z)SyXWvITx1CAjXqz@IUs zXDc(%*Hj)1g{( zbu-fdPsh+nH)VHtcceFNjQ0m!pD-fduPaSEC z;&^Ayp{+Z1m@3>V`5iXfwy_I02g_zV^mm|iam45Oa`D%Rg$9Cu;?yZ={sfysCO;(` zNL`q3U{omJE4oFoe!z2js+vf*?1BM7G^MTpfDkk!B;kyolzDaP$$`YR&}*SBb$3W% zxPIGdzO5}(jTKq_Ot$xWp{Zgzh>&`iq@kPxpSe11>{htCuCvG5?343~i=(jc zR9xM=-GQ3Ct_#i^Fj^;ABv5IYjn*B?X2n|aao5I($G(zF{8(hCDJWu)b2xvXChI} z_SCEEoYkL31qO&Hd3A!mudHqctcU($3suMJbNm0}2TQX=QF4bu+l-CY_;X~t_Y)y0 z?I(Q~8e+|czpbx)qfI0Jnh-^SDp-`jg1rn}SK6;IW`aC?RWS<&f}>;=u?mE@2%AuG z3Hw%L$Ziv>j#7=h!4&;*qOK7rk;o=(Nu-akHu0AE<1Er3z`{zpeP}OzaS{>(@pu3xco|#S2T(sy z?lEU!|G->0Vlj01uMpXp@cl%O#3Me%;eL30U{4A756-_mt~%|uPQ02>POYa!)2Jdc 
z5+SS!D=nc?n!EnucK<_9oZ++_OEj@OdzYUOQ^%SHtEpOc<6B98EUvcG=EL=rO>V?{h;(~vhJ-^MwaB!6KE`>b-fcHz4*jB4<}EpmP^*56+~Zx1j&FbzIQt#(0`KSwdAP z`@}52+MAb5l^wz;k4g?ln4|PAPkdL)gv=+(?jSZ#{R~=C?j|l@btUA;C|KjSs<9hDfxPO~7G9r7GQVJ!rWMo8{rJihkC-s3o4>#!SJebUZ*4ZrJ4dh3T; zgSRcFLxNiti6<9>IN_dprGM?#Dpg}@*~K5Zu0^?jv|m+)4|pDNWqvxf$PQ@rlIYh)#&z?O;pKq$rxdo62GhqM5SCF@{ zoe(tmAbCL$b=1N_?qhXSB>W@x=ck1si5BR;qXN4MqlP&q#EQ+y*y z$fG7p==)&)Aeu_K9$yy~WkB7?q(15%SH9XJCV^qNsEm&sN2!qm(Q%XeBegHrlBkp` zDsG9kDKGCB7oap8(%*1+q@UPF9&l^G?Shvj^`x35^>ZABJ>3{EW(dw4@lujNtcuh% zuHtC(MneIJ-Omg-VU=Jqb?UsEq3YBlRs2K`rYVV#gnEZI^MRs9c-v<0C0x9t_B0Em zSQ`-(Kn-$W<9{(?qava%0}eVkEqbJ=fIMHU$neXZZ7DX-TTG_gdPT8GLte^o!2)!3 zckuL9etLdWJNeQW+H`TPLug1Y2HouO6UpJDmyM;EzPP!I2@+sX_}7qLD%j4@;0PTg z{VRuBmIN^{dPq@yZkuNBJ2!2*Nvm=><}OYY|KDGlO4>g#G=`%ptPY|9rwS1QkmV}M z4bLh@Oo>q6Emg=Jo2RcGlc_V1^|JhLoD~=B>28d(1p8{!%t` zd7nUqh=7YQcNHzI&2Sl=(w(|{y{=El#bhU^;=#&R0G~cS7nXHtZxN$+_s`b>tB%o( zqm{}1K77cK-MmH-LLrQ?p+5jt=4-^5$9=A@R`1()D?AV%B8fL?J>HQa&z_8 zr^pRjkYGNH4h3}zM!MSu`u^P5QrA+@+%R~<-+@l)9H4`!meTnKXL+(WO>kF< z44oTyIB+6(K&aY_f}0zj9@TsE-MB*a)dIS4{vDXDp5wKR1`Eb=^NBMXoj@jp_70bD z3f(SciiqpMDu^Sv@@6?M1A%%-JsuzaPjuioTy0cq=VE6TsMAd%JEoEU^i-{;8ZuSD zF93;qSN#9Z8aegAf(bxz;*g`x$erJbWh$~vf5te;C}(qh zRHvEB=Pqy3_vy!*L_Qe;!^1I!@*?8GlH7hWSdu!s zB7BTQ9d-^V9e#%%IC#qzf&9*?CkR82QJ8KJQGNWWy5}v?2Xh%B+P0C;hW?4VX-axz z7(ok;cp7fp&hg9`_4oG%^k&>N1NvViSRu{=NieAPLc`rO@500bYj{427A=xU4)_U( zX_716JXeA~pt!il$MY$|EJ~8!9ORM@Z0ck1^IDCButIV4_ zcPzhMpI+g>{ws*kcurM;yZRKUCBW@sad_G@yucJ6!OY|v*>{{sD3{L~K2kLNoHiT~ z=REft{Lni9K#ZBkr{;rAS;>5^PVzOAtQw|<_hrhr>Ezex6U#m<8oOdc!Z_5a2~CfT zOkqN}gbZBomx<^O;&HC|QTkh!>8Bq9PFQGl`ThNTho95;NqLHc@MmqML)it|TZ_nm zygPTWADzy)VDwV6_S*32vE~KC1J3kkg_Ll&P&UTodO}t}v~U&F^Zxo~%??Yg5$ika zp%-@A>TrXuj?Ox12s7ssENs_IGBstxr60@oitYPyM|O7l!D)6|F0QerKqUGDE6w(+ zSFdVSU`^ssk<8n*h8H_2D;FGm(X4T;2FDcYki@-~lD0s2>~qa~X!&;FlEGf|{=&@$ z8B>-MUy28ctu&zj46Se(DgX`KIf9#~1$$&?<5~9(9uKnwT2HvHj^Yn}wh@STJ%RW2 z-N)ryP!{L;14u#6@33wPe}iVC&X&)XqH!r~EY+jk}_72lVu#0Qb@gfL+)$~xet4U9zJ 
zOgUrq&vjG{61@csAQ~XX8QcU72C6;-v8brPbP>c{C$tiH1TAqo60C9O&fSrwtlY5 zPk^CEvuL+?Ho?Vss9y^b&CU`xIlY>^9)pq-lUjoufp1jot!M}w3oZc<0$XSq*H*pt zpmrz1FU&eLy)}dgKZHJ!`~1fo7X8Y?=CSU4d-VNS4s9lho+V+TIc zOb{51v0^)-qp3&zHuT$|a^9b6s0hL+{k=iJ7{}Z5XM=(|D;ph{p>yog;{U?PE9Nz8 zv`XA~Tt5{TIyK1XW^`!!pvPoKW^7Y+R?bMD>N4AE-N6$J&c~(vKcaf{vT42ke_ZnW z*hwh|*BrLmV6XFc;bO(u*g1~%BQ^~#Ez78VmHA-WhEoIWlJ;AgD|H>-yR!DdwASsL z{Hj?O*JooO(Vk!b`uUdnlYIE~{?dSe9nr03oxQrf8v{=6#qGK@H6g$<;q|q~UR8H4 zTz2}u?NZ z@hEYtD||*$rTCzFa2Xl(?cmQH5gI% z=-w^U-VK)3^c^?hmO@2P8wQ7@R(n*DAEQznfL^?q}7{mZq#)>KS13f1^eQb$zcEXc?-; zF-3W25_%%-?i^iqgGG5<3 z@AT%*2FFb2g|6yugIjd$shvP0H$26(@sJ^@u?4xNBkQakZa$hmbm)(cb#m}^)?*LtP`=-P_Uk9EWoMITH{UnJX!P2|$@})?`1}3JHU04W$Jfg% zSL!+kE$Y|5|A~vMS0*j*d*l3T`-4Z9_cc-SNq_l$dxyDp_gc3}-Td1AZSJb-@QSRC z9UgZ0bb$p?@D*e~1%91dPjGiJ8a|rE{o7aD`a~(9ADJ~0_<-ws+t;r>Z~SU>rn8cg zb1gEbL(At5Th(&!;9cQqDY{S2ob7$R@>=630kzjVxCB&I7Z$}v^c>JeW>p-zv(MqF zecZ<0Q|4S(urX`&NvFJkVWWspG@iCtX{KiXemsIdiJgxS)(tm zOg+3hhJj}5nU4Emx8druzTNs~-)%`^@RGNWPe$xMpx?0bw$!IJtAtjk^7>BSrB8+k z$CY>o;5l&Ah^kO}N|YlUm)kiwVR?34d^U02>mjfg_#$>lTeN4-MD7ov3@GTKPDKdJ`L{XzRr$>jb%IFAfYFH5%?BMr zBO)Y|`w2RJtO5L#`~+*}V)T_?Ozt^2UjXQp2nMO)zf@HvRLpDsX~w7c0M^;c505j5 zB_GYllF&Kf3Q&h4i^fbPYE?A8eL;-G6(Am?y?bp73(H&DHGi_%##Sra^Zkm5#G|2& zn3Y$Rm%3%)m4+)(Z1%EHc4ul6Ib>)aeAxm%$&EDv*-F`-kyB+_{#SdgY`ql)ZftL=EpyE ztAAW-(@q&HWL*Gv&5~RX&&!kad*k954SyOANZfR5%cGN zdVl+znu+j|&~exjB>tI#)VI388Z^I6r%pE|0G|s`Y>P5~2i#$Na^b6wx3pV7S0Z{1 zSVttER8wMqrC$9F{!3IgSGG{sfTFZ18T13((#iiq5F0b*yoOs2`F;4FRm;f+m z*Pl3b>KpEb>kxq;_{)BNTL2`lEmy;G>wbw?{vjnvulL`d-F5|-QS3%}75^iee9VFGf7%vkb?QVOcIuh@S3N>^m8p&2*5#JX3jZJDtv>ah zd_2C96*rXohc{^+DbD}M0BPf2-*y&84{us|SzCMWw$3+Aqt6UB{GIaKy6vY=tvhIF z=zjegnsfxQ>7twI?OV3gU3M@rrDgZN*@s_hS@&Llq5I{_%ZfKPPxlFb`O{?SI-qkT zCRZuFpoZ|e?h*12)B#mDsX#@rAdJH3zG~Gfdom)#HkNH#b^7&X>CmyV4{*HJD5mU< z53ftCfn@45SPLQv)^TOwVylD#2|HiAwjOwvy{x(DY-x0=vf575b$h5;4`@z=l2Wu$gHiX$5J_N9)HA{hVzb+Z zjHU2j2t~Fi&so4+Q9*NlgA1Wvr&-pZ%)3g>Db7~lQ(OdIKguE`qUql(Fy9Qn96%z1 
z3;B$PL{0woZA^h1t>_`>NPrtKwN2=#M|K!khVSOVWxv(f&+XZYXE_^Nb`Mh?@8R*p z{YRmL`DYQgFRmw=H!Md7oH!c zKk^I{AtQFj{WEZ&e%-)cW6xU5bGCb-l<=S&Yc9XG(95uS^C#)w=P5H*!(8A+fZH#M zKdVhcvoJNwgk&(2LvtszO+-s|%M;-w7+hPx5;WR~&;XS44>?v3*7b?{s>*2xT)#JjcCVHRec7kY&&{1)qVKF-HR5F@Ub}iC~(uOy_OGePhMFv zW#W!gwKbIvBR)Nxqbog{fR03lOdsFzidB~chb|*HCiu!0vm!Vs(f30|W#NH9Ei6;- zt=qg~e|?8j17%d3+Tl(z&ybel0mC*#2FDI4&+qm6?(^ph5$S}0_DgE=|9}*3FYyT= zz(v!M^{u!uadVRL2f6gf>o&q=k%oJD4MXI<>ddIxE+U`;zDIA}T1m-$Y~d_GYZiw< zoj$dn6mt|l{7+5I(=3Bv8-Hl}bR-BYpW7@+Boj@zTP2DZK35mPQ@akXdl3By zM%#~f3V&z#iEohA2L*QVnbNp=6U&z~+Nbc9H){TaWd4H5pQKx+*O%If`=p6;!x_Lh zTR(zYU!&0mpy1~KHvu(b0}Tdocrn|;D7_d-4p+=3THR*`cNdRPp-GXkLa6+t9R)b; zIAl>r6p%VxI+?h9G<_`(KPVTA3eVG!<_A;CUW%Tt9I*~nhPFv(Z>&U-e~%u<ier z(UKGUcf5;@DE-=ZzySB#@6RtU2p<-1dUnrC5+dEr)zj_Iy1m?-YJ0O`tE)e&-K}1p zU7T>@f!i6`0UXdRkbl=jr&n!HF0cFhy=wc19JPOfv)ZrkJtjEosIi$_*xA$ZgX5F( z-9B1cuC`tn@$-yryj$T*ryJc>a$c@}_2XAb)t$1=Ds6(7MTJc0Ub^3-vXg%8^IG5Z zLIz2qMN#;Dbp$Fdbdtx=g3;?XJ-SCGwhlqm z8K^{b%0|@1FDtpBSogC+2lc^%dk5?D+MEz`S|r;aAf%y$r34+p3%-8sT7xGIdr%{v zJxR6^*5c7{g_IV3f!ysbuG~g0+ZT=tUW7lfQVj*;i?@ioh6DiEV|?#j;+MstU@>;^ zI=lW7lS;tM4)K<=p$6cU<@A;z-(;adl=7=5kWQI@vRw!MvkNB-M1UOA1IT8&%!0^X zvhUcS$R2E^tZ=!*L_X zK7%u!U+_$LT-%K-jmeW+&|de1z7+#Ej!rZ)QQ!CeUZjj05R8tm-mk`Ero!y8bSA@_#D6jqD-I9jIw^q2rBJ?~U-?loBprdg0DwDHPLFdYEXLwjw9j49AMv&( z=DF78Jm?By5uBd>XVQXq`-eAm$!+H9p4H6k^0QQRL)U{vCv%(+m!8*5Non0O;oP~L zx{)~&B*s$DWsRJ~TqUzHd7N16-kz+C&7c4Hf~Pj;gRCHUKbU<{Yk-@)K~yP1BplYu z|NQjSvPu*glnJ;aL^0M(DIQNaT)B(A5BjLB46lpSYfOPA4hNq0Hh46YUk4#lpet1) zZgKPUX*2gSJTib+LmX9{glQ|vl}JnXEw0(!s#T635h)LR6FR7U^1z-a(LK-8ss>(x zdEdtwbRWB(@z#!xIq_smF8lZt{lw)vylk=Doz@e6OhH+TOvKyt3z3H2dW@mffQAwW zCZT+1+zJSjcBau59c`k+m8f+Mn~KXj=Jf_r7a>zz(85w8?Fv5v7378%sk58L3jbA zC}w%SZbLveu}H$l?J@9-FKVt|S%il47Rr*FJbGQU@`DD2$rAylmYHZWDgmXj#M#MK zAaMY5GjhhZy?Q;T&H=*4o7@m8$UW_ON-G@PB8P>(6FQzjL@#?q{wY2(CP2X=ac2Iq z7O0d^%0WU-x(Lb1d4d8 zvPdla57r-!2#B|EXu76kVfP(H%SgtG%)e#2k+)pi4<15gvi2OI;soDfgleS;Q^Vd@aU4VVc; 
z477C*pyTCsI~O8LNpmmz>^0ix4$gA)7a;M+nd)HFT4Ce)2`?9_shKwPK5J?pqP3R5 zCKfGnaE?!gqU^~KV6zMaUMD?CRb}S{eh$)`fv0EOEOY-a+lUo!mhR&T(?jM66pbj6 zSxO`=%roh6A#0)Op>B^1jXD85jvduKD76E>-tFOWBYt-NLKr7a>l0n&g6FWTNAFi3 zy#P!hrSuzgaJ=A$Js&Rr^jx2$i3g#Vr~YuSyY#Nd;^#AufBp1S-PAwaG$&W-+63=* zD~{aSq6VFoW>=RLWaFl#ruIDVX?L`QRKge?#xed z94leQtZvTYs|X8GS?ST<$enOyO}dJOgMJy^KQ*in@0xsu=!cP2CTHHu1BAkKRZ6rP0K|S+5g_H{F`! z;>ANCE^fCedwPWL{1Lm8j%G@GJ~w^o?(r~%+Vgz{dxASt*8+AHE&t05C1r{ z%0#D!ru>-G_012H9ipMy?~xi;ufP4JASe&B*Q3W2yWBv=2Nf;DebKnqhm67u++y|W z)yQRX$oVmuHU-s*Q*H?7-Qx9CjiG;bd^Pu@xX=SDm zgT(0%EzAf66ZA%=KkupA&rtZ7q7^Q)riYO}#ADa9jX-}7Qsh-~=uNhkWjsuwzesX8 z4DS6K7qywgpaB;M%WD3^<86Ruis8NANK){{i(M}3+8aB~9Y3#nQ;$!z(M;g>F-gC4 zH81%{NNdap*EvLX+;rwI_KLXm1fg$XvV(@U_7?D;9sFmNh+9JalO}^J>t6cBj^pRL z#n1g)w;|E?Om2?N9h}m-u)mp=9EH*hBc?}QB;(DfDeB3HNRHtFguSx3x0L>7Ri6fdCJsxubFyLjGdc050*FW%!N0^K-5Ajq^o?XVQ8kG-2Ho}a7~?MrW5ccJ z{B!Yd7+>G497%C6Cj;EZT<|rCFXdplpYpMTgRc>;#I5ssJ>pPhf8n>j4G8y+&KoMn z4#jG)PpKT))c+8f1ljfqr>WuPFu!PhB|$_^|RW^@XiJ& zlW^_KLWj(F%MgY3%1yniPE$>bu3b!`lzpY8rLCzWprrUe5=(+~OpK8{`~cV(5xMhi zDf!H}bHhxr28n_m^?7+4*XrfcSkP-^*R-Y!*5lQvj(SUi=`wD}b7aK^6M$ z4Na*OGoUR_uCAL1TESBlnDYj$nR8CP%O_x4(0~BI3>D6~Ni0n#o~aoYt>xPL?w2|9?>b2(11fk_Y3LmWqMI6VDan%t*`*#{HN5+KI+HB~XA za>x_OU{fkS@j`!uAdK6KDmoS9mG3Iak7iooT-_&El$#?_+oqS0HwHb2q#IJ<^6y{V zyJGuPGU)hnwD;{IGiNJ73oa+N4UL=q{Y|Q0LaB7K04Pk$-c}j$oE}q~@R;~?o5!gT zohafZ`f;L-LntdiMJTA-cIh&S9U$e)#fvjyO_e#TB}@3li$8IPH-q=>m#pAde&>!% zt4ce+tf&E6IrAqeS-nVmqEvO8-?U$gfV(ODdz2ELCF~DT1b|GVasInvVoJ0%QVtbv z`QTRl{$Bd9OE)WB^Jm9 z9s;slR9yUZ^5%_D=f2>Ofv;dUr9r_FaEx9ConQ2Txi`WLTrG>sFfDo1ews}V!f1|Q zKpmsx!WnE|9?@MwdPGMiXL3Ehx%h1W_qsH2LUw+pXrM^VI9+EPpGy&c6d{f9y4><~ z+m`nuCJ>4}=5Qle3Uo!1l}8p20uCxtB<2)afqjo#CB(uZgLt-uW1TmzH;^%pu^aav zZ@AYJ6E5GNF~buPM6o670TuJ%U5*^B)l%xwYIP0fra7)n=VmiQ=I@j}no5W?I}Tma zmF@uQg(weWm=1Q2?jq)E+>*Qyo&e2un6~wHfL`jExY-wn;HvQb^hE<(gg8aT@W>Va zzA!n2!HqZVyn?NpAZrFOL(SiiBb#hP5~BkT9O&kr)|CJZ@u!eFK?N_rP@~J=sH?G*+9`e^6)!b+u)E*85;slDa=`7fBJ6x 
z)MF)_)5m!9Vg$hRMA}Lw$mWkLe}9dlN*PUvd4!I`4bUXX*P+sv$b=!`vI zCcm8YpqCC^po-ZF?ulh6D|hQSZ0GQN`SK;7+5O!;mC?;wh3hzU$A&+$?)zN!&5=%r zdB=SMYL*~mKkxfP57lrP?gpx>@|}xez^Z3XI6ElSJyx^v8Z-8f?A>{JQ5feGd1L%k z>0TwNmC8@R9g$y~$C?^%FW=H}+!DO3Bw>tqc0O|CQy();t_=S_)gADhgZC`|@zsTl zwY|xm4KPvJN{Nf5-S_Zx~Q_+7=lm7&dvOg&Iq$;)fX&6|r;%&HwE5aui1)#r(h(@9s^| zGi9BM8iCc*;g>hJ`0P;K2rt!)j0}OMkO1Jgj-#4}AOQUlx{fpiNpB=eMyyo--fioW zn%>;!VrzG4^ITVam1_3m`H~P%yZ|PWZef{Xp+Fj|M1BlJOqyp!dzw2V&?Hi=bwQ0b z9L-G#9ldjhSGPW=Gl>T^rAlpa#nst)$Nv4lbE{L|lr~!z`{vI0PRq=M2BnPUQ5^-d z;T(wgR&F}c5Kqo6lA$ ztxNw22c*l?`!Dy4YmDFE{`PJ+1f0rR77$5OeJ-1XX>p**pdi~CwOK^NcIWJmbBbxk zfeLWtK}Sg^g7*NKycnN3@NR$vDaKx6+X65xFU@HK?6IAE>2Le{)PpA_N1{_dkCiKh zb~|YCOiM{8io6MdbqEgoU@rl`2M{ZFuWr&PQ22VQ44M#+mTo3GQT7lScS^GkF z72QzmIIxMrVGCSGVj>r-C8BJ=ntO0HH2z%mAOK?9gka@r(=MoLtKMA(azK!>G#AaZ zIyxCS#J&k=dc@2?{j5gx9oq)i>vXLq?yeLDRHoRWB{qoOEYK73oVwDZ>dh1o{Q&OF z=jWH-k9ek?OHsOKl>cS3=|>^#xAqv-J#z95;o`w^^dehu?eO=<1EEa%nWC;16-fbI zMM=?RM5b`=)GD0*LTPY$h?)+?t3;8qk$hY>_TJJKn+USHSaWenOaDu~Cb6gJ3po8B zK;=t~Nf%QjACU7qpPv&tGQj*M=>uuYiq`Mk^_MsB0CpE(QtmLxV}gH}lVCXP%$VY8 zqa-UvhJa#IJaWa_z-v%(fcFx==1(xl)}PmyPA8UUK?EU!sF_9-4ruq(aU1its4GD@ zKeS0squuwV5|-AF3ZhdINOosL+|a2nQ*PtE!~6zvM6cJ76UY(YHVV3}G*jsMji48{ zREIQ7@zBUAYdoXjBSwXjyv@cxN$3tx4arGEq}yWeK1{F<;htxY-U7Xze zCZ^Cg>)m!9IAA~qrW4vvAqUZpEO?aIi3(HlXbA^;l2hOJi7;&}7#1x<6&_}{9CPmU zX+er5BY=rG%`Y07EDpBbjwz5+6bk>iV82b30>HJn^;N5x^gj^$lAkV7H4<&Z9>1EI+0en^Rp`iX=EU0el^|5M9=D3) zym>eHR+{Cm?Hba>#&A#qeif$lUAnOdeT4w@GmcKqc=alp!%q)1AMfmUbGG7m0h~Bm zY0vAk;IJYw_{@oK^f1w8mslbhbU?I8M?L3eoUoKH5+zP?3?|$Xb!whp*Sv7YJGXBi zK@S^}c-UqGCjM>UB>gmeApTmT>;&yW^!*zZq%;D`emLCw~~um3AM zXwcjWPfWp8LBkZd+Ab!48NZcsW&&(&eTfKw^3_mN%gWIW-Qi`!FdRwAg&Y2L@@y`< zPext=!=h}zM}%Lzb@E*fbhS;@og~dMtT`QLTi9GtBXRk2y+dM$e^a^ql}Z(UY(5JP z3azQ`g>0s)%TOg)a0#xaGFnaVm99v2725bfwn^{Gct*=GC((An_J7TidQN#T`aN)}{1`obNH790ZUn%_%Ko$^c?G zv&l}p&S^wX~epZ&oVOVM|nC;W;sEm z6oE7!VmA}xlaq26kq?py^y*|djE%D}jSRHj!z&=ghix;8`|B{s83w2Q?q#{lDNBLG 
zh!3te?#jdf1?BUc-jga)Ze(Yl;-u%Cs(!e+UgYF?!-fW1iz0x1Ap46mkk(nMZ(bh9 zxL`9Nd7aMIe)~p5|0bZ2n~vd{%6ze%2;9LBwXG6+4jmc+di1|cRM|M#24kc68{Q!YE%rMU-4UK>iR>8IvR$^2^QdIkiYD%#ND*K4}!hu+;1N|X6PP$3~1~qi4c&Jn$uEnD~vyKg8<@AbTG2D?TpS>qM%qD z6fC!Bi?8xZg7Mytki1OCVMerX5LvyhHEktUa9}66oW7x3LJuewo-lfL_H*C7d)%&{ zarP`n^(r3T9V%m6#2##gYrmI%!u7RS>maU3Zf6d~jKp8tgMxaJVne%zLq+snU>*|K z!CT=)x7axu)RG%E+9w1Uw2Jnd*I=LcR%l`F7V01!-~RFAGT10Zw0qAldAB_<)q2sd zJ03H2>@Q8^I;KSwJrZaV=_w|-X7UFD^R?V&Eb;P}D_2ukgXy`cm8yODNl46CGG<^^ ze2>^xjITPFBqAD`3^{qZvj;uaVscuJ6`7B>YV}VPR-{2c@Q$_eD{Sqk0Qx&zzKw-Z za0lxXc13^yoLgUjYmobP#f?4*hp7ERSNMx?uCEcQYrY-EF`@!t@n4#}$lJJ8RCRUl zEwiKGHa^QNzwwuA&!gN0GzPxf{k*Z?b>ht#`BWEqj!9*tc&9+dlkF)1Ce%`iiG3+y zesJRxb7{aDCyXLzSr2ki=ugH|e?-4ed?RiqP~q>4PDg<^P;HpJZL~r3xp|(SX!rLr z5(7XIszVqm?yj}dH!p%Orc(1$t8yc>4{&Q7$79^gld3TNblBlQ+g+-I4R1f#I)EMw z(8ki)C8+R3#kVs3HT>f7=V_sjb(7C2qj=KD7rV#P8)#3iReR5vyh>tMSdZ(_79RS#F4mlwAcT`8TfNFaGC~4A)6)R45_60sz@$qRm zaIr`t=rFl(M45=u!LVkqPkLQk?Rm~mI-T7cFQ`$d3r2<`1`%irk3n0SqlCf%@UJPu z!((hhK`KFk0Cgr8eT|!xx+df}>ro4hi(Mb&6sC&jgwOF>-Y~qJ(m%jBhagl!*MI#x zJ??eJkzM~Ip3_&Kn;DXc4GnfC5PXmfozb8%0Dg|Z7t?&VkF7hq**xwD_g(0aTg^o! 
z2oZbI|L+>AiS-Ey6}_JE*4-I;2WEtV{ixA2CP+YDjc1+W*`hcCD98Ne1i&pUJ}>bU zjt7~ONp+RxQ>{f=<2Uc(1Dxh;XCkcIa*r(j-hUNZja#=W(xU6C>F5%jN32zV#fzh8*^NX$P^!o1ZJ%@Fi8XcH3(W^x9-BZg%sTt86;XH5}adgwK z=6rGhm41LbYD0&1z{bQpot;S-OIi$E)?<@h|LpvHGq>hGMih|JNO92#@{F~4h(b$& z6lwJd=aw1t+S<)wFNA=g#znTv@gVdVur^gcDNwo7YyNOH51^J4xds{$jyBuN{*~FZ z;YL^1{MkCPie(knZ&pp5rPZ{OG($sFgLE!59W6PU#`KRkg8Ur?vfAj{=J+k9oZ>u{G{(@> zX-aFo=6oQ71FKh1(R$G!!Z2kfK75DX0KtOmMB!7?FZOuwB+0QQ)9ADq;}K|;48 z7HG!sPcR;JKO1#>Rr+UXh`5UeHR@gFuHSiE6KXrEv^(@eiv3j1Eqyg)SfNs~4ogJD znbhx~xe~o3EGS|Qf=&1|da+YRP~R{(mUxWXVvC{+d%uALrQZJbx2v~-& zq~z_}Y|J|1@a5yG@>J*z#4gjp{cFhF2fe?v_TE4704K6k@=#Ibd*qm!%#!Zy+Dxb0{d5)vX)Wz|>S=Gr0TvxHy_?#4Ka zIQCyB0kdH#N=TKwA7NRn&g9?}D0>`1c*3>Gtl{2eD&I8fa-pWs#`aj6Ud=CWPYKr; z0hr0M^1&)Z*wBli{3YuC+V$%)(vlpOwO6i;qWU9WqwBvQw)ocphGeDw=xPWPlt{On%Z>2_GVxB5g9lC8PvI> zLA!XXfKBcO*8|5K>WMr9C0TO{QBf06REwXBLp$(&Hg-i31qtpVi-h5AT_1HAP0c_* z7SgO80gBu(ik31V_Apll;uc*DdEo3<;k`v@$*~~qD3830(tO(3KKGXAl({qWU@|xT z0>Zvb9ClCeI-#FPk1lhHxQh!uAEXRWrlg$%APqZlEyuC?>(|4u??gl0Y8{RhRCsdw z88ZM9!ea#ke!|A)1OaxeucMRfl-6;x^RZ?_KitfVegVa|VM=9HjAjq^$NYqyK05_r$@pUr1~6lcPd(Q;jEBv6ErA4ug%P}b1dIzi z4_P$xu*(q^Obl-Q+xzv~0tb$@`9+tw1!{_?FTpC*SZuBCEKM1i?Xczb8T;h7wD7Nq zhZmVX#ZS99rFXa1+PAW?xpv6zbhF&u$}X3@E5b~3CaJ3Tz7{pYDSljn)#o{b7mZmz zf55K~1HVTf9Hh|L;&{En3Ksa8kfncwum!dlo?a)bAEM)mwteYtA7unUOxg9Ap%@ zkcb%Z3;OsNaC`;6>`hZD#0ckPFpGosPu*UtQ^$@$%stDn3Ez>^ies4OZGJ&~xMS%a z{#4~1hw(KaQYqL$udoZ5zp;6{!Q@gH$-3;j?*cEA(njlG+me1`mAGOUV^dL`FHx=Ms+7VW9q^*ckF1^C$4Hlj~>1D@sL3l&XYcVR2kC#tNWln zeRNk3+1jywD}@iY$`~H8titMMR1CUMR+G@zB0Rh7>kTaAI&-Ou%XRJ{5&FSJ^WEB? zk2WhVr_bL)o5Bq{SM$Z4Ui83@nSmz>IHKYLC>#8~)yn_E+9lRl^8F_7(LI#4VS;t? 
zwNZajt`-lR5oQ=+n!=4lrE-M60w%4C+v{Gcs-l49$#|7|Qgw4^m z+WPf9`?fGxaX#5D>A^bNerx=7J9Ox#NLoec>*dD_mc40VcS-ZpuWrK+I$Ze`-fX9@m)i3>OXRrBLu0hbt6uNIjpHkOww#tZHNXgBgCOYUYU;euIYpK#U)(dNY3`c%_ zvxN{0b4m(vYf6)HU?SsDFMWv?K-X+Tn7)efoJr@BQlmRT+x7)6L8FkWB z092F?Ayi-Py6>P4ky02R{W*dzVo*8A^v}7AD?-pHfm33Dy>`Hby%U%UC$kF z|M{y@LE}~(!rzo$y&nrpK6$TN&4@LI-##ghw>fV;f79UPzPBMOS)Ti7FgusNfWvkL!r&LxyQL3e4_7DxVBDdIrC3`n!EevE?Q3mkWfySlgpjnfo zZ?pqrr#(cVgl=)mAy3G*b;y>N{i<2aT^(z+-g@h$U&Z|AH}qYSwMY#{;P7}>H!FK} zTvoWA-khg{IuaK6?QF`C*nc+sHrYGy(WAp%z3y~AdFOS;VU2?4cEgekx;?+FSYG~P z+CHyei&Y+d>wJ2|mEB?zBvroijVuaM88Z)pLY>uABK^6f<=UR+r^6jZ>2H1Tk3(96 zT^eI69*rrja9qD({0|}ym(^Wrc6QwO;P>l|w~f=io;_jF>hjR3r-2r@srA({bI03_0Aj>Ur8PzhtG@R&;2Tza|qW%2mK2C_SCz{E>GE-0sL$;1A^x5;%f> z=N+2r=EILZJEx*MJ;;xlao^r8clf3qvcGH5WUEBYPp=%-v{Anl?W;O+#@{;au2xA_ zVVMjej#qH^sPrFl+O}w@w|9oM!wdhVZQFJn(qiRDrLxNLF81wngfAn=B?#7a`WZJ% zN|1@Mrr|6=tuqexZfAm8etW0VeON0pFK1p{*^kRR-^vDlF zTTOqa2&Jp*?RJ;CYc8EIdM=qWRl;02v}A0`z=Kt{!i;u#okZ~fNF|aY?g|Lxjlh}r zIL{(R?ud-)-!=Qwi|ZRV?ce`vs?*Fp%jUm$QCXWGS8&*3Rh{pLjQQyDS5s$Bsy|cmZSSupc zzeCyzE(*LM<~uq45rlpv?@7QYERwfrcF2qoKB7@+oTqaAlP6Vgm4E2lppU_U)*%Om znGK6t{4}Vq+w>iK{_dNw^6a}zBJ{K@)-Er}avj~fZSth4?dyY-HrjAwTif9D7Z=Cx zwv1KkRIu_RgJvg$BymIy+qALWY-OW$U9A8dF8_S5O))1jW<>SQaEH8lV6`qNk0(v( zq#N|GHqGOjW4A%?zC2@wn>#B7vA$|t{YhamA@Mrw&O};Nk>*KB?y1~{)2{J^$bFIfJU)(-(TDYTipw*zLm3c2slD0=2h?y{ccg`z^uwhXnv~yRq zZR_}9=1%VeBfGg3w!iznhksCEb%n;(4?|zIYx$<;n><0*SD8LW3n?hF)g4CorN>()K(cxY5U zhbZ+Tkt2p{SxVxC#52;FNc8&T-X-K07lDM8f#FfTecc(S6`-zXep;`)fm-UmDf>3( zUFBnqd_V6_*2)9Jw|HN3tNycEZ=`*YW?9_#T<06WzvJk1;Y$ucY< zOvgbkF!PHm7jrOtzkgTr#aYEi$sGp29N}ZA@A3ywn!xJ{r31VAsF<~5X^Tb_#q)9W zI!s>Ov0hh6DY+SllhFE3PC-FIgmXrsM0zl1&L19xv|6;cH}%6&&M~+x2NT`PA0={?x3(X6vUU+2UsEqU24@-b^{K}lAyQzONo)aP}7 z9B(R3G*z-W6z$)>)3IZYZa?2WZW++v!b;_`t9hoGfy=%gX`Gf8TXNybRH{kgxzLk| zuLn`6?)L^XgVKgW{&fk<>N41EV{&qGexe%^C&>A!@DS^=`gt1cEV`Z3I``7weOsFa zeR%WqjMM)6r8mcXuXt^=Q<-i`rRTod0sS59HEUOX^8LGp2ttFEmvrL44ZGmkHa)ph 
zr&_-wrs3m8-*{p(EW__t=e%xf_T+{aJ?x|Mxb*e(z?zRUqAM!R=36`T-qk5^%`Xy{ zN=nAksDcJywUomOxzRN4@My1IvL`T}NC}ha4pK!8PmiZSwdtz^7I#o;P(@RsM zHa2cPu8HThjlb@?bw7Ld@bi70y1y5B)ENxIq*Ze-$`su(Yec=_Oj8}=jmZ+Oo% z)dw58za7|GxiQRg*qQ_@Wt(YBm#Q2-y((GLv+}8B-{>Jz7FE9ce!Q*Q++L<$A>GurB@S5HV{TIUfpFVL5ob3!Qzu~wjhnaE_5SLNB>K-o?=4Cp z8n8#)Vo@xeUYn}%-<5Jk5+mN}3E`lf7ozKvmqIqH^0SdT_U&x)3_PPQe>}5L_0+dz zc9zFB+}St2NqJS?w!(sV^&1O5JX>|eqA)Lcez>W_c+!PCLHyE+=?+VT!QI+z_-=j; z8ef?xC-Vddvezun>^Ef1ua86dW61b}kgyVR&3;=BO#wJEzh+QK(vFRR3Mb*nKe>+Q zb=G>$IruI;YG`F$j+#f+Z{^hVOLN{^6utZTBe-zH0#(!Q3IUpt{xi!j*bFpFVgOqH z_C)XY3LjGDxw@Vr3<$x)Sg>gc@RNq3%h2thF)F)mij{>J(}K2^o@Z?PfiS=C9G6egp>3yQ()YTQp)a{1PQ1EREGx z|CYhl!5|2q!Sy@$8}42jq@rscW2|^~>8rc^4m|X4o44BfJbiTpFUn*+iU@`|mH-to znBv*|CuqFHLYA? zL>kgf0}K_xe~GC{WCtvi-QO038L+0%JPs9AtdqS~tMgzXBIV!{9>r)YMHzD#q={pN zqy6cD-s~-n0l2wH*r?efhc2~0d1i6>ij+l>%e2xjMeTo)UopPb`~CV(xb=!M)T|~3 zY5w{1;naj#X}0GsM{f)D8bW&}`hBYLCf;#_Uh@rs3ZzCBAQ4@K&G}_2q#XE{?UF=r zfJ32u=iS{O%sB^GMXK>;Fx?dIs(GlqcXddu3~QGk@&02!y)f_Sw$rBN5A^$<(lOX# z$K$z?dSm8KPH5aJ#PzSQm(R!xiEfr~i+|mVJB(kq@U#q}LzlJ=$fD+NLXwcbz-)&( zN@Y24Zx3XTKqhwgPUmX_U&xRNV#7NoU8Gy%T0qLs1007_9LBSRvU9x8{Z*9;KfW~m zJD*REEB66jHHOsR<;Z0hB<4qiZOl3)Q$HyQC5I~iD=Ll$ToLi~#7yU8dL0@FST=|6 z12_s5d-RAlpOM=8i@R9qX~lSbn;avp63XDaB`Oy@Py+Ax*pi^Z8LIA=6RSisfSpFB z9>8HU=dKe!Qv`UViT7A;RmEV7WBz~Z!1g+G2lSe2QOpP;AF;VpE0OE4EB^u?9T40Y z^9`m%fUp4cU&?7o=3%lt`CF$5y5ao0jniZ_Uw_Nuzvbnsv*M=**&LfTCP8`3#9qeD zTps+bYikntdd^PIy?TvD)_wotchn^}=)Oar-oF;XY>Q=>r2tYg*ml*=ZwnCj95;_u zK-?oSav(>nyV>_Q{kcYlfg|TAk#Kp(J>yZ<$k`z39b5>iua^2=(yTmluEU6QC^xvbxLg8q>C|M*&E6 zwHdc3e<)K;L`^|2!kcD@_rCnqzz-7ij~sj|9)IjIgA<$o^G}8_Uv&!+i~?LdxwvZS z}_3qhjXKeQ?_t`=dlC0j&+Ff@TzIf5| z^KF3EMQTpHs#w8sx$Q<{fq6kI`9D>D)Ba;aVWI=xH)a+VN5v+rgzbM-@9-MHd!w*~ zlzUgTH*<6Ff2ycSa5rE44xSTTJAHjjGhaF|66g`zz5NvhZ^n&dUeNUWVg8KpLuO9j zrNfmF5G)%<0Dc<({5=$FwRluHb=pT7xGX2QV}n&fznVXYONE%ckF5+o3?A#S<7_TV z5LDx{oLiQ|u-aU%jI&CB_rLzK>2VNU>hgDm$vrki4N@vn(HmG5-OCoqXd9FDpO+O~ z8rgqrOyL%@ZlQzzAupIT*Yx}SE~vX8?;_-9Ppt9uP^yfH~M 
zCz0BuB7kW{nYdX@t262P3d@j4Z{}|9(1sJ3(^rGMk8MZ2uD77#$fo$e!&X(aZ01eP z*iIoJqi0^*{*$InL$om0$!Qpde>74721l7u&VRKS@L6Fe`hI>(8tBeRgGBE!PKaG4 zLUE_g66W@ncIEFM%AX(g)AuqbJi1GDr}OEPX)8ou$eDs8MFEU>4vKm1T;*XiT?kC% zcHBI8m(@dbjFoX2+Yiei9+W$goc}!CeG?uf6u6bTrxjI#c}2S=uoPj_azUy#*EUey zn9ugfqTBj2`B`>$^TGEHt7%Vi4us~r+3C@{`5i~t9uF}X)-3ysx5u8r&0nr=71*Lp zx=(WG04cLr-b5T}KKe6{36)r85s-a;{IVwdiIzfus>3WsCV3#lbNYL!Jr1ZSilSZ2 za)UMcowI5mB1ck$b5?zLkT=(N8!=as+C zr(Y1QKPMi!Z;S*4Wt#tzS(iqu6tanUDZ5*>h$64BGaN)-Soc@NXP|tXN7W#*E&LDY zmq2Yvr`70DIzu&+0!$c${qHmm6^@;}b>f*(zjxF4AfHHo*sS-&(h!smsL;@Vby#>r z2Qd<}PDGjlE2-G(6Pj5rh3I!P4L^3*p#VW`U>u#k9$%O~?2ColDcD`JgaAiZM@LiO zRG2ocM_WtrVo41p;|Ot;Alz#(=3#|N-ZYd~5Az(+1-0l^-`<+@LDUBvqN2(I9|Y>2 z;dV-Syb;6}<$}Y>PaecRh|dQS7kiE=f+ifpOCR~J$+Ko1TRSte{&dcEe)IEVKM z-;b~I^o*riyIzrk2%Ro@4F5K~F5<8up+uZidhcU6qY#3O_`1xH)~^g{jC#=fZR zV0i}n>Q5LqjtRL8!orM(sNY%b_P1dSyx{vx8@wVlbz|v|we>?z4EK1mHUG9j@cnI* zduO7gAsxVJ@SEh>7jIDIPlRAV8Qr9*Cy5SrSJp`6&)DF75UcnXk)s`J={9u0fJ2j2 z`ZdL70~Z$S{@UEsB5e7UNAZ?pybk$z>c8@H-3FC-@?{D4yecYB?1k((xb6mXASO{i zHN=k{W4abM1_>%r@!!zI*WJJSag7qRwU?h$TnNj?s2g-9&6wLL1&I68;(i`_}r{DY5 z?}+NZ@miCX?P`CnYpD9&y=Z;2hr@<%dB4uTu1#%iievK1o^OnJvLto(uGWA1v|Wy4 z5_JWYqbtR;Y+rtW&&8wvk$$V2~D7z#!Gz2+5?JK6J)vXa6;uPq%P zM+NsipLP?4X;EdRgIb3vu}VX8GW{e=M?5juQch2->GkEO`LU_pIAERLrQM%DNQC9S zpEH`uq^-q^@1W2XtpZ*^v{)iYk<<3~OvG`V+%k8FkL5OE*ov9O0f+K~9yM&<-0br1 z=x*-CjTXIXS^YjEY1SCG%Tv0jEL`fU(PX`b`acVPn1*NlU09$LJ5EK#>2><1314bf z^QNMndY8A^*trHwq9a5uD2Bu_L%NOaQwCy;iAg)G&F6S#{oa**hLHf!&+%{FA2x;= zRI|ujCu*-7_eGBxwXo@Pzf&cB{QnFhc>nMWN(vjUaf>9|aIRmbfWlLHqgU-E%2Fl( zG=ve$Ce9!lHGq+p=!`J+H=8W>;iA6=K8xr!H_45Bbrzpp?T`Oqv$x((+j<@h8 z+y$u^v8i()%>Y*qZNwIkX3<*EDzxx86czD! 
z=76E^9A}AgLeC|4Vw=UmZ*MTh`Dd$%=gWde7C_L_=-j726PN~9xC~Gtngv}+$B_-k z-=t#~h?Vo(342EJ?3(x^!&Wp~zkZ`fd3m2V$1M|jx43d?+;QWEU50LUKi_}$MqiBF zYHC65mv(6!nmxSZVAEz>j~5(%3kN=8j=TQMS(*7UdQNliQ6{v|r^bMBvZ);MXd>5pTQT_<+zWUf^8s~m^`oyT@hEtZYS?lio+xffb z?L0Zc!`QG@QsTHPi&Xp?1&?uZbNe~5-ZQ*8V1t0w^??Fj153<9EFh>lp9bILwsAG4 z+7z)MTJ8i7i3y+Pl&q2b83}8@&Mh}D+WN-t_Rrc5esKd`JG+`YTn~L!eE6#3Vh#X# z%*6;SsZ~Up9@~Q6B?jXeuJ(4VT7?|DY|Y+gPGD?NEpJYj8I|l`nOgvC<;cDHV|UMX z8|P+kH2c1+`nk~P_U#j}fKpWJIJiY#QE9Qo^`J=tMCP?Xg`+jo@`_F%3 z?mWj*>yTIE+i1h%tId+5zO13gm{)A{;lrHo%Q}^0KRc8kX16pI4xH;|NV65J;Q(qF zHg14%j0onLBa>>eJ6Kt2071m*t!E_J4wkoyTd+m!bN4hmrQfTAvH9;cdwq#B>Gpp8 zT{34FE%prFKQb3v)jx3|41(aWp4S@594HiykY59@4AHl7-r`f!&gk5+Hf>Z)eJ*;g zzSO|Y>G1TQemx!UH0=@e_SN+*#h&8}G_=jU*f;ZAPi<@l*z?;99j>*ARWf>KxWeFjtwS4&md(yzd7d}TV&aIo&(3EJoIXdJ zl!4`#cWIf4cZtECARz661tz2lC|Im~jG$sd>M=2|Ge#Z>I#Fnw`#w!9+@k5Qsb#DF z53r=>g3cUqpA+0+y?LB7Z!7Q>+CDbnF+L#orwpK*uM&H#4Tm($wVG>nr+XFS=M5Zc zt@$l-Pf+OKB&&{=85zaRZ|1i-yF^)i(V;_5em)nb|4c9&8>BsX?}}IdEWS7T_gb&Y zd3VOGJZ;p_toT>g`m^iZzP+MRqj7WHY^#5~e5Wr-F8L$K4;SMU2tOgB{rsBJ!Er+x`ML$QM}c6RmZUlB*Q&N}wsBpwAe zM<&lRYS^(_lhP3lckQ}fmALr-(DmK%T(|$f4GNVcA}PtHP$=1ZRz^k$S&@bvl0=l5 zQHVr{>{Y0Y>_nx=D6&FUWkw0-dG$TNbI$K^&iUgv?)#4Tyx-S#y{_l%{#)8tX)6B6 z%uGRTY#{&sd+lwHyd0m$?c8s|JyLXS_PDzFLF(%e0CkPJKRJSH;}Y&lLWu4?1^Cg--3wN_q2EdFtrCbpW1p+{|Aa9WVyqmWas` zf`$$FUbtuc3%o+BH!k$bJ3Ju_sBUOTSJM&|)1SK|E$#>eFaZXhX^aT6tqub0)B9k% zxpKC?p(X-xZg?D#`aI?K);85Q`YHwn3T#3g!AK>mZ~x%4FLsmSeo2!zlovqoM=@Tz zwqV{@{#^S9LdEZn%g~C5H15*+^WJ0JkQQ1{2r$&yRDvMie@#rGP!#}iCCWqHsZqvS zh&$4dq7BC=KprD-G|B4_WL^_FH^<ANxNr9f3o^`K6B1QYITAlth zGRnJo`ADzF%&hL=;uA3zDJ9ez0K8Plh&8NJtMcO7f zL&oE$G<%AuM?pqWlt&@P6;Q~-kTs!L>kcVRg)$xxz7aN3CLyySq;~BeoAul`Ft8A& zL-$pcT#I378Ewyw75$+&7(Iw9BuNg7UM#;oIuF7sB_%mo!L-!LYFc^zi+hT)Ts{*Q z+G{M0Je~=E#y+|8y=|fD_wQ`q!(u(WsgYL%aVFGrB$_p|$1tT!91|lxnSea|z+gY9 zV_U)}KhMAJ?tRX&l6z8hZsKA$H^u#4Pt`8tiO!I!-dzsNFW-G;>Ahw$SFE8{P6b;t z$d8+2G|<@r(jtnrkM_wW=1IwgJ(>6A?)z4W) zBX75syQ*jW5HuG|c&8<^O(p8a4dJ|w-0kLvU%#$VU!FTzm}glZC9^)LH5iNtuE0cg 
zgN=ZcHkNl(l+o9>;5(C=Et=&&@3lW@ndc?Y`JT+&(|4a{@aTP}_u1O=Pfx#nu<6@7 z`D}v}Yv-!#s+wFl%N(^829-$<9(GDtSFJEILHQhm`2=OLC%|9_g=}X$*_j{GKVKWa z&6AUh`d}L0$xzMD7cbWL=mia?RVBvjW{UFrVwF9_R*ZQydhWT@wp@cbqw@KVWgEmR zKtHANE~`!eu^>F+;?MMGe+q7ni;IZiW^Gx&u`c7Dd+#II6r>-&@x*qZUq0DY(3^rY_&ykK2=$0dilo7SZNTv( zMg$#=!#nny3tm8EjfJFCkV&Z$2IF&xi~wMurIJF{9}rD zB6yuaa}l}b)q+R;j6)4u2JYSr2wKr$V;gY2wpu|a+LSL;|7AD#F}fS&ZLMcM?2E88 zq^GpOIZXo3h&vhZ1%$IiNb8yU^8cM?7I5|0|wQBwYxD;Vj2TIB*R;pXJ9m_(M+bz zYzvB{^Q1iw$sjaXlm;#My|?^F#E^78{gis@hrok+rI&&Z9;BtUG#g7+)Xv^UXJ- zghY`0s|yLmB@UH*AFs&=zNxE|x>k0NdEmXwx$>f+91Vsea7K``AJ`3Xg26XKo+R|6 zu<5frpf$+fv2ekxyxd5o7>rS;G=)6rhQ z?cV)I4?Cs1?3HmCZL0GB!H^3-S(#-d>ELP7O`A* zj`b^lG$}U9OPybse4Zb=viHcPz@1rXSNA`z6R5+Ki@dN$h-SOFvlE{zNuvRV7PXzZVgr+urvXxk)3c%?vkN8^RVu(mlD(X$6mH zGRT5O14m5&4JbSDVIX3^Y00!(o4Lvdv~6x1#Arp`+nHZVP0v*!95btQSdk_@G0fzWJS<)@~P z9&dUz_rug;>c|CNS9z3O1SI9vZBoK;f+B9%qJe;cD5xLZbL~S()2t(GE+oD|Cv^j^ z@d!wO-(w6w;`Xasbqd>y(&!kPQ#Bax9aN+^|GC&}<LK(C$j%nP^$SSF|K+eBey$PHlE}!nB85a!G#*DE z%AptXhoP3VbKpe(Ffk8(#1u@;Z6NXx7Xg;JK6)1$v(M?2U0#vSpy$Fy{0$TjaXUaDh$>hNErWza zgX9Mrcp-FGh&#Owc<9TQ2LRX+4mAwx^f!rbz?4Za3@8H9u19EJO*V!oURJzVXi9zp zpTUjX!QtV061s_}4bdUt{c%S_rlDOyn1tK-Tuo9LYzxr#xs0`Pk$4XL0^}lxH{Cn0 zSD&tBWFzCrVdvz)_;~(=PZeIu;p6uX*1V5x;?fkt7h+u$ku0FLHi$zW0a_wR1(2U8 zAa8(q2`h%TYl8`-+7BOX-F+SJF{=H@wGQf6a`lFP7CM5zUHus+Hrl@H$=}mh`&#{t zv6dE7wF`236#_KyM2W-?`UFeluoBHVZp?&${`KRFX2bUM7#Ek%0*uqK9-aio zd4`A&&6xcbcx6cKJno6~xUitVg#1f&$PA~zDYYxHH?M}A)ggn&M*m^mBa!9Q-N z$_(C&-Ip@DJ0@hdU!$qKB4APH)osf__vV;|rMFyWGBBo-9}sy8#JYedL8{zgnt_AT z2Dmuk2qASL90~5TO-Lj{`wmVuAqG5B*sBg95D?BtO*~A3gW<9{ta;~R1g|ksX<)b} z9Wdm51X_qJy(gExNWd3}>nGv7;9(BL#Y6-n6hhjGxZHfPFjJ2oo-?C@Gi$Li+5)vB z^$gHgNY58<+ooVQqOPPYAI7ra6Wbgxc*P*bG9 zFb3lSc#?CeW6YIcS1enNcXO#-E@@0Z5w~yTcY7wsAC)rKNTs710$3AxK5Zxm#Qjkd z>ls?_xMZmr2)xJx05yIR1QqhUQSm2^NFXlnQA!ouRRMvb+MpS?GV=Jx&~AxANdQxi zhwp!+sJ};3!vS5U!z_i521&NZ@F6_bN+P*M2yiBJFJ7}uvkD1(b}%vqfH{UA zl%yQuZAYi`!s88EDXA89RN~&odM~6x&4d^_*>ci9PAFIT{5Z{iA(p4X`s{|`=3Oo6 
z2zP;}_jgIoX2-Vabr)YR5utgl6e3ZCBA{{HI-6xunC29c9vTQV z(asW&HWm)>+1y9@6?FnlfI0sY77fcFNtnV22JycmPD3~xHz%!yYYK{~Dr8R}h~cfv zPf4V;A}Wa9{gj);whx#+?%sQwxIFZMiau=<&Tr&5oxrc>yR?jViWZQh%@gZXJl*iYaCyUModRH*Pyv zYCPDUj1h~3l4#c?07^)8)^!qs9*(GBSA-FdAu2K5MrZBjyMk?_j7P zoNdCUjd8^*#gvU{qojLHmOj_zE_!7--$5(b4{}UONsF*B&|XN79Ofv&C7b|1L|kN~ zxD+_AN;SnVdQ_F5m~!wLB2W|v#*8!~j3|)NJJY2b;7_rQ z{SCYs>);T8byxGxXU%dsNC1fK85{`$94)_;V%jF@yZGn1pM6O}QpjR` zL$?J%Cum8Z$z1S&9f)iqn1_B}VnRGoaj{$Bul#Y>efF}~%15^ybyXA3n&^O8Jmh-h zP|gnNxVSh{J`DQfVY_|E#g`b5A%Fnp4oh1~ z=g5(T=e>{RXwT#I#oK|c?MEU1>Aib;08iI^`t&>U;1SPz{~8@M@sdO~4^% z;@^7=@!}JUB#z4PqkE?2D^{0)P&&G}c;$s~PtSnLR=~Qo`&7imz4h99CJrnQV&N%A zG8gvpyBLjRoJ^L#+CSY>Ra*&fx(YM>xGfOfIXVg-c20c4oW>tL9@PEqpMM{t&oS(e zG~QBRB_LL3@VKjcN$|pF&L)qt%K^~>VV%2=?n~%Wzq7~N%rJ*n=^UroW`t#8-R!!x zaj7GC8_pb3ut=O}koBEx{ygh-`+j^p>6m0Pu4ZOo35Gt&VD-E>=w3lk+f`@=D(ueI zvwS!HRYgRdVJnV4bSsTLSGgp=R?}~`!A9?@#tW3row2f-RXfZJW)D|SMEww99nSV= zWo1OjP2Bu|&hVfxa)Z;vbb|d1e~iZ44|q{OS2%te9Z2%e&A&RTfM1#CT$=m`JwNT&K_H zA+}psM{f49mFVM{D?`aUAOcEW6A`76(yx;0$R!D+hOhZ!Z4_+P7>S9jXJccQVZF9u zq*o17!O`se?ZEvY3Yh`av@3>>fd;P_j){#Pbp`LKs0Cgp$*T{4o1JJp zld()}B>SI}kF}VVv)Ueqgl37}o|%UNFOKLwp{Ih!P7$aGs^Y7$rIDcvHeEbjSx7e# zfJ!p95$Ra@(N7a(awy8zU`Cz7GK=*vDpGol^OOv&Y&M zK>T8;Cb=;<&Pk>|!q3*gp2~>t^9@Ua>Bq7iMN& zddUcW=>`tzC+eD- zA{i6vY7AmP1h8)&^8VvWI9*Kp_?%oPyYQUYaE-zuvjrtW;!lAFzzMb5wKC!7JlS;+ z0Q%;vYs$xa18>}A&{?@3$$sTNn4b9>Tr!9Z4o8zCK3X}dK5s;FV+(Cd(_{kC%?y(a zY+(f8haH)NJw28hh5(Xp1Q`oLu#$nQY3G|^UmHd3qz~Dr_@hbjDEZwA3aX$s1RePV z>O!JN#+w1b)aB%a*S17(5)q-DLBn$Xyj`D}wJ};6w##Bk;y1*o=mfna{KX`a7wToe z{U0DeTmuImH5Jtv+@UaUsuy2|Bp6BgAi1{yH?S*y_#v<`nkB42Rc2W))cDX5ri#zV z8lzk}i~Fb~4)^`0FYd~svJ#RxNUpJ^_msJZm=={DdkzWkfx!pQV?Kh}sr6Kj+`xZe@T!VGH640K2AvTN+lu5bn5yRr~(^ zV&gc@xkF$A$^3zkQ7k#3C|z}I!GPX(qV0W?WPS+KK)7uG($BP+!p@!8smX;3W9^o- zpP(deeyhNe$KO1NgB$S}n8cQpA}eEkAzvj26R-px*h#Nby$feP=m)XaLQshYP-1e& zhTM$=2?*)gA!}9`>`m+E%>b|t6(JcS(OjjQDYGncs)^Cb1_AuT$|?a_X|kHgRGwT z1&CdYYV&3is{)5E{uVH*n{gSsI;g16eMcu#H|^>~|CcXc*wlYB#mvnu9r}*ZAQjKN 
zC0CD&eOfdpCuc_g2FCPDkuhorUW$XX)QOB-d+=b&RtaDu3T z#Os!A_8|@ne~mCcYjtU}nn2q^eN5EB%F)Y!BTu8O4^RTrV)3U>bh{4(y;GAS-a3pG zKt*Jn?&BBRd9;WNA+nqpm@&Gyw)PLHqi^y=0-bl3;;rBd*>%~P$u+OB5J05g#Tu5m z?2Su0c94Aljp{fin1mE!1qWBLS*K^~@$8+Zf#L=0+$8dB;M^7V3PjxLF!G0Zu@`#(eVGT#nLZ#x71{1#*;dT$yeLK z?bVyfPpZ*eoK+AyImpHL^XPnj?s+ogmMZV`^jo8sdunPPY9%dk#B-Xl(F_%gyGme^ zLQvxGOR-T=x~<fg#vTEx!JM4nGx5u;tf=cbnPrs;UP*nYsMj(`ZcyM*AtB)mX)1`}AXp zlXY`rDfV*X=HAkL5w%x<7a6rhYxNg@{{3a2r4- zfJM64v5-C#rrtZ5rhw_(ob$~>vD(?G>erwo^ZnZPVJ`%dyFgV9)dUH0+BTw)9b4~M~m;QEaQB)kEm4}y$g zMBT;o*}#((=*s;giUm(5|=SWrP}t1^XGY$4Rywp^<_9A%LbKbf+T16|sy6#Eb7T1Is4U z6(3Of0w0BcZ+zY4S06Gk+agcj8q-Lu*SK6l*29KFas%+szlgwt94HQx2p-;R9QCL< zdWBB{qvb&)5s_BPzlcfUK2GkMNRIc}hU(_> zL9q{@2C7{8AxVx|5+D*?kAZ5ttSJHJ#w%4@w2B0UK%FLz#F8^vvKps$hIW z;4=p{nzIPYC4DzgcCY)cna|{>`SyM-5{oj?gYf#aMNsE$oPb2=0Bj0Rd<3Elasc3i zv4U62>gUILe1HnSCR&iXxc_KQiVK={nNu6ZyLM z*&+KOo+Kpb^5Um~3z}qq;+0m$=@u9kMpBPS@(QV3?e9MW5&9b_c;L9Z7|B|T73JhQ zSKXwc?kN29{KDdQTx;GOW54S@uuk;czPG2NJtT0p02}~k>NK6Py|IJ5ia1la6_H*|1m6W zD$WOffZY#Pml2oUv&H9yE&UhYsHqJh6oJT#(aW$Eh$_GWa;re4FTfH~^xJ!F=_1S& zMDBof3-F&5eOStNV89_sFlhS_EcREE5&_Bw&m)X{?i?mukltfV#0!K)n+i~aH(G;e zoB^QH*knruL11oqZVVMsS{>Wfjt3m+^Z$@KFr+@sx;mN~Re6Oz%@?E~fZD~xcP)5-wP5ydRA z7%o+U*G2*&$H9y_*KKac)YSI!@TiWPTzL!W3?>WcNgz-a>hKQ7dqT3*{>TxC0|sW` zdvNLP`r*C?8yT30^K)@mSh1M{K~be>L54~JaeVJj=r297aiUZ4`)6dt7n*SgeN#d^ z4+^5Xzw{r9_9K5^Vyop`b`>*1Hco2@cwj$D=6~3y|GsNwph5 zxVuk4U?Ov=1xly!_G7JW+5Mwb-6irDCR4lX8-l03D=SkRirvohjh_09i%9tLC7%ya zznBm?E;xv_gTK81>GLmZec(l|42IZUR}3(bg+K%e1(%76!c9?@C+H>ZBL3 zt_z8{O5AslM*sG0z&&P*DK!p0O`wp$xScry?yz(_HJfEl;5<7~61VrWa_Kgn5D}om zG4^5E?(A7&Sb=S+Z=nKE5XsA)Go2A7h8;Y`QItr*s~?fLnsyJYF;XiHbC#~YHvm*} z2G5d6qX3*(oj)G}dlQ1m{Og9Cf1kX_hYnUUlEl5R)usF?u+eIqTqM}HV_5?5I{i`nD@%~*tcwghcCGr)2;%n^Dw@y5^f*%4}6$$ej zOXnJ%p$7_@VNoST`MCmvyv;#D)4F}F=L2EI!7c|tgXwdez<$gyNJ4~Xtcj392yDW% z|ITo3;@m4Faw*`YV$&IIZfZJoHB#zLO#srcAaNdAkam=Nb2|Epp(YyvhjHt{g0oEG zLwgAkG7)<##vGhdt=Niq_|=+*XC6z0K@OaPt71zoH!A}}9XZWG>j%Aj=@XQ-T~kW- 
zB4m*OuInq!(T-H04_*SjU0ZN4?te-ONT-l;hciO!i~E%;8_u0u80|K^nkR|G`g`#Y z_ZAfK|I*~p@dvMX6Z8$_7xk8X?kEESicU)Su#E@6J^{hP_p9c2`lK%be8<7VXI^>L zH=GLv&#;;!_7<^ljJU$rH*3b1{^9ROP#d8F{Zr&D8i zlP1_%67UE8AP{ZMxOT%eXwFHFB= z76%?f<`A@x@9OyWi62NX4p>0K7+O4J)bm)r{(R&|5-1Q>=Zk>a$VlWKtZM$!xa&Mq z*_~uyGcOuEza3GaZ{RekL*_@VEow>|CmTUP(F2f#LuD_84X)6Jyw#5ash+t~*Gd^u z2Mv-`rv@SN)g^;tjCpW{`^43-p1f1t05~o*GcR`sW;F z!h`ysTUeyx)!XeJYA|LHf;UF#oHgK^1f+?RUjo6G9JY-9xg=8eMlL~Mvq=VTMAk;mJ&&qfPC?j>Q+PwaY6Gf zN75$|S`T+6gvQ{GKwb_KCUR`YNjO9eqAE9vuK>(#i?% zvh~wEEHxN?rXb=iM6}}H<>lYdgOFwbqb@%>G~L5at8?@C;~TQ83^tXkUMsSG8kC@7 zZIjfcwdg51;PE0PG4d;czFR%b1F!M_HfmT~m{Z`iC$>yTK1iKG=H^p_&l{iU3Ci;F-y`=`nnhUoBu^HtDyenZ6-cA0LMt4 zVGasl#Nnma6yrA7u*K1hah-;|d<8y6G&AWF81U@?9z8d!+nrlTU`Zzu#xUBHj#{N8v4sI-DXyEiY@yS3>wE?{q zt!)Q_0)=S{G}VTPm8+>~JdWW(Ww|;1zb*neZT$261=15O%M@?4d;PEUbnr%Y#uSUA9HTu(e^;A@*ac6BS6ekuZJhlq)>UrA z`t80$jpWZjtElGsqZ>9Vet9vtc&XRl=!w6}=gJE0X&wIh!Y`Fu&YjzL+%PjL!;s~m zf_VHEma4|c!R3)R+r}z2qWf=b^{>$~j(@hpFrS9UYsPwDD(^I({quHvSbfVY*bmVQ zo~Bz8JGd?Jo(AR1W$sEGYdbot>nKhnGW6avK9DDWZT7U;TBx9cLPX@#_?wGt2DKL7 z9@+QfLrz>*L!DBcq@Wpt^p}MnzBkqno&3(tu4DfU`7gM=+KjcPaivfRvg_PsmAOhG zAh4B1?271_{I;O{R;gQJ(x&&=QLq+=0gaw5>AJ2_ET92S?)f|osoU!$b1$pmyJ$8RYOxKd4M%H_y6&3=5mA#o-e)eJbK=Bc#h!O6b3YDT z+f(<&RQ&qZwkMAlJ<-(0X4>(h82=wGD6Vm(M{-OBp3ctySxWAScEy@{32@ zX{5LZ|~3r;trdN>bKu<6nh82i|;< zTtpz?*kS)8$-k~y30iX~ifig^SdoYbD)BSJ;Y$ooX95)J?oPA|jHSjzc*vl{0wmei zY{UD9Jc}Ba)YS0d9?gO573+$yHrM`6tKv(8HPY5S#8|SKRXhUqjZjy@ntrTu{Y`UD zJlXsA(Zv+9`HJzutC#W?*ABSG$5G$7!X>hWwsUhcw{tTxq|x&mFvxm7E32(tK6@yR zm**NY4-cF`Uu~S9J6@ArJ^k^8OqXG@cKKlq1s*L4oyWJgvt4z3nV*T!ztY(`2Jf7M z2ZgVCR14&SwLXlq>o-73*m+YIYJH`tgLE6=PmE|j@cM18y)f6Fy52n}9?(1WHiRi# zw0})pz?#*fLbLI&4OGc zMj*#%9eSj6V!UBXU`X>`AVOf%`^=F!t8lkDCg=*w_4OP3U$I#JYbKsndZq8DQg%SU7GGBUg|1HX7>WPjD+_nos%Vy(4)VF zBp|%lYReMhTc=b|q|pGt>C^+~uPPrTKfPS_u8-SsK*@zIa6f-wJVj=Q${-i_5hhD3 zhqk;gZSB;PlS*C%diESTp8*ez;AXe?Ka3k7O~ck(oa{QB2xo&^YkHU==Qu--21lK5Nwe0fnOhdZ#uk!2Xj?(8dW5?bQs^NX$zyY?nLY_OX 
z!Q*NrDF04zy1c<<);iDJaG#g(gP~zchYCtT!G&0(Q$6l_T$~!zsEm5{?2ca6$HC0* zChMs2M5Qs+G+kmsgJO{cB)$vRR8&{Ht4qIKMBF`Z-S7G{cQrl?9=sU;Uj8na9sgS) z{mlS^p`Lj4<`{i@gogwrKXplIsVeuKZLF-JD?ZzEFib;@a!Puy^Bq|8a0#qbV8(QP z4E!k0`P5;cO~K*3q-!05`#hId*1|EvTg z>;qkvmrkVv9zho-_`d@{9<5*phpwJ+u|njKwUUzfnR9%UCY&0yfB?}pj9A6P#+Ok_ zMZi~Fg-8b=#Ms_>%aGnre{t_ihFl%}i5w!8SSbwf6%l9|sXwX1hBh%aebOh4WEnFrTt2cQ5nb`X9 zgB|oR{{q(`8iwSkRsw(#bgZbGq6ZeClHWNCW@`KH-OF!Y@!kmv;tmS35I-tjHT`vJ zzSHtTsK3&|hyQ|yhOsJ0)9<5i^O_u?Nx)mslQq_!%>=jylj@VIo41d}gv1&`vG8YI z#R-KJUO;QOSr71(M9}4Am@{`!HDT z@b7uJxzn|mG!R%dX|$7?`j=KabHr}efU}V$HzYXN^6%m_ zV80Jgag*3!u>DCBM`_ByLUT8*MV3yyhybtw+KqleK$3z0vG+b31$0D8bQ9l*t% zG^D}5j9ro3t&w=DkXC+>dusviX9Ui-vm51@2xpT$0h$|m&ZJ)y7o=IV*N{I!g2rqm z8lH(Kr&?QWS8ppbj4*vck>t(Ykf^4{upVv&(hd)(N+!zaGwus;))Vs^LaOj)R0HLC z4Xm1NSHCCyW3Bt^iVF*by+=Cnm5gw7T4PRt3XU>lqcK1DHlM%K8y(80H-N|5Nc16i zcUtpIB}p88#_|JnT*15W`kAM4VDJlBLs6MMqAcw6K64q&eMK&h9nb7K?Cf;o@?!Rf2!0sG3Tt48SLZ&cQucO&7I1R=9-3r6Ejfb5 z0s?q?{hDM)p8=+CmRwxncg4iS{QIT4y0jtv0Wg08VkppX0lnY8lv(Nyf$#EwI!5yk zz1rPYT3eS7}vogM@TLMaEh_Ga3KNBr;oU~8wMA9y%hA+4>@xNM;6&MAF^(w!_s*L z>1J352);0)SkM?XH8n*ZE%D_5o=3)Bx}|bx0wsWu6#REmYygO;09awIGZ;L)7K?+v zh5FEIqo@+3j{)+x0WJU1ZR^zxJHd19kVwFj%+8R=Zrc_bvF`gv0$x1N7MSckK(VN( z7{Gh_ZM@}-5&l>Ak-l^L<{UsL#PLIrZ+Vwm;2;!q??dv9pDxEE?kukUdMB9^Y<02!rHl9+t>0c=zlWmo+Dc)jR+B8p zh3vHG#p2b|F@aqnoFgS%6ZKw{d051aKb^=2TfSj1P0ky%5feOT){&}3GxO;uo z)q3+57fVQq?g~?a?0^m{66jCbheP`M`jN;JgM%wQon_s6i-``_&;I^YHuDa_oazh= z4BFb-*TcfznqT=DKXqcmjpTQapcXv@BD%>5;ou1m9^~&_mz9#@2e~5t0hssQXs)j# z2HM@5O8OlFs?UHtxdoI;qFdyW7|;r0WJClG%7qe?S?QnRvx!NaJ`?iUo~ zMnjUV?Kfl39ZXEbM1>yYl~xHFmdB4M@=iw@s^#VYF)$jFa#Aoj_V#VE$&ufyw6xBn z7uGm99F@{Mnfu~S)FH+@voE7CgYvGf90M7LwbE1yz4~TcRy*>|`|J{xm6aJ78Br8t zKH+gJ6y~Wfj;?>3v-#{01-={j(WvW-mw$T`+jpf;+=&D$i;_8=gd7QckfWnr89A1s zEU(|bO#^Aw`+9EB5~$v7RwH>VKuD@rnkV)zPJp|C2FlX;j#J1b@x2+Cd_YEl9O|2C zGRx*TUu=x6GX$*~e0(f#1fo1lO#DQHk}L=c45Y_WGh@d@y`898n`bX}C@`UE@JSZST)^=PF4XMa 
z`qcpD%|&sS-aamU%0NRi)ZR9E;gf4;ClfvWP*>3U4UHpZm6eX2rp|#EPM$Q}4S=sMA{1rZ(+W0{%477Fy|JN~}Iw(HbJ6Waf4ucYLCw1@y-yu_ch%iz?ihW#;m z7G&iX8b_3J%d$0X{a$^>CLoiFceNC9-}hino0^-?2UytWp3V*{D=QP|ozu{)wtLgi z*7kzWXwwgEywuE69>QE)Tp8`TUO(@vq&`oV+X1f*=7Dk6a3oQL6hvPzXlll|gLmeQ zwer&jp@E+-n}hAu0xz_7C*R1qm!9UE#5Om!UQ32A|6cqGM3epGpEEE5z_3`|T`wzjJJ`i$6sYr4D3UVB>G z$h$9dIZL|PkBw7NxPefI{Q6c_7C-pf#^z>{cZ73<`&(o@jfAP-0|~JTb1DiSN>EU^ zkVRdKHvEN|Pn6|&MS656Sz555PUG%f;@W;`lZJ-1O4s zC;_g>F0x%p+NRAkRcB?lh>C&k`9WWB;0I;f?F$Mbd{%#uO#=}C$_AM^IYZ$2I=Qkx zM#P;93Q#_qI&Dbl_c9*JII@|XcudSfP`YDn=QVdWv=4n#^zfiXv9Il$d*Q|}uK0CF z7^D^43<#J;?jxzs1l1D#elODT&pGb_yeL6`!=d0LHAQI@Xqy=nI779A>l6pLUZ^mj zrO4<0<}@7a+jH7G)#)~+Ukoaw+EP`w!Y9`V1IIt+7^@yaV^jBQ*x|4idn1ax9^XmW zFYst&q`*CYGS6c6tf#){{$plqk89rX`8sHQ%Rg1+^))K*t8$a~o;zVSC)Hd3vjr?T zmVz8d{$GcV%%s1jEfw zT4*zl_yx*29x#rsLHwiwC;>pYwx9+P%})hy3M!)CKbj~u<`vB2tQQP>_a*7vzvoWc zbqREV{3c@9!l~E_@_XWt=bN{=HE;HtOpETPd%6Rbovkv9dxbLaCb z8nDR3;nhMABIbZUFZ`opH$1SMhpR6Ya$nS$!^D=2em(LU**y7)V^x+{?eLc^TVs!VexDqQ5PBfle75YzkEyw~ z8mWc+GtmEF7w>fblV*_Z38@Kz+KJW!(+LJA(^9VMAE-7c3SR7{@!91U^ELBvVwQ+Z z*|TRKqxAP&osx%X)>ZMV#|c{3pNSgIMOn9RUGROF`Sf1RI8yOpqJQueyyHL+PvWy= zsodOT!v>bY_VFuLRWuYnTp;&)+&WE*Z?{iQqr~g;mf_5{>Xqf&PcL>?re*Wcecbg+ zG1ma010DM4K9nbC1JOBZ#j|_ zC-*)W_juC2hK&3e!6vul&SnAI;bCXqdl$QhmebA5=(ewqj4ZP4y|z3Q-FADoMVGLD z+k=-oRX8R;+dMLrG)*m>p!DnBnY+W!j}v!OWsk*1gP$MF-d}nCgrOstM)1#XoykFG z6aPcosPbZiX}XH)V{Wnix;U0XMNxVe%rY9%ecncB<@&bje#(UXJR+7yAWhP}DLA8g z%|g$5U*D-v$K1Uc+(KL5C8!U-<}vlB6MAO#sd&?g#5X88k3+C%jNtA5!z1rn$8&Pj zZim)&_ME2R+|d)Mr!RjllSM!vGc9BvU5_p{3(q#+hWj^WoQ=jr#iDD|ZEV0_HuidY z&HE|ds%qE&^Y}H@ZXpjR5eNJxkXseCT;lnqK z|NX_Oj~@Ad{P^+8?%6TdB8A^ctBp?<7W%fHOc#~5*3^$K*D%&$mmHV$GFUNvvccKK zW@`oB%GAh6VY`T&1c{@s1+p$*-X|5>Kk|O8A3-6TnYBX$0s^b~pUp;ao+_n1c8rEs zNT{mQ@PZY653Bp#8e;QwP^Cl#pQWrGJt<~x`exc47LFG!3CuU?q z@o0TzRq?)6y*0nDFNuxYtPwPboi+Yuur^a}V|>S-cWZj9ijS?#ME1#da}Ry0Vcxs} zXAA~R5Jvr&@vR^4s;{r_!S$g=to)(=Nqv1~4CpGUq`(`remv{TmNUWyI201EjAt6Z 
z+O)~yt4cwU-$u5-Njs;O7ivrU6y(-CJYa+$zbu#&RK3e@wlb=4liyQnCo8}#y z>ZEHCP4r>EzTe*=DGq54468^aTr6Q)@ogzaKhrYh5{qe=I z&s*4AqN}?(IP>}QoQz<~1M)78{Z(r#SN@!blnRL!mT|1v_I#HU9STUjI2j`gT_=G}&0{)lId?q0h< z1Fnv5Xfs7{2rRtBe2YVC5VftO*W!4KQfJjP{~4$GpR4Toi5<7rE>fNf$>F`IXm&0q z{HnE$8^0sdl%)2js^=!YV{#4;C5sdc<{!I+HfD!UWUWevlierabIW(2O0nvG`*0RJrBQOPo~gEWI4Q-xa2#e8&cM1j)v)L2ng%T~xRoBPehjo@G>3tK% zi@s-E`*HHRK-J^+&Wr1NvjuDA-w)ZnA2n$yCxX!5plm%y;%v!>1nL?*?|qoLVg%#m z9NW~Q>R>T?1DfaeRP9!y=gwSM)5Ob z%B$bA79fcG*COZt(?Q)>!Tu3$uc_j~ZB1DZJu@QpF+P!D0k9OQJjEvGa=d(H>&-0_ z{BL|Ib7R1qPa*DV3eS{;#gSrl(5vzLvssVo-qt9NLtcl*Qy2Ls@3R;>NPZRMx&GAK zAt-$0v$}#g7nRa0>9#Wm9Vv`{XUlK>OB1SSApF3$?XQ;6jn&*84IBdIZIOoagOSl0 z@h)2$HTt>w>1MK0gr+Ev+jJ;%V^f47eK?1J&UxRXaz1C63tk}BYDmA;rD|P2mo??w zdTZtH7p-lkIU~YycvsBlf3o!JSl7R0ytg(%&S11-ozEIA{cjhy?w2>H`R;2Q!Psb3 zwbU3qu50ZwP&Hn3dBcyjiTUpwwoI`XS-z-vaaE^kZxxHD+mqczhgL4kyS?Pdr{5}P z!1-$LeOg&skF_mpSLF;6QzIG2Gsa!kM_gR0J=gXuWEtM@2farK1eCUMkeUO~0drEG7Q#)}Bf8;+FHQw#NQbrqGKW)X ztMnWlrV%JCpR0QU;k}8fCFa~6mX?U4vY0yK`O@CXz^1pie~hoDk?N8(lJ4uVPhpp3 zf=L*)SZcT-++r{|)w!P$$~nS3208Cf^& z-C|TNt@q&7QezWCRYhadeI~kFIaj}{nr&k}`M_|?_6WwZYi=1}{<-A!9N=eyAvuV@xl}1g2(YDBikMUwtiN~&Q zlVdI_x@fRmIbI8e_eGl9RTJ&2J`7pE^pZu|#ij>MS^!QU!44t*G;44a5l}Er#1=Po zM@^=mF_E#cs$~C!fPE~bM;H=hY^(!gW1m`;&lNL=so)cPWfi_GRAP#SW@sQp7;#7vA z;rgjJB3k@K@4jE>R0r}&&BPSj-JyOLZ6E+M0T?3Q?DFi+l6|<{8J|9l(0ZLcMR06E z%L_DL%p;6bT~ALBvHrBzn~qbSthP>~PJek%cg#HKF8}K$6UK<@RReP$IRnA-zhr(X z7|^`UYFY2NMrB|Aij7{e|qokcT78p~(jR#qLemo#s>&_CezV!U)v%k8&_p@VGp zw{jqX@Q$u@%3p{)?w22+zP+VvEfkK3QOO;z)jM@6Fd!gc@EHJa8eqdT@3WhHi-mkC{Ck$>F9f_wG^fi9LLziw03lLL-T(4%zGmZ8DcAZdBui z%uT1o`Gf_EEzjibtxR@Kb%m%$8`|Iv1zchU(gAT4BIly6mIs-vGoi$nLn8g}%fT7} zE#CD&ZYi6f1Q)xztCl6h!oOar*aNi>gk;@W+&cNUIu38>#SNMk+hiQpQ;Z=rg~Bxb>KyXv@dH(u^<@M_ zV=lzO=P;JCiQ+gy2^i1=^VihK2*zTg)PV+kbuGd+dRvv5sN(Cw8u|0`@}!pM+76hy z`aV?uYh-|M1?J5+RhZTQwN={1)N{x<+KUFB8x0MO#^yU0mE0tDgD{0JtJL)ah2L3r zG?4+#ss9U*`xdZi0~{jT zjbsjazn^FyK(o8RcL?VZ;rS&cO*}#wdbS~zkVJ1mYKB^dhYufK2QCEr%?TiACNDhJ 
zVM2iW7}H7@TDEO$Y@l9qMpG_(Hq{Si@^uKc4+;;5->k@?PX>R?Upe(*y`4UHHC}uI zs0dXq`XRxO`14`q@_)$j%nE^=8zB(KIJgmL6*dkJuv7ttC#rY*8t0ycjufYC0x+4K zI2f@f+{3!LSmvO$cTohkRq{QFX&b!d1ALw^Onl}P>RMO*T@E28(0IZnwh5;L3O)jI z_Sxy^=~_u3ppmV2igKN=++I2=3i5yK z^u7MuKWE7E=KP=kIhra!|G)k$z9cuwV3=$1gj^ISo(eWeEaW?PPVWAB4=2>n@Nlw0 zHr^rm)ZcyQu_xh*q^qwVfS3+IH*jrmRRD_xC<;myD+)ORTfM}dPg?AT#>?pzQI7>I?_h)1*rNEP)c1Vw=O(>=!)#i@h)v%LIT%qc(& z+wD!>D$C3J(r+r=fTSSoC2L`I!SADEW_BC8ef+e=>SktU70XlO2dluP@bEk~ z_CEdo&q3zRV2NO%Kw@3aEG?IM3c%v1D0*C#5p0w#*3kacB248*YN}Q43cGq#tB?JR zLH2$3AJ5YNUa6?;m?kvMe8#z2D6jE%7L@tf)z7~FjF#O1)z^z<#}2iBkV&i!D=DJz=<$QDd9%49G8`lW+Q zv4sUY4(Fo=ZGBJ^SLws~vF~o0r?V8Mn8*8Dx zHvanbgWV%6vzZy^3l}aR;UK$B7I7&NoYeSnu`$F~c;AhUJ&8kp;vU`z-57j7{5@k| z@vR%I3p)ZMQsS|7Dgg_wgIZshc{A9NFu(*Ss7n&C%8rWHl2*p)v{xXCq_yF{{Rir65XdiYo!ldi#or7R+J5&gWFP=Vmrwlw zzuyMRA>xLhv`|`5qiP+WARLFAWdBaB$6Jf+h-n1tlDx4sc;|jYQ9yPMycp<0 zIOI02hhNe7Pdjxy*{K$R^AaftCTqgyV=wuNIzW8f@GKxKAqHViB!CatEOBeEfwF7$ z?>AXet8^U^u4Y%pxIhv9kG{?XEa$v^``LFX!`OwQBHAO_w~A14Q^=B{qHIl)-B^>7 zYAh3(N}4up3`(-pOoWMQC|R;q))ur(y`RfGbNt`ue;n`o9LIA!Pr2{z-~Ic2zt{J? 
z&g;C+s~J9FOKIlk{B&dzIqkFFZ%*$$zQsmnf(}5AP>_g90r;iH9vk;}byqlW4c0FX zEu>}HLhoPfKb!%3V8<{;Ogm-e*ZK1n-^|Kt9Z>%jRsX*I`?0Z>A==VnPrKm)x%}+_ zHmFN+1*F+T)XX&WdB07}MxPF&Lf>;PY2J831&iG5OYwRV($gUAAe03qbKpKOFZZf*=mSkn%_Y$Fs5&*XK?zM?wlz(ogHCiv{~7p(!4K%h z`}4yUWrS5FYK_Aj(vH&b!BX!D(GHg%mZ$8`{4WYDSRq||{5X`hREl$6`Ntoqq&R?~ zM5K%nO4!-6ZOQK8`j@`HQtv+geG7S};*dhWTF)r@-I@WaN*Uky0#PYGTDmTI>IC2e z6sc&mp~FsQ5Hpg7cIKCG&f+PTtKE_2*d&G}vKWFLp*O@ThYIpy+wP23=_n!~WD?Yy zcgwfN{E%@ZMoxMYDu#?16MA=S{Xw9!jkWa-?C3a322v+hB9J6*u3aYpi*p{*%@OWm zBvW7vdx?7ra%*W>g0YrH6wEm|RnDZP{S0IOk;;#wZV%L!&+uKzzJ5L2uHu}}7k5+; z3O5@ugKelf)r0|AlC|9oxev^ zgE2G1WRJ>3Rs0KfG2bQ;4FlC~nz&XZw{Oo^6F0*FllCyt)E`eJ#6NWQVgl0&V}>l` zP*yZ{()xy~t`9H>y!MdxW3f3HkzTC|SK!>#_&0q%wzXZh*5uEV#s{n;B*&G3?mVJ) z9XfP1sJ_sQLZVo8NuLAWa5A6{4s9AICa_E$e;iOUHV;|7J<{aN##Fpx4_zfPrLB#% zE9Af2%R8L&I6`%0QH#Y33)?&KHH+d3!?T!p4rd_WZ=1xSdsT31Vl*1DHgY4PGo^NlAfpko|JMY~O(c zzmUgmW^Yi_=)QRI6U7)Gt$U)zK zO8YmF6$DFD3w0_-3_r!?W7)ESEXl#BjcCN{Oos)PXH6ZQmy8Lcf_#Jy=?(-%qVr@r zP}X>Pd7L+u`I`K*zVq!KjJXdo?x(7XOSiay_8I*HaxCYMzYp?XQ(NgI4*_0>y8J7i zWhAErZ7JB-2(igY5l%ld<=rZlnhH+@K8L9`n=wztaP0Ej4x@KANmH#11_fJrmq$n8 zPPz>H37Vg1Sj?j8BK`Fnd4ohqIy98{y0%$m#Ie3SyOK>6I>p7s(&@@Jlcr}X91Qj9 z^gjKIgTvNZCxp`CR@IXnJ9*}#P$ZRv%rz|7q3=fH2L{5jUAVCL>@daLxi_beNdJ+7 zv`2bPchO7Ll|>=)d-m#8!P#Ly3>~($WiBR2TCy-vgTYK4fe85tYE%4vMNCF(6ulQ( ztQ~3lrz)4sH=}M3G3u@hU%$(G0{ zLl9f!jE#ohBLLLk7-u+8LEz2wv^0`W%kZT680+D!I2aujC+H~rlfkfWOY5lj%spE_cO2fl@1noQyNax(o zn-bPtm*{PKh}NN)-;qQxW%$wQ%mcxe54EKjt!`$`bO7T<{Zp(DvK_8#xS~NWEuV% zHcURo!B*1+zW~d+(M%E^)6`HEsEAp8-V``z&H?DZ0q>$s_ZNG=?B+t}6HqpoZ$Oei zd<=P29@@bX-FT(HS>?WJ)Xqa+Gh|C~iO%v!O7!MbszcCZYV;kudA`Ld7_Msege8bC za`CCKb|Ui3LpBl##^bg}Cnec%=R&IUrjM{_y=%hpQR~lCKXOaZdW}Wsbe35#a+#rv zu57mJHELZ1!NFK#3#z9)0E1Sy?%gdJgb+llQuyXU+S=L?+5LQG)X@ulJhd}^r-$Vq zs;|dr4IduDY-61`hxAJYiGUePil~98Lr2ghWclc_S3+PTp>w(*mPo*!3*Dx9n4w01 z-7ErMG*(}2p={3Y+{)$+Sve5*)Uzp^k}o~t6hp0!>Q1QRfRCW*zAQ1j-x0!IbOh(9 zCEbjH@?eR==GccQUi1KU4mr7T;&EECxtl4BjxFPX|J1&Hdwz_dWzP~h$xpQCZ 
z!b4`S>!g3Y)}iLi;_2Os5*_y%5|#KaGkNMDIh+iut;>)l1gl4{wVQFT4zX=86?H{S z?#mZ1%$ZM+#{!EWFt{K%5X*<4YMi&G<9j5ZUyYb`x z-8`Ej;?%mdN=6o}>N7FiP++w}B zmiuVo1jpXldjG@#9i2{`IbveL0vBU9cqjAx0nt8~kU`+P<_6)H15_m5c-AdtH(Jb^ zH3T?B2233X>Lq^eUWkCgO*3~EaGWgK$Vm|NxBEBaGJiV21J#*JewS2TX4|o&U{Upa zKCj}CBV`;sW=wiP58PNSa8Q9tgA91ixk;Q?`*5)Xz!@-|YrmJg9F#9v$bXck)>-qh zNgztTd6JQ_?uun3x%O$BXr$3mBzJhF*?W%kdfz8FIGDxJ8ek5&+eVDtbUM$z?3aDxhMI*c=Op()wQpZ5ss*CoXm+tveQmAQx_H_SP|v15%Ffpx zPof1rrf=M2L0@u^+S=MV3WbQMWfb}v8h)m9&j=Z&xUcMXUVG7FF}u-7E~|u9CAmkCBd~v|>}iGi9S~NBYx3hS~UzxUvt;B_{-}$3t^Yp(8`EB(2rD zJv6noZN@fNGm&H+dg^#wXA(=I^1qHSL=^J#w7T+ts}I^W_bY0oWdejHbCSm!`(Al8 zY6o7W!M?r~DbEP~H|>tijEZ?#EKdAfz?1>qQ{1x-_3hB$aj*#M z2Wl=av4Uy&;`4fHpMvbk;rU3j&wn|JNedZ1q;z?TKfJ(Ai-Ta`r0*YKaKRceu80`) zd;JAyB}tYb!UZ%Ir-=D zF%Q))_(MjI4`~KC{4RMSq_YqmseZlvrc%z8y@AAo4hjeYF-hW6-WoY2Ub{9C6<^%e zH|6EO1|{lAc*pl8-YAO?OyuMsOu8_$9l^V#SQzo66GeMB2y92jO3$u zx@kXj;?ne1e7_{-*vl)qzP2TW3;dL_8U%s1Q#`|c3J`6jjyY~6y!84BDTXHf`-N?R; zMc|LR+fIm61q~!pH|11(O`+X758tx=G$D|S<8JJWQT$#2BN^P)O;v!R!d0cYRCSD` z%u2#=l{}R+L>4l))PT-`t}oNL7%_R23KOU|;_Ygevj_cBmnL4?RIk#z_pNPB^@}l7 zzt(k@S!Te1NkJ)m+e%<`X=&+oF2*itxkY$qllYoOH|N|F1W{$tlyQ#w@7~=?8l68s za+7umvPGq&y|;19uL79Cw`DExNZ(B|#Cf;6n`f6|d$yLCZI5fJRR&bklAB&&>8((7 zM}pN3rL^QhGJKL~mzLSU=bcF20*gpS@*5zlCwCRN9CJyGb8h?IQ8k2AZ0kzmIehJ3*K=TI9x1XkeDUsK>>Skm;%1(VzTyYcO zCMZFl$CHru)%4WA~+2;AflilJoF6otRFokhEk-y88 zit5+GjK;`@)#PquqqRiPpf1X^K0LzHs02^Px0+9-b z&FoDxE?XW(YWJTj=RBI08yiMwDxKNrEKNI%=Cf&}HO=ws$7d4dNRD}Wcwd%Mmmf%} zfQ_qNqUTor*X(f^-SL4diuv=?Yf7b41`X=8g>p#A3cY{SMPPo`Ka6(~< zbEU3)sJMG^R`+1!!ZQ0aikM(yhx>aH-tRRUFeyHMSLTv$^l4wCllMqjpiH&^YWFOA z9%mDc!&w@To6ND_c0lP)7#GcFU$>h&F3hcOO77!m*b4v!Z)CWjmR5tKrk=l6UVxjS z(#|i@p-o0$P4RX0?CvoMolF#a$K8TS^_$c8hqnOpXT;NDx=0x9BLTVjIv?zzNYTHx zA%Y{B`ow4jeYb*PPH){~I~7J05=+rlq{iB~cW0%7>CQAjj4u7QQeYUQ$GoEaP?)mNU>(`V0i zV3WZ4n_7<5t4`XICxeAaVpmr|WPE&o`2z+f3*9EhF-%8gapRMuzeZO8EN@Zc}~G5mzRh0<06@z7&Z68nxjE3nOW_Uy1e_g5O` zWWdeC43exz_6K%zlmzP=5B6MlGD)y#v;=NnQXx=1aCvm+;izyhI!HIsCMah=rD{=zhi 
zJdS#ikiast#Od5P9Q0bXY9)GL;G-ztnJzqY?OHvGKf*?mZx~jn9Y-okYJVLP1IA0p z6kL5J`mi(X5I<2OC74RTN#s@|OIINAL#Tfs0R;^y!xQ3I=@f{Bz2clc-EwpZ7TL*r zH*_iPrEakdx)N{J_m?V$QxpvPf>DR^crIK12t*K9k53uS!EHID#OsENr*tC0q}hcIX_Vi1120sEEPQEEq!oCaQQ* zCCm8xb8p}k%Nw)Jga-uruB4ckygtNNU#EN~rrE%F_-hueGL;$#ek(h>o_x8tCFv(% zWZUkpF}{j0@`rc!Od}C%M6NlfeinXxkv8JsToAu#Is#Abr~nO61gi2+A3ugcM=-VM z7}%Vo{V)0pIUKH`{ih`p=qFNkaI*M1kZ{|-$};ht8om^iy&3)%Vo@fOY^h684Vg>{ zc5UfYdLMVvBt=3W?TB~+f-V9^nspS)3J0#qgaetn!y%42^^ub&XKvVF$OjNq;J9*e zJX}y$LwSQ83Fms67|9B=|1ge%3I7qZ8>pe_EHA3lQiy?2VJl8EH*ZUw5m04oZ9M^< z1r_Fb4GXH@75xO-VY>Qcn0jIurF`9$_rjcGb|GnDxh{v|%^8{InB5uR4QR$#E1sAP zP$#2a`}{^bCdi z6_hxFfbnW>ZYNYaZaK?{XBzk!kW1gRbASw&-h<-eG_E-Hm%tmZGMyc9*hqtQt2sl> z+mGDk9Gs4^d$Bnu1qqFk(EjWt&CC*}B4Tykf<(ZX{#PjwZSNE71C31qsI_GH4r#yB zD}k_m<$2Q=uU{tv$Nw42bE;DB=-4f$Fz5{xi<^E;Zr*`8LbtIpvQ#KS=#4^dMDwH6 zW^NwNrEYL+)cV1!+wlp7G7p#VH4A!O9&-GIDN|@Juj$U(Ul3}D$DqtWXVXhh1UFRL zb3j+<8^j@t!sknViMkOZmfq}mbTN($_{ljL4%|M+fdsxoNW zO(af|1#>XX(o5)?;=o!K(WPtL)k2n(8dGUIkex|wfkG=H3&?6I8qOM9t4<;;k^nVs}&aDASo~9sR&*wP?2R+ z1#S1h=P@bA4IHRtH&F%{&KA=cF+w@~&)c%h~H*H_Fxsj8tp>Km4cntFempG zlygX7Ca0e)6e?8X{Tupt=>tIf@gob%f!|5>jYLs;YvGayfhpLX&!Db=K#aVLL=Q~I z@p=jd&>Z^@9df28TGoQ=d5h6pf+Q8^SM}s@K-(efh+1v7%%r-zt#Q4T-72az>|E=z zy#$BlJV8ol4z_G9cVhk>Qn%S2kEGZ`}An#?1u(HxHt_Vnei)Es_%=r)-?c>2t<|I90iOo9;vRfbDV)R#O~ zY}3ju11A%anQWo~H^N9Buu0s}@cfPF^`0sPs;eO764aAW3^^%@86TQ>;X*>UE>{o- zK#rLf%CH*>(CQYPH9x#Bj=pWaiF|f&XrQ|2SD=XaRIq=6{xlY#tJ#5qqnj4d_nvk0 z=9EvSJz;?;CO`?%>ZCn1!l*v&%ex|c3&U|D;Aql*g&X~01}4#Ha)@;$Vg8Rlq8}Db zBg_yR7hc-1pNP?g%Ov6CrEqSW`gUcvXgN>fUiHD+maais*g=M=pVuSOUyBlF;lhPM z`9^U)J14X$e0Mr$P&V|_+r7xEp)(w>-GR{Nrr)xnjgRj z($OczGjecIMJ8xMk#SlR4P`Hoj8QwUDEwe zxDudyx9lBX+nQFkebSi+5{cYqxS}3@N`_PbBG45ABG6S)kbAV;j$-C|D*vSw71Mza z!N!|)pfxTHI^ms7VlZyf8iy2=q^!0fs%9b6?_j`xg50hGO7*fws17> z!q1(XofuL6`0|A$r(Z_Vny3y{Om&BE3lV#R`M9a}kNIGZN{+KGDk{{bPbKaCV4pPK zZSwz8`|Put`@hvcYaXLUt=^QrFI4+CN0rqJ*F@QN%?UTke%!UKduzQz9o}zny_Vn5 z_t9^&=eH`ff@kFbjW5*m8^31AIqK13Lk`XPPd^VFo-X7pu;`z5odJw8!K*YNH1u)a 
z6qUB_?MOkEQ_c(u3QE~v?LKPa(ryb)px@*HK>CFYKQMmHbYx#oDbUuK_UPL7eM|O% zKn|pvI*Y#B@iatjdAH2anE+w7W9@?9wfPhOD4j&`G$AI!dy8%d=}{DrM#K@Y2$BIO zFyld8s;N-Wm@?`z8IhSB2|sSR*pnxS_k&`ghsUFoc1+Qs>T$s^hH4<#fa4+_OpgrE~_1N=!j-2J}dUOLz@sI`{P`Gvr@d*hImhpysJVw~K z!nR;X=Vff$1}a*2>yZN+jU1b
-N*9n_%@9Yi}P1US3QXKg0S7h(JL&jFHt-f;b zpe>OJFPH@b4*wkX1^#I{$0MHIN8~a&9z|y{W|JZ62anIX&hrYtu!A@!WDFC3qcJC2 zXJs?8Igm+KVc}22LPmecz&m{PNGAlcf%0g{U5bc-?7;BW7EXT|HV5}DHrLdE*ji%i z0;s((%*=+!iLfG)8%3wTv2U@)%Ey8a=nmlkeu%3W5133!mkS`xpBLw}bbp*ta^qFDh`+i3_NAh|e9E!QkezD4MUa1+(&0BptT%6lGOk8>!`QUFivjg>l-e{%i{nE|2D{} z+v!Ec6G@>qKHonCExj&P9TAp-6vZdz^JmdxN~N7Wys29O;d4`qzjpO`T9lc2x+DcF z0o=6w@g0CY!akY=xMIva%%@MB@XcIu_Q)BVXwe8hdUWZIG`xFzk3zxn6hPqwNZ`)Y zhvYVPupdIO5y1+@1VG|OX7q`*he$IrGnPzqIe=~K>= zp?jpq?6|_1b8LTZ)9i%r$FPS02^F`0C`c)#Fc31#ZX$RcBo;wyl+NG3x7w!1!x6D4 zP*J>v2qDKXBt|bXCDH~n+iO${DYGzaFSQ^Q7MrzESoE0`11hEF&Kq(e5i3C}?O_KI zjnwH0=D>OobB>vvMsGGST<|GAe;@veRm6yaK7_9bvtR=P@$T{6pPivX!h|lUlJ6n( z2s9^w#Sy<4k<#qS91o|2KlBUC{bLLU^LZw3evVH({}XZt7~WY_p?85i2q~h56{ygy zfKWlcRFUo<8R?W>xKJlX%(WP;B)wRaCIjX+P6gOd5Vh|-!fY->Qj)VQR|rlaGB=pR^KUYxx|H-H{}hlB)^ z5r6_Tk`;XtT(Zdz#p)BfYiz^E>!WQHDvKi*?4vBkn4F1m3L1{~Qmi!%ovBicd#?*Q z$*xY-(8o+|i!Mo_`S}yFi3cZMQDa5>CF4N}Y5|N*`fw2lk@olJB8n9G76v9JuhDX? zAlRoKDr{>Y6-TCut{`AwU_0sd{KihyG}4!~FN3uL^#IRumRk}=h+SvBo@?M90i97$ zQ7uIEHZ(sS`z7hTpz)`Lc=Yp^KbfPLXf#qU-udSfJ==UGW=Fn#+Ug6SW#0>>REU zvlUG;#10mLF#ydQ3$x(0DOt5cqAQtON=nM6?7zq#cFCx!k~9in*<>wqKHLaLVjQ=X z+U}4!+H-f6e#JWY4&Scas^Wf>GPU7^3+_syla$+dW1kB7Q3t!19($-La`P;GK37&= zD<}v-wm!a}$LI}j?u89&MBT)st;9|Dnb&aC$W4*%0^0(C;p8*LG0BqJ084ui-pg`h zWeo7a{^PF9JkGs?4QZ%<(&UT~g3Pl1Q1ILvKSg&Fg%es~2M>G4df!Gf$0tXX$PIHS zEZ8SZL=?>v-4a>~L9Ay5BVcp4tR4_S@z)H z{5GO*;h8T_ckx?&>g+6q%gR0%5Q#itySGF+ zwd8VI$a;T~I3irapLjkQDc9z5?}imZev|v3> zpTo4B+6<=b=nMZ3-dD{_z z>bTPZ12gmT0_Bj9A|d#*5`E(Xw=AtL4aXoaKVme^YB|4p0cm^q(cVuwW;9fGz|$$l zL|cw<@QATQS%C8ZW_ z7LzB;`N_SmAE(@A;0DKkp@nVqoNvXJ{$tZ-qW4E1-;c_qG{vACzX?2(IyJ!2qikAX zY}?#lx*1Re-n8P?>00OEkBf_uUb?P%Vs1Ou5T;qA-Q1KNlb>sRXk7i<qkmna(E|&t zpg^Yysi%;qsq%<8bIX5DP{j03L-aEx-5>zsisc69e2AI^0hC}xx`TYu>+K$o39CuX z)_fb&vTzjGg`w&J&rjc3qKjs|Vs7OFSI_X(X;*^lj?X>4efxI%wIkdV2g-}SxGr0k zx3YJ_-9A@-{k46YHf^{nR4vyLy^35DVKwM?pLMNC*qJ!bw|`l$UDhUP{Xv*vjH%i- zcovf)&e|=b`NY+N6z1XQ^d~7UvbST$pT3QLB<2KT|igql;r~?#L~mCB@hG 
zR^Qb0Ng75eDTDvh$MY?_?J%&pb-QnEJ>PyCHh1863VMA0eMyStkJq&S{h@8kor?XW z?EjPQ-_urUAM-!2Zfmd6q&(n(vTJkWuYpQRN*Wuc%-^tl=>{iL$8}EpkCMKwzMhV* zp^mQc9DM^*BLh=?-J!a=rnE!dZPoOK8xGeWrlvVbOHB`AT!$~Ot2Sx4+B7H6O-o(eos?1zd;Z^_YW3|a|NlSr U_pmV@yhq7m%FM~}6PN7#KclD|@Bjb+ diff --git a/docs/graphs/large/l3_perf_a64fx_nt1.pdf b/docs/graphs/large/l3_perf_a64fx_nt1.pdf index bce34bdb2ec7e315c09e5a1aa9dcc2eedbaba0ee..6f0b8c74fc3932e9e2d93ce44f1a8daddb3c6bf7 100644 GIT binary patch delta 27808 zcma&NWmH>T*ESlYKq(GITHM`>6QDqgwP=ewK?;T94G=bzqQND&6sI^8heC^n;1nnn zcPsAt<-VWid&fBM`Ekw|nVEa8HRoE{`El*J=QUTu7Dg5Nql^K>@T8Q`|#7r&f1xTEr)RXc6yR2ODrdmUGsQFu(BVi z>rJC^3d42fLdE{tAGZ6I`^){i8{JFx`{RYX70s!JcCPc&@^7+3_k7+h-)|f455^a6 zb76=hSj^G;sJZXJ!rGr~;A`IgzRjiXMdHr<%FV4egWnP7g`dp&-5GH2QPH>$n{Z*c zN3~zt$tAVy?nmW?KoD0jR9$QEAoG3deb&mgR`$&HWo1GoF=YhyQ|8eHQq>BVzf#{} zHv${Z_fy#s;4qgvhWowYg}Xxr;OgbQt$*oMgWTPm<{98OR5x)8L+#sd=osIg^~&9! zGyt4(F>)q{?SSsvmhY8Up;v+nE9l7V`xDK(i}7~!vkQo>pc`DAQy`Icx!L_2Xyt*rV#-JXUl1_hy&kKaav5n~}m>nI|)DecL4w z#`M)c0UoP3{qIBi`wJZ1pMtFmsI02>d)^fQ3o!<^(4J5FJ!%4w;SpVR$ zE|{`2U%#e>mix>X9lRsH)v$L;rbMOS~bfp;B#zT!79TKWaE zFPC$%mrqdpD>uPKZ`VqYhsvfhGVxb2MKezF*#O zgSB7(1}>5hEAO{Aj=x8WT{F!4SJ+|y^@!PRZMPLo)IKokeUI4RkULeoyOYcEy=MT1 z?(Z%)CQt4~eek{RlePMwIBS1^qgwjOlnq$M8iDGL*+TM9hrj!ajLtVPS0F`?dFg@i4{f3*(=aLPa5=h3Q(j$4t(;b>ylbEXvyBwh1t;`+& zRI#NTFqihbi>s+sLfgJP#F|MPh2xW~VJ)$E()l)k#0 zzV5T5zu=vW=dkti_OFvZ{4asZ4H?(k-*70edY0n*sir5V667%zh@iMX{_7s_S+X|r z&iV8iRMV3t2!Nsy%9kRp{~v+PyBBc`F<$v zj0tVHw4d_aw_Db>_O3wj8fag9q}|9q8~m7v`<>yV41u%3BZ&%Q^S3#`Ve847PlxO9 z`^u0`?EoEyj}%p}d=D`Cu%ts7KwK{gF;`E{WkH5F^lXg7v8LGpCSK#zu|zYVRp`d< zz{5Tt8Yj%yDVW?MXO$VT#4PId73Cu3_}o?0fSX*2nyQ*nfQTFwyfAZ4K`m5cC_tB8j!K^!rRO z6FsGaa5;m6uCNk+C}8wU-H1QnHTLdTMeyEzyQ$>u>1dBHk~NeFUCfvLgxR@p6Ft>5{|OW<6Ww zx`9_9J@pdo4uX}7@pNy2pA4CQ4vWBb|&ZhQ27!5EvvTPHOap3->%!)V&h_i`oYwLNs(%kUAAzlqpowAQMb{@HAb z)uVWbFTdWQwCV^9SBT9!9a2UGQcwO++k$nAF4kE#w~hzmokX4d8`l3aut0zZ`7e>r z+y3~Bjt#=<++~~9p7>XInjMsb=ibW0vI|nbU%mICRju|!vCi~YxEQ+qzWSr%aDF)$ 
z|23gj^R%gwi3{}5T6N%hQM>p#?;9$hp{(;n+{xH+F5Z5=(^OPV)-leZNh!v7*eJg= zW_5jnj91WU?k*_OI>#*wuZ0BQp31mGNOKN5RipeQ6`HV_k`zkRYtjW%bu}!6Ia)hx z_o$`g;I0YjL2=2G-fcazEWt7ThJkoDYQkuNqzqy+CP2J1ObSSi!_cVP48XV8 ze!PQ?+cBVyBZyq?&{Kz4tPf*~ZtW7is;3s+N+V432qYS`KCCNN)FlL@j4;!jgzU#A zBt(z>G15Gq5xr`D63TAuT*(%OaTeQVc9f4dK%gmV?Lb>|bhwhTr2S|&QS4h`ZZRWj(a zt;XFJJ%&5~Bv>@F3FQAGN};D3Y-ZKaT;$pc`MibDym>+`IOjHZ)q*iDOGJATjaj%+ z%w1{!(_?tsUA#jN#@%^xL17rip>ZxV+&SbdC>!$y<}#3D3=}%Iuxp$f3_258_?R3> z(-rpG^}S)!I4$lc@F`S(^*UBi*41eYGm*X%(VM|Jp_}xaQx>r#*RSpKy6#3?LhW2a zz=|c79&0Z<=;Y?$IC~)5x6ixiHKVM0&o#!FHRn+uSw6==Rpl?DG3(!FyPz?5=p6<6 z``M90mb{N2z?!GRC`-8lQBz`n!>@WMqRW5XAFC0rp)L#=(GX5@{G)1;1I8OKR3O^@UM)^<`mY)Jo8_GR{wnM@Vn}ju@t7U9QqmwjsW#+#h$KHw zE_F;`<42;{R}S5o&~{wy8NqC&a6yekO7`$ETyu6nZ6$?O?eoK2r9{?#epM8%_9v+X zPFIc;PVr$&uGeI2%dFa7DDFGuT%`HxhE*5T!L+E z?JMI$Mm2ZU+>_EvxYCv@Clz$hc{DqtK<>J#cX>_JPq^}8o+IYdaIdw$DgRNVu)6NL zYe*HKg+nUFd%U&Z$2jDQ2b1+@;-z>frj*+bteC9EKFhj^ZfB>uJgkl?HTCN}dD-?+ zLx)b3!HkrrCXy7+!$ZnbDvHkN(MTK_c3NC^VxU*B^-sdDj1aPNg|JyclGxGteOCmp zb=PvgT30{!t1jkBzq&YxQrSxg1jPIm5S69$h49+Yh)}+H`=YFPEYe_0Irf;>x~HF) z1>^CXG}4_%r^%tsb(`nsDG@gVzc0_6S#KnCzPDi?yPx^)3-8Ap~BRPV^(C5&2q05(Rb7#>3`Gc$tx4$i zQ9;M1-J$jIug?A_g_@?Sv>BS0U!lAWAoXPZKN5bkN0;qt+Ur%*n>78UQAEOWZx~W_ zP=4O0@3tnGMC?mYCf)4?(a^~ZK*y}Ux#)FhS!wfysHS-&^ey9?gpwYx-JKu*%cJs! 
z@;^gl$}i_cCaOfIe+w^B#Xug@+F=-d^YRl{x6SPCg`9qIJ3N*$kzC6zf?yU+7O4K5 zgiEe_PO`tzEl*~KOG~M~GPW>3+BfwMQ z=p5HwbtPKLnzmA=RPjfo=W09@p_381p8Ac;poxLM=Hl0HZpsJE+68?sai zEsW}Q3Jd&;WGrsGw1%VltMaMk<#!NQaMd8Y9({2gGVf8~bjQO9Vty@@7$6x0GrnjxVGa;Kc+n)>dJL4N&kg zrgk%T*dS6kIcrBSiwB+%cL*Q1c0k^?fRa~NI`aF725ugL3XP1IENr(P1f)_Cpm7~r z5RK~^QY}~sezYn7crC1T3<%@n+7>oqXh`)1rmDKgiyY$xuwP1u>Qg*M1AJmoLNq2H z95Q+yILyifkQXC-taJE>ojA0_RdD4^H|i!HxPm;B98~*2UvG+UL0@^LQzWP2pF!+$ zvhFu%33Dorg}1~$gSNq?M`$R&{wy+yqg6n^`|y|S?zqvShjBR3lE~XVfv{ zYc5&0G>_#((TX`blA?pSD3Ha_&dt#e`MH{sFLMJxrV-np6q#T{r^jWxTx%6l)_7OE z1*&497rI|3JSMSn_T0=78-$pTw?|C`1AP3_$^}8c_z0>}6ub?S%fEqsnXU%CWXaj{ zk-V08J?!ovCXZ2M{QVW;xr8jjc+YA1gtx0K|Eb_akQ3-biu0vWNw`|aN!e%8-&Xog3_Fu!oNAU$hycp9Jn;C)U}+d*j^0T zs1rbuhzn+fQ~16;60wC(IG(=V{Buls>!Z;BWz=VO37J@ ztj#B%sW!)gNKAc5a@Did$xV&=6MmCf#`E3EAQ38Xc4OvhgcA)Ii8y#zuM{8mdn_TS z3$aA*w3VD;6aGk}Mk$3=Jwv8%jC@cDftO&jg;Zcxm-BXG8%i|U;#W-yi`Uz7Fa~nf zG(YP;aYLTNk?IzPll}RU$zun|#AV-X8GJB9%%!J8-`wy`;{naj=SaohRb55xEKvKT zr{^n1b3dQ<&rv?@Ki*%EAcXcDA8TE6)Cb55eBPCE@DUUu-b(Y4P8Z zEbujqo+LZG57M|8R`q~X7!sx8CyRvE@!6NEFsv;B4m)?-i&ShBw?^Zs#+4_kYmylGZZmCk&RAE{ZZfLvb9aqe%j3-5Jy}0-(-v5J zyh-+6mZPz^dIohp)S$MaK6S+F{KO$Wi9d7sZ%fyU=eMTkrm&o*li{dWpTD~}$KZ!Y zKvK)yg;4nN>UCqC?&pP%11J|JJ)Kwo0tM-?=HZ9-4{}b=<{E2)yTpdBnkI|iHbeX7 zqGxJ?mBzzS!*!70V(@oO_noU>5u~u7^gSIuy{zJE>vEskEBYySx!oR%yyW+OQ%8gA zPcL6nw@BA5avs)mkE|J!IxZM(aB7a9kxW4>5;9xY^HB7%{o*Lo_ZO!c3wPf1fP8}l zIk@fm)M@T?4QWsWbD9pK3^S3~*?-e`Q+WYj+T|si`>X>28 z7h@Bjv9xQWYe`5vpRJF~eh)_4ErYCV=`gxXsxiWwt#%S}QVd^OX!A99fG9=s@Fc!M z1nDPoMO;9_ls*AA&ed3~B6PSKfNeBs$^a|9e5&M&P{{fQNW)FY-gcXCaX^{914)Tx zSICYrq%n+ZWcuAWawbZ2iuB zKt&rH{_RgzJXC{U*=6jB7&v?t{?H7m+_%lxds`MM6u{lmC%xjafYQ+l9GA_A1L;nW z7LuO@l-P*8w-HLoVz2{N__TZ^Utrjx*VMRay6ougd50JME*z;Kr zb|;Y^POG}6w7}hjzKJL(0gdoIY)69TPP5{*)C1l{pV%JF))3lUq@A#U-n1|ONSJc( z`Klx-z2b7&p>}~KfTNAWhohZlKzx`FoUvpPs2}9R6j{~$DhNVGwEV5l#5s~|KOs%$=Es9yPI(9-;w31SgiPzR#BVsbNVxN?tA40h^gsLH=C 
zDbOH8aicrhfs=dB={}J$%96XBi>ePqqwlN7hTBD+8aDLre_(BpVNT^^>0W5B3J{e;!AZD)CY#oU+J5h;Y%@1PqlNsyeU{-b$ zV%EkVV+qz&V$Q~<(SehOlf{DA{g}rpHRZCn;$R;yO$!+S84Wb}Ud$82T`J?QskzOU z&q!CM)a4>55}a#pwdW7_0m%156TkEQr_IE`3t*_`FX>eMyFeYEIzo`$Q2i^ruf%^C zw11$KwS9T*sHxZ8&%e4h4=+SR9!Ie#m3HaBm`LZEFzZ=Yju`UP-KImlpAv!byGrCv zpO!sAT+UXyzSvrpCR;A{+&SgtLb{_ScMQAbZjx&1#6BP+d)9wJh}-BVXoN?bQS~uE z+4^y|wRUDDTT-H!cqOxa(Q?|Wbv5p5-BTZ%v_GZ{nznMNNy<02SIz9LPE8)Xw__xB zMuvRFoI>_A1BBV^b&WOeW+@BiyqNwZ`}WqHKCjoyOeqj=BblZh+Z|JyIvKGDcCsB+ zj~jNTKl^^Fb0&Et7M$z5RiFZHO^^Hy#1ePc9Ppm)guH)33d?%eyIBGIl3`WbWv$IN zV7--vSFD>{S>sb9BJiq$f7LU_@^^vC+9%Try`+8xVPr@^Dub};neLc-jbC^8pEs*I z^$JR}^%JvA~Ip!)m{?nN!N@Ey#))*$EIXSdSXB^D_uwj|D8pzmFQ90u0zz#c}>9CuQr~=nnUfV z8Je4Ke~lF-+;IWlviD#Uer}B)0?<6!IO-3R&$CIKu@H@bh)_dbtyQ{be_!WI7^N;< zl+QCbvl5Mmq*zT6?n*xCw&L;0(dI?)fSzuAcPMj&lFj5<3@M_H8rY<1 zqrT<4i*8$u#?wXjghGzp$n5ph?L=({`4%s#PLzRr^xNR`S}2CibN9_{Ai28fkT&2| zNhdB@N1b zk>y}@lQM?K`=9HSLblnY)MDO}-^N^i7P9LaaeQ!o<5rG6WV+g*kGVlPYJeLT=#KKz z0s^r!T71I6RZWi5A72rs19lID75YL3`JTSmz;n`hj;e0j@hEq+a4(HiW%=-Kf`M0O zoQs=X>_keU!9)IcW=GST>qb(E;e-~DE5?R1t`vQDN0aj3PM{{x9saypak!(Y<->2H z58G4=LyNPf*z?ZMH}>qdmj~=OWf0+c*qJR2%iNz9`~BxLd_?rm+=1Juk{Zpa+h>J| zQiA252?}?Qkd8vSXU8Ek8CZHy^Fa=R?@xFq%!Jeg+0UHmgsFY+*!aItS^JDEZ-2a0 zfXAgiG)lBzQGlxiu<^ZcX19;yeGQLG#5W?i?aat+p0OO#T8aM=QNsT4ZDf#q#9=18 ziqIO%kBGxpoCa}z8IOULry5mOR}_9-M{!TfQs^zZFY3w5QoubCCBVbC1Z({k-7>l( z%-@(ZDOa4?`Ix(@zNNSZQQq#@jVu$TVe?ri6FEv*w*wA#3xM@urSBoI9aoJ2zGWUc(h)9^IBmgDOZ_l~&n8 z{q-Q9`p=2G<~{Mt+L0|*hQ&XqUP$I^bQR)_VYs^;wE>!D{D)=iYF^W2^L7vXFG;D) zJFO?o0JM+t#TYC-uI}S8>ql`iS^p1biPX>j*-nah`EMTlxcbEe60h40^`Cljgh4zz zbGL&!8kUJYv{uo%;_yhlbwG>?eqE zxQU)$e;LfZG77qf_YO}(&+m*T`|6dR-~8hp8mhU|+3lUVb++L$Mj5>EL10{=>7o$hMM=z@s}RCi`jg zvU~Ky$FAd#_2KlZHOsnVI1^WHDPC?jt{CsG9Gk*hmAAmnEIzf#+nd=F=bb`oKbx4LyU(^Fq7QkenBcUw+T)bO_kM^KOm?lU4JWGzaxAMs1)uop7!{x`A^{ z^7lTdB8x!}49AFzmzJOVMa0W6z8qooCp=?G#rquXj-zgg@kMcuM!Y0#rK|S(;pb?2 zaP){aLh}vn7vZgjEesOwqHmS(=n>b2tOq#H_rncwNVu=1rNzIdaTr4di49(y!N1jnY6Tg!%{Dcm5``jD?Xe+xo0Mk?H@31 
zetjU9BhtR)DdwWOsP0}&#>D{M8O}y>59J@^vC#To?uUTS1W(o`uwaUh-5LS%Xd@@D zjm6VY zl0#w2X!%dq$JpLUL#Ml42iISYpM}WvTs;@h@Mp)lPXCq$U1e&YC$a1Ee-j+E6HXlGnKkQUt)}d^ zG5NGMSBB`*vhEenJixmp_eb(;bxKvhV5g{8L?99d*ciW-cf>evcQSS@P=?Fr#JX1S{XpH@|AkxyZ1q zDacl5;sn=`25g1y;GSD=YEg?T99kQCxtnO2kSB{jiqPB&uyDKJ=d3_jVcwx*A$Ajk z@prl&k)jI^6b`w%NyWYAQ*@q}^5~&=;QCw2%jwUXrTH0sgLHA@yzHD6P}LpmMf6QT z4o-bc04@4`j+OM6)?)HWcdA3!OJb!m9XD>M5>%bPl{%<&d=_z2Kc zD0>K-AuZxv>Mw^NLLBuvgG#!fwUNR<1nQmubn|9Bl@_sUU8TX$A*@McvW~>isF4ty zw(1z*!Tv|tz%dedi8ShF%T6>mge_`*1gKgPq^vG?uu|c&8`7qD2+*eNT6g5{xLee8 z2cT;P{|r)02dYxyvKtemI0;$U+JHo0v?(?pEXEXYG{mTdk;{EHaHa&lp;5r{^0|5j zQ;fHpT=bWpLvTy(we=)dse%_T!@U_^wrp@97P|#mw46v#3dG;^$Ez<%&4(!UTFWiSNof+m{36AXw|~ z2D0OsYs{pFvg@Y#h^`VDr-H!8N;$(d0w?XRS7n_I#MrQiZr}G3uzvr|o?D4%h|IMs z?IUNZP2K!+sXUL#ZDO8HT^s&#JK{}U7r9zH(&~;MyCpMY3BPs4n!te3vc`0(%A7ZobtGAh3b!rxe+%F_=G;(obFH9IhqpobVguw6>%hp;sY`$IoKGI$z= zV=(o5*vR#Nax{^-#sfTUSuu_13!YnjZ2iI*cStj)2{1A#EWxB{1dF3_Y~|mWu}<^~>L1ul;(3-5OlZJA zk{gvWbvg(>>@_*CHJ_+W@yfZQZ_eobD-n0LwJMjPVw^HsAveHHigpR`s zR`7iMYURdkEIyk?d;uI8Q27GckWAt)=k`71px3~BJ5qNC^=)-3ZG|8%;Cvp;&JVtV za^b__#Hl8PZ>bo@nw3kZa`?dzf%4bEU--e5q=eEMrhfg{8-=#Cy5BLIw0i;Wn?d|EnQ4H6d#>tZuErt&pdfBGc__HuMc`~7t;-Y#>iLkCa)gSRo9KS@W1eZ zH#}wQx}OD07nAl$Fft7Ir0kn5%vcz*^6D;FOEGnw0 zg^rnJAD*&W1(J@l^kXEwq@o%>eYtqgv(ZGHs@AGY2@9x#JYE&8{z~=-h9@naKa06H3`9e@CWtfOaNm zr|`5@Uw$5KD#dghje|W%gfJR4fQ^;BYE{Kb75YyjLMj-C7ujop5PXF zpy{hd8~9Q@8b#{^vf{>Xp92}%lO0N93fzGC&aN!B%Ad98Ll|RTdkaC*#QlX)D;PRl zpBklu^t}H%V>-w=x;WNi#rB8%Nf+Wazuki~s!!?}M#0S5UZ741>WDBZT4%KR4{_q&F*&|? 
zjTB;QUqFl75briMP$y2DH`9_kwGde@*@I}SolUcjP_tXYkyPx`@u;D^fgx#KCgWvC`Okkr7b|!@)ND~@bDwC!kkw21zs<1-Vj@}PobQaeUP`^;1v7oLC$57Nu zc|ef&lh=a!nHxsuH&95NAt7}{w=F2(i$UaTj7}MRLG)uz>j<8tt->GuyqX$V!wj}@ zrFhXSqLl)cXS&40Um2FSTou9!UR~jg^s~>2<&FN?_v-=vI4)UlEmm+`=p%|k zVy%=7Oyl@&?G#t}ZiNP_$7pJo)e|e*Dyi2YCW9v?dn?_}>Wd6mIjXoBG9A_`^B%ne zmwEsDhHCW~cYrp5e#)=pua(%EYq<-o@4mMCTkmkma{Mb_6Fz?=m{sL_;^$-kUs4;q z-&-YEbD&MR{*1b0@DSmbw!El-0S#A4;aVBOz5&lCu_crXB>tA22`&+s!V^~ggwX>g za~P>j%~-EZiebXFIOC8Cc2vD2+}*f1!f>2qNu8UQI@+VYbHYMcq#6cWd#g5}pcaj$>;wCamY z;txPW9-2Hd0h>@HH!(lppYHvD1c#{OJ2m<>a`Gd)|Aj=a>0$Wg^QjD`?${}dFiIo? zAkb6%&4m6}-O2zyOpTs8iEe>7QmiD$AyV6t$M_N>uV!r$50g5{%X3lzle(?I@6(_4 z$m?0N+gK{cSYDCv+yKbe_zd!Ml;>;z_eSORCJnK5oHH?P?>kn#kF4RPl zN|{Q+%yKo6QPS%ZQYpP#$J<$k1Xqb4W*b&AX2}zOWR1Opfq(1r_rfw{D~XbrRu1@#l4(&Dvkyf6HqIE|kwRm|8w~8jHIY5Em*TUR$an&TC<> z*99@UE;|GYFafs#!+SN2f^8j)cAW&^8cPZ@mNuZPC0<8qcaX-@#o_rFz z1(RDB4*Q!8!`Y?2QM|PrF!AYJiL3FuSWG$j=s7TDv^63L{Cv*!qCr~+MEP3Z$Cx(y z!J}T`r@d|{mrA#-zaIXv6*nf*bXSJ+Dr<3-Hy6d<;`%#r;|c% z1}m`+y)nu^QD8k&$RAK(CbQ$_)45_KyIzD05ahq~S>^is_%%b!$pLy+4n52DpIJs7 zTLw}f+&z(!3q}%-=84>6!n$bG{^5yEf^S%7wrNHNJQ4 zv`Cmc=Ck<5fD=+~-U(zfwdL}=bk{jyS>YMBXk8k52gaBd`=RrLCFWIbJ+Mu7>_fgL z^msU-$F1O(BavlZj}f#UwMh<8Pc8+tF73xkdYIF-npE7IaS)G^h^L2jnfOYk#vz1| zxK|~9eIX>ts^SH1raw&QR}|THzb)sxp?9@HO;+?2^be;wODMMJUB-^}u9;%#ef3j$ zGNOE}@l{428q3X)y9&V#*SU-(&hHto&;zq>O*R6(u~ITFFD%s76yqUWdH~plA&$ol zy*(n^iISV)c%Y>b7RryWzMV)sy|p{|Nu$WGn6crUKr8b|X!wJ>?=WbBOAgD^vw#n+ z9lOFEpJg5~nNGHu$8IHNzsc2ak}viPTOCiL!wm6AJq;mX*8_j<#PuGhsX;z8#nvnK zQ%N@5Ci&nz_UWkkVNwh*2Mfmga#S0!TdxbB0UnXh0iX&LK}2JtpweSVxj}t@oq*%Z|evEbn?L>iJFJ_nJLLTV$!w6UaML zdi!0+mjIEXb@F5qe3Wy)uFmSAsG4Tn?(Z)m)GptbJ|v2_yRU#stt~~>Di(a-bJ7-M zO(Qk}@(l3vT=gO`qhIj((S*OCdBE@d{+W_v?m}db@yDgW7hFpP6&Q+c3z^rRHqVT6 zwDAL{Y()FT>eDQK0dGMA?B$CnxlcHY#&XU)5_=5_ zpTV+ZCu0K^1>HIE+016k6-#$pal;g}V`@kzInw8P5dt1#qbEJ? 
z(nC>C+J1siZ^x{v5~tTZMR?QwARjw)yh?opxI+s*qRL(A4zDQRIi=w1j;avyaY#oAK<}&K`BO(g_DTB%it~M1=0FlDiZoZ-vtQNJ;C^X zlL|7Qm&ZaE_%421FYM<^T*Quh?#2aK3((Q}*ce>XSpC}Yj1-S+1R>y}Q7IeMi^p z4V~k;=Qog0^I;0 zegRZOB9q*TLc;@)OG`Uu7xs0t){sc4cvJuxG@H&8su9tOvU+-ZsTa|z z9=|t=m!r<`AswTIqUr0%@6%=qXtHRgLqa+k)ErCrxy zpLt(L%nHS%%Pfau62%ke?VyR8b8qgecFl|aUgRwrOaxuf@wR_2sS#?zHzrm1ntxVFM@X*CQ={;JG zR;e>E%w?&082$I91oMP&)3cZ3re1K85`-CE+`H-pNSIzw9iOMt*RE8yd&0Prr?me_ zyia!;J{J2#O%pWQXq{bS+oh^UA{u#<2`Vt6PI%WyodD%QpD7^tWHbL1kQX+kjo;2S zWU&x-$+i}Dp%hfbrC~=drsM|+N{59W1kX#}wz3X~xSLWkMEM$fBDtqu%lx{=PzDao*$cS9VR;6U=6AfS#sUo+z<0i3b?gEZzEX_oRk#)~R`P`qODHzEA zP%w^J|A6W`pC!sf(MZ@OoL%C-6buhc z{jP0SE?WNc3XXEiAjLydFjn{UOb7cC;M#P&AJeatR0XjAJux*1ppR1x{+FPkJW40! zK29fn2qxu(F~7DEXCZ-zv*>(O<~&l%VQL9C$e7az`1z>%dpwqMZN>tG=tLxM{ZHuO z{si^q8Cl1E@8A>&O=YWRx_|v96K^%!Cx5qZJ|O?xr1Z1Bm)%HI^& zTZi*6U|w6Bz-z1Y8ED)yqD`Utzu7A+h+mVrr|4vgvvC(WI9Y zX2~Q|p&tIu>i!C5nh=`y;qyTwJF$4lz4vsIt+vU>o3i8)x8C4hPuZ`ckCqOtUFn4J zVi?xFZw2vUyys*JT}?j9)cB3i=?AGMr0-q`5x9^n)~qyWoO7U}H<(FkR_vlM8ItLy z9&-ZldxoI$Xhx}uofa#~+kuKlz3{qwPa$!Y8*|e#zG!&#LQSWlz7`3cK7(w}v)MXH@PJD*mHotV?}*9_#>s+Dkw6d^kWKJDBu3ycV!M61

Bgt zCgs?-nxh8?I|ju2(Iz?xX~lOdlptQE=qWCQt3H#FGOAVanA<_*4PENo{(Eoj%=Y97tB&LuO_$6V>wS?g8EegD1KCsknogER~I&mCyBFJu%6z zqt>e|h7DgC&h+-%1(&HZ!{OIx$Ho;+xGRNxjKd5nFADy|+%pIm#sgKakqP=2EqAHeX#*e!+b~6FmL~%VXO>8gfNwp*m6H6u|Hp2aP?cK zBwf}v2e!0}-;QHdqj7WAs%!ZQeMZUXpEF8f!Mj$UbMe)cBGg`Ct7V6gH+AE6ohF4P zqWMi_vxlOql%#NX_`{Lni4z0!uRp)lOSGe~SM6}n8D!_e|0(P%qoQixe#rp@DM-%eF@UArO z_kzL2oZXvqqcnfU1ue4U+8pCfgWbP}5WLxk-btpK73pbqFf~ZOklivSy%l@2@8-k7 z)-$r(8}R0_HEPKr?eDqlZB1Z!UZiJ)qzu~s`~I4JlrZW~jC0&6<}X2qxIE+yyx)6_ z=1)Eu(=GAqeN$dJtEMA_5)YQ!#b9}3iuNiH^gZ5X;} zU{c9&eOupumi^T?Lz&?I3{Fq;?pLn)gm7M{KrpX4E@9}t^}g8+DGdjLFwV#CV;XMH zNaWK$#miW0y{pYJ;p9_D&-ACM@(_Uy_Vnx|E)Pc%n;svqS~+biIgJB*+5Mar*^9*~ z4$BUl^I+tLd86;x`~=`|t;faCsnX~W7CpYSZ?|BT5$Wy0D3nqx2i6|{!CEAq;t6|O z;yZjU@(C+8Mwz9_3)!EB6dSv_rKM;{$_W`fxHAOD5cMX*cG+ZV2hrzjirz{!nRrm{ zDeC>~j>l&Hon`$2MwjA__n)F>@w1%9REQn?>u4GZ73hErcnpXS-O=pw(AOnWVWVI9 z*u_NLKWK%W>>{YTO4?f>IjQEWuW@Sy}nq z2?@&sMKqK5B|>5RM^tyvsr(kAbs;bvD+VBri!EXks|RI_%^+zVn{b!)2~s!P z0WTD$^nBa`8=cy+wjqz4{?+mm4B9UHp(D@+!F*g>b+=NO;>bvuXc-|I^*iBZqFRE+ zdCc~hQ~)j(3kxm2zEbOfjf$pt3cb8YfBg5>VOFWIykzqt)&fPzjVyG5x5y9FUb8nrdkP2++>oGm@n(vXWXQ({`zgl9q^#p}Uk$9!0E6*VnX_YGS*4hX?iHHEF!lxn7y9w+GYJ*q zT1LS&1PWEu;uSO3`@Z0)ggKHFCw-VSl@S%uL z`!5UU6^I~%?O!Y$TZtb02s&%UN+5ys%!e*-Rw-h6mpQ+X8VihiKhR^h2)C;1LHCTe&mMP_>*osvlaWrn_oZq&aAz zyV`;t1e6-P3%i<4wN_dmn8mihW$?K6JNFkKVStB(ueJP^YP%8;Jg?+6P#WnA@Dno) z)RKMOXRfCXuTPsPFFzCWd1^6cnYTH=&J$1H4o~&X*y_3VXzJ^oZ8TFuk<3!N-@h$p zT`c-udwk`==E)c1P{_KdG(T+9silfS9A(!6-vpUF%eH(uFPuXoLpll~B|$*hKjH6n97p|Fw@F zC2u_8YiIQENqtkI?+bc$KPc;>tuHr}uiQi}9yi|soGQr-Dm7}UUKxt&--3@Y0Ep86 zd1&!?)$I9Do$MIA>oR%EB0E+~<;t)L5!x7kR_U9<^dsh!9{s+YrM#IX!>npMhG8Hi zIAl9!r|#TV_A9*WTNstZ6Uy=F<9pwdFtwI8lty>QnjoX86!|{W+73<-Pg0n-*+$>s zE?pv0gMd+Jt5weKiE-$lCLSgd3EX1BOepNIkT#w+b~=oSv-#y@qmy6j&@(m&OCjgo z5Unb4(Q3`nO5{sl4EyKbB*eFA@a$7(F|*-u*40X7Oka?A62i0H?~u^=L8{}|V9;Fe zV`WeQnKwus2);!zA&AR-|D)MJ1N~-YqD!9vPwYrGBTC=R|6)TO&ejOfaGU5aV5J2n zzCTzg_y|HhY@lR;K_=kQYT$dOCl3@5_4x0rf{X}JA)`=w1U;bqHh(~xHorx^QpEd1 
z-z(V5SkCNj{GZYqKe~t3QzbdXQg*nj(0nCLQxSr4&AY9WN9MX)UYI}L4eN}C5l>1S zIo^92|Gi$C)kmGVQnmr`E~lP9{}GkKma;sIxiH80l*`e0vidW^cYcoKfLA_^apY&- zT(UIVijQpH1kZyMsqcG^N**6B$o^VskbgQV6_=H64;T$rH;Lsdg>+QIs9GtQJA%5~roL@BnTXY*a(ZW?wh3M%YgI~z;Dr6%Q*3?lOpdNyWXq&Q*m1hH}9 zVJu@1E(uuto%pyD8Tn*S{X`o&O5WFxSo*>77n^y6MZY+=A?$%jC_Dd$n?g0ZG-;$t zD2D<)%ok=d7Q!h6ivgKGdadpeohqT+hyHSD2Ht_;T)-itfbBz^;;uFI;=ut-J^y8D zR}Y8dlGXIaxu{+irSrWG!R`H@nH9Ai@4X`;bz|KMq~#1bA8r3~5vSs%8*rfm35lkk z#=kFLF+MtP6cso~pIhq5x)Im{ZqAgawCl`#Q zmpC=d(?60P(yHOvocR`3th0=rZ2BM*uJ+H)PAqFK#UHk|Kz5o157-Qsy3dVCay$WM zvh!~cajg?WYWKIbE(Hc^}MQ zGPUPE`%g1O+I4o!44jJk4|9O~S05;}Jn|kioqu)JtgGDZcgRFhXzVG4fagARXx!|{ zQdf6#jeo128A_u~{`UK1cR%;dsC|6E4$1$U_bBUrSpm`Lgf|Q%WTpawR zH{?ES^)uew0n&Y9z9p-{fQ>N3!=(tgb0S4y>omupor>DH7pntI(P&Qw>G2*`u!L*b zVs`hrK`08mSl-bPjD%RZ=o4}(Zz#(Nq90<#>f9&Oj{6xA!fZkmt3yVrZ8PAIs(P36 z6@Ys;8fkz_JUfogFXi^gO!H~CM7U12uXMsB?h(@2u~Uf$o|QIG)yJWcMi`adZ~ZU? zg{m=_8AOuyUy=aW%mFl{LVkp-2RfIPm-MQvBr443;^bWA%z_im7%U9d4q!}Vxtx4r zGSD`9Tim#d&!i_%#n6`{c3)CLU>ZvY19`wzAOx$m7}Bxt+nr<2eBg29^}<2Ux=*@* z)wog|HbQDR8sD4f2X!iV6VG%=mVvvOW@xQYDO|e@@+w5-tkUSPiL8VQ zy4vaWP;mX-SKjfdRn;9clftDwRi5!EB}F@S74`3ZcuMMTr<#4mm0}U6mfo?G<|%4P z3gil6dWC>XG{(<3=4cnAa?K=WjkXk~UO!Yqoqo_!)czo?F0afTNhM;gPyLHxL2-Dp zv5fNzNCsOwyEW=FojS93ujk#;@lULCm^V}eIh#vvNR@~kpVXzW^qxGA;=fqT`Lvz5 zJ{C?A&fXsF9JU)q!cQ%MyIWH*Y4H?T(mR zD9fXXVeE?%c`H!7n*pqdg zv;Ug^1-nYoo!hbe8RuuEzvLGMzAIzx1F3f3^Jqmtej`xP(^qeRaU-7^#rVC)x;NbSA7TgT9 zH$2t@S7(h<%m~+NxOB^!CCg7&|!-Ofv z3~PG$%k9l=qti2bkZ=83icb?mz%Jc@D<;P;&HVM-A|Z)GZoaWkN$rCC!cY;B!^48a zJ-EoB5VEv&*;@oSG~U_R!8Q<*P2UcJn+~DS@oXlojahzabyfFgx(7!De;6%WZ9b!F ztK)_XdLKLT+LEjMHsp2_gkxO3T-Z;W$o_} z=!ZtBJ*wt=L!ScYi&v`>XFulftEcb$TeaMXZIvJfOP!qGtOlOpfxJtN_;6jmNO>ch zwF28r`m~>GkMM}n;HjNeeP_1Ya^XdbqiA}WBf!h0!|6*Q&nwe$zL%z(l-LLGiaj7c+q$%t*UpzOZ~ged(oSp3Qut)*DD@Vu|(f`RunsNbH%%263cPw z6(FlxtsbemfTKfyCUv^>35#ifBt)3UI1ks6YP<-}t6yYhL=LUjD<^}6Qh3y;mO{22CV zdk^I;!DY?mL0>HSEM`jg`e8Lab9?JOMnKDj49{^2*BsXwUD~ZI^u>;n$(3D;$=W?* 
zqm)bRt4V;k^K-5Zv%#zbiqgEhhI)38DLKMYqcKx1qkC;khI)&MUzaCMu70%ZZD^D* z-7Pih6<}AN;$1A_GORx9IkgySl5ZW(;{MQA_{p)Ur;uyOd`D_qqy6a|e|gZoD|Voj z8!KpT+EYM2R@ z@v?O^H$U#&1t@y}c9awb zW49-UE*;+AuBsbD)syHSFFib3jKkj;q3f&ongfU%sm3qShIYl|%bl+$PtE;I{aYk| zJ74=8@LIkvYlb?n46@2iI4lZohZD%Z068v*#0f1DSXaCRrLHna>OL# zH-@QT`$xzaLjLJl*XpDb2Jc_b4sy6^nKG|>8T;SO*2EAc{?;{*gTd4BcCu0QKOdy&t7PC~~(FjKLiI>z{8^!|qG?Hok@fc9L9r!2$M&UC7- zGi?g7JKoxGPtC}F(Q-lE)U_pcJ_RN+Ikgi>Kcj6Tdr17}Ghywq@i5U>{RK1EYr|Ns z+{U>Jt&~%-f@&!L`fL~;sx>KREcVl6PgVzi1Z;^qBO$&R-^i6(> zPz^>FE$A309s<@U{~FH47CoQ=2Y{eMRrfnJs()@`15P^L^;1Q%o8>8_n9^n0fCh9r z#9AkU4$0l`L8rT7!>EpyYqF8qcQmYSOS1+Pm$i zbxjG+*?dGyoE9H^an$~PoLbv9IZVOzgXdS_JTq;F`7u)|03C07q!o;(V3Y*?{Ygb1 zzbTd8GFPW>9lxndI}tuB{09x^`ZjtfpFv}j3|-?KwY`*aS7wUDG<%y}n+KoKXtz#oXb`Hn57slimUf<^0x$%OT+7f&<3)lK05%?-%S!VadNcQVeovo!z*q9+D0P zO^e3-EH+&d(!xem7q21JD2VSLs_U~Mx;`TM&z zcFzkC)3F^ai23~&(-GW_G)$sU;w)D`RyA7q`8Q34vBkf@&aMQI?T9nx8qdNR26`)4BC|^&9QzY4=Gx0nam-Eu!IV zfA`8rBcfi>!}Cn&X=XZp6nKwNbxOZ*@j#=h`H2Eky2V!Gokml-R@KYM?p(big;yKH zT=tD0q}4s+_oVDTSw#0Z}2TPuq%iQTxN53apd}(|vK1D<^uP zwBV4Hz}304bue`I{$~n;EMgQ-r-9$kDdCQ^l@gbS8|7S0iB&SHP$EFOMMFAbB?0$? 
zm!&VCq}Mkey{L++=G)C1qFPll_EW8-A?x&(NZ3`S%b%=e^d6EWS{N0);yx+4ff_TX zAaeEPWBWZ-(O0t~1XL2c`j)j+G5lQJ;}GD55?83Wy1h$IdqEm4OT}NI9?#vOn)lPO zFxf9RTeJ^nYCNjF2$ho2F&fHs?{}jcR&mXT&k3zutu>maXwBgH2#rQrTk5a2>rR*h zpTd}OEs#7o%ev3WrVE_a%XvaHD&x>^^nP&mve9oePz=n?*8uW85mp6n)?n5B?L~&g zA|E{d#zSNYiyWByY1Pp=Y@+EvyG?na45xo0Cp=XFbb12=sLN(IM)v2|4K&%{IJvtL z6#Ia2I2-u_Lv+^apV-J33x?vqp%AR3rtn4mEu?zCBz8$oumFHiL4X$2ArY`s9q) z_3~@4JrdvMP3{_C6)L^DYgjLd=a^LCHx<__Z@jLN+hPe`nn&=RIgx511%+P-(lVjg zsu~$Giu3orsKPw>upGC2&S+E-a9>lRaptG~_moG_S6(Nt;vQPHqOgG!%cBMgcgb~{ zN1kR-EE>LpR$p0Irvq8!Cq#A$#h8w-u()KprIC?E&HUXhuQ)14!u}(pb)~DK8|6L{ zG6>=*92(a_V;auAgZDY^kC%Mfz8SOurt7ne?)wTcNJgaC8q1|84H8yq680tYK1ola zAgWSy5^R9MUUz^_o`6AecVm^6I>u)r>`d4boO~i|UtKu4YaGT?Wzqt*tAO#s+kh&Y zrVzGHa$c_0-ph_;{Cljlc%iKQSjlk$^2saU46KJbVyYG6veJ5nv37E(!g%GAnT@M5 zSy?o}Dk%jkk}ArNe5^F* zW1WW-|9*#u5yr~p;3^d!J?8u?I>ZGtM6msnAzCR>fuRm;k-)!%+D4->N?`jZx5VN| z!d2oO4uiCUT%Q>p!DwEGu6$kxD{ym%uu->0@?wEiLsrP4HHhZM{BDvvKju~N%oQ#M z5OX|h^C{I6PC0XC+63R-7c`SRNuUG5;h#=l_}gjRGF|ZhG9e7mYct0Qb@9Z+ykyq; z29$k>Aef=n5&h4%Y~e`ra4*1IM81kRUPx%7w6yAS3(FO%2z&Np6@^neCLj z2=!DilGh4EP;&_GCG(s@-Np>%?tpI52Z}HJTMq72>)?G(tvwSmn5msy*-hJ9`-sa1 zc-bu~2y~qLY%Y$CWdse|%-INy4>Jr4sl0jx(~<47kau6ApV7M9HT2drz0g7)PPW-0V)h`8ZLLs}x`Bo2Pa7mcZ#* zktH$YGmrE~uWLbHM_B5P_Y2T3-~Bnjf{p5<(V z;J_;dNg{^y*8$jwl|4%xIvL(rg*qDI^WCVw9YcEj4gmsKP2`K}jNS+a9znWl0^iT- zCMkc5l=sU#uiEGNq-%VpBUJ;7b&iKTi`N+)9Q|}~k~~?6F7XdvC{*Xyz57}ELrn+w zC$Dq6#6t9Sx!%f3!wR33Tw1^dUdy`rY%r)wxx0|Nj26*Mg*22_`85g{JRjR^rOD~# z_(*0m+ra}s#!JO;)f2j9bsNvUI`b}C>gQi zNAFd>H|6^5TTQv8p`#hWjM%r;R1M=_)4_cas|Y-|&v|>VsQavHer$R4T(>4b%LXU# zQ;}y1fgXIEV@)6M-Dmq9K!@J)zeLfj`-VD)(eVfBVM{3Qn zVX(KYKImsTz%?;aYnq?8G34`5Vj>|yJ^PPzptP=?!Xr|9O_3-5zBDI$oT&uI0ttbi zdYBkU{t?`SgMu54{|av8$zFRG{a0`EE~X1W>i)}^4MW9F?$J2QQ>R4rzfv87Or@K9 zN#Ytmmx&k@FCNEJ{K^OxlNe`6NN{IRZcbQR`R@Fs^#RL^uw%2Fr#$b_!4c-Scztj3 zIVKZQ3SRsoV53(fsRcAvuGxg5e!?!fQT#i_w(#w*m(ikx{wkS!!DNpZSc0e*0a=$d ztp+Ak#rg9L9if$L=J6McdOZ=R2+gh0Y~{#P1ZN*V)gu4GiP{xc#OX+X2_m*}FL*ou z)aAorm9knRBAxUPQJ 
zI%?Qic#4=5oW1tmzGluknTY?%E&F|qCS_5!o>yAa|8{zh7&{2(jd*O?SNn*tTi)84+GufX5FuG&#Ffu=0-R}lh3S*eXj!LtsrF5{P5 zf4++_nfc#JKXHFe1Rlk_vmLbXEE9cc>%NpiuyM^x9(8ad(d9$?*zXp_HjA$S`B*X| z+W+)bRC3Xh5sZ}4_Sm{R9K)tO7^APW7l*Mr9TGB#rp17XJ;X{xs^BtK=ir76PI6<= z+OoH^&=SQPaRQj)GQ8i*A?Jqe`# z%GsB}Yp6O;Y9;Rd*)N`K2@qG8(a(8m0>cl=73NN=TUe%O*9>}9-%-spBIF;8Qt z0QVPX1a?u5+BHd*^}U!;wZh_)Gaj-P@=40T{m~hLJYafR04$+mw~p>RYRtm9BE*d} z!k!R$;t7)6Or(1u&MRukTqpkbeZzUYdBM|?|jOIB^gt+*{newprwqq*hz>Mzrmp}Un6OTXnMo&Z(HOD6pi z*OBC*6E9e;!f$4*jyRSk@J_8JT&jX{gf3Fvte#}jMKpzWJtB%}9AL1Bd$bMdnYY9e zc6v45d7tj(hi0M56`k%?zRN$m_0q3%cYT(|?0~YFBK-n3IoRPuR6q9u8-pvE$b+DJ zoO)`E{qAy(Q~ecwqtuV`dPHrN?3&))2Qp!XBsu;nJpQISChT%CfXH}Tub zZt+CEo%8x?Mw6aPXU~U&!XQVEeoYeue;(J-P*>T?-o6sG83;U$YmZ(ph)^XM8sQL7 zJW$7JFLI8n5WU?WY5jUd4Sr?m3aF?&2)sNm^;~c3_d3*^!?*j@3>nVTZPW?wvdx_jo*edUmAnl}q}`Id+T=KUv3%%GdIBIK z_u9~FzYL9@?h4IcR^H;=KFo-C%W%7a{M9!bHZsz@tKE<#ig#5_Onx0e0NRn{_!|mv zu^|&I?9RKKYyVS*Ci-8#Hz|F7Aa z7m`wt@$ZEKjBbo~{u#g|U=pG!N#6G;VgGYSOhjBfrO}&RMpW#7ZHbEs|L-+nk^i+O zAq0}j;L_%T@=Bs&3i1NNVlZ(D1rY%yMG0kP0VN5Tynv#BsJN(%^#AvS*&P{~JKk_h YFSxIlt=%1%govQ{9X2+lr^*=oDM{M3ltW`t=Y8*y9X+%yfb^6?pxlbWS5Nyd> z1l3v@dBVf-7YQmlP5tUJ#W;P>SL6Ow(Tx*#_j46qpU@HpgS`O{Mq6p4>akzhT+a`g zhc0)XBfw)ma50oWN_ICP{hXO!%?Ie59I877<^1LIYwvp81fKU#LuNthcWugd^@DDf zr{}gMBfyB}x)6Lh+?Z24T1KTdJeS#!Vr)crWy{Zz4R54@!o z=5Cm&F&*DF{l$GwcXvZ=8FfEnIMJ;Mk~U^1bd6T%W=hRjTl9eh{N>wq`+Jig3yS7@ zPU^nf%ml8Y@}GygKUyAKZQ}tqX;S%KZh#N4;z@Iy<6LR^{AVct`F{5tc6%55_es&@ z_`dC6;>x;f2)OU-e%QDCyI%J9c0=cf;dq8e2*+BRQ+Y&OHI-~x!n&-xI;okFT5(|R34#v5hCzoz!pjwZZ%>3p|KR@UJ7 z+}b^`Yzc74SKmwL{so?nwXR^@586?8#~L>`T|tl6m_OXx#^#MfsCKm^8#%Q!@_tzk z7gf&EE>+LZgFZ<$0S~Z%`-ZFMi|)JaUp2uA`G3zdJ%c+Reo6m*xIF}Jw>$WXemq|^ zTORDozFJ{=b&_-$IfZ3$*z)82>QQfEEB_hRdh_@0@ieLXTPg5&evjkVlcgNPE=4-u z-`S)vp8F!`QX{IbL+$WlKD49sndun&i0Qm z>x`Pr4zd!X91MRL|F;LODm@?=yu0-o7)hr)8csE@FLWaWIKyc(*RKS}D=!5pe=9KM^hn$4v zN{KSXyoVf(dD_!@s*c4cgFaX3TZws3m$F~W_LvC?c4q0vrb6tKKmgF~7P%h-j^Bhvk1(!B|X&iCk z0Qn}l05n&XUC(cEE}Z7Y^LM_wHRfb)Heode=`5L@7miM5uJ$a##OYcskC)>Mp$|Dp 
z_c_!2uoKJe`uhN-z|k9qCExbAym6_LMuoGM&E=z#8ygWXRo;&)=t{s-iRGPTpJPDW z$OXHblS&r#uGWJCc+_^Dz{KjjwuP33q4lH&h#$V#AoKH3IoaRyP{|{^{u1aFHZr{< z8Tx=YiY(QD!d^0~q798$yhhl@u;rtbF&TF;%rB61e6<=nbrKSO-cjwlg<{`%vw~w2Pz;YrNe%_X!+`&sV!pb}mv*dx4y32X2iOM|D{s&=8{96!0vZ5J9@@^vp z_f!v+P2kz#C>gQpXy*(QU{lu=uHHt*hg5h_uzhBER}~pgv=r zAG?%#8;5_XmF=M?eT9!JyWULejYvSLR_W0d&P&rqadZLu#&&t|{!Lcac?aUdNP2V{ zcz=_~W^{UGhD~o3cOSuV^2|NFVQ=`A388IHze8sgj1gKj-t>JZTg;^ByMy_)1B<%F zc-2ViaUd+Ua$V_@|oI#VYhaGhvBg28$XeXIZ@t$o}Tz%Es2e8Y!xTd`eg{e{-q zwRf#<&yo;uZWUQSeJ@@!!~CdIX)_cSv8k2RVY;DZUM+hIOGAF1i9-K!pqouJD=3Pn z%~L{%AQZo;<-Xea0`PVn%D=zVyoK@ON@H&RDf-XB_#;8F8+$N-ZGTTMb7XS{@=eC_ z9wC?lZI6jgIX@5V@*Z?8aH&JBoPS^S>8G*>bujVJ8Qg_!ZN4ZLOO(0RLD?ZfM;aNx zSJ9ykJE$jP;O~RL#`g9#<19SErKf=wcdwl`aOp;0cb$zG{s~w^n>P62KdOJF9RFTb?LG|6 zEr#t?CqZD9L*7a*=f25C{jm^-{zAB6)L}%QA`!^u0Hvg?U|F1CuuJdeyIJj|j zFp>srS0=#2JU~K-h8^%yw&WxQu?6Vlx>yft>`k%RPW?~c_D)R zzHy*mDjX?tCHs__3OV(n_QFA>TAYUuPa*HK&EGFEZ|OZTK-g}HD!EB3?}w=^ky6AL zQ5n4kkrlq!r@!A4&>!lu)#d4AWuGk&5((Ai`G|(XzF<>e7uTc96K9>Bf~ekjBwITp zl$UFjU50I(I$&Kv+8GvVZ$WCMs_bW6KVSu@?z+|%6L}lc#%U?gK%HC2p{@ePQww6; z7a@pHZxLK|392X-Q#VUB{Ibp3Vm)8usXA(?Hwaff_Ncb_!o#W*FLZrv14!AHpJ37Sa;QeN5#&L2CbdPHb8V&k!Q;=jjy2p5&lNWF# zkE^wqUi=UtvU|pp=HzvKGIP)&o>ZwL677}ewIC!LSgJnlaYR4s9-Sr|2#L*4i15~G zHAdV2e8rfVPcH>}<{_k)O35@wmI;I$Eiv)8ZTFk9#OJB1H7xM)2U6Z;(UPyE&KQGp zO-XUN2lw()S-UWb%4SoEaNWK#@D1c7qaU+pJ z(nN1dOP^(c^@m_Sj`_Ry=dW>pXuoE3~H2dTko6h~dWb~B86|>sI z*3EC#!|gOSg2;fWUS_SpH2@~*M4ToW!{3t@ohrG&Vtw~pmEZB4`2^*n_{{X2d4pMt zQS;ocIesI5OSNA$uU|tm?+A*?oY&u}l6NWApJvtaTUDB7W5i1HPjNR1Uf#>hEmi4H z(j#fKQRm6KC#0QQX15JmeBJ(>IQe57U045N@y^hBvTg48lbbj2zE=AhUj%%%?`Ma% zE0)%K)dI)mfA#>16m?T^^t%*;@fj3KNSt$Tl&Z90R^v(8WFe4(Q6*C9hOK)M<_<*Muf(u z1Qzh!DJt?9ViFj}e`?k=7It!J2j{(H1CeSlHQ%@ zyMhd^emB#rJG^KPSK+?d-zyIK1$urJ9ME`TeOurWFXWLzaLQ>-@v(Nn>ki^PoFi`t z1qK$!1s>A9W)1HR{G$Cb^uHvI45og-Om#U1`ATfQJNCIbmt6Z*-w!q$)FsY0QJ>7% zY+tlV6jE$G$~=ilMmbEE#)I)|n5VEVEH2G8|2Q18soXJ{AP1i;rIUDPvU)c^=lmc` 
zHc+3@Fi8w@sqtHg%{8$n$>5=HxZDib1b9uqdgjJW*0%kboT_S?+Z&TAyD9IYcDja1 zW7{|#Wb>>wjc&bO3WkmgvmCp`P4Dm=cFZ{XJp~7H25-idxlgDEjp&PI7XFyD+h1=r zX)c*9{kFyA!WQIj%^DN5Tq? zlBouh-`V2o07a>Zoc!WK`cIZRClER`Hi4Af3Py<@8(_g4F|$Dg?;PNU3Z`|)*jIZY za*hmzsix3m1z;eQzJASMwNr?NAf!e9eGG=ezrTkImS{sxgHUH}ZT>7i^n9~}gc;?q z(~PL$y!CS|UbGr!ns~O5{m5jRkk))isr6X4;87>-w*U+|k?GPu zu{a9fKV@Q{VG)(le{Hq`G);H|SHQXTi?7j%dvfd5zVZhgL>UK}%H)-N7_e$9ai7ma z<6lt}@!<~N=dH*2OaPuRUz^KEE;a4Sq-YcaH}_YL<(sbJqF47^8oL2~G^LKDhFf5w z(#6p>oT&Zk-B=fvEVmKKF(vFTho_u9-Y`l-8h6A}#}k3AbHY+U;&J*>*s9GmS0oh~ zoHXH07qx%Zn$UtU&6EG?2l}*x3C%0l4TIIbKXZ9G*8UdSLH5XzFwz+o^E}a}v)=~d zW0a`oXjClb;l|;-3CA%!vUNSsJ@GW2$!!+X4KL$*7EmFJX(`?>K}a}{U*k}|YYZ>n zJ|SIVD(@r)t2raEkl{`7HJmpEo4M^0n{l$FtX(FP_Wj;e1ipD7nVPqgIeQ&TFSDtL z47O`b@tb;yTHoIj!7cRzY!}0? z3eBTruD`TpGuluY$v8>Xk6~BM$ot0f$(DOzJ#0!C=^8p;4|Lpq}s{kDkH9hI7e+crCO@ATW{zs4(6#F-h82=AJ{%;KUZ;*hiK8_#`ZvQdl zPYLii4_|>Z1it=%2=bB5;3(YwCCG9)j$}fH4rCZ*{}N>M*5beE?n>P-<}Ky>ad-E? z-W#dy=FB%C*LmIH+oB#JVi>J>>ieFx@);=+kJ2G`aqYrE{M2pO|G@3q<6@U6XUt0{TK z+5}0MsW~&%)e{U|>5_1V(E{I0n+p$0sVHum9jH1R!;T`myt!^~ooUr?)LgS|_H|4r z*H2cv6g--jnhD&N>V2xsMAckdQI-~`YvXDkeCI)L%Tkw>eOlFz7{A4@ceyWBcnhnC z9|zfzt=Bg)1?lhUgH5AbqmCFS?OxY)`tDMf@*ri<)C6Lc9x>K!e*!WehG}g~(VR_3 zuTvqp$L!s~e80<^APu+U2RzdzPd|-zJYfuq=4hOLCKHY@Ucw)gylbCIm-Me7%%axN zho-Afc~-MIMXR(625(;Bsk=0VPKzcoin9k5Iv;6vKxo(*VTC#?r}I#E#%h?hQQPU8 zZo9Oi6e)fbU7k2L{47=9k>&21zyIBKzreke$ztNObiMbm zRfJytfV@9~vf1f6&-UG){!O2%mCmkZ6V6O+d5m)Id!#rChUuI(TLGe^?>4NoVFNke z5aT4A4%5D8#V8lR;dcmuQmEpE*cbbka>cK`|KO+$n&BmnJ^29N?HslcM!`Zz8QDX~ z+(JM0YR8CatWP_U_6&Zbk)9#_$ zz11QGg65&4QHna*M8*YzLcucIeW~s39T}em*;-}nHS@#Nn#einpdPl_V+JoWsTbbe z)w1)pW~>XAUC01CeU5xp&8?%EbfcQAnuq*9-uH97R3j{J-kw4(>uY|2uNf?Xo1^wT zRe+sSeNbcaTmN>#!sEBM&j7``e!tdB0U9wWNBdOVFg@|D@_0RyG=psMdFV+qzvg!3 ztuywFP+bGXI|PF}Um@*g=15wZjIX#6cV0AxzO`c7c)N>$Jc{;>kDDD!f!idPKZ1eY z$GLei?Pk0BmjsvvZVN`0aTtg*Je7&wvUH(sAH0wAB($5ix~o5hwr!_}Q|z`7wB8<$Q(>xzMP6Y#{9sdj!dQ8O#^96bgNjK~ z(LQYM@GJ=htYcW{uZ&qC*cH3M#&eEH7437;t}3%fdtctq*k(h$5OteRylURhn4eMw 
zr!x4!66bU>;$iXOm$Z zt3|PXO(z4{k1STNrJUYI{*ZFb+=Oz5YQCX26TzDcP|IfacmW@I(GzU4sbhG5eAt^Z zkLDFE;%~NQGe^TiHy;#Z*cW_3>>cyXW6FN{djX8R1jU@OF~_?VLYx7K5l}s6fkgAw z9lp9Hq;pK;6T-bERHshm^Ee*K+}eb+78s$nfysWdSi~2@E)mU@$ZJcwhg6|;ty05; z$lc6s;C>`+JZY}cDe9K2CYkmOG!066_?n?0hdI3MB18~NcP#x2eScpe|K zTWLE9V`|+d7;&ZG2>Cm=k$!HVo=)`a?V#JG^%UrQK7h%=WKvU7qL1DkI{aV zAVo~TDP;;h<<}6CPVXd>)Kn^E3`BCO4?P7CT!~i6N&lv@AE9y-p_1*Yz^tkIy7*F- z`8tM7LN|eSKPA~4|9z5j;H^Tj%1b3?HRZtKKQSs>Y4q|R1sHV%3BSMBQn)|Ux8oQ9 z^*-|IrL3K@c6aXIuKLEUSuov98g%s^GYuxG%gn+27DM>Z@3f?|f+X42o=l@oUi^ds zCQUjU=c>9K^&L7vZk9nl+rnHzgg^zewE$YVEnka61*9K4+ zxbJkZLG12YPc#SOzs0}Z6?$r4UypX!WfI~To>;i6s;!z5VEXD?Rk9SGc|_iPx4S=b z1)1B`Z;i8_DivsM`C)yC70;F5R$KaWrLabJ2AL)}%X~_`0e-p}@T2_UdOj(3W@I zgyX;4{FZKEm(dm5aJQt4QP6P(ms{0Q{M(C`1@kP4B&mkpw@CJ$OSvtPrjfa@04g&> zr;711-gy&kfoTx}gTKr$Kd1})$fZH}(R!f#aoe_jQx&H0S*h=_1BO2l8@FIZYD^8^ zt#~)Ql41-o46+J&i(?aH@l4wfFvEfu(LIw8l@{Du@~vEN=GjPtZiYyV)7x9MKO)C9Vdd0`@h56>SN}mIWw6dLu9Cs76=M>t6$>8muo5t=MImJ>qckQ?N6w8@ zJkrDCR$HAd-TH3BjflfN3iiStv>B74iBGM(G8*k+_tH7!L{OS%V&X3aT(iPFcr957 ztcYc6MaUCu7O<7F^~|Mu*t=nN{Da(LKiQ#AGE63s`@SnOp+!2}e)cb82Ne23mFjNa zX2W(J=j~n&cA#2~RgisGAw?~EY&xN~8snh($)4L>f=QEr2iw_a{#{NQM2kCQ24IPS zxPvE8=~P5nU%nl}l=o@@M4V*2HU$tq)?<0WOHys0C-;WaGZOtA?N0jc=6?(?zYR^H z$$n6>#I!s4@I05&TKALYyX49=QX?h5us|~=;ste(X)59OnFInQ3EVNBUZ)}=e&IAh zP6jWSQPm)K1s(sBqKKWFPC{eZ(eQFT*UTEN><)B2$%i*_%)BHP=wY>^byK5L+{s3O zhHlR#sU=tilvmy=zT$2!Kh2PBk%Vq<5)N#WN~Qj2nXwOkVl_MQRq5vEO|lOV(e`Ln ztms=J=WUL@Zyk6SAk)w-$n8gD#iB}U@fG27p=TWS?>8v06}(r?O0|+us;AiRo_;Bl zRVc9KW+da25DVOCKoUjOcf<|KJks?jPp?uB^Q-M)UYMZqMbu#%F@=U-Gmn^h<3b)E zX?>b~N#on@NPQ+hp)>HV&3}4NE}l)_Pt6~WpK`Az7heU&XEe#dXQCl_PktXI8?F6n ztj;M%@Rom;xhHD!g*SP=b2zRZh!8@a{~;W2Way3`)Q3U`lxp?(ai2EyMC~Jj;wN6g zy)7i`KJ#~BKlwiF)aSy7*G>hF>KB^fz zb1I3=T08Dvf`-QIbMB^pcR+aWs;+luyGLMhU(+C=a8Z)C(x=ygKwTZ@-CJ_w|A~zK zZT2Q2^Il=w|Iu4T?(CY0A>p=vngF~YJfI#x!D7d4EM^$UKpLdTeYdK2OKBbOKW2P* zX(br`+1Ty>wAH5&H$4*mPgDB!-yNv%iqX?F$Iv-w$nVdM%fj%%e)sj^*AOY4fWLu- 
z=LWdb&w|nxXFFOxIcuvvVKr?d?lt_6)rXLT^&6lLWOk?2qyR&vYKAa$b(`LmP5@n_ zqKAj6Nmcb{8}Un*;OA?}aOvjTA8()yswQU)*>9Cfux z^6M=J=hU(>Z;4B0ch)Wq3?QQ~qPX6I?cot`?H|4ZROv!MkgA17#4@ z9BwX-<$IGfDE#J(4%Ql8CH75;A#-QI2P%X z7~?}#(v;}#h>`A{v^3Ru0p93t&R@%RL}|51IIrvUxKbs1sk%yCQt;)1)h zKZ0&OeUKo%a0EX^vMzzM6aqz!2131&^e%#3_mc$*y*=s(;D)WT|IAnYO@PH+??v(o z)&L4^Nk!=#;>fBwSe%|?V!WC#IX%E@-!g>#nBq_R?o2D%c!1DDgW&AUYZko0jyR)m zEPviLWor1}Zd8{%zpSogIaF&w?ad4tWYyw~6XOES9Wj4niZGwC)o3(=U-S+3`+(KC zFp4&U(yh0HKpMZOpe624>(;~q812@JHgEKI_8jhYttpJCBBvX<8rt^A80ZF+BBAP& z1)CrBqbS<|2qL#VrBwogy191$OK~-FYG@x;7h*;c1Gjbn8Q&SQ_Gnh`sz#_MHNO~` zCcV#)0iQkU#Y;Z460o=%Z*UQ49y#5!cNjAS|QVIRPlspUg@_!0Oth6Vd4G zn{J31^2}zKedm(S#@JAs7P{K>^>iO)acio%i%xbxsQT-O3T2t|fG}-2ZCtP#!oiHy zk|(wAI?_zVI;$ad;jvL|vYGY}sl2%e($P#kxvvMaB2DAFne8lZkGKcEIDPE~zI8LJe$f-rCaj(cO_h4-cFv^P5XOAuq@Xku50|I%d=h#fYrc z?joFZWrt%ebcvmmyF1{7$lzVqr-w%slAI21eaUs|vC1KP60KkGVz8l*jSk#C81*yO zM3b2Oh*pH#e~?S=n49uk`=SpPA4z~qO2b{|&VFD~H!Z)%m+7Dxwm_a+to z`k}jHbF$knpWs4VpL_fGJb4J+A<>olJq7s@pZbgULP`02VMQAf8fU)+H{|_X%E8Tt z(OM>JPWDq~%81U1ZHk*vgYlpEPL?;~&1XTx&g8GEW6nKJT_gM;sU91I})>@`7qAPqmi{+DXQXfnQ0 z{wJSD;ZLmB0LB>y)`dp#8gqXcZ^~~?I2jz5x8+u&^Bn?^bZ_`t; zB3Sx`nGO%6=)Qyt@tOuGm#0g>0<8)>%jJl|JcnKG%EOC@8bw}YI9tE{`#G&tvw%4+ zLYF5^W&{nniTmKl9?f1te1YQiHLx_mS;(dcwve#eig0Qe{?C?CgL@ znSY$CfL-c9N(J=O?9;w+Bv^!8kRzTU&5SyGYh#!>jw$Wr15+A2-#a7`+(a(8VDoP$ z<14|Djh}XLvS~bMKjN|9lqSiiVdH8z+x@hQkWJ&p)o3Drxv+sv1J%;+p-9EGIF`-x zYih1;rqaOTt4UgqeN;#x*LAx?OCoF4J2A;Zhj&L`rP zqL~((61`T1^Ao$FuuErWBImGPqX1=tW6G!IY?F|&;3Is?v+eTco_3AdYYq2 zmpn5ttmn%^sSzQ`HAjXXFq&4`C45YS)LBN4q7=!GZ_K;)4j;_1P@+sheU)w2n*x!^ zK&A2w=bT3RGG?fmv9+Pk0lhC)`-W(+vk=PJKv13v$>HoH)Tc0_)u$N3ga;LjirE*c z-7*_6YU-BotucL3$SGFqjx89y^!NhKECxpJ$oo^(by>_(Id{GA)j_Hm>qPqE`4PtF zs$a~==g?*(wM)N-V81%AK5dN3iT5G*$ z&OrSdYb!NFyg3q&jchH@zWp+A_QY%Z9h_2*C@{K;U*KV}5b7*Oj*9`@6j)91B+xZU zj+j(2z)<)630dJTZK$`FED1K6Ukfjydc%dR5@>V=Oypl7tC)(3l&=vkWHrHsERs6L z74ka&3KpB~dJQ=A`ExL>L>niSK>f457D|U~;8?gn7(JlQL%@~!7EU!4t5iglxGkJx 
zj9Xoa>V~;&0L3-|8y&d)lYc)J4(5uv2pUlhDPDWLu11HrmG*T^^(eAurq5x8cL?=WOY1E-OvlP=c3+B-9QhlmOqKz z(7V;XP^xARDP;%cL>1q9M)s~NS4zA z%*aTW%|e(Ay=1f?xF!M4uh)Gq${?h|JZ|$u`R6o{x~jj_qDbv|x021nCC%nqkH(T$ zm{K0I*B0D|05|OFhq)F+?d_!U6^_Q>YjZp@;(Xnq8U0~%ZAh$l*kM#451h5<1V?S(DX!om7$KbiL0@l-w9E(=1BcQ1OT#N-Xl zmJ`APG-e7B`T357aIGwUVj}Smk!)L0XKjZ6p-&FrEMmwZ`}ji&5S z8XgAr6tGH-Lx#iN*ccsh)|TyeJ2GU}Rxy6KYBd5v1D|_X)enTKlA^Pk%KTJ02r1+q zPyT3ZaWyH+=_Ja9hn%bioZ(l#Lr!k;`Wvr>IOS`R)=FL+UO#v#9_Vu{7pf1Xql!Q* zzS*9FAnKY7xY8r2*sl;x!|OR3JEHWQmCFang^W~=%1BBje*qis=)pOw_{89%ml+ZI zTg5b~xKj*@0<00JkGG@wz&Yt{|7PaxXda+GGRn?u*-z~pK zaU8I#;}dE4xie2qamfdDVWHMlY{=NQPB!21vBr=-)P3jHCRg7=igZH6Woo7r<1W7@ z6C+mNI}M-n*^bS@`Qqu~}D0k^6C$LEYw+~bYY;kHil51uJ@!Ql*U{}MreHm0>`;~e*RKk*Lyxc|>g z;QurAUcr!1(KCe4X(Z|L)7t7)XieaVJDDe{kQ3-Z*${3!FL7Ct?Wa@0~MdJ zw#Ht+T}J?z2g}%{HM|58&N%GpH42s#2g}C(f|cRk2iC#?w2$6WK>R;zmB(w{*||i1 z_!`+oK*YE?w6P@0AXxp7Q$_jxT0`U4ElPsN3ZK)gG}zW`I)jznjd1g#t7E)OJwBgx z$;G9mzvaZR*}R2U0OY=KRzkK)#^0S@x6=C7t2a%D@yh(E-AMfBS<%wzFj2WhkIY^7Mk z+zOIN`!)9T7W0gcj6C1*9qj{!0lY5=ggTfs@xG&#BffGQK2MyV;$QJbWL__4bhDjh+!pFObLjGU1{DAy%?vy1sJ?Ci!{({|Vq57C8=4K9V(PkG!#k%yvN;n7=Dm6@ij!*?UJzeUku^b}-aWYY@|Pve;6tRVav&Z(2kQ$ffpo6$m|n(6VY$lOf$?#0LtF^xmKXpg8cuazka2H_Z(Y>diViQa&&xR8}a7cF?ocQ$x zt+9jzCCrPiY^is2#ay&Hi`Sp-F>#Dckd96Iv{k8qs+Y(%Wl|y0c3m!KpP28G6*hkd ziaT0}$lqA9j3}oJShDiErEbZSYLP;j=>&hCbG~9#4RMfLQJ&{j1v$@{c&MH;%u<;@ z1Ix@KQN)J6DW22JbjNKTB&Dj=T!u{#4xT`aTim5a6zjCDJD|;<<39zT) znY6U(raX>W!Hh_m=~8_mGca+zMU2ma@b{+5eB&aP^?aIhPZ~oViGr8E6j4W#VB204 zWXyE+%u_y#iph{+T;FaG5wmoIP5!tXycuLy6ONU-#E6upLL__sIm1^2m|a7G{{W4m z`?B!7*BJ@$xn{t14|bhl<)eIuCDQu*jw(vPI69?Hjo?pXXITQ=0v%}b7&@inYwQFA zhd*WlyT3V8cHX2*ywZB9=9yM$ghPPk7guyv@Iq0PIzgALcdHyJ!KIMz-OJdfO&k8f zsakYe4JJ(g^svUv;&*`Lb)&hl@E`VBj#l2GeK%2(qeeIZXAj6p zZ6Vu;Lm3%jD2avFzi0EP6K&RespX-9UI7TEe*_8RH@hYU#K;vvCpwv%-pI%qjR=Dm zjJg4#$QT`2VS`Vk3H3<+8cIcwcZ@F&z_mh@wg#pJ9!*!F8?AuiD=s6PukQT=o{{A; z{<1hS1S4hdH(syCP!O{OY2P5n?!KE3l&SOQPovMudzV<81W?qL!IlTOfB(n 
zHYS4=2bVAA{PD1TIwH(h#+bJDa@!CE*3=^&S`uwzSiR&CI2Ac6iL0LS`qEaMdwabafB zFiJj;oXNN>Km?u2l^}r6qfupMa3VQRsqfrEjC%C(!>d6`SjC8StD*c4nxPVl13N3{ z?DJP)94v9P%&l(^xz=?-^wOig@ltJT@A|eaNEpKs)kgrCxgRadRCh{4+a1;4%9}K| z=xDO^ngRuR@t$=$@P95-#b|sebs+6*5PU-m`{NNRug8;cXguIcF2B=J$e+L@WF(7} z-B9o>nJivX*h*G8&ET1maHHbhA-}3`uFTqBUQXDj$329%H9*!PV-an?_2=a|w}514 zxG_yYb}k}tE(oA(4WKv6(3!;j%xIHZSmxT0>A2$mr8Q6c)n zSI0~;=V6#UasbD~i>+;Yq<{cqoPe$MpSDTC-7y*VhW z4EjgD`;`(m&Cpu4>m)`~8qvVePJS?D-@F?jMi&Vn@9~?T~U_id^!Omwu-m|KpDm|M16eZ!JmcHE|O!l6}wgA2Qw7NpLFTgZ>s=N={+#MiOL=O)&k!6a~f2Ke_D zSpVyycLlgcS1w!VGs@ zB`p~%)*vKXmOme!IUMNem!ziXr;;!;=H zxWjXstsQ0)YmcoYo*&grEA8HMCEYN#clh)`;Sa1A_N0%e$Gjq@?=Sr`r|YjGs-;%p zmkU!j06Kw8uj4b`hY`~G;Sr^u@GDKpKh!kZul+Nn97Rq@fwMDS-DQzKg7BE$81~Ox z8=L0NI6?asM0BTz)Pv4Xt+)P`Y%Sa0!9ivjj@;-wi<`kKUB0Mrhcm`t9@GJ26XyGF zy|`)q1!w8A8NBUQKiKv+IFmiv79LY%lcb*H$FUjQa0z7MIFcC+@3%9WZcJ^O`{HWL zCh}GSXs#@g@NW=o+xnR!>4t`b)QI-<)#!7}x&IpD&(Tr%6i0aJLl4&^G5EAb5t)4g z>mDTAbq5HguU75lKaT4cyb9s&GNJ0-Y#wWK;~ny_Og?`9=k{xy?#p_tB)T~Fv&cWU zJXA|n8CXecf*v#A3a%u&Ob0ybQFwwd6$kLw#jL&DdUW&VgU0^kso9n7>x?kf1?+_u zXM;~o@0bJ@SnaSvM?^Zkap)Jm_4~4@h*adhCQ8vDe1Ws}7d1s=N<*Z^PoW7T0f}C> z2waW!lR2dkDdphz)f=2tR~?mu6saPsWPSDRKm>2Qq2fIA-FgEJLrY`b9)SuEw0UIU zr(x5dSpij}{j3?W>#%--3Q|5h`z%*o{w3aC*$U`Lfoof%8v*;QO=l#$sOf7SsU~df zp{rcN`N_m)ORH9QV`=x!Us|%^@$F??i0w=X@_XUgO9gDpJ*^g7Qr|3?jmkV6D zubfVdd#j5p9uh?xcElBDFhu5vGpQDwsZ{YMLSM`L47+~?nVGI07Po|-})}lTaXu+dp6yFMEGp}W!6+Dw?G5JfwAcN zT*u(l;P3Y8O@pR8(YJu{j(0SbL=L<4)uJz4-&W{j=-BmXqVU_vbL?9UM@lTqrr@Af zL@-i~+D9-^L7mW))rxwEInZE>VT?>_+IEo~V%S}+6pE<>?7G-A98NG91pr&yj+TNr z+gc`t0-acuv`nS$mVTB4zLQR&fOmVyq}X}Hg0a{InIafRHi)xH%Z9O$W5IEHc1?(i zI}{O&eQfOi1mnnw%cgjg7Yw_i!o$TN%3pT2>_5w4ve{cm@m40oWp6Ie9D+Ew{d8UzAObe{9MN#& z=wKucVELK6slYw%!78)US=qANwV)Mst1(XaDwGB=2n2Z$7zoE#cGyo8||i2kcb! 
zn=2sC(gXiYVb2UKZ|@3k*Q+(KPshY@N%&`p)%k57LvtVa-dOad{?G$wR}|%!O>?NN z_w=3U?3obKMPr0E&FQ&^x7U6S??W%xdevaa=g;_+`1sZo?2VjvhkH8vhrQeEV$Dwi zvfhtly4szCibFoDg?_|9A$%_{XoXKRAEGcZC_L0|XjgDfy*Rkhw$n1M<7u`lun1tLZVn03e(*|MZ!#iI z-MsNZ`{(ZVyGj4J>Zd0_}1*5*@Wg2!VO`D&z~+M;j$6ms$K)Hk2A_m zNMebjtl_F{#Q5c|^;`3thhm(jd={e!<@&G<8$ViaP%|@`pSY5}MB)J~voCa@(S9SX@ z5;5!1RU57mv|eh%1{~LljUltooA3ALj?v7=UHl0^=%qUqWXKIJ6dkJHCD9i{Y^k$? z@6Ww|*T-FZP@s%ekHX9f@6n_nplilmdVJ>r7x+%-J? z(7V@&p8_)iR}gkvOFz_$p^fvxFnD?GOZ;Z`&dqakN)Im%Or3Mw%RRcb2c8NBungA1 z;d?L5pM|HNG+uXTPw~7F*g)a2u&8Hqnp9_H;jx&Q{PJ2P$?)pe2j$8GTgAOpiWRWr24Xyea_Z=+C zNkm(FMj6ZGXuC?mXK55HYtO}lF8&q6K!H-#DA=w4`?;bI^=p)^FB#$qaM6a~|B5!V z9{-3o?-^XhN3_&gv30iLqRqdWO$KyL;x76CuGyTQ!8IF}^r6^)G#k#(|7bSfZ2C7) z;hIgH@IRUjl!c5*o4Y}Q*s?SR|Mu0Q<7)y2S}blUOrAO5QsxLQ;(UdRIAL%R$L1dq zC;CIb&HF+YXf(09Fadioqh>J)6Fy(9La|v@jP_QBjQneYIa{9h^I zfOvD8q6r7YlT!mSI3R|s`F^T7djQR`scllmZvzPy$!oEAGsjpibkS@Ih0!Z&I5L!T_cM$2#5PApcN)Lh} z5Q;!TSBmsb00lvc7`mVo0a1F9qVy`rcl^EYeea!n|M)Vq&pvzYefFHmWY(;;pXXUw zOTQ2oo%ag#e?*EpwSb=vJjI3Q)oDF>@xb<>_WjvgmY`kkjUNijWuo&g0`XqSdDt_L zV=e2Belzx=Wm`0l7*+&T0PAh}WNY0qJlZxiwCr{^;S+`DvM-M777@8Dpg;Q9^c=K% z@Oh>RueCYE7da(juV&MMVt>s7x9*^}HTcbk!HqqkH4oOXhTm%Lio=FvQDdfpKHRld z6nJI?+AVIhvbKglLx_sb2jgCN+P62f8lnv}zXjpiy-8ox>@Jj#7??en3&#E4k+qAi z3CX%uP!_w3U5=dZ8-v7Af*TOUrlPTkF zm2hd+ViiE&Y2PSbwvD^>upVyK0kN*GD83XIjuM^ne9~GLVo63vxstnV%V5h6Tq9a& z&+LOCB-a3JbN841Qm*;i)mL|nzze50Fi@eqJfq49nd0?5CG={EAhm6%shl1&wtQS!%nEk}kx zly}UUGQ{@v2FA0k4Q4pHfj6A*DJVLD-!KBEu+O=Rr;mk+3Z<;1N3dmiOWLoa8M;Tn zjM0^us@(>$D<+h=NPO`r$1{I;SSE8JvVHQMa)OxiQ1WwAG29r_MXp%57+R zPVPFw&#_!{iE%PL`F_8V%s$X?H;N$dpz+r_*4&3IP`amAQuDBm+SS0;M;@6i>2z!_Q5Ny6$x+VUfbn4It@)Nz1&wxA?zekG=+sAE zQf`U&!u`rBgsH^iC-#rUVTafnI{&g?h<7;x4@WI`!*}%t=q2 z?Hiol$nUQZvJxxDc&Br`vAj(YMX1iF9FqzR90WFLgV0Cu*kILERirj>(bi3M#~l5T zuPM>?QyX{sKo&+}a}#L#VH%B`W!eJ0%DSnt5u{;3TQ|dw6uCY-S;es5!9Oq578ohi zR=!iL{XsWO^d9pc@;fKHhp&iSqnafKQ%QRnHbKP$`5U8!GZ7vtscQon;Lqmw5ST#$my%sUPM=sbvv*96+X 
z2soLg%sezENwl~>>|A?~n#WeT)%*fzjmK|3{)c}P03bF%`xkZR>5=1;p(96)n0thJ z1@ZKQI8qH@9|i<%Vh5d@jyN|+fQi0fPpxLGgwyz9{ZUTrrA|4(#yLH$0?sYzx_#R} z+TriDTNPBr9zHlD*lnN&*VF)IWx#aHkuO!h*>hcszty7V zx@N!cYpdEZOg|~&hriFxYRSbpzvkbLBmB@l`D6i3RF3*G2k>yj)mz@*5$CM)P@1b- zCs>XQ-sum<-louMh1xJ<13Z&k{yTopFL;e0cBQ7L&6t934S-WKFf#QxWdt5Y z;=Jytpv~Xn@7%YgfS2(VCUIX>l2D>4?M8+z0LmT{Tp!xS6@dD2jPp) zy8M1Y#uQs>(;@qGDV+r@K8y1k6erhz2t=i*A}r4FKC&>K6)eo5l&E(iX@*TLI#_C& z<{1@8T$Hx6)FNf{P$dTRN>r`!Ft37#-GDSAkcxja!Nh`dbe@fNFu%NO?rJb!{AtdJ z^@8L^Xld;eeTfb5CoB9Mdqe9NAyUDKQB{mZc)?UcY437wYd>lmSv9nVOmch^66!u* z9i6-Qf?ORXat!)~#-+n6^pvY+F|i7zSXs;G+v!O{@W@wVbRZ+Q{m4Vz7g&l)#nz_T zN3T~$=^sc2kFpZ&s=R0l_mr<}YDcIUZS4G65~~PI)beoHLXQQfk_YrdBbp!;rXS=x z&=}K=d?r9u|7aAf$7=Y}QIgE@d3i|{d0;sy(g#lciy0~BZ^rRb43%EmV zi0*1tFY0=ObYB9t41Y+Bz$^1TUK>7RgyA1uv^&Nir?wl-MFlp3R|6XlYx8A9@(SFIwb;YXTa$OCBxL&_KJD57vRWy7sB#?Tv`dhmc zbB|pq9_^v6vAkNMsmGz=$f0pC$@5;rbKLRFj+XP!R;|$54}A6n-DFiIB>VajAiMDc zWHp<1U6}&M>?rbN^U>_`HtIoTE(No zb_4CGE+lZ4_$GXTBPyKoi(KTYe#2kTW?63Kj|%%i6UNu3Js6_riQ0%K{kmNHhDgkJ z0_2O0`7{o#LyFD+auib5LCIN*w^~hJvw0M~&(%fA2^BgL@xyI8D9`ssfiDL31RH{~ zRWbtkHi6LLQZD=aUPp1Zu|lY7G-Bz!++97k8b=lOld9ktKpUeSRBwfmELX;NjA*i6Yy;u31+cvDH3o!*S@&1k@wi!c#0! 
zvG}|u_9m%tFo9!tf@J*mCZN31n~^Wbl13#7^+k>0;S-OTY)jysIgD)b@iYo7Gr*6`mD^Dgcnm?Xo-aYQ{p-$cx_Y&x^Pf1Af zQs_ZwGJyeWpP0x4cSL6z!S~a|g?#YTEzBpjRNt0Af%wjgv>8RwPWL`X^}kBGcPDAZ zcsvVnb7-F##R>o_!HK1Rf@ud{AqTYhDx-ia~6#xRp~p3 z$8)YRB2~fS@W9ms1kb&}$p@AA1HTx9*;3LFl(6(tjOL(ux8(E!eu|i^@r@Mf;ngRi zi`it>-TO;T%kXbdVF7pie$ef8{C zb`q|p;eh&TdT9HP(PY(u9b{s!*1*nDHf7Gc1?xcdv{}BvCBAZsoOh^-9e(GBmHbLg zMAs^U#L;3RY$VvS_Q3UrAg*EBe0qv_@^qyr2bGB6Y%v*4y*Z)(&^~T6u)`h~;&Ut9rp-@EY^WCkFQg6c zOaSrSpKd!r*S$YNzE3nnIfjlG`vrt1nc^G^G*nR1Ux>vVMbyWak267)jP8w`&z&Ko!?!jZcZ&}l z{}4mcUHW(eq+^&S`a|{;Ul8mjE)A5IRQm{H@$^A0m2z6c$kS1gxzg$ChXo>W8QNs1 zNzYzfD`o#Vk9A_TuUihQf|HdX$Qvvo2Hutvj|(K$+g2=8u4V>+81kZj12J3-Cu*Wd z+T;qjDUOm&D1s(Ff~I^x`q#P*tiiA3NKFucUTmwj{6GJ zzW=+nAAw+bY6c`7d=r#{LB!(_WN)h`91_wQ;@zDJC>CbUVaV zE!P|$ULOtH<$qpl8RlM5wtAfZyFKOlCljV>_sw@_$cKsCcVe1fo=&=Py_^`hxsQ%f zQ`~@EEB)De8lr_#Q@k33eBfEr*FTgxNB;SyhEy!I+0QhShNSO_!cA44rM~9I@qI)& zGyjaO*Uz<_bsG1XeWjaQJsI3m-yrOPc&Pbc3YdRzj`d9}^FRchg1mC4y=G!Gb#kX~ zqS(yGJM@foAzeP{>n)@=&!1@XeoVb36AT~k09n)9dq6X!m8TPkV#PfWd|Yy^TiSP! 
zwpsN(e!7Rk^zc(j-NUfAW#uL3jjhdmydS3=t8c@{FRcvh=n~gAbVyec*Cz)UKi>0= zjlr*TN4Wfsy;}zV++jg`iRC7!0t|A7PMU{b#apwB+gR8W*WK)s(T_!P3mlG)^w|DB zkfz!KR3`^2TgAHRKb;(M@6ecamtb)rn_VWj(I3Otk|2+wXS-1-R^iN!Z>5IzMLWNw~<_|-z^!golS zn*b7P%6!Hrp(|2OG_H>t*EkI-Eqw0KlQzMXh)Fv;J`0itzT-2nrZmlg;1*q)@*(#b z=m_i*zq~2^+?{n3nX|X5Z}!W-6P?i*A#-K|<@sXy7_#kGu&7U4*bXiL3kaH`BZ7=Q zfx}d7I9|z+qh#H+sy2Z~A0MH|kX3Oo|HpuHM6&(IVGJTh`tT@58bU#kFSAWt6f0BB zVCwZDcY`sglh}(`Iz|S!all96yD#^;WCM((=K$e%RsKef<#3gITZScBmVf!`dX|>j zbB{tUt9wXR#qKr!1vV}Ao=HgQ;{B38k-=JauO;>xpFK`hgPpCCw=_7)xDG{&;(CQz*!@1QQOV^pSpLR&U;R=J&*$B#ViNB zE^nftjb6a?D8K=Z^7&kw~w`=`7IK6;><8?<45!yZ6sb})tN2pLCX zgiH@Gd7Q8#lDL2Q09((oDbw|C`xeu7u-=_FwVxSU($5KlN zrDdPs_1NM#zUgX%%Va6rfGdjI5gSsx(_^ImiXv%jDBak$<+oJo28!Pqr@iE3Q*?1u zDZ?%|k$6R|b%%N`j|A(crB+{);WJEE3%%NXs}-RchJ0up zOt*{CaF{3$=MbjmwSuYe7!0l7wRI!bA?FTv(iuG*wWw!uem*Jl_4IBwgML#CJVp04 zGe0ii{%4*3600xSWO}m9f`g8Q4581)`5&jl+5TcSB~G^mCkTvO*`lEou zC^Je@c-9d(Yb;6^P5Fz^3_2n!oMVg9#S--tl(qTVV8Vld2eU}|n~bO9jDdrIQmZ)3 zoq)ds-C8z`v%i^%h&JDS3XBK7uvE&842qPyuLWg=5}h(cQ-GpJGtL~nt~N@>LKOq9 zSgrHS*S3QGaWLXg;PZJ3yeN!^3@97gtTnjql$_$Evz6}PR8U$Q0DW~Dnjt>Hm;W3w@2ke_5z ze%z&o_&KOjWsJ~fbxn|Byu%jHBh4fsL!v69CzM|6*zgSbpz{sQWSA~oC)J$>h>d&L z*Tiy|RWUz^(}nZaZE}RiID(?n%o3>hE1@=zg-chb!;D;>;F~A1O=Sr8@>M{^3WwYo z&A{LNjreT%%!|!UO5NNtgnIFclVnxV9{9$^#(TVdjDs0|WVuO#g}Nbx`pXZ9^|dnW zb-yh*54IwI*z2+yMyB$;(CtZgd(yJuoGCh(H=6GDY5qYSh&Oq6)M;9XCjgK^@kvox zVHZm&GtoYCKoWb|Y z>}kZ%0Ul%dC%w-yUjB!E9-|tHZZL|WN4_@DE8(Nh7rayl=;yc?F{jeCH-?#7J%!2w z))#;9i>*eSroN&2U2u*wHygQW6OB43rg+u@ET}b<;xjyq2*X6xj&+&~r_}#q+S2YH zmNhHlF*j}gCDl;{q&jMvHSkYB{0VqZN|weDT4Bev7z+U9!P6^IrQJq0k}~2A6V(DG z=;(rUVzor|V)?C{R7D&kT`Q{?sV}~->M4(VM1@aZz1!mMdiP;>?!lo<)I$G-dtDIQ z>n=N2J-72Lw2v4)^M9O)b2}9me-Qc3EzmIIFiR(?*4L?>hHHxb@ zVS#Ua1NvesnKnPYcX8XHs^Ct9n^wg(S0M#Nzw}*ItVpkAC^#s9#N{lI2(iLGwAaEg zO%w~g^!V0M158ejN2D>9ERET-Dgp0~y7^QM{D(P$iwwO~t>ej9H&%$cOPnRGkm)13 z`E?I#bB5F>?=9eZBQVKDVR!TOq#E-dtE$Qc;32|&9NZ{QSqTwwV`vY6=qZ8MGJpNm 
zU-y-h^@*Rd)ws?U;lc|M*#Vj_>2Y{v9qH37ga+V%FmGiFl2j6c&FnUffy3d2jwB1{k_*O*=Be-`uWdc zvJ(dh2U;dOU;9U?CRlcF*$dP8TIh{Aye+9au^}{Vi+quxq) zmXuM9IdAw>TWgwgStjPojYtHMd4@`XWnfxf;h2~7q>cXtx@FiRmFP%lxK_bZP8fTB z`q*?<_yb}%PP(M{)_Dt#=+ScpGVoC z*%uDp;u4sjO((PT5@P9Ot*s4uY1GLU$NdgAU)x>}9$7o5_N>2k(tOZ~x$@5jonN>e zKT#?8spb64gH}GXPeQCXzF3{1yZLB~2h!c_-qWusaYqXIc9?A5!R8{_f@& z#2TS1ZkUbqz;^%=aZ)KUP6I5=LCaHuR4Oqv2EtRo9FYUsU4ogIft*CGU zortE;zHx?BEVKFYt}f0=zgnpGMO!}o zDx3c<^t`$GHDx%Lg|j+sBDN@=L5cLQ=WH_~T!ssBmm-Qy>I=`tJ}FmH-!Y^cR=jWBi-pC&;g=0a_q6`f4DAeh!mOTYRK-`p~KZon~2)1xt_S&M@8GNKhk$HF%oKvA)`w7&WuaLVx=$ zYKG~YA9)FEp&4wZ&!Y0-&j{h2wN365^Ukw8g~zCw>QB)>pUcvQGqTuI{uW-*nd4e# zIjJ%!wYGJivCQGlPVDfSsn*;m{nSv;jdHfRs&lYI4Nv7oN8XXTRqwh&aFX`*Nz|3R zyqyxo)rGK>qe$D*#j)os;)~|-woGsvycTi1jZDSq5vX@6HRV08pN6j=B^q=l_-fl{ z)LS+ZPWwolhuNA$w7G!q53nF0EJX*$=IHopp8~1#3#aZ;)2#aX7H#K=dbu2kIFIq# z)sisX+x?)%h<9(#G-*OH)S|bE#9iJ?31>YTj1ch4X;bTWF}#Q#&hL^3^f|=vK3>8G zZ}xo<`y}b zbI>8yN0kVxUvD;?R1Ag% zJjDkeujbHOCz3yVD6E=!8XUS05*X&HEY?%yLRds*;Z5R59UaF#{z-KHCHtLRt0C2nEg4`;k_kj6;i6O9Zm9 z#XH6lE@#YY=6m)&F%>$B(7VJf#P_HeXbA7=3Hty^G-E^rT6mIj;{pCX#3F33#Qv3E zZI+ov(0n-pqyPGzmjkO%jI-qhK9qW}UO7H?y6A#H#P4w_H6dkA_LJH}X11=Xz@{n1YY^B=tWpkXCww}r_7bUN&f zBYlnb=O3qxjQuz7ZRjgo_$P=Dd+jRHmF;q1zRZW7g_IaP4dXP-^2|Tllh|1@7Np#A zU>`I|R##&G@fYeL)Z{h9{1?u)P@@9*-rAYJi?OTXB9b`7Mw7z!32mGa^0tT*|+QI&VIrEM~bw&;44 z(S0=3V$22@rFouap?0e$qc)3U-HBh!Z+a5f1^63Ag#RRC`U*Lnc{v_fbj3}?(NXer zYo}oI*H0Xity`B)(f1*w+HcLiClfAaF0YnmisXHPA#tm?nG4V#nB}41Y~;+M&2}Xk zceZy5@|G|kxSH+Om&uu2epQG~n0vftnCzCoTou1Eoc`YRQ}N+G`Rh8YT}gb$8#>Gt zH{a?zf~3kFCLx;L;~)bUJAqw<^^@*3VnS=715H$zoNBO|z4PEa0&Ie~rSo6Aug1{ibosBQrf;-z~Lrb8Q zRwDKbBQC+5-4~$*MMm?a_o1ZB>u3_#h1+F@&=DYq6a`X9Lww|<1go; z0VhVok=eGO;qjEN8IPM)cJu>foQCWVg#um9B)ijNZ?Ci7f@k7^IZtk1LPu~A-2GJ_ z$_GaipZek)`tQcZBvZ8CVPXC{zG&~%d!ip-X?wMtL0jN$b|efjjHs1C8w4(+a7fjNV$vTpLCfuPgiq!7i5jC%wJHSz ze{__`L_ZC^*LhA|w`)e&I_)^0Cq}*o+rO*JW$3CT#26L0)`HcD50pBr4J_)bC{#Ls 
zzj^#SA9;~!#soKr947}ADa}6e4(3*k)F(fuDC_hI~m52tec6r)TdyYzUVZcIkR|M}~soj4*jIC_6n658}dwUZ*?+>Coqo;n5;kAza8)-p2EB_(LFHt3~du-=|WBa?|9SO?@Ssr(cZryhcZ}#ux zq2A~);EXvM&`wB2rtb46(sSNVBAms(?0P{w3!Q(7qIck@PA02NHgyP#zB%jYJdk;H z`7>xB^L!orm}XELoHm}#aqs?!r{jSMN8biA`+AG1!1B);WM$);Z#{1zN=W@DbWjv4 zacwlo@#mmi`H~uW1^m@eDZ1c#Y~_z#>hDQ&h3nstd2zRzYhHG{DF0`^un{>W#Bp;h zh5R&^p>p>%MM0O^%(YBe*EQGN{rl$zetp;Ozw7r;VSHw%XV2VHc%I!txP znL@@(uUq(XO&wmy8Dj*>tAVdzi>+7uH5ac4L zp#YVDN=p1^MaH*)oI5bizZ*$#KHwz(=LRY-B_Z{{ZE|u@`Tt{+k(bH92J$P(N&lZI z7)r$0jc=lK~1+M96DOXsE+9q@gfL4S5MERSl?&w498(mX^GnmV}y?gp%U_ icZDsvk`j4fkV8OFNPvqgIaFQ-Dnri4r+H6{{Qm*yM-jgO diff --git a/docs/graphs/large/l3_perf_a64fx_nt1.png b/docs/graphs/large/l3_perf_a64fx_nt1.png index 6d13b1c900af36227af568bb291e21dd49f251a5..f2cb381786080fd9448c7d13439bac6947e8ce07 100644 GIT binary patch literal 255532 zcmY)W2RxQ*=r-GWslz#|NB+@nI{eZvB%-cHJLrX zchOO?(x`Kkso^ptT-7ukR1BS&ZS1UX-L^Dkc5tyVWxi)+LqZ}Kkgcv0`jeMpy^F(j zXH2Brnoy|e#^F)3$9;64uf+Yjm$ys6cfjvda~Q+dD_&k(S1M*=O`fL4IwgwzG_erO z)H@NYdH#KklhvSG#jm?J8K!AYyQ|*3bNl(#IbG_)e6Eb9a$y1Wg@TC!+BY_@0?+`5I~)+QfS2LOd~!X7*bg%TP|>AB;TsA z#4d!vO5T~_X5GhXQE!(IRV6x)&PS`~&lC@Ruk`lLZ<+1x?(spN_j85#j+F-Fo?>mdv`|oya~vYqt-Be1#0b^REoWRTRG8tvQ{Q#3HPC@AIxG&PtL) zyFLf-zOD&Wf2kFd?RnUM{^%xOrE=$;`?sa<(e<1AMn|en4w!R=Z1a9}!8Igp_a$1= z!&D!eMk9UKw8`gLKloaxYX{%xF=NoV=+o}7xauJMVWQHs_OE8K@3k$zwO$Ux_M?aR zFNtZZI1k&YW)%Nzv)+}Sd)$SIQas+_e0`|3Vk#r^sl2}FZrRmaMz_8P zS+(50A~H{3Z|x-gxmUu@K6uNfaIm`iT8QZ}eab(|-IWSbWLL=om&OipT`07p`g3G* zhQfBxd1BqSnWtaaYo?;NM7Rzqe*l@I?*YBwN zskF7H^_@%2#pMUbH-ckr=K110pHN*W+t@*yOdhbhJpFWYTgy=`za2rcKXAGZ7#k`& z;YfYKuLJlq@kNtY^cuh2C%GbhN$uW`iFju^pA{$cHWgg5{dnBcn{(PtZP0R1(tSZ#os8XXR`;HwFxSM-o_cq4_d(pjo z`Fzt&&CNY)y>4C_$z9_M{{PQcByVbs7zx$?{S}_hBAmtge}A1k*k*Y6zaMc4X*1_n zh!YCed4KotlCt!FUu2B`*I$o0SpNDDwYW=(VRJ{=vM>eY2eivoK5*Tm#x+nB@f$Ovn6~#v(<_ zYe4e!>C^T3cBP`?;yl|S*>mU4jegHN{_XV*(OoRVh5<-NwViqoSfhE8=v9hFh1Jho@yG*2d1xsLFd6zvcHZ^=$2JVF?M%TboM~ zvY`wCDjCCF`d&3%UC}i`G<>(eo=7pSt*hfRuBFg^fA^Z-KE}Ycga6y%>+I5g1v|qc 
zBC0??9|1~q|) z)v~ptZk9Nm^<2LvDk`d+`As2FJia^Mto`+k)1eF(KBiuYIE4?Xo&D5pY3koE=e~jA zVQH-I?c2BUfgU{Ev)^v=$N5`dp8NRu?Qtzj7#%hL9LXV-p^{er<;%&FCw=xFVmm-j zA6Qc2W~(m{-W80Sb&!s(;raPHyl2lE&X2ZbY36yy@R_|esv&ER7pZ$|+K7#AIy>BO z=H`dUV*5X=;c}W~uIH_O7S{Ci#Hpq$Qqj|Yesw)XPEM{L%aewyxBOWca;NLfzh1ZL z>+kR0^jsd;V`pza_AQg5^Dytd-`%$jF5*QVPGt#q*9GV=0d z9;@?)KMSoWm_&>IJ3b$=aa@*XSYidNB(v4|OzIDKu8y&@DJ3)w)&v!MtO?+jf9&Xp z%rmZ=-B{`)ZipJ^;X9pKN!L?kpT)#b9X$B)cVl#3VIiA=DqN!$uh3m&dmI~Cfb3iV zwYU-GoxT0oXj$3?fw$XV=H2PyF5g(xQBY7Qa$Vr$j0$v_8@XQL=}~Ci*X&DqG_1h{ zn>VS=+&>_|uro_*eyrV(QN$_N>ZicpuAG*hV*7^ApYib`o;{Q6dULb8;0{fn#}Z>* zC}Rqa2vOK>WU3GJ_g^$OKZO@xUcTqLH1*}p&5~y#&fH$>|MobJcba0=+_yHCu+407 z+6K!_SaVng1_p4Ha10+-R7j8?In!UgUjk>{Zf((ufsqj#gDi^khSBQ$7_MlKs;Vkh zp?+g+Y4861dkk=-I^Nuj#U9vhU}koCy0`4#{QPjdbMLl(EOcE<%VW+Pua!%jtZ+(6 zaB`BJr+bS&eE6CqwTGXcQp{~J71xfFs9fgiSY61<>Qj}i^PbqhSWI?i=KE<^qrYaT zvR4ahXlUS-q;VzB1g)9GT<0%xN4F=-kiXHdP;_wMXYN^|JSy-Bm5hX(oP75_24aag zZhqir79xANoATqwkKQOXhGu3O|GR(T;Z=*1KZ~8G^=$XcNDoIHJbJY8DVI)tb2Il; zQt3Z`s?LM=71*k=e{nmF?%WBAj;2{#oRngV_}5>xqd8Hc1&hIP<27|>XQ%so+vVLu z8>%{*}JNkFOZ{B&R9S>xIR)YQ`P!j9%WMQ6Em z3Z>a1+De?QckgFXFfd?7ts+XXfYpx|sJ^W)wvCL8Fi3bZT)lc#F;1uo#h#asZ%1WM zPfytPM5Xd1?r=HY^XJc>d0$v4K)dACr_LEgZ*`hgh1UIF4UIyqK*)<1%obg5uFZ{n zx!svX$Gpxa8$vH*U|{f|Pdexb7ivzNi1WS>+3JPyF0Imgc}Y)yHZ<(j*Vj*R9k(5- zrRzO3Xs-=CvC)z*H^(&Cd1rZr3rV#bFOy*G3JTc*aLg9p6>0`^Y+eE%mx zNt{JoJOKp?&FwSRf3&Yc0!;;16poMHnWO(WGxPM0-4q6BDvusLdZC_qySMZL_IzDi zTR3`>-PqTYCl2nF*jnd1a^%STQ>TH{-}R= z_|r!p@|6L((t^m08Le_v0spg`fBI=;E9ie^1oT zmQ7NAp^`yD+$h?m7166|wLLwh_uIJ1TK(5G(5tV-3Yg*s>6W=hwWllbxh-1mV-)`M zIqLLYGBTr~+K`@7mxu^*vwpgEK`~& z6G&F3*HSWFeia|M*nQ<(Qc{w_bN&GI^L$i!dcnI*FR#TGIZe{1D<$5-ePR%H@IQQ7 z-N4jTCjZu#?!~OmJB4cbJwnc(b4*N z5$9)TjdvGVd`mWKNzg5}^Fd)9nx0ld#cBBR#q`I!+xOhuh6VCaoVW@-?9{2AVCb92*urfN@($8G6e-Xq+)^$ij1bfWio$HvCcMfMy$dK5d= z=ly%ZmoHx~udEo}zI_E9=d^m30V+ySNr@^xrh1Ov7u@cZ*+w2wkBErco*Mt7&!5{F(X0MRP-`!BGqm#A zu$9b!Gv1lEdH+S5Uot)5vSaVwy;U_eK0Foc2T`=QZ{J>IJG9%bIPQbvxRI6BHQ?3W 
z-d^1zn{Dxmb&vKQB5EIr^Hfh&ZLKfWaoN@vmv&&Cn$Z|t(9*`*)AwJva3N0EG4T6$ zO)QR$jSa8K=({^zp(}HL+EFcC7yc4Bm0dB8SwNt$h$pE2Xnlg1+h})wdeS3oB1(W@ zR30Fe-8j|2cL(>Ome@2~t*(4dk;!1DV=AF{*dedS_5-&?bv zJxiseq=fSP{k`=uG``CjYIe&rhO@J?_ba|S;U6ni`%<=zFRU*QJ&qHy^~MdOS;`3T zCy^|>DJ#31oQ7LEz1{Hc-Dd2`3|y;#fWWJ77Eifv21+JXVzB_YsAy@;<)&w6W-9CI z{J;m!8h(5M*KtQ>@_rG?f4aHe-~4-D!(iS!a%LX+P4-sYC1h5-X^h zqbK&zSUb;12G_A`u9Dy*?#;1qP-Hx-)>fc5Sa$B58y8M{won`kE zY{Pc+d+>%Uii+I19^G$E$sK6xd9J!Xarbe`P_y7atMyuqBQ7p3;X}3p+adO|GJ14? z0k`m}iQ+Bn__r0Pmd5EIXh}FrNg8%smR{NO;9v@Re)EfH?*I&0EI%8&ySrQeo=IHa zI899^<@@_*=xFX!5>E9jIg}KlL+m9bWk%fxmEi?8Z)$2H&MN8*>zOkYA+l1~8>q(M zRVX`$(ar0?9#A5A0j4rFa?V-zinhs7ah$FHZAT!>WSJn%JfmF)10GdYUcA+k5R#qE zWn3Fvg-ud$rz`oJ#lNK`GpuDKhwAS8=S)8-(%o%n8I%#?I26u(9`7%PWct@PCM%zMCgvu9 zgJs9 z`o;0dNh=4QMw4@^OVf`5ZAic}d^@nmj|EL{50^{_Hb<$mA8lnJdItam1s$If&`{~c z+fURuH|jXXRWhh!6x4gtxn_@?vma3)O3ie|7J*}*@!vj3oc8)KmX`IV2yf@(vcWGD zsDiU#5u@uxqk9+1G-C!b@mb`l^h@h(X%gV$m)@R941BuRsv)YF9?e6XlUcPhp zZhEr9(`v8o%cdCOlL-k6gQJdiN`8NTKgs=+l;fz{ zDX5(HnI-cb##F$g!sXNW&08xC*4Hh481MXI!}HLf$P3U(wk}50oO$9KRAMaA47C-1>EO+%Pah{1&h|V7#6VrfgL@>65@!XLJ{1NwUF9n53Z7`k5 zty^y$$JOydb=Y%3Yx8=i35v=bC2#BGGy+O+CRZ^-?VRhp`Hw=YMzGd~rX~|KxqJ8S zLChe%9}`1|&Xx>9wM$$d$2mo9N3m{bMMcGcb&SF-^Znat%ghz7T?;8Z50#;*Kghq7 z!mfBdhDzbkp+nVT8z%^40sc-v=se@g*ZHpu7f2;3Q_Gn({n4rj2n#zTgOKu4C5kWzk0_~gqbLV1$(YEadKkb#eHjWBwYimoDIJO0KiEH2# zH5oQ!4URm4;Lv4o=yJ{5SkLC%9L}!n=%BW?wk}_vZKUZ8@%Jy~corM0iV}jQG>g0e z2-<^x@4Nq4Gdc^&tb<5_ws<_ck#Y^6Aq6~F_V&kX z&-u?P^vb2FfcZQ^d4YNZ&7?ZagE?FdB1-_v&#P<^1AqSbrKg+O>eGh{QlJUnYL06F z+#Ku9XJ=z$Q+uWS0EKxQ!LJ}st^Cc*A?To!lM{dy-Dlo2XU;qmcH~rjsW8y4Bq4*B zo0@7?lGu2p%Uy@oZ1L?ioyt&lRu=Jg19}>#sBr;OXKrjivFcvjt#J`0)6mcm$EiP~ z`3Bm)#v9!MkiWBM+ARutJ~#%&3R({i3|ulWc+n;`Wkn!d$PL&9l;?8>ClvnS)=R0V z97MhJ{OgJ)-&^Jetnjj`TP5=wgs3Nxk&hA+k74&4S%u04{03?Tai#mFEPloF@h1Qh zdc6-?T3Q6v$yUF5@gm98)Ra*O@snuAQ2A&rMz)%P+KT~PCw~<1{dZL;E;BPWIP|Y> zWOnM9We)%TO+`l+pq(x7d3C#-q9Pk7XCOo^18Zyj9x0E^>zUse#9Sl$`wgUit@Raq 
zZU_S?f^GGU+GEGEUARCeAt8Y>)vK*@l8Y;(zFrO=FGD%W`#RetoM8YnT!^S@gBTxQ z1pdMPcEIBK_jh!@xBMX=$>VnE@%O$3-a2hc$D)S!m@NY?IiSKSaZ$_X(``)vHr% zigEssLO*{F1W(~Zm(qS`e!VCCII9s*0Xr+}M`fiWw6vNAvFO7DKa`Z*Ua_$lYq)}T zLArNu5{fRmH;I>*7ykCf#Yt7^R3V_{I8m1;GC@cFO}_7=PkTFZ2-4EbKs8#k0pQ{N z{udr|Uj)J71|e9gsZkU7qQCM%jfXs|HAl{glS2IbkvsP6n*q!eEf~3;-kxXH!lalW z8cE?9V zFHUGVP4xsQrtf%UwpM8M69oqf7&Xq|5gxrJfLQFXr>DNNsTdV`ZHW^Xwr3v$Aqa_D zP3hsv-IneuCW>37F4+8OW?Y>5)u^HGQ3Ddz2)IR13vkYr3JZx7th1cNzAJH>WIb_W z=jqd#X1zlYv{=ubdqSMgr-@k6(;9CGF%yO2Q)jl$3P=F7sT9rU$5qahzGdK2Qff(m z88)`y+8lSGdg@onSZmV5whvG`fp^V+yyLJdWlC*UE<6GiY}<_ zJn(T>&#zy^yWrxWEkLW5(UD1&PU;#*X->jLDBZ-q?f&3+8YfbAdn~g*Mj{mt3L=;%81mcH+6M=z$dmA3s! z$IFd5(}FY2@gee)co zSJ}2x5G5n!PvUZ0MD@`B0e0MOZM4Kz#(F;fbGe(mL|qFUa#etaEg>Ocexf@9TB;5B z`}T2g9YTj8jtSNTc#p1AMphQI%PRK zEV8~jp0i7Fv|*$bxD%KDwY@#DSP7!b=EfT6c`~50rluzKZ;r!z(SoYl+Wv)Hqq}zP z8a~EpepJWS#f)zj#32I7B?JVS*SSvr24%p*2yjdG?)G)gC|Mp+02W zU%eg~9tq>_;fjJe8~}7T^M21?|(qo3=i(Ns`JZqC5HbzFQfNf|hG)u~Qd(m{UY`!oN4!`C)LPJxR zp_*>~vv6{JD3?jZDH!OV&uh~iw-8VXC+A5-MB-_+8POpb*6{w35n0GqAV*%uZ>)m& zKn}fUT*Cme3wY#vU2UWyxqXb3l(ZjEOGfx-G?ZFwBHY8hg@uJD*x7e{oN&E$i>>9W zkXx@9cSOT0<)rNpMn4kwytMA-&6{YxACJb$gvG>Mk(HHAQWU;jp#54c<0EcS5dUi` zM#eATc635E%qkfj2C3<(siOAhaABfLyIv`uBHg!d1fEF^@oI5JHja)SFTc*l-bYVR z$#~bQGb1D1N{7K(A)z#GrPiP6-HYDqJr#U|wzw#U1 z;z6d*r!S?qUk7!DCrEalH(5Q?*2dG8Hu$fxi>#b#lgWrsJdW*yFn9}MDGDMNJ3itW&!8VV?ZGU5_sNy?}6%u|2<_%S#kG#LE&G0$83D@sc2Cr&&9dI9^{LCUoXV}@4DmB0PvwQ7harKO_BbxwjE z=CrEbw>4%bfse{37NoPgQ(KP zqDOM+JjE%un;pIaAqbRD7Q)WTtSRdFjmT12{1_G!zgNG(>Q)siYwP_2YO^-}LnKfoaHqBQ1$aTbmo$_1iC9x)dkj zm4M2%B_687Y71>(?eG!bAkk+6n5xobNMnr6E5E}V@CQ(96fl_gJX-1}3C9uOV zTetY_^~u^_CCAB=CUxv715g|1`%#b0-`){6k;GbT{KWtkS)ro7{0>$2Kg!=dwVgAx;Ze!@(m* zeBvA2n{QEhz#*%ms!CX?5QKQLg0L~y z7k^rJWa})2sGU5tV$C4pL~-ZN`1bsOnri_3Kwi*^DstIa&!3O#^IQu7gQHo>m64~# zvJ-#C{f&I3?Tk*128vgMS|D1`@;ibs+vonYwBsfYGJTNSI|NKf0Bj}kmDD0XmHX~G zL<#o!oJRof?tDKue$@ZHE_To(A0N7fcibGG>gpoG!yowh)sf#(1r!SgHc%asA+rIH 
z$Et0q3Wgo*<^|H53OUSnfNGnKt*uLnOr&)0U9d3J>>4o&*tUV@nf|3?aNGi`n8OcN z#de8Y{9kh9fB!!m@3DDPt7N59*ySyFNX48CZN=b&^~hXB>;OV zUH2k*QmosI4RX8t?Me9bAO-d?HPVH%fK^Sw@<3+}!L0yDMV~fBGZzvOQT^a>8%1#$ z_lyqN0obGme0B!W{{JH||BJ~?wptVbE#-P{xI%L$xre)rn`4H3z<;-g78pn+1AX>S z%vya@6NKe$-n=7FjV|L{(n)xV!WIE#&b!^t21`g5I~&l?!ZvYL$49f8bM^-uHF*mQ zP9-wk@dZ~j^*}fC|1Ygm;}3ZTA`vSPXY_5~gETak(K^msbX-hQCNa2m3q?na!m39) z@W|PZoGQskm|RLy?*IEY5KMQxGkfA!2~C-v5)S0;pM_i?0qdYSxGghnmyglW>qAir z9bIt6{sp8s;MN;JtZHdE0itnkqgO3KND}Xii$PHaKxv1Hub{A5mZ@53sRb!mv-I9s zm5fNhJ($FPl1W6=qQ+yaS6(lp`n4J-oV`!Kelf-<>?W<%gc1nF9H|VPF3MfaB*O8b zk&)CuIfNk$fA6fj`tSm)pkOqVX4+L=3x0a;n;$-*gAnN;gc~3jOOag`2VWp)n_=~C z{8Nw$U;Yq?v$G3b`p3iNzL&c2=@eRCLgps*%AsDbEsyf8O*b&zD`l<=tz!w6M-6pjmpZ(gmjj6BPN8q`l+dD!e`^M zv5^t!{{3}TRoh^6NVPQJKpIUJExrXT&G$DT-a1p% zefdC{-s9NVqmXOQWp#}?V6#O$dBXaGll3G$3I$5EjlDgeH2GXKz}XBUL*OGY#bqN> zv0_+Kq-LaWFY$_})pTSs)MlafM6XWt^?6~(fYDXic)GZ_L~`B;gc;2e(XiGXCY7}1m< z7F$##ghK}T+z^sB1hsAVp$p`}7e&Pfkn4wVMpzO&=cQOx_z?pm+&m!E>C(AZrcw%1QhHk{Dzr^~10_`uT*jgn{T-Zp2@vv>jZX^xVdh}D| zSZtw%{ufw*pauF23jGpYR`#bkKm9%?(R#2UR8+$6c=wJUkdvrWC{V}Y^`qhDd#pK$ ziir`vIV=)nh*aL_UeGSIyn_9MV#p>Wv>X)|q@6vIp6HnjaTPvYbM87Y`Wd~_XE+vN z(b0a`BT~K;Nsy?piSwN%wXw1x$jBj8f<8*?ImF6(G60r20f7K8GW5zm7_OjB5}q>3 z1B$`=#s=Lv6aN85hmj^aP^AW9b2&H+6SoY#kLY`Y(EJD~9-X}V)z%ss`7&_Wy_x=P z1kM1UhlE7%R_$zVCkx>t9voHrVq6FdpxJ2_UF%|2jT{R zEO?=q!92NNP_R((BT<@_F#RFT+agtCWMl-(oCmvz$m0-dof-&A>l>f?AeWO$olpa9 z$2+;6^5};~MMYH^$|)!e$6A+HH8utz=aur!g7|#sL1rjXfLBCL2`{5nXh{dd%@iL6 zjUedU*P2TFT#*AuTxWJ4f>_(k%v#6z3FsaoyMvUk-P?Xt=I)!*v4UO?f`VoYMq$e0 z@(7S``8{_V8jI(;EhQCIXe1XoB2IY8eCSIr731M4GDF8j2<5*W3t5pC;TVkv{~Aor zRabKyV`pavoVQqB0~X4WZkPlbedOmy@If>f<9BxfuZi>*u_K^K!(b*d3q(=}2!=Rq z00Qku86u2R3p@CGp;c_#M44OzD+|j5kF`ajra%eGd;gvtveGfH_3(}kHK?Fcu!iyY*L;!iX7LftsW>WBdMCCv+2nVl684jWFXSa2BP6vP8^;8g05-)5!)pG&- z#7BYc^!#WVf_#wC0Cs}$IhAq9sy|?#9XX?U(cN7Xz%T`gL?YG|aESe_&445kgTt>F zB$H774IyL_p9D9{`0ic39{n_bQf8AnLJub#2Ivm1ixamsqr<`~1^e8kU<4DAJOC2} 
znSZGMMC78w;ymtfI;s;Tus)h|2*x_LlEU*viK1rg-BHb~Mc`C3(BCi5}wXNWe2D}F%Dq69*dKx(b_i+aG zGNwYShcB2<=`7a#92vf^xW}$|=DqH|ty{^0U_vnpz5wljLiT??F8|Fu4lxwXQ_JM7 ztNZG0Rb|0KDhkHDn76lg zt2r~LeDjl&yhb~Pgvn}`h=hyV5;p`BqIQD1z*AaN>AIHw@U%u{6UF9{4o)>fr3GY7 zpf4*!N(RL_6(6zQ#Y2LpCO@-JiAqu3hzZ|QcyF` z*gXF8`!`XA0L@M6gXxf`g>W~5HUJ+b4g$Dxw)S}v@#X#b zu<-6Lc|fiQj*uJyX3rwb0N#3314b@Q9dr_aFQCtHj$_?`5!!rx*M29jwO< zdqrnvx9{ISKwnCR;M)VVRgg@c+&3^wi5%ZGGBn>-h2G4^fhpkywH}+lXcesOjtfN}AOWs+Vs^+Im9);67dQ687o~+*&2E zP(-Dmb?)|CK%f%nC`tNq2O}dRobYqtYDj3*o5xEg^*`J%PJ{q_O_6HPRu7V99YFoU zHm=TXMmvYtf~F|#cO&(aGMrQNj5;*0XXi`<+djDK{BCGUV7H1wa0yvDllo9(;%fT( zwzA*dke25L14%*QwCpX70iL7Jo4K>%Fg?%g+4iyx4-g*38Trv|Z08XB56 zdS%+IjGZXx$V*ES;n8)yu52AYXfCjc1udf?&Z3%!i0c4jZ(l?9g9*|j0B|6YR6=eO z<}%T101tO1(6QzuuyPebTjHfXhb|-HJe3R81X^wo^8H}1@aJd)Uz(nygL`{|jZNIS zqM@OIu%Zw>qTeeNCEa1cLbfbg7MXQM^Bt*n;Uk)BgC?d`4H3NVLd&b(lG`3U@SZDl z;t~?64r7sgcLT=LtjWGUuf+QLa20ot#se_Wy7XH<{d7B+m$w+q;NsuYBS$Z|8J1@} zY2&@ZZ9H|zrD8Bj=?a00k{J7lQup@Z-3om^5$11BA-50<{7t^i3nZT zJAO>!2!k09k7g0Xo?UXN9WAMdS|V1I$Pp3x}`hT!oP$zPz^ zCKMFHF2|)oKcfA_j+ztv^)uI*Gn+TQyn<*;$o=>XByic&mH#>cr4j}cEP^By1{o(V ze61kVMv^o_K+i1Q;)AH&M9$dN^+VE2hRWVvCP?8 zWe?G)y#CI5Vv}}LW8)ASuVr6(JUpNM_tDUawfQKCg#k^+48?qt2Pi0xm*2yO4_Ve; zo?N05cMwLI!!m%SBQ9T zPWkHRFppPl=FiUGdHPCZmIo{lrid7l+-nk(>6%&IIA)$}lY)ho-ICzzv;vm2NW``D zi0iW~Y6%GoBhmQF$XR`EcITc+7pgWco&FAvSJ!uFzatS5SzgPnuJ(!S)h^h1Ioq_+ zY*f1}!S>>Of;1C)rF3vUCs}RE~RB2IQvx#GM=#JsG zw5yHFQzG_9xaTEVT{bu7UJGFugMcJp_`$Si2aDU%XnqZ0@y?gZNhh>`Klld)9p~je z;3Ihn776^cRQQSnKtq|%%F1dr=O+NDkL14&?Y57EEcw59LXck0AAqjie$y2OP=L^( z!{koXBr84#LW7Cbool!QON`L%Sj<9^YuTpH(mmuKL8N$Q=MHdm6OmQuVL#lHNyb6FZyzGkM0ytyR|NW5!sReYFaQ`%MB0!(L;j@YLp_t?>*&PM~JKjKyB;EMCT z-%Ck(ezgRuFQE$p*hhqg-A6P9ZG=1geCf#;VCc4T4>v-8YxjgE{B3+e`toHyguV!Q z9r_U=x;E7|{>j+kLv%jCsuB( zNeWLsKO`+@78oi&1-g_t#xyB^$nFikHrX?o?HMrLQkL`NP|OMYO5u zk^hv$?*`}6f`a}Ty;4$2deTPt__Ekg3ECyk<0-do^ZJ(hJhoaxe=pUrb%jrp!T|m1 zN>v2s(BfiivdlR$nKp6VbFfwcV~}Lyx9IREq=e*TGM|tAMUayb`KJ#ViSaG3&qtsW 
zLYkqGk+&|lpotX@CM5(vu=0yA_rI693F5w_BTZhqHdQk5^ZjF3rr^n@C;n}{#z8+J zF3wb-9Rgl?@??#2^_DQk=LlD&Em;QZ-JWlz^jyFy5H09G9r5k~(;XCJGlN(4h8-`V zgDg$md`L?S<6Oh>4z@fTujr?kF2xxI>rIbbM`rk2SZwT7B$An>>GmU4j@TIx69YP7 zZF@UW%_03FiY=M>ZFX(CA|gKi8p1coVY-eja`OZ!rURNkI_9~&4Y4SKn?vZKnEsUB z&8eMTcyV`6EKJsP>|Y?0f+D}!PSwGAPKv~#&DSE?{t z5G5~j?_SaMFL$n(%-wpWr7GukBg5beeO2+QTk67$VUg{Q4^D^w3NE;s|4@7Nr6+HP zfYo+u>#A+`&U?~SG`CI&#~+A?FzvarOMHWxfpI}vx@Q0#9>SA7ycrC+5X=o$&Y7Ff z8wYCyU6y{iy3ahIASXYi(@9n!?Ej8Gsg_^j^Xc%??^+uNQ)@wua2IFXro!-I|(=^^5fh;zdt54=8~iF46Y;dKlVTLk5xa}Ewv`$1jE zE%Mv=VT-H-eAx$RhFB8GoUS=oJ-n9Rc@b}|gtbb54ZNB-GEBHf@aS(2hyMk{hFGTr z_yR>1nbe12jpP0*875H1Qd3iT&6*jGUm}A!2MatPIGDeuJv=4Wz6G>KBAeSaBipX@lrDCwtYYINU>-#6BZ40tpmw8L`Uy7>+>d z|08Y|L?}p`k02Ne34Q0aP5)-$0?&~UwYa%MktSzlTmt1Kv{jtQVJKYOye_fUnGng) zO0iv8IXE(;Ev%4M_Qa?_`kT3#nHMNOEng%90|N(F${AXb~+&ElH> zj}tUAH%BDxa?*^v!iBRyj)t+QNOXoEAeQ2Y4bF-V3Myjzw}S%%d|z-N2__A@u*M@k zp&{k6XMwVeSDNPR_czmmb?$!PA(a#%z1dH-)y+tqf;;9gRAqlPji3 zZo3>x+HAAwW;3bHCFb^ed4i8md{xd{F}gbN@Hi2f0zig|IoEBLh!`a$WKT$EhSg`x ze|=y{NRSNSAf@^&<=5XtH~O>VA`Ab>Q*U5J2Z_&J)V1*RV1^KfauJ2jOQZ4SW!l#ZByg+>B0 z2q`rQMg!2R4b~aHaV3Jxuhg^YaY^W7s+e0#Rk@ufu-r`JAp~a+-v*$kxni<( zJfyC7)i>j-{?`5$83 z1#gE*#(n=M27?2I0873R-P|1JRxu2S%y;E>`eWE4oUN`i(=cxG(35do4Qw<4q^{zm zcH`=hDMgm@opN2)H;XaEl|f<=9Xy6WF$U5IWr_%(qfoM7^`N}kAW??bxDQJc@(VG< z0|ep?o-m6~RlYKE6sg68|B2L|8>@(V_#xknfvk^xeTitmxYLY!$}qY_Wkrhd{()l9HYuJI~1-yOu20AADU2E`Jl|aEqND&+7Xa@_qECQEed>* zjpVZYjEn<6&a)J}cU|xvYs{f{b7Qe^-6{_w+c=Y)GSGA3-X$%WfB}TyFzZN#L@qqu z;LZh%I->4B!Qvw{bRW7eiWSJ=4mY>8K_=ILTaerkboRllAm$^n3!h(qTATm(qkP~! 
zC4fB2zgb#X4RK3A>aihxriGI-0i7HjFJ?k z00|$0;hs9g$!RXp7Q07=JC+LK;P4yCVzBfX+>c4iykx- z1Y0NB>M$gD6yn=)IV7t*u$@1`T7L1c{e$D3zn$4ZKn5KQG=w`03;J=ll=G+IL-l@%NjL0C%>wm_Fn36{Hq`eJGzkErT{h z<-U^k=u2$Nz*ItYk*$6nyhl`CAK(*AE39{&w>@$s*Wf|_t0NjS4Rsj0Ntn(}kl6p` zO$}c;4ImPfnA0Ioi;Ieg@J;$%9vo`c*_?-6wuKjXm5lEh=^E4 z;SX{U7$P=Ty0y9H`!y13tvw`e@%0%g+!&zvns46@7wam5!(hSCfVp9y>vEf+auEj) zJ{*>O1sz)sF&ScBv3!*b0V3>uIlxnjv~3XUhCmwr5UG@zeN8-6u5W`eVj71!M$5W@ zuoy5&%Az)4(*Nco?@<5qIzsSqq*2t~=+Xh2KoH1K%Qy%__w-cGZ~ZxW#?FfLfsKd` zQT!!5TaGPJo)L-UF7kp@4i3pal|vEHLA$uj+`*5t@Tn&vrt;x(KUQS6V zsl0dZOmKsdW;k%{SPKpwku92Hso;nDi{bFU6`O?@s z1cw>fa_VB%*Yy(Y`thP7BC&7*eQ?13!OOM>&cFxQP1>vrv%LYjK;?rS5V3iY`998` zgeHd&U>tbB=SZp_+r&NBpWUxWe-ov=>B+No$aG~rp&Cxj&XIJor!au+;Pdhe@4-Gn zcH$GxEyUQED#J2poy6cfBonYV!c@a{@xunfI}X){A9vqe`2&WJ=Ul)$hWq^1xNaY@ zs5JWSmjGoo%iRe-55nLq;a3g~8QVw>4GwA|OG41b{@l*jH-h5g7^LRQFFo81C3q0S zo0#1r87Nofj0!#~7z&-`Xd!GWP!b_22k6 zG(AeY>OTK9y)w?!O=tkAxt%#Y^MYcIWOsi-r+KXl(_=4}MswV@5ZUD> z2^Jie)YKEOs^k?zBWpt*^BPp1_#74MTT#(^2Sc-bsh`0n$e?q=~`;7vU`u8z{M^@2;U#5VM<62}iir zj;?U6!Q3YF92|S7#Nulc`Gc#^^`5`~YAEI?Z=OB-ug;sV*h7Th@{~}n7rUT_&E1C& zef)H{aR5~>s#fCzDHjv@-Z#($q7}nr^cNSD6 z4%Q-;{#ZML(=a@5_myv4EjVd`I6S0mK$&p3hd70ruT z33Cuw?=6Z84HHI)iE*zdC$AF97>EzSVscD5e(3zkt*yDFKmqmNdq#O}G1n$Z`Sc3E z-}$rGmB%K(U9}Ov)A^`g?wQ1~sb7b*+h%?_A2@vS6xYVXxLJ0B>tS-J3L~X>k^>=o zquXKl^8t$^uh=gJ`#JeI;av#rEFC@rZDnV=lH_t|)BX&Re=8?YaLlRrvb=gi;P3K8)uPFk>td zDn5}>;MOa}_+!SU;H%ux#4J405=a>n1r4!OLhb+{PS8^2oOKG*=X69P6m}TB2&5hm z7)U(Xf>=B7A~>R#@7@J~DKJ9R!$(^L0>)!tY^<xc$8dSy?>nC(6F`WEQ1J}<@h8e6I1Jlf`6L`32fO`s1;~$s2ESdUWX`4@E<~- zg1G^z8J$i>D>h9_Ooo8cf=Kt{861xv?YnAn|Soy=|As zzZ2c6x^P9}ox9635shbzZ#&H{O7Em#T7MiUnXjT!ugaNc?X!3v)MWhCyhde>*596Xx7DjM`F%m{N+OWuwx90N3{vJW9W(FoPfJqjU$K%j}JlLDMBZ}al1zJ5IjT^Usi(XS*x zds%v>m7nW}+R^}7!_bM)t0|MCa!y-ERX>m6n3vip5!1~}4*R?2{I%U@@wC_CpYzP& zqi2)>hHdwht?p%qwWmATmuQ@k@rqus7RhGp6hKGgqJP03=k+}UhBI^Iu4L3j zot_y>uSp+Wlk{A^bZW%o`txh47)0+Kfa6AtDML47PQ4|*5*sad0u`Os 
zV(&2tPXz!S+r+nWR2aJfs5SjJSc7=pS6IHEpK>+gHpF1M5!{#!Vm68)^5oJR?0At@TQ^1B$@&5DU2TTu=4Oc2C7L_;Q)`r z00^kOZ37874b{a80Wsr=e?5on z5oW6(L{K56Mlff>Sp;e(9*BX=2QihyoSK2i79vn}Apb@7e?kqnAo3BjSHx!kq5{$n z3Dt)NBm8@5$o<;AOoq;2iXzW4gU&Y!Y#nLh-M6mf&?IMCEc7wFV!zH=_34ox8{4q- z71iX&p0}ZvOs~$hTV=F`TLc(*PeGT}nWKQbp-p?a~f^7F?WpjP19UM`A zd^Or&xZW&SD*e!GTw^5F>bFwT5ZM`O^2W6?CrWPJ1ETAQhnCx~^gnnZyC8k`O+Y}{ z;90BZVQiKc)4ZQ@8TOX`xvb2`H&A!XhumTC5wGbIk7~ZmrU+t_M1QjXqxf1Qrx`Sq zc#sDkBPBbg>P2&LW@+gUuo${~|JWD@nU=qQlPVef&vDU#Afdk$`EB5^N7w&vSFd3?<3~(4+cc7%U5c7^N^&l@Hanv66SBy^Yh~c=B9H3h!a9QY2Z_>M4*z0fN9|w0B8@e zmVKbfP#ron*YRc_=1`Z{){L#Jt|690To(YI>Ec8;@kkA5>E~?vFQQx_azG3qf`0Ov zw=xq#IS3IUXCe^+DeN2Xg{7slR(O~M@h~x@$-F^vo0h_1(we zD<1-pMl1_c7;L<}t)p!J&kV2OD>n{`sXiLA_gkLRk=&Gye`(DMUZz(kN%7ulQgxy} zv;*R%nAVR%)5X^}i089dp|ngET6tZ_qNDR}|2>LIeVH`K{{@1|&(_x3negz7vfP{U zfvxWx2c?Hzrr3|%SHJ4Rz;bOdMlwT~@r*uqYO{d(%z98*^q!*vq%|cz-&DtEOV=Z)aty34>-Ix%gbIWS7v7>-%FCNw~QM+ zdCKN*SjLKwfgHxgh|1cCp*rF)Bk=$);yTia4};x3$De{a094Ol^fBqv`gkQmD#VB( z9%lFHPmA{q=5w?3Z8k71aam4|6aj}EtHk$$F_0k)O-*G0*F&DBzkF#2B8OT5!U^C_ zpS%y#Isea)a$4-E3Yr$;UePV;<9v#h(%#2?GOec3JT5>X$K#z1C~(;+Z<$K z3Wd4=sq!G>Kd%PN&JZpfHWJ<^IXM|^@{IPo`WMtdaF&q0;0Q zYj{mF^BeIjFbowSDM(HGL2HWeC@j7kZq5FwO8Sw$kULb7+ZtWYuvCCVyMBxGmrkdcx|_7*~AWo4b` z`@Zkr`Qtp!Irm@P*LB7B`}w@b>-AhOtZ3eB6VLz`Q0(FUh71&XddUL2p}=T1(5~^= z3BH;h5GxiJ7jNr*f|Ls&F>b?r9b&25Yb@%x>@^^Q#y>JZJxmPmJbxfBBv%5edL%cJ z93Qw^rsn1-bU~bvR3jWYyAK{z1n|ow>YCjcFkqx~^yTH}ULHXO1$}3(7i?JcxS%OT zEBNJRjaD!1QQlJuoq5#n-p9+8$6fe&K-_q|Dgcc{V~Lv+Jwn1RYH1mktyG77mRCsE z{`fNcBrT_k?mf-=wrg1@W@dI9HXabAef#nJ{nr#*sK2y4JtmK+aID1k0__>} z@1)&CBm*;H{|+164n;<$-9(QlPHL#bi2+it(wFJ95U-uudbHjoH3Ri;_G-E`C#z2# zboke@O?RP?6Z)rP-BbJ#u@dmZAoqcsYNUe)2M%}&!o^(>4FRPfp(h{i??T2$Y`)@! 
zA50)DP9S~+^pr_r`391Gh9yq|AAoOU!rAi=z#L*^wm&jmwbo8j+1oRNKC$AhFL1C6m^UAl3)Oh_ zVBpA&1U5Kduu;*;myvi6CY1p+_b4J{CbnaTWy=2$e+fS$N)wbo;3Jr>cM`u4s(Gc- z+b$sN2?a-TV^A7F`9V;AA}e=v9D>FDHOl{O`%h)$D~`rPF0h&7`g)itPPzX(N#!#D zeIJT~&6*cAH8igLX1dFSn<4H%DK9n_oK*AR!=Q8N3eRYBIlRLHRW0#<1&Obo_%{H% z0jV-jVi`eMj^)bd@be0Qc!Hb})&KvihdxV!?-ty}w~~^=!~3#_mzdQnhZfYD^>STL ze~zL|P>ky=cp_+Kmha;BO@G0LU{2PRek82`nO^TrV%%jeCMdir|6ie`zPtvA+EjE~y+soI&q_Fvd_`@xVXLyG-frQuM5X2R}O zt|yc|b79B5vb8>ic~vsetQ+}4xx+%`sm*mxRodL|=9?q)4~R}%bLw$tW}a}%o$P%~ zb110u<7m36ZH{?G&f`zzy)jD=u*xaR6LT1$W1S~;?~2^PyRn6FpJZ2JWs^pn<)Nw5G>fZ;O&oeNHy!i6<;@KD+xHd3r68Yo;@wmo92ltm__4gTwe>`2 zZ=w?0-j&E*Gwiq|0L1F(M3*y`Vp&8VF5c3KLUs{ik<&JP>%d}ztKQcFmmSsY71gbCp`5TlYk*78B9O0I!?1g9#Z;O~6 zu(tw@^p@)?fYt%7Ytg~{&A6z4V3v{WlxjPMvzLn zdV1x6NHq#w1(zN%6QMR7DF-@595b{_qwFgE)^h9_82WXgk4c3O+dJ|1WWQIX>vI0% zIPT!BJ9>G`wtk9{k=n$5ggV-}%%effr9{W{+*03N8u#Vl=Obs5S&lC9XmOm1Y#5lk zS>FD+=KCHyJKaH=*8h*xh|&;SdI~&yH0Z$b^cT7jua*I@8Fq#SV4)F0H}`@W?Y|ViEE}&( z46pD)1ZzfLoB}kBpb5>p;m$xs5KZtJZ;xow;0z$DWfT+Ot`GhHgif&BKlG6KOc*RA zr0xdbDS#$2Ld;zR3QeE_sK($mddnJ6uUIE;9*5o<-xT+e4b&X}&*Tc~dNr4B*dL?B+@Il^VG3J8>pi!jha4e-ayJ4AzMR79tD-FE3Va5o^_Dc_Fj$b8>n*FA83iO<*qJ0FEy|7xHNc z@ETs-QA~!^Jb@nj$tq*#O>U?b6$kAh^CxzMq|)B_hSP&d$MU`S>^DX01D#n!NvZJri8hj1E4O@+nAO zJ>}@deMgQ^$I7{uWNI}1?(~}B(B7)2MIGREtXSj)OW@NR!sEY>b6-vNsZEJ9C}(Xt zQWXb{EpzL)8D71kB|0`2JM$Rk!WDI*ZLI2_Q98!k74$`^C+SD6F1@yIiu3EcZDK_E zrAVIRNj+O4RhHlo!$ha`6!8>{`;`Te%42#!-or+^-#DRv>6J|DIGUgVtN73h3) zntc8(a4D{xMV^`S>vaePAxk638u zKYVCrVE%%RKJ>n3j__Tmc!*iW&hB0LBw)K!2X9ZV$LB%{IkHUA4Z4p0Q}&U#E6)KO zpdsTI;IK^;6!dkc@nMT?yq1lyiC;qzN5 zDx^4K^ASkzh*w^enT>qp?@y9xz(lY;`G`dU1m`h{3Pqwd`f(6EM3H!VWwzD2xqtz- z=10I)K)t@2DEhA?EEF9V#EBN!JRBSp(1}PMPmt$`lklN}K$e;2R|E_fOwrWnoq-IT zZB+wZjRFR$fM`x*_UT*5Rsf2y(r`=e?itSi8tV{Ja`;}O?d+H;xedw1127P!ksj1U zAn=xxp9wsaxhE6H_bev|{!A_|YLuElo)L<>|6G!aKX4~hybayfuzeyslB6X>mSl#6 zc=vT@v95A(c&&v!PaPcQRSy<*sH_;^JtD95YQqvdLXgkkGk%2>kNADEcSg$3HAki- zG-bxoT>Jg4CVx+$pI@eSrjO58znW&cKV7?6k=;i1C@DUEa&3|K&M(VDZ%;dC^i7I+ 
zaBpQ5wBnYsD7v+$kblH}VAHRj<6BuCGIL3tIVeWA|Em2=S!~QAgUv6EL$0TF@6ddN z$MR1%SDRdQ)n*6~oUf&7R;0YoiaKs#p||EqpZ+9@yLoXqVXOFdYHDX+ZYK|!_dugT z5VngiDNs>Dq=x^M|NNpD`3V-_Bo(CXZ5u~;Ap+Z=ONE0EXIMN6d+9&T;EMxrsEC2I zBAz0wBqZIuM%U^yd^Q{wOb$h*R(cnq|4O!yBTm9f0)9c^3iaa6zxx693d_?Zsc^uJ8dzf%F~b+8URc6>)d<$R=A`MN4W^rF14oLHE+1=^fM zg9Zi$n*6z%ss=iT33teiOM5hf)hmH;q4;+!ez|ggoH@%ECsiES7l+CHQ*iGl(l)C%Sb?#z&>FIfhDstj$-c`-i|72WLD(5ikvV{BGJpxK~Cw}pG~W_OqO)9%a9y)~PoMSU)s zH$<DD0eI0Q+L@%y1H}cceGeE0 zs>t)$|5TF=?>jo4VM*A)-a2rAYEob8hs*eo-D%`*^L{yTTN(@{jusMvvj7(q2`OzX zn!+hZ(7nxKBj9BrrI7&UL7FS9S{cqzezcHmhlKv)Bu1-C0^Y&(6G2{O4EN&F6YQmL zkWqse&vzK5#gZ|A?e|n^EPqZ&AW#m}n*wzeE=+;ZrPx_;n7~>4;NioE{Q+X0t7ifK z_%^S{(?x#{LJ>hRD}(nUpFPtDpn39JbX1f)M54i;hM+av$C}C_^%0)=8ybBt#ETY- z!ym{QMY&jbs|BzN`7Su$cVm&ZHdiQ-sTQxp{vm-+0|HI+zZR2kK9o4Lm7zXjePg5N zO#^+_A7@QWs69J7D__gfFm6>(r|&W;u&Y>=3#X6LNuq;#ghQUj+;r_xo^as4d3BX< zpDxi@adPtu=uPPKuUU8hv(Vg+UKRyt_x~@F#mn#qxuTWC>iDJP9Zx9tCysh5;U_xd@Tx1 z)(P0lM{=!?auesj(pR8?pX=+XZw|aVF+Zyai6BmsU&e_e&PY5!B zBs;k4OcvAuw~-<}U8m%JU?4kyPqd&QbbFa*1vmFQl|mqZKrG$Fq58TyeUxUt(Fp+e zfMD`NHV0eT`mO&&0pqcs=@s()1mfLi+Df=EIa~AQ-KVkM-kU&XksRi=nU~#NT}VzM zR=|@BSu1s*c9T%HKnXE z{l|!yOF@9%SBx0V#Fjm!aALBQcE%2FBkTye`Vh1_K2rf zNy!zre`j@6$NYVKXv$MHRTHndUylEU%DfV{8wsz*0E{`@WJoJi= zj(<6~uAq4!k*oM@L@Y{Uxd&wz)=!$5)_p&cP0S=>%Bki#;SFNdT77 zP@?vjg5HKO>PRhs>cZ!S}dq|nNq-^H5LypVNxPo(_)fu6$66D20vl1qCV!6V_0AkUmk zWWkMuWArNI+T!P1dO@hpg{*tXKnExVNWu}>D)jU+_@VzxIMQF~xHSFx-yZ_Pkcbx$=jgcgftP?v zWzLZ`1Oe#OEY_*DexndV*wL11vi#4aKgX?qI*Gy6Sm9)YI-tl04UAkKmc!qxThFA z$is0#VincZ6f~?p;3S@UC_j;W{ehN-<_?H8++Yy=+QRpT{(Spx8Z;y9=n}ve5XClp zW?ne^$4=j9YQtC)g6v}2H(aV8|Fvej={916q;>az>z4IBM?S0(&nDXGMmwH7BgIkzjBf{6?zt4SN zdBU0(;%0)E9o;UEu^@=+Cy`OG-=gHj5g`Dy5?=*Dsw|?SFOVVtI2!Dh5iU0U=Apw2 z0Q0UDIvm5MBhnHS-o(Cz6r=kpobfP8!V_53<{5{74f{&(#O*6VN`8Fb$|~ zNVqmuAyM?goJf>n5L>~th&!YTnHP{g*r0~|@9Z6x>7Dq;@gVRb#2YEei-qzk6EYQ? 
zvVOSKk=MvCrLlt84~z*B$9xVb)L(-}!cR`MA+eKX9jswI$2V~s@G+Vf8Z zcy~@0J!)%P{@yXEh!uJo01KKAGJt|afZ((y)fk&NR6sT}!1OUAMl;Vo=_s+^%j zN=O4GJgydm4FmcRKtc_WP`GvsffA524uX+yfL2Go#NL87;XOYHFG7ys&H@8KD8}yz zxRvhV;Ge>7A}NDLxeCd9Rzvx(me*f=_1!-yiRQYf$NuV|Ef2h>LoeGNpAR=kZXGWG zJVz#50EtbX4V3sQ1& zbrz~wKj9GrVMNWHmz8CJuK?{2#;N!Q1OznZq+#R)!V-q@d4LU;-qI%xARL-N>dCMd zbQP%G;yx09fxH);bYa>?Oh+J5c~$OItZR>4|0_6;^arVw;o7GoW?arS)YZ4RlDJaSohVD zaHn9N9BEq+C_km*=ZHP*J0FD@1EfaimT!=x@T&DpI2UcY`k4$`K|uS-zmGTR$-KeB z0F6ZB>A4o}p#-|m96x_&RB*&a&B3`paq(*>HM`;LLA7$zZkjxMs8k{(J>ED?|6xQq<93gO!K7Zb6F_{uNGkerAOgFC6VQ*k|06h`O{xahievJ6U|ECh>WF_ z{{aZiKyfh&7;l3(9YK<$^M$mBL*}lOjvLg30eBlIDNasbg@^HSna2tm?jHLpHuUP6 zsr2=|nh$WiBF2}CkaoBwuc5lMjTzIrd^cAJ14|B}=qI^EVA86f$v5px!{~y z@2z$cqErv_;Hhy5L-wb~Uu-X1rVw?#lX@I7YW(b3t=z9j^dUc*7zOYpAYnFy7%%-{ z-K+bxp5(~_#v#RtZSHlc*N-7Pi9&2STI#QeO$Gd+h;|=ZVv-sU0Utlmu>-KEBHu@6 z=~*vIJ5u%FjY<3EL=abs21*tj5^U)AiLwS6{6x(8(U8D@`F=-Pm5)MSiV%)Vm|C`e zi7=`dYslz*+@d+L97-hWUzJefLFPpPK`VNBrjw?R50&KM%0^JGFMb`+X&#OOV7i_3+~ zBGE2(4#v(T$;DvLeI;JKL?ueX{C@q??y6jAfCSP8LJJh8Mk-r4l(TU)BJxTGC=*GO zf}()n?As2C6-ynJ3%F0m0i7Z|1AR&3=#-%12=AZ30gItD4->4;{G>deaJ!#_=HH{8 zO#B~AJ9mmLyq{ddms3<_@(h7@q|{~~zHpGl{q(Gk-tWm?xynY*H#zfPKliHdXQTl3 zb>+)C4X{R%itkz%qok!)kAa&-qO>m>#RmfJtgLHyVZxPs;vBF15tW@OqT)~s=wr$u zJlHrTF7=kW6X6Hk%}8HFJ+ZGb4goDucmTHV@*E??l6X>5V5}jccU9NFVnPED z-1;CvhA7z{)L9P)JZ4K%;d~f&_e9pDb=4lC;72=wo|Oy>1S2VVee*d?sUZ9@Dq|w_ z0Ni@D^!8%7LW8KCl<8GAr&~VnJZ0@ipkn#e#)k@A??;qBuU7DyKO(`vzBy0%)x=5T5D&>7m)If~e3ha-d zHLpaCjY5Crh*o?CNG23`y8%fcY1T_7?iI`fRqm&ywjMqW zDM+A#!1w=v!i`M&BAFGozZax>4n+SOMn}mDk1qf63yhmnQwvk7-A`&PXxWAi9+XGf zf~+$%d6%Hdf!ALTq&OsF-=LmCL@RFCBS=cY+K1g%?ONvRqG|11`LUqn}EAkhJl$GSM_Nw1FXpCpLl_f|q> ziMsd|GKZ=}|9aNrS|qtpP=}ML3aDqH(jf2`>TDL6;$d;c@qg1smuQ`V{$uo9MzH-q z4k@J=KNN)ks)&xWO7!U(;%p6}86w$XI9j3Ei4?S;$Lx#O09SyzlLi2c6iFDvxequz z6nZ<%dR0j=D03VS>yNYoHf%g77_9F&E*vE9&Af!J8x#{fIUIY+<_LS(N(_y^~yn|5$z=q0&==?(#naO~Nnz5M%>u zfMrWoHFn3Atpm5oWK}43jJpdRRWNWDFKVMiOC25?5fl$iyXlrubRlAoI7$2Z2zds} 
z)dq4QAr}j3>0n&RCu|3#B)=`*c9dmc2+x~#GD7SNRG6!oD_25@x13lWps$0fdnXNt z`Si_D;4ILCRiYS8#LQKY#QpGA{s1i?BWPcB9+DFrnVf$v9aXL%%|K*iB=P@}#8`Ao zWPT}d1TxDBM!x^fgkcA8PC9+EWRYYQ>G-nB2T8i{EkZsA`IrpMPl$HGrHsBaw({RI z<@b8{t;cXtZ}#TBmZ}D^(0bgdN9p{-A6=|7vs1vt%V#srO@umU-r1CU|Dqlym7*?z$AqxN zrY8ZhV3Q$-C;{Pa*`{ zz|8C;YJ142&?fRDY6>+*l@q1JCKqJyS$j)l$aCOQ3JDLNSJ4_m8%~;S%RONJ%$pOZ zlyl|uAb7YR6vU8aETx0PlM;^w?5R#V#(5ybciXi?NZWy2gmF_dL3Qoz2eG~mf6=QS zL00G+pdyObF#!bCU*Ms5@wG0&y$@eS9aQFu5 z*gyVfoOjwERUD4%Ct+d6&gb^*LE1F(oIXO2Mh(jg;zKTAxsxOKe<4Y}wWas&ZNy%| z(Q>AtI{`N;)*gVQmGQm#$d*xlBTtA^)M7LMp0mxr-+j36lg%InxAsx?gZGPd;7-Ji zPF78ND?__2rcx~+Iu$kiX(%DE2H-{-&M~_m7f~9MA}Y{fAO2f}i*Ze7hN_ zz#ftv9)*|1GGGmXWq8ZP9cvnFfwcpALOzQYh_Ow{3}o1@g`X};OG_&O3B=?SL^$K& zVY4-KB>-2%fduF?G@srDXB((bWn`4X(uhKXfuOM{jEGwVPXpZ|X@qc}p-zs%s<#=e z-cCGSphKL2)Ih09f=tQ09O#w+>Ux7rNB1{K5}^zPsaC1sV!~Hk5I^%5={a{3HP@}X zvrbm(yh>{8_;%W#U~gD~D?*?Jr1w3Z(E`D45EzEaVyRU|c~#yBl|8D5LxlDU8u@)G zRSjbz(AVHh`uH>c)*!{vP|wxiXH>Ie0L;k*OrYPf`aSy|!TAow9#C;fy2M13NH};1 zJ5CrxKH&?-cw=ukYL` zN2-t~hj@4L=y*Gh_bt1;THVJY%uu?dpb`Cy%C+Px4=3NQKfPN*-(KWRuZ=j zTu%Z2NwLB%M1msnWsylo!inWM+=i_$qgMIK5&&!ghu%+zLe`Q;>tKvGfXRVede~~j zKP|SlY#$dAjLWNR*KB0^K49CJj8bqh2t#S$Yc(`7!UQt26HT+OE0;P6P=))Do+h39 zm&G2i;0o1cGUUvIX%XzF|KEJDqKIIEJOPUj@=BbfI;Om$69-yNPa=dqwbnCz5l|R0 zJ;r&$%I4eq8;^|u`G926gV16uG#`NnlY%crqp}6XUJWuGBVz=Fg+2dx{QmU|8MA3h z?Dg$T%hQfwtG^mc~?!=N1SPKbgK^NQNrRTmdcYFk}2GuH8F-|5pmE}}?Ft1y}2T|qk(w~uDB zwT+gRg>}Q&_l=oPmwOTXs{n)^hZRqg%4-;JFlqQA>StoS#*Re^H}zfphUkntV#=l~ za%Iu*xfzSJCJ!Smmdxx!tAhuFlK2%$4$lv8 z5}uz4gn$jpT{F>8QFx9uIMdUs8P+t9IuQ5) zQht&@g;$4@6urd@)F;H>FE*3)-$*z#|EnW%{Y>Xdtt~eHSWkavtYIyDWzmU4m-_Y4 z&J8Cv|9a>oj1b2S!QI51in@i-=rqYWJN5BL4wqBvhIhae5hF zd{NwOg!zDUj?iRpuBGb)JNsiA2}&B*3F+6AN2}6eO6I-PH+Sx_Ol&J@N$!)TSug#V zhgpK<+}_a=n43$*72dsT(h~As6jh977swJA>BiV2_XZbC)hw-jXbMCi9X5){CGHfW&q3w!d{k7V5dSN*n~!({{-Jt*xtjSNNInp$u@xFK%j`x_4BQb*Bwq zw#TX`o0*!v|M)Sf{&jb_nfueQu#lLThRed;$j*XX7Wlsc>Nd>BvrD`|Y5VoQV&A+3w-Iptta2w~d`wtIkFzC!}_?q~7cG zxTh57twj0t>$2()u`aLZ9 
z%NcImsoKz8_(1a`OJK*e-PGwPl*J{lN_7~k*Xii=js43z#Qkj=7bXIeQXX*O7{o1y zDL*764#om_--K$CzmKWyBhm2fC2qn_-Hr?_J(t$o>CUTOMq7-^&Ee*=S*sRRGRT0F zlM|At!*C!L7LpMPe2m-hv-1|HimUYd!Pq6FrG0rT?ud2Yb0bEc>jJFnj~}0NaL5+y z!89{|!&(}ECi0S!8xg1UG(6mZKN6P%{&4l$H5W7co9yc?p0EC)G~Dpwu0G2ay$`oG z4BXXwf8%n^t#uN2ZF3XcAW{Hx`m$c=!i5XX<~fc-ZKfy5EDFE^DgFBa?_f$cMzH#o z>msOgGdeg_3?toWH9Wq#6G2|DF<@^pq^JaaY>d$b89PV8d2#UfV4!93k3yt20sQ zYf3`5>v(VHB{}a-8=H(17q?SfRG{~T@>}0X_>2}~W4)|uy>Y+C{$1<$30GvBRhZtg z^ZIFOD|X?6(p$N7zuJqo9_@E-o<`w)V(&vbdV09ec5HR(85=W4;x?Js3js;CF{dc9 z7!b?=8EHq+Xu=PW73=IG?gxfq%;wwVWW^>jf)0sHd>NKAb^Fpjfub%0LZfVrl? z%@MUq57XJgzkf~W=D|6Az?W+WI)!QnDBW_gw=$<*)9W{$EonYSqSuj?hrw)Fg>OFA zY99s0)M-`p5Mww#KTDlLs8NNbkp1tRR;XPU5$6Dh=_^6U+fX|gMeY$tm=&G^j0(!C zhXb3AUA64CEQ@gS? z`|VP``!@&+`R{3a&A!V7cZ|)*i;8utHA^)Vvb4zs31{A9Dy3|2bGe(4V55|)@iOht zL<3joz>NCRZFNW&kmA_)P`HO+{phIP*$&D;k1h|Z2vG##${Qln)c_!n*c%Y-+E-#r zKV8ONCWEUXH4fe5&cub9D*^2Ikx@=ic-6pe5jRTawqgD*u?1iWp}z40ft+KCKX-iu zN{pj%3;K_Y`Bn~lA7sN3Ie)|1nQh?CPl3eb5EqXDi8~|pr1UK0Rz$6bQkld;5q=oZ z4Kg!3ixwGtP-BCJ7$HL`-n^lJrVQ^F>R)&VF&0`5ix8dS*~#}C(Tkwg#D{*E)AS(F z{ZvxQLkJwP5=>TfvGPDwj_f%z1b`x|L5$?A!uW=PkCU}*K_u7FBIJ%qw2LpQ@eIHl z(13j?nePEq-RDA?<0kCv%o3(K>zXWd{mPYd*oDOoy!O`+l}80N7SZsd+$RYp1mgmb z7{bnn*$O102dyPK$!)FL_Vq>&&I6?q1Yjr`vj18!qlBbh#issh3q=L(?Y~=#b~W@XLGdtkcVOb-r(K-T84R{pU(2tEeH zq914ER-yaovTqPY<(vU15|0$s%E&W|G&0fx?{*6Rk^q4SPA!;i(c;xQSf+t+w#1~Q z0`3Dl_|=iz>T!0y%brw?5SpPFb^hCUfl0(6(XpbaASfL&0tk8kz_v#3co?IYP}9yA zi+(jUJZD(pF%+s!L^s5v-P#%sw)xr-F`PxDO9G&7gjOf;pm^!`M}zek-ue{JHadF+ z_k3QTluD`m{#`qXItS-;Bm6nx} zC7X!)ikq1ic6z7xote-Gm#V9)`x4P5D>JN&A=|b0cd?~JuRk(3WgPyGLC3lOwR?C7 zP8-D5C|>#M+~c9eXu5F=l~YHHx90I}m~|p;Ii8$eF&Gr%lJi&UtZ0ArKE0Y9!KSHi z<)UysttwCn+OQzveB~eI{NU`k&w3!YujBD&vu_{d(quL+F2-IiaiwHilk(dpODpc< zGmy~U)|0nW#DONQ?$M(PjUAL`oXL)N_lxsv8~W>@_fkfNs&Zkzw<<@-a@~ashoJ)6 z#@z94zW&=c!f=r!@AtU}lT|n=wlV5DG8bvPFH4~08SY(M6*kUd!a<3{;ka9OZIl%D z0+VAXgyjdE`_^0hyl`oL3T2zQg~ib4@B9q%*odR(tdGs!{BI01V)kNZ*IG_Kin$NG 
z610Mfs_He#vkGLeIT^nFU)~a6o&h3oiZL89MW^H$zy|?ri)4ncM0yLm%mPd~M9)N` z2U7ET^l^r*L#oebku&|4 z?hAHbl%Lfb6CG69n$+AxYg3u{Hl`+5fvLG)#{$#&45YX%{dSD3Uh|P)4BN8PKce&Q z#{Sei@2N9JRCIJ(*&2$DJ=eYR*hVldjo#Np+UEqfVzzomp7@d@ye{G!Demv-ieSuq z_xw3^WF+gb5ZlH=CkC$SSFcQT+H*C$r;05j?~ng(mhIwoayk^F#`UE&p89jtF^^u4 zGjwzpExR-f!k2En5$n+m-6tDDVmEQr_Pu#-1zvwJ%Kk*H_H}Km>qGhw96TLg_eq;> zq^U-0#@}23_ZD#Q(Ucm|wdI*RTf^~i2FP4x)FgNAtOHkJc=>g+<+>nG=%BrcIO8`>flREKFyjp1vh!l;X(>YSn#yUdlgSR1~wipG)1}TX{Ff{oEXj`MmusYJQ5+ritEp^N|({b<3A=4?saY5@`~B zdg97%rpT5LB&88I)lKuwpK5EPm_cWgxM$!au*>VB{zvK*T3B6s?Q`euMDdmGv6cIt zar)9Eu!VN;DA1>3EklM=Fjc)TQt0Nh%X29$-XGc31WvlGZSd*Q49t1`dJ}-d1W~V2 zLqpBSz@s;oh&`K})a8-PiMsSCOJM&Qim9m#n|_}cmF4I2|K=j1@&iD1R<`M4m)I8~ zFKpP-l6QZMHn*tCRfgr~G<4)Ci}Fld)dVmL)#;dq*bBZY-k+R=yQj}HlrN+-X4Yj{}tO+5ibv+P+Y zDf>S^Gn#0LI-{gi;W#I6m8H(9qhnLU%v#;jp!ne>+;`fZCiVY{3?F?Sl1^{U0waIG zbJ^BSSXySd_{8&i(t3iTX~cjXG~4HYi5t*>B{1wb!C;v&<{%PS-NJCjDD%EDlS3Bs z=Kz0y%O=fFwb{~gmoD)|is1IwmemQ`v7;iwB6(rq{jc_l*}T*H__l7LY2Tc01oH

Z3A#}LrOczd-V(P9_VgO7%>#l`!}a*Ms$ zzKx+GV!@Okh)swpz(QPT??W;G5*xbk)^9m%2BjAk{f$|V!-hY+zsu+C+o9ze>Dko6 zK?4sa+;^-{6d-L42^J|syT-=G$f;zU6^XnY)cg9a(_3?La=>JL*Id((8P-lrOB+U$ z@ia7)51;n==O+B85MrRQKxj}r9#!~p18b-j(ATEEQ+fJ1^s=lhB~UmFY~_cAU0Y#m zyZ}BQZg7|g$mdg0X+Hc7!iz5qxD7(s$HvD&)taolY_}|ZN&O}vVdk?y16(Oya9$!9 z;77qov;kP5|MLBxV?0-7fyZy&ycyt*57>SBpkWa@lXD?`COABdJbm^(pV{xREvlZZ!*db*fDzh^sjAgM81TuM?`Rwlumz(^pY^Y1Fk+k8JDZm)*<&2?AXV6crc>uIT`4OF@H~h^WYg&Ug9nBZnv!d{+;CX-A{;i z^#_aD5uXar(=fRRA+B?{S+|#$&0dIvBB7=YK(z~<776VgiSdjE+y*R;+@WolTS#=d zBO`>xC5L$zuOy2HKmg%xhbNX9b=u0L`bM~cJZYv(P5X#h9C#`44kcC9DySeK%*S}R z2=cRGsj)-5-dWVnuYOF3!ouBM6ksBzOHHL5{p>uRnn%(D;dw>lkp*8pmOvdoUjWs?$%i_cY5YTd9b zknb$F6e2Qv)NaJVNhml-#E83$WIdwjf7v>I?VTYdibN95N(Ox4XCzeZ?lm$pnnKo^ zUFCLRqLg#4OH<)9%H z&A_$BLUQ8^3!fqD9K`>xm7Ln_LnxjW8$4Ix&v}x+GS``uoIHpV8U(4}Sp@W=j=cv0 ze+EASKGu4oJ0=;x5DZ`>cCOQmQJRG1HZ^ctWo6><=b+8OCj8k|FynHa8%`2bp;P#) z8s9R(oOUBBlMdp3NG;%wz^9?Y|G{@aBW4WW%ZT`ywLe8I+&tKXg@uKm_-Br>%_x#A z76fgk{1L!^HXI+W!-~i*Qa}Tk25Srn713q`4S44^ZeY2Mf(+I41xw5l9LhM;^U)3?}I+iq6u33IjayvZ|vU+h?D3ak)W| zXf{J(>1ZXG(0ow!USq5Xvp889Cslwnx>V6g;{S5T4)f zyRR+VY&SSW&@&+*kO!fW2eSZpIg6tK6RWGyGXcF4i2i1J5-zcO0Rf-P3@)mrryV(& zz2DQbeRjqCy?*8uEiKG9xe)T2z9WrwX7*xQWK0H*+_bcOotCx({P?ATe-Y8qJHE!sy!#xF1f@4sT(ooy3|H0E5NPoO+w5g_ z_EUs6Xgb1LbrLGxltO>>>QJ@o;?v-_kmKw;F2knW9{@SO*@o(OcGd?+u*#3fo8kijS$c7{)Cs*p8sHLn( zQ)Z{!yZ5~=4J8NSbl&l^_1+$|Z;YqE`A!#T+YC-%GS9hle>Qj?oUuHbG;xg=@nJ|g z3i1W_<1`MV8L(*vC-(yB#1)V%0g)1M$rcv=3mi#JO47r1>6yN*wfQ}zRXvV;@GEg@ zori0l1~Q{=ATT5I1x$X%@%a8`FUw!^a9Mh);UA{Fnox%<{?4~0OwGVoxA=@=76FhzfIxh!4px(Fz ziMRtiKSoe^lWT+|ICzog33)KUk)fzgB{elqmKciR@=>scg*vr^OxTGOV3_*QHa{AlA0;uzx1c0dG&+Ktr7nU#(MQ6zg}WSe2$;I*kZcf)ha39-!2Q=)(f_R03^D>HJt7|8tMS35MKGM(94kr#Te#&^Sb%v=n*jm{_cZ- z0E;P$qJn~{HiIBOoFjYp?IUUNMD7xo`cYS5Sl(S5%`NON1>T1HsOd2w>;R#LQ; z-H6asQp#`>hWd5BdrMExfXSH3m81<@sr<#pD?YcMlD&5-)Z&_f!QkP@Suk=~IQ%Zt zhRF@fFfJsfrzgC9ONF!gv#|ihz4bH{4n{^pi(8!4)wKjRZ8S7CuA4f@f7}=HU7*>> zlf~E;SzH(ZSU@gA<*okLwZu_61?wMx`|~CR1~hn2!&W8otVS5rdD!er(HM=N`ZG3$ 
zlkyFzd{3Xo^$1z{^Llf0Rnw~oeLEI%B<>pYUTiXfkGr{ycuqq*2^ki7UI@s-wN37Ez|MGhNH{&0@U8tP zC+K8!pHV{@?BwqLM5X0YRNKLNQzIIYb!b9x#>gltybilt(yx`mDyY21{W~sWD_qXP6 zQph)XExo-3BtPQ&oPkK6#?THZ*?+)qrVnuwkoP8th_SRl;gNd+RO90pFWhD3y{}^U zuSPfpM#vxq)o!mxrLOkn=V)s4Scg#rM8AR`&{3@V<)^`e!@!bL#hlxk~rVKxJm z^Z+f+^|@RmNR#kvP`|CC0_-jyib*ORp9|mj z8tEx?QB=$}HcoY2GKK1Cy6F<{6au)(jkoS=k&`uK9%4~1Uf{Ul*2uVnZ0x5+e+sFg zJdeg9h=Pq*<9$<;@-^FFGbKo4E~9c6U!L5Eu8cqf;Hsb=yT>kjcDDKIGG!Z1F^{b7&5d9n+&%RIe*+dRr%GA))&^5t*uxmr!}F){ZlfpgOk7p zmfKKuoy;+k4fy%Fa8P1m;w8HkppH;7)u6@twW3;^tpw?e3z!!&MAUiU!4Wb{nv|Pi zt>a+!F>Q~1ZDqof1cTys7|dzaDLQ&x`i_f#?L8clo+Q%We1zMar&Y)8x6AdfAVlz| z`Z05on-o_|BCr5hD{E^9qI&VEiiRvHkZI$gLpUQ|eK-I8@9-z>>!^F+87fDKo0O8m z&cgCub9J3AtcHJK6~$42qL!!dmx3Hse*V+`VGa|?P%|bx6X&0={31@>!4 z-%|Ggv(~g1#TA_2^b=Lyek_7AXf}|U6t13yWebkeRK&GqhUH!Z zt3?}qdfpd5-mGy{x8Xor-p)Xo{<_X&25nE#?C+c!!1o*>yn4hOmyz)p%IGYIQDtHi zevXoNdtQ(0xJyljj>m&$eb@Czj~!e8SY5-YgL`Pm=5OL8J9>;SrBKR zaU3)1pr8-9Qa3h6_tW8~wbhCPzG2%3Z-A!S-ObJS^S{02%%sn=jyZ9~QAA1MjXLjE z%3YDSJNb{4Y?kNPjoc1B1YD4?U3{3U+S>9iE~gRT%8n7@a0ijmXoV-4d{h0aWq|$G zUL@nLp)T0Jh4q_pQkw6(cNFM=5V&7b@|=zBTdaz@dKOqZXb!lGD)V;n3rG*kUAVBY z;CpslU`@h=UEKb}7HryVj%<3Rrm z^c;g8F@>N%+U-x~vEl=ajJ047WKn;RbK^AXuJMzbW@h@oDfZ8{J_rg5e_~UcZ6?og z;`#0ekN&WPPdzReLozCQ<33ETep0ZkqE<2Y`*G-DwXGbNX^r+YHTCtKZ+o{H_ zcRG2FjaqqX9YyWOWpH2r2Oh_J3}mitV0iH4(f+$HS(MC;|7fEe&K)0fO9OL-I%XN= zFb);F&YO8J3?IV?1YmxlY;8##pwgk$g~6RxMZblf&Ly(AiF`J;ajMxyl!a)b8+(q2 z1Q%G+(R&44qQ!aj&gm+7uS7$SE6q(7Ow3~bO&~k3OMz^M0Q zXeWvatNeZnz(-%YkA|_s8jv09RaYnHGJL%{jl6kSFIFekRxvfQP*3Q&?4?TvxYg}gC^=LIQfS8g^qGyQqLNOlamsP>-NI_>!XjTp)RP6W%HOk6O&18EWsgq}lp!U9=* zq2Ymb06zdtlet3wo%Nmv2*aN0uj@qx-&8H>fz_z}KYMR5zAZ0*seFFq3z8{o^^>k_ z74B>CMKJ)S7XfTGQ`2kYVVrj+=8zFz34-Gh{jst*%9-%qkRNb8Tlg;AzaU%z%U7A6 zv}?WYv|c|-6p3Ic7~)VLVA7ia%JgdqrJEw;S$P~6?f7_}$yao@)^D#4Vu*BC<+RA$ z9m%2Kua~324N2tln=onVp@tIbgbWCp^9*T;iG9#~eb7D8X6cD&uVcKtCeQ*B zJXc-E1x2tcGNRtwpRqM$2bN-V?V7WT%SYSG{I@DCg=TOQ2SKP+Kc;g-O|2Reu1Hua zz|s_rOfSS%AT_d^x=`lUEm1Ksk#-;XY94f&B(x7%36|~*#NLR}w+ 
zeqnxo+qRcjF1S4gCnkb4-B;U}fNVPhRYgyn7wNbL(KvwKANVjtHKgtKJBNf=f#$h~ z#kaEmu=2fRF2p3FSIOaVGKx(0N0mE6)}`grZ0il=w!txQ5P%Wf02O#&p8@zl12OON zqAZ_L$STF~D@#lZlOoTzHvlS#)E?$6t-Q#32C>gm4WH+-W-e|I+p#Njr9waE{p z61tC6_-}gQ3Ze&|%>#0F5Rw2r=u1g`u4Pp)FE z*qoFUt2Uf_)5GDnd5n}6P*Z|goIH85&GH5+KGcd3@iC#e5J3PxkUbd~<)M9@TjfmO z%1ys3fKwLRG4|~}8McZ8R)wQum!3IfwrSYI%}la`aIf1Vm-VypjO@V^jN8bEfqs+t zq2|uc9K94mSiJKHt=Fr?K3(3)`hddFDLR>!pvpSZztiDIoU(uo#U)fZ) zN+rI$EARU1in{by*f0f*6Tf^)`I4L|=h|5qCWVkQg z0?JqgSp@tcT`omkZ(tZg@qtBI1F+C?&IlrZkbgx*7xgAE7F$Z{gRG$Ln(WP+$4Rym zoG_=Xdv}zVLp+Eb$6K`NMtnrQF+_S~;wl<0i$;sqmX>M6Iyd45tA506k;MzJ8&*<{ z?eb`e5$}5n(s-tZYdAX~&b=9Uean`gU8jA}84lXyBm@;vffgdudSJf-|D^S|0}{g9 z(;tsu-_~%vkqu#oQq(fn@#oj|_sY*HDK#=RM8X11_GNq4h^*eRdKERbfw3{p^mEts z17#M*c^7`N<^L9H%Q*i*O7q!V`g{)`r7r7wDUt~idFx$1|4%?apXZ?ELo+_Txgg=tZ-GqEoz1u6eajUGXe&F&L7oVQ6 zd*MPs$FZYF`|z+BuDedXR%(PqC|t+&!T)g22Y}enN#Co|Ki_+@8~a-u_l$i{)}4CW zWMUr%_IQLTC1KEV=R3f|kX{ayn_F&ja+0H^RajDfY_I`|8wWx*b>?E9^MMwR;mVZnJQuILi;fQ1?1g~2so`sjaPp-U z3_r8iRDGwx9U=cl;K1O}5T+9KF&W`O;#Z%aJDM-WxQ#WtFM!wEYqsA{A~Tv_x_Mpm zsP2&5)I77({qNs)pEc0(MlJ<1zqI~7gWmTxlucyKV>QJXW<);{csK|vC*XAG9TeT< zJv>Ua6_=>Wb!oWFwr{W4+|#Pgm@Mk%|3m-Jp9)>aTfSOWyUG*=El;01MT0@O5KaZ; zTn}M~JAwqY=i{bH36L78i2Rq2nY}0?-`+2`niEM%p?ma#YIDjpHjd@h73-~(!B02@ zF;7em-^T1nY(wZ(g+U*0?@5rFgqMQuA5ssb46uHXKDQlv#>M%jCZqD3KOXC+iJl98=KA&QW!%E%tsJ7k0?Dk3YA6cUPvN`v3y z>i(Vk`#tCP$Im(UIrr!D;XPij>vdhv=i~Vp?f^7zMMqm1y#hc0mA^t(;{K%iNMvii z5t5;%x^O{BTbm;-t;bC3Kzu*ISn#FS>gv8^mNO+WbVQ_i;3pUU4zm&p7*CJ%h`c#h z`KusiH^ex09XAu=;)1~z`K<4Jpy;vSG22uf*cCyzM7*Za-w#i=qxJ{>y zP1fBC{ruJqW*VU`bA`D9$IDx4FMKwQn*ZgCShDLBsG*=v`g_pC)uc2yc4(`(dMX4PB(D=**v?uEo^2 z+qa*Eo<*$=#0DiR3S}Ei3Dl$S@2mzb&!HypBMS^t)6&yH(^=Tnz@?*V!O5{lFT$)a z)^6O!EEWwSprGRq9N7SQzsu&OAZ|c_h~m(;n~vL_|f8*Ig);d+q~bv!|yg z&?umxpiVak`Q@UZ`0&9B>S*Wl^Tps~p1#E%(Y)>epGx)9fq}4bbton#&yK5~zW=~M zFg#AQ2T2h21dFwg7t@P z5AWT-e?J*}Iz7{D`1v6WUu6MvurV-V5esdK^a1<;yN-c`0CyEzMwR!s6G?u*TQW~t zCjWfUvqMN zDo6U!UwPEPCc!)FwPuh|Kg5@0f0-K*C8HA)ttjdLU0;km@DLK41vI+J-TW(ttPx#Z 
zCD$$M$B}4fgZPEY_LER~(I`PX0`%V;T}|Du>HNCiv$c3yzX7e3f)c&_;qT-KYzNq0 zClVg#G+71Q*pTKjH^gtwTwniwww7Dy)11|!jNN3)!yA}IL=?S3*9dHsKwW6z&Vc2D z4~E{QWMhM^y}dASuJ6qC9ad5ynR}Qwa_r&SWEiK(n(gY!^Zjd4ZR(jL^92%aoqaUV9*#>~OAWfX4TMuR{7Ql!V;~O$H=7;{HFE($3zfjB3pqLZKzNGeM@|n&E)cux`w8mkeLhJ@?R%vxA%3WMoz|fs7HtC7LgA z`r&c@fpB49`-h>A8+a9#dOKGI!7sulYRNROZ^y%Xn9ryE`PHVYsv7ZmBKmXZ-ztWL z(9>GZHk{a&b?~TZ)(ZtG$TSl}88GquGKP^3uZYMEpu#Es{{9;q)HF43bXe!SXwB%M z{m~(QLd766cALIt_F2rE``|LDZU{T%%KqmxH$ z0PFAds>nGjBjc~tUPvx16*nEyGE-u#IX@V$m zR;k8mMXxl!KPiAXlDrGNPox#;6l|$qZTNX**1YW*gT~44Ul=uFKXwMQoLmbUqT~a9 z7BwG6CMHjVKH}z1xGj7d5BdB^&;+R}N76034Gq`-7UFoNp(c;$c7W*?Xd7!A%iLTy&EFxYr%(WQy*+M|P@ zpR5ZFbsMCLH|}VCVW7znduPUepwoI6uLy6CS#|HT2tEz)CMqvz+xT#6H2-~Jwbf>q zrSDei&4G3)OyI=ki?T1@#`3toP%%B7eS9sajLABj7g%XBzYk5TnJaH&qHR5q3kpul1Mb5r z>6rmky2f>tW+4%k-uX5Xx54~DZ-KP$QtxAC;?z``S)rkTuY!;In%=!z5E{NM|GtUU z6U+G9IcdthI)7*PRmaTAu2E#Q~ zWy;UPv;hG;VROSZ&2PB&u>ScoKE7LK@@5lPw3FW->G16|n8aGtK8xWn+2FJw9~T?j zR{6quNKB0v{OZx4t{Psnw`)WIxhf+PX5^Z0<@7X{w0^a8(FvMeb{lr9DQLXvQw6gO zW!eWICcxP~-yGZ&7k4W`#A?Ia;Zsky=u1}X?u^PcQv2vDEWsSJLr15?F<$6|tXIKT zy{H=z``FaGR__n(1>)i@|^ytxN1~GRdOfhr;?7wMQt8FXbIK(G& z_v1w9@s;_pG>g4HRa#GWz`qs)>hp!FS1MBn!csmvMlD_X;XLo`?#3{(uDiQg{oVnK zo5HaQSHzg#*DOq?Sn;p?biKQahA?n%J~zKup>Vabp&=P~-Jp>{wzgI2dX}cLvbwM$ z2Vq|KEa3i)cp{2FiN!I8nTr4V#LIesW5)%hoAJ@ajb_L~wVcVwsKl@KZpGB#F>6~+ znYGt8Gg?xA8EWTOv6e{IP8=X~^f{y@*bAh78aNqayakgsen%Z?UF zDVh(sEL1i1r_Pk4%mXLJ*y5F1v`RBy*t)>LAZ`(j=6Y$-qo2bfS>$$N+O;)bWP5&L z;`73MS?K9biBtA5OUI`cXtCxw^oR5nH6PlV^ENB2El&8(mQuP7ziLo5LED1PxwDms+t2u|x?!K8_UcIRawfuA<+qYH!})O0Ep(`5{>fpk!!$^ywH z0nP!qawvknsaoW;Ztlm$AG!S732J7r?x0`>_)B-(;sPBcaKt?d)i$O?!(i^=SfO50 zP*a`pP&5>Y>)UfH%HAFx)tUSvo?JSBng;KHLsop^tJXEp>|gwKO9>;ifit5Q>voQ%5a z^*W!m*w+Ya7bVXZ@$dE;y4OuCx2k4xOSe9jvJ- zImcvZ5;mFcU{abeJ!L<~{EUL=en<-pZ+aAUgn)*3AUzG*1p>(vgbvC%ct(!n;s`v5 z%jTC959UAGVm)f%jp>@3OxAN%fL8FI9t8s(z#gGifo}nH2Za?QCnxztuw|*Psp&2~ za|EOr@N#PJ&pi0_MNC-uJtm{v;NyYS#DS)n5Qe}RZ8@^jF9@1Vd?jlDwxl3~IDY}g 
zGKn}LF}w5B832MtVNx-3_eI=4P9e;nh&+PCh@o^C!|VgZVmfp(7!S5F>h2@vTwo*W z5~$G>&JIPBw>DJdX;xRaJhf}jO0@nQsFQOBr78CGGUO`}BLfVjF9NG1!&`I` zeWak2)Gc7)_i&LDA& zAz8o3fawb{b%7N`smuFA$(MiGU!&XE2~_$5N@?_0g9=k3;;LsdzFHkGQ5s(ZB&cR~ zHVaz;C<_=KNnx>7vL`JsFJp?j>r^8JK-+34s$e{VOtn-)5kSD0XM%a&*=|IE|0Jh{O@TUyFv{~}h{#p;axaIiVoFBvDEqY|7J z!72vl=v`huNMx^YeZRdjQmU>!030uLFJqP8OH{6VpEMyo$F?n)&|qpQ9pctqzq z7_+IV{y3$e3)$y03CTi?FY3Rj%guq)yFPEQ9`U5AlA-8)dINvV(^m4 zQ)?Mwi;V{oq=K_ymbD5y01M-j5c6t~hKZ6E(h)WwPx;a_kQC(S&m8}KL0^9>49{a|UJ2O){dklYSi4S2e`miC7`dTsv&O?vwv|A2r{cK7_F;Bo58B8d2--hOPGiKct zpX44teoRY8N3+szg!Ue;T{|P82*A0EK5r&7&8+Z1ei4rb1McJm`VwP)DM`t>q;D)0 zveA6Q;zy4L2k0H6ni&YwNO-0xeY&SO#r)=!6;RXnIA9#-nerJK{n$29V+VQ$pT3&e zhcgtYtXj|}o@)eS9M{TCZ9pfZyA9-gObhGBW5SukSR;=e!(>X;cuFHi#ULwx{~EpG z0Wc7VY#e9|o@n4c_fawcro{G!0C>Xg#vKKuOLTy!*qj&wjg5?=;^Q>{SYZ=}V)RFJ zg(UtO7Z$X5KmEzlFe4Lv{U%7VNZUxX!nrxgW%@2Fp{-U5W6P)!riN>X@C{pkqGOzl zi6CMQFl8n*{0BoT*gogUz!T?qGUfogY!&vO*!crvXTYS;6ytFLqp&1QOQw(p#TW1a zeCN{V7Y0C6hl1fJ67A5fgK#xkF`>Yxq3E!aZp#({gixv7#_;J?@ySQb;>T>|!}>Bj zXSl?)Sntyv5E8sm$^v*G12!2F0}`0bLX04G;sNa3FMY*HYi5$9(+WHo29p>`8LN(9 z#uOdhjMD2f&zmV5dbOtRj9V#^Wv{$HQN##BDiX?M6o2$O&4@tyKE&ORpZY0(|2J2tzTCUn0zN-GA(lOpt`|n~;{@!or(RJXP5#j(=9KdhN z(7z5V`z40yRYMZ9Lpf^BmYCIrQJ{Ugecwh$3K`}8#-8eh=g@oD;oGj4F zSH?Vml{KWc)+g+{t41_Nw3C>r@;FjY`v8iuASA02GIEt;3OH9 z_a4*jhV;?F_WQ;@BY<9T>|4|FjJ->w&PI&+{(VGJasf)M{=MAvc@T7p<|PxIyB-$_ zKMD8BZmH~HFZ>W7PH2MWNDX5B)s-KQkn~h~Y0t_r6Z@MN9cLfq9LhC>laot(vWZ4u z-M$NdJ-c_NT%`8(9eVJaNDE3!w`xSgrx*n*mdPRLdQkD8a)w7+ASy^`8(``{zCF%T zlpQda%1h7yCxWghe1G6iOsQ zH zjkB`wH~AA3*n<$1fS)LiJrMK*%vG<9T(iTUV%&E}?t$pRlh(iAHdqKvm)i#HdnfD; zHZY;T!)*$4I37{a=MRfC_HRg)YUz zl}A}~XJ^y5A=ifId0thOl$lLr;DZRTWMF^b?^1*8-jZ2&JOkuq@dba3_(Z;Vp@j|k zHAJ-_yG<74n{S$!C^^5gz?56_s^_$aP4QPqxzSJM78k>Q=9s0W4zy!f7FhKVZ($vz zf`5S>Gy(M(1}2(W3}EqN@1Ia9t0-wofY&KT%yck{(D+RvJy0YNxEIPaA_B#mgH7Pp z$?v4}VkPD7l9pOFKQ@-E`}aejNxUt2(z$#NtA?n#GGT-9XXfN;I*L{FY`fq`?QZEO zli2ZT>DR6&zR%mtb$~si5B>n}n0@x|unWJ$&W(gxF5-NF<_@spSJ*4utC~EjqjNJN 
z>}zoPK$zI0!NIPo;x;k+(#F}hUkB}))&1sGqv{T;QMXJV^YmO2_Kbts{8nb>Cs3CC zM2r;)T)`|CYWQ%hPSlW~RL?IiI!0OV_4MRnUV3x>^k>sjH>T|y_4Rplerw88TX*in zg93jVT()7)0So!x;7JgwGTHJy)*hKBGBj(0|}HN37a%59rx}MKUXs1_FTMwQ{HOk z;N)L)!M12KNIL;4NiggIza<1fG8ldSVfb6%HK(vR)T2FKHGC|HF0NbHKvXd>2|-62 z+ttqwKTmpKl3Nfgpm)oqhqt~9v;MLA9-CEv4o-#g%E^gTh2h;CC@ZNLtqyn zti9vbU|YoNjD1LTw~I2Inz*BCz<+VU?4yeFqrYCxb@NY-%{ufO;nN);EOON6q(6fz z*l+A7u=IpHVI(1V$>5CahR2JODev31Mh*UIW#N|6W4&J%1@s&)5D<7i0c@>1FFN=kMf#| z@dSh$Z=~DDHO!&RT5wJ-==$RzTCTlgZAmxW`;6d+?A0XQk0Nq$W?(M*!HH;~y*nT&%pjYMC}dvDU?`46py!s%deJI12^YV$WIjt*EfnSTad)d^WZ}jL z^(DlSI?(iDU&E<~!x(DCU8ukIV<=9h4GWI#CVQPk3P#}VfZN_X(s2vyMQA|4xf*9O zj+fW|RdEny3jx+iUvz`tHa&)rD(05WX)gcgs8Vjxw-r(mP3)C?DDfyV|t8 z4I`JHj~KvBfE|2Eh-%i|5!{uinj$HAy(5wPz>aHlEE;jZJgaL`(##9O!`}*vGnM95 zR=T|0pWc7|DCVdDXz-o!lo2CQY~K($Z^VPEr+3SU z1sfP<^ltsX-mJDbGCO}cdS?T#oX~pbYGuAH4kBAcAqo$TWMT6^62UM+As)b|fzAeo zm&yhP4B!?&h5+&8$;ehKaxP*IS|CbQR@Sp~#p)lowNdB8gCGbUn$Dz*r!!@^f6#?z zFPAi`C@wW_J=;mcMR_gzUB{{@g?rEH7VQB8Ij0y3n;z+or|cc+#(`?WMQLla1Ud50 z{c=)#Uzd>Wm8g45wYx(5;-T7Y+D>}%oH@Jw+j%;5ef`58j`fyeEP2~fyZsL>|L}H{ zm2WSgu5AA9Q||xj{-4@HCyMex#CG}I-F-gUGqH)wIObJP>$}xuI&<%P>IEKS5j@J^ zoQR4&J*~E=FaF{3y)Cq79e=$E*mba&;RBo8Zq7GJ*8Rg3oGssu_EaYR$W0td8=GtJ z{P1mHm?bggI^*xj(5E80ubs~+8uwIuNqKqrn{)zWXa0w|mArlLIfk;o?~WAI9yrFj zO-S2G^spY(FkfEs-Z#u)6x9c^drp4~$NI4+p_}p+H1l4>1vYU#3*qMjuHKdLG;9AO zJ`L7Q)DRV-=0dFk4r(*_k0^g|w?2Rkb4S90XiTx+Ha0a;LOKJKdYn(aFzJC^$+W*% zDys|SP0F_cydf;9jHA~sMER`g$T~$s8K=b3IJRxFgL(_)(}2?*wiW6DIR67 zUZ>cba9}sPI+Y@&hLUp6cYE!pY7|!}D;p_-7@R2c{1*k-p6ve1IpQI~Ir^3_EO)B> zNaBMf=Z9)Sx88UZTeV6)C-^b?fPU@K#B^XTO`;Vq>S0+OfpQ-JIKBE#AvtJ~7oT&$*q>!!f#N z{ZEQ4?Wb(?&!VnYA6#+xTdGxMp4?z7sim|fU2>1C-=nmMl41?_P${Kgs^0k2hs=DE z-I3XARF(9Y|2#C}cq7hmZFBGq>es#;H+a%suXDank+J)N@#Bw*Lf~Xzy+g&JeKqM)cWBWrWpg2| zwU-MhY3$ZZ1b$T#(pRQC<<#?oJ%;v(Vxtl-hxB2;nErzW4AFI|x=ep|Grste>O))o zBgiYnZfSk0{NWtm`wn#aMqf=fvj%dvZ7Q8wL!CE#PM_vRnMov6*Qobg zpkG)R{qc^wjpgH}>-?+KlDz3B1*v{hc(<^Qgs{$)CFs1SmF~Pa#rFRivrRTHN-yKaj>$ZalV 
zs(TyWpQVqblD_29uXL3mHcM|Pmeoa|WDToTAk*el_v{H)*GeLESwJiIsF9hbV%~&jXx%E#T_v5YX74)I3 zx;>OX_Jz<5y+#;3lilw@vrYp4WWFz%CcDbO+9)qPGc$o~k|&mnE5 z3+$&$iqDoCQqgEC=Ur8&Ra|5^(|B{Akm$Ne>Q6h@-MoCNDBxgufJlhP)l`Ym`@uhX zg!0#HIBhC;>kwDhrh_}zum^4FX^6aFGJZ$-(u3Crq$qmz>8<>YuD6Pac48V~xyb?;V2tpa(4_4K$qrOQ~xX_RuzOj9=5a#r`fe^s=zbF8&f&k1cG-aT6NHnOSoYW&{I5>b{>ha!8w{_wfNdWa@cwT^Du|GN^!p{F<2e3$2Z z(p@JufRq(29l8#&sfT zZ&c569pN20I|qcUz2vK_TJYF z=b?|C@5&q0MtxK!EqW@%>APKjQ10g5mZ1}(^Mn6TcY)GAEzX3_$v~+Gzhv+6vYr;& zEhyo<<+$tlUvC&c-Hil?G3J(n95Ziw)r@*PXA&AVMrQlWyy@NI-Xnj0#qxBIoL?s= z`{C^;Dx`kf7%^gIC#`| zcW2{?drhAyrnA|-Z~6~r+4psnYaBMBU4E!2S7=*f)t!*5XMD<7PV(r+IO{E_s8|1*?(&T=P) z?F!oMe|?uqu=17Ex*fxjHzWmk=!ZE~uY`%M@smEwnR=LSIB#t(r4YTJL<`NDd#Rmk ze10gK=(>1av)pPGnA^RU?P+Vxn*H27DvQt8aH{NJN?w2OaAVU}kDIrmwy4n#bEF;f z>0Z_B&~q^qbMW3A7%-Clj+*6a!?q+@`nQ(hn=5tbFPSgCSE5_*EI9HwR8G&^aP7_` zK}}b64lT=UqnH?-3mthEEOIQazIK8?3yr`^z0YAC;ZKHobDM1-wdYa5SpU* z=WkC8{I2M{|Nhe+jnLX7OgB>X=2zZWY*1YqHQJb=7|2au$Nv7MnDmN}MbjITy6wB3 zbL-QHF#cStoOS)e9c!Db4svHLntG0e>&Nx!-4GjIL0R^^adxGHc5D$LM(-jvYNg zyMgoWy;Muy2eqaxe9_|qPkxKFT%0!2o1tXi-Q^JMx+9g+iRLud-n=vw-iF;KyT;Ep zF|6s^aYfR(_tK#ZmOc56$HYC@c6)bu1#Z}-zNOjJ_1M+2UA+m)XM#iuS#umXo6>e| zOK81&g(E~SU>h}mQ)*d=kCmF1fN%!az#6I-LSb=2NpD7{lW8*#oox&FoWh;HZMJZ2 zan9Dc*^e@B*jxP0_WQWgKlH9y_hNIGCFPDzQx3lA?HfwAF~!`DZF#?+-_ZJa+rR`v zoTcij#NW{AclX#B1>akKe0r)Zf65s)E(cH<@opPyedvddjqIu*9xj zYV;w^*C$^qLqG?hGWkrZe^)?d+G0gm(~@Si;sNS;jQ#p zsKl@_Vd9oPC1tYIDS4GkhexDeSMLyGs9yAsifK`3_{;vvp{aYjt@fIuK3#E-N3>Ia zTno|r)X&B)cWWn2?R4WfwMBa4C&BRc$E#E^bk&z0kB`7)!IS&5(7G(Pvou8qM(wZN z*mQc&_Dx-KtT&8C4XI8Tw7wo`qWyK~*V^1PL}EzEbCsON~V+8=>v9{3DU)UG0ny$JkzX=?h$T4o*;2${28 zsT(B%xO%)uS{>XGNI1@jcBvICo*!K~2;2;wsYt-P_~dRcaD~f%mrMx6asRx3Z8Eht z?US1f!n!Y`lP$Bq;4hW|r?O$Zm4BlakeVIl8qisgNF#OiHnz8gV1m5HYm$8U)gh|! 
zN2W@qZ~2<%3LZNI(Fi$=4oIe@sjyy&JSY*~-F#@zWpmbjO-;>IjaFM5nj<38f956W z|Gq(5Y{ZGVUq&VDbbC>9zVq ztE=5-tzp}O7PUL;C=1aK-p|K`aXsu*KyVFu`eZ=w&bpHmk1Gcbk1u9Z)i3XLRQ9E8 zj>yj2V0EU|)Y5c)@uQoDW}4B(l2_K!xbKsU3|aJ`z8y=ygW*sgx~t-ymuF|!*8MK} z8ujBF<2=uf9gxfZboJQg%G=$|(_%Ykx%JG4*)3P8L;o@{rJvdz@&!z1Zw(uMS=o>; zD=X>7X71rP+WAKIceM8;L@v!oEtL^fxh|BZz@F75ebjYK;p+mn*KMmn1lE1Zb{AUd-g0$E1v}D5Y7bV zYKW{SQQH@$b8|YERVQXff~jmSQL7vb$?=h7^-VF=pSt+SbkHz=wMb~^w#BNlu5bg| zhKBDe?AAkjh;kPbMj{dd6CKO(;-C4au6wV4KJq2sO3Ea!?P5z@x*q>KvEUU-_H6^p z3!^huBdy)~o8~qIi#Drjs%1|Qr#DyuaLLK_<>i~hWtCu$lZzMK+5cElCiZ?^tfGIt zMM#u+YM7BR4G^8bQdV=BmpD^(Y#J^>X`y_UsAq4&V!|_|Rv-1$vYEh6GDS`UfHI;U0Apt!cp?cu>VnyUyGZ5^ zR)dZR@7dNf&n)N}dF4g6xp}e08y^N1`{>UC;J2?7aKjRW@DR+CyXe&-Cr5?%QkxtUP|6x z67MCK*40p}{>53QX|(CCq8HyTxj0_<0mj~TliHhzVb)f2Gfl#ueO=1ye*R%7MAxG( z53HJRWLpQzI3?in>_GA*X5z3lz7DMc=Ap!8Mf}7AU^`lg#2u6TGiV_oQnG>xg0E_e ze~H0C&qxPWs}Keyu*W+E?sGqT4D`SsFqq|m{t5GtlqC)w&a19wUxVsZqoNeV3D}Em zLHf*-)b6vN`U|%cOd2!5N$ZjwIt*#Qq3di9+YM=a0Or0#S_(B3($G!=g(e=vxIw$X zEx;wS=#*%1`9m#fXj>Un)|f_-Mmg3DNJY?1T1-UbsgArhG8S`W7A(FuLfRQoT~7 z9vuY8CE`ig!9ji^?t&ed>6v@b24RH5Eho1Wo>Fcq1{={>O`F8eW5_!?h0mN*SbH(+ zn5n5cVcP#%s{I)w*PYb-&QZR{%wQ|9QFw@~-?*{d8Ci&jG3_ulz3}8ET$Ugk-VmIw z{e2!|95Nt>uR$5YbVBf%F^0XfSl8iKCl_`5_BBx3`_2I1fn|$g2?is(E*I5A{`iug zdF4u}l(o#g$LoqIKnlU|qUJtpWj_Y0w%9r$HGA()YsZ%G)HN*2(`XkFI%3q+-=_^_ z9lntQly&w8dkWm4_Zp(>E(@M@)Yay{(zFJ^DG|;>tg+p>IsnikR0R%y-(3De_4y4e5toi4UFaBfvUhkrM)e1yEA$Xv_i9lOW z(?Kz;^P3G;SGu~OqK$Cx+s9F8>~LOM0*MWrFkVa3W*6aNu!}=bfu5eryr4s~wM-Ny zMaX$LtjiR0Mi3rH+6#o|0WcPCoAX!e+cQUqKVUrJEsQjM!Y&xpA^~CP-sjO#f3*ySj3NNSrjYUAs3gg}ckT%8r_eHud+oW*1mPNki@uz;Eyt zhceT_O7oYmou;@_e9dy&C#Po$V|J4uHy43H96OK(_~3Jc_Vb|qzzAZyb{&{BQM{88 zc{Exj<$xc85PmK%Tl}Gw)(m5fKoeek#gW z{Y~q~0FEuZBy3CG2nhC5V(RuP`}a>6Qc2WEKwA6hO;vGUc}b;vwrd?UPy1kI3*#ta zsr9-x_fFFDto#Q8;NM+tpMd!%aUX=jAWTs=U&hn|sTGMGU>?BW0(>pp`d#cC&^|%s zZfx{QS?!|-QL7Bhl$wgck_R+XkWex~D9QO74pxwBX>62z2Vq*i{^=d?75*!*^Xt6p zqLJSR(T$>B!Ve`U9S=Bn?Ninhl>xdyH1j(3GO1%YJMjKPPd(R 
zY8)%_>;qwlib-ty%a=4Q3ybjNoLYHV(Thof5T1HoOdYOeKeX&9jR4c3+`GrxS^^^< z4O~~p;XNQP-hj*l3f=F{$>A^f(;%ILM$POkJDfeR6O|Rg9E0RH;12hJ#S*k*k$(+`mp*P1%{U`c4VUDM?NO}q6HBk_;SxK*p2cNJ72)dAOWNo6okVfBpO^mlwhCV%7S^= zjI$FZ-UJd6%BDr|d@5h19;~&;czQ!_W#%16D{M2qumy$w8fL7`NEW^eow_yy9b3F8 zQ%rjc2~`0j6=N8f5`i#LMf@Cnc5ADBj3&OS#N!FZ`nUK(LEs@?zK0GU{_^jQp8&5X zZs9P~X@P4hVvNZKdE6w*o)L?pzsTY_s1*mX=OJ=t7`o+2!vst~+`6VW8Pl_6;nHCP ztNp}f0aIwQDdQxEX5Zpf_*I-alo;fI0*wJZ)O?}>_BX>{EMxAe12hyw0C?5Bsa1ea zDF=Bs@e0kWMr!~2C)=pCTqNuYTi%mSPK8DY4k2^%LK*dF z6?{SBC=5E=)ReHDwPP!I*ZOdl%*n~=_>Rhw$ReGcXNBkRThbshr-FNr$rPTQf)u&L z4otfLak|;riKooeLGb{~;z@9w!MHqz!BI|*;JL~nwFXR)Z^gv8zjA*qb`l1q*v#O$ zmBe?_)>Z&#s8@{c$6*bGHQ+;ng;`z?ehHa7gWW>R&~_t}qqAsY2fAHy&srWEBA@*azc7zlh8I4@+l53#I7!#wkHMwoMnKtQs1+**R)$Yjmg}E>z3>i2nz!e}oS*R-Vec{ksR(W|0&nii}1Lq4(A;?0+w~Mf1 ze~8B|(~!@K9Rd^QNIc{--?6j7ut*#i6^^smyJ0181hidBLO(%;y|%^u6{G;kU}if| zcM|^8AMnp{V8LySgl~8cyI${5@>PI-b&|9pa4*CqfHCG&Z!Y7v$(K(%IwHUf{Pz}a zZUqbbVJ-_=6O}*l-UYAObOJ@#7%byUbtpmPW0p;c8_2+Y6(8}uW zBcW4}pRyqNdZr;Wlh5+ol*c)txM?9r!f zw~0mp{%Hnyyx^D!5iR1lTv4`wU!V@E3;Cpu@>KMQ^dp8WNW4A;4i*A3wqea+KSq-_ zhMt2sMnmpPeCSZ&+r*tme(%46yvp$7PCU`5fr+&Tk*4AldpHMf5pjl4&gh1_C`>l# z!Nrlp;+B<{Uw}M)0n_>S2s&&!HHwRB2lF08kCxwzy0aac1E^^sn~X9tc{SxR4Dka} zJ9bo>b$~SR6PC#Z3!CAI2R0HO2|Db%#P$X5B2~|JpLw#0#L^k$Ix;=H`=W+;0pWmx z!gU;B6O}rem|v27KH?@*{}hXK0(XH>1@WNa`oau1uaDmEbLmr7sZ$h`0>Zm@n}`StM-iu{DfBPMrvQ>9Xfn~FsKcJ`scC?NK|tLc&fa zs92hc_jGy^MLZyy5JX(};#?r4TGD4Z(Q6LzW&^j~0^SV-3NjoqHqVz6Q14E4;aQ z$6e{eh#-x|GP8ecVrnX+oxJb(CAha|;p`TcmeyyNJ$tJH`CDXBfza^^?h#q&@72ko zf^Sn6$^%U(-G&1^C%zrf!lYe)QKgE@Jn*yLmhys#hE!mE~0#;nCf zkKBh3%`QQglLf(yc3x4@ff?381{}5;NMMRsgawK^2%-l1`caD@&>aOL5sk}?G z8VW#q_@GqdA``D0=rukg2NHo?NFuz6QVFLj$;_&#s9=uyg4jl2y>Li2M7x7-lo(IJ zmAXab9a^bRgXMzQ0dam1odh;PNJ|JZ3LpBbh{c6Mi}*unz_Si6%Gg`5#N+q-$fFd2 zM2}FP5Gs7`-b7UCK@^;L;{0Qb*KMYzBJOSQPe)n&{>zsR1TMjG1R1(9csg)4lb4+T z{#^imTi9m5VflgEPw1S$ch5k_jc-gmYJpn~Hu(93wS%0DgV0*XW)+4c%pgMQ~bTqdA9szXg zM9KWpJk9a3m2NC|OaPn@xm;hMoo+QGQ`F 
zh_m=;sKnuT*aG1ew(X(c51u6{p*@HBP)_eLd~YOf6Kr(P-n~6QuSsoyK8NCUjk_Rz zJsa65xPT;I4DJWO{}jo>#-Kg-7)7MsIK++=N`Q0_Gh^?US%A?~Ah&RsnC4Jq`pJ2X+TNVrQ@;+9BcjrSSMk>Qr z?mwDgJeJOH-$7gj8ak|3#RW26!~>%m15Qljs{&F|UA8Ih)*tYw%Kz8{Lj;s%H*Va3 z_B$;v??y<-`k576U%*W_|C4G7y%k_I3e<+6>qpr%CeJB z)V;K2ujjN1ta}m812h18nXs13b5rA5Kt(=6UjjJIm+smGfT|^s-2f!EPd|87(PQJ2o5emTHh&KzlPG2FOcX<=sskuIdKI)|B9&BA^bg`e4s9 zJ1JIt^v8XDOR)=hqAJ;!ttGZ(bcb}Gtx{)^>I1rBAnkO)D;Gnu;S)H!0y#1muO#XnfQW$&)_m`yqpjta!=y`Fx9t`J->i| zQOJ+1L-UYR|J1CI8C@31%2Pxyj(^vGz!~})%q_`83B?p7UMEZpUK$-h`;zagM7AG^ zo4CZVdnk}DM4+~vogJ>ff6Sb#d(v5O_W>4%qp~6%S};xGS_pLKc@L<#qx zfy3Hbr?vHa&+X4+&?uFf{xhV3mJIFi^0&zzoS;3O$(9-m?QH=2VWmfuRY2^p#ujk+ zo0+WGSWC!>7v`hC0@_X7-H74>;Nk=}gL=pIFQ3!G+RZN&W%>ZD0H{6zRiYqNNhDBV zufsw^zeI{-$paqD=|KHekGAeGhsz=~Nq{`ec3qc8$ih`r3Rqh>9o~G-FDz8e?@NAl zaMkU!O#Dxj$fmGHCtD|`wH2-!XK%l>DuuEb_XC|2EI4&VQ}C2~UNa~(1^`;P>mWM6 z{yDYmCHRm7S5VANQB+Vssc;hn`;6`+kI27&|3YVpH#=dH2UOr+D2(Fvfr{taxDUiz z9h)uqv2WOm{REZ)gBIG*>F}rE$EX2SV(l|QcURe3CgexVZ1TW5gCSRc94iJX-Dg!Y zx=rm06X*Wn8$3W)uB`rkhXs{<;R7vv-)wnP0D0o6nzsC|yUeFi_DoRYl|sAVk9pDS zA1}Z4ja077*Y%EEy0jlRb1bor4#y%hk%H7AG|8;wJm|yJwbt5_3a$pZ&Q*n0-b?Q~K1q2Z+raxeq9q+QTDL6V0^ zwh3|fgOAgj>WJLF>6qll(7@m>lXF-~iXQtkHd@DVGMWN9&d7KsB4DjPjkvUQsp>>b z%^tiFaq8CI34EsN$)YDb_;K<%>`TO12rGepAKz^0vaFxzSt*I{fRoj6N*o#hv=uZO zKHhC0$Lz_I=RL)D#cSc)noi8Xm;eGXV;hJMLjWQrRy)V{Ya@yRzO1MUXC43&P4r0u zD;=4&T+?zNCnmPSi>F_@j9*ApbPVmt88|#c!@rtomk%_d|3ZCF`yF&do~I#DBS9ia zDm4@_`p*soUP2TwH|=P{l!n>Sa6`Ia>e-C&hFw+#mo4}dxx9Efs=09OGj zZWM}`^tu3@K6>erA`WE|c#7=?GlzC`izFo#lN=;>{j=mn%>bvW6PaJ|K%$Q%j%)4h z#QF6j6#3)$pEW=b<7E_+#B~8@576}|X4kvmpNPrBH|$z57;ZsFi@7SVy!>s<2I;qM zRfZG_4h=cDx`6%55VnBoZiqRtGP-6DmM8h>uwt6TYD7W3(4%{huq%MsLB^MZR)++8 zZDVG3gJ?|N@pWSmEhB9EZDEg!>hv2x7}RSdm4SqcLLFou^?my$YGRLp=jk;{@RNQ^ zag~whp{9;dBay*_Fykg)d1B(9C>OT|ZiYmy1*wY-@p<01?bhA9Z7|+w!Y4w%cp7g7 z+_4OP;F$TmMoTOwTKXbR847a@go*1oUMmW~DY|6>2ck9rvPcJ|GBSW6VJ3|c2W+Py z;bA5LL5u$hiVVr;I`d@=EnDTYD#t+?xb1;_aT*T_v}Wv*l9H&f5t?AU 
zOrUUI-?|82{3-%1|Mu^r!Rd|>Rr5%d&PNVnuYsNn(Kh-h+0kweqBVl=^qNz!=|HcH zA+0ljjAdXi~J^EXbt>W}R4Zg*P^Z#6KfmGT6PLX~wJ%$rLXYpWlm$YQiJv z=xH*A`Wm5q_dQcj)_REp=h!b2w*jNA@%9+B+23uvRVljz0{1SNKmSVBBa!y@kpyoxiwU_m=UHdMj=V?~Is%;Zz_ zO8@edfZl>MROT!R2NBF}59joto(TyF8U0u2ydW`AF!VtbS_{0#F;;Z}Vu-1`N5Axf zTMYEM1ZjKbNAlGm2qeBdBufy7C=OW=FUAQB{;bO8IWw{Q&i92c_9Q;bgNK30=lcO) z<`I#1;#I>iBWL{Vv&%-O9gB;2#-RFL100~-`5pRM;({48^BMP-2<-}I%W$%hI6la- zaY3=A*pS$i|G>OA#o$RwLezQdbdxs;^#e$WfhWpZI6nqq z)`Laj>gI-JnGEGF^=QGF@d*q^3Bjfe$K$ctRiIF~ALLcw#Ar5A3l%i6Lnv7A12c+a zj|&CjUeO~Eu_x~K+910ZXiyNh^wg>pE#tqWM$D3I;-#UFK zU7UddrkmI?-&jw};0P2Hgkb{a%=iNk>(WGLaR@S~K=x&2y-*c21xFS3FMXpmpK>|B3I1AV^P?_?^(rVbOLzR8ZrIHsgqQ35Ti(p zoN$bg$>5;(1L-n3d=RJ~ITbzj2y<%#Kd$s`Ke#zcR29JBu8ztaUVQkhwghxQ0Q4X= z550NwKyJ(6;FGp~B~UU*bm!;S^L%&yd=#5kyf+)ZAa{qwkyiJ}|1()_lm7n>Ux|3_ zfBk6XtH_($V4C^gA2a^4CK2BM_0uB~wb>ZBEBT3EwVSJZoN@o*oa z18!|TJu=xI^ZE1ai+k;F2i<2CCcRM;@$sGGi+(JEjqtxe`n*vb#;BqG`xD9@I$olQ zMA!fR^s4h+H$tk+e}B@|coYgyEd0-p97Fr*I5bE9{kb2dz9l~>D5!mp_bdm1t?V7w zf>bKqel6NdSk{?l{U#18z>KpG$o+fcP>~~UrVc^}|1L5Ts6F>OH@EA_WkCMm#$}Oz z(k~>FJ?s+=nm^b;rh}AO%kSKOK7i59@sfVMl|SEf;Zt;W`n0Ir*Cr$u! 
z!WLX(4x|MW>TvuT1u7d1(J5dIh5zVQ111cIwG7+$@89pn$0NM`I?^6TMs}da#S0=+ zAzn)j6`orShmVgBeh;Dp zNQq6*fQVAI=-eqdh{s%J`^n;GBia|1rn(}yt?L53`F0$ z1&{Q#nD}I%KtqE!IzHZlPG1X+Cb2_7jyuvk+?E$?NHQwow{8F!%9XsMBCd>_y6xMa z1Fi@ABnPT>z$p|Ukm2=#Z^oh4Ll1#5=KiyDmf%F+L6+YgukLBZGH0O8oWi^=0B)nl-7o6Y8X&gd?yNQ zQpDevaAyF~lV6AbIIwe4iUNVcuxazz;c98rgz)i7?K_83_B{^l4A@P|o*zlt$RYs^ zH$6%+JXs_*3}cp3;v>5>V>kVrU;ii>CSYR70lPC|tcDLkmI|O6^s;rx6@+`kHJl=# zgSP`3=)iH*Ml41E9|I(&9EA%bF-*f>#{h^&X_9-QN`jf08D(2lvou_`NERa=2YllA zUkvK|)`A2=)}su*6BZGMI*R0W;Iz)(k$`8g9G8>=B{uO}ro4;MX6|*P{3woo_D1}R zW#C`&2mBO)5|VQh$0C_&sAasz?V3bWOl<1#0NTDx)<$=Nh4yr0BpHSk5nwftLIchQ z7eNIdn||=0W3UG>Fm_oxJD=3qa)>jo z#-={`Em#cOOb+PUjUEJc*8pvk1%ALJP6S<$bc;GfuDZdC@g~DtSXewsaj_!)?VYa_ z#px5f*4Th-oa(xq252VO6fc^eF2(k5UXmP~I)E`N9sB{7AyC0ix-Yr3teR)O;bW!T z`t|FDAuSuyjKI!Gv@6&FCQNkbwhnPtDDOx5n?)L+YZQ+O&iyOsG6>>rA*IB1kTng_ zGMHuo6Y_aVGdz~f_V zG&eUle{gE5ELYdWrb~dO0f`l$_vzE83ZZ!&Lgry2|B&m#CIpl_1H%mTk^QV%wMtBd zhlcKgtf6K_p@FUu_#y4a4f{1~=Ak&?`vWG`qG;riB_&DsiHHN3zj*N?UYQ!x%Nm+N zMds~$C)MHye1t0MNkcQw>t5PTLLS#uF}y%5$*!zC$r~LvZX8`NOUr(*7jZf|dL)+1 z7>_~fLjw`AcmNaB)uUMP$3IL|R~J4W{Dbkg{&{v9G%(t~X=lXQ=YP3{y0yWED$0PrH;bJSon2{v_Y-OPSz*{e=sf`Y@MAA zSFavTrXuf=sc2MZ#hIG7UhN<-B&7U26k=k>phOkT?fEuihq)&%OIs(*S49_s?yKYK ziX`EB-GQ|X*n?mcKK?%WLw{d?WF-AiC-Oeo-t!8(3kyrjt4Muj?;jj+^X9Y;SjYnk_hhYuZoe5oUze*tEzhGsxs zA=YrS_*E?6Jvr5LzW{jMZvE)Oikxk~o8}M+AN|qpX^NjPV=| z3o2UNYl2{t=uSqlLpig(ZP0(nXTMIS(BX;61?*E&YVlol3V$#yRY_VLp-s-A_Ca>b;*uHP!Y)Edk+ z-NQdoZU-$?v|Z-#x3KLz3cE{w!RlntzgdtO6K)cEG5q2j5e`8(NevzP7zthQFEw@b z8?{r$_~{NSHJxWVh_d?ahU*gXwUZ}Lo;`|v!-s?+ISj6_Hx-&I+R@p7alHJb)rC80 zq9Hu+#ZAb3LtRl zO)rfOWSop|MTY4Aghh@pKJP%DzfXLlPgq5dM+N>F49dOj_ZBhV)PJ1CZQ6JX{wzJ+ zNYQLNJ~}DjnGW#dB2rEyi{juEOl!fojH0!*wP0kB!X7~fwV&v6{O6asB@>fBoNMZ5bipJ?TQqa0^C)1vuT)cDdO;SpP$0>{F3#}e7gSmxGDfW$#UWx zsb9z%?otQ;07l;m6&D#u`cx!Ohx-AHBH zOk2+0{L?zeC>CM&?(*r`!zroIUqNNuI261SViG%vi%Vn6ny2|FpsBG?D@gf-BC%rs;^&Cz5erMIw&P21%(a-ThXq{1h4^H#bg7-Aw2X)vd-uy3^SnheiWm*@RYExd=3qo{A)w(8ItB`0g} 
zp~PMQ(V_`i1g4$u4Tl-+!$!1QuYr?d52kM%Rbzm1E8*ZJX9tIkr*<|c_xRAhV8^$X zmPFff4zT@_&S3^4RZ-NJEBY- zJ#HL|ZJ7>BmXu38bKKr^Eqd|jbJ8uo2QTdmxRRRc2*3+A^Ed1bqN@kI92P2f(yLfP z4b#eXC14Gw#{v;99I(ZOFau=t2LI#Ny)Sb^1Wv&M;Kv2WhISqbol0h8SDID)e{7c4Ynm0a*I4R!T9`S}-io`vW!sqW%t zXSX9g{l@|utJ4qp$(*dsZyWSRT%cRb6<*BjcezLKrl!k&@IaV!t#EbS&p{;O+9Qzg zW-T9s-1c^MS-?9T?FMMj3v!S+^(!kAbrSgRmNBBHGB@p37% zJ<-v18}})A8vh!f+#_`JlL^#pDF?>gvz<`6sqg2CbXNz59JCh$Ca)bpbGP>8ty8FW zcOWo5{_~ss=n?+OT_$Y%y}1M1;Jeb|3XO?a(v8;FR2h_VGF?y&ia#-BNYT&)VJWtD z5zW~4q)i+88h2gXri#!%2Au$Dkdt*uX{F;tJ8bgu`qpv_yJs&SsUq7RuTz0;s z?WIduSs9ILVafUNMm^S4UDpr~f=gMFXfu?%M513XZCv*N<@0oQd_GVvJ<2k1IHycG zPWkq-X*vxmjzaQ+T7-eu0c7O`k^Lq=!QR^>qNdX>5ReYE&SFc8n6qT(pQwWe3^9u@ z_2z#QOQvb!`XdAo98y9;LgBk_pFh7sWI0S-y?AuMa(O!^Vgu<e8s$X zYEqJeiPmh>ZjO1(gwOKSKAW)@3LHQ|XR>cNo*Q0Bo+HOOVh=xv( z9ncaYo%1G7h|~LkHcGp=WY@vg)c$uKJ-W2h4z2eOnn{5*UA}sC1@Ye-F(rcJ(`4`3 z2I?s)i>J??%}b7#N)8k|(Ux9YRmkQ#01%ajZUO!~d7Z}y6>5wYiqvBwIY&1e7Pc20 zF2DNPe>=#JQPUD$VrZsVpL2+9<#e8(;YGEy22e&)=m&pjA!rkvxu@AjJI3JCRuUI_ z#q+Xt;(783z)xhUQ{0k7c@E<&rBI5#b@WiG+XwdS!r%tl%Z!E7taNLvN4xo?!a|9O zR5lQ8M(vEWC5D7X|9KfNZz$Ne=+}=NIf9@v*CO2Vh1VzQH&NR#qalzFVRFsnB6tiL zj|b3YS4SE)lu%bLU%qWSK_R?=eb`&yK0ln-votn$vAOxGiw}Q~7YNvi6ZR7Zj2=Cj zBK1k^_2vnKVjW(`5X?sh24Ik^42I6x4Y@R&((f zoTOicso(b2MHKn;vJ;O%g`by**hoYNX;2P(KPBtFGkEx<7vnFLK0cVHexcNE8u^Y3 zn_0b{#zRTqD_D2}c!*m}ojwGYKz%*E-G{r3Zo2)Mevx0|Q}vzNN}Jq{x-^BYFH<^8YJM^}|WcgOgiNOr@*3x4qNS3kQNt){Q18yQ1o z8N|4X`~I!Cn16Xa57;=!$sN?H2Fb?IQ43GLqWcxP#6uUZ-R-pFU!(FRX$NjxzYcjl zk7K6o^K+8G4JonTkQeMBolj2pv9#QLc(X0zU8BSDCDsoXZ6rg)e`Ab}juY4UmAOH3 zz!8+r1|+;_uLS{JchUy|*9Lu^iYNAea#gXkmzqad8(>-)? zLfI}9{On;;o6dwx9jUAuB1l{y!rGwiU`MPuIRNB}W}7B!8Xf!7l5zuH6B+YpliRMW z+w^DX>0YB6UP<2NIjJ#<*4Wr6Ry|M{D-zdXR8M%lm6A%$si?S~?XxVZ@G~Gn5t#+5 zMU=%vkJl>YGYe5!77usWd#&+I3@WeU37P+Crph1I#g(b)m0rM

tUAQ#;~1?83RXk_l*-A5m3`+YGVOwI5*VB?SxYHHaHMiV*gxx~epa)g&eu&Z@dO;-G0`YCO^#sD<01)mI%;!nQTZMj;?x^4{ z6d7aOV7Zrk-$I?T`qeG&nZb%*>lmxj~KoA%&;-s z!EY%ChUn^^zTSETcxK*-=ZdW#U7rm*-#B%U{-D7}cguAcIw);M4>?2qZi%)fz6VX` z$=$NcRJ#~fUN6~muKvLere8|sX0%lvJX!gC^k6Ew<7 zrmd3WKuz87;g~^mvL4Vm1`nYdp5`}e`MSS$yPdm*g!Bl#r}yq*8SyoZyzxoT%*>4QSQ{`7Ta(-VsIEQxoCQZtEAttU1}w`u5BF76bqYo=3p2TpPxd0&}N^ z&xP_|XWb@CdH@GAeJH(#Sz6g~@SQ8%>7*IgzO;|?LlGo6u-qexM{eHyJ*)0Zj)ee} z2t^QyqN7L0iw_fOYEJQ@^qs9y15JIHQ81fwh0@w7?b$MVS&;+^PNt{RrH&>+XVd5Y zqTxjO55yWVKbkDj^lJI2Q_qwxbU(10vS}=-^Ki*8m1J$H#Yc`L*gvAJ_d2xI&#hI* zy;38E6Hb~vWcy`a+%sgHuaRvpXS^F>#aP88I&?8Zc{=gT=YTIUp9TirUm+xLk8&f2 zzN(6fiV|h9(b;}p0E(2Q;0?1}?mgPEQ|q|00P$m3sZumOiq^9<^aZqZ!W#i*Rj z`<=A#?fMBqC_{rB+t|Uz9i|(pZTYh2d0egF&QS6f%R7YF#@~JUl7T<>j3>$OnVKoT zbla{yd)f)T1!bb3SjPU-d~ezE>StPVOWD2Z^#Ts(ZCkH6QpCLgI2gDk^n{B#-MSbZ zSYLmsyuyxdnT_hE*6fiou6BW@l9D$cMZ+S|A!{I2n$Z8y3HE_np3{9O7t&ve!Jc?ZDP#V#>qub_6kmbD(nxi#vf<}^G$xu6|H}#^S3xW zK%W_D+z zKy)K9_4}iH_x2-XL@cw5#MoKaOeyvmU9d?vFyMWxci6U<$MPr6oH^4qCV%~ul5+R> zd7-BTi3yr}>vgLDt)Z+qW;;O;V&)Jek(O33D%GBy+R=;vz$J~@H$rKBA*EqG!k<1$ z(e(6!XW7|Ow^Ynz72OaTATiqmnvZ2S95)95V0838b*1%|!bXZ^sNdh6P4y16>vuVe zy+K4ka^3)i%P8vgmpinR6lvmGD@vxxcS}_hE7;RB^>r(vDWM!`Z*3H9zW&!&d(>_3 zF^F+&(M$Sv%2etq1vn=r=xUWDwl;*e+5whvKq&&8vE~;|lkR&;TM!q~pEAh3Vzx6*ch2{(Cm0LoA4>S>_S?kri#fa%FCUefA<8 zOKegYz{AuJ7!O_yD(cwHe{W@I+SyMZKRQ2ArOyJONMv9~)a)~PU`caTPvWzS|Mv{1 zBmZ|bbxBT0!2Oyxck45hO&dSI5K~HdI?}v=A^bM9ciKO`8V#JUP+~9Jr3=HNeq8ey z(0ZaMAir9;xDYocVJ(LK5yiI8?u$l-pXH;?^MGQV<3%pynIGx=D>G|D!&*s!LvQ_^Qtte#3*)}JHr#idCO50N$tZ=KYQtX`Iygjz4=q3QEcHY{fB*q` zanDM!d#_r#grRj5Y@^c>M1R;0&s6|MN!x!plSrCZL=XGgY;?#Zf3PqQ6; z%QbmvWcP6jNAp{4FBiA8Z2GaW^!HIy7scYZ>)(Er{yt&K-@#O$fGLv^2@f(W&7Au|0ekQa3$i5Rjar=E9 zlx5))W7%%s$jJi^`Gqf^zIKyzvV!_jx}Y#?Z4GVtveeXF$;k<8v{tQ3SU2SKl&)2G zrx{A=wv7AL>>X_NpeiylbIuflRne7|z2$v;J8yU|abeE1nul4b5;*e@5m2^P_4(B{ zmX`PKd3{~G`LE~KhM16Ig#n}bPtjWVBz=CzX?ykOAJf=jX?gU1n1_pftd-SV^S1>B z!Ox=p?l&6PofA)-Ug)}Uj!V!52_^(NIe+2+6>3R5Fp%vWlK`)DMgL2~&d!$Pj-oM> 
zd2Y+A2R22Gp`ooU0nxT8a%o|?`|E32%eiz@G*TRy2dI=J#n*Xc(( zJ){X<`$<7cdRn~g+P58~q`b}y*!H8>Rp+5P{;ywWx#Y@djOmtpYx^WazhCWox$@q`iJhJ+^fj2;w*Qnao}L=(&#%1|y2HDxp2VYb=Bg+i zxQF$foNWf!w^iZVc@)07i=xGw_lxeU#s8{&@01g1Hf+U;pko11gU6ZgIdY`;o~ho6 zi8Ai)TEw~B$^E-)wsvatl#TwdI&Ic6_LOP&vq zMDWX-zwI35&5Vpj5(S1LWxcyQg^C~mj^DqRpFd6YX{(>bO!RcfY>)#Zd|<`E1Fl#UU;N?v#!m{= z=%XH5bX#S`&q?CV?)Z2&zW{#)Kf6E(PCHlE*@8{_^J>*N1@Q|bo8oeA|9qz5`4LVD z7h(vuA~y`BP{OjDPAKJw| z0a{aQ%Tf!!_Nr=oE=GLs^4Q$o-uw8)`5n9E1@*bU)?okCpC3o)oE-D!MyHN)0>fgEe;EyuetBYLK(v(fxxT66R|u3RZOSEVPK zC*eV%wlp?2#s*}!awRyL0et|^0=xwca9p>zxQeZYB8Y)!=>Y4nw;FYG?4m?(+AG#4 zpddGGJz|7#4D0Ewzo$TzI`3DWg5raI?6N_wiV}r4W<}%3`F)k56#UwsG<674RvSHf zIKpL+aOlVPr7`l{z@$)=CMX^-^rpdBpZe$UtP<sW%H^o&IVNuhM;i1mgQiCpeV3})rC`QN z2Zu!DSO8o(W~0*>TPiS-ZT0tui>wNSd%}bYd6dN`h_Xfa0+<96j_bj1i~YHCQv1Gs zySrzcg_>Hp?e}L|;TB2HEO_Ti5my`u3!7`(#qnW>!_I|gzur~!p0@SVsmi16r901# ziWqY1oAHC?dDU6pF0I?p^IU|ydIx`Vx%vCjf3-C}(z|wHzqSVqFM4We_IT(}nQqBR<~{voZ1t%{v1g=G8nO5+)uYSA0%lA z@@HXD*-WYeXU#VLN9-LAVPL3CPMRDcmIbe$00hyw_sB4I6(hc4qUX#ql=8ybMbnbQ za&hba{^t7XP8>$|K)pgvC2T0H!6bMUksyGrOe|i0awQ#|pw{2%t;<1q`QYhS7aE@0 z-9GW25G|USW&g=d7(|f9bQB=C;7{8RbH}D3^trrPz(~CPcI>F2Q>VH!lLpR=Bmg(_ zeG8*^zg@Ap#Y1OPmmN**>KocguW#KkxYtiV?cEpOmoLrs9o@+zqI6pS3H5IFj5gVQ zY9aAaH%f0jitK!=@$6g|XIJ&}J%0brzBup4_J^hv2!r~Zhd$?f{`q5WXlNd0uA0j4 z$Du|N9pAYC4I|<#-@&DYz)nCLGiS~EqI!wC-!P&_k3AQ^W%^_#e|&6o-D1#wG2>&# z44>^U8pS;`VZsEh$Bvf{`1v_t{{ctu_+LBp1m2C@7Oq!sA0OE6Tu%DwirY^JS@P!> z6O2n6_dRRNv7%2!qZE>2I{op74G=@~&CPdD4cCu0Cs{?9Rsa~5+}Lmc+$zy7vn{Ts zJu`=86Hf8yIA` zc-IG`qenn;4TAchquJjjX2zAl*b%rc->y#`+%M`BHcDZhr+cEL3KxPYI(L|fE^@=F zS*a!vrQlDfs7}$Ifj{QL{)F(844kTP$9{m`HXU|I4Nz7UVL5xNQ6JrE`T1Py++~E4 zRBMBd-xpd|l*o6}I}JsWNLBWlb9Hrz@A|*G{q(xhPztWjW(oD0u ze|&1(h4h^d%0_87JCC(`pip#UnWMB%`;K|nB|^q$rvDv!UbS2rJOW}&7{HXfcYOEm z9d{L%`(g6W5O)Cqa^imd{{6gVye$KMUybX#ZuFtPL(c1l1ZDhb(%3jM)^^nCs-Q7h zQkmbXvb5Y>9eN0XHXjOx@gd3sD@-;3bmdjcg$o6mz=gxi!*Gl$K`p)1H~yQ36U;$f zWZ&Y;CPWM~$JDeFfCF$ImmbhqrT_GSgO!!S=CF 
zaj5`??eXaiyJ}wCON#TG^tS*RL~S<{T1?^b;0d8onIWH%U%8gvm%O zSg?SEz8N%X3qk|-@84ycwgqF>XyLKr%`91N!=4sK%i8$&UbgWUFbevn%ZN*<`R6B> zpo9(U*JpuzK>iom_gd9mL96Bg4;&1+P1eX3PQfOGLz_tz;oC(fC3O#;xl--|#VWn| zvCz<^bvqgw8icw+5ZGnfy%)cdJb7}e`Ik>At+RdBRM>7_Se%|N_pP7a5~p6pM<1T> z_aAil`_Jc7oj)fzOkGiLOM5~Vr~iK@IHH`4l>7+}8_jL7W`985iX;WiLQ0h%>Zcw@ z$rgk(ZfZ_6R5DCHGN6Z2{FzBUfq{dbnOj0D;>>@0=Rgm?ziZ=WhnD;!GlW(jn^|9N z*Z#m^8SL1~0*edU6(0tsQ&eI@b3o~^w{NR-U+`}Rlncogn$0BpwuX`OMR6gbI4~)S z;kmCph4u;VHj}_AG5EE-JP;IFV8kG>8e_*!9d?{mBC|8)pZ|j#1PFfTl569sOu);f ziqCPRLP$k|1>pl29I5KXaBy51FX*vL-T>r=)$}s zAJ8JGOqi#7QpBK&!S+Zp6#Dh+M{8TVTA0l_Bc^i|^h>*Aa=wY<>kgGhzsV2Eof zErL#U`m{vBr_wIdUba*7q=i3SsT)PN`17L`0#?Cogl|8zZ=W#-#m`JgoAHpJKQ>VQZnLwctqgF3C+LHS^N6`IB)IisF0Tu zeKD<9QP7m7qrUQTvc{DCne7M0yrIU4k&ZlKjwT%inK)E&Y5rf4`U7J5`s2r8rFDAR zJd%w_srkwTk}7i9=eSdj9h1z?JynUas{Q#e(X_s@=bXpSQPF&c(ALj`NJes0qt)MKI}ZwJkCxr zBl-OKo5jVw$nE_Pj%C#Nw;Lu42$D%G(be15;hpz5$E2TXPVdZ2S4W1(@a&lCNSV=H zdYKL#$_DD;ipO7+Ekd40I1){OSMf#9bjLsvrGzLhYxSEs8O!H}XUxdFWRIL^j?18; zMY{31M{uLxj~Xsgo@N?07F+Fz1dDghTD1i@0@DgdnUCA&_V_OcA55Qe^8;me-JBB7 zDeru5byR6AeXew6-HHbZ-lO4&$T$MACDtWbZfXVWJrW8*T47cM_JqMTPI9nCwF*Yc z{zMUi%z9A~UE66{ovQaTAw6Rzo_lfO9jn@W#Bmt@&R7o>s1!0Dpv9}Tm!BOY({*Ss z$&oJC@~H$DftO24gyj$Kw1{;^rHfSW&HMLDY7CWn^}3A=T7;f*uBy5E0cg+1%`}gZ zM-3({&xlrqk@`a9eYGgdj}_^|>7Yc$*JjKn;MYg&N}Nn$MuNhNyeVN$q2XNFqpS{* zApI?+yc!3PQS>W1+J%eaf&mhNz^PFLT8D@F^OcL3cbo)5yXWryVf+D4#D}BNW6YVtABhvN$Ruf_UOuV!iq$Gsokw3lqd+xU!B2uilqxtoh%w8rZ zCu^Z*T!hqF91?B@;dw)j-)Dl~DqESB6SQywbRaUZFP*KhPW(rr(c_TR@k#L?Lw&G| zTNo$GQPp6yHX3Lqlc8J;Ya@L+kG^(!q&yTg2aQyv9@3tNji5C&bZ~XHR1rXPu=|>EWYC zNAtvl6A(_lfEHIcFqoN6;f#T59#cEV)Fj@+I-~>>6v0f!+mf@g`tT0$4z7Kg0rvo6 z7|a-c7=z3fCd1R#Lxs!B0cy3E5C$QFm6@L-59jOs5Y zW`KCySle=3!x42973~FS()+yGv>U8?iNzVQo)m`?K5_QHI=MYZ+$lmrP$%Vs6YME? 
z$DGjGQom`^p2kDWDHM(Czg|nBzx#K8dzUWSCWHRi>&sm4z_;@nSKujbMzb3;BV+Rz zSCC66Hn~$_!5PIG?by%gfBTMKrwQ@=;N+yLfq{+Ce8us$167OcxO`vN);hjv)f_qU zZe?ZbfsFf1+6dp?(y=T-6w`0w#y$|Q2;{Ap9*+E3Yf@{$1;qqQS>x=9*&N2m`LcH~ zQDw$&3vb`v#{b}L^eqT1VgP=a@hO_m3Vhe+5kW@Lb{b5gEK#_4Hv@~^^h4TP_~jgg zn>bf~-h8g1I-)~Z>kP|2R+FKudCBl;yttS_T8mJ*VuJ|Cb1eICqNkmQ)%0yAPF}dM zCsL(2af~Ti&SVicA>@K+c6d9GKMff?xFakzEQ`<*{}Lqy&HuamQ9y&E7vJB1cfh9( z!gzWSsJFOWPXWOOTO4xB$}+d-ZWvD3h+u*6{YuexxVWR5-YZtR25@Xz2e& zA1xf(`-;Mm#r2qu%l^u1oloplYnzm~ZQZc5XBR9Ro@Nu5xGmK^dWBP;QLpP;=l%i06*Y3j+RBFylPXPAOlyURoB`$+N1LpY)BHIk zKoR^EoRBaaWRPM&Yq1k<*ZcB|C-#$j#Wo)W014Zdulg`($;Xt^&I($RBRu1$S?cKA z%rm$CCnH1N&ev_>wgaQWbLdv)pln8D=Z3V5D^R?xMBI9|Y}Yv`9|Wtqej z0)Gnq3FTBJ(n9RUMWvLNmv^obN@wjy)zYy$LMPm$BNC=W;~VqP6mu%51jrP&I>y6D z(0lR8q5KG76&w<_0B|XDmIZ)>3F%HvO{h`y(n}d*R3Gl9Iyl9?4bmRv%~`?5B+ze~ ze({irj18;(rm8A4c0`4%;i`5nep5nE9oXMRuFSLlrhazu+Xo1!237hXE<-LOxjud7 zoI8QWI)ZxNAS8UGY9t=9y`dZ_Z|OZH zRvJd8mKbgmLT+&T8G}>^2{Y2SIJ+GPKB}rPC+$X!0>alvNlC!LCaP2LjXNYzI*_^2 zt*rcLH?-G|*oiA5)mT{TjSh+OZ352%ufz99za_$rsXHL2;7x+WfiU|Z+;o5+(C_+F z&viW8LsBaEOHchH8zm(6oRsl5tXXUA+$}a%&CX$r)eS$n$j-s?bt&uWyE|ociHJ{1 zYIv1(QTjkwSc9hpw>03f$fw~zRod3>fnNd<6kRH8wei|MCW`yP-^4OpdNiTPRLs^t z22uAN*>DgNkDh(|-s4<@xa1TT%3XGj084>!XI41L>t1>B?&p($!+MP$!*mA$SoH|& z$Y@Q|O=%G40wJBiq7l&c9p^omFOw6;UxIN#sD=R$BkIu!Qc3ZavW*Y!2c0nw)=4`& zI;XP#qvKmwJil4ot6#rU(-fw3EY8c@wPVNo!#_u=7CA3PoONfX_WDcK2?om#HMHxi zf8lGyNh%cj%*~8P)YgxNWv>7lxLwizcQQ;knztiVw;f;X|EyyM-#BZq_#Y_Av~EC} z7I>e?g~ax|kC+VrR@8nMrS#>prknKoBM}puU7q3B{P2A&36Ss z(tn=0-}*av?~O*=q9O*0f}?-l$jW(?rp*Z{sZf0I(6!E?rfR-lyO$09mTLn(bC-%V zPcj9+0uom>X?pXHtVhOddS-G=4v2j-Lp#;EfwpJoqxet@sdENHGWlvaH6n2YhZYOm z2^iTSJ<}ci4j+;1UXwIk zHA(*d{Z~gIzt5;NX`FX~s ztBT64iQm30cxzFfHOueuvg9q1k=hYefmetoiJIxecM)G$eY#B7TF}}v><)mpQF)EOSZxE5c#b6q@ z#_?omX#Uo%4hip=$NCxi?=LR415))Ms+4XYN6$f4VPJp@iR~Y2A%*aqUNo06hg)h? 
zTJ2S(Te&I*j-lS&`>GZPUdh5pX=HUiD(aQzo9>|vYR6#<)BWvVg+8~kT3+mpVuHPN%e@x^hWs!Y7A;A>M?*h zm+WFHB3+#Bv+w7|5f3W@c0i4Ve|G!3l8pQ4$VnZD-PG0!L)&+G6y6K5tANo#z%g8= z#BuI8Y~3cdxNv>IE02ZfKwjz!!rra#x1mdx7B{LQB~_`(?W4D^lv;7wMhdd4+pzr#nR zy02Lia2q+;SB|na{!qo9MaY~5_KENU2TfmtwIYC!B1jp4F&A1+CUHhXRKgxbgc7sQMEKub_8rUREBH;J5=&J`o4@(XYr+jP zwLu-aEBDw%^MB~&^+4r?ZW19$iPO(3cs_mqzR4wPVB!sbd)dofWE5;}KX~w}r72ja zeUg0OuPNGN%))!@>OXq)Dxcn-Ctkl^(Bzdfx4Yy&)26vP+&I;4%!(lM0lk-04A|7K zsHmg*9JQV%Rk73COYPh+zW$a``DNQxr)Hk$^=K1b-N~L$J5>fM$!mF)FFQ$;iSX0` zvw4n9RM2u+M5guHnkVf+vC#{QpnKHWYIL7KR7%`%9er4Dby-=pMYJiuP^?m=c$PkNQ#iyb79mnaK4aC z-!bcWe|>FATOeJ4GExZqg^&f_8|AG?K@~?!kTo!iU8SVlfDc43!ssdD66kLGObXn1 z{I2$rT@CHluGyL!65QEs-g$?UWKVw{QTLkx2XLmjep4ISBHIOq2EsWj@%ws<|2H)p zv5fx9qXN&pbZIZxb`M%s;T3Fcy3q!_vXs6Qfvrf+z?kOELnKiB7Oc}0&=92+&5$RI z_o=@1E+;m%t_P%D^Z3>RedJ6xnpGc|?Czt|eE!1+DIepmU4uHfdyVWfWc~V8%~^XS zR$rB0|2$JFen*E+H`6?f*6M0lp6{$U$t85HntJX1fz4GBb(PscuOLIaMkHQ?STr<$I@b*)Ol;#7qrJ)jTMbZadJ#L4Brtzv77jt@1_f za=NyD8l6*pZ0y&cA7+QUXdNrd*&1{u=IYgyw!J;74c66|Ktnfe}`)1r6#Ca8bHH5)gNO<8U zYg=H-xfVzdW?i)2lNtQw<>iqdJiW1L6KD+bFA2u*PSr<3j^j!ZNFxy3O3&l_z!jNX z*F*PZrDIhsCK1WO?1A&j6rc4E33$mlmp#+$mjX}kMB z-gBrX%*<_TdU{v$h@1OnNDYijy5wUR0KBM>M}UloqmM9=EkrwP8B3 zbqj!zoz&dW0n!&2Y?zp(QcGm|f)s&vFq?TlMW5G&mLddCr^-BUct3%YaF0L&6OUxB zwy!XK*=#PA%LI#_8J9>OEm(hE?}^O~#udfy3T8=l?MiM*P~j}Qj>}5B#Qio z)$-QXtFK~$7NqTu;?4>9)4XPvk)58_%vGAtGg_YaZT_(G?9QDFo-D8IwlaIj*l6#| zui9O2z0y#*Y2k_yW3PmC?R#5mYtKC|X!nN9<#DN6^x|4<~aIl1ux0 zjL-<$JxRXkL5z)xMAjPj^ACS2>v!)ZIZR<>d12ZdY7~dZh%}3i%9*=WZGI3BV{%^XSRT!gG{BUXHs;Xw|M<5psb! 
zbIOfBrUd>Sk+w|TL^UZ%MZZa>N7%Ryoitzn@Mvg|{dDos@nCtawwSQ-g=uqu;Kk^~ zj0vM2clQdMBO1#t-MgEq?W-nuk{_ZAb+EQLj!B0`Oa{((&%H%7jWa0)2Fs$_kT(aFjEZ-vx-oZcDVq zHp{cGFXr_>1CG1!q2*}Pb5Ca>#R8z%TI?+LSpbyKysUJ@qxu*zzjTv;NGK~JldV8kJUTl8VA*ZYI zSnE~g_wRQ!e{k2#Z-Iowl*L1Sd>(Gn_4siK^Tu}vgNvVcSB{@FUPZbovufM^va6$( zXpRW(by3B5+mB0w%ojA)J}=8Sa=2?@WmA;}Hdf4fk zeh-DtqnC8?Smk>(#&zN8v%dAtKR*>lJCr5Ir&0?E5P!viMbMLe*Wj$pJ=$!~33=mv zYK29YM~w5dbPki(c*dN=hYzRsPi1PO!GmJFJw3{alp0bEdFEo74DA9KHH*`^^c`Ixg*YQde@X%P1r14glZYI~+1 zarsj6JPnsH0)cGFyj^6p{_>voMOV`I_^9sd`nY1qw7sj>T%G*)XU}o|cK3gOmmRQj zd+K-Hb@khWrGh`)_!V_sk82X#OT<0bU*%mxYZ-yV;b^~7#xGb6=_$j&SZEW3z<*%* zC`=Z*5Fb4i6nvk*`MB1AoM%BRH+zM9MJ_R4zy9+dCCOmz>g2*MQ$oMKkx)uZ zvrGAX{kK`loxS^l>#wK#j0+CFJZJ99lgGXH<@$q62-Oh?q*!m$PQ^#08BSmmOt*ex z_b^Q<{n0Bm?K&Gc=jlh!K7#dGiGyoKiO>kB*v(`5Vytc>e`Q_A8_p>%$-gD${{U)jH(;FKSlU8~SS?t_awebAscTh_oA=5&#k38~DM$W3!I*IVTS|z1> zFHha%z)qS@!)Hs*U9eBBYu}Z~Qvlw4WUrFY1ily}oyGjKv)?!y>u8B+tD7??7C0&sN-6W)t@#BIz=&{*QF!J9o-f z4hQKIp%t_h7-bnaMJi4n%Tx|e-)^T7F!VwRSLByNi%aIb*qx*rY0>cg`-XuAU1DNx zZkN85`ttG2C;!yEn7gz$n=fvsKE4}y!sYgw$CnB7ayvY}No2n;Yunh{w}1QO!ts4I ziIoy~*F<&~-V@zM>&@eztEN92#VZSs?xfkWG$t-}!fI3}G?2e_nUybj2 zdB8cg@`y;Z(bRw8DBLiQnYnmn)drc??*mjUad;bzSeogzX=huvenTf@tu@Gbc4a_6 z<>0l|;Y!sP^R}hEK2YB2+vN}2mZqnsYIogpbohi6&G+R$8~_v?*Jj&=CXo@ztwH4LXrQAB!{!>kr` z@}xdA&)Al~zEkLqP{-1^cafCTY5o~Dcd4(r?K2HbbK4Kh_P*Q?%fq^L8PQr_l9opa zy25hZ8iRu&y|Y{E4>}Almn|7_`iH;k-fN6Pn+*b;YkD`o-f1N7^!P+jpvClEvcr^h zEtU>y{FG|sVPg^cp?R)`=1y z1*GEQR~`I!;v8_;uMG{asENV{xS9M1=5qxOjletpH!tFhykR4H*S`P4`t&}++PgKr zE*!5E;;2nkA4Yy_kX2N?PbEDx%B>UUjP}5Wcp~!UXx5p|XZ`;?iaB@BV4w#@Ku#v2 zEHd>D^^$b$^qFtDV(a`(z5B(R?%ekCpSHJm_wW9DcD7b^Nm1mzZTHe@yQV)q7i`fu zf6T28TP|N7M>i%!R-kU}j5g3OsB^%CoCO+IXTqC7s;UT?#p-h_!yijb$+2a$u7>$@EC~rb+8a* zhI0M;w@#KEY2P#{Y0SD!t#h@PBn(oc#B+hCF335Es<{EkvnjUQaehA0>7EGu5>`Xn@E(@h?Ri~B|1t#b}g-cR!d4ti)3@8tqVOoghmy{j^%Y%F(n4=*}I`( zp!0RjrK_61?HpL-_vz59Q`@&oR(Uu%H8l3N%&16{>sL^CCOBZi&qD*3_f=5eKltHc z+tE)8N_xBaPt8vps?j6O?8sil?g@tv_Z4f?NYsUTRE@F}k!mzKdVluC*=a!XazM`8 
zR%X-F3ce0qkO1Q8jD<9ZVe~TUASVf>&_kDlRXkK8lZfJ%6y>gZjRw` zJ58H6tJQ@dj8ZK+Ux3Ryd3l||ra%O}j8I~QX85P2ZQ7FEgd zJ+7-KId)B5eRNFMS;gxlZgtryz{o+>d~oA$(iTD0tsu8Va1{;p6~9`6u& ze4ZsU96+oJ@Bki8QAnLi)!2P5_C(PHi)27lRj*B^jn0GJgO@8!a>^V1ctS1|IG2qp zM&Wi?Qg>6Z5Ov5id*>k32nEvu4Ch?Shf^}#^c|kmG#ah#*-<_1e3$EOE=vcEU8ni$ zuT*->$dT5cN6wvF@*?l^sRP6=3G}jeSLSNZGw^m^n?-o1zjmT zxlf<=LO<42#up=|8=?{cCV7X~37X`~jWA13FL8KY7__ip6aI#Y9^494W)ac2;4}{;j0YpQH z%xPVz;<2>pB6f_QXAP%_^Wg_=tZ27N1jTIE&GlSs5Iqr(o#P~*`jN%M0$LiY;^O-E zH~JHjpSzQLk!1qsBQK4NvJ?^zpm#{Rt_a#h|4k$))3MNeGUtGjQjlO62sYg{zBFZ` zCSLTf=sH9s?!;BE$B&RBI}Qm-Xn}O;5sWMZBP?OIquo%XLI@P_zabpdaY2ebaRg$2 zo#*sV`a|$bi6JBKX#BnVJydlL95AY_3Y80rFemf?{Uh;WMG)R#BHB#(NI2##yIoA$ zDyL1v%kWg@SSOuQFVYmOF;i*6*S`1Pd5idOVC}~On-Ag1kLff1ywCzv59vRqEhTb={C=FXq~{eozR;Ghe?Kk*OjPn5&&O zT*%sRR0mC0Q`hWv{g0*g#&cFZPpeA?3scGJ>RS>hC<-VXf-L)*eZ435(>qd9ur=r^ z<;^p}D)8#STxT47D3RI@{LHY^5W7pZf4jCl!@j?nu40aShbhPPhnR-OO#FE$MRSb9 z`5{eB3fmMH>*Sc#mCelGxS@P+A20tNOy*zN4*V6d0bZLhfv<%>K`Ji3FoLe(KEBM| zoPYavOz8nde)8V|`2~yJtL7cuf%F-^PFfFlbK}$%g*}H4PjLFFePJdRQX#_zA6$Fh z%lnVQ+_~Co&!1}7S9YL@BP(T=)vFeLV>rz`UbD~gtW3+`1czQ$ho7i(ZP~#v#WO@}V%4v$l$`meh`T@mn&yS9tGR4=SFXl7R z6*3T+Zp_+|CXB>*mf~DBw?4V@{^`u>tun0_R`0oI;%;y9HjunSggIYvxzXf2qP2K- zJz#@$`ZItJs;NnceasY*IF4!o#J-1or@GS1uiwA7GQHFN+qVOw7v9!3@R2k*6DbF= z3z#J8B9*i0Q)1AQB?&z1AO3xcznhqR+02w3ZDwXv;WZ=TRMFA>gNq|%6bCOF>*v?S z<3?<(?-%8^lr{U#9bm%@DO6+1^h#!&U^bo-E9eXwi zx7a(|Wxd3P4S)X{TtL%7Ugn1LPTFCiOVf?~S337IF_T<~D6E0JmOlF3PXu21IYwJG zGj@9}FDQ*sXND_*_u~l%h7mm{(aUum-lSwK>+Gu$Eqnod`rf8mCXSlCIE!E34%0vT zCD~EpO`d*Z4fgo%+~y*hfh5N3zuzl?T6OF0fgqE+c=Xsrx-Dwvn8rP(jsqdl2J4mV zPfI&t=OFocXuQwCUArEL8QHe{$xzjAvet8k z79Z8W3TS+A7=HJ$a#de+#$B%X!@8wW$8`#l^U>{@3cz(9Mr0F6(cYZer5k zj&cix2tSyLEMLT{GKZ}zSn(WDEZQV-b0{eRXAl3`EFU*pL*v!+l|9D=R2)?v@BHjngHSfDT>!vC_?M#C7$3}lNKxmxkL=GRG8@}I^ zl5cs|2-EDMMcd-yJdS$?{P17pla%f11&Rpo({$EEQ`JW4-&tu3tHKbnBY@8hz@si* z2c6`Zn4rbwjhm@?nWj->o6Mw}w}Kabj<;yBZs|vsmtac}9A)kHF;E6~-+U^~|IL+H z^crH56IVAFL=IQh6ZK{~~8CX`!38#sJ8%YYtl@GMg+ 
z%F<0svYFqjR}Gt~bZ*_+`|G{Z$?*q_!$R$B17z@mz<_yQzINB}MV}1IHRcJu)&Ac2 z-DzV$b(s3L%EtHagDi?d22@_|I4dSF>#Wnkefy04{IaUsxZ4=$i|2GQy0U7X{X4Pd z2)2x~%wrczRmJUPoU=(V5BgqAa-BG!U_mor8tShEN0|JR?O)PA;s#brZm8|+48aP4v@D{sT7C9Rn@o^^at7;4Z&KKZmZ-En9j1xu+tC04%IswANQ} zqHphREs6-uZ?Ty(C!dLc$iF@e>;$5XM%zOt6^=+9>=#VBH{&zqpWz~F<*b8$2GUv6 zD?^G18|G@lVjezZRk1|#%e%J`R-zPJIB#CNUkXBGP1=&!=D{zovah#7%R=9QFj-`1 zx!kBzl{RNa6l@j6;1;4vCL1!)< zweZPOLiVF$OrEs$@#eK@1q84CjL|` zMT$aT^3Nv^dxhzC$C@P~SVjCy8eDa}+P6WqkpMzhm@Of0!c`;3Yt>rc*1B77S*ftbdIN;x1Ik5Z z$+jGkye~FJ~R;rgIHg6we|XuLx-NwGok}zJ8yjX&o}K)RXAYda6?py`2i^n;|sY}bFTS~ z!s;Ow7l50l5b%j*H*EHxcW>_A$w^3-k38Adb6MF8R+62G(>j{p!GxGg`d?(w2kAaf z(lPx@JzH6c?V;j&nRjJBS?f=P1Og752h-&STP=ue1VE$(?SiFXF67G? zjimSO1!V_YTXxPpHwUcN!%j+hNxzt!o^N~$%_+3q60XAScC2K0kHVvX;S9`f0bc-N z@=sZ{A%dA9o#(9Fp6Hkqi+7M&JU)J)N_&^}XMgIhmy5hAHfQjt8(5q(3IJvDs)g7? zrcpzF(ZB&7ovZGfU=t@|EKzT(Dtp%Hm2RfmG!XVRnz`nLf94v0Xzv5J$&_US6D$HHg5MM+VK4kXY3)aFM%IMBU-h=wF3m?#}Jt zB~zq(I;icFBZ%K(h+RSnDF=N$BSA)J?XP~=ufTXf&aPlsw6$dsD6^@J>jt|0f=l*} zef#!B|GaQT&d0>$_Pha{3M6Vs(m$uE4IVr&%JOT2totBSMX?U(^qDinn%$|WI6@T# z_2u^v965uPqD|MAFJFq1L;-JnpW;G40((jr*5iy7+WxwyyM$O|yK?P;*wWVsZ)~K6 z2rLg$L&#IEdxFidF2fEVHJ*FeT1RhT#|I}y)C5lWbKZL4uiayQs#@iba2r{ow@T!! 
zp_k-q;bW5|glzgG(dW$<2A$9sBM{c#f6Yf2U<6^wMZ(K0us*smdZIgE!Lq8?fsZ%{ zv862g)AN;}>m z;q-!Y5X)tl$#7DY>{0Ysu|oLxFe#*eNk#3$v@SN45j>al+~q%`NSdRYjm@c*^UsHj z*`WHc8^S>Z?u3liEZb+A?9M3@PM2R~yw4%7nUG3qh1j&fwj`2u#uWced3gGR3-6ga?b*ef#A>|5i$f1L>(KTK0I@oUR@Da#>NE44TnPEVr8I9>a7O924!i=3H0CpN= zjvTT9@Dto&X(~z|PGX8kb6k=1_aT`{Lx!Anc|2tJ@SP&Lms-v*D_L4r;~+LHe)&?l zu#O{-TtUd`Q#5`k~8X~KmS&j+fT8?XHV zE8R7xs_Mq9GTi1s26`zk%%U?%@ zV$WZ#8_KWGvuFya9mc6gw``(Zx;>h`19;}fSG4ne9ouCY^#-k$7&Wf_tXPJ?U1`bH zbjJ~t?Tsvd5OXGQOYziEnxsvNd_(ZQr_9r|Fpv_jfCU#Jby*8iFUMSiH; z_K*-EW)h2X5mgi7^y%f*gN*Fji}Lq~L---fL>1o$X!rQ4ye!st>CzGHqJh$_ZE}fm zB^FaG&!8uWE{uo+cjqF;BOp3BPEa(LF>7sw>jwMC<Fup@5*$Wqt#37Yg|P`bLpQ)Ng`c z%8bGU7;Wmb^vBzjAmW}xl#qtTEtW4*!08yfy&>o$$@UKy!JajN2_Jp|MesxGJ~*%= zv{;{ML|mE`N@#_tKwp3U{`pwcoLa zN>y#zC{jwhR*vI=AidY7=SL8V0|8&MTc*64Rmjhcnt7xa}+S<8I zEIx@i`fHWKWpgk{#$U0(vB7!Fs8K3-cAJ|wQcZZdhp@eUPy8|B_jz>L&osiPI`{kZ zQVNmTqWKr*c98yM-`iuLXY&r^K&x>)g7t{kza;ImbWbfbXM*-YUdiu6V?{YHHaN3B6F!&0%hd9IJO7^@M}40cqry6% z$q6O+FUb_9OtH{t2BG`ReUQ!~ry4~o3?HKoU5-+E+fopK@xEUcp$B_-Yi{rLDZ-k< zGy*YKmST@G0RU(Sj|%34p9G&0rGb3{bJd;Bj;v|ibFlO3)#ijJBd*Gf(~8XRunRPq zAz9dyI1M$LtD1kU=D7-Hn-YTL43=>h>UsnSP%@q4M;vsN-qhTFz@~2U^4e?Ex+!&P znqxGw=yq|wl?}}qkjwj#J1W8DaJI$UZcFE_g|P@Ca3l;j5Fdsn3g@vDSq&)PKC)K{ zD$kv%lPCQIiLrsO|ZSmFIlT zPYy+@061uaM5F_9VbO4kJBZoIf2!?nmP+X8>~zhoT=?qdQS}ufloJl~M8MCvVhJ;x z8BNGLXr~EmGZCV?SFdggzb^lVx2c+0`WiD}xV)>qMwHmS0hm`c#&oVp%owpZhmNCL z#txe}-Es}U2gFNHGgrkghSG}@B-a)Ty3on#9g1<0=C@t( zE!zXD#0P=f6T8X8f)C?qd;X{mpZ0i52Zg?qCUzV!;AHIe392L7!TAZn7TpS-L9wKZ z{jPaW;_s1*%ZT=khIJ345WB3u-kjdf!J+l#B%eEyOHU=A%~vJ{p79FvULNdq{ney- z(jn;xAVdZ~f_7okr$gURppVIS6~qHpjV&LOjvOxyf)o>DJf`!^d6Q6<6KWAKQ~m9` zAwR6IMAAHb`~Llou41HF4pPeb^XF6aS30UkT429=&1bw&=H^v(T^S+S+izaFYxLGf z%5Qcasm_{9ma-i!oqEp z;-S^w2YI z!=n?fq_2a^dBc|gHPxbH09OA)D4l$hH$-<~+^JBDR8?k7M0Kpsc~Wc)J}Nfp!6Pqx z)X#m^CS#Kr;V=ssFmPDciN7YVbx`U-`X4e$ZZ@G-ZTaKFc)Z8*Eait+p)Yi_$bsf+ z5Gn~`uIJ}nn>xRWRp;Qy4FpepP2SWZF0sD-_ad#qj3J~XZ0DdIn`S#d_2{a#iE+Ce 
zQKAxeBnZDbS2c=M88OM_=!WQZ#__#Ry4^THel0qm1wa%c;_u#u3tYAeWC)PIaHwp4 zQ;!vWAa#d-uB1!||po z4SD|aKx~qohS(B|k&+c^znreGrlKIBl-ooKeRg~so$#K`$wHDrzoXxnVq&7Wi&8{6 zo$3+yWYe#^U6te_LxO=0NV!uz5Ewe4VbP*4hVOK1Bdp&DZpS+Jy3cb|JxzkgS#3nR|D?#m92 zeD$TZsZGl{>!!&2@}I*OY6M!%b$yKz>6|SoArbUcL-m_>i4cBU+rB0&GwZ@tE){ilN>h>KZGt+hmc)LsIM zsBU6$kyutj&D3mKwXG!P8W)x#qU63Khqi`bq~p%yz$Dc9s?0FS*8r|9y^9XlAJmsa(E z@K7to@9VzPn|GlMEI2YhQ0`CS^(GgqE%uQW^r-L9G6*pd4}+uLXVk)uyajo$+kXr$ zuj!rub4X(1V$I2?PhTpYYiVw)o%)mSk|}ui zlHsyLigXBlp;WJ8<%P&Z1-KL8h|iy+z-4R}(r28ouj=b(T3F;8nr{~}Az^+P{r-uH z-`~%%6E{SLv}bC=l_@SfIvzR}c%!Z4?+}^`sX$~19Tkn)Q%q(#0FBwWorYW3>KXN3Rabw#*|_#Xwa5*{f&u}Ol_2GmOSEk9t@ue>`HsG`s%K22p8WcJ zSTk2#1xSB+wvU;{@_9~kSs2b=!Vrd9kt`T;Ud9=2`1v-wsWRdF>I*J_g0By2tL)z) z+4ZSp14rm3gel;^jAW60FFu249hr^U*AfbTNeh3isd9s8%Gf4$u`-VZ_c)l6%nXa+8>sy5;_rH6ILX(%o!oE>?n)e<#rZd3hly4~y8B$B$J$*v-l^ z=)v?k*Ro<~golzRCo)}WYeg@Q43Z@-pdKR9YU$FUW-@+KVTxz8fX8W|*c9^eYdBP$ z&qQ&|MM4z5fCUIbk#>9~15b>t!sL#pm-!g1Ycl|fU+7RfHi##+O zIbz>D9fAnX1qNAzn@e=&knl<^*LUp>={je2=)dIzOaaY9$?*}MWqJ8^v%4p3?eO6E@aE%e&qz&Ov+thP&I#9qABUTqC3oAnI-xrR-RzUI3W7HeeTf4!kTRQAS{4Hp$1S;mUmX(~lVxHWUp^2x4GO-Xr` zUgmNpEv?sP@KV%tTXyc0q6gW$xgEcNOopX)A?b-#~*{Ld`abV&i4jXl#K(#9X&$k8D4sftlLs>Uj+noI1owLRJcgNN_dEmL7=Y zIt|MIKi(wUy1?F&5_~)nfBQkbcq%q77!9S1cXEt{PzBtBk=g8|^fkCvv4Fb`>a5!) 
z88l1`i6}Iono?Zv77!`PKf@?E)=eZalZEkE0hM_h?iaA8tjAh|AxFndSuha8mw|Wf zWRG_3Hn*E~H)G=z+O-T24@TlrJRjq7{X z=Gpl?v44vdOmcEL$Z*;F0+V}RHWa6hkT8N7lMzCS8aJoc)tP|h@Ps0dYVUm)dkGS~ zPj!5ek@xVU{Q|&&&>Z9ccL*+c#!qoVxJZYPA%kRKFHhJZA*t}OrA8Fu z+&BOWpObjzfP``ly%zq{&Xs_oO$$XDCb4U^pa1^(aMI{*PH~)!`=;Og z2mrBy9^`!xhFc&)ZbKVU-nYvPAVGg#>N>XbyuDDc^6%&Ka;_yG$>MtVa_|)rp$ve6 zYR?+(ny&T+(Pk{#%66!M#STB1cK_#1FQ(4B#E1;mMnIi(0S|VS!tm4qj;JXhxoE<4 z?;X61aae2y!g0oyjJv~=uL%Ox>uL+G7dB=pe(WC?mlCi+zW)02ts|Up@B7synN6~a zRJ%=NF_KL_W$EC!7$FOwr$^daD7=L&Z(TeRpl%huiezO17m=F-`X@Au_T!*s(umXh)7g|I#I<@;L_ z4c~hfjk-!El4O`+M~t~NoiZbxv!2xCt@H7DC$i7@mFRmxT7+|dnbor7r2dx-rg7JP zyq>qN#b8@VuEPB0B&(-Iw2+-j-ea8sxz!#s<}PwUhUjmc_ski6EgnjUP`#EGdXLM= zEia-L?RQB|KG=6-^z4{p95khoe!wM>%(@B+EUFS>3PhufUx*&4hFLF-qkPi0rT({Y zW%1HC|3k_#`*O#DA=CZ=H>&Qt$Jz@VB#DojJQtoOdJ%cCP3+8&viY4yZI~W2ieb9b zpag%Nd^UhUG9pjK&!t~HWlMu5tiavU(u~Txh`OoyXXf7sTqqxIk?rQ`$vPKd7tmYY zTh{Is+kbF6UvN#JOjYhnvI%#Tc5smGm+1EO^_pF*2)mYPs@M(o7C$SQlt!wN3l=Rh zfjNH0MoD7bsMs$SB@oRj*uPh9Pd~mYn;)!m?uJH3Cr$tCN?FZjt}B>hfy`g~c7HfF ztm-WvBP;so#vg@mSZ-~V1k5cFWf_bV)m2STPp`M^t$cb55pMGItI0UO$XJmWjo;Sz zj}R$gaOKKw$RD&Dywt-&>S|z%0XK(_y!P;s4uxlTcITvS+_>PQ&8*saF{_2?fftr8 z`c8@_*9h+uA4J@{!WWUaGc*(fP-1Dm%{)erw^%=5Qm)2ng;+?@4oLu7B6In-V2N$c zNCO!D;VrtI#r0(0v^b$%%sD{#J;yeRlxJKKCB+A)#7NMr8=IO2PjP~P6UrXqt*7kz zNK>#~w+XnP&Q8g++&!fUecW>68x*mldGXN0q4LKeLoIec)sg9i>gH6!nQqTTC{ak?+w zY+77SZ#foHTd0IZ5F~Ok;-SH}2M-xw(RiMvU}*_PbYSl5n8KxkKm+(7q3<{(Bv)HsGTB59 zb#CsNxRCP1xat%Ksm@3USUTc5=Ym6mpnETNyIE0jWya}t-~eJJoM4oNHZ#oR7C@Wd z>oE0hpWQ`D1$N@OSJn<=ovct^3PR`iZOQ_zxm7@bX@4VuShJNhk2ZL$U&FxT20pLh z`(hAsLVYsJ|IcEqQ&S?YT?n`oF<`da5{IuhOLRgf{6bH0!tZrg(oVMeB9;;BbxRej`DTZtfof($XAac}`sO`l;G_I5 zk4{LKn~z;@Q$BKRpNIhpo@1s>YYVJ8G-`oI*SmLb)xDi+Yo`Hut3YWJy@L>u=H7ll z89z+CXpX~g*3xoE!W7O$E`^*S`-9v2cR7bf*(c5)a2NVmsc|FWe2^xnnOnt-AYvB@ z_*ue8hy_@7Grds?>yse`A2U;R?bdA(W0!QrQ`<8{SZvGpRNnXKh#n)XkOF}+>^X8| z2aUamG^N>JsJ2u&>s}D$`#l4%SRlN8F`@Y4s}Z_0y^Q1R7M|B=o-N1EFO)OPfjz?B 
zs*QWk*COUqnm7_`Z@5QY1NL0~b6(6;N-iUhSQ3fU55HrlDcX>3ge}*wkenJ1S8q^t z&eZft($T#^|Advh&xx5z%JxE$K19D7Znl-LMMZTm2(6*NF31O8Di)I3lDQJm#~(7c z+*Z8t*CZi0=%4EOYu0G%Q5r-29DMg)Op!en7q3x%#XfX@hR2|^BcZ!HtE-o%PYc~Y z@o(aSRTIC34m`5=q?TsW*DV<8I%ZW=*?b+{zr-ASGTemFN z)jwMCW7(jC1?%5M&GuTBAY?{#S~i%|f`jh_oUNaCqx-p;xld=RyDeGbaAw5*Pz&Q~ zVy#|WULy!irqm~~T{}7n_L>Ni7pX}$vrnI?E+2Jh(v7&}2QBQRw3s>b16FnslO?hS z;hdRn5Vpms^ce2#Tb!bbVuCHhWKh;zsQgq{w_#;xJ)K)5ie}Wk^Zp$WaENb|C_u;E zDyl_Kuv^t;^c3qcnkJ+kIMyvP((=K>@3*_`v)Bh*u&mG6Hs>pLmmE{ynvnUubBKUfM%BWeZj81N}DG#xf@E%wGTq>d_ zRPI)lOm%XU_Qcu$O(W6zFdy^eV2W}{~r9tm*7SV55>4h*|#S)FQ%r}9{ zqpsLEN*aeq_LY~>VIC4T9};jVeuu~VdYuWiPhZPSgiPpP8ZU2c)uP3V*~eGGDMQJc@|vv(OHylJV|9e8(Lyozjk)MGT3Bt!F_sfl_C$X zk=LeAzxmuLblcB*EfX74!;|m1AY~OLg~*=PYoXQAeB*oX3T~}Q=%4;uF&N(q+EBso z(OSFS+}xgbz9Hk8xoXp+5z3|P!h`=dLr%i>kA5QKhLRi*zJkLeGGKV6{cKO>S|K0Z z*_fa@qDQ#do{eQoEo;hW?a~gL;LP+QwnTu>u-4H-+tRY5yYA?o8pBmEjBHa=m(m(N zTE+9oo_2%GB_nF~54)uk*URf&p6{0z#qLAA$D_Btn1>40twuA@T2*G4hsUjLj)n!w zx_x+og{vA&UF5OIYt9k7P%(R+&M@k*`e~9-8i2YHhkRp-H3Sghn)$-TCI)A?Qb4&I zT3wsA$1r%Co9D&$-yUj_Jt(qEPxEpDqgfbImzq~ki{m}D3jcW~r4Us~bi=nR1#En_5$GO-=v(Bx#5E&sC z*kKy>*Bj3S8I6u6lviIr^S__?j5X<#owAFc{^4LiO5T-kf zJU3G+vUt{+n^p5`Ci$Nm2Nxpz+-9~?f&S=jZJOOOS`bjN$`ge{l?wmywHqZaep)AOd;D?mJNLkt4bH* znXbr02PRV2&xKtqr$8C40#=xskgzB`q@JFhOFx6nqiOhW2B_U{blSmqM)&-8!3&f2 zS)|f))7h~vB)EEFDTE^4_Mk&2$&d{=JC#3yyU2uxGN4a~DQA0BT>n(1HT3h9E0U7g znEG|oeU?@0`NW?(Q1?mG%BR{#v67wz{Mev`dIjw^=jU!=3EjD;C9>Yv20-*X zU+c1E%fp$!Y(rpO2osJGKUTE~W{#~TE`4}9p zZiAc|UJCNoxai0!~ zu}B>VUCa%r?fb@Gitsum9 z<*1GMA;=P?o4g54#7qt9KAKMRujgn7b_Q`{3oSt7WRuy+&RZTHl~6A!o4v|Te`Tk$ zHr6#sUNVDWun77zA=aOtrMkK-y zp%WJi9M(KG{GT9cL&fm-CwA{+Ap+6nbXisb_w}rYo;o#I*5XQ_@))mPCWRAQ=UCSr zA0ypApx519TV_t&VxO>};ZRqX@}ET)yPlrNtD| z5|1hg38NL>cr?lKw+e`-B8pH&XjOjxJ_m-VtEYlXuu=q$CiC9Ihl|M(@m-PdLfUf3 zvGMAQ!ZG|6&z=%`9_yP}Vf6m-e@jV_lm0)ny?Hd3Yx_4$<|#uNB4a|7p$wHFLli}l zjF}6WQ&K21m6S*n8A8ZRBr;XXP$9DVv%P=s^Q?EhYdwEG_geeD@11;y z>pHLVJdV$FoC9r&@v4eaCi%m_%hB?ag?e9LN^4&pFKV69VGB;@&6 
z?jcl>&wyyTY*j|3jM|&1UC=_q($$DLH!q-rMT$l*|NKcu+U@3)dRa!vH`z}b;f51X z95_$J2~g&QpMZ8_J7y-SvjMsJPwc(>Ol7u%6;v;Qfqxut&=wivi3u^>zc;M?x3NgD zMkw|7%wby|)uqopa`rR-G-N;;2o>Al@k6*85&wT0?)8`yUOn(g=g79Sqq>MAgqOZ9 zY3U4_1u{wu-HoTqjYojfq1Cyq;L3%v6vYd^`9_gVo1jG>zaaQ1AO1_mf4BD*|T7C6s<*vcOWC&`&le_`V ze(^{qXBt*KaUesq1-1CG@%3DoU;*bRB6jSdDO2dNs`29l6sz*b<@s?J7d>dTfG1Z@ zu9zZ!8{qlsO%Z*7U5rE`g&U!8%jQ23_uDW3OT_J4GYz4PBlk&;^9$tQG%Y;#5iqvG z*3PeIP=am5d1n2x7kFXUf*Cul7P4?&|uk zD;q=&NZuC2{5OM0L8?qq$rLz6K<<1l7OE;LWJm^njqlHVpk{S}cxqU6^2kXxG@Yy? zPXKEnszai1B`H0K$=c0riKG91!UMrta%IuZ+Qq;o(I0 z1H~;MN2s3cJr=c)J@pZ^8de!?xyAdOmxTs#P%h5spWFm7Het3L|3NGc1XYh6B+Vf| zM8UQp{asjNji3>~Wy1Iot1?MkehxiO9Sq6Zij_WJf z%3>h{LKx(!BU~B~isRcyi~$b=fYZBVWt?kF93t0F1e`eG^@;lDB;CAF@DWd-rr*JH zm7?iGh17IM!4**o)9z)xV9_wp!H>mG@`f_TpEaQzno_>IeE*3#l*DMq4`Kv25);PJ zT%!G;KphfErAh8BdwhgfgXpGPHt`qmIdTDj+7)Eo*;@m}bayjIH2n=)I zfreq8>QGKwHEJl5rv)g6*l!T#UpeA`X7_vkyA!DP*FhuM>5GO0BUI;gTt_mE8NWG7 z+i(p)N0%*h^XKYs`_YG6cof}ufD*WNIo!|6D#08{5}}FB6whFnRn5OL+;mh{IAb~4 zCLo`sIMl|o8)b9OxcJ+E;?hzw-$N;$RY4R^f5P)(G&T5FGx3jI11TdoSg{1huieXz zWYyY#DTd=iACPZitOrqq&ua@$HSl$4-d#I9orTDN5NGEcjn*;i$0yRWk+x6p95T-g z7KMLBHU z;G^hVEmj+a!~>6y=ba6n&QPzD+y_)z^f=w*cw_@t7sUbSJi5?_>03zGiRfQOS0eXR zGQ?O?8&k8*@`B%;!g)qET8x~*b=!nNzCmbp-kW?Iz|#n#$YS2r-rl>_9ZJvX$G^%8 zN5>f6OhmARILD?6Uge?mUGTdiF9iQ$hr$*h%sSu^K#<}HoC^kU3*ex_6CMM| ze&|kdG>~(@)6)&z#b}LxW|H4S>&$B*^3J>4O>p@+>HxZlEUQb%WTr0kJ5P$MYy2dSbe@ zSVzUTPSjB%tn5$*ApQv3aX%)N3Ekx1z=s^AUTL8J|7x6(4-yVB#J^M@Z_4C+OSx;Y zd(up@`&Q`V09Xe=(vSXQ7Mr{LfN0MCm5!XwTf~de%*We~uX`CHc`}p;#RqvIl5>gF zGO)YQq`-PU9B*p9FO2%^t z2wcm`8nc?`t!2lCj{6mTr*A3k0i&WjN3w~M5(4Vpb|OXHSV~*K1A!mr4zb6fq{JD~ zW}Am76Ge}~{r2`EsNVk-H=q1scV0l=IUH?b zEn-wB|G?A<6F@fIRhLcxC(b-^4m^p) zb8@m{etG?mYD9`E4gdEXiFE&6^oyQDLr)bd&*<_b<>tolIoXTI^W{t5_w^20%OB=m zWX*ZYRUYHfXHu2}E?wh#;p(udsUf#pQ}EKOgLZ-%8X+#acEZVfWXx`um}myl2W5)p z6i$K+yJl+6fF`{6kaQ%{X+LLv{i>J*v)V|`@a3?NGFBwk3VI;L*I|Ce@FFa z=vLB@zl^$bIxRh0)r_IK_Ift$y@sj$=WR7oL+qawF8o+D4tN_SuN@;&v(Ar6IQZw# 
zJNXPJf3fPmXUW!M>>Fn8G$*6av_3v_P@cTr)Lg2HD)Kt5J@UWz)Z5uTw{POC38A(oeGYA&euzOSO-B-WpuJ7uA-vWBG3`oi2%nBMv^ z8WpQ`)aH)e>*nSLH^q_Z>A)i`9^0F(AWZjG^7*skJuk|3tSQw8lf-`eRq0_c#1|4? zj@5;2c?|Uk0TinoQ~&vvo;ECxLZ!Dn!ceI4@J+oj?ci`!Z59PR59)|w0TRu>1DByi zATucuO<81a`gr4y&&`vjd?6%Z8hr+QZjd?2JXinYeoa4HIFg0$> zCSnxS55U5SEVn*w?G?M?6cvU%)7FT|7~#?ei=!CH(rhPjH+ggmrK#yImuH1mr40jBbAq&! zi$6-!ojhg6tsZ@@l(v`(4;J2fYtOab$A|ftXMVKJdc09n^ADCtv~RF4xg_BeqNDXMi*VV;-5SvGIb=Cp@xEyjJZw=pp5hPUZ*_Mk2$5j0IV5_0s`j@BNe8#G>`YdsYizu&d6`R| zjN~$E+tC5k1mXBmVCRT_1NEKEz}&E(+-;#<8(bcJsok9T+7ZPauwH2xN>vtyL&-4Y zhLPL!9%`5b>HA%=@%VN!FuTfk+5m%0QNgRBwjMn7lHVH>G>(tR%d(}M(@fc*;-lAN z{iH^C>%o-t>LKmktgO<8FJHH`nKANihA0=pq((TUF?LnTGC$MBi*Jn=!U61W|dyTBuNYc0`P{s2u!jxjiMmZ zA?QJXh|kN6cZ_oRK||9m7~MOGmp`vqRQR#NW80rW`l+d>2?yJU_s51|>qxZYls}JG zM3_sBy>X9}onq@Xxjd8LC?=KP`A_a?QHQOUr+x51z2Hrnd6}!ttSH-HVd6nqr?qFk zc>Rbz8rS?B-7B`EwLmUjCGUy@jmjQSez{|m*`(Ye+EhFIi9q)UK>_xTd1`$+Y(?-r z^~oG;ABA1?C6qMPlY`4$wlVz9ite#W+0z4@5o^m|rv$EP$jT{pxse z`xvRC!|w2&zcxADJ^ICU>RC}N|oUvE>$>i#h~d2MmeoNjP=H+JK(++wz&fFnG4AukRYtlhcWB+V%IIdfi`u7iBbR6Tq_=jxTQBp}Cn z4&a2~i;(JSaxd|93~Twvir&Duamb>$qQYT`?&i~eWVRJkzOey>1LOza+pDrB*C^L( z-3%U*$9kh(qt=2K3M;1U8?Sh>{>IEZ_dVMJ88i#&pB}>9(Av% z?mZxla|0vMUnCnE46Urx58p_@^U=o~*G%SF!9606t3wYT^r|LJg0!g^EHo4XVyG)vQK)J2a4Ryo;M{<-*d0 zx{yRu!o&e>6bZS*!mhWK1(LG?y&`noT&rJ0-bEnx8nDI@&GDk;b|J8aiY~~l2zwZOK8ro>D5i5$Y?sHgs z46Ll#_wPrL+!g>{7-0I}IPH&ue->W)ymy-Sn3-FdOp(FDfyNb~!SE~&7d#pgk(F)l z=@u`hWEnW}8H1f*6?$9Zu&)-q9Qo(JVF^(8kePTem0v}RRtlb*^x7m=bgiJ8+oT@{ zec^2~q7X(7lBfr5{RRYMkU69%YCfSHAt@a=*RBDPC$k2yYa{xk;_t=1hmZZ|cikVV zVWbn1`JT-DyKj2Nt79Uf_B|UVF;>+EAAY^6BmEI^8{+sr!Y1> zOhH3)Uh_Lw-&HgevYtaKL_r3P%_8EnFgTG+f1 zW{d|qh0p2W>3Il%5?ByBH0KqC;=#jr$e0}@siXmuh7~jd6A-~1kgykEZKT0OvXmX9 zd8ibTp@Rsvl?v$!buWG36ac-*RBr4T|A~+QyBGjsBJm1X)|q%XiTeS%WOOU|7LPys z1C55xFVK8`e(>1B829JvvBM1_+VN_ zD?S5>=!Q}q)Y!3vLVUkX6%X0_uA(6V=73pI)POUg-^F?tK(WbK51a?EAuL7G&=r%U zVeGF(2TZp}M=S?mY|~P!Q;8YL>?AV=$v%#;R`!A2b-P$xifIv$h`Hd)GJS&O*uhcB 
zgHr>3-T+XxYCZ)+a-b6;!^;1?jAS4Jj<>QZzwWLs>_yRFQ-~!)QnF=2@$=*)G<|i> zBO&i-p*|tWhQQUFRY3~{i|XRCV``IC+sY$6nYgZCw+a^paFfEq!>zn8*ql{7cvq5c zW?@3gxx1xDLB!dGfTO6gt&L1EzZq_wPai+pMc2VrP-KbkCSChkqHj&ZYc%;0fvZ1K zdn)ln9>%;wqayDCG@P=vLNe}`fQ02)eywvPAWdo z(QtU?rg$2vezLLmyLq(x&m*!F`!DuUDE|n}P-9J&79@VflBz;Qc0NdpE-s$AavlmR zqOX6Dm*;Mh9r7X^4h3PfSjf-gIuQE47hNz!kn7Mx;H8M!131agfBJz*^1$U)pvT&D zqd;G59h4KMuaw}>K{}0@QkZAQy+-Ymv2h=(r-#Q9ux(YIPikl4l>xR}`(UkC88CEm zln3SK0bL12XBCDBO*Rb6z-t14nLERYPF?EOkyz0;DXOk4#wJ!)`azdCem%;DrymMU z!=p#L)8^g#OE-giq#;vtL1mDbB$A$`-ji2uQE=4sl$RIG<*Qe!u5>sDZO3dW8f|&4L%to@s?ea+`tJ#d8T;nSELDIFY1*yaLE%eSVA#vXoBx<>G!U zultIkbPhD!&mo$`%1nCr9V!GO%(}uU1+y=i;gR6;F<$iR1NT9A=5Q&%`T*RSbM(?! zTm192QhWY}hi(ial@lMR;QLH^T?nZxp->5Vjg7~1zbQV)PGx1Jm>fhY0jx$OF_Quy zQFiwDQMv+8PC!CPF;0U1o6J)NgA&mA)6``I!f9V$U%>Ud`FAR}UYIL4!V!&Ij$_zq6mKI+z~>KO5kgG93C+JqGLL~9Z4M}W?PMM?@qTq zUJrf?bPq!UWiTfIIsn$#z*q#rOOR&)JUSVB3HpQvcz#@c+|S@%9p7%gDUX+#mYJ3c zf7A3;XJg}7><)vFIvG%?cq4=4ogH{UM0F8!GmCvsNSA}ylB}xEXk;1L$@5~%dTA?b z)`_AYitnNa0bokjv9kv^+7l`cfe~;0-W(5^n-rF16Ej$zq*S4PBnU4C#P$kbRJU<4 zk#T!3uHq{9$C4uq(?m;C3hfn>@T`-hWte=hbJ=v>(hs?R9YqCxwU__ql#X1|>#dd* zSFs7pc@3)rGmj`L@~qi#G^4}pkf|JRG`0XM#Y0FR0*i^7#;spO=7ZPS+KO4+z!?mq zN`kFv29107kGJfDLnNjWV`d>EXBq1pK1^ab9Eg$$_G67Gu5A;qdCHLk4T2@lGP=Q* z2nP3;pwUz$xs}B44DWIzg0Ip#obze5k@o?WDt0>cwR>+N(r=)0xFQ|fJk~uX-9Clh zZy$2zpkD~rgh(J0zytO;i_TP4f3A2vpVyOfU}SNy6^`GCNu;M+OON_j7ILER#4Yra z-_pz>ziHyE0Kxxv#@pxHPSOMRxdx`W zd;zaoTQ@-oFylIC=+i6xrDJ4br5yJb5n>a0MI;ZMR3&!#jMBQHZ;Dg!)$sUv^;-Fw z&!^{)6jo2&1Hk@Q?uZ<;kIy1ce|pY<`sEcV#{SkYNEmJ-x_%(=WH=xUprt9U9rtM9 zbs-j%5rl=3#0_xmP~gAf_Z_sGyvIb{yB8r*0Kb(9D~VPh&lg1SHNbbJpx@!P3d3=; zXXWQQP?w+sj~!V>-_%p@PjTWzv8Nys4^Rl*vYzoB#(71|jKW$G$1?MX(G)J98BG6( zgv!7h!Xhu=00gGN-z(bk2Rc9Os}8PmG=WE6IqpE6_4N35JH;e8<>Kd(ah#weJprK- zk$95f!DzorV5Ndku+ZG}>-*mm9l@J*NlXQi;=UB{zY5as@`cMZ30WE}J@70*eoR8a zWOid%&LQ{&32=!lJ|eUMcLRl)t`s}1kyjEK)~l0&r!N@dEWdG_J+B?EtGz(~!afy)fyE}`ovaZ$npNInwkMK=J`AfpjJfasl{sOP?Y(?>q3 
z`?mYTo>v-dG2tMD#1_MY>o*R1hzFAn`IXNit3?=xT*hrGKb?h1SBQho15kM|}N+R@dAJI61zwT8K!qm(w+1cVk(;pwXZFrS@(lwVR zP@JR<11v5|{2WXtkcLGIt-(6@y184;*Rgt1s;%9-hg0aI>@Ag+Y&{%eeWazN6Qv zkQw@2?kmGaMHTgKCkv>a(S?xKz7 zY*!Xo6QZ%bq5rh4Y>)}n;lqac5wZ47GSo^`=ls>xW40c#*lm2YbywjoagzVuNJs;1{tzbzZxzTg*{T6c_F+@SvZNc%mShgWI$Ipq|FogNq3ais9~ zg0-y^o&S834_&*|>h-hjduM)xnp#sve+~(Ko4Q)`yQbpnrtTC&$<>7#Y4`WP4X61J zFQ0OMf6C>$)wz;Pig4qkieJ75+(pL1$8NZMcs4M?rG8#xfayp5n%L5d7L7C~Dvbmr z>P0lThSMmDT1V4!)-A5t6*Ky|c-j%VfW1z}>$WQd^bwP?OBadvZs6p{={=BIBE{d(= zrKO{_FP~AJ*}ebZ%^zN8!Yk9VxJAIhpt1W59z!#pL8P^Buds^^5NG-aGZD+Lz8#P(mu4UwKp| z`R;Iv|LBu9DdC}X4Q|1S743!mUB=ID6&M;ZuKne`XMBXo{C4q$$Ym5rn#YopUtS)W zos;AYiwKHJ3aGQ`Z5gkrNi&ssY_L`^;z;(`h-!7~hh1^(Ju9@Fi_GJm^$jFBCel{Nc)*mlvX-0b)Y#i-&&0qGC( zPkp=FSeI8$Y>15=JDftKIhFf8YhNXj++ZT z6`41EBG$_+^ zxN3Dyyd4&8?^(-1s};?)9K1e_-pgfMs*<_TLk?#3n*wVqH7jHm!ZboO3O9M~$S*30 zs#V|K&eS(T75=`VVa}{+*A`n+zIj}{o$A3sw)~*)r@UU!YrnMBQp*_}F5!`_ z3fWGjm=bKfhyHA+hMgB-?+L229EkQVzUhLug-+1N?(Ch$pKv!B-v%_~vK6~3EA9|zgUiB@aG z)2@kmcUmm%u==mmp`+Jbnl6PgMyt@T&x@q?zqFHTp9|f^Pa8jO9i~b)4OOeG-6*HC z)4E#QYCk9S{@EXM<@FsEfe)C5F9o_g+U_yD?)2&PJA;hFw%avyB9=$~VZOUt<0MUh z8aHEWeTTJTSn?*}&6L)Q2LPqJnDfAxNz6m;{*fh3rh}r&GLo;>M zuV=qp4DSq28aB0uSh761vsLPYoPgJ%9ZlOq)o4sU2fblkji=hI#%34c8N{rxkXuim zOm#+Y;qW%8w)q3YOnsK4zT;OdE^gLi@wSrrWE2Zi(jjJM139op=ynA zK58Dj`@Dm}24WI&p^W?G>m?O#QWQyyU3|EM?bujpM^E8hx52IEtWMk2v%)<1>$A(2m;U^AmDobb>Ypep zuyx4Li&@Qblhdj?_ay=PDY0`?C-xWe@3mQ?Ow>vjVVDZEQ9En1Rb_5#SEB9fH4D=w ziwpB{>iv}Kx6I6P$C$27k)D}M@D?n0Vhva-Oa4`}FYQF2SAIw@cSG7~|-u{`gZE9B--nGbzx%H`H^pN}X=X&M*87D{Ja?Y3leRUuTd0 zu32DRw2H2%UznRMsaTy!tQ+(-Jzk$b;5&DDYWOqjs?LHpcV-W7x=mMY-kb0lM|T0= zir`zqrE2HJs7y|TRBcrI8MNKW@aWT*x`r-cGcM<{IY%je&)S)DIJel|;TTk{A7cKa z%5s^LZI~r)jfN-PhRrkY)nuu(Hl1=6(APf)?c3&XQQc^lW7$ezu)!Hfl|IsoYSG6{zwe}dTS zrH^m!rMtA|7nLtnx%;icCdQj|wwkMky<^`6ZJCQ1bEqvuu3t12_I0Ck?fgZxma5_dt-6dm<`qwL>3)a~7^b({2(l}a_ed7W9^@?QJ$cs;BbJm184TdJL*Z%KUD zM2iUn^%{o22T{L+zcul@QLr7ewBi2D9=Y7{o#ugSU{ULc|FD{^&$q{lL2DmVR`;lh 
zbCy26nE7k@)F|uzyaE6F6zTQX2F80>c4`zDg)XztXekE05ek&3=)Lvv;{N``5WB0v zCf}8mw*I!!juk~W+mFdZh}0l6CjnWGeED*&-}QfEIF0WN-^gJ5w8l3=J(Hj3=&%LL zV})U7YESO=-79jf_W2jyNSD_sZ{j-|U+!a}{56!fy;p6Wl60bb)!MC1Z38!6)M?h1 zR$NsGS~vT0R7Y0$k>{qgkJtT7dslfVs|S7uyPZpt-s$x5x_Pz8=xSD`Vu7nWFU{=l z!^(%>0B0yUD+;aGMXAGQWrWc!~uy?r#H61GKykaj=a7(K99cS{f(MY>33978dlEuE^yqgoH(o4@CAL?KG zl7@b3~{}WuwVp$gr6FD(O)Gohj2f8pw|S9 zfJ*?HsSDaxB)1v}9l^@`D)?4+aLcE+r|;=|sd0mS?YMd=$Dk{n+M8YKPZ&6^mQ&eD z8V)qCxC+mvIqM25C7E=(@ivu;Gu;{9)eFi`{v@1cWB#Pd0t{5inYNAS(aaO`*U97jdcYc{M}n zCJ|Q#K2XKb9SU)6H4Gng;&W*;?fRTY8RF7;PMGhmbkl_`P1DsCibZ9Eg;n4dV85*i1XB57@PEMbAprvq1f9)yYi*2a?YE^B%vUE`A&=U7Y zciIdIi7<2j^d^Db{y`2`2bJ!pMvl77bkrLP?wp{!5EQ?8c#}pBhsfQJ4|(nNi%nuv zpRtWnmfEJ((~p%;TklgyHf-8GH8t>5L90h~uG~H2?9*83CO=p4(dok43jW1SthfAx z>bQj_?=?16Xq$h#UZGfbZ(!tn(+BUifj85hgInwJK9-*x zjQLPk<=f4&sCueMU#huR{I2f3`n;f&P`}kN#r`kMJxW98Lcujvo&BZK!|rdCqGem0WT*SAHWC*+Wb2L z_Q$IogFwap{)%4(kK>=UyYkWlRhECGR&eRsYVqbZEDg985fL;QrT}(-W^Rt4q>zFh z+kQbc)JI&FyMN{}ojC)GhKV{&pz{Cd7T)`&7DU-&aApjFMv&V8OrsyVImqNbLI7O* zj{-MuzQPC3Oz#%ICn~?wliSywnUcMxF$8nH1HvIP4+F3_)P1%j z+2O|L5fEI~~gI-CNDPJ;t=HNN+0YTRw zwENEy$=L@(K5$@3%&6OhxO{}lBu3zstX8WM8O=WCrmlZs@f`_?gk%EE9b;4GX z@imjZMdJ{M3!F?6WB6-3Q-=%`t^G?HJ?U1Nwf9wL^VEzY?@?d0(OP< zYN~Q%EslU)5Dk}~;Ml+*Q%hf9+3N9djmD-C05srwFpbs;c*vpgf_&b!s`mYFRr*y~P3zjRE6(E~eD@D;$cF+S>Ay&W zUK^y7k°Ty>zzS@8tP+b@cm>ZT^ne<=G)eB^k;fP_QViOhN`%r{^{E@j;XJ{wqd zZdK)6(2s$xR?&0FZPI?wTU1wM*RI>3btX&TNdoKyf65bu|ji{QC6* z!NEIxZnzg|-o4g&v#F_xtdt7r7uzzhg(DVO-oh&X@MG$v*N)&P^s?OcNv&r@8YJkc zgc_Y#QBZ1R{twX6oTT9w7slICUv_tMg0QgU9=nAUvS!4JAwaJBrhg z*Nz~L5j=Pgvfv7?66?pt5B7vmIm`26E&J81miF>6(~>zL5WB-i_WcV|ReA_}u&i)PhfR`bhoZbX<#?D0q6w&#uPvY5zQkCeo#ze;n9w-1UL85!~FGlwv>&N*3W=NeldHiaFHjIY|b=i)~H-+a;beG6oA5M&n3 zYb}6TL3)5NnX23m!xIxOIMw$-9mO(s+S0k!ls6aD7nwrMU2{zTa#f|7{^1Iy6HoeQ z&9`!i>ORG)?D=FBFQsib_w(l{5NuF>O8tr#AxTrzeLPXgIbAHOi=tsYVHXybG543K>TOk@C(lzrX*YVBwmQn8%4d7dP{#$(_pb zhNR&(iC;8V)&BA+Q1wt$qxZ8=p6C`^A75Yb9Xr-xoDYWd9N&HzN11y4nAZ6FwJmql 
z4I_bkf{pAk&uMAV1CNKvBK;?dz?&uvxmgN1hhdNGN;-jczZc9%g5NPla8prHm%PO* zRUx$y#}!H`#PFGO+Ey4sqSSo*s7B{=M8;z^fM0;?uE4_TVw8Hc&gpm1*uA^1qwuO1 z9};Gr{rdVem~lpgXE;M#Chsv&D}^H!j;I-Y_*-AS62;Dh(7!$i)*4vg6DRu_Oh69> zl)?2O8&!m{BCWX{xw8NhF^_PFKtw#(6hw(FB;AXdNG|1LfUcbkp8=_bsTj|(ts$K6K%CVN;RS!Y60^QX zo`OWG4ILKP3t1Rj?y(n{JTSpI^* z^wb1YW}^?AzC!-zzklQ1{M457fBuDfjYbB6eE$Buzkk8`W)7w&slD$BUr;8Iarjb1 z1?4;FEMMv~X+j1A&I(qzHU{fMg2#BF4H8qpvf@S@$$Z-q* z06r0jO)O(xm#Ts^V_;MZp05 zv)=jhkM&2_9aAQ=0SioQ)^C)B@|AqV8#ge*M;yFu&uR1=M6!cP4)`9JNSZT$wWw#G zA~ilUF|MMDpeyUg{F_5H+u;f#vzhRCpp2^T=qO1Efd0}E6%J}L8>3pHdd0K3o-v%f zD<4E|5H%sL=15U$so7SXjHP%N*aR@Fx$4UXL@OpiF^$Cvv-JbKYu7%-^^m%fcf}!0 z4@WGYQhlVXL#g@!~2{?uzycxo-!eK&kGM00wjt1n1Pvj6xWyNyNQDy=RY(E`)qB6 zaDQ0DO!f6yFz$Lg3D-To9rm_*1Z4adTn!0=HRf*Q-K(%|SVKa1-)1pU5Q!0TK)^a# zjzuBhdgo4_Akwap2yg@lVJbo^Ry_$4!3I@vF*9TjHuid=008eu_@@zwaqRF48#*71rZfO^=f1L^CJ=%{gnCe_5n6w4 zG7?_mxj|^pTj4W?0gnbNgw%)-JtFZD>IoWDUk>;h#9a)%I-U}8crk{5dT?R10Cf{l zdW#)OIf#!5Hzmn}ra%T8Z#jnY?Lw;MGuDFHZ^$5$w1H-gh$V2v)6qk~;0JxIYy7`Y z2?~iaI1sUmyu9l>bdvuj-4V&HIMd3YtHInI1Yje2@6_t@dBRVUhz!_#aEqQH zItda6ieu&1pL9cOKvLq!KPXQTcV&;~6bgureDfXn0}{PW*mLY*;Ec)A&I^D-;b4l& zZM4EMXip#~Al1O`-Mm5Q-)5G7w8gk#nMtpAigAFRTSZH4}VF1yE(kRo|%uE8jr!Lx-2R~o! 
zXxk44|1DzurJ@B?En3>z67b+;L7!;KtK^Xke{Yv<>9@hhM|fpzgC1gHVzaZn9uTCr zP`7VVMX?qus{0maW+qYx4&eL7GSh}W#Kp|@{7`G;=y*S~^J^vquU2<&WQE^I(X7(_1*v&mOvnZ}WKhUDlg_|^DY4EAb<)oiJR(FFrVZo<1R6OCvz zmvMD4kmnS0Ud#0gwnup<-RjF1e5 zJ1HK$ahK5&GKe0zPYf^&1*24>o?&_ftu>19D}C!ZLA%#)fkP+REYq>Fjw zSO&|Q+)213-=crpydhfWB!d0lYO$&wkM|`14>R1eunTP2v&Y5{>93ir$YOtxlM{nX zmn=xT441GJ$(thjNN3{RqoRj!csAS0?gt$n56{&W_|ckst{?TH#43J^>$UAY+Hj1Q zNCr|TJ^2_9@K^MyW?UUl$4N&!HFr3Ret2{*E2AuIAtkMRccv0Iae}w8xgz* zu+VRaer>sMTaqWkHv+pEgv3 zymuD_Ze%a=F-TF_QuBMN`YnpQEQmn%qZ4|Rq8zyTrHfIMBcn}$FEzhR2Wu-c!6`nlwXl0aTCir6H17as5^~d z!plUzw?#qWC@&t7pZNLy-FFRIUcI^r{B7Gdi--HMOqY=Vn+5bh55q8GuuOkK3cFv` z?`I5}7$ie=Nt}^u=!SMJN*-K&E_NbvS2y1^a8iAQn~;m$?=2b@^1<`6(vuTmkBzA`Wvh% z-@yv(UoifBL2f%VREVU*#dhEJjJ!v89ydhFw&08GFl zW(lF7?|UC^Jo0;UjnSGCx6>|6_S4C@0T#jXgBGfG+*kv|o}+FjmIQ3kRIDmNcn_p? z{(TP=T{uNiNMpd^eJm1uX^V%>8ju`?wuo1f1WM=&;qjb&ytIa-(k3Urf`||{ibg;U zM7V|HkiasSu?Ync3~++@lzaD*9D94TLG|&!n<7LoSV+#{1+vhifv2FJhsqe{3w=nL zN$-W5g{t|}*jOXJ^ge(f#DT2pw;pc{k$4d9KLWBzv^M}l^RdZ}3_5^ek3N4x#xB8< zR?KL)(+nlZ=f@4=zN?IS1*HHpuRN;F0~mFRV7L7Ft0@2UASNRda=vtg0^81W9gAOT ziXbtyD8vQ)z6fK45^39j3n8Dx5JF0vQ26`Ahl5vwS_gZHJ+86IBCP8NQ6b=m$b4i+ z5Hrfs0*}m!oj#80K*ZSE2)d_r;61r6&%c$)ai6k%JH0 z*w)610C5y>#Keq~?{EEr=Y*^z*d$;o91!oVaX^QL{XPQnU<}(NwI24C2jU-K5}f(* z;~CZ%Dj>+rpCPaZ1|)Ih+e0(;@$=^hJRcCPlLi$~rln+ptQlV-wx=B)gOvw(^PwpP zY-awR0~jirqfLkk2Da(v=U1apgk2}?FHo}v`wP}zSqg$L(OSc?F-RNt?V9q`u+5*P zgzgK65ul%#=p3(rcLg;y>9+@ZQ`FLAA<~8hBlp*DR>5;98O18LXK45{@U)V@xQO8+ zcn~4LWD(04ZUN$U3OOlq_RlXxF`J+Se|&m;qP!FECU$mwt8BEV5Xk#Y>mc4>SKz7& zUL3n7?pcBKF!bzz)KIK++Ro}H$_C@>oADxe_Q ziBEv8qsLbZw;ScrGTu4_}uV0Pqg|CZZ_|W^;`t z(YRkUU&L;OHpzv&VK!2A5zToETSL^Ftn6%zkeh52LyB+V{MD7^<$S-O$VfW)PSJ2# zmtS|Ihfin`r2yRIG^=Jurf{zzZA5OA#QH0KQ_-feZE)FRPs1gFH|6B|s>I<5GI9Srq61K1=;hHXfV}4FfNtbv9w1z&P+1TcqH?$^g$% z{#fnb#h5?N7b-1QDL1i{yJycxl}6;AJq~D~``zwWRV|f%2&zI=fu_CqMI#(MM7K`n z=3_;BA&TpgHuV;1x$p_}i-Hgzo1|%#t_v-}UzcYst5)`}d#)@4wi zP7Yi)%w7iL063qVb!)KCgZ)^5)r|*vxi}Kb-&C$1#ly&nppCiFytwSNGzy@NmbFR2 
z?LYb05rj@kgBokrCRc5&92A|~fVz=vC#G659RiLh21Z5^Ma7Dt$8Z^b8Xh+EtPZz) zjg+-M+;5o0Nr3U()$ZMkcxEFEvZ;z``*Y)pP*u$xC%hx7nE#vyd3jQVj!F3dmJAgr z`wqiL4A^4e_R1aFp%$=E?6a~Ldm!%IMzw%Gzl&(UkP?8zOVDsuO-Lw_l46Q+`hlER zzvgjXA~?err62Z-b$^|LuMCBYb;cZs@{;Hc}vdyPLQEGkKqwyLVq_r(SG9Le>Hj z{EoJn(KtiCEEbnLR5h`uH33zEo{|SqX1kDCqgarVO2=dOMaA7Z~CgCZ#i^8?hc1eiFvi}23^Z-(xQ zzue+NBWdHCn(*qE@$7j4y7U_-fG7#!K@tU_-@$o-Z9t`dBKk;snbRQ>T1WuecaLOn z&SPiqifg-tYV5^VAc4fahJIoM#$Y;F&xkz(w|A=AIouLU=~Nd8B04?KvoK1F?AQ^9geAWz6Ff%?OH11U z9p93%xrVI@Gns^igg}<)q6q)IL80_uk#sGD4oq!aOQR}ER$;5ZOIIayC$A>MwPtKpl8J7MC z4!y)X=# zw6IW|VM4-@xC}^)P)leeC;iRDKj6NZ;f2X)6x@m9=rK(evC6};H>O%+PH3=gVlH-vcizsx8c*L zPZ)!3`VSy(G%$z}fklui{xo1zYer0 z#cyH@K}mf0nAfg#glCP6`9KI3t`**!6oDmZO@KDTs9Nu)Xw&rqRT5fMGHb)Fzcd`w z2_6~-k~dF~6}>EY+CveoaZB2&1w`i!(aZQCSSt~l>BWyx`KsZBhn;#8u2Yo`429t^ zEfY!>>lVEO!JYAF!(flD4G73~7;=Gv??a=GWLS*xdB1eG?wH{Dyxgeg?qHSPHqQB} zlC1>W55V~nq-T*VJ|CYs?b%#Rx`xNTr_hQFVk5u)?AfykZ6Z;@zV{r#9>o{ag zsmdq4PKcLLQo_tNf8E82r=OHeNY{vl950S?5CM~Q&V~gZt`w}S*ZtmwcAd&nE0sf& zg-IQ?_X4nMS5{Zs9InMioF^<=Swohej0`Je62Kaycc&u40(&aBIPS-q)&m|SU+M?+8* z0LM7TfmrEecamxxZDFh^K?<6h)R9t)5H*a|qQFK8U<&sZoHi|FRvf70-=yw2gzgrA z7_6pH)Zj6IFCGvLcyLraX^7E3Z~zEP%N-M;N^^A7=yeZbz$F3?E2OWfw+kTwl4K8J z$A}0Gea13wI?rot`%gAt%=7KGKjQli(`P7Wvfz5>aE9p&qd$>I(EujfhFa7Zny2eNKMK zV*~I2q6!1$G>jc|@{|0@umfJ3>Vay4pN%CJXYvn3V?ej|14YE$z~)4(d}OOfP|+TY zcPA_%lvQ~01p=2(X=-T^RbUo>!*~PoH%c-(9GplqY1fI6!pmzIv9wN5 zdg2yQvEh&fa?Pp^H!=mnS`hGf7qA^}#Wf66?C7|G>x|LdQ0l)XgLiGmauFA*mrWd>&>6-ovY zUx9=goQVjV1`t5(iX;dOr7$=fL&!34_+l;#Q7NXu;Ei><7Ew*;gZY#^L{Zj*CGTAF z1#kquOX%O6ZA;UnVMTEmn^76}{74}ib%nofEO`~QHlK*=7~*rh{4hrPX{HnK7$+== zD}_|6aypI6EM8{z^P*#9w1D#q0iwr z#%`2v`RE#uY=}uU(Iou&wHsSKA3Nj>qqm(>J|eI9Ge(hO+APuWV!|(08wH>hg1_XA z?MuK&77QYP8phBQfLaMxrGobsyb#!Xx~om!^P0x%op{Yuk2BhD%IG8jt%5vVFFtRS zC?rj2a`!hN!(b)MAA4t&UqH$wKeAFt!Vj7ck`OUn`&$}$sStVGK!MYN$?n8^PBb52 zj-lyO`2+L>zq}blgSfo>OxDV{l;GoiR_08T(|kx>A}_(RZFc9Rc^w>-6V7bUWC zT*>w-A6J13KM&`zp9HQZsynA6_u8>+&XaIm@8dPW1z6Q1m_IKF 
z7!%W2j~r5M1nz+0g9<>`DF^|83(1WT5OQ&FybIN*1CHzEk3pVFf%qE(} zx4_I4)+TF4WI@TH#UH`t8bQF7Jz7yb;^crN6*-Pl3d6i^qG$nf2bzK;jA4(;M4v1jz=+`k{(JJT;UxsGyI74F_xju@h7)87M5#j@`syPfOF#G=}Z) ze!wrUMnME?cpMV3FhP-$9-9m%gjDJ+S`O6ldN_YjK088efPxc}@ld1OQ9SLA=!gg$ z%EQx?b2x*DS|OzUlpZPY3CII}Mg>VFhFHJB4vP|481qO$K7vi~Re_$MUipNE9BO6tv-9ODxjR#aQx?Y ze1M*uJnaP?C6I^#yy>YBU}ICqDM;!T3OxKd+{T2z2ShhCL2C|?-Ty<=c?a~^w*TKA zC8JcxEG0!DiR>tp2<<%yNp_N%RFo7-vML!(g%(Otscb4KM9PQ=$?o?$@8|pb<9YsY z-?u(ppYuA-<9M%w9Ifj_KNay-I5Y-F%<#I%jNnVH#g z1Uv-D2L1<sUSPq%d=cPpye z+0_R9l0tT&I?Mh0-G;2L6@zxR7x~frum*%EVwEzlegH`oBD+m5W(JAKF|)b~ivd$} zF$WYbLFA+bKntJoI80)Jh-DUOoZO*c8gCSJkW;rB*>k{W`S|2)csAiywf1F?s)xE> zb+xrS7H?iP_VxRUt|BUf`XF|26OTYt*W-N`QcTq}eR|Cpd^5?W&By5qFbHux1y7sM z!d246Fvu-~ffmJpi~8kqr7XACWCfA2MTAA!>3IiJj#MDZU$*RxPci#Hv_DNGE#|_- zix)57+LbpfeE1rqKF@s`N2|;~G9z!>-xQxa{)4T-o8JhChF3+0XXFQw4vaW4*4|Aop1cY!o)KwMnhhS$YJGNR_Xsr#yq zhJhNZ{QalQA7ug7(R~*H&>FkCzSg!ANo+vuz?M!dcdMx5>GLqUbvC&FR_q!9#C3t; ztejG3mHA>Lzkq{pvww>>IzsY7D>k;h*(r~RK-_W6P4`e26jSQNT0IZD&bw4fMQR+4 z6T?Z)3P(+UyI%g^)4-=|Lgd<|-sO_U#Z#$dHck4y>P7xHpsxaO2EYO68YhOQK z`?PnLl}bZNFdHzm?j9045gmo>;x5o3c}gO^3kOAG*8-Gw7WSJvv*?x=mRI1H9=!Nf zW__;xmZtdf6%008lD0eayIhWG!rq!YX5}TPowYtsfK6$=^8_x%@puDw)vl%HhDI?> z{$?%#v9b_cTLd7IpA$h=Yt3FIuT}X zhh}b%d!U-3F7sco2cc-rc1w=a7IGI5rpPSN4w*STfc^L%y6>l~L_KjER8Dk7o>n8X zrZIEgT6EqN@LPljjbo=UL(>zo{Xy|!%2RVSp80R**9oH2vWc6qjbdI=Cm#NWzi;bB zWudaWetBIo9+d&49HlWO%ieJ41#RXew$Y#)_q1y@EvDO_Woh|DHuV!YH&f~x9=DR3 z+FrUK5u?Z5X!a@DJaIEO=TG>kA7Z&6PugE+SM3>mX?GYVm_D88Ia==CIXV5s`?MB1 z4q*god?nFodlbzsC_9zAWqnUz6$UqboJ|aZB5~^~kvH-mPov2wLV$DKX7#l~4j64R z${(?vyP`T~me+>9;ei?t&z;}b`zn2mK!1S&XzQsDuZ~uJ=-w%#2vBXD`wqi+rfJ&4 zrrxz z0dX21{oVR>`zOH!QeAm6`^3iWumd#0^LgXA6R$d7$-6N6#*jYs`dj2`=BGT)C|{vX z*Gy}T>ewjN{#v*0-OpF1ynL0>^UH*VYKKdoT2$6MROU85Gz`bDoa1Vp)^ehD>Eg38 zwxy;I-NyrV(SM3r0mk*lojV`H|N0q^$a>4_sQ)*}P5(c#cdfhlkt6vY1l=W=Wi$1`xByc`qm8%FsLXj=zCYs)h&&@SE%yd}_ z48jL2pUnC*eHt{@^3o&s7wPy6jC5$T={q;mcdoMYAf=K)Dfirvt`j#k4FtWIf*lK; 
zxD>M-+^aCLh(8qZiyIygrAnh&7ds(^H7vJo)lv&ZMw*a*)1N=$HWW4laqE*^wx-1W z4IicsGYwcp&hs=}3Op}*mCHp7*_bhcwcAvgAez%LZl!%~ur?K51Fk1c_h3KA3zYX8jRA zChybiZF?BOn-)=2jW0lh07m)@8WaLS!v&I_k{60E69mf_2C8Oz+h6Fp>YN{3$%JI; zPMm0|bgMm0skog*Dm`6__$>sHtD(`8RSA>|D)E)re6@Mf99eWRdDpgQ);gm?;i`}t za6pX>yU+i&A?$w0LRy;oB6dz0(DA|G2+0`~0@j{M$kW}VkWbBCuwcKJwv4UxzQSF3 zdE5Oumvg7L{A#8h6kRD-u0G5js68Ii()Y4oMNAe9$ieXNwN`#+sg*P~N;P{r%cCv; zU}^5XT*vj94Dl9B)*!MZ|w!5o5 ziXJ>@PO$0tY(6u1i%baSbOcSffB83mv{97Y(-!@ zZ2b7X{6v+ip1=mPp`JYKf^CFspV#{O_3P0`R~s5iKt(Yg-B4MJctrq+jDX3XR?`7N za+uc}C6*)HD1;@^j9y4)8@a}lIHN1WjgCP&gDeVnL4^m< zXT;lO>{`ZJHnQeKRDDHq1A`Hc+yiDWgx8dQhWV?kL2Nhz=pez&?OOYjXY`O&K{uF3 z$DgO2B}`gTMWx})HowG#1W}i$*hn+qYQAyNYHXql5IK9AxjzbYwTPZ(?Mp?TtVCxf zD@0{w4NEdVonyd9)3WTK?>rvEZqgb3_K(rwRZcn3 zTas8l;a$k9g~ulD{10jUb*ACU+WJy;wUW%E3`#&m{ZDNDUZwERjIrnQw{NG}blUQL z@@h5Ux(fiP3XA6|Xn~6eA`eb`<5P_CGEq$Z=C2dfO z*i^HoE5Ugd#@jFVXFAq_C44?S#B2v`jCM)?09`H+?38)uHz$~?2}SGj<>&7#eFl&O zzSE%HIXc_A|JJQrQD zcQ1A3vkZj+41w^FTOR#xW~3Gg2LAF7p&^jr1-UAyd$tcmZ?(r+9(`bAb&KQmX?A>S~5>Ig(>?3t~H zxk$_`A_w1f9Sh)eBph&08pJIR_dHNJYgY2u3|Zsm$s0Hzg6QEHynm-g-2)*cR&{8v z=YO7UVCm2U9__PHK1;rKc#5Qn7hPWJ^y7DG8?^HHg&}zUCRcke`Lo|)uARc_q#R4X zyv^t|x!UMeeZm?|C=o&>EM!7+bY!HZ|FgwT?cS+UpMD){RJObMCFXy0R*w_zFGDqc z;=}?V+*Nk2J8N^qRy_uZmI^lx0*v&N%PS*Wo&ZlXN-9cZ)MmskumB**a=!!O@C?s` z^3IVC|93|}2dt5M->lPwxP0wySkI7mYHZ1+ zArd=MXf!!B)~ipd?yTMTa&4IOVMApxk&t2w(SUCJX0Gq9>Tp6!KZF-EACNgQoo0CB*63Lx)2?W%?p6Fl&l$=B<=Ja z*A1vYkhyPPnenqXhC|&w$Yg(6U+>oF+^th)r})L2jr-Jg-c{ZIzZ^`Rb&^AN)*855 zzq||Id%cL=9}nE|q}a&o|Bq3w%{aU0uHc08Kh6c%6uTpN|IlJlCP)Q_Y^_CrNtej# z!-|h%-cxcA9PXeR;MCcf*wkWfYdasC1;<|`&LRVPRzb|{|F}|V=w$cDj8}9MR9{9i z+l$dLh0ZbPPf&pJBUX*xQ#;|N;~2ahvslp_8{{UbDGQbZAEI5Yx8mvuR zp;w`9DCR7kM{ln1F?ad@QQkUxc^jk*cm+ijf@{PeXBmo%5w3m)f6Dzwp=+E(`-mlx zzLS4`KW$owa&%eS=G1qIi0^w* zfh~F4X?JMCg{OjFiw$0J4+8NyLyVc%Z6XrJ*i&oE{}cqfbooYp7|~Ag>1m>gvbM1s zSXNq!NW2|tDVEhQ;Ga;Np$i_eph+OwplU|+Q~*ZMPY_Yz!--#Ig+rcd!a3&bvyaAZ 
z|K}@Edbn-gY0)@9oHAk~F&oAx$OYMq7U&WCGa#jxtge>Ie-TPO10a>y!XJG``TWO2ZrA&-F0w8{`0~6haiRxbT zn3h&xsbRkfJ#;thvYGg@C6b4KjL-lfnSwTtjf)dI4}h+U$@pjb6E<;B56rNR*vQaq zJjr1$8_7T$~=4vr==S z@Kb=RWXBKg$7+SsXkA3K|MEvFyjgJuA!3~yT6W6YBoY*$w~#i6f>(}18G;rJ5F`J^ zp)T5%%JYQJO6&|VS85T^gzLDm7-NUed&SUf5Wlz?>&rz}EQk#!;M%@n1wSpuMh{c! zIAPVU8v!qG^vZwG{NsWr0u{y}~-{Z1LlckkL&Nm8!g=1)^4?d|=~ znJU~Wt6s4Do!7MAw+HnKjfdrW#6V7UKz{w|L)Cch^FiYWY_YY|_V&?@TWt3s#eKZq z=4{)A`}cqOJ6aA~ZODR){WDgxUC-68s^s05i#Z$m%I0UjC9&8eMLe!MslYM{T&juGH%j447u$5?kqgn%)vs<$d%p9_5YO(&2}3yerzme zJ5FVz7#v`kh5SzF1wgJ|y?q;qI01$aeF<888c3=>GopYZfevIR9fK@ILNGee z3+1&N)~v~A{{fDn@dNI-l|6*v!n}T)fi}cCxdCf^F&9LI*?JqI{Tasu&lZo!Tu2U3 z?S2I;{Vy;dwDvQK^mUL8uim^V#(P3}7Qx?oQSvo%P`-SUpZp%1(QOEm!+SMmra9rm z1*yTUDHv*^FC1xL#I~VMI`NCVW*)@V^O?3LlvZ&p9w+c)u{;;tNBnf;R-(xgidPUU zv5bnw2$^cG`xolWnun*7Q@HenWE5GsrQIh`3J^61p~c|RXYs^WqzVtaydTw(SZH9kQ-rhLh7!7#WEF zOZg1vLd1}j70fr|r>e=^HzO`gO22Es^wK4);0t@Pz1Pf{YX@bz539WQ=jWGCv-ZsJ zNb=Y{?Q5{F$FFZ?WtQe$KIKZy8CjdX_J1V-p@T4tixhiWQqPHBF0$kRD25=50U~9qzDKKre(Hak zT`2#2A%3`(A1|UA#FW4hX5x7bNj0J!Bbb2y>Y4J>yCg|>8=%D{@`n;K1r@I-Z|!J3 zg>sZ<%iu`svO{p+=*}JjXbEB&&4Kr~2Xj{0`R=Su5ET!vHx}guD4v#H#58#i>f{{~ zZ!y`>7zrIaeun{sFisW0V7uC)?wU#-S$^B@}bF;9TJo$*Wr0)71(;M@;_Pk|KacH({uY^U$ewcN5Re{deA`z{<4Kk~6pVd=BhW z+oYkedP7GGpKD>k!Q;}?7lcKqC>-OLxmaauqo@4)yVT7pDFL}J1=j%JRc>l-O z#Y0-d37G-U!*WVFqJMAx_Ki%gPSgZGKInG=w!>n{Ge-iX{37jFETGL4-vlDTi4(nh z_pb^VHp{i6S-9g%y=;wHdar#<)#|ITqPumxmcIBz^WS^h$<6luy(;@|<7=(xn5|oW z)OM`Ox?1z$Ys{>*gQSLROKVIu>@1^lUEX1a?Z8f*ZvE7-RgGNs>gFDgXWK7ea~G;& z>_(R^(WI6^cY9&yWnzN>uxy;7sv3wahudajwL%iN9E1&FS*%`@pl&f@qa}mU_)l?0 zC_QzZWlCEXIVP)*72=0$t@fZbo=*p`ZX=%quJ0Jvc-qsenJ6#9)ds@qfdsQ^N`-Em zBbsCH+C*y@Q`txtDBmTqHe=}jwohX!a4R@nTWvjR%@c&uq2(q-kEv?(@|*Ti0JSpQ7-?Q^@k7H5s#{QhKQbz$h? 
zD^0qF#rwAj&W>hzy{_HImKAH)PI0enQu^LBn%;BJ^nc#RFgA@vzs8)2tl^qxrPTE4 z{l?9n<^98w($HUI%ADBZsS>R-xp(i+0Z!95ydQG$JT>*n!XrcPov7#^aVNTLLtUeK zWf}MO+GotBAD*-a>3a8Nlx4e5dD}PpRopR7sa{p>>N07x@dWFO>7$bDw;o&fMjAY) z(%7TWLU^e;G$Lq{h8|~`Gs2b{jt4lQP!%O!Nx+v2D~`CZY47HShK6*S9RQ&{MlN5r zr_^&$1mR8(DMV9P!B|b-17uvMaHuJ(sw=K#>4Y_oJ*uEUv-yF%8^Q#6=N~w>YX3(O ziKS3NV}uY;sBH)=0r1`h^5CoCwVK14?>;{K57$&xL2M?Lf2IP4J>Tx$#xOJj5G9`ZD@^+YTPuJoX=%WzRL|2@zt#C}U zoI7_eOZ#lc<<6Nsdw0tgOLKDrpk;CWBAAi#?8LTrJ{P`F$tUc&2Betj*JUhuC^Jqt zzP}&icUklMw{PrB@?)fRn6Qv09w9vsQ97l$gF1*jPQtG_nLmGPNu|irf(UI9!7P4s z6NFp1!qQ!ZkBYq3utI4eqZfi-xd6$(OEhPk_4DgmAC)Wl(JI7Xqs`yHo2pKQ1Rq+~ z>tOh6tIBPEHSQ)_X;s~tvUNagoLl6}vnLWzG0xs-wDrxPpPwuo{LK%hEBHz=kEaS-hFFB8O4;gMY_Bc5$z4`nD+Yjc+rloFI zr48e4BRh7n_1$VNxBi1_ufDcHTbjbUD(&fcxa;ty?5@X-%O0E9XC64SN4lpcU!V|R zhwriMc$(W$4Ir?m&37Tyh7 zhw9;LVHFiNzdwH#!TD0Ke6i1z?(ui{^3if-ou3bjR2enuLqw-1GBwWu+*m2|rAuBa z`YQa4!n49al1rEsXf-`9Dp}{BGiBJX%knn%dq}se%sH3n>m*G=Iksz%7(UUR5hoPA5{Wghvav zxnCJuOhpC}gPqS6-9NGeY8=fH%Y{UP>d;>7UTP*$guPd5_%G#pfQCCPnrEM;H$m^L z@fGhg_o{8|yRtoJ%(kn(AGWvoojsYme#}sQr-p+IU0o_24=j_=ym=Sh&%sYRtB(HJ zv`_AfwUOV<34eMH-hKD@*C)AudvHAnX5BCU){o}a-a}rTOtCvof|`bo;}y4k9+tY3 zCIN44y`dy8FFkNn*X2hieDoAk*2WJ9I+{uivYvi?vDJ2|Mu!+9`MGlsJWFo#IO18Q zSXo{Et@5qAd$qZJV}fa^Mos_z5-4Ii`xlg}l>VVl&1UvQ6+wD!+iT`qu=_XZ?*^sz z{V7>BG26@NRFTkZu~oB{XZJ=3CSkKOG#%OiJ%QDa`s-lxGwJ7fj+kL zaUFnyv1Pd<>bv3dKu|h-=|M8xU@SviUgQ;swQ$%5MEGiY`!ABURWA(HR8+K=i3)f8J0R}>z_57 zrIuBuwD_t0#Vh2N`N}xDwrW|M*VxOxnswA?Fch7?Sa!z+{{?zKTvmk zz>y=W^ft*&>#-J_M~1OwIZ6G=oDzxGf%%q^HN&QgV?Y;GxFwAqdH;i5;?I6~EaFcT>}yPaBF0YC)?o7R-y zL!g&D03_TPnyh3eAMo4q*iNaptpr-jxxpePDeybOF3Y>VL{fKx&cX3HslyS z8(`Ls@s0tIZxBJP(8-jW!E$iioxgvZK;$~6rKFi733a;-1~0A~`abAcrZB{Od|Wsj zBK9&b8}qu-{-@*|V(v(M&|}JP0&bd`Jf(d32h@9V_e8k>a3ueO%_W%bKnz9Tq@Y`K z-tNj92}}O#*DplnH-y)UR(j~CLU+Wv+^zyDd0F6($lt2thJjPO4+QD-o_h8|@1o%B zON*=m43w_CpZ4v*K=*fsD^+Gr(EF4c|7$CcMdPW(Di4*&8PseN5H5Xfi zA6zo);?h(mx|1*8Y}%}Rw|LHk^)fPU9kY7+wcqx$GgD^8lo898-MW2fnb%fdR~K#7 
zw-x=0)ea9d9jUeCgmH4*1rz&dpPjj0`cE%~g;eLP-nn?>^s`EHY%!etsxW=*#t`I0 z=@fGXQ8)`0^yK6V3j?A%Bnf(W8@P12ox`A2=qUXnn%9CM<8*8O`t^$Y$McvwnL1HE zDA2>AKzN|~>`>=SIVd!t!=ruW2ecQfIt!)s$DJ7JiI*jK^xIv+;1oR zKM@f{Kv8tL!r=;@TZ@emO&+gb9OMhOPSS5mk*ZZ)UMUXjn;D1wGg26xa{BD$GbukD@?|; ziZwMg!h%br5QI+E|35p>;^Vgc@fLp>am&OTI6>VB-i6B&iL6k@{wGx(=h_XHSZVzD z$G`Hk?7w^gq^MU-x0rSQkez+ohwnZ(nBG3Kx!ZcL>H2mR8g}f}LYj{LHfZlJo#N2y zN7t?ltbJ>E&h*QCCwu#?&{Q+4PUYS_`Y(Ey^=O4{8_L!o1^{iSs4zT&8avtQNsVj0N-JFeFsi)KNB14)LQ9F+L}--i;~NbM2+tXTq-UPuyUeh9k~Qi z%7!6R5-tU6TleH@!k{Il{rNnL=|YSJSzHq$f}W5~$gi(7PQZY|f3Hthzsk;bX9#f^ z4Pp*PzBp7_xijAz5wH2k&}J}V!qwGsd58<7m_`k+?wDPoW({U(GP7oq6?B&3@E zo1EyW-(nooyK>E@07SyM4}T6-?W?#aYffl%HJJCVT>%c|tL_i&s-Bo|`7X6tY#tSf z0GK$)O*@MOh1VWo18VrA57K{v)>v$ot8>OBG`4X)6%1KzXQ83u1`=jJ;gkmQW#T_) zmMDj)`V8lD;*|N#jzI%vg4;$-TBpKrD`LO8%cuxtUs7^1beli~-O&C~{LpR>*DKR~ zInPuQW}9k4t_@lVpiZuJuEc!!pR|k&QF&+_kB(oyD zF)8c*T@4%t=#|hEQ%nCP6z4*~i$tTHv;O+FZGoWy58}!;4wjT$l%PDxN;SmaKUVI8 zq^7BTwX8&1sZZ^WTN{%qBbyZeUik4_vVOYOlE-nKV~(_b{QX%X@yZp6`1BP zU5(ztcoj#*U~hDcVfFgSs49SqdC+VQ9ZsF&^nYQhaddJL8Nz}{*U_;slvZ5Rb#1oT z1kXj&l#`0fQ$@a;a_JmAbYz^oVy+OcgRHDRhlGY71`Ws29Oez-34lNBb@*-B^XIvX zOHIv&6Ew_Z>o+*ldhu$s$o^ohVx!0soy$c_FfTFD@LnX2t@d_yR-^%nuO3|wifVsH zIB;l(ji1^l%iTRLQ=&F#Ae-C5R@PPAPq-1FzxJMHZtu;D=I`W|U^(cR(sP-MMB?#(*{q(Q zXaNfR=Q?l+Q_j$h&;YrbU*vaYq# zouBO^+@kr8&I?C3w?YJ~hOVgfd8kx`*sg1?&Oz$oye-7;NaI#rO3L5 z9V~Hwa$y+ykeA7s&%?(}GBWKh;z6K!ZszB==gkI%hebRR9j$b*MCj1?Q|>f@0TYOK z4u+|Fs{+juqJ`KSp{iaU!?_--RH?F9;&bj5HvBJa+ zhPQ?;UFy7OW}>Y6sYQ$Q@`Ho>N_^0|S95Lp$dQWc)hAhb=e2xPH(#>v!Mok_3FCY< zaQnZJ``=XJ^dHv8}%nGuJ+z56;P(PDcDJ}DXkmoKkFyY~v16>u}{ z*_9g??@2_BQ8()M_ufhQpW}YD$dsO2JTlzf+9ondJ+)G@-aLMnf=+Fu!_8$)=FH;* z(*DLr8mc7}s`f1y{UfiF4=W#c`_)>x?&Yp;Jq+g_G6*rX)EDBuWr&RC;5)xQ%LW6IGqU_ zw3G+W5^>k`oMUu!mLaCz_3r3G0opS5lHhhY|q*Dlu-#ZyZvh7!_aBFtS;Vu_o8hi4cYbLA@>?SSV`J%HOu;Eeo6jl zX6BezC#Q|xCSMl0>~5fJO?u$=o;?*u%}t0d>_pL?(|?k+|i&iTv4)Z7BT?8kSAQuppgw1#-q@Qn|fJXwL^*5#QimN3dL zbW;PrA@q3=0Q{Q2zc!#H30Xb*?dV~Pb~0J6LCC-0eWxnpeRgfG>oH$l~wYE^zaAq 
zM)kYrM@0pVxd>5kWv}{%NRa3?se91-85<`IoGNk6$L#jc1Y4&&|3wO& zzd3;{Q2LIrvU*%Z91*m$1=!~iK16oAls6DEeAOmJNycSX!E8unLJM<&*b z?gLMkbaj(*qS(5_W&sVWcolG-Cw!Ia7Ac*CIA8c~F@)k4+ij~cLTgFUr@XQsKbQ5{ zlC!q{&h0COQ{4B|T%Tc?XTS5xmSqJ6H&?7uo|MyGam~ZFe_e-cQ|;Suq+ejjwJUmm zReQ5Td*jA=7OvbGaZXnfTxEmigq|M0NdM+TX$=RtJx3x&8}0d;Y}IkZf^MyU_nKHI z+B!WGpTs;#b?jRPfIGN0_h)BM;*>c-tiuLU6w2@nqKr0)dbaB8m)Ua9t`rWhSwNJn z@Lq8~->%PDEL(vQ>ff(MO@xU*o}6xtEJ>J9w$^wU3(7;M{lb9TqizD2itsoP1Nf7L zOG0#v;EVkhpJA)b4QDp-qJHKg{EUCpP(4vvwv?+AK6_^OxcSWVLZGa2Fm3(2#dLc9 zG<-a5kwW3K*wH77%{G8BqH*EAfJ+(<9me}5O?$lplgaxxZsYpXo4eq-XO1N8r)08W z(!4`=>G8zmWq3=C(WoA6K_R6>pi<<@gW+DdeA$h*4$<+2hkAbVgX}fsIxaZYd&8sS z$MXvcuJ4fwh|+FUgclpDWbnAKqM~8{y`sV(yj7Ea{4h;wUvqcCp@L|wIZ58;-}@>Z z404^fJz8$;v zI{GN%(B>Tfg3wKIU=#YB|?T*ma!^>XMa9MTu()ci` z1)EQvKm^+?6bC}Rf+a+xdLX!NvfA`T_>kyAT4nnVA1;#jLqnIbf8mI1=b9r?Q6eA# zq{xZ@5#dY`+sKLJ60ZNewtor%5*jx%xXI)lUrxWwNsMhYWT@|jdWgLc%BY0}zR%3Ag}HFc(uI+|c}{mvOfBPs3ZWr7 zf0nwd_xjdj%jV7EEfqNd=rcCsp!=7WFA+IUHRc~?ijGYf@M-A95)57d zH;5Tx{Dkx&FnZV&^|l1_^IyUmPR|giY2oRBzCrZ|05F>HD-J6 zt>DJGCC#gl7ApJA3UIu>BcZvAy8S%ebq1q?`EfrhlSWYy=2Fw#pI$n@#s(!48kl2_ zcy>e-3iy!-Vq$J8r5>u=nune9eY~Ri!uD?DmQb> zgW@th%~>p6Y{)jDkSf{v%aBT%9tk zcKs*&^ug)*L#NqTs*Vb%@8i2(x7o^H&o#5t8c#{dv^y2w|Gn84xT8E#e~5-{Z_R_X zj!&-k@6xlccBksgGGDC?_IUSs{brxE#fr-V{bIf}{rm{R34jaU8;hZVk|+IXCpAN) z7Gfu9CjvfcCmQIKl{>{1_W>M*JwSL%54Vm~Fs|PakbZ_Ek$%FE16%41Yb6p@Hs?iJ zjTGcm=5wEYV9~Dtm)|!{ABXx)z?XmkO=^h)CvWV55P`&zU55|L9@pIv3mb`(=#;@o zqOP4&mRIyxdV7cUyF#^3JP-Wd}*AN zS49Qr{ozIlU!9`ho&WwC<<1$bOu@q$EPB#>S4W|qet~UX*_p_ zSz>M5zlFEVGE;KT#CMPXde~z2@t0dlycbRz>3X>Jxy-$5i_a^L9C;9cWKDgkh!vNb zlGVecNhgM6P{I?Ts7?wG3NI7A4da?XJNQaybc;AYpS;?a0hD0%#a1pfav89V*MYSs(blzCfx(M z;)y2y+KP%Dw6rctyNwshb;H7K-u>-R`Wd+boO@F)Li&jG9_t1j*e%ymm`%r zcIdFXC3lfhOz_gBdw2J<$q2LzdSW^3de)VFHwP!%%rZIPwP=!$>HO=$=Hi{|5YSoj zW~b?EhxZty*4N>%VYgLM3KIFV&&mXKDk#_;d*Y~C&HW5d*?Q@Uq&{yXC8W)bLJS`F z^Q`{fv@673Q=<6O3(Zk2TYq&c)Ezay+vZhIyX|{h>9B9Yp550^OpTp7Kfm)HY!Km8mEDZZYpE^izankRD1Z3~;ZI ztpFzSJyJpgQQG1|X~!k;Fl&tqrGMr7GtN7o 
z{-FD}@D0?P$Mge`WQf0Uj@U8bt)&>zjJ%{d&bT%T#;ibsh1u2?_Pwxgu>7 zn*>^A(sHqczugsACw0d7_kPYXdUpiO#Sg!{9Q=5#(IjEnqEORD_J4MW9_T-VEbu8I zObzr7;+Exe)ipFM{o}z5O?KTtlER>oh1rj)ySX818KzS91x*6ViYqv0qKZN%N2xjg zBYKYu2u{27+7@mU>ET^}_~GvZgcKRXja4feYX5=sBMv>_av70YKj~V?{1}m%_s=J2 z+g#J_5N5#AA|eEGG~O_E5c+*^sOB$U7+!*eUxjZ^B>RXkKoXrKLMtjNENyIFYzij4 z=QG<}h^GkHl9Md&GNxZ3Is8^uTA{LwK8y>G8fiOgMY(jv+H3ZzMlZH@&CS*87PNm) z$-$UY?VfKRVtuK%Tuyyi;p&C=wy^j2V&P>EkKO+FOW!vYCz$F@l9ZI`u+Ti>Tma|l z!sq(>{PIM-&6h9A99+{_;oOp$^)3HnkA6d^P2ZYh>A$LS!M9Jh`wu%umc^f6iAOFi zJTBclZ~I^6Lq#X1`tM436cpJ$P+osVyYZ_vY**}+kyJhWifs~l2g*vN)#Ec;zdlqR zcHcNk@$M4a-=EBNF6wTW9bB<+dXQQ0uvxR^zKK`i!;^i{)stM)_olY_YVMV-c$JpK z6aZK*uWA39;az3Kh_kyJQvv$8q(XoR zd|b8c=444i%BG;*>xr z7(*5kN_H`ci5W>GGc(;sL_{dN%Brg;E-ucpP8PE5B9D6M`Z@8^Ys1XdPZ%4Og|GE9 zPzvu?SiJAAX5OeCU(L<+CM-(SzvgUF8NFwqfAIWbivEpUGTS*#cSG&34ILBB zmwUH9v(<1|@ZVljaD4((+6|cC5lGb$iY9<^Hr@D%c^MO+&>pjEhb+RLq~7_wZS~3S zTfxw>?N9R@SgIQcJJ`G;^xU}LA5nOw{d%j!AS2v(LKeqpI5Orf4eJ-^g-7R?j95{b zf?%W$hC@WxE5DlUIVV-~*{3R?QmS2F2}wMXOJ_E05f$g!wXNM_N*O%FhrwY01Slf- z+FG}_%|VG2{~;oDDC#4$4vcpW^#!Xr10wR5Ic1K@LBS_5PTfP$+4Qsjjjg`A`=n-@ zyWLVbQ`yKS3E!x`Ew|^#1fMr`j(vKe{nN~V)LiKepPpvOYACn;Q-8J5Ah>ryP-doy zVO;F)hh6nbw${tW?AZEwXKz!_(X9y&uAkg4QQ6nQA46J`Cc!^Ho_U$cn0)kp5-5eTl+pl*cM9%>I+w?R0p{s1v zd;?xA^~a6%Gbeeq+ty~+?@7;BRCC zwBbgw!I!L$OtrCmprqvgyW)Zq>2Z1`9uH@$CewYYKR+~5@BGybxy@g%WIwbFj*bot zo49F#<-5jm8HuXBvagOUSOCjauy#bGWF4DL3qsn=zkHcJrEs67gnoAHc?XBAcA2ZT zJi44YQDfHmQRm%n+=>0(ORup)vEtj!?k@)?nFN|WyS7EYhiUPCx6r6-W@#3xeM#?g zTDj6D-lgW@!*C2d{n{eS>II|05Cv9(coIgo?jKjd?K^kU@)mksymsxcQmQShyJ!Wf zm~f+YRyPoWCu{|w#!G*eUMDE0^wUWwD_F=0|58mW8jvg+lxM07(#!4J!UhEB^y zj7)UP=NrS=&z2|-XZ!Z;i#iz%ZD^sTYlbezbIn~nBe9%Y;*xdpqVIVMYqJNEG(WC? 
z#SjY#N&O=aPE0NE-}csI*fRg!*PmWij9FB?U-71@ioxs7KR;+Wjp`-oSGdprgL$8( zw@8?fBMZ!L!2%UPU^2uD3N(7lD$j{OI{WXhhEbNEM`dI*(9;2RFG;vcr|4g~D&%5X zT7G$XORl|nYn<()dDLdz<0&^2ma`^`i%le?J_&kt)Qi;ln{PCku5su9$Z2^VEO8~Q zjD&vzC~ATGPO}LWzS|(~C2?i4o?%zP^5C5Hf}%B`W1e0bDK%hxEx~oMWc`Jwt1a>b zBo?_TUX*iZ^YnHvjW?7*QzvvC`%CrL2};39MJeei7V0=d%q zw-1~PKX2USr+?sD{?J32=_(Opw6469LuPT|(iWeBAI-0PG9w3_TQbM;yPsd-n#_#U zIlW%4ucW)NHE*Q(`@aQwD{y{#YNu+S<~Jul7b%RgT$CD zPa25M(tU@@>KTLR7YH9y95?QvrGYa4f6lhj73~wSo+GfAPzeyvWrC1kiH^~$_QE(1 zZ`V*V;^e5bYnJ9@6%ko_j1)fF4S|5>`es`voroyP*`PRh)!k#!DLMK-H{zsr!Ihq*&|-2{Dd3WY#-zT%k{?(ubY@EKilMz+&8cT-0sL#S2vq_Lmxi4HTEep#x^3 z-CeLC<-V;#sEn0WLG(-bA@4;#Ik!5$`lR_U#m3@w*KD(%Yc|UL-u}XZz_ zJ|9bW(~sjv6p)zkMgSr+eDTGy%r(sE#-85DdE(={Mnsg4NVClDYtBFCi>FrHsI<5_ zpM4G@BkPZ+)d~UU_ozqNtQ!P^coAif(CX4s3(!}f zG^{$7t$0a>Ju0$ol{e);qk*Ah$kZB{DBWu}^EfwvSjH?o(VXJ71Z$H&!z1s()C3^5 z?qxye0op5W0))(+H!sFQ-_%UQ87NtNF~L1nRwT{loL0U=nrh>Y6DW`C42ucz*WK2Yqv5eaXY{N)mW&f%RGm}o+XDjvq_a9TU@VHu! zmT^_a_Y$`KZQ6RGxz4{J$0fk4W7DIvzGgjy=nw`1F($4ed;tiI6bWUap%{ae3d1M> zez|??$jAa~*1&Q*T3%c#1sqDzNat4*%MTzH9LEqgE`0UE9)8VO!lNc@tWcAl-2N~7 zff_5Xev;U8fqDg}jkrSymBc_rpp4i%5wd!rTT5?{Aep{>cNb2b-=)_QJtkh2gU#Ta zpE=oI@nkwXIf=wv-s-bZ0=&~gPz)lt7YQUkHCRC>rQrQhmPCp)R~({MxpoHT5nlSE zQxxO}NX*$EXmos*w_ifT9retiSM-KCJ4)?IIRD~Jqx!2g zF3z)Gjhvo0=VI+u#tqC~V%aXnl&xAA@X#MMee~ zyIJ>8p1)_liVOcCFaJ^WY6ObUXSjtlbYipKj=({m@PI-Ws}D;&@-Y6IuFIHi{Ttq! zE!uzgsrBxT8$PQ2%gD^rdVFqFR(6m_-pQ>hwoRYdm3dc&{(^uq9N&k~gNRUaP9}FM zlHmvE7Ac9*#L=-oqw*^VPp9hWIBP}gEPxy#HlSKL!0T0;o zT|5p#!HA%1ZZDxhk(OQ@lM>D;fa0vJ?OhtCr1(7MNJX;0jiJeb!02-t-kZC3_RQ}v z%<{ac;w(EiO_LH=#XXnQ($b|%&J1?Lp31-``ii1(7i6vmarj?D^8ZQ+^#aAimrJ`*nl8%vs&@c<5*44)sxK4g? 
z)#|FG=jDAxMMdebGk`R=M3lDmvxxi!i_KR~K<6g3Kk!2?W_~soEvlb95f({sVgvdj z+VWA_+gjGBD9sw~S(9OWYs`&+7kO8gZuVNU^Wi{aB-tX8iJ!;|IiPGLGR=eZkdKcH zl&H5~;8C6+D{_fRayCecM7@EHw$On~AOqjs@TPvekAS$F>msgbMywT=>&jLMF zyLz#(@cO=IQOd*BBmsH=9Jk;5i^wgxuY3taRP9H*;v+5J&G&wslNmApd{5QBpP#NQ zFw0nJ5o2PMzPP10KmX;4sB?F2-!>f77OKmS^kw&z2*>C-(;;^0(Er368rs875b1L{ zY{Xup^6SPoUhE&Fm{wM%`NGs(k_V0@{Iw6;85HW#*$b_Q?N?bmKc z611$7&n-2vR9)-jz4`Fq6)hKFPia9wmXVV*Q^Lh_M3W>`lAK?IxQ@CiBAH zJ4Ywahr3n3HcrwMX<$gugajq}73#~tPZ4egda+%+xSZZUUVb~TO+r3=j=xQvo$~Mm zbub?eU>bA{R(|2HSaHl`X!F~c2Z0Z*AP?hC&Q}S~y83z7EjyEedwkwKR2FHZcqj;iAYG%Oz)=XlA z4BWTx1_gmQZh$J$qg@Vdo5HR|aKcT){lX8N8%b}{KijukOXI%aOQ-;P3Au>aF^97k zQ=14XU}sHlge;|e+hFJweCf6P?W$^V!VSXk9)k!)Rv zvIZ`~D`fqn$OGr*Ub0?vmTZBrb&9HUUEhv2#~Y>J@I2BXWujX-z$MXl=$E*p9w%Dd z%ep1Uekb;YT}Svo*U@n({+}@{vxRqDvn$1D{ zWC++1f4ZNvc7%e?IQN<-0b^a%rgt&bi`8XC6OEXx$^nP$wf-WrpNnWLd9m-G*+xQ%t zsSdPO^hQP!=$Muk6f31`i(PexET&6shP&SzW`&uNADB#y;`ei1K zKOe0)Y|+cNExiYyC`fa$AMaj!XSCUw;ll?jAZ++aH0DGSx`%B2)+gB}b6aitK%O6F z#?QFH$8w7Rd@3cMGdzN%>>dKy`JDL8&4u%BAB~SQihOkX%(h(%bh=)k2q-+Oc=YHI z2$vIHHjx&U=B04xfU2hM@pJw2Tmn?}ALU%W)$rm%%-q}VE7kWee{MT)-Ro{2sz>xK zZ>}5C^c3?fhDxy-1wH6=>i7SJB459KI~Y_`iv&QC{eStgHsh+#sKaKeXA2T48^>yC z-Kear%rblji4R5%27ac==sb76h>CH`J$No`;G_;iwvVa{8qvYK)o%H6vscZt63f$B zVMS{!Qo4l&0wudMNrAmgZP0&j_=B@YT(ygxm4j9wPpPb;19^0IJ9c{OR}EcKuT>md z{%9N6-Bc0p42&%3b+Jr`R~Gm+VXT!N;n*TFcmDhy z;;$?$jGzBTjjRP!259E?K+=-Yzx;jEyt(Zz5QfC&ZKK({<%bO$l~@yh63bI<;}DHG*UB|YC!!jrr@;T(b0|z9)#)q@QNn_>0;9LCFHi~XfgMP2(c536c9{ku_s8tx?4xER96Hz z@peTa!Y5JYOzt&ll_JM6UI>K9^}KQ4M#6ICe`Z<1;L!8xcRw(Af%(W}Dbb4X=RJppao2c zg`omJ#3sWd2F7^PuZCXbE!t4+Ra~_7T@0Gs`5|J>42k3lkyumL)~2aoS;1-$QVxWY5#T$Ju= z`$CH)_ZwTp1}Q}FRCYs7`({o)LaEIkEanc0SUWNaEDNZzkAi}clT*x^EA(75f>wF0 zYiZet^on#)9nR0FMHh7Ov2p%Ip7x`>?c2pfJbl{~!2|HKd)1Vn#i(U%5GK6x7fwON zV#)C~nkeaNq2r<6`~b!kYFTk{a=kiI(0S+H>`bYv?`#~cav-|V?Blbq$}4&@Vshxx zc@dk@>#G%O#K$ltcBr z83rf3;#B*oG`&Iq zG}zE-D!mzVkgtc<%6XCi^80|asQq0dvq78TJw5oCq8nj3;KquW!=#S0L3I 
zs?*wI;k5pvvUkoi?GL6r$|`;L*%Ll{d1Kq_U7yZ>jSB;n0f5l~P6PYItww5oVoY*D zo;m*5bE#}wu@`~ReW8Rwn$-W=#uVG^0`Y=%9pWoi?HO;iQBLU2Z9`9TVN0tg(#2ay z5kh_=0rMxA01w@a^QPpj%wd!wBzA_sT2f(rr&S&v?oYh>HZ_gYn`OLp+3*z-^c3I> z$V@=}mIrNxM-j{0#9m!fHE6buVhIOdV`y-J&%5v+LHlG%4i|;UxUt6u!Mj*L$=6P6 z5d%=j3g=V1cN)bydk>XbK%y+GxQUzv0#Q^}?u?zS?)1lohK&oSx6*@&5tupw4xt+J z|LA%XXsp+^ZJfxMAsI>;qs(Orp%7)P%wr;n%u|seb0SG1+EKzNkByik8n?z(udb{dR&ICysl2RE6xJyNCN9n!S&K4le0eys@K|Bthf$Dic_8@G$ zo94#QC<1i{7NgYnbZm(KaAv*uh4gJl3jlpxn3x^R>G&mPzZd19SPFt%gN}G!@5qLB z63~1!5TH~Mj#C)G#>Yztydw7_l+KvS7fd15A>X9Xrtg*TeQ8<+uX;KyVMO*^i1sC3s;0WFw4b;ScPr z<+><(E;}H0oCBm7EEk`yC64y`8Na>PYV$yLGMOoD$W*5?JikNe!`!=2zGSeT9DX3? zSsaEZmjE)MPVa7NbVnQ1>+=1;bJ59VzTSVcfuaZrC)GAE7Gg24$qo*XUs@oC+5&zL zNo}&ax17)g53}>DW3+HU!TfK9=NCQzAWy|tAQ|FlgqAFI;0_);h}cKe)f;(!Ve+Dd zLEiIp0i*0wD2#~BBwE(Tvlhkrgp!IGc`J;l!+YFCgZ{N4Z&J(>P0>Z$j%zLNmMpGf zY|g~gRPz%Uuig6ap}Dtk5}e4RDa*!94+-%RzX>z^6fFDL7K}%Tm;T(v=P(5ouCoMQ z3j0d`jN6_7Aktw?maONc3;Ue0PUX33QDoMqwP!t-l z>eBl;*`(vZobfK!FW6d33BS27)702_$nkbeF8=Be!|ud+?ghG^Ox1Q(RUUvJAAFo< z-mWi*cX_!TxCV;(#G5y*^ZH)GbQg4Ok|l%lZsh)7kMcN;ED=*Pyj_fqsylA?4ZbU* zWahiz`8a&#ePpG(xS6FYf{lnhH1U-tc^Lq3Gm*%osI08lF>?&K1P#1xf}JhvhCHY7 zK6V3LBzy(jvD&!)r40J)@c z9}nEG-UN&ocnr{WUAI1GIk%C=87Q&LJ5u=#L*P0|kT3irR`Jad1PZS)4Je@!AXoMt zKMa3E7?9tJYyI6;4FC@e6o2;f4L-s*K7SSgG7%zrX~^LMKpvc|__peGk0)V-JcfIj zP*edAfr~`}e~N0SrY$u#=SQY+d|UucxZK9OAMbcq72}4ez&W~VC7}p3=4o3ufbX!ZrO%rqAypE4X9w^0q{H%ujc5{P8dtCGB@5-u$J-4 z6v-1Xd;r309{N+XE#~j!u@tdt4@jkwOj_uca4ogYauJby@Bts~ZlV&nm}?SQ~SfI#Z=`xOsG>z8}CA7>#@g9Sa%MrgYM}B zlr1bnT+f~w|HdaNT{Pk*lzzww(POLw?M4@PJuyD}A8&VHf?FLq?wMKdux#z_USw_# zJPkej3D1$F=c$gXzkr7uXd*Ac$;mwL;Di6u}NTeGVFiI16dzX1vTv@iMyn@flb{>05~|{w$5s##9)_>ARCrO-wrQFLrS<>nRXIBHE0A)cu*A!+dPh|r$f;| zelK*l1cJiodi?X}7f`e~1F(;+X%Os(tuy(?`{GifG%XSNvTN+S55hUHvf`Kgw6!nZ zkLtmh@ec4PnhuV_G62gy6Bm@Qa)yK_W1?E9&~l%TnaTo_2VZB~2VYz*A8@FjKygX} z3~?`@p3@V0S0vTa)-+Tj$mpfDXC4O!0T4j7#<~(pX_J$lG_rs{6wYy!H%3>)J6{Fv z_ZLSKie-OlZ3JrzJ?*kbs_e`n@3YXhLU-(ntwKDahZu 
z?VF+)zWa$DHdzE&Bdzuq+Z9S6KD+D{s~$XXz&%a;A~6JFhF1Y%0DF8|@4mlce79uc zgWUC7svJYtA05%iZmLn`eE=R8@sA;q6`0SlB8dwwgoVd8w>T!G_X|*@ml8x=}0Vju)73{|-?9dTkOBD>xfaU?hJU;US9)p4HGp9jb$Hv(q zFAttrJ8h*ic8#cPS=2MMQ~)7HRvT=jNHRLCe3M`;3Yj@os|L6a8)&TG2X(~-eW*u1IanR3d4`29<{pkyMF)Q zHD4fVeART1u#1;jwv<7SMc68ZqnyzD@gXaVmJRta_J)2w-E3S_2mBtg_X`>RhSmzA zBJ+VCBo6WR?b{#Jx7!g8b{7vV^XHE}M;-}AKi5f1qD?UsW4ufQOGV?m z{IbT$B?T9ATZs1s)(GV&F2O+luEVh%c|(rwD1r~bga}YiE^c|!+oF{ss;81;4!E(| z+1X)ob~~+iWYBK+&{b1Yjd88jX*Gw6*;0`a*pR?xz7R)Wb@cGAN4HzpV*N44Z=+w#7+#iSb4 z^zkg;ekBI&59sR6y~q|6o^kD$%sP>79- zL)68iCWq7xlz>IA(%7I^i?YhNq~z)=k?r_+Pwq zoc?~LUANYjMO4&ocRJ|+cU-%T8N;T%JfGh)otb%(sm>B4#e=e*5dQGo$vhN%%zxh& zOKw?&cJbKwxKCfpL0Gn6A17dc7HGPx7D6I+%!mGZ{(W?MSR6v(2gmP|EE-s~tNYNF zWaXQD{k@WyzM-=`egZG09wb6?+q(MVzRz zU%xi|$4!`5Tt-F)13?m2sc^Y@e$~>c%%wvx?P|-%@Y`3YS3lB%C zjGa4^p<%S!`*i!6JLdFQH(3YFzkV~p!29IPM{Vu2^!N?^4SZl}(iGEb0scT4&xVl? z3f@bpsj0~i=QrjY)P00W6;_N6XFx3XC9|T|96_TA9c>j2jQt&uyD_;lpyg;$v{CIz z`re|QHf#ULh5!7XWhMz2V#d^>hHY(4QwB%Uf~A52PiN=)J98Q-UJKBvr01Fz@juft z9KV04C_0+4HGQGX(G_%95_bxevaol_PFM)kg6gwhxN!;NOoq-d{;djtL456(D_(jGo10WRNN7XJj6n`w%UO;M1uzVPF4&efIE*v{W4;VKpxm;3g45s2Ja{CmX%56R zJ(`cW!t!)duxILCb&dL9<%ys(W%8+q%wVnE)ZRXomHZ-p109R`#C&M{+;IDUmj<3Z zxZ?o&5C+nYEQprPC3KdP_N%}BwGE<0jd@EDhOEGAz zC5DzZjo@HPC8cLro;QASYWI5KD8wD-h>`;&N;`LV1DNoT99PU>(T|amiTuYzEXC05 z6Hb)8s+V@GU6D6#T(k~3%A*7l9C74DpCgO9_}Qx;7mcN&qbz3 zbH@6sg^>jvBrd}8d#ywYmHp$2E1_mNwS$^A@{zKfo5)bu{#A;PT``9Y{??xVy%+X< zlHkEiAxl{Raz*q-i1D{TS50q_3d+c zE#zZl+uL04g2D(RC+O$F#zi1PJF*)#-*kd75B(ky<<^uS{Q=BjYmO0Jc*9{~!&pFV z$2I%io2#)zaaG_T_eN|RNn{SDQBhGLM+i)B=+0R-c!Tbyq zXn>Qj35l!YmMbiPVKCTAOG}F}vVg;>*0mi5xkiUm^o`ik6)D%R$4GAVXGHXRxNlQZ zuAJ=>xAnWR$Dn$twdEp5-=V9}^k@4SokOK3HdiLfvQiE69Ei@P8O5qZS^(8thp zU!=2m#QAtNdre>+`cxA3iTMKjG8vy$JicHk6>W+J2M|Y2&IHwu2(Fz{Fe^Gm?AQ^3 zhRPnis0(0K0cQjqXM!sCU%ioFO}}3d3kH9dRUSNK5RCT?4#Bg$kJ5*7UYCXbJx|st zX1j&o20+iJAd$EPpaw2}FkxFPycMB6gAc&wO!#8^YG}zt^*bc+B#(mo(qx^OAsfmC7JD#S z9N8u(@aU06b5$n#3)pW-vK*C)>mNuj`U`UM&R&w08FjC{*}8x&p>; 
z(81HeOnLRy{Jc-J41PQV+$t{|lM|IFuHTj(5AD*_^9OMz9U}}GCwD`^VS0VTOV~_S=2j`(=CMwSS{QMwZ z*WH_SRa5aWw;zmu0^Ti#S~`{~PukipfS3t^fLqd!!-&EH6AxLl3HUjvas=h&zn8Rg zZ@BO)VFBZct;0PYcLSoukyp|2%W{*bNSLJc=$ERo=eq=;X1v6-HLhFscCet?A-Lf* zxpbV=Mw!yo)zu}`S6Nm@uw#-igqpk3?OlY`y~k4vdC$a90RXEX&MIC9s2?*>w#_1! z=i8a^yl`YX%3Kjep9KV_6~=|?@ExIL-X`*WwfiGRhrWQaSJXPhE)KkS`45B=lfQ;= zMvwvZQjWmq1<#*MPWK^q%XeXtFwbR1Jk1{iF;KeiB5BW%}OS8R2jmc}8O} z2_-@wjHSSK9Q*rc{x7jt#()8Xb5tJ;(d*z+fdi-J#CN#bf?tr-BxcVBH6E5V?rKA@ zz0qV7YZ%=2PAF9Hdog_YKU5WWY|b=-d~!b|jbQ*(Ds_lvvC_cRz_rMTtB(9#!ixvS z3_~Jh`RH!*#Lx=51*+q++qYAt=zc#s6)+?X6`d_ow?|lh|AZ9jQp$#{Td$6pYj_nE zcjNv7(+eX*dkor%Wg&-P5&N|a$t1G^la18W8}^*(gDVTh;6wp~2dH3H zY|#kZ`~TD-NPDVl<@C@uTTXQzy`E@{F95i--gOB9jG8eze4VSa$#}#nRyTL(S+-|Y z34$isnsiPR?`f>Q;ks&@1e|X;7w{G0_JzlawACj}Z_xK-dnk2BovyF~16U~9v`L8F z=}H|)srRrshy5VN)maF}u92z7%XOz_@%k=ZM`liZkJwh6~1LIKnon4`L0IXhHn7 zP0(!uMbib$2gVmcDN&Z72_QBpTv9fp5Cah-j2C_I*Ffx($!Tu~`q&fD4k2RH4lWH~ zWe~YK*yA_=vr(o%>2w(tDIvcT?-~eku`_3jh<~zPkmw zV~U=c=njbuE`5gucJ1m)`>UprEkkf`_}an1;wFaH*WfscVvJl~zyv8Xfn}47N90$* zAEO_TCD!qzF0-XsugleAL(gabw8`SW0W6No)0H@F0NKAFZCIrF@{X>rs)GAKGMGLf zR_Vh9{*|*F;0gpx6FWdWf_Prlf5_DbM1^Q5aVHYx){W9EXmfDZc3bGBQQy!9g}km2 z_sThROn9%v-vite5-5eEfEbOzBoK*6m(dgg(lalqJ%#BoG*&lji`cG+VsY2OwuKDs z0n)D%To}fT57o#SQX~*CW4t96wHI#rPN<^!6ieVwd@{&wurJPN6_pEmNd$bB`6n-L8GC~M|90K;N@IMCyi3I7A0{Q-ZLgOYE zB!nM#uGDXS7cWI}O6u;%T>gQ_@Lm!Z0E!u}qM6zF>s%YF3>zuvCXuJ={$cWdJUF2tQ8Sm&_6xnh#McUGnrlE1OzuJdC*h_VHs&-z-kLY8Favh z$GD_<`X?{$Jh@tEYY6)51;Yps0zi#!>Fk^fFEzKB)3*w#)v?bK+3Gt%Q*60?Chzf| zy}>IyLssAKmK{C$mx+Gkvui>f7cw$QS^snx2F{bf~#G4Vk1aS6QBveXmnV6Yr zx5(vRDtIKhu$km_!)d7*>p2$K1VU9%RBl=Mzxdh34_Du~_GzxG?pE0kdB0^{fb{=J zJ(6Dkk4ArzsA{xlz6WBGcKUr?O~E6Pg)|7-52i}_bCrW@ zEtcDx06h@sr<0521qIW7g6dr<;$AcCnp&}ny$chKT`Y5_n#&)u>*x!09I-r!gr5=R zlj)0zw^=ADmA38YRd=2~Y3y$~E{)P8J}fNE=s~FhW*Sj~*01wT1cQOWKk5tEvv~tr z>jhY9g1y^%E@l%MG z1i2B~BOdG>kGN;MH}flS{aM>o=^lCT^5_kxOj}h^%kG@c)=M~(F5bNPAo<6!@%zno zta)y?0@g)D%$?VVa|jNjJPcO}#czkWv50+OM_yTNZLJKrJjPWuCYaObT7|I5bJp8l 
zu`O*!#n5X2RNW8w3Qn`$LeuBH1_egl-Td3Bl9KoZ*GV0V*cW2gx$SD&2eYcJ; z`vjDfctb-)Psg`R96HRrwkBJ*)H%EYKA>5NO>9gIt!MrB>$%1IKB{<~%ErTjHKNmAyCBOqE<@9#52yAJaq5)O?5U)iP7Y&}}I?kE2NseXGia?5m8ILXbA zs4(u^s`5&f#d{=R#Zvp8fd9tSI-loY#aa;m?~p3Ee1)e$>rTuO+N@=fta z8pTVYzLdm#2Glh&H}rw-21qD+k5hwuP+oA;@U*jBS+LUQf%RVo8q=g-Wl|0$#hYbc zFgnWba-^;GSli$)t$zVs-FG#{$rIfB);dG58toAW<=7S?q8NC!^X%JQyed1sPPF@oBR64B5B)fHK*;>B$8Bqj!h3>o^J zF&2v!Hy?em<$lE8pgp}MOuBgwvwv3G?SAqk+{&!v!~ILIuP18jgnLiSJ@!Pih?gMf zlc$m_BA@R($4ou8eQL8Gvk9o&n88br2n^uYis=8swcPDQOXjzbYQ9r?l< z8XE2kC4f8&r3<-)fd#&L91?`4d|u>+uL7!5z>P_IWl=gXCnESC8bmC9B4LBO0OF+oWEK7MNA+G0br5*3n3=M84B;jwu+~X7|maB!;rOG^&^lHb-DQflZl*X}RNd zQwk$v`<^?HK&c8-K_MB&m)*mV5+4KNu*;6?%DsJA6v}2esM?%lbAX0 zAE=JjaPttZecobt&u8skwGMf1)DsWG?jI!1(w;(EDF`u_>R1Q19aV(_WVl^$e_0^!=Af6%M7oo4flD zuuslm+8KZpR7|lQ&;SsHucv1hP-RlBd>Uy81yFv-Vz~V} zr#-UIJ0&sTD3RHZ{Qc7oBpQoc0Mks8)&_?uHgH;yjESeZ1B5-)o9BOhQNWz!Im%qV zp98LaP5FjikA_uI?f?$Lgku-66EhwWjCDB(o+Z5Ou*8Xbps8WBhIfoIF?d#1{*_>L zRa#ma?|45cs}Lf%4FqYBgDorFw&1}N^)r^s|4NB17Z%Q+DTW7X6)9TqfPTOi?<*Y4 zvGa-fq1$jR2Ut~vQbYU^L2Du;P~u)qY_4&uuBiHhv;`2lm|UEjuYtJ&G=lhkqj6~h zLjyuCyv}U|c{`!ajlOYX3lVw3jfKPn6E_S19mz!p&;F`U` z0SC+?2qY*i*1TRq!qV3cAMk>lOAif!*bI0k9ZMi4rG!RP?m6#3?)K7 z5X(U)&dz-$JSekZQv_P=6|ho4bQbNYLDe$1n_5TPL>Sa~d@`~t;oZ*3$w@vc zE>A#VN+8L?-T6st>v}L0!EV*Y%0%xtSqr#_`PQu~wjmu+)csW~aG%iq^!eRxA0ho4 z>vlb+gL{^)lJ@<=HzWlRU<;mu1-)fnHA551#Yyqr zqLtC6hNzZEMD6MU?*xa1Q8$kbOiCQdb`pl?kmN}|1ehecFJp%NC@Abs>u9Q*TsLAk^JcBi*FN);<>YgwdT67nPt_SpD%A@G~< zVZpD%r9B0f#5j%=V7IqOTzShBjIZL&{8{Nw!q5qffs13+z^3=?*>iB^IItDK7&SQ5 z3chr^RNxN4W7H4X{~se5_ZPsA!n(R{azvp;iudLx=AWvnJivC*JG*V0r+`s@b#1K= zkV&Jig4fO<*C(}AM{NXp06Gs?ZKiwMzI^%e0~WZaO-xjd-0dNg0<E+r?usqM-SQtF})Ao6!&QKBaP9Y#6@uL zqOI%&ky#Ar*qVw3{QRUm z--t#VZgpUzlVUKZo`HaG`L_t61qRC9MBK)_|h*_0^Rz`EhS+%|n`gJ3vOsiNr0s}u#l6V0(P6x64`>#Sql zG>WScms)Vm5UY3un$4|)thg_s%SeC;9+tWtMyUXW9ZoP~zz36dcpvrp-pS4NLXINv zB0ft6o!B^p9g&b(45@7M7+mkKFw!}UL4o!Ir1Q;~-x&NOr=IgVS^GLLuyC^?3=}ha 
zmk^afxI}mj@OmKRdR6Ie-7_>KlikNi={9F`F*e-k*2ba&qr(s5CNOfqFE;-q@0nr> zfWorEg##LY!sTy;>-Ob?#)9v1ZKN7Xiy>Yl^E2#7(#!T+FjZDs>DY(7K%+N_;33mn z+6!-8xN*S~(FhJ%8HnB0>W+J)B2-GNpd3D0W1xz`;~M?1Z~GtUvHk?D{+vx3bU1P1 zUu(j#@(Uq1bD?31}aQTMBmtTNJnYJ5CKGZoQ_<8NKQV~ zf1dy{D<*_QEe2eRcy%IW&2WaleuUzzmlx5OuGHOWnTP?q5z{Cb0EpeDfaep~*5uqr zGf(nyLDA4P*mkyo(4D$*5{kAyirUD?$cTbW6JqWgve(Sm0UTh&un2-M3YZrl?PE~Y zqS}7fMoNKOa9#^EC_GkKF-ibzh%zug-O%wqbmOH8-<=p}98p~#9&YC?2($PR5lD!jj@Fa>krHtzZnb85gSf(iG}sA`1cnil~J&kl<3ZlTM<719;?8!HvTR zKLCtp<({FL+ivD*old1Fwy!t5d;Y zMHp}2(_3#;2@A!EF00kNVX)-@4h@M$BhE@xryn!?-%DQz1p*2_$kXS~WqIL={69QW z@stj>ccs2QK81Ph`t|~D1JpMa7`4tcQ5zmG&GrFQ`Y%@-Wv$!5!&ow6#s^(Qg4p%1 zy_jF|D=EDbDZ-RP=9!poUia$Z+ViSh$47o`^$~K})4J$tnmXU@EJw5-C@Gdm5FF|z zw);7)<#D#9hon-k0R#mN4`wkdDpU97a|d64Qj_B16{$K4o5Ki9^+{o)*QLRfw>o@& z|E{86hIvPxf++9<{syoPTR8{Na{lS`AhHF~br9Az1&&|hSU%-jw&{v*qeHW2E3fnJx$^*%~zb#6=i_tc8TK zB+M01k7{Fp%6~EiwR@_i*|V;g$04#_zQ%eQwNW~RyofNoQ!qjSOL;y)^AOtADZprDrP(2>r6^zIjz!>0{28JiO$=+{u{s+aaH;Ru&G``LoP!VvER zUKH*5k&`nv->@|O)?f$F9Mc4|qDlNbq-$ltKNi`F9)S@-CmXqY0b^<25w*}b5$BEyy$YrzG&qj{w^#29Y2VBp<5X($`vE=|0*nv$P>u=l`_pNY^EGkmmBV&}_ z+-<}zocxSm-t?pdWxlm}y7k>n>L<7_PdokosiA>Xjo&@9TJ9Ie-E#X%FNOFh^%l`C zFDG7$bkyGvm+f}Pl%$^laF!TjJ0}hiJ5lozRxBZBf>HCa>-MgiU%DzX&T{_*bxvRv zfTmywjT*Traiu{+MKVon=6-;L)ITs_cCP86_3PQOZefeM&E*g5JGV*K>+}XAfChab z0GUC2OQ4;Ro(@=V4Dc#q(G4YwM9+_xFZku;QgEDWOaJ$jobLML=SAZ~ss^0Mvj&~LrruMC_ei1NozF-Q4Y}k(OHI8Sf^?KYWQuw9 z+2?SWRQH_NO((3gW7E}u+m)!WL=s%pdMcRr`U4b#=453r@@B#z=eytFLZRuPR1ry&h&<- z+OnPDcw5Rfb@e%fm^T?_mFF-18XcuDz8i6y&ux&VVNGz^|FcLO?LRl03r)Qcj6&0X zdDB!^2b<`~2OsCx>)Nhx{T?h+w_zFmF_mSl;5~9(aub8kwH<;#ZL?TcmPbMbqz#82 zs7JQk$4kwHs8a^7Ne2WnM6cYv-SQh58tGeGRFwlw9xi?FyF5}b{O-Z@{3brDt0+nT z_Fg`Yto-B8=96Y z951EaQa;OWeCNac<<(b@{5|1!aco8!Oetc{2%#(PPs}C6mVR8}i<#|{f9UG8HxhW? 
zrMx_0tQMPsc5s+6b|(H2d{$;k{)g8dzqy4j>N)<;G)3?1503nXI|}nocOm9xy!-&K zl3;Q!B0cO?wrNSaAGE%^A9pRs!r~%_1{YUWa)gEY2Bzp!FI$$!-$W;}f{LX4 zYmZOvY7Iwds72Xmr$c-E#{E^uefRg69kyPZ_S1-!;@mS5FwyBSJgl|QI0)^$a@Dy2 z#3m2b)H~`e_=-yrAj6+`u(o2gCZo@G)BY->QeIIi+7sqX=DUW6hrLs;y9&98_|&Xr z$?Urj6LV9@A@?xtuVs45=kJl7W2soEtv$p9&?P14xWJZ{b06sKJ(e7xnSY(ZHM-odKzp*k@Qp$1;#pyqsI4=r3;rIv zS{)+>hd0M@yt0NT%f}ys3m^+rWgkuUdm}H;5G`~MR-{8=^C~`95C4xabY3DlqHPoj9It)2kAF z<~irp5BYm*-XeWWTRS-xt+9#N8=f<@;%xP{xe95D8=1GU78L?4h`$ft4+(&NUf$kU zWvKOfof#N34~JRjD1#=XhQTZ33mRtWJs=Ek=q-L{bnn8oo%REA8^eVD{ruV(^jyx# zamY&_v}5&X2wKqysD3*8He2h{n>r!5BjC!`m4Aw_pSI-KCeVx-W??uFJ8zSkj&vPu zH3MUdmoKA_n0xyO;|ipYJroCdkgoW5zaKK6-%IKmg??=`P>`pOEfkWJWJ(>}Bu7G8 zjEu)(cY1uJB3%Or(`)mge`U|~2=S`x-zkI7>jS7-5b^wlZPNM1NJ_c3HtXb_+1{&3Vmishlkf50 zKS5+}!&YIIlbuZnGN`+Vod=iNZeeYjv*HfzSI7+8iH@sYrqHn6)x1|G&e>VH0dsT58gS!OQ^J0#Q=I93|+4!5%b9yk8!p#ME<6!*eM%;fll>Maw z>)-!d8_I&Rhba{CyoD@DR78XV7G7RxCQ-K?fPO(W@S7d_4I&M9QCneuAWKx)Dg1=>P$9HOKYQi_j zQkgzh?BG2!Xn~)>QV#5CvHQFB?Yjjm1}dXQsG-OZ0237PC6y90#%8;P-ADBGg`kjy zgShs6SjlOol$H_g9w`sN5uSN(*8U9*CImv#Xr=*j>H|F|ED)eM6VY6@yT8{1l-h)K z8T|c?`F{|1G21T;JYk9_OnkU7-v#SH22r^H2(?g9>ks?U$I5u4J|>9+(5c+7Z7tHm zpcGC&B!W|UzK#mUe#Bxu4?_e;F;fR9r0{EbYcH=~pKbHf{CH!o&a2$mg86@1a#~O@ zTVf!BMeZLz2;hLk`hlLRgI*9rpC3C5?g|DkqZLe?!0xRS8DguOw8IR%)ypZ(?d`}glr zs~+N>V7tOjIEsX269BHOUhxHtA>oPw^snCA8q8t<(HOqx=uhGKNJ4NPFk1QPdt#c1 znK9m7jjw>n(_Vl5aYNF71~0#J0`j|-5Q49*>G|{L)4Zo09kGcebI;j~Up;}b5!B;h z2rp7zK2{fS6&2UkC3-ke03K+@6M>?eiZ;{N>8MClRTF0U5E=WwYyKr9%=OKdqoq5J}y5G=kKfIzCqBRx;gY;oKj|k@PSqDaV zWBXQr^L{}0jfqc;Zl25;V}OS^As}o5njV3ns*8yKlK&29<;Ihzv>y}MSPzfyL;QNgB=Nak8*TWYKcP~n2g2>Ir_;viqO&H zD}Mg@H2*>Xl-f^lK=Ix14?10rQf@nR*!am?0U9qFaEx5us?c3}e1E~*R|j2)`oOgb zaJF2#demaxe4LQuh!f(lgs>3w;BcYO!u<0}i_p0S={Xqc10By>Wue~=g$u!x(K)4v z$`*)%N{dI49Hz3CH&!^-q)Dy_oM0+` zUxrmLY#o1>krG($1*g+tO|Zm7!&gESi-;Hi5rkVh=JFOA%IP^ zy_W|enm8(!;(z8I3{4sBm))MefSbg+PliePZe)$y+TJoWBNnBP&Ke|{3b||mn;mFCCD5sFKH_!D(}c|70Uf5W%rGS^fUd>T0Y}R#@z&F#(O^!a^PT 
zh%><1dvN7F(Xm1=@^XSr{sN9fUqFyV4ZN`mX83wIte>B`1w;ip>YuNQT#=*KVP1|C zh=gn44%v;NKJYzYy5`ZziqHu%gE|li8!Q?30XeHmw`W*&ga(MFU531GUT#d}AYlCg zTqjN1fd-$Hpgp*yCnhINv-*q*?0R-6p51V&GwRTjrQ_KbNW>yJ+c8KY!Lz_G+Mol# zX1G~<8U}&0xaAO>YdDOl41_rc?tdi|I{k8$x~dbAAf|70e2|+ z`Zjk2Xs)~ux-dBgC@mR>%v17-!D0rmnc%mrQBAIy3&d#y!t@_6?(V#0=ds8eZc-!Cs{_5|2*<%bUf2Sw^@!($oh(=qY)N#j`~M4=z;agWU#yymuP9&DPr-Td?_ z9EnlK8uH+fBR;7>4WNz7zQ7Nxh5Yk|yWX~%^6wwzho`BSxYguKKS0|hu+X2BoE&xd zZZrM?GbCR?`|AGxew?$O_i%cc)+M{elljL-dL@%^4JC?P#uY9pv13LhV}yPMkk2jU{ytnT$QkEie1Gah{IT;}VGUn(o@PVGo|`FzutqhgIE z(~RO0?0wW_l{Acu-Whdu%B6RWVjY-hUQ3A=-E{A9X}yoXxyF52F{Y8f#M51Jd1^A; zN-|==-K}L`B&CdOYfW;;%@sjap7QdR`%+TbFKG@u3$l0GdEQlSeBb-PUEc!X03GtM zl4j>!<#`NPr}>iWR`8N4VkwF-MxvcreK!MzOcm$rmJWZnJ@#g{)!)Njpl$fe#DnUH zXTxoGZFeQSanVbtv2J%-2%v=EZtCi@mKAposS?OL66((czu#*ccRORSed=Dzu5f=+lkk${ zdm%X*w{A^#)ay0%q!WcK%TX8pJv1eKH1%8RC?u$8IGW7VZc$Y%K6xNz^EToUgA}Jh z8RhB3QNiVrqiVN=11KAxSUyppP3R9(r(yEtALg}L6fj`^@hMY|)|2sK71g|4wMFvV z$KtF^)l>`bSRGhSxj2`^T)X~iA0sn|K|;wpJE!&MPK^fgWKU*#hcz;naZZ;;s;*D; zWPDrm>J>w$NM^I-)Yl{7+Z4)Ps~x7yG~<(DY0kS{XZ>M3;Pzj;tMHfr!Alx`vE{)5OrzZ-it zb>5E2P}bUNRT-7Q(>gNyR^9Jm0Of|w6z|`U|2;F?^2e$IL&+As6VExTgDu_oc0cLO zx!wM1B8nJZ$R|o| zny3-*GNk;o-s>%!pUiYvz~kcZ7jD(3SUC4e?>llsJtnx{Ns@c|9;NmA+~c8zM)T*?en*3*=im-rWXyJjf2YF|_D3BFiG`Pc2~!!QxXvR-v-RiPW| zU*r_W1e2d}KH0|bJIm1W|dR?V7u8d~309uQ6-(EFy zOFF-3)MC0EH>Y)XbG3zadX6z?x-W$uN6QEGJF?Z`XB1pjuAFOHy2E?b^hwqM!6FOW zTWPYvAp#N`u1j$+STIW;UB;K`yU*lyX0Bt?JIR|$v{BD%7TtB@W>JGp^{^X z$a-oJY?cz8h&MwvG&M}8~N?07(FB$b2C0wtlbZ7$idUf^e&<9SJd~2$o=c(i8Q ztw4qg%pAXejnQiBP&zpUe3AF^i^>s1 zUAc1fy`W%ANVjv#szm4RhTxjbe0%onwvPy4;Z+r@q>?)=_*guW^6_6@7_|-V|EXPf zz`?l&l@R__T-m!Cs6 z(sK?cTGE0(Zjq)bvDr9bmJ|N*W$Rhii7!t#zMk5+9jVjMhr&P6VtFxQ>NeNJ!&S@m zuS?E;i82bPa{AsYy86r0+YPK<@9~VTnZLhpE6xw!GQE6IMS16uh&Ilj-y_<1)NWDh zRjL{7>oscbU1_PATQ2_i;mMKx>TY}cEuZ)v83{HsFJ^DAJT1z0L`6s@5^9w1Pabn% z3=iLHke@I!G)6EfJ437LmA?NbCj`5I&WhfC`xg5*6(T<$ak83!qg0L(w!F-$+Z|V_ zK+(xhouQIRVb3GAuWjqXV&@?`s&^N;%6XrP>^(&1#o)Zb<{;ezsf(!%ccy(mX|?a= 
zsCdaJ;Fc#yEj5)rTf`ZkJ4%s6@PFA=1! z&GvS4yIGsB`h(83YCkItHD`wS7JH>CDz>nXRYON+tp`d(T@EeOD>BN7)q4JuecrjI zMljLl+oL^+)YTmB##|LIEAx*m)2W-<`ZXABWH{pUy1qXKjBOR`DkS?4 zHa5B@exW;`z&g!R>7p1!b7CpZ_JibD?Uw$#&7o83dEr*eukOfy_>{Su<3wEXnak;o zLzFI)2TV5P%iB3phBLMJr(Inqtux+lH8=X1S;2#ZuQhUmZSMmi9b)g8gy!Iq7Z^ z$0j#smc=}D&`{e=5mI}gXwM1TpC0u`(?7o)d$&Al$|#i)*`N@2kaka0I920|b$=`w z$F6Dw@G4ns4nNvB)1b9gWhm44&{2ygHyl{K4bpQQWu4in3)l5m^7t%9@BXW~Yqy13 zH&cIz_O`9B)HW+EGLC+w)RqW5_kDgnvs5BGE$x+yQv-dpwSMefGoL0{&zpWL@D=GT zWcnmp_Lu(yS45=hw`qp!Z&|&04`*;W)EO{-^%S}FhH0ySIM-*lEIpC_z3MMa6$9#= zgMVriz1289T)&CUD9Y>YgYEq_;d}W0tZ^lVr^Zxs$;c|6XDw;F==;!>@;kS>y42@> z`oJyJ)QMx0O>IaXTh4PbP%YMj(L3f?AmMS)($W$bHN_N!Ai`ohvj4Z-w*9pGJ)^hx zWtoaQD<7`o^S6z>WI1Y2lQBcTJv>zE^NWr1zj)8uonn$v`oO6Xbuf6pyTn-8mTxqb zQx{k69b|rQ#=1R*#-+c6BVzuSu7x}0k#)2ttVOg6TjVOjhJ7|?MIIBopZ;2ha!?}k z2BnONSBbOmS5e0P+_3P#RbyfH3qO6l%Gt}+!)VTOY&m*cA@cGm*5Vj1j{FUFhJN*W zK3`uqQC29PE!&c=u&FI+Q?%DHmN^+ZxtJn`uYFDDD-s`BiSWPN#{F!xmce7X<-yy= zZBnBA{S?-5X6FW<^0%(r_94@-IXYW-?Ut^hGjr+I(dQbkuN;(FNbmSCYUa1=n@qqV zO1&3jqEw$v7%iU~voO6F%Uwq^GkT;#@la6oZcS0jXl}uW8fIHunRlkV(qY-3|KtUQ zy38WOnQcq^4p+qv>yxh-wmh-10&oS=FYf@A4K!2)rEs#bjTtf6ya*e(mFY=$K{KtE zZBaE#>)7-b>T}GQx>5JEM&5{bdcCF`pcrK1zjrh~^{}m1_d2V|W|<7d%KCMS#Vtdp zOeqfeX?UzX<$i4Z4+WZOOWmoEjbp#?5E$B~AtX!x(v- zPjwB^A2wD`1-iCLeBd`eVDWl&+ev2Y#U4 z+VYD7{jU#S3wa-l1)2>$6?ks+?LZOdih)$*$_u;uY`5Mtb6-yM&<>l~|$!h=WpW*Ee6~-1UyedzLi5$2G z;NlSh8^-=D#5$NaG^VMR`o7P2vw>p)i&yzHmF-3PeJl$RB4!?elz%C{gm1mLtAjF2 zQ)``Mc!-)*(|CH=s^B-Rb}E(iYsdWR>*>}PN@=i)QLlvwDn-*g5$313Z^dcucWu4_I7DU8EuU_dfjz&yKEB5hn@e{eT>@-s*uHPs3ceatl*?xD4kUH#W3s;N-orw9AR zlrTLyEhV;va(DCIhL-Fl9`T?04pHO7W%;*j=q4-ooycq5q-f$-^49BA-xhXvb~*RK zhZ^mlB4JHVGVLLVQiik_h)@gw-y1|Xxg;yhZkEAZV#}d%ugFzLVRvlDfym7xVlCBm zsd%CW3eQePHQ7aPUUF%%NxYaAIz!RE(YH~lMX^%6w(Sf*-F@A(OBO#a9f)6=rOPqf zFOSUVxYw9g~V+FikTM3{oGw1754f{NOK1cJQ}{Tl*L){w5Qp~5oTz4=bB)7;)a zNQ>{%=KFA))3SA!E&E*X+qWmyx4uPst*mS`zlZ0pWu0D;?w%gD2`MjSS!$^zBh|j|r$6@7UaWbRdcDc@t06bGoZ5UUZL%(Obm+?V 
z<`Yd}evfan++N%#yApDKrRHO8sqfrudQU{>=A(VdQdMhzk87P<;x)aC?;R}S$*8;+bYtRT=N|b(ck7>eycf^7{OCO#oF2;=jjYwpX(hjTIX`oU z>#2v*PLDs<7Y^V2RQ-;VQ=!3Mp~rnvq02h-fz;e|JjdMik2eE#J>TzA9{O!UFS&PO z@~qzm(( z!uo*;zz;lczYOZ|*8lKVHVj#vyJOq&%O_#<=HHB=n6A!GdBNxTI`8*Cx&N*{eg5&j zbn~h+X8w@^{yw|1e|>%T-olUecD?YUgWW>m9RAb#Q60~pH-nB=7V_&Xe)TBaGTJfC zAdbFgZ)^C9yPb8Kp>M7B^891Dyr}fxX6Tbps#k;b)suhEOo$ji`r)=~pGHx4_Jw0q zo+T@GBLWl@0@Ork0?h3Km~n&!j8xWT4P^!)Sg`w~;nkJdla`tRfBuD%<&67Y*Mzeq~Fq+$LG1(}zEcq%9@y%C6D zGtB3R!^Z^vd)B&maXi3I`R99j8M#}h2VXh`1u;`))Nf_p`{l>-yjAvtiNV`xBWtts z?N>Vus7KydezVUCrHS`AG_{|a%FAQJeba~s<>hcoxeVS*@JOr%$@7|5Uv4i$@*CC88pt5u>*4VOV`a}v!za3^{ zJ~i+xm>Z;b^IGxzERt-!*@)-u`1lF1t^DpRuiGsz-wMV!!e|uwO}CmvX$WWvz8=2q ze6joO+Y936B^%Dl$7?_}4W~e+rJDtpFlT~C_<{51|Ja$kznXu}dNboy>BZLZeg6G( zj*w#lCC^>C4kV8b>TSI3$Yb^D;H}z~746P6F*naCDVdu);>U8-#Dp4}b2{pbQa0z} zbr1m~_2OJvk+dC5a*|V0)bCi!2c#jn>LSp<9pVT1S)afN7fiayV-@NL1h2wi!!`2Q zWlDJYXI57SJZ(F1z-6_zqJm=^Gi}|?rNw@?JqSKBggEYtKb^o)7~mBYP=?vpO!a^{ zCcsLE*q;2xl8&%fy}eP81yJqT6Y|V*Mj8A^0|SEtE~n4jx7dK0)2Xp_nA;FlIY?-O z^Q;>Dsm~2fP1iHW$V#}wUUm!Y&{d_Ty>=PnZ~=t-T~v7au(PLUjHY1tSvvPiuLcG> zHaw*WgJ=;m0Usu6)GH3`m~!otEqrLdK~0N=x}^}I@RQelNO=Sgqx!;#1gP?81u zbgEU&#=I2E9z2+pHZv@G!y2mzTG+uWmt=>F9#&ONfRG3C6`s$98|Yxq1Qp^TK=n2@ zKX=_uVvYFcA8wH|)|WPH^tfUQk|2B;-sfC)g%faJUrudppvAuDNcPgz)%|wMvpM$d zTQgW>U`SPYd_q6t*wB{;A=+mwg4hOmBsdw@g;PM8m0wuMeVt{U2HnieOyK92G*+Mlyxj2Z6r%} z5*dRQJ6WF6|iJHMY=?ccR|J-nrg^E$5t83V=U$U01a*y3~;ju&Z4f=B77yTIeblBsL zVpv|~%iKEwuo|@%-!M+HD|qx8jct3?h&5}}c(eSoQ8`GJrYzLAMY-9(q`gx!O>O~Y z;mhPb?vbUB3d;6=-sH-}%j;K#j&mwot7i`UHgX@bk?||*-apB3S=Ik&sR^3Zs6>~x zS5U28FEMUfvOaWXS8Z*@9{06Ro?QC+n1NZ|8Pf^f%-*Q2MeShb!2a03 zzw43O`$?zU^U|P4{k6C9P-pvJf9-fX;C=2dv6)};A;CvG84ejW>ghDDqYEQ^nR)8I zx6S;@JGhS@pO&7!@WI7ZtCaip?R&)|#$uMn9<4brPl~+FiC8GYy1KbVa(tHWc#!;c zs!>uJeSHnON+KYA=jz@1Dzw(p_2&U;rj_4R}P(|n=%3lA> zU-KP(`pi&&9LPI!Sh&y<)iCzd{7-LuS^x6G0n+b*=I=6T(%K_Otmy8hA3K)n*Vj!s z!C+JCXD?o?gKg`Bf`T;930M5ekfvSAat#<~f$b_}^JYp-OiWB(U2$w#aOs8Kovno3 
zCOUC3NJZ)+B*LVpgvz;df96QD3V8b5_1g+&T*Rg0BNk`#-uyE{HO4`dV9;*q_-wDM z(N_n=STI)04X&R+TV&VClUqwnl75k zcVDwHiYi-*-hV_AvgpKklB^Wy1gY%MNK8BW|OWeB1o? zrjG*+lP^@XI`y#Af}Cu>Tk8zHZW z+n%-oR+IOadk~h{wZsp^k_)e2=f76&v{Qs+Bx);RAIZAm0$(k$&0jcfm5z5_2NoD7 z%Z-8eH*ULsS4K~!K8w{>TH_R1QsPWQ9UY90RVBz$Y~He^;`VLTO(UD5D3t5*z02qN zG1JkOPi$nqMrC~0%$I%p^(!bUYBt4m;>(XYR?AQ9FpRc0*|dvmBRY)g^x;XjgdxL* zsp#tJUb%Xe2sL`ln1;G}*_z+mg%Qf5_9kAnRyAOESl8^%a5L70*5__g@bvA8Lr9uf z`V2mL^fgwei%z=|@O9!cKWydedu%jsB(%L-{w04WAEzlzY?2sSyb6(H`I(#vm6-{Q zCDzr~8*Ca`;GK8o@q*9Kx+N6ktne_|#D0$`{PeQKj3>>?M-5!L!N0Gr1Yb_50IL(j z!sK@DH7#C$_vVwAPGExsuw0`1la!H}XX&Fe6FuGDaD%>?nRcS>8&#T)3I#E0N7(z0 zzwEHnWHGC?>48Jyju>2eXy%GDl?)LQ-^Au|+}J_B@7m4UxN##sP!CwIof*aw>Jg1B zx_vD+jf}xA8_B;Ry>+_j>nk|9Bnjvk26iDPIq5-k&QUICqPTwjdNotxL&*RjVu)b| z10cIb#jdva$ZO;q3a(vigF1=Vx}%2=KV%A_gb@yf0zH8xc(J1nrBKWY2q;4J1%`ld zAY|I|`^VD37%wR+7*jf15T)WeZJHLWiG4^L)4z6J)&a)Ta=7##8PVYgZ%O|W@$!~% z2U)QZQ%p-q8lzC%jns}@zg}BJZp!P{^0IFq4eClyT3iPp^&6U+uB%AixG@Ukr7Z(f zvCQQNB^$1i?qe_eW0r`D{Fo`8eyTGaR_}Fj#oipMDomGiM#$6RZ#TEw(NbZA8VK_! 
zE)ymw;}C^R5!k(X^}PQ1jl+Fd4Us61p+lv^ULkNp=sF{6`eC$whe8qfxZF+xA%W4X zk>n^E2GPz#huYFw_JF0%9~IP$LMnvby^~=pvaAG@uCC_PuM+d%{U_h3i+B%SyJn@M zMvi2zyQ_HcaPW)aE^LP8KTeqhh_JkiG~A>itjMGqP1eSLSdS1dumOH4(kwMKHTLwh zw=aJ)!Y}jb)vIpKJ0L;9ND$MldB>)pUn_SZdcEwDhv-;HFh{FB{#2TFoNJ>qV!kUd z3qSd$tXK8xB?_B?3z-u6$f|RlI58FnUM~dQs6v{@6+0|jHgH|%0g=eEX_1^TH&qJV zw)({jt(_rV{WUclITD@J7{?Z#{dkRp&0=}Z)a0bj%ffJ{}@d}f^W z73*+F&IBJgN*-vFkR~V;t=BYzd*z6nINQEen>JCy^p?)3ZfLdEboc=yW)j&o9 z#qwkM+l$unqwrcbFhOe8D1xq#)0{Z{hC`oU6BpM_WEbGHWaQ)|2tC0T-Pa6!y`em= zlDT+sjbN&!AaFw27#ka#;hgNBn1&j`5iPQLGjk0$L9{r6uF&1xkRjcX?5^k}^%T80j=4~QX|*#Hw|Biu1&>m5~V z?+nw86LSf$!r$N|xs4w`blS?V!4`-e%Bv;}++pNJdj4#E;VO#35!*@D1*eb@9nV2v z8{zBwlPDvOpm%Y;tjsW48|~*&$PoN;s%}z&Aj=_^(w16>h`5xZi`N-xih9kwb-|lJ$&fPZ* zFKk4AKzbmHR3X<7>l&7H^3~ZEDD=Op>y{AVV$>aCYeXKLNdSG?Z|Y)i`4F%nltnf(48m zpF%;#2FV;f(q<|uJ}`4X^12M#s&7CCkyF=)fg2eWb!<5(8oHDFgbBK1CZozb(%`UR49aZtDJgX^PV_+E z21S!xR#0?Ii~(e~r30;)=p;aDQad1JAU8ve>N0+O3^UCz;yu}`BJPA}@LGO{_tMSM zvJKTXr!o`#XLyh;IUGsXTP{B67MQkouewY;(rSXAtCfF%ij-+|e2JTo8;DLY@dYGD zDRuNJstV2?`3<~rJr2xJgHR-#HN~LM!+k`5(w4kjJ_5b?6&oR{*jsGQ|H#Q1>gU&k z1FTi+){(-&VkpRQwzhIi9pP-tJ-scqZ0PbXgmz0i&;CO-&gY6+P#1>@UH=UO&$ zZ5_kFTB0Xp59KJsu4nz~Vx#YDOaylSoQ4a$Htk^^+&c3KBrsyd)|KSwTn}E_qd;(%?3oGM~Ek`t0ELVXWfy z=VyS6I82?|W%K6E@2(E(W@zX#Wr_yZO7M`;Cx2^3XLuY|d+kQPl-X#KanUks0^2zM z05ucYwLVj&19V!oiumof)>L2O{faIcxlc0%W!UJg{WthO+P*v^Px4VBM&XxAiXqBL zs(dnD5TRAuPeJk($joGug>e!li1DWoesiAA4u(}(TC*@DBz6nyvcWa9Cwj0_L$D~1 znfmflxKHF=eAf+iauT4#er6|?=FMS)s@u<;Qe9gMUjA%aScW-&&ErTw>a^Zo-rh>H8VXKUUzG{j&?@OJ6SmEQv! znky@l^25vB#h#rUcBs&%-1Fca{!#8(ZEbCh8%Sj8y`Z^sHSKmJvdRc59Gy=L6pQRs zf%pjvPKFz%r83=50;R7Lmyq?djg77-LT~+!7oX^Qc;PY`Z%3KpyaUU7kTaKw6L005 z0*4r$=|-vuN+P-zqmS9^+p{$eP0Fe0>lRwVE1^mbics%D#pj*pu|Hwkwr$jzQgR=g zq7~!N1Y{^{ioP~5ViRyJJUPgq%fq#jFT`L^&T?d#fj`8jqZujIS!6h7OgU;h9RH#f z+}ORf8VAeW%E}Wp*M7d=z4aQZ&5mJ20Qz;#H-^=2+a53_&Bfj{x&BT#7e-zCDDh5! 
zmF3LTi+;q7nEzfuV#~ohJ>T0Fo3dtliLsQB??!Z`>Uo#v(L1Vf(ZWkP`}W~bKVt6k z)s(cy56_?K5?a4=8(%qQjArI4EaYP8>Hb3Yk%L>=}!Y{YKGYZ*iYPB5-mWA*jh&5F;mpVqtz)a>D$x5yMI}C9iZLU%E}_H=NJ@^#tv$@ z9$womTH`OfyIx%!fD=pp2o5q~)PWlv%?)gs74%F8;U8B1j9k&7LNs~X_t zR7A0&K&;^8mp2zu`uexFbUW{Nuw(j*Y&&4;SD%(M=ID25xOH9jglsP#AIIdl)3JZ$ z3q(O-L_vJ%P>yrDm-Ex{ChZudO|5X*c5OzUhkmqudc~p<7dj7TM~EaJeu}CLj7TAk zlt9N$&uT-?dFN@=(bDj?=%YNj>*E)mc^>gWCk`jDh6^6fOIe(MSbpVNN&ctdKH>>3 zGU*%#g&ZlU9jZpyD|Wtpz96Vkac_eiLr~<#Cmy}GQofRz5R~9U!+g(zRxk1*31zS*A*-5NK>{bHZA&hgT zqN0^#jYOi<1LU8S!|A5&RzAPm4|%l~0Acb(7AV%m$f)AoH(X@c~t_wTJXucC4}`Tg5xnOPHMqwsKrpYd|5EmcFOj~JoG zOU7|Qm5vH5A12$*?mPuo9<*uGuj|L)8wqAWpg1;=p2)+xy1g@-lHbtAk=7H#Mpy3N zU-#sRpJH2tJ!RLml0m%6*HqjhVzz{eA$P^sQ1T@+yPHI4rHGwzb`kbx%ICNnwVgG4 zw$haW-qSf>caH^%-ZLK%7%sWzlap?%?Bp!aORzzpDn&xQ15t!xEBsQ<$=S1KS3iGV zm7UmE8Po(xRvQ8ws@Snq$2G+x5TEmYzvKA`kidcAm)@O6@f*7l@;=Qq9`Or8aAr+A zb!z+h&8_2#|Jb+h`oW0Fl^*Sn9{=kvsX&GV2lqoaevRa9%y*;=T6W~Vdj|Y_kZqv7 zCN=l6vKYab94461gzOl?7{?sPzMsnj`<7)_wCc4bc?X;F-Wo zBof0Icavg@j#WJ5!qQR|7N9e!C0P00yLT}a&>-{SC3eVs!FhnhsB&zfh$)K~207d# zz!(w#N$V1mf?#|tnuLtf&=fPfL-WT{1p_wx#dX%$qxFnC#m#F1=Ykc_CR zMM-uRG-b&cq_|K+5GlDELc};6mpp{LFsQuoU8tb!Wij$ zbA_~ofeirW`OZRARte-$z^NRzD!LA;l7|y=CL565{k6z{=T3=@l)MbTv_lN0M2zi|lV$r! 
zb6%m{zI`ksB;L$GlU~VUCFV5e1=+qj(^Lj&#MFdVMm2$jS-&XP75es1EC1wvsC6Et z#p+${MH1!6o{)NxA{Jn#IlG!^J5xmNltFtHq8axiixHtH>A}3v0{s;!90PqQL%x2rk-)D2BU~T4U-6b=w!9k(ecJ9a&(Wgo~7PNCwlhp z-+{-ogvniVR31Pqyl`b7-QygP8bC%|`)Ny>Ac1CNYFa|_eK8~9!#^x_LJvnUxL{O$ zDF~A%OPE1p?*m2PgQCabH!~Kg7sG)AJAoA261yqyqA9r}YuDR>Z{9xS-v;P{J-|-L zMb@mzt(gw-gIc60DavQBJ)T(+acM)Vs#(s|ZAjB-AQ0K(@p|pZ3|CR{rb zcF@*=AzIXj{CenV$D9dqluL$mJ$cMnA%sUoMQuODd!BN9ks6r@3pcXjo$HHuFd7!C zSdiH1)*N~bPVkUHd`u2`8qL7JQmmfU)STh5uivqwC13(a;n{olI``<&Bfs%n1QX$w zwPi0l9Gip>63fdgmXc?aW;vD*RL(?^Z!6BFd=iZ~BstCdN$Ir#w$Y}W39Y6}C zGe;ZQFdI@+)u=tWHMF)*;%oh7L?Le>&Dll2qZu8&E(S5HZ2 zhz8!BKEFA13>0P@FTZ^Ip0;K4qUdx#{DlQhW9_?6p8E5D??C$wclhAdMV|vT0{%WJ z+9NXmD4g*M;KGSlr#< zkT&9228b(;{pr!WM1DX|ZyiU+zuKx2(`7C*;B7zcP&T6A zZ)6*M#rc$@TP##T4IbiMMGhxuB}ca8a8MB>KM_s@Ymrf2Y0Ucj-|2G;DZ)}{Dds3p zQR(#|4~d36#2>R4%snf81mHy;gLUfxVd`|z)TFdAWR6Iyi7%ws5|If}F_cjK3T1t6^73u8G%*^KBKV4qg`1m+nh96yb z^!@NneR7mfR6-n1uAMa`UCDd{Nj3SUGrzGN8 z&FJxwm~AM~ngjZU^IO3}2oW&HZhEHjw6pi0Qb%T(h(_d*mb3C;t{#eAvjpdBndP8l z9Ds!8ng#dx{{(J%7_Y(D#h!yBUN^U5W!aSsy4X0KM*}-i0AM)KBEmi#yLvS~EDIYk zjJP*%7I2c(hJDz7Tkpy-lkI!<7+VG0R&d8cSOE5f-PG^47fdehNO@v~&%rzBd(~Ln z-E+_fHWjN`C^{nFOMoA6`{-8h8R&86!SCU`?Z(}+Xo|Uk*Vo6574`{jYRMuU9h;&q z*Kn$mg8BE~b)t@xdgsA$@SNv#8e#`rr_wkiYk}4<%Ydc3YwzAN&)D&X8y20Yp#+RL zc<|tm2j$**Sewj;WpwiE`${S2=_HI)(fjU9^&#%Jh#e_*T#g0yguD~M0@CJ-W1Z87 zkg(9#$n`E>Yif6lgt4-?HuOL~jTPJ1MQx@SnnbV5v2us1Kb88MPuM~UG{!7Rke9R@VRvOO^wCf zw`|_L)5S<*du4gK1|1#c3*a_@Rr1t99bk5fToNmd9Ny9+RcbpHyh7OG> zIMBR>PBr9*ejZQi>syd$DilDc#7*L<#)>0ZS<3KQVbLCE7u?M|L z=b?96iMq(FD41BMU9*euIot(*PjBK|BQk>GO~&I=kR4tN2D|FUx6 zf`85h`n*oGYgDC(@9O~uu!=%bx3^E8M@s+shL-ueb_;0g#nDGyUhn4T?Hx%h^x_Dg zT1F^i35vYy*8cOA2(_J#YOX`Eg~NdHDd0ogKbBK@NPUwD3s?W7C!9oZomE0 zE#D_Uy_M&&Pw#qBP62CEi6dVRd=dj4tWg5be_ytrC?yj~UZ9t@FIK$1XeMLl;ck0hTsd-rbW zk8747XsdU*TB*~ZJNLIy`*_WseRuBef-4Pb1|9Hnq=NMF^IIhaoyfIs4|{)S#pI+2 zi!Yyg9r-X3u2j`s-=Mfj{tkq6$+c=n^!A+>;Y(s)#8WY;F z(jsno!Xsv}N35BUnW7!i3Pg&;z|{1`pZhB)>M-7P?A|)qnBFJdShJKlLA13fYRDrV 
z!b>V5lFdh@nHRPyE6KGnnxjpQe9}ZyU$z|`Z)gB@Q>X8y`sbwTo6q9>UH@Ls8SbAP_s(Cg4l^XR4U z7B$zEH~nh8pti1VE##KKrwQ*K4{6i6b3AmMk_Plxy}3lk&(Lc;L^ z3jmAVPh+?;y5FzquX+N}lT4LV-?uFrB(|33c!Wgx_SVUM@a3aV;-H()I~Sfh^;jLb zSBMj^EyVWJra0|5iFs9#zxCOR!os!#TYPM-O?mgmIXj1Sm2pc<)-3Y7g3m z(WxQjK%u9nMJZ(x_gc8mWp~A_GxTW|zwzijA<-(ZW=#Hz92YRZ`=^qRTMu3JB>%vj zXL@t*x!5d+D$7YHm&JzXOQn{k%>YAWQDWKYCN`5hA^di5b(bs0`a4tqg03jOe3-kO zeUf}MWo7Yu(trM8!60LH`8xXLKD5Wn^CmN@P&Li*XrH5L^<1rx1^7yOMZ!9#Au}P{ zcl)@chZk}dXKs0$wX9c1bfiCeX1b}7+sPAH@oE3h!DpRB%%pS3^hxywvbhfxyGykjIpDcvc!MU#d z=+2`dL2aY0p-Qnv{+C>y8HMlAIcu7@;omy{`+setSO)%s#{KAV zHJXtP)LW%FXD6nw;WFfW42<3Ntvz|1U7gOJzhKAX?(J;H<8{%Qfqu3mb$yhS>{Y0y_{>;Y3|XYW_|eUy~gouTia8X+S%_W8A6_R z?S6LJ%z}S%^48Nzt?#$7c3xd8cTQ%i&MyN2tGfjP@v<35<>}s7u!3*i*ey z&B)K1Z?^usHAO7tqUOQun=Q7tre`K|GmLy|@85{zl76odM)x(>D@c03Y8{)0?Pq; zi6dJ&()JYb#y-?)K6S@m=wQ&!eQtUt^U=BYPV}xMhbGqQGwd@kh$vF2p_n<8N%MwP zUF?ja>QsgFv+ARYhMUfu@L=o`_h->AcxpC!?2 z^xkG#Rb*iM>|3QHx>?=Z{I~GsKXK$Nc@Tf0*!s51#rKtI^G-g$PHAU&#$St6h>zp= zXYC@%5F%Uw3`|YU}72ZkR1SzJr_lj(S?vyLVJuwrm+3 z8fy6Pp|+}O-HshQcC#GSvlNZ=3|wA#@aR#5PG(JSFPEdEql(#TS?9U^(R=kM&CJYj z-MF;4xVYpKdnqDb9^K7iAjrng&;Rz_JL~i3P5R?XOG|Ni!otE~i!A^9*f*C2RD+7* zk`1%zHf{R;=iBR&SFc!@n3O_U_Avg&89!1C;G&_Sp3U`e>LryX1^B~S6|{XRN}iJ ziQ_WJvq%WtBA}n5OeHBL#Ubyr|3^!bfRGRaH#axGZsM_cNju{*FNqkzQ>l1iis6wF z3HP5Ts2R6i#q~ILK6}^Pe5?7POjKf`&TEg~PfUvhg$#3eczM@ZT3UW@Nz(Q4kyTt< zmb!iW_J!WE_?{l~SMH^I(*SJv}~80*W}fsQT)uD zoDqiEd&I@>P7T$aeC2-rUQ`svVGrS0F-v|Kna;_F&hGAKs;*P*Ki7J7v%);Z)fX@J zug+DjO*PB(CF^I+&vsf){rh)zYVh5$M+e(}#<{q-ypNN(abLvjF-lb{W zF)ngoJN4ASx+PKGcjwepW8L%7Zuq} z4>t&(eidEn_Jc1{)42H3&Ro-?>i6#>aAhp4tZ$#_rFs4R$y~X*D3ohfe75fP&Ym*w zsAs2N>u2ih^q(J~L2~{4l)d$#j8ko2Us6)QYCQh(dBB?gha^Sa;!EGoeR|5~_&JC7 z(4qJWzoqB4b(^R+bD4Lat58y6{ ztXsD(ct}lMT@rDp6)SrD2KC0U#lLF3Do6?vn!mpk)%5jMJ_*^yH~po=s6M*P2jD09bW_E&-151j)S zzXTrR--Xz{rLJONz*=~=In}AJV*E={_|C%~C-N-hzkf*3K6{qmq3FTzR1aC`>*r^ZZ?*2*YflZq7}tqjc}`BwC+4M+Y(j=- z7XOa^{rz<-3fT7j=ah~eW5lmfjDLF_Yg*)>_tan?#rXL6&-TpCiHV86f7>)D+{Zp| 
zL>>zoJv)X#(A3qXd4n)i@LTNv^-JsP%ZqL2_WvF4j?#(|9Qg4=qsZYC<5JkoGWtKg z<=RNmy=E_k{r&w(K>hpub;lPZX_j#TUyP8^+1@g5obY-S-I?#rrI*Lp5x-Z(CwA;V zcQq<%3&r>E-+gC48M*)J2)(yQoAP?9__(5BOfB$}d=#C}3kN&XH@y-wd=q0MX%F@3=!C&x)D!_`|mWbQ>qu2aJ2VHEmp z*BHGn`Sfd#xQ7pi1_!T)gwSliMV+C4bYXe~;r6nv;`i6$y7%uHk(BtfI36{>2Gw5W z#kq-=I0;)?TH0if-(U7SblhYYG3AMBAZLApW;4l^H*b`Feafy2;r73&pOx;vvKSX1 z@A9i-A1Ty+i~HTazpv`(V5N|flDfg*tFUnWl`3D=+qZA?zcn^C^7O3vS!NkHyvVy* zpWQ!}S9$i!i}M>cY&el^DDptsQN8`~@pS_-5)K`EmseI8miZsZINdhQ{XY0E{M`QS zm%hDvhyU7G(1RN5b>+&AcV>5X$b6VgWJ+r1%i9v8^{)2TjziA<6m)cS9XTdZ%)8XB z937jGU`I7It#JDgcO2}mPMez>BkbIMe0cZy^L>?&O#_32)%YRr>0!EZpV?Hj7!n9i zp6o@w*572_rFs1LNLPVvw6MuN#Qfc;sG1*10k?6Y*+zN%va$~zAAcz6I;@#iM$5;S zcfG62_;apV9jbX`(}?f#RNdb{-yW8f9P}9PI-;dTpKX|%8nCu%Y;2tGJ#&_llCrDB zW!JN3<CfU&X6CFVDMlI7lOh)9_OV4<2+LX=Hn5Q68qsH-LAo?df?WRxwYRYuJRM z8hYnWCT=k!w*qJB@`_=1??j;(|3O*V=gtGlq{iV62BFShyLQcY?wcp>ft7=U1C#zV z)ub=)9!bf2j~;O@{`>9tQ#aQz%|NiBscHQi{18vikD0P`$+KtAQU!WQth-eopqAI! zj!yjY(ziwg*+I`goTUE$Y)i|rD1UJED#Zb(-uDTIJrBBlXKcDJGBi0*gPKi`hL2lX zJ5Up{&*b^;^XJcRpr)?)Yy5UWc=~bc*zAk*?ZJYlo;v^jB7Ob(^*H1UinQ&gr>98J zp|m>Bd_Vpn;cy+w$swP=dvR{nXjV9D*Zw!_Y6{ELZ*32*u#+@dx9+iatf>3%FEK3& zOA)v}$qQZia~Ac?-twG>Tmu(oezX@mSw4F7$hh1`8o_Ce&Vk^uJ$qKY)OBPpE)A{r z?BLr_gB;^Xd>9lVS_TGNRBaMND0{9eiCo|{>ec13t}<__^)#DQ zGusPrZ%K1e@I7dhXJOazM9=+a>ox0|-~ziQ_KL+{Lccn)R5EoEN>Hj0;u-iGYTJ)X zVe|Y#)MxDMNZtMS=bN^xt0dYGuI3QhP+NxfRs@}C?&th~OP53ejG1zh?V9eRbPEOl z8y~-*tfDeCIoXehS5Q8an0AFGWHD%TqL@&!S8%Pu6UX7Crp{ zVR{|K)NSeCMdb2PEv@RRs%wbIf=l0IpBEO6_LNGYc&hnn@$rf6-m@nN2{SVg!u>H# zW3$iSpH=9_^MBiQ(fDs4v6|>DKYaR?`@s8nDc{K|8e!vn3TamU*bQ}`5z zuKW{5F$XY0D|DJ{J^$^M^vsWzOx+}fzrQ|(1FPvIDI9lqch|FIkMz9jxAd>A&|X)C zHtONS9giPBE-5KFQSNgY{o;IU^16Tj{$Uxp`{h^WNPz>wHB27Ab+bMo13hT`Yw;Z} zd7su)H3kxv==cTs`8@mfWxZL=K+`{A&7&whdVX5!g?5I1TF_CS)ZA3*%fTeoVUgKrVgdE3zut(T@Q z;WDUt{l*RSTa*B62ZylGP&&Yj$Cj1;z6&FBltTRA$!e%cc_GsV7+hv&*i(2~4l7aBpgZ-gF3|pW3%sT95p_6v#utL?O9(dQ@ z9))-t`twHz_iy8hkGD5RvmLO^+Um-_)35eZAke8hhX5sSPEbY~>7ZIUdwTvrKjBpH 
zjXUJ_eVg3b#Gv?ilWVQJvkVsRoc^4lzfUqg=ZUV_u>2W-?-!T8Jy=pUVY zdYi2KPq)e$_v+V?yEMoh+_Y2nEvgS@I(j6Xv~We!8Od~g37S9QOl+TR$Y44@5^H(T ze;M7ZCj|-Jk!P7i&nB4MIW{#nC$r%~tdP;BTk184Zx?50CH|vVph0Bwalgv?^?V6gJ^*CgTW5ebS<`9CAz|Mbq*`J1ohcPcQ z_*(Ku6l^&6E zlo*G~%F4DKbna&uQU{B~hLM3xOgKlWX+b9DeGbuDdv>WuSfK?na>;FDVy`pGHMMg- zJvTFB`=_TAMSI}qPfc`05KF-dn4zAS6h4l8=!_TG7%roq6^)?A=hH@hJh3Q$RkftV z7uj0kVtb=F_85OnT^)_CZcA##e!huq_wV0-|L9N@(*Gkm8hRDLf-NGS!XVplqkw=w zFe67bpc_tU7Z1-HreH8tA3Cy5l7z&teMmB%5$NVK0#rHW&AWF&PfulUHy;ua5doW1 zVEs%eMhienG+1yQi=Y3i?oK5TK2=p!NYb_V#!qPYrT#0j+R2>IDN@Kp8d)7$A_JWgUQLsB$f$`+0O!^6^9llX*S>vpWWY6eGTGZGn3a_k z7!qP@W1|cV8zD!RnwEA{S2t5Rh`yzHI59Hv2;N9Co)mYvIfa{YNu7`L9g0kmzO~oZ z(-^ES)1=4M4btnQ$z_Mja4LTHRs<&R!?VC4&2NGLb zPpHVnS5^jG^ZVG|&LQoTw6OhHr52%hyKhGEAu`rkeCE( zqiy=Gt<0%E%UQsIN!h%mMG=of5c2Fj7sY>^PgZujk&zKNvU6WwilD8X(o4Emf2GmU zg>Dl){U9cp<<+H$mI&{TQFj6{r0hTR|M_#Ni;Iiv!|KY?!s1xo`OarMK~4}BVE@H0 z2CN>cVQjBGC%yx~WBwS9zb>bQL0xfq5>5K^37I*3-@M!>21RmUXsCw3T)e|*Yf6oj zg z0L}XL0s4&-+6JwqnzV!3ych&6BfXI_R(X!!om?4BgD_tGhyzoK?O%G;lZ%E zWHb^2VnP6L-UjfEAZ2asrXL@sFp@HHap_ZTjzD-^UE`7I{CG7iI`sgNWN+L`6db3PKsjjjChtGbYB!EM5$uo>FiI>{gymFe*750 za^d>jaq&viAmx-~(6^v6-3Ya%AQCwj-DzW;3)buMuMS>mX`s4i+luV$?8w0L<>f_f zL&N)eOMoo^rsA}k^g)!EHg0n(uJ+iDy33+C+iI%+6;1u*fEGP*;r&yKJdNy675A57 zV`DW?S1rBY_yE;zEO-)t??1P1;Q~U(4Lq>n$9jCokAUnd$2?I>oSmKb8pO3UH;;e~ znHXQDZaueu;_t%ftdG0wH^SRMpUM!vIYA8${lPFE2td zmY)EIGR(8Mc(4UmyUWpF^kW8^oE#>z(mHKj-5)4)Ew|RZw|aH{@T!PeIaMd>vCcfgQ=~tB?24gY#SF#C(9n?DL@moe|M4*n452eU zLm3$vE~p8fvSp*ElZHT6#UK0EXeUu*3f#I8@#dScVwj~Lglo&cJZ6Z3$MB->wdWn& zXL0{!FT)&T>kjtq+qD4VH$D$5FP8_%3~CgFY|v8~u$JI=yYj=Fm&?HAmEf7_X~0dH z>8C5jWo-iCS{XKYTfmN~HVy76P*Us%Q!>LC(58zaZzhjCDjZpX!KbLf1w z(zu8jab$X=33W~qOp?*leD+|&Y&#%4pcru=s?^lfh%fkP5%*f+9y~~GJ6HeVga4%@ zP+>Wim5D&MCooKU>hm1*n&bgPD)`H~?7*eIii?vGJ~60ln~O|W8reQqpU46YQb38C zXj*)!S*&u|rejs|xt5_JJ9^?{B#FUOgV9?FO%M5GWMY$(lRZE0F-5OiUG^qPY~ugh zuHpUr;nozDXfeyx7@?e?$e5USm>AEurBb28Qhhlk$mUwwqiG0o3^ambaSOWOfeT&a 
zWP6Vm4}fjPC!u9z6#kV`#txn4P{69(@rMWeGM;y#?UQ*MSMPI$^)rSd*YPe1C@*)k zG6+!0HPqJE*G~zhr{7`$z;@}5yHcVWQ48fQSXmROG%t9V; zHG%XX{&t-Hji|Oq#o2$UXB#vyFzyuAm*scuy76P=pwNZe7#~QlM$N-G;3e>*1bq%+ zbH%=#7FZU1b3jJ84&nqqodx#t~gsH*-?2And> zwr!dZ4_rjHdIJ1aUcMqFlAIS_Rwn1ax?qpC zh|?TFWrQXX2}I)RI_*}YlstLpl7j;w%FlmHZ`fg`xCajwlcJvu9<#9E!p#G# zvl!E80}dij)x$51ehm1$p|Q~hDjmgcetyu2?>j96nn52EI_1ED`*=^_QJhM2boAC| z9Z)bq!}T6L!sT`8>53=S$&>9yPje`#s~UQHS8oX(k#XuB?k&HPo}TXfr{~a(>#rb} zfuIk^hzzPAK_|pOaL?hx55Y$P1k?ZS1be-8Ypje1m(v4Jle;u{Q=gIZ<93o9IyyQS zaw^*n`Qvo}`-nWWhYGB@t$N`GV+8fijm3|Zh8IXNN!HidX*_hk% zW$yxvWfd+hS-UiD-;Ip4zIzHUqj~h`0HLgVN?bsip&VH4&PGkEIAI9Q)=wbj6h*1| z>IcDWo_bc`gS@i7Wz$EHYD(|imSoln01!ugYg zunbs{2FZ~MK>O&BTX4u`?m@7VS{Z`?XW%erj&C}tI-HSZki`PtKs~LEqWbB~n``UO z1My&jQWcGL8>-!iDW>4Xi`^(W`2CNVz*4zh-K z^nre~UXox;>@Lw-BaY1ffFEcqeW!x-^y`tSZUfa+6|0Mma`N&Rs0GlQ!GRM|3#uQA zWk81L8rX6M3~vNx4)?ycc}I_%@)IEOz<~oD&rUxyE4~yN6?Fr>9HbJ_mr#LHxhjCp zcAPR6I`#An>uZ;_RX>p94mVF@oaRTMp&PV9hWd~wukG%>zt4B(xSpi|b7TMn^*v84 zD=P-v5Xuw)@bmNY$!+Jl%CE=}00|6(F?4-`&fJGu)thBkmgQZPD`vBSU{OmyzBqr9 zR48X>aVR>#nn9?;l{4(>k)AX_~?2A`Q+g7r-ux1geTa{B`0p2t*RO( zg;^96Sq(7e#jktyQa4;4`%D%1V*!}%kjvl($bL|RHioJyqqK$j%=1m`%R6NaMk>eo z?vJ-d-Z=hTO;a;7DevhshR@6!wp#jLLr;WA?U#1#Cpa|}6=lqH^tZb2Y3>FH?dYdA zEOoV0zs=duqPX_{j^gS~bY%$SbsZfqp6F_*rW4%?XZ*S~e*=o0$~|hl0xk~iju79d zi>T%hHh&)8@tk+s!`9wj!^+Ak#pMdZO!D3-Frt9aVMj>Z+aVkB$lt2@9Dy=`oK`o_UVZ{}y)E@TlJcYyUVRs34z%D)|;fY|R(A+67(@(gw?1`M)0pbhjDcolp;+;*;klDM_~=TSxfbUH8oM zxcZ-g;=g~^A3jp>BL6=t$$#Hrtwo zKqvObLD+|&sFc@P2!w@*o;?;ci(vx#(j$npwzjr6z>I^2fke-Q5nt=R@KFrTeF1Co zD3j?J9&PxlWI#RtpAS%W`CO4HuuC^oE~Fg=(arIaQWwAM-!APCsGOpdp&yx$u+pP| zHrk2-=k7K)^e>`mI90B&gYa;5a|8Tk0Io|zfir-*`9yc6Tsgv;&3g>-!v}%hCa&M~ zf1VrsBc0a8)io7Wo>b=OoOw{H88x7|Q$jEGkg6XCA-={3oCg>T; zeC*<-hI@O!j{b4`ilaK@F}`i%_WeqtYjaO^aM9w@(snRu016Nn0018>D|iF8*^Db< z-nsPIz%U95T;N2`NOJIOP{@4z_}<;SSK-k4^yw2*KVx$HD1;$?kN~Kdj}j6#_OW@t z##p{fLV_8$0{jbTyrsE`LibUETZx=NXl0mMs4A|^{yOEqI4S@(1Oyud;i#x66@(<= 
zVpu{0QH%(a4`cwL5b!P(<2sT1$o+vy03CtUyO(aqw6%8$31usNnL}TZE>?TjKn zKQ%x>TG}pnr-BbKM-HmPCx8;IE?qEV7lXQ<);R|Cpytb$FRjnoM;Ut+Qgit9AGJ9Y z!Fxl2nKDi71VAICFpSE0FUS?ROVSQ^Zrvg{1A4xVz5Se4-(if~C`Dx*o@*9g4=X4n zlBxEC7~%l;0z+2FG5)1aLqpsK!gr#>Z?i(6(H>Cp<)Mb+Np)^N>So9#RtKYuoG9VTv1qCc2j^(7;DPQ z?CplB3K`EygcozsAGlY<9|NUN55^X#x!UK>?PXvW?vK({fCDTD$C?j23i_ATZdyXG zBj%>2>bOY)BFrTuB#@ttv0_ef3@0BSq9y{8~VRD5JCWfL?IKQpzG*Y)Mg^f%8wD9lu}&--9Z;J`%! zl+`vi{<54Eh}6V%bOt(m52jyKuT7i>-+=w>1}%h9o~?YLS3GZ|VX?nzJ+N~V_}qM` zkU1CL*40U!|F|0ntQiaxasXEuf)aZl0Rit85fsnE@l9g`Vk# zv0c{lk9qPz9}kZMpe2c)19$h&w{|OQ>)MYQP0k)3e5e$66c%rybh=FT9|8J6TG46W z7e10Js@-s}wanX%m=Z9l6Q5aR!x;?bNf_+&FLX=pS~q91Svo^M6yihMOE@ryPFC{+ zz0c;-rPao~W-uMJbad9BT72gRLewvsy}V!zzW_0dLGL5lH`t2fD8NL3gkDALYWvTB z48-k{xYR?;7uM-7!db;IArhMc$Xue3!4e7Lr3N`ntQ15A$~qZB21~dwfP+-(_^&d+p6H$5chwu-iKiV`>V^$k+ow=9BRw zTR)3X0Vp45fUi(p8(Qxp%&t>0+zbyVeEI+DDiMb*k6qNf{$2RiZh%sf2i@KAka)$f z{JU_rCGi&IFugu4W@e?y+ogCx!Z2digr}*!^&hYisjnM=g)rRMi#f-O(Yx*Ww1nYEwx zp@9gSI*}t|ru|`Y*FtmQh8)L?6&pIhU)N@!siPaCc$9sa+u$=3_rNG)- zRqZqHmd@H;)1jIBb1jrET5R*<$CJ9{lZm(E+s9@D0|O<_e`J6D{ONQZcb}B&84a!b zFGdji{lI^4O2B3s8WklqJ660$qF_(+!-uK*^2ryv_X-+`-ieA5f>r{Ju@!yth3URn z(QfDI-5t3M7r!WhBH#e23e1@durl6PnFP}v$M?D>bD1L3`K8OjYda4*Uq$~@Xm1_{ zkbL3Tc^j>kc&uPzK-c<#l-JzNf_ihl;|Vi>y8C!nBvA(X`>h`AKR>-lEv$dlxAQc5 zA4c&iyHjaxcOO1xTWC$|^)2s7=BS0BED}`i+dvNUB-}6rmbG2}V;OjEc9v>8D6xO!=fuawoj~5lZff*@h)eTdq@!GKS(azIu_pM462d2<4s+ydt ztA0cp1pw8H3*Gl{0tFadDL2Rcb-x*YqwY?MskE2icsFef{bSvb=XO<}*>|(-%d?em z27C47(=Yp1c5rn)0^7Q6F#d&UF7IQBg9o=MDk|1Ap)hd(JUz3l6cx;IibHq71b-Z* z5_%jnO64v-zU%1z&X~ zHK;<27ZC!(-@mi4vF(9N|6MThK+36C4whagTHXc$OGDTXURZx^f8GeB?*fLP#OOg> zT>seEk_D#Hcd-~x$+S-b3bfz08C(fmdyLP+!owLbot|;%U4mOpp~J@*>{N{w@^}d_!YwyLbaF;_XNnMi0#s0P~rv$ z29Dt1&>eQ~-kttNN@aL-MdFf3{cJcwfYS>8!V&s1P8nqIHsrj=UtANlsm7Zcbn2Z9<)c>s+GrD(ao*u z%Y!&t$InW}+uX;f^|L0GUU})ks!$NYu`AQpTL8Eo*ahalW+=C?Am-Me(K&cHpz%7@ zKZ6|Fi(jHs?OQAYLsTw_o;jo0^@48ufj~?DYpW|&%^QbZ<0Pr7YpOo7EUyF&aVc&x 
zD^`w|6u&H(o^izErPaomhV&OseTgt=uyAl39s8WiKs)VI-ddNTeVuJ#%64E`VTtjE z2k~G1wF>MgE-IW1~ z2v-p6!SF-0zV>vg3IAGKTl=&0-rC@5CMljkKc+9vty3~K?h*?93d(a3^qG03Ka69) z$_gCMTU%G*is)$zN`O8947bCJBWReT3DFvt6uhDxz4w}W8n3*3d8@zJ=~sb(Mcn@L zDcflF+E@W_iTNs2IDj!x!HD}z z{rf;&MYctuh@=4a==#j^yTZbO0qbd=cI6*wnjQOYko_3c;)4_ww}OJu?@eMYACs>I zV|zd=#zEO@KGXe=uu96g`Xp=IKOZ`)H$@}w+PpWLEV|Q(f(!cs-O}Jc}*R5;a7!c>F(R* zvVYFo`fT*gwS`XQ4<9o=aLTd}3$wF{){hX{^QmI_(%iJFm!_tU4Gs zA&|odvz`gkHx?-9z@;LdsJEJC88r4+-B?F%f^MQ*74Uw{>rVLG?qiBe{^Za$HrrF- z_ceKPWh;E5=lUw-Swhn_pGSQk9(MlO%1ccB@HOIReV_5=b&gpctXoe>IS)&D-@Nxt zRwq(6Vf{4QUH8F~5moIlwB0lC_ypBjZ#_^q6IF7v@z>IEbbQ3DdXfCMo*s*mP?ts(bKP5(dm`v#4g;1LjzpjtU6e4bOqFK`fnP1k>VcX#XX z%r%Q8l-}LDOz8H3xw#_Pt`WR1QTC3wr=~H7M%};vFKN*`jXv8VPXLx=z!rGS$TX6r zmlkX-ZIN;UmMn}b!TUQ9y40~jgIX4eOox3YxW>QINISeSvZM>^Zcl$qzx46~-Jd_# zS=gC*-^Lr}(r0Kh+;6DYJlCOl;*U-ut8nl_VBnjV7h7}UC(9i>t(Luph9E$>{TnwS zx`>U9*5p^GK3L&scN{QJ*G&?8VpdG)*!lUfR?L=;91Z|;k%eV94iVEgG~D&;j4T3G z`HWAk?bEFsI(|IG=W>(jA!`=&*gmpgxW?a9F28?rg& zq`<&~HF>AVc#{F;=`kLtO8_fm*dR`Ghyjd-dKvoXPnZK%Kmfg^K9eWx!55HXn2l)( z%n`gN2pynrtZfjZAK3ddu!gt;qJU!{7`^HS4Vh#>N~-&eQdTUP%SdHn-i-LV8&wha9S z&ptnm*hxu|x2tsVN*q?IeJ}4Zam&6E%q5F51p|;3G*MVEbqv zl)$+%y&!3HQc|`@M&2P^7>@Y@azVj7j)cNK0OTxY7MAJvvI*}(=e?>vcBj(`8&g;Y zi0q9I5L;Vl66+rqert4JM96rb^!5^;Q_gG_<)$YzSC*5VEP1)4<$6l)*w$?r5!**6 zYRPVXAt7NF9RaUcjaiS7^uXc7Q3bZ^^RpNU>+6ZSiaU=~`k&n!gNPClJVjGh9yH%- zn7itI{qY*y`j3VZ)CG3;?aV8%sU6G0V_%Kfu|%L z!=(D>JirwYo;S2Is!^8TzE$N?^gl*+fPA`~c4EoU7T_M-+I7qmnAkDnZA|j}$8r3@ z{sG`e4D}%xwLU&Rz5`?+V0W2kwrI(GSBW8MSGjW{j$ zq=OP9q#ufA&7)15Hv^Q(OX+B;rXQT1pR$FA{V2xDdH7ry>IH#Ch(HHt+)2Fh^z?KI zoFxUZr&KNE(Ne(^Y++_T0;Q!M9TE~SB(&+uwUs-BmPCWW&!798!$GbY)+B63Zee44 zmwe;~g3o(ss>Ryc8upJka6VJr_DO7e%kPV2g8=bWQfRy>3S!yO<<8=rOyC+B;N*m?+tuJ%$F0gMf`qcc;x}NDs0cDln zzdMRn1Ij}ajU(61@-4(lI{9}mXT)`XWoR@$#^~im`_VKjR`QMslAWQd|M zfE+(~`0&u!m|2I)R^|nbo>!N{aid#-a44UL!2}KTK$sbFwb)T&VrQ>Kxrvu{G}POC zdYFxwn_Jh|cpE-gG=>-@b3ZIL*taCw;nlD5)YR71ZQ!KM@BlOyOBj za6n(f_B8|2P#m<8mG2Fj@jO=ch!+qE 
ztAHV1DEdTGxaN!$3lo#6#bXuravNM2u^ z<3X*h=eXoeZx_ypo12?=6;A*S=2*kV7p%WMKqD*A;ZsC&W`-7>p$;aHpoQbM=jon3 zQ>M(Q_LtJrtKQake>fF!gdvCt6&JA;@dZO;6G8ys$O^jypm3RCL&J<`@$m4k*VW)? zH)hxY1wFxk0`^V--{LpTOq47@BOhh)0aSJ))5_mT|r5Ana_#F?N7qF=sM zi!YAG;ETxvFTHLr>h!8=Ulwc@dtFxZ?T$6wQ?jNo?-~D_K zqW?(B3FqnIqu2(rD6IxQKaGQ{MbZ)mo@Y3SaDikO1^Nznw3(%eN-4yoO2SXfv4m;_ z2YC+bDn8}COqc!&13jvulM)t%=&kiJlLf`x5~hZobl z|M!Xo_Pb=&hsD@rhB%H|1d_CxoU2eFKd{5avW22J*h?+Rd9Ybh$05H*ODve2vlS zeUc(~UZxcH%6eA%ZPHrvOD$h#YDZhwCCcsdO)9hx8Z+mFtqAe{<=jI|=h$8G|Gvr+ zx&d`5df&+spIJd7UA&&Cs3|?TAhlHj5d!ft0b;S~%M0*V>_XMEN27&%eKQ1tp1N=Tw;pOS-A~j8oxUPqyc!Nk9oNGz#QqXXUX>4F%*2LwND&vzvpajXme?B%vsX0e`0yJ|_|p+g zoIok0xuQ_PLfD*IA+q+M^J%`;+GDz-86PRXCe9{c5LW0kofm8!4UBl+6w9#3N?|^E za{hRc=}qaFrl5d@rL$*fr!reURyDuhKs7sQ4V&AeM}gy2Ys00F+egVd0_dDGcHagT zXN7tyE;dY$XpD9jZiH8tIQt&SUJe7gUURgY&2wD&B08nWCqN@=5%}ZO@QLh!&mtJd z&ebi*CB`J2;)^WH-yNdLM__~c`H?;i2+%}|LQB|htSr5KBiQZT5)zTfAF_>3HZQO$ z&b8n_<;2Sp`nKQDuqWtY<(;L8nS~^|y5mT1hYT~-;o~#U-ORB~0+viBk@?hcALLE% z<#`EM(Dy(f#kvKV@Gu1s%N)LmtxeKLu#Trl48KGe^Qwf$>JdUP5>Hux28}`o8XS&Oxk7P-}_BFjTb~_qGazyJiqQ8>7}w zaKk28(Il`gIUlP`Q80QzgsA~Ugmp)OShNlv6klyQ_7ojZBSW9)I@rm-(1qQb-uxEA z2$%DNHGYRLaiT529pK~mXSBU6_T8V)UdZU!Z2ZyR~TS>(D0Q>4~mF`_0(I=!eV}OjBl>%P%Dr1;`HI zPLw(f>2JS%D}NHQlr-9wR>u%zMV>?wxpo^<$b zxlVu|eK{$8hWyRz{>D7pZ_EGnq@++~h(BQ!)3zch2m8q|ei#1=# z%0FvIcd0B!h@s%o+7dS-_o`9ZXuF(0V`m=2Zxa*cefM340t2;-p3(Ld+&!c6O1<@! zg-y}%TthIN*Y&ehQ%>Yx{CWVM$6#2WN=sQxi~nUFxpCgS9+g)td{tlj-^rlY;t?~ElDlHGlCpb<%7e3N1F5$XYa=E+ZH_CVN1+}7wivKpp%+4*W;z}CT&ojL0o!bX&p#s}Ytu`#^{>fH3GH)rGVFlM6i;{ zeYoStCE?t! 
zL@bNnmOGaC_-G}r!Gw~D48r<79-cxLvbTy^5b75sC{K(Qwv?7P0qh(23~X6YZDBCSzpdaRy6*P95k^vMPa|tCDi!ZbAb5#PTTMcOON(@1e~7 z99@2e%C0DGx4hp!Z~2ihy}b-u>7}nabgGNOTflZ8D45wVGFVah4}NMLx5;g3HKN85sy$Y1)P>v*+7_!6+UVV3M0R$nO~eqw%(Pxdm) z>e}n;i%!_zz1lwjz?&PevVuMZttp|rCAAp1$ zL2}()UGPj?)6YUt3WOf6heuzeadIszEk~)vV#W6Ey?fOV*PZ=Lr@R@w^H>dY4d**^ zAG%cY3JFIxBgS(?=Lx+GC!82%M+?vvbwTJx97t0%G%j< z4|i6>jTY(o)KqlzZc0jemUNMoE>nYQDy30)vIQ4bZZMU(EEm?2RX#}iYJ8ECAd2wd z74r@ic4p>WS*5=i%iBlj*I!-QWtkGl0n%4ivfri~T_JVD4L^~-!}y?)Q1 zKPS(@Kx9E8;tEo(!u=;IvZepR9#9pwaLz$Y51Oc0+LWyrWfkpvOY|JTFEJrt30Pry zQbmHh%8b|vv0LfbDGCLR%v4wYjaR{2%I&8JM2_18#}gb%Fsf{V>lL=YQanQk@+*WB zTT~4^>j8UI{CM2TAj*yRrKFv*?pCmrL~zMJK#UCi{8qs1nGT=ecr%`vwnlQVV#Rd0euug3iC1>~h zZJLOwUL2O0I~ihOKFVsKy=TJc|~N5FLSYS3Z1D z=J^@kdTzt5l~eN0PQ8Imu|X6!4qT)+61BBWakJ;On?6}mLc11G-*kgr#HzQfTI2}_ z$K=qR$z~bn#TWT!=AIXYsjgzlj{_tTdA1lHPH_a=?U*8Lu)~9m4=${yo8}&9Y#jaZ zgYXo3f-%t|X8$Z-J7C$dOaW#vOpod&qIf()Ge&sOE!Z;nm}4S^=QLdUTMna%4F&*E zXW~*)0Bpg?Q+oIX;qaNQ9>Oj!xCL$5M)EKO*=Af0TGjd@(~BpVK#;@p z%gA{UYj9i81k^#zX=O~``uxQMPtn3rU8i}ZS%ALKhR1^axUY+SnL$8gf@{f*!NVcI zv1k|=bVy~(0w{re3dg!L`sCSx>Y#|7hwpb6+Dl>E2sgjh$kHzePqyjpO@x-W4jKsN z)VDB(ZQQefXi z#BnI^#ixL$s2{(lKJpzBATd4G)r9~Q;^7nso8p3B)sCl(+dqCpwR;TI7aW3B@gJ!C z<`upic$62}g5{6@|1ZW`##0DIUM=@KQfw+9a3n-qhy6=Xfr?h6Hk$?66P4XdT#%7J!b-S1Uf#P@7(mOQ*4=U_&XM#lwS=QojN)8MY`(w=yz(fV)keG z)Ej8m1_ycM%xTU2xMD@=!YR`<7=Fyl1b;7g0q_8fG*t-uU6-L@O*yOzB zaA)=O8$uX?x2vkH{hsS-;=Q#L-~qeL2;SavS0H|YQd1L*8N{mKlSqc^6c5DCEiNn! 
z{P`nly1Q2yoJ~B|WbwE?#+|a!2r%-fF)%LkOB+s0|J>C9o2#a&%feZ^t z;ZLBby+9B@(H)H^myd0%ZF~3-{jkw7;rVZfG3)?z$(>?R`v30@588ayc3zma1H-8b z;*oTF_{dc9I!PFU{{kt=LUzw!XAfq~$Ml%oUx5eqHJF*4G|e@fK`Uh1w(VTL`VH}r ziEpp#i0#3}B?SmbeZs!Tv>BU_n3AdyUAQJzZIgEp9x$886`vN3m9lz6L2?KxHXbIm zujADhOnW!NCm?lefp-NUR>%I{GxI|P=vCL$U_BL^BKoPSbQFN;>aavZ03{wMu^cn( za-2p~(rw;M9=HeF9bVA(FDH3M7^dQ3R96%Ncn^pzdE;CIEI44^73Q|Rn~h(REISYc z{NxHxA-s-CN=jD;1`~0L_~pUT(NOH~fw+K3_34DRI|P5?IRkJcc4Y7dFxb3k3m>$j zW7r~jOJ1l$($eXDQq9qUthXYE!Y-#h=pWv*SLd*&a@}pKiz4X^U^1WEEkLlodFxij zntc+lq6_6FuH6+rmks|PvfewK>%NT}Z^|s$qbbVXr6ICKh3t||s0fvn6p=_mRzfHx zBb&^Sgd`O*B3YSP+2eV=@B988&+#0;|L*&`uD*T0pU-=o=WCsG?DF@{7a&W;&v5VF z8TLaDLhsy}=%dPhuQ)pG>w5lt1wSBEQLBJwd>1V#L;4x0wz}_CcTD*HW?M~gxq+E3 z?MmzU^YgA%URPV=DSMRdY_NCo^t@d1?cxdesoNX_P-;3(J-nY26D(tS{QQJ zMjAUQ{%|$1TGlrw&FKEGBVAca7y$wVfq?)KAjm1O8vx{he(WSC69A@6L>UtQ6%>Xl zs;ZQ*wlfnOB~RrUn^zNSIlm{4OKPV_GqO%S~1-CdL27^rcH?E=)t`S6EqVO z38w6;6ADqs^k0olzq~zpN-yp9%98IIP0kaUE6kh{%3l`$xmWPNZl$TqP~6awPsLiK zL?hQTGtHQN!Ofj|s7HtYm4Vg|{Z!@+fgUp-<8>+x5|ezH8k?HDot75{<)2r)IYHdljih3>85)D0BO}LYp{5Kv(ycpO=5?lqIAqzm!t%ZJ8DDF3clp*&6 zmN+>gA*z`K&I!=#UO`6k`Lkykh;VEFem~sI=v2TtJn3;?(++$>bPfXg1937pP!@XyIAJvk5A;IPNKAwyCJQbOl&7R@AUbgzkxY!?ObVdx zi9sJ{CkanO%SzOcU@@ab&Aby@On|#Xp!=btgA?*RpMw`tbO2eS20h4|2KY#nH@7+* zQ1|TL?naafpbZq?v~spXphGqZ4s?qz&*TtMLa;(W0<{2g{R0As(g|G1UC;wp5on=c zlwnXd+=hl8VDx*SBbi89LdQe)Efm|plL%0IrGw?Ik$qpX;nP7)$3e)82|0~!1TIEf z$r|Vb2{=Z0V1U$nA?yRr^%T2-_!0njutQn+8N>;>snM@p#8CwFWIT52?opTz&`Z&B z9vV&0jVsrX)=$()(ZS8ZAp`e0jDU66`BZdt2Q4ezcg4lUZPt^(opS%-!vW%kXm95L zqz=041a>j5a4t=|PnnW>Bt=`ZIR9DU(7%a#k;>VN+c~IDp7d7AF)1I6FEJL`)c@fF zbxY03Flv20xsQFtmmyN(pp>iFzTM07#&zG(%RfpC4}S=jrKWaMrr04vtI>GBQI?fD zeZ!8U+=8@y=grn567G$Khb!e8iwMg@t*pdnl-oG>M}3&q;@>Qg^Y)0VTY~3l-%Bsf zrX*xecOOBtA0lo^JQ#9lAWajKk{ZNKfRtV>+~Cl(`6KoK7!%>+n3!hHeONeZ2v`>t zNKI9h5BeFha(!ODFE5QiYkQDsh=p$#FiR9`=YH|Tp^?YFAQ%UlE~vMsLHUq$2_%rp z$;pvr0}l<3ck*9A0JT@rt6KB9Dyfq`d~ibG2$7T!LnFwkg@G6cQgsn|6N&N#-hGh& zlmnt`U~?z-{|JDu5N?Vx%};UZ%}RULd6xBclTL6K$3$$|B-W 
zMTbb@-=OJ7sU@GF`D<+Kw5jPYfiT^!%_!|u&YXGJGYy44>P}*u0@?>8HR|Yfez2^5 z)u(A_0yoUb_x0lo109r>2Jr7zJ>~cU5H37 zd!;cFF77&E3YAv9%5G0?zjP*NY6FA3b3!1Mh#Lp0F%o7-EOFSgD5{hY)r$yd9QzA+ z7;(1pZ~ou>iF{dfq&TI1Ej*@q1rI#3>R`zl?=BDa_4SoIah8_}1{6mStnwQ2fIeZj zJ@ZoFTIG>rYcF=M#7_11H0?`BQ)@|*O#B=DYwQYjzL$0E(g0_oBnQy_NA>+Ai4S5& zd?8}+hw21%bR6y(M_A6l?r6G--)>3Y9Qpfz2I6+g^IyKCxv~&3G}XUXKwVQUHgK;MCPp{H}rQ`hoy4Cq3 zhUWAmJmIiqb#mWK7C2?fc4n;oqMa`x7BGEwV1=w1Nb%J)yg>`OF4FV2Eve;6Jf zE~i(^I*+s$?f-lUgcbb#?tJFuw*+|T^t2#CMHfs&!bE@pK_`O3GXdNYm=zB*(EVdI zy!tATH6M)~51+g6LX8=yX(es}8gAjhfB+<2zNfF}3#;YT&)5ObeAA{Q$$+sb469)# z^zc}cCpuRR)H57G1JSh)6uUHFVFC8KW_X5AGzm8{G+?x#s3j?PO4)Z3DKLR-5!(=j z_mG2>Wne!x3nJ9}Y$Z7)IfArK#XuAZr~p1UJa}AB6zb1)_(`Lao0}`YbQ1y1Sw1NP2ZtMtu@FyPAYnl$3e3^1 zKp608;_GmCxh$c?ag8Re2AZGj$YT^t^<3hJ^q!oc7Yz2gu;n4JPd}jy zNb&WZa9OPv5m3*x+qz4Tw?re3k}8DbAVXSS>`^Kuo9QE-9JAxh410oh3fN2s@1_Yz zLEhqx_(yw2wad$%9p(?OEJK>GAW!e_4JKY!O46-QCv2FRek>#)kgz7}(|S^k;rHF? z3wPsv-yV@ZAmmb}@(vZY?P5vH>Jz-rkNr~{gq?w$V6j<>M<<+y%2Jcp_;5{cZ|?&9 zUp8N!Rb0|~N3##G88EkpaNwc3CaaBxRk#m`$xSOHo2M|Pr>-%en1MM1T>Z-?&NR0d zts@ASQEqeuy<4?G3(HU|1~dz--LOW>FQH;UAuo*6&xGlQ@pfXaLfMMQFcMb{7mGTM zRwN&Gl?UYT)Aq+7td1`$M@szn^z(D!GhU-VvRH+0l-N;+=_1QUy_O%F4;6wls<{nf zjt&yQAc+)?WG})*2ir*_U_^2|kc3x|)gZ4)RVTCfCn<54ebzqcFHteGcMf$k0a$=$ zv1&(zDmQjh$df13Oph%Pz3BAz>CDW`;B1OsDxp=4jg1|wSk8sCR-r!yH|}oV`B%1D zl^c@1D@sJ$B!R*4^=vtO)o1=SA%n%^Fyg>NxHA*rL3pDpNn8o(&p>Yw3mNpZeekl9crF~l{^esA zp=2P2CWNs;RkqnROYp2gv3$ZFeA|zeXl@RY>Kup>NfMC!)W|t~0C#O4RIR)d#~~mg znt2rEL{6is%4tnzd5B$?KApww@?%Xi zCI0G+#C->cIf`rie0_m^*%wm4FbPe{hd)t}(;x$|HH_zKcUgH|3vamm??iTqO+v?Z zzcu0&j|u0wDXg_&|Cvo*UNFX(##_}s3=v^C4)d{Gfb z^&(-l0Es<%$KOH3KrGPYNrEpU>5`yI*+orHWEoPrn$((X*XLz&&@n?uD}r-4j@Q6r zh6t{Ab#>IJ29xq}zY>c&qcE0a)6RJw0@|Z{Ct3JtlOR!&6@&8?P7g)cJ!t1XwY4pT zx28W<<%YZ!&C>?T#$#`d9=3HWoHAyRkjRnB_=EcynJ}P;l(W_B?1pKq!lf=q&0Kee zMxWp?P~~B5&ma{XKM%e~A0uIv6rHay`G9nq`*iLxSLJjrZ`T-5NeYMc^hROv(r)Ha zRmgULN{XYFA9tazyDmUJY^cCfhp-JzaSts+ONO}V+Pj(d+UOjBO>-Ty! 
zqV%n;i)`qYV4|)Ta4XZahG;(`z`xq792*T;LI%si#hxzP7iG$2T-V4taH#rp=>T^kW+S;4lG!ro? zI~DZO;&iv9=MuHm`KjHTuC!CWFk2Pg6d@wQo4CNs^d&up{>`EWuh#E{Ti+H+gj<;* zE%ZMBB7i<;F|?&QF&Ro?vXilkm=K*|X-Oi;;pPcLAQkRHG86@R6@=34lDVlm#~RMY zzXAO_gYfvL0U2mR-b(5tIL^wKr)c1g^|TOtnYrF<14RKMSq+B<7-j-A16KCwum=2x<1He= z2(d1L0EdxVo%-tDsu4P&QxJxMG$E)GktJ@H4+1E`KBtqSfC3MawiK{sxUApT*VEt$ zqfbX+7~FH5LS8%{VKcH4PtBoR^>d%b?GMs=8n7zEe|0f(at0!;s+_?H){|Fy=^XIL?gw^=f{VDk@R)HLZ3Y$s#E#74fR2nf4p39S z;ihP+QxlbBIq`zwaS`NwpOtIEe~R_lS{WE}3Z&tEDvi`x*e*n7Q&&dCNbQL{lIH~r&Hp4=ImlPR#UCsNaEg z-p+J&`>%EAJup5J%8jN!2{hFF>Ox#ZttH9C57ij#qvy+qTs%ROm&oJ*D-$0%90Lke zwP3x_!heS4iGaa`o8SszYkn$XPcR>xnqQC(3)&SCXRbXi&(Egp&DDSxp>+fbak77K zSR@tBAV3NC8lCg18*G#4lc$5H!u*$=A8>0REgan7XWWYZK#9>{qFkat>Jo|yUs&n@ z83V&3GX#LWYQL(tDhrJf`Ss+<6Ecv2-0-LyAVMS3L84d>R|9Kdjp#{Y)rV?z1FjhY z8$Uj`W(D1#Wznr$jW)~uta7y@KV!r#WcNDoUt64bBdxp+hbPPk_kBIOH=#h!%5q;( zFWy+r2Vk1`SqtwEGP1FK#N#zAyLkj*WiXE+wfHtf`T#tvVFDxh0n5NDaEl}DdB(g~ zm>Ef(aDS0B_V)HCfKEyowrvS1V1&1tm|OcO_QMh0l(Yq>7Sg8Ku`~#sA?+#(bpmnw zL_`px6p=YND2S3dY11LDwFhL)!u5}X_7zqH86pHXAua&KD4apu9EqjE7Zy018UZUf zP{DPZ>Yv8O{yd+Q+1}Q+9aND{8J~$>G%$^|rS3KI=#bAd2th{>D!{n*OHUk)aV#1CU$5z(cio)U%q79t=*?vorllt%QuXS+9$0V}$x z5ls@6(6wUiWJ7BK1oR&MJhaZ@1_qu;+YmF~B#VJcuv;X9MFJc@)=UB&y~33Q4bO@i ze6+l}bm;jDsvd`~F7IkSDb}e})CtSF?GqYcqX?9Ry_9HP9Y97WLC{82Qk+Ai6+zd@ zCX?E1hYni(#EEs-B9h$Uui?yv^7g5;+t){Ji@5%hpfpkLZX^~c^o}Pe-x%TM1~dS~ zWIHG4`MWM_KPyy8SwRNSKt?Hwcr=uRu;!XVI!GE2>v|8 zp(P6KurI$N%Q|rvKeQ4kW{{MjhImjEIu1eVu-;2SKf{~(69MP|1Q4Bn51k{rMU+uE zl}S<#$qK8e=t)173?r~%`K{wX2}vX$xygay5xNC6iy;&mScQ8~(h%7OvhZ~Ht#A|F zoNc@4Af1!`vHm}~GmL)4->n(dpbV zHacn?^{lu1Su^#AqsOjr%?xcEX`wnOr%mb+1?3Cp&+i>dfB7<#`afZPYpWiR!P`n! 
zC>cw*{H3Of>T;qEY;$&W1qV2^eiufWeU`5#VgZ0MheXxul3<$eWU^IwYXfdBVmI6$mVh)NM6uw9yvI}_ zkSfHxh61igO75Slct}Vf2 z=TW9k>nn6V6vuNo5!sJ}o*0$zJT`372!_*wjNm|VLf5ztsvmg!?dCJ!fWndldim^x zjUEI>3vh0-$=o~)l_cV#7T>-w1mSKu^H~5nGg*^QJ@k2>We)#yhsc0Tzrh|MG4;@L z;wruq7WVCYN};V}W#uOo;ot>vVMSj`R$VG!513*1E z{+-ZUTHN_ts&3_~*mW9e0y4vkNN+;o&ce}%8>j}j4s|+Bx+X7@B`|X;X$e_?M0U(b||5D0YnRqXeR*Gn5p&~_y^Z^iAP)laxA0imt_AHeCWsyBxGSy0n>B{j~QJ$ftN{)E4XwlEiFkFFc4)@%IP(x&mG`8y9_vn zxLQcw;|aD1G!)QCLGJV`dO>*#?Ew@SfjIa8(wdW-!UjM-6a)#7psv;0L_)iV#StWJ zjWU59N9LWc$G93lsG32M0fJk*{FWztd$g2oYcR{teA?4n;QPz8&`l;V`0Fj7UB~a5 zHh&1xQc=#Hez%^u6VQ{+LlZ7b5MdNCw1yXy6uY)y)88vD_DIMQ1dX~DizpCr>rZSb z8^{no{B9#eo&mKk&#U97bX)-5!XqPNeDfib7*;45qll?Z$i#qv9#4x9|2P0i@c^pO zoWeDvAP#E@mz^(+dF%g*LQVi#yMuoLAeN&fA@q(6y7FjATL#?i=wHP#z|F_Q0SZMn z0L@Uoh|P4PhOO#`H4lZ@+b_>A!Eb;7kJs;R0NrZx=`or3#PkfyM09Z!jTqj6R`9>~0m<0uCWI)~$TW97T;xlRTIn@{Y4nvsMZ={gUX{fC9Mx)6tgG-C#G08f2 z%W3djtJCP*B>!^~r!mLa66iU$Rbj2f_Yye|WKZMHGARuH7`UMFb*Lk5%%NuM{EwOK z(qqEUbetv!PUF=AY?8y!PR!JR((ey4FAInFP8t6%R*krU9vGRy54p;TREhs(NFpWd zBqW)I__+WMUqb4Q$^_9zAZ>5i=l2(oNul6yQ;gnbM?HT@i31KqSsa zgTSLDKRW$h!2)pWa80g!f@Dm^TVf`gc7tiR%B8y-?&~$Oh)|zTP3fG9M=T2A-?nTuIR0lQGFYmw9m__^aFc%q~OB+5lF2*Nk|=M}ql$E9TMg3|zoYYv&0FJ2&1%hpnS%RFw|xM>Q5`4587d# z0stoZx^a^~PExM{l-Cb9r5A&loiVyz9G=g@qW1 z5{%jac;AcihV|0${bdWhI|zFSGl3k^^%%ecVD5fUkQFKtVhVyQl=vuoIxd0aLZdD; zoKUU@VhU^a1C(npViEHWo)Pv#1^xjDD`X9KwF!0-6jji0>P1&`H63^psM?8w6W1CB zpozgSLgu%jXGZKM8MlG((XWWA#s6>Lq$mmL5uh=l5jC2#86C6_xU=! 
zZOme#0tkaM1JYIs(!51M@r2_BGILQlrN}!*h0uon9bFE>#0y^CGhPP?AUc$|4jHt| zI6~3BSJl_6pAue(Fn^v2!7Y^4RbEC4>6wyEja<^wPl>t;>-8)~l%4ROK7y=zG*$yR zaln^~qtzp!W$=`e>@VEhiO~734I3QYi6n-H59u`(XttRnD9BP=L2DMLe}H$mU&0A|}tw3}%6#XyW>r^(}7Gp=_|m025*}b^)_+O;ln?kJ%h4~w1eLx zwV$&&A+Zd_1bJy>gcvG%k}KQyaXX4dk_d=i4i6qjp%3~btju1V*^NOPg-CNi@1=`@(AB`$1$0-YKqO4D4ur)ZW?9K!^Q@Ay^CTOfB$Vlr4=P!_T;Q z^^Po*J+`*9AN%vZjPwX?(`I$^0c!*P4GNA4dmNb*e;)%RO5$dNFb}83c1Ff&J-#&5 zb^yKp7bk^TA?PVdNIBY=TF5lsdacQz?(an{dMaKf7Q#9hM9>;Y@}F3fF-3=ldKjAF zg}<@PVwgt+*4*BHT*4|G!d|33i+dVm zC*chf&p~5{ujV%x1(Fm%I9#-;KlsuX2yTii_)qi|aFv+X#`ZvGVKAe~Mw34r7hRRZ z^3V;0^kW7RSn+2|C^+%_@vBg=;D3PHSK768B4hefzcCP3q}vh<1;M5SB!JW&GYDWV zPKK{c#44Nw{*ir$ih&1W20UKUOG$O#^Z{@NkQA__*_A7Fl-^#DVB#3?NLYE@Y=>Tg zaR$QF&J?Is-wKq#C>WsU;wpx6j@2$fCce^t6? z^Xzm%YgOPW6JI*y2lQzgfDO0>1=GIBVB7=Qr)_O3mjq5LDsBS$0?r^2;V!zVEcFpr zs(*U&(6B^l=L`Cq@j2{yN2crW zqZK0~b9`rGFm_2<`NO5a!Oyj|J?Mxil+Hb$N8Brs+9frWH1$FQ5s6GJ~b*(mTR&-UldPHAjLVGJi^x+seOf6&lW9@jy=jLH%LbKo>u#?*sVJwTfduTs zhojIKcpZ;}Or7w&fJ#WmA9dJK*wHcm%k%p}j0Y+;WGkxfJ&L7c6@K^pWTuNq-){b6 zocZ>?D@%KY=w{r%*zJ|sm6)_wzjJ!=B68E39kn(0uC2b^y!#dO?A7uUQ+4{a zIE1ju7)E}TDFMK9h8qjc?Cw8~u#)3lVW-1`jk`uZMW;;DnE&!aj07Qv><6zm3m}fj zkc1>J>|60=dz6WYO=~5R$QXyR3iJTLhH+|>b_p)gb-!?CkeDOnf{>U2@bCl`f%&U|7)dARAZ#?GeeBz}c?k)Kwje8W*a~)p%AbYQfDUGZx4W24#Il-IAi2;9b$I z{$IX{ARy0AhLL2FT^f-AZtJm`t#F%>8C|HpHv_&wUlfHPdPwRTSGocNQaak&NVIj5 z?ik0xcaLsPHaaUCKr*mb z36bZUQC4pAuo#gW%it!e1+?JVfI*y=UL7A3R$T|cB_98$( z1L2-~L^U-}RaNehVwG2@Xx8&~T$lfKiEo?xL;ai?eeLg7tzeqHayf4*XU{T0=L z&on8^>t$IlpH%biDDcgDzI=siZ3i|A_B_yz7S8KRYHDica1_lT0|Gi)HlvI8XxXPa z=k6E@dzqzVqFaz@jUBlTUCV_V2z5Z?)s}0^2*w_!x}T4ZFOeKv?%;9XpmkJWN460!k2MkAY~X#@fK46peR;0;?Z? 
z>&YSUYIPfcqn+=N_qgpuzwF>x!Tj{3%h_5XdV< zF>84pFo`2*ognJ)X95cu2noug{{+g3h4wYu9fX+r9bjmD!ta3ry*)nmWYyyGLSRe} zsF$0ILoe6ndtC9DNs!v`p*sznuE`LBjg?DPEektoV;7uG5_>$&6(uqg;w%E z{!UMyB<4#j|h-p|iJ{9yC>QVxo3$~Jw!t#$PEwG9khyJXoI zPBLO37BLfzWE5{jiGcY40GJT43vH4bE>dP@=Gj7Qdg{9RdJzMJt+qLwVC@v^nm zudF;O?vf$8OOm5-<%ZeK57oGxfAy??Q^v2SA-hPq>y`oA^Fy3WtW~?c{4Gp8nURCB zed)XQmp8ln`sEbwiW;?M_-@^4+AZUl5fdY-sUZJeZS3m{^Rp@E3(8CPGAX&L=;+B` z7Cd|`P`|Ly;BMl*dx0aWo}U~cA2$_c8@(?89t7uj&Cj2)sAS+X0gP{dqXdK4j_f-l||WTN4rX6Kzpn{*vu(OSyFPl`xawc zk2CVnUd{m5Ue%TS0*8f`-|57Z=@u)qHGhBr!F2Hg1WV z%a1y>pL9DIn1_0niHWjz^3IlPJ9d_h4NwHt8jsB{1kx7Tiq^a@{iu ze^ajmt`&@G7iQvX*S-a-U%0T7H0<4Lt5T3o!yRzu!#d;N@Y!lzxZwF9BN4f_v$M$# z-DMGP9#&eI_)kmye19PRro5codTb#~_@hc~uERBq1-*f>FHQJtUHt#(jmPLj!((dG^ z?M%{FZB5&LFzou#S1IdRSwp^@*SdSBd>rqRACo_9b>HlDptZdJ?(FLIg)?QxQzm4?s&|1Zw`Iz=CcC9-* zj}a+a;rhn4pmXX#hXe671Aav;DUj&>0$EwU`cI_m*3uQ6A;d@vFy%^kf7s&lyJT#mFg zkMkyHW>6*eJw9;g#h7fE!F_<;1D|8-0a5~RgcN29XQ24a`K#B0o^-?JQcKj50G3Ft zol!h~v{%0G$Pq3D(dcM-M*B2@7|d8PFo-fLiZs$pJ`<}3Xpz{r0dSBQcT`M-#!2KQ z^PAyXAc^qMN7R$}4LS3NsMm?(KWWmeOWN!K-LBDjhC?cW96N^Qm#ztsHrnj*j-Lj-CE}SvhZH;w&jHbmw!^PEGb}Y zkduJ;*{9vPXP1Uva#6T+%VaI?*tJzlOYvc%~e34FVa@g_!aI`#%h8E4Ae%Fxl( zoy5bP{Oh$=teda#n++S&Z8+tn0c0EKQ_&zze^J;bhT6hI@P_to699);ah=YM$AzBU ztTz?SMBt@;=|8=7h(tf0l&a{&4`Ru6=tZOf+?6OdK;hgbYYJ;~9hb5*s(mt{0#J}s zn^k09yJN|J=A}bWWs#3?gntdgIcUb7kf|93UG9M>d$J*5ht0*N(OwopVp3`2egVE> zfg-tNaX6I-f*!I=h9*Z>3?n?8;Im-6(Ea?TqN`qBzDc6XaaJ@QzJ49^%AjH>_B6ck z;_N|*JwEWV4aRwlE;SDHze;5c5H&MPT(z?}`v4o~+ct_w)z7fZvI38{oufTm6fE`j zyX?|vPR;1Cc$|5eHci1ZL(_*FW#9CS7grCPUwXP@^y}vQ%iB-JZ{5X|`tIdY`xc7I zcXuj7xWv!0rE*G9_qw!+hx{wl*xmDCw4X=M#-WUzn)>~7JsaL!2N9#tm|a!x?)>gP z{m(WgUy5f->=Etdj>{~QB}z(MIoYWK*T(qUwa!s99ta5U{J?+V<$D{Zuq%gRL?dwD zxkKi~nQoPBs|IYSNBwA0xOJjmkk_*>e#Yq9C8XGwB0MQU^VZ^xX_ zd<76~ICyKC3xH%0x@oS{d3LQJ*>M?I4IPPYef>HBjoq%CwK{<}uuzC$D$%yJ-O zc=B3}d+`=5poK!I$s>{eexJP4YnhvSFY+-fnITt=iTPheEni%R1FN#x zB}u*Y?z2o3gR^6mg*3F%In%}08U4Ddwgt|uRc))bB*XBP3XE;oS6|OfNs#oiJ<8Ce5uEW+LGvz?*(dFgk 
zk&H+9oISW+?BD2$yYz8nWbfa<-Jd*ugW0-#<0mK)V~l)OS);uYXW_k*m}5|**CWK? z?eObVlIejuiZ?+1XlF_z-@wo0cw ziAyuoR{Qf-WJXk0+MbFTP_!_9s;55qnQQXv#rTS+M=stf)1&FF2@J2vvRFH$m(bsN z?dQ*)KVR08`7cCAbA8K>kH5Op{wvMYl%jdcvprc{B{z?BddYS=2OqZn{odklQ>uRb z$FC7t27*NL49M>)h6)`kI>JzENM^PH4B7YdHczO;``X(3Yb$@fh(8hX0NiKZO*WR5 zcc~g4(rn!taWMb<_L`v24s{GApwXaZtUyT%%mXT>t7CbsM4@KgmUE7gCD0)JYFi%F z(+8ENo6)oamAklOn?Q)sW#EgM_7|9YYijb`E%KBjX`Lb_bp|eDYWPed2X$D$!sN zGaW8RB-B3w8`Is?5UMJC6x+%5Z|NN2*-17P+e-hN} zlLk-(p-D}*56?@K=RO>2oz{Pd9X zz`O4$iPuh_-zU9eyi~tXx_mC~>Y!&|NV$Z7#*@B3EXGH7$lUy4{8f(YZ4{X!3X{>r zwbd0M=xA#u9r#U!LR8Oi=AAo@e`w|ZOX-8*pzK5BzERUKZOfv2R(s;kBTUPy z!Y0~UT1|rLQHoz#_yh$ltNeb>&)@u9-SV>|F{@p1Zrt1`AeePBBhviH)0TH22F6~v za)$9rm=BX8m+&gQMXnGdyTk8I(^FE+fRR3qh#-}UN0#xr z)MhLvYABWgvmySrTq58Y9o2R!qxl)_?M$s%`wqnfRaVL)1a^edktdQDcQe>P0>T5Z z8D05t`A`n1J$wX^Ut%>GOixp`PTo@Gr_@oOugc9zoz(OZBr$@h6H(Rz1Z%6`hXyhW zm~Wb%0K7&pY*!}qZ^!>33ZGWLaMd&ShKK*72*hduC+xtobUE)FD9Qck2OuKCSMjKD zvP3C{q~R7jb)}QS#`ycmU;_k@0NMXFzUsmo+jiL+YTw7<;Us?;%TjRten++jT0JGL zS8v|b*ht1rp=5K{lfr573x^Ow0C(a9!vwL<+8W2Jkt75lv}9?#cp38pP}6AQQNnKq zsBPS~l>?&w!Ub!^<5c?kCmxHV#mF{}ey;Q?HMJM0syXNhkn0mD2fik$Y%_Kr;sJ0E zkt-M2L9|9>%$PXHtm%j`c!|O;W!Mh>i9hl613Jj_L#L zqZ`ootaItW+eyIo1O%)S;1ciUHd~5Mb~uz{M3BG*KAx8ixVfSM2f}j=sU&ejlhauF z0UM&G2ht0=%uyrb*I>5Hc2WhSOuZi&+3My{kDGYs?5b+JFaZP*RLKpF5zRV@k0b!M zktnr(TMiNB^;a-BlK3$);hDLwRL*D_%Egp->hSVzwqHXNwDN5hJMEsB0wf)W)hO)DZZdofX5?&~wOy{gbZAR8Y;F1QFUdPY3PcV@Vs&|~ zdEz>DG*(j@>tL-MD>$~u`Y6{5OAR4)FVS1Mbh+I_6;H5q5+4OleF5(V*a30Vf*=Ft zaGX$-SaN|5G$JeK)Ah>cf76iCPT)-mSI$QDGlBarKs@>oO&tko^IDw-uivwUSPanP zc`Z*OJY{|l_63P~$2SW`vXVGbiKg2lk{=}lF0W(Ot947t&yOI{y`VOnk>IVZuV2Hc zAzgva!gVyO<_9u#kY$e3Y+)O^m;tQO0TBDvXl}5Yjub6~sz0)(NJX+GFE3@Ou~!`< ztauurA3p!F2H8#m&(@X)WoKQ#9vd(8K}5oTS@pf>?Ef1Mc^?DNK0exe`A23?F3!ot z@=H6XW;?#HrCR2l!z@_W+`7YexW_^fWo+K`kUTBG%5b1~{d54}0qJ8XIT91l< zc&MR(F`pvP8Rg;+^Re8v%cRam^WRQ)>3-iep!g6S9mlU(%Q|qWWcu^5BbcHNt_m0z zF{nWxg$WfTn-2BxphpZtdx68>Kk-Um{mA3QL4jryp#$EabFl1^5)(HN 
zWwJQ>4P0_ioRO$8C_1tcpO{fHrKTcf;zafJlj(JIy-3)yk$ggv&0%=Bj8ye8td?3WQZ#G!MK?sz0~mb_H)ZC>7QadUZoW*bqIa6 zkxb;R+w9mJl6!_p35tL_i;Iga@vlhOU#pQb9TZ?Bft`#V!*fQ45E*8JV-X>X>}22v zMj$~rd~E1ogS7+ztHR>qcu_yg1t*&I>krAxKOb3oaUyI`_1k>QSpXPN15Hd%TR@Ko z)cp$%ClZscoYVs*3E5?B!NCwt*hroQ^7(ZHO#JZhFdY1{A_6C2Qoo_G#Wu~jnwqM#p>2f+(v$a-qPG zD^Oy>AU3_{F2wtdkRF^y2@6nHdh$IZp@jhppHL)S0a*^zvB z>m1{>jF#{3wCo%3B~i)N(|SbG<3$39!n}>|zju3+gA$@$vh(plq~;Blo}xrkz!szo zj%<|rEi?eB&2ibmkr5Ht)hxp!Bf-VpHW-Td{jM)_?Y3$jQWZjZ1NzV`r|sNmSwNzx zB!?Rg1EQh6e*JpM>f%TVvKH|8O~s?1@Wi790`B_d%a^NPN0Q+tL$1m6Y#A8HaG;p= zY42E%fF6@crGM^Raruz8iAkeGqqG79;P_V3*oNe_9lLZvNs9?<;xHDdZ9$cdB>8U8 zOm>mCL-s4u6i;2aa3Qd()l~Emq;KjXrHXEDg#7)T-(gQ$;oE;aID(R+GZHbfC?(}c z&p{y;8hn*2lXJRX-BnNBWi_VvLTqn@kk3Aqx~tB|{FR;UJxkiC@t|Kkd-kkt^zb{x zpBLifr=+GndeIE%ORKR5sKYa8;YfxBAx0hKK$6w8;Z!!s0}`= zcL=i3kO8JR48+I(a$ST`5eD?M>c1nvrmnYMsKG_@5COF$E;Uf3>@Xhni?dCTFD_BH z4565~-C(`%=+UC;{>WTfXl)=Gx{qQgcfR!a@#Ai|+o15I2c-zHC2?y(g&D;d=bSB* z(QF5X@b3454pA1G$B$1aY#jYb)7JK@ccYkCruk;@W)?M&*Cv_=rV1QB;tMv^t${qp zXK5?;bb44mDt!LpsTL;u8cGVF*e$UYltxrVQjrn`8d2{xOt-IUgoV*QJtkZ)=NIyw_hEMz54Td zrD;&WhHcvp$2$)RonsS0csd4cO-)Y+FRrf>jcGt%kEE^LIGsk9VuqLh{b}m`WZyTq zMTG|%Z4ENiP>An!AH#l3BnF0Kz0p%kYde}J@V8N@<=dGT@q_VcLtUjyM}A7|t$;Y; z77oyzLS@f`Nb(KzB%s}9uYg7b@dWF?0(uydYwy{OMx0zH;^Ns#?H46EhzNqT#FlLS zeQDS+q!q@c`=j9C5I+_%899M_h?<&uomfh_*V?K?tL_=^TFc*%TJ(YDarTdV^!&;< zXxUC-x=d|oA}mg1Zy~;+}DxYogj$+aa z+@06pOlNE&Kibda5&pkjmStfu^y$qw1NNOUHqj-_z zCPr0DvUJgkfxjBSo~p3UA#9Ly(M4O^jTefaJU?)!&l^O~XA~$zCkcXp27ZP*;4KL^@9L)xMgL_l_VI{+gV5mjf_lJMk-Nj61~{= zeaa+(hM50j8o*@y$Vi32DXx{a0#yAwENC?CM;8Yl<;tOK@katAx&SVOF@Xxq27|d@ zBklow4$z4b+b^_km{_gx9JQLQdBwxs*GblnjtyS-24SIWy@ zew^>>X&$O+H<|B{xbjjjHKw8H_EndZ)YMD)bazLR0t4%9yFa0ZLQNQ_Z|vq9Y3xpQ z8!Q#rA(!5P0Y98T)vP9cC>IAX$*ZQLBL=|9;!`~pnA=c(JrX_`%0eKq@2K6$gdh|r zkl(rW__C7O6#WFim>-=|F;GaCH-rfu2gM_q1&j0cC(JVXsaKHoZL z5;e1#c(nh62bLI3hc-wQ^R2Oi-2jh(#v!HIy?V~jR->nh9H;T7m>!rKdkhypMg!LjeYm6r~_go=4v9 z^QNb#e+PE%6N>Rlya6&7vh{*XY-x!x$U0I<&*Rdo#{(eNTcGn(M~3)ATjZ=>h@!P) 
zrogxKKmN1yd|GJoi#x@~t_^MQp^%#!|GlJNy0O;^L$j!6S4`O3UkezNVhN|Nd#Qi^7GSmpdq8k5%>6((e#t z;C)+MHPJ&)|0}Bb+qa&mqiF=Mg?6A#4u_ZYcr66u7q5RyeqwciIHyJ#uSK28bd!+e z_=-y7=*SHA?%`a3Em+txb#TDFVa)E93Ey^xY+f21{v2jv1?j$@XBsejfI}C@#@UYs;B&V|FFsKaK?XGPPc|whe zde+JQ0zWh5(&F|XP*A0Oz_wnboEGjg`=My(2Nh5R>!=hAb&9gRzb2RTo5X10BX|Df zzu)Kmj5qPKu7bke*kfA=40tloLOl8~T+vdUSACLVxI&(?!n~Ic#yF#lQ8yZK2jkH3%k%(XDnwMpo>OM92ip8<-sIj>)1bd;@6ewn6xRBzw^?;rgx zVexri-|JxlX`$*woOt=r`9Q0^5vszHxIv6O#_}H8`zdnSR)f)5nw8TE;T7v0+~U%? zHEWeJ4q6I=%!1*C_s0y%7h;EPc!;K{)3rC~;*Ki!9-goMs%DCl^U3oVYAAG8 zmuEvC(sw84cDsFU4O109gi=ZbYh5b$x^5Kt&2k)VUit zIQGbzaI0e5|K&QSM2T1Y}=ySuxa(t$)$GBcZh;TQZAwqoQ|tA%!%Xx;&q?D)x4 z>sD#W5MX$Y!}6k!@!t=3Hb#E_5fL9xMo@jeQgKfMZ{py7#^u#xQ4N`P%!-O9y34-g zrHX1+RmuHKtUkF@UBX9K3t8XiNa%ebI!vMn7+r#}i<}_n3v<=#}>=Dp_e?NZ-rPU)R(SqI!ZAUhA2Jcx$50 zubTs=Bud>ekMC-Kcw~NvyJUl;kx#`5(Dc~ym#x$M?%qY(o5foBy?a{_I%o!=`==)d z>y5qsp0eRr$^_}Q`tOZbTfyMq;52ZpEI8*N7%9X%fFI(BXOe-|yAIAUf>{QnS?qV@ zMae{j9-TKju3Q^yXGTYf%7G-ILd*izQ4G;oIcGRcjJMv~$|G)jHZ4gh+$fQt+M8)u zCV*B!K!^g5$T)DcvxALC|4x)iaf5upWzBY0c(JJKLCxJXcJ0}$vbHI-_gQNoRbHih zNU6BcXp!C>n4ILvBD=R#yyWyed zDmDDPQ7`qQ>{N$?`GZ`8XBxZ3QlvZ%894`i92TH!9@rs~-F`XW!pH{TPaW?3u4TnM z4xbA28pl#O^>uV6P(8u{5@+3E;?~bhgeFm97LYyk;A%|-wLK%!nYFLe?Ep`IzkwhN zd+Py5HuSl>y_Q4BbSBJWJHh?kbs+lUtF$yrgo%K`_k*l?IxWLNT+T6z%4e@HV8mnR zjB0uO+b)x+M?%TZI#i$nbcLyLa(X%my&)M=w~WEegJUE!YOr*!D+v`e0do$d4+%Na zsyuub%9D9@@8KgArE`@o-4!KnfRG;O(ogCijpZ%>!V#oPZ!wFm5p||R&e+;wYdNA* z7%e_LNXX25Y(h+ssg8)Rb<1(Ksk?D?P<`cp|X0MPA7!?GrFHuZ(BRd@n zZR}5SK9$Vc4r5QDV(IsWnMxSs@KYX8`dnQNk4W3?nE6 zm*2`^lp!cksnEX)NWDi7+2uYW1Th$DHHY@x`)E+RU~VH9DQW~_p7vT@I6$78t=jYT z+qUtGOF$VJY9+(HmCThSI*^9^74)kJ$wI+;FF1G`HcHQ(ExSxnO(0ck@Ii*xyy$4R ziQLspHI}Hz+{63B5H`8?6qo(dAuM6goWzFY2AB1Es1paF*f2-v)?jrZ0$+r1A#KD- zdu>MJYOVR9!0>O0TwtMvOYHJq?Vsy#m>7(on5ep%rEi^*p5Bl88QX6IavqjuQOy!{ zG9K1x920L*=D>)x6_vB}OZRsyft!`IChP_457>HqZ@%wzt+v+udP6gFXqYGE)>%hY z4>V7%c=5J2Ha7X5*bHE8$&NygHuOD(5mmM*z*{HpWl1fj?G#$t#2>8@`h=r!$v)}` zBSx?PtPBja7D$8Nok&iA8D_!Xnbu;_s|?76l5W`pqr1UjHZ 
z;=qSK&!n%cBs~%kDwRONx9`Dhnxnbty!P$%EV%F>f>&bxQFjaadwd%h$Odm}ASc4w zdOs-Zhb-j{W_bbJ_RP#&>JEAl6TdUt?*6utq_(=$amaO4Y`a6=+a*M+*2|Ep;^4Gz9<*rB~~HvW4{OE#uPz!w_a@gazwMcMd@5YnFPE>I2Q7uB4VD`-Am2jnf6P}*rKI3{^N$c=` zj5xMn%y!NZ#6`c%du^H1mT4e(;P?l0JC7bc3adMdT#shE@Rb#5vqORJ zuhiC7o_3tFESx@L_Uc@^d4F#9S>7(GF5U=(M`0uhGTtp`SWv88_f`lbFs;oW7H z+OVxV8G8FddCZyr`LZfT-Mu!&M7%c8$b9Be4qj>w#%F;cw1hDPd@Q20eAQkg;zgng z9ZKtcL~^sCYAb|L2{;lf?mggPq`(7*dB!y^HFf0Vo78VZ|A({pfadz|L z*|JAbMuoCR8Cjvs$S6@Lm6n;6vS(xq*+fRjt`JHh6p5mtalbxY*Zti0{ha%p=bYy{ z=eqvazw!I|eSe?ve!tfHkI8$9hHY1_2)!!44A?kj;f$aBQ1M-Q$9DDB>~o(*6K zm?lAVP-rPb-vgX^3kAekd){u!)yPnonhZ-xc_UnEKmGpw#QeON5^XnSf-ZZyx_WWI z-ptIKQPT7(yR<|C4RYl;-WJX{0sd)&7S-ZwGD-VG0G;KTE;$kphwTCEaR^te=2)M; zTsX$&F>Tr6(exX%g|u>U?1}PVnq;|C&B3pNXn3hszRrR}CA!Wz$K@>9~ITsZUZ#QDq9!{_}|Fdx*ItzonV>aHd>2M zfisUtTZdf*_U>JM_Nw=L`mpOGnRvOV!`VBRy($jyPq)TJy;1*7j`_Tqy62d@Ls~it#W9#7#LhfEDIFOdeMawR)C(7WbaHux)g65k#~%s> zf0mU@P!D~evD3xwl|+zu!hv2~-5#>b{KCndRHDv4@;; zBF+ZHl?fp|$LY>dXEgi!VB00s4R%d50vyn$&S@_HFAep+X+IvIt!PvfehOBCL1DNR znV8t@lG~sX%0r)>aQ3g!6aclUI~J!){$4_NfRU>4(9sWm5-!D{1?U2hVRNWxkQDU; zMGhX@a&vQW^mVzs);7N$&k@P&+{GdFK<4x|I1Fa`?AE>L)Xq;o@4MlhtkBawF=diW z==-T^8p!*KTaVJWZ3J8cDF!TvWSQWn)qqkSNH&a~ktr!X5btgq`^D<`Mmh3qVXbAe zz~Ep=d$5e9OVNVKffaA==jLh^0>Igp zq&K+htwmhC267SS>N|`+g7+IrrVqkn0s89EZDB`e^mW_{?mkoWH#_-- zj-GAnCC6u5@(vUhHu=WF%l1J8x{0o{vG_jvKxh+n_kYbgD5D*}D`aYP_w>ZI-x1$# zk|%jUSAgNn-NaoJoA>%$XAy07Ip$mGF;BOdRn);v&M_NdG*a9WAX-2k3`g^a{oh2% z3;awjj`ATov$t+tD$0$Exw-h|rA5nRN~XQ1)B9kc&a!9HaLo~s+$0WRTlgkP06PIS zFFulr)?EAT$0}f^_!qj-hqodAhN-A-CEltXafZa+Mlfy&UZu~?nTccJ%Y>JTiL4RK zK8Z{&F0u4U#%v4=A*w}GwP4)grP9}sE=}I}>YZvo>ephMl>8TKOuZ(H$WTJKs zs57JS@@6RvJvG=Ap%T1SLQ``Kr|3#2W7jlc#o~2iV5u8k{Z3TRAtCRQ{XXs3v*&yM zg5j}a!>$jHcwI!Jp&7}+p+8!P zq^2O@_#_%4%X7o_)ArlkyM%&;KE_5;zPTvMC^B}IsrJ{y85zd534^P0prT-tau@-0 z{ZCmd9_#qk3+$N9cIT~IktFi!;~`DH@Fqy{kN`0Bx5(%L5oKezefi(Nu8i*|r!8`F zMN-}0Xoj;Idfmd+(VHK!S3!ZyEUfuBG`zBS2{UgRwO{@IUDTjvgo(tJQ>+X(ezvgL zwW4j;?-RcnG!a_Mr>hRGKj=z0-%cr_;*39Y(5&`>m?06e>tghN% 
z$Thu8CqQYJ^{&u3<&NiEiQ%pLk%B zzop}Qv90vtA?-8DtLK5FAOp^;>@eFpUHiCQtZy+IW#ggnn=D7`xB)`xjD~Woi?b52 zGvIeTWMO4!%=|W{%ALpmYvgCOdrB6eQ%+}Bfj=_aMf@u{k< zui#FiC`eJ%>X-@7H5Ox{>sH`i^?p`uNJBblH|B%+Hr%Y z>DfKFlX~+^N5YPs7A+IIS{|93EAq=;tt>Ol=opQZl*88x>L4Ap35_4BaUvn{rmGh9 znejlOG+Sv1(UUe8)|oV?cLaTNOF&L!FUT`At>oPg4$zwEhTW}{Vjus~qrRr(mHFEh zZf^08j>CM}2aoV((W@}$EqojMRxwhPxjZ01_Yz(Xg(JewrFV4zIRfWAo4xC(L}b}I zFtE^eLAtTGT6j7(p2O^+TFd~u^jhwV?62bt6sC({J;M!lYUBf-e; zHY#K(TU4XR&Y`R+Zsl%;rCx7`eT>UV;-n<~gLbPf~NSxKPOC(n4#Mv)h z=o`0e**3kOHA#=v`P+gp?D1Qtg-D)EygXU;WE zvi#Kg!Q&E^9QLovv=bmUgT6)l-deONH+!rr^eU?z9S$^Hq=%gA88x+$2=oM4Qzr}*>c++_$k!u^e^QjA zZU%|lPJk%<1}K+N&o(XsU6mEO$%puc78fh7!T7YOoz8X4tgH`h8rNVI+`!0XRFmtu zeFKruXb8y)5#_;L7mRNL{&-SL1zk%TTiXeo2g)P3&AR*SQf`NC^8IuEZME<6i}dDa z*Xb}MvFU}+bD_`QZp;p-n+%xbf6?TCk$noCwSb`DdUZo^#Ov|WD)XEJy`RuNkfEhy zrBP8;L1BX{;@MWeIwJkR4ZO#E6m4RfN5Jt?4R!`oD}EExws4j_tA2&2i$wWNlwY7> zKv5E`!)MfUJsLxN{6D|G=o9H6NJT`Q?Igc}2KwCD(gG>8F43Cc-w=&1M4*u5fR$qxf6fv~h2Hu34lTn=i{p*F5C&`$gVq6Sjlkzo z$Z#g~AP@!8DN59SUPC+;kjcCU&pH9W*bW#v2r~`9Ga8c`m-9!tj~E%5yXmb_XOP<- zx9c*eJM*n*R&nNX!{5Kg7u(cZkN;4pt^{NH9;h^lx&gGgEu^Ih6-nbpiQ32OdHe%g z?B3$`wud=`ST^#lYVP2rV{zXSutDQn2Z9e!0Q!_YokqEL^H5j!btONwU*n-$drJjJ zFXTz`J$IQk?lmXC&keWO_D?&8&<}`TFgE_%{RMJ-|&6`=ZAn4i_scU1$q5GRTDkkvfLh z3nG7XJLcdZ5dy!U0F57-iRtIpSp)JRb--R&>erS47_;s%EGF%+3#tVKDB=7iacBsu zK&lV1plILOWs4y+jwg#Tr!{#neVMWS9@()ZXBYSwDH(wuJ0d}NZmfn4l6dd|B$@`* z?XyIMXe892Vc-UdcfR9Pg*$!4n?gxPJq9TA!&!~JXprcMSi%YP9a)I~p%}gN_cw8? zV&2Ms8G{+j*q|UPE%*8#0ys^GSuymJeTmr~h%sTQ@-7<^3ck6+lsS!$YtMF*P|S*j zqW`(Jx7U@LPgkNC(^9p?^}@F)5bl8^LsCoK3vUEtv|CXAuV6Wm-((v`lf-f-N7^%vvsJ#$cq;uBBVUOZRK=t zhSC9CgbX5C|9orsOIZxPIlQL;3bz442ICY01Ud3Q<0!et{PRt6a<9Z4XZ7}-?q)er z42+XLI32qDZa@kEdwKukg+`T(UWkY}(bfQraMY*ZU}yJj>OV7mf}up6rvMklZd9f? 
z?^fLY0pd`25UvRtV-($#NY4S*N9B!my*i{~?W(ufL zY%s`dB>vICtKBU|@E0mi%KED6^vgynS+Z}P1lH3!mkE8X2Pf8uC z`Jr`Ud7>41Iqb4N?%cIq-mwr0W3$gy6Bz~KuJ`P;S{F3p3H(Sr_rfbVx!ds~qyC+p z>i6zFIHKNV)829rxB@o$85w=C==PHlhGXDd@LiV2b#Fv23B<ot-xa$9y z^L?nEpkY3x-CbC;+TT4q2?T>v%pS^M%`QPU?o^ zth?b=IUoWFr>kj*g8}hh#Kr}C9#XSH07f`uDMEOOJ2*QhXC`y7AUBr)_DC)cgEADH zdEVwtahASdh(P3`cUHnAvwZc1v6yWdhRnFsNnpc+oE#glCOBMJfw$L4eFWWu5L)3? zUsMWE<=P@B^Qei5mX8e$WbW8gBvR`7rmg*U-@K7mfYrQCjRy_jdtkkU;3Co&wC4`X zM!$dmhHnUTdCJ#nAZkbult!nqx(c}HvUO40n>P(DmyWA5oDNs{yD85!L?@a_Ky~4@ zq;u3$50R?Rq`8W$bVQWNClqRlQ^MaSBom^z&k9-gbPKQF3 z445T`yr?eQNlOlIWTOId~H8YG&R06m}3=d-UPs#Mh z)dpPwCY(ORye(Q-xn|4M9b9zt)ny=Wv5nAhr+khF;1A_AqMy#cuA_F9FNsP?-i#hC z^m%)<`Yn!GgJB%rS6;ZZxRl$;a)53iwr(IstG;)lq7HyCfR(%%c~nr(knr*L7LgG~T0S5*uo8FxPNHcm6=x^NMnYH{A72ImEd6CcMHo6f zfK#v|Azb4iZWshzfw#nwM5J%vIB(wEfPk~}@K;ByCZal202x7mF87iSlsCX2l1s&3 zr<)|D3jrBg#Fb?$pKu=X2srfut+pZ?@?47sBF=C2l|YlGKUWf5;1A;-^7uC zq|TvE>BNGg(`5id@kXR!L=M3_%JiNW3t7Spj<>=U$ zzP7&8VBy&Lo~H{q5Ij6S;|I}G4^dv}?>Dl(_`@Fd9pCZG>rwT&K58J5hk}AftgZE6 zKnK?a4%v_6$G{F1@X2vW{849Y!PS!ePii=rsbv$4gCimYRaB7h0gPuO$aGvuNs%L7 znwqylkH_xW1ZEA~JTSRw93;r%ef4L@(YzNV&J%KBzZ5-v{kHb@*}IE>j-o}oot%8} z6@N;1tlX*byrR?-ha%N9>+KMn21D&qhbZ)e9?-B5M8@?J38vK6CR8o%Q>;~@4clpz zRb9=I_u|9n(7fg4^k2{7j?-4W*(#&v`Y4pfbp#aG7nI6yTk*jx{_?~`PJ&4Q&NQrd z$YgQbY?X648d5GLMyI^0^R>-+K&|ez_GRDz!NI}s9F^Z-zzJ_xIt-$q)uEJui;JXl zVnQ1qt}}2o^2fVNL2@7{jSLC|T#3_shoE3veG%?%V&f;K9&9OM)j}slw5JkvU;CZk zJUJ=d{zCB96JWKV(IwCstw(Q0goVdp7Dr_Ic-QVXe8{0fTSO*C;10(0tgp^iZyZVu zjg0!!dFXCUIspYg246hz;N4SGQ&<`!-^$(JB%^C7cIe_j1P{(ZA}K&z+VHy-y|`!3 zo_$fYxU{dM(-x7jU-a4cv0M96rf0n6?YO+Nih@)s2w5OwH1WeC6z9>xM|4Wi)c+gJ zj${Io2#H%7OpY()y!xx$Fead3E_3=G)y@~%AH)3CZgz^_^5VOR@w6L%y5~f#-i*j& z2Ou_!s%c7DN~*8E2W1PEktD1dug_0&Az+SVGvc*D!X!T`^b(JWd(xd$>{}v4$I3bK z$i;NT^^pU=9d2rMb#)@sK^61M}xLT~D8; z!c+w(`M0|PsABoCT}ZqHI?b1bR+yQ?3h+~DK1zQ3D1TjDorTe^Tg`eXNAV`u4rJDl z+=veZWnLIB3ahUKa-RM~fC@t94+*Z&^ILw<&ehykx)h}yhw~Trx-Vp&;lm`R#uIYR 
zhp@T8H^6}AiA10SSXF$t=Q*xDV4FQvb*v$WIgXeK1-JWM2;NgSR`d24)FB(hOUixw$@ zimfB#{&wZTa|3^WM9jqf{Tb(>+b{&dGd9tP%#whB047~HR2ZX8#sNXn`$~*vh~SPw@jtyW-;rX$Vf= zR$3&SHwjNkGq^`GL55!GYwuS3TdvY(7K-<^d7VjYDanedT!kqI_b$osL}c$ea5q3o zyN9^gx9`|-8CI_(@4cr^*|sxAstO2eK9}t7WXjW?AKAz@eu#%fBVYC?{~_Y5J2Ox7 zp-eWcj%*mD|Gh`6L{yK2+zmZTObKYoB%IC-U0L&e@z7AU@yVkBZ5dU@_I4m3f?d9JO&(||XO5AR0hh36)zK>od%`C|Lk)SAzGe>}pxn4Ny0Ia5RJ=2|(4dLg9^e3vqe9_T$d2;X&LQj&!4F;#U2 z@yEy+erV9}B|K+-#N3d%CHbTn=s#d8p&A)VFJ)0tz_<$`GHx(isLja~;d}u%{x0Vu zr@FD_a5<=pe2J0|L7CXzu325D{Yn;PCEAZtEpdmE$4%Glfq5!)BIw`(P8>x0-+0cQD@07!eH!h}nVu zK0Tdx6ZKW4t*$5lNuL%w0R8D_Q_$`xOX5rGO4qFy{vRO((v8vXa% zpFKlUxms3h?)xww$A2hQh?X^^<#g!%tgPWL^W+MK1{Q5PFrHRd z`HX>CA{0X(ESSGnyh^f6`05MXE!Kj%yv`wz{G6_y74R#Ng+7|yhrSas~1u9pVW^IwB%O+-Wl;?7iLaNuqty@n2vvqLs?!&1(TA3u^Y zuk%Mx(V)D~eV}p++-k7J{l{$mto-8*l@z)i5o_*p{5}1%fwoxw&zq`6YrU`$X4NrY zoq-aEyx#j|E}ry)kG-Ck=!IRf2(K=DR5JE5r|z) zYTwO|AH`BlF2B@F9C*I@i{UpJ9&i1roOxDN`*i{D8oOOQy#Ca1z07#!pwQ^v*WA}; z&KXg#JJ^O>kj1Gy)#&eU&V;To=~Jp1E`Mrdcv`Kq64!*hgTW$X7&6_6d8II~-4Y?-HCv8~QyZg_`z37G_+6k_5dG3*z(zaKX zT+OT6@9Gj(oH<5p#~13ozpf5GXG?f}BV%>CuPry?)oJH_CL@*eDGLV&q+Vqder$g8 z+c@C4+}W^scMg4yg>2~u1{UU#DdXq2V`b*#O6jZ9XE}#`AGHQg>Ah3f#xsZ3kNp!WS zs7UA4t6*n8>O9J))WMWK+qrJ8GYY)YIzPY}5k@IbS^iixh|!HIH!xIVtN5v3 zK4;&RZwfq~vFlFjnP`rRW#`&Me76UET@BvV+XeBKW_emjj30jIO)uLqNfRXhy2$U{ zxxjAKb(ch$TEa$Zs75%xxvy>F==nxL8xfVz`pAunmm?*TGkyKJHPW0T{*{YEH(TFW zK6#@RQMfp?HuYdeqQhezA-8?qK?dQ8EnjHNxr5xU&o_OQJ7i1g#SqX~wfHUU`4c_U z5|!4qEZqiJadusNZF{s^8y~XsE0x@`nW2NJ&e(Wc&Q<>zrKmU2d^gyu0 z*Mcj59WKjltBwsc)!%TaHg7#e7>iwms@2|`c~^AhY-q0I%k8e6g$et7pS1!Uf8~fM(CiilKX`1BNoYe<3IvbjueNUA> z?ZQ!QFkzOl?SWKkeC$|7xNuTO``LX+hVzHtMbH9<4q}% z4NvoeypDT3QgvL%SiZ$#gJzn+=Cix6B(QQkPZht`-@102DP;r2L9x%;RP43eUr|^k z?obyi^sJOxJQ_+VVx@W_@VR*oE%TeigvhV~58irJp)HsHRmXC`lf`2uyjyqo7Oq+SCQH5TbLncvw z{kefX%4`SiB`Z17Drh$=9CYPyji4A!t>b>K*!v)jmp+)I$W!Grr=%sl%F)aG%(Wjq zXA7Q4WVwmOAybUWD}GtAqhq|Lr|~Y;PC?Z;_Rov!qE78-S)<6JR=vMK#+Sp!oq@f4 z;#lSS%Qsjz$A-9Hf3?O_+~7UkBJD3@q0dh6H!PK#zR$y1dIao5xd^);1~ 
z^C;FclSz_G4W@8c+9r8Wm*x9i+rxoQrAAaW+WQ||4W}(z;IDT}_>z{-Jj!9Cx~+2U zm2&Do9hAAL8G=FQDA~C9nD-xJG1&0r#Qs}>TSR}=(mQQY)TybTu<*>>B<B7i~7lrl6k5zZyt6e)=-ijnvcwmM4x%>|xwgQeG_0l283P z|A|tq%4s?`KbE5V)XKlMJbXC*V|s@rRm^sQiVg@jFHtxf<;>n)#Z44^X+!egL7xmP3 zv?jg2xv-zBck>2~88gqv;jOE3Tufo#?;l-b9o8m0`@@gZ{zF)=;ctfOGYsiPD;hI3 ze*_M@Bo=eHP)C>cHsraaJ=UB#CuRQhMeu^rfp?+DZ*z=#_ol8fRXQn}YG{)v2qiu;bC=5JB6?!u;KyQm%kP^$dD}{4(b{Eu#p2>xO@aXxSAe^s-LayVF-3u zH(~Gc##hNFsN_Y(B8TmUgH^)&&5rbI7qwnmrC8jslW)VKmq^&(mHq6;m2bwiWtZsQ zZ~cFyb7f6k(}BSO&+7T<7(4KBO5S`3uvP9Ku{G$_}R;lA&YvsBM|+r*jg ze)hSV;p{;ZuPZzo*qYeg6pLm#l5g%8s$BGu-VsW-pi$)?0l`{Hm|obof>Xc5YB^Kf zR8Q>94;%UQ-S1jguWvHj%+6)EyyRHx%d4l}+T39e^<7>F=R7WV!Id7L|Ux}dDE$TFy#+~zgzuNklao(T4b%OcS)inn*Hy8F^IbQtT zX^0|Ww~q7CrLjQaz`VVd_uVXC7+-Gvv0!}fo}wzHDbHf)C}xNp5ccl>Rrg_HQaqXG z=Gj7l@)^-5NsX+uB4U+!%~G##(g;vYQ%IgtTh)=WF$@~zcCgL=Nf8jB=4iA&Fz@HW zxv%cE*R^j}7S=H7oVKK)E(wxi`26|uj}OK}22vK{bg0YPtd!Z+B29m{sFk$#qHaO6-zhpho|Vfl%!L%!q&NgtZ@%#3 z;NQ!)kFV1YwP%u(+*yt_Pk3AS8mk*r z9oI^@v+rS~b{_YTpUe(^wq3byhlt77=ybYN8Px^uUOA2nr_wU1EomPgTi<960P&}< z$-1K~S6gWgq>l^iPTRvSb8UBE1!uE*BSmU0cWqQNJrmO!Yw`33?_Fz7NX;as1stk0 zq|>4@*#6TpNF}K9KoZ@#xgcL2k26=TrR=U~x?g_wWWs!h@)o8YKkYA5Dhkl(2k9}4 z&#j?md-{x`bWO7OmJz+L4Mw8Oe7C0Rz8kPl3jTC6lVaCyG~Spra;qfmB?~Woi1M}x zI@`AsdlN0Gyft}a-E^2or$u5Uj!+tG-mlDlb@ai*qYBj)$)*_CI`Uq)DZ__xrFO4jcBFiaQmOKJ-c4=T3`|P zpAOb{5wECzt!b{1%G(x2H-3`xuR+dN);Cde+dk(|C$0bfJwz(^AxDRi`a!zLroo|& zpQW#8NeI}-zo#GFHEy`^`PGyVc1Jmp;=7M)%zd+bt3wv5v)8TBx>gZZacoA7I_T}R z;l?cU$Hqc63@#gT*2Yri7aA0?jEuS2F={R{zYCC`P+ZnC_(*52FZm&82V?#^D?i)q zHBPB&q8E45i%x!c%u4y<^(BeZ&wmEKW*bd>d}!y{t@;wZEeEc#Fgx1j`?{!((YO?( z${mc=3mqsIA0MTmEwFp0vZ;B`WyW>KL`{Po|?bH)P8*tSWD`_u43Q zt@rju1(kz3!lTAC^%1=FW4b#=&703S_4}P+sP?I%%ctwr<8M`GW0B8dJY!FJ|N4b< zEYnBgPv5r`ZuvUma8`h8{zB`-Id7F<-r&l#PSq<(qE%jIJnXL-4!nEpa3!|kpivro zMqx5lfRbt~)rcuARiOp@s1KEs&h6^WQM)heZlpSWjXTmt?0S%j?YE0JH(a}=f3w7W zz_IxI?nSq6_3JJ~P+P3qwMru`x_xLpb@!FJ63eW@)@f*t-Rons4kcVXA!jxLQ`~QH9SnN``Ugt8;dTB?CLCxdK}3 zj%96KF;%@va|NPuV-j(t_2VhqJnptCzb&L{s7EM 
zbPH+ql&3GPsbyy0ZJxO=mgcVbv1bl4B%}<)*WP)SggdGA^~n%0fq|QSSui5p=>rct zF~a`qOY@F`=fnMhD{DCX=0EA6W89S>wF!;U+_E7l>iGNL9d$Lv1n6B!V&@)J70kc z8xW9tE3A}OBvZDqUwy9(=cPP{FD5(Z*SpYcdwb|?=JoHC%a!*;4NoO)=TrOn_tAdr z+#vv`z@M`+68bkmk_)s5=AI1bLSY+8JZd@&rd}h%@MJ?bMg5g(*Y8UkbgDTNyPfv+ zMl?KOZKkJyp6q>wkGs?zHr6K5qSTl>Mq?T4Yy4I+iytz6S!iIocUv{EV)wlhtu}Lg zS8X6bEwmJ#IIxE5ZLsL$5sS)cK7Jam2WK1?+gw@YdOA8AJ2z6(v#yGYs2Q!@GJ=T4 zH&Gp0PEK1!u3k~iW@@!MUv*fss899CkMjE`e*ez0UZDrf1Lr6i!0i|iW>WF>D;WxC z*FuYjgiM4in5p+IJ>3cUxI(ovW4i(o){vN8fObIXCqW>dfC+)5t7iL9nzDWWZY<^W z;jGd5$TCGBwdWxXA(>{#_(s@NAn1x;WhzH0t&@gtyW83HtbShnu7UHs;qfe zPv`d5;~{QmDN4CnE|e!E%yfvv?zN*a;B{8`ufHGYeCbCKQ%lcv5dN$>C%Nn1P!X{S zqfY)#eW2;`Lp!rljm+{-&o$RRR8(l>xn$GblWkxqB*XDwX)!Ic_%@Y1=WJ~3cP>S> zyDzlNeD`=l0f8(7Xw<)YdG62ZyUDlyUF$9PCw+>B6{jb9`kcx=YF|CPcv14?;V*73 z<&w&Ow+?=o^Plrs-@NugQBekuRYI2W)_juo0R942p=dnQctCAlBytP@G{$4j&%D>< zn|=m|5K;Uu)PC6vMsS))S4fZ>9(s1gf%_&Q_YU65WC0IgR_iMTJcalKd@o!;NS6B- z?7jg2#Qu62y#9yP#C%3;s1_bn97KpyNQJ^_KH$5iyh zdx%^%&yFvg8q`Z}l=(l`oo?VNOy>XUGrMWKvzRNAd_Wz1!QbjboIIvrHba}z9V2{1 zPuuj^{TuFQLifpeh`p7I>@7@q%I;9I;fbX9=De1F1JL?shkPeDz}ll)$Bm3A0BP(# zU2>zZ3b}wK3)0UU$q9(YlsTKnHdaG60z!x#Fgo~7!YE;aF{ngA1PvtEKtV#1+u>A7 zNAL`$@R8U@~)niuEp7AK_UT)b9$Uw6lF`rx3fgo`~<+Ta=|A-|CrA z;B72}{IC;HIv{hIYW$y$kq6de<+lpDig7TdoC{^ki!Gdc*12=)mf+{5B{I+h+zXTr zO$6HqZODD7iefifTIoWndz9GA3DXut}CxB z;4t$nyLoOCH5Er#t1>%&>~|nsjg4QBFOraq#D5EcFbtdFtnX^ca8O7(US+1O;q^4Z z5ewvf`eKO&C?yD@5%4cL=PRYKxyeaZURxQd5Wy{WY4!Dgf~I8)o?#FrBNx}0fS@=| zaK!yjUraO60+Nrc2|%_fgC9Q)m$vn#b3GZJ5rGpOlj8{}SMzPZ1(s4YrXPBHmuDn+ z?RSa~?L;bV!JerB=yFJQwW80bIVXk)=Qvm;E`icb!dEN!!WiyRICgk3-jt+^AtIb) zPRcmy$<$@x_J;NNP>63ULXb!mU}zXgUK`71o3j61jCB5z8)AI)=xvNwM8Z|itRfb? 
z5`$o;*8M&wVxT7>iZ5Q2>xwAiwOvc|Hh=!+GV|l&ISfrm=sIU8v|ccJ4C)yiluV_= zp&=pK33KnLs5kAI^}ASdX2;LUbRmq?VZ(O;2B_!v6*i1{@irQUJn8&Q+6^e3Dj)Tk zz0gjJpql=@_eGtbkEM82Xo8?$J!DS%mY*M)Ia@8SP_h~4ORxM`8^U?~UweaH(rCE& zdoKA}K~i+sJCrw5@Hvx29Yg_@DI_f)n9VJ?1wu2?=Lf18q@(Skh{qQ!BpMkE*p%i48W<)t_z`i&ziDuB$aY!o z;*WY6nw>3p;rE+$O?i2FnC7YM+{6$42e?SCv9q@(%V20qMat;CLf8ibjI7|9x&}Rn zLiH?TyR+v$D}0eqDfu*=0mpZeC(csR{Dtg(T%HgxO~d6EXqNmM3aGI^_wJbURvn&& zW)1}x_A5b+`< zW%xlPN}j|*Aqe%6zVd%En$|qXJG@OJPudJK7(!a?Y}=#w5qMg1Zr)s4Mh059#DDL> zhHGX&&I-hpr6Q;g^VlFRWnb#~d7QCb{U&r2aL(J}lqKP@5c!c5JH>NzzWw*k-unp8y){{z zI{1;~IFWg~!Rcf!>&}RM2Tq>M{P&4=<2>soIX)1lV}ve|^I^Rlwo>mGK$rt&RVShb zVnFN!X2DgAdUZO-3Lv>;Z9%uqx=6!Zcv&Ametc$gKVe58FTr+#;?mn%X0sMw0p)+x zWVkwk$A<_whpTxBfLX^dn8OAJTaXbRej2~@I^SgWnj7Y2b?;piKGL0u>-C@%SKi!;#L=GC%DQ$eQ56yOAPNWAWSrPr( z=g+DIfBdY|?%%(Ty#WgrnY{rm49GVMoM}V$V|I&BM~Mbzy(zP)6md@ySS6^ zhu~ayepH83hrDxKoX@BuNZ4oa%%S#%%5EbromJ!-#zGK4B>XUnK^oNcQ(qgwg`kvq z!OTKy2ZtF7xUR_&j4eV!OW4oEA6^1`)$a6|(%>{8Y@ZiVG+xp$;ZF z#X>?k!!nplfWrgwyPy_3jE>13z6jPDg%mypq?=pm=_4F^zMnjCQXBnPdskO^gCitp z5HQG(q(EbU*f*@?wx@`EOKAVrqiktz-hq;aeBt2F-nF=!9hqHV0D6H zpC37O>J*voOLC5|N+3grqly~oiPWkfiHOn}m=b~)kl{q)h@rg8gxQiPvqAUab76A) zz8{m(U-9HyCZ!X)YXAOOCHhCq0fOcNq3@Vh8dq7l2T%P6{MndYSYuU>u}I-W92f`} zE{O`pwPaP#X6@T(-GI@d^R>TMMv!EIqnd>F|D4Hyxp2?VNUcBn-Q2{;m_$fU1fk8V z|BXYFsJWpiOMBshg+M|&z|GSIVto+Bx@xT;3Sy$iE_NIz`IVi-8vqRw1lBl_ybymS z%Xw3R4_X*9paBV|2+Goex((xY$Ve-E3IyYU@ZXIuyu9<<^CSA;$mq#Pn;vuqE^!hN zi6A?yU)ggRdNPtAT;q?k@h#k!jqvFAMI7~F)@7n!s9zQL8S`ety|xm76SayQ)(xlM${YSq+13kR(q+ z+{r8&_$N_+KuIqLISA^{B(KogUfB9kSCR~{(}S_7xrlfo&g$G&7;!OVb6$KfNej``qeq$r^pVFP{y=QoqX45OgjeH{ zJ;a5zDwB*jX(B4e3&Y?=#6d_Vox47RFzRQp*;DE%XjnVp(!hm5a&n-Q#LhvDM>=OR zT^7KAV@g0P<_FF%>E4sRhM$-l4;UvaY3oqyAHZ-`AzElmZr4#SptUBEG$=qQAj4R? 
zgs3exYi0HRbl1^J!}D+p$09{fNPpj=T4?}coNtiFZjbbY;6@t*k}FNT2e=XMLDL!w z%jB^tJkA&>fwGWgScPpvRD+gCP%%dTreu$(W-SwwJ4VNjJ-`++twPofV3By%Xg6%z z4lMjwG9v)tn?|^}WAT$)lNOv=-7p3QP7$R2nqo9S8lLSGp2ku5V^75`yf8g(jIksc zKygHH=5-*dAcyQPEi0=eBNLPRqaIJ_IuJdY0|}c52;vT0y|Iuq8o_E7elb5v0~bgP zLL!cku`4mqWC@X3#dE9A97>n49EDA-ykn8~Qh?h}6jIr{L<%L0Sq@t2>esvwE+q`x zOg`M~g(vOpg^^HXN=zfD^F>krcR-*Ni|^P3`I{Zr={RE$T`YvKA9txA2m#4Kq@(%p zBew**60;c0*A>PCtXe`NEy^0A4v@nq292H&oblwXmYSLLg|N6P5D=b0J_@WfH(5iA-5$U_z1A|}QbN@I5N6n3%^6dn&zlGr2KqXWGx*KafaLy*m-0|zjxT7GFk zI?sP`^9y-}g(f7NU<#4qrqBb%;t;dPQ|-XsDS*&|Uq$6OBxbcs_vkg3}ZILtTZDY@TNsu>3+xPcPg~l062omFipfp35i7jI8_jHO@n88FPpy;3$p? zLuhGZaodVQ(SDyS9aOZ$Ep9-boqExrD0G9y=&aI-2s;?L&{)l}NLChx&e<5B3WD4e zAxM`@@T30wRFRP1`y*Ph15TR%DZL;D>0~$q4NZxEq+SHl~t5rVcd1F=#^nolwrs54^^_2Zej~eF=p@Fda9eFhC*$ z#Ih8TZyOzLN15|L4D!3^?}?ey>U$)`2R zD>t68i#kFaC{tLo2B=QcP@{=qXB97KA3kh=OzK>cf<(3&F(8qb($dg?l`r_;*MRV% zqsX?1!IXe0*nN)>F$`3oKVxv2-j0e&Cn+Euc;J~R6^Nj(gNlA%Dts?T44Kt6b##uB z=L^VEhpbHuha#y#B4~I`u-!=5uzgby4S74V-pJA3fi5?dJOOCqI}Xw-WfC8nmZxV) zR9sv(@Qy?0kx(vC>EBPEeGP8u?83rN?7KAsq572pLI6Y3WvDPQL>vcJG0@f;&M6YY zfanL@AC|aBi0l$&H<@&WOms539GR)dN*z3r(ceEXV4?E}`yb+WN8IDrVfa&2jK`c| z&+*5UHU0pz?!&%-hzOD~ihc)WjT+(rAQ>g8<76iiiCb&W{M3gk6kWHe7!v*aLvE#(_ z^dSr^COLxeBH?Vp5k(S&$mIZiEi8N_?id%rklhkaMGq)O2P)i;qpT(Kx?qA4kdk7+ zp^XhnCJ6wT`u_8$G{mQVf4e$6cR~Yy&mS**0ukb5?h!r`1hOO;6EBXIsUP8A@~%VU zGi@RY!6-DLg(q3%0(@zx?;#Q;l@Gk5B#0EC8$pjy4??CJf~nmQJ(JrO1r>2Qz|}R?he7XS*t&#+eVL0ryxy7Dq&f$>nQAWbIo+p!{0tYh|>ZQMV_ zFNbW-J%8wB$*2qRo#-Xrge3@pCmvr<#BRe56UwFT zh>zJof*jE0>EXv)7pu*YFNO_##utfZx5C1d9F9Y>idwKjbU$p4ST<0mUPqfOD%$yp zdn3_GqF2SEMr$fJG6;z(EO|Jy!~`t<{`p;n`faZ8U~u}-?{5i{A>rYVI@y?+gBluC zU`|1+i#?Li-it4V=--5e(-0&GK{)WF`Bf|oEu`$>2_SUC& zCem(j+u=4?`HR&KGb)0>ahq`gYp5d!dEO9)5Aa(EN(bt&YZuF z!-cC<0%sZ)bdC5&WHaN*mMVLEl{t;bE;OZx;r@4%;in~P&IH{(u`w}24W_t*VYhna zeFjzul%hUHKjJ4|ii(N~i-;&{)Z_ApcT7CTQu=y13k=Ba5$7p9JUtQ7YXoanm%|eq zN!g8~r-Ru*FJPiU+ORT~0B|W(MPe`iWYC2{m3r~-Z=XB6jx~X_8PTb~|M)bjYP44R 
zA}NTr!4w48Cb0b;y@5dyf7&nqZl*VUG8gLp+`Z@me^g!`5@5FiWqu}8n4jP5Qr_R& z8yFoOt<-3E#0YBnhpf?Q!kaanbtqO)IY*(&AFVp0FO<@LGQF@6V=W`zb(mt$xJ*VT z0&v9(jqaknh9eK?B==|WuDMf3uyG7D=YB#M}nPbBHx05Sm#A$!|S z3;dddA5VpO0VaxEMv#sqT$p74h(5>`O(`ucwNg$cCBT^sfD66727Wq7R&43&3d7?c zA95UJ7Bsi82pM)&p2eXD2gnu%2FjcMykrmt2?K)VxvbOYtaPow543Z0s}Xm40OELx}@{gxjn{TvhYh*Il0S=p>IR0sdP_ z)(rY9;DThp4OIR~9xGq%qV##e3!rI7@!3WaO+rH_u~i-x{nF-}PPp^Z7_LQ}4S(S; zB|(^|(mWB&VpTkOF1)veZ#pGpcIzuwdGs*f!9xHRc#nw*d+nF%Uy7&5nprTwKZp8~ zS-r%uP$r=ZAqB9h&EGqifGG-%Po7X=VZqlEKc81npq@RzHK_9UfNhD{cN~x;NS)+* zkr?>+Dn8kE$I@YF!Ephjd4nYidu42FO!?pg*sYMve=h}f(yIGTXEG@Z?!AavR49d@ z8g0=%xssuf&RkXy5j9429@vmBhY5$ z;rg~{WaRgXA4j2h5WAmK_w+UDtBuN3m=zJ$(9jUT54Q}o(|G-rV;}E3`ahIJ5zPps`V$#em@19St|g)~k&+J`$_h{y31QjxbM^O-2X&nn-(ae7DkxEpI+BDV@NM z`Iii^L<&5(-N|GW6a^Tc5Rc+N^`4JewCI=F4oB&G=w;8bO}tDE(d?!J-iV!PlE(tT z8vh5;4lBx^dmJs{slsYQ%JjciLoPj9{U5Ns6Mr;ZnrbH9?&fV(}m9za?luds0Y#b1D&FcX#} zh5}C6JrWlmkD(}MLPQh+8WB^~zVG8#M^SpW@wv|651;~l)H%pJ$cC2zrh&*$XMuy5 zQ-b7)l}UcwmbhAm>Q#Vi-MxF)VY!d5fDIHDg3aTV-k}V{xCAW#lsGHfnm=NQ;?Rko zn4+f%OAAWBKo4DQU0pZUQ=VyP0k*;>4CP&lBkH2A(N@cTvH+`lcO7EZj^1Kpa`VURNH@)mKG-c+G9MhEEl(Z zZE0)6_#(#$M{*hN@p$~_Kkyl*BJH360A*fkW&&o70>pQCWRdXwB9QpVo*%L5MD+=G z01B4n59${#R9kJu^$Qd;=a^+hrZ*EC+YL}r5)wW2Vmo%E4a;;tdzOjAbjvcwmoHxk zoka>HVBd?~dlw|psslywc~-;1$$2VbaBrkPyszij-ou-ZDiN-=C++PI{&NC>c}M^J zDw@yO#01Umm}{c)&#gSl{>UW6bk-r^wx@TtL(c%9Dc<6A)iuOpG~6qB#YeC|tOg2H zFLIqb&g}bvecZ{mb?fYwADNA>g4yAd1N3zQ#$xbGpV3QVGi=3J5rEaC9GdEki-yGl zu4Uq3J>GDUO!onV_3*;73mL}(D~!sY?>pg7Krnul`m59B=BUz16eW&Gqz8+V!CvIQ zV1-Y{Nr0^cbB39AWMb5u_XY&>u}j&lCDW(CaN$nH&`@3!g1Y#b@QjId5NsP_Es0^? 
z43V%1bayE^CfZ&F=i)P@V;rmU;>UW#^Nzq5cNwfO@nfUd$KswqY9Wbe!%zjSqJE44glpsTYo;J>q4Jw8o2(GA_d)SEz-w{bnBqbE;3AnD1`Io08 zS~59-6#P)sAP+s>9>AmxV7>O>j4{)N6-f)2#sxd517<=1KX`~nf*^jVX#=ApBE{gi zc7*tW(D}e!mf-9iTJ<$Din6K)fJZ{l&>os$(*Ou0>G^mM^r2Su1Qfr0yH}?kHhJ6_ z_C04Xisvzq9IoFpeA${s)^#X-cUe`(Agzv=15vh-SZN9bDiP5S=|>PnLsCM(fCBD# z_HU`d>5Rn0RHFyqakmu(R1P_%BKr3*R-N`K9@r?-Jk>O}c`7g?cPoWMT z7J42s6RGRxo}2sDq1z*OI9>%U<3X?=GC$BQ>7!OaGee@|Ve2Jhq)`6OAgN8|*Jmok z4Q*3AdlQ*3u|QXke=cT-<&}B4Cy4F1KLai`?trg`tMa&2fjU3IAG(Ia809G0T-X3> zc%#Qo71e0~trm%;MHxXNf^k-$hMpjSgK%sCOGb}#Z?Dhkkbt$QXQ1WaPLOv~bRf=2 zGBN~ad>~JEfJJP?=?!{|L^2?RwxaVJZcn)9ob@$GcFF*<>Q$d{)WLv>$_V&0TGID0 zQb!lc7XU_Jv_?d>c+H%24KJ15{Us6}^B)rB#tjV5{1O^)XD|Gj1n2%#NIEC(3o|=b zRU0tHlbEs5<-*&E89Ud|$y<4tGH?0)K>_;l_Ec_M`ke=gLB&XBsOBUW^NX$(jsN!~4? zKjLMHcD_wt<##jcUv12u3|K{qpc$&K?Us&URy#>5qfeX_CWvAN-=T~?l=~z_8W_07 ze*x!z5r=?(D_04R51yQ;b9~_fp8qsYTzl~IKB_Si$24dA&O3<80P{>5VT^2kHFYKv zG>i-Zpka(@A_;28e-V!@DlkNtG=FM@3TBXart$rt(6WV}kR+sFo#5ui@Mj^MqwDuf zv7g@o+m}BwZh;fcRjwid#Rm(7q>hHLNj&QG0pfP1g~fDeYL(C+ptd-|s8Qz=$0u%G znhibwuzU1vkUfz3xu8vO>unTCK;^dsQ^as9pi7O2i!&_^0DA#1Dv)WCbA0DPr(Fpq zZ3QDn3Mhrx`->j;aq-{9DN=oQ)+@B+e3?IZloj770i1pqhu7BGsa;KVJQG|G04qSd zBmnlVBMLoTP`^Zb;Igv+ZQJhslW@w z<=B0d_7-&%(l}qp2vX8)9x1|no*)oJu&&NCc^x`~Vor`E4Woj?kD~CvA4MmoF>#%_ zh<8K_fII6)Ww|ncK?*Khbkj7OHf19FqiwDSmo{DsXFS@o`1!pr*oQfsq9hOn00NOG z`nCZ2ckBAel{}tUaa*R9*(mbAc-knFTe4>S z|DkjVUE41bN&Ua`y5{`qBr$I0F>sut73}^f z*V)-gShx)JX!3C9w{goNqcw3ib&inG1{{w4_{ur5+58k9}wI%@n|#c!I6 zTm41LTc1-G^bhak%iN;1{p^Myfy0s3%tv##UlB4^U&pXfD0oATnd*Mk{aYrfnAKul zYf`Pb+{<);cGi3KTbrC{+Ks4dk_%tvqUWA{dX|JM>&z#GX&-w!GV*FY6_S(nd$?$9|MUearqMv z*f`I`^KH{Qs1Of=c`AAI=+W2Ns2o0YdH`q0$OGK9*lb&q^wtNqpX9}LMCvabCWcvB zERb*|2!6M=7pM>fIUlV43ZO6Ay>M}4}Wv;_fB*P8SQ;VQD z(~KUNfnr323`#c`Nv5>$-p(&_S}_Ub^da=RFHUzFI=U=;TB0ApxsGUeeH1}0 zBxDTZS1;GqDS_6S2EIa~C~$wQ#Q-QoTTqbre*k7AxdA9GDN%4VBBO~UkRT|;pwvMZ zc>h%tmQd9pcN_HzDakMa^gd7m;FtuzV&LG=!b}>}QKTclsL_73S=uS$x3rpYg_1uH z^*0~DP{bCTS(!WLGFTXfVWqe~$AO@hxsBYx&^4mfG08V;1ZS&`@&{!k1^Hd_is02E 
zVZZ;r`0jyqog#9=K&2NvQy%E=_eDMd%=GBmM*Fi>C!|y}`pAOB1BL|BZfUZ86OVEt z?p*T=*oNPdNFf!7Mnm$_AAgG17gyc1>GO$gXEEN(`$y`Y#QBsML21?1$ zAe2&wP?0&&_dC~m-fjE*@ow9@9_qfY>pYKP-}hrbwt>2deJ8m19A;?*wm!394=8TM zLUi@&&0-K+bTWdG=YV0vUrWXC5*X&*8Lcz$bDxD*9Tb}QjX>jf;8X8$QUuu|7DfQ( z0G0YtVkYz1#WSFKUx=1W%uQ2hyyU_GM$Qy^hR~VoB!l`u8wfiNY%wOfR&yma{0|GIy%{Iko-ghf)7L23lZkS z4dj)U2GehO>l)AdG z^=x$=F=k8v1{2uJ+~?5+ae4t?;rb6BIg&%OlbDhsh)t6?E0dlJh^6U>(2AFdt2*}z3C5Eg?9hn^Vpd-|!~6y|H!uGL$xAUZKo z?A8U}JbZUGo9H{#%?LSscvnJ#GL=XD8`O7*kQ`N()$Ll$u3X9k{tHdO=N4*JYKy(r z_S zMau0ygBdM=1=l8-LNnBPo6f9H&Z^Y=J?kvE^8Lz zHz9hZ6@Ce=n}>hTPTZ-)zC7`0z<`9%8t0H4;LQ$QyLNr|IAOo_`26hbj^T#7uuK%b zcH75w5c_IPkA4RMB|xjyk-nxV(rmzT;x05TSVB zen4Q(qaQss(|bC>lGBANQX%06y^Jj*CNRbWNt+!Ii4fqG$*c9h@$e9S(!|c8-1i$Q zdqaHkek*{)33C#fEGEuZo!)kj+l3zkBf8DPD6(76B+Og*PNbr(mt&;|IcF zaC>=<9mT>s;_o(=)nPfZB+X?R=XDOwlEMHSQjc1&x9rqeAxfsg>k__@CQge03}+>@ z4Ms{MOU>&3z=+=^NfdIQ1K2wZJ4&AJ^btpwBp7GzwRq;h6*zSMgu~uTm)}=T?mo!Q z13DG43@>KUnLEr<&>E|HwXELQTwP2+W)Jqo6#x7 zx$MeBex3NJ#M0xgSIFO!V>;#M=dYSL@SdmP`t*|^Uzi9ElkFI&n@dQ%MbRKq-~ak% zlv%UNu3o=(?dVh2xTMP_N=4hclRJvLJPKTHb$vzmfR@EvO5LbQGXi#ch4!_QXCN>kuF@xzs{~#lMYU7UG|#M!?*6>`*TP2qfM5KG_pmL&gUw; zcd!yF;*=qIUG(@U{(n*fiv1p_Wi@GD$iI(bjYbF?#UzBw)SYijqW62v;DS~>k7t=I zSK%cUi#@^8`sTK$yg{a&@XKG4@9Khj2EEb#j^Hsm?zZ%uYued2}5;`C}i{U8|@x|6e zf-v#Y4nd|6D+`AlBJV`ooX*fh`Sd3^MfIm;#n9?FFmVz_LiAXI1m?%Q5m}ic=c0Mi zZm}XDD=kRtSlD*Cv_b5j;LF#~qrCh1 zF8=a8-A5_SSyHdP5`6Mb0hk@!v!@fJWW62rE_!n~GO_NQJxumw1kuzAP8Wb7yjN4q z=`H&}hmt+tn*qjr{Q5QJ%kcSnRnmlN$~78A^$Dz+nX*HI_czt^U4Lq&*G%cGB7Z-r zto#h8@K@dS8t7+w-S3`t^VP2}!ZuA6<1D5a=8?XjUy{CzITY&y*@%UWAS&ZXn<7iH z*mUQySsONP)QvN#d-cjEAtB+XH#b_W82-RQtm`TWA!zx8tpMFFDy>i+s<`09LhbF_ zZ~pwv0!Z=LH*eiq8ZoD9nbTwlY`DdpU?4WMlnaN19=bM^s30<7tcLhC1LUN;WBJc?dXFv%xAvVs>@48cFveQJE{|hE@Iqcye z!`QtlI4r<#_E~0S?Fetqe!1^eqUAB@e#%VF=^2Y@D%e+P7z~t_rb<##4_6=9^Q4BC zWP;g0sEqwB92{vx#jFg37PeG}i=u;;*45rZPp^1-v)C9#(>v){`PHVBeyj+v2 z`ro-YRBiscQMSY@HK#H?{hE-?4)k#cWG3&UhML4+6~7d#!FtOwK^Bt$#hH^PcGCz5 
zec8RkWx&qHS=+to5-GFQT$e^HwsCg0y0bVbEp39wo;2FS!oqjwFDn<@^OMc3thBy) z%1eHi2sgZQ&Hd#-s&k@DJ}KK*dp52;iC3-&nGBSV&|rwF90NIeOnWle&WQ);zI?gL zGyqdvm&Tb;KUm$_JR$R@L3=Z(sqhj7n-*W?`lXRggGFFnu)Au0V^-OC!#bl}Hm6n1 zywX4T?uN?wx8g5*T3K0apFB2h6qd~@4{YFXOX+4<$Dr(8s312R z+v1pC1mlN@nsvEC1OiG~r{##9DI?)R9#sWNoh2|fgi52DCklnw6H8$_^=aN)hFbDDFj>#+zkQmZ-nyX~UOq-|YE; z=o*fVOfnc@*h@x@;fOBv-pDIIf$1DOQ_0Jq1wC;_-TCpowk*W`;T6v+D^<6*Z5Dmx zyn&>2%M>TiT+f3C52~*XJ4$7Q5qP}n0u)`Y&t$IVdcZAUjz52X zxvpHfaxV2g351BoQZLQAs@dyLyzU>C?FHR z)M^@!79HF3l0AohT$av#^eE|oEgDHt)(bWa_(SY^8fg$JMkkM}ySi*Yr60{wkM8s4 z&5PS;8fpBABJMN7!O{dPb*eO_Wi^sPe>1XTPoQ9D+!=zv6gr zS@2*eY7NL_u`9BmTW6j-fBp)7(Wu90_vk$UVuyJA(u9ZR5(mmmR#~&tF~t&CW>wP= zgf4MLQC-M(H2P#-{HJG+3Km$^rcLULc7m!L)gMX5zPV7-O1p0P=yl#uw$o9X8Pcl? zBYN@__iG5ri+)7c*jRGiv#Z&6?(Bte4}WiLVAV`k4-)ENK9vfv2Yu&3#2)$~O3dqG z#YUXt1`4eIVnV{CaKnPv>pAN~e@(GdS@QRVNz{}c>*`b3%OYASP$GynqH^-P$1m5O z<DL>s=Y7R!>QHd70L%$g zCFxftcG**VKJ!@PhYyL#$#X7Oz_0{xz8^g@YOQM8O-IVFV10LR(^n}&`dKh5b?Y}o zBg=&GtKyHQ4%||;HiTCIYnPOglG(KB;3$V^$Q$F&uex2X4N;gKk_&U#FAM|}+y(E-vBFH6&qm)|Y&2NQE5rwS0O zS1+NRElaSfD9NQN8Nq)m_WWKUB*_msc$~FdsvG)+vdb&ZfKOdkY#5jU)m z`CW0!c-}NWO!nPpUDEAC!{<0Wy%N1TUQa_(aE7EL>$Ivzj}8NdF{X2e^3#I7Qv@n| zD;7T7E146v*)Zf$*GZY-Ql4u6H@PaSFvH9 zMnW92gG06M)7}c(O${qW&K2eDt98s_cWwEnUPhGLWLi%s7SDaKJzf}GmR+)=uL^zx|S zaMMp?Sw4J=y`Ybn(%^xE$_Qj5CMG7m{7_(EFYFD|uA0(#G5Th#WgYMyy%LyNxK%eZ za*OE9A7yxXkZd~$SbBNpmiksN@FYgcTpALk2Mh=f3sbZ!;rBgq$OE0Iy3**9_{22i z#EBw`qC!?1-}BZ=zH$Ye#Qu#o83|Ne3__PS5~@XZ26JFlR!B)Kkd*p<853R9nsiDr zB+c)hH=^q%5_%_;K8uMY#tK%FgX|Ibj)@Eco$BCv{_WJMQ{8CAPPiyyHx7o&y7>dz z5wS-reqdJKty{O)qxoI5*9zF2HI=`vFGH~d<_7w3r>H1CC#c9ltpyCDG;WC^oCI73 zF(+hF2G#4m-VMdqHZ(|zA`aPY$m1u|wtgFjW-?UnPN$c!u9U_AKTYp9Vy=&Pab?N5 z`O?!a0W4x~Tg1gKdZL8VuPvj@jQP0c1!=)31)<#MEMB}g`s6X_U6{D4?=_$N{tmvY z!!m7)w&v|F05t;`tKA>BYv_!P2ZX|dHuv-I@4Bh`GBbZGuM2^uZ4p|Ih6alSG&`fQj!giB0SSyRy{^RJB7vFh3+vRV53M)P|r0+I(?qso1odU^7|p6bgK zgYQ0M>2SG|$8kHm5#z`AMzTevsH}Okt2U$!hMnm#|NOLsN0M{BXmNZ`a1eVCH-%J@pKb>0eNkeanYqcR5?AX^qva`@Cz_8mHCpL*JBLo+S 
zUR=C>Yv>6O3(A8eoUg1kgYtUL>kr284ty=t@rX41^c$}Jp;8f@sd(-qX3mtOFs1_F zxn}`)1745YXdQ2skW*9BesGkg>sQSUZf-&yLk0+I*tFaN0kJKkg=rtu+eU-72L%P3w{!XUCNvLDME0>K?d;_gJ4d^{Q^hihm z$xuLG=#ZP@l{daC8zUf_`Sb5;uFID<6N6>w|5vk@puSA6kFG(S?UGiehp#2w@N!>V%~T&Nt}`*!ZV!Iex0?jMNf&Z^s5s_>{ARY zDw-Mt2gjIr<@^&sVkKOP9$E+YMP89%zMYeN^zn`*kDY(`-(c;wR5^PBX2w^FVJ04<{PzjPzr zckf=~#fxoRy5n~-Ur#Ug>VWoBk;ZXKc0G*7tWVDMsX2%T=AITAk~n}0nx^B z7flYka|IDs0rFHVh~-)CPh8kd}`%tFcP-ojpV)oQqs!^n4fk&RPKt8PP#b2(0q1I zYovPojhWDD${J#Tv583@-xC}9kg_Es<%9->_jw*}t>UTDg5zq!c^BHBP>&!RHN5T> zQw6y(R8C^C06!+fMLPjC2gw)Z&2HT7NlQ#26Nv2^Yz;KdoBY@i3v>_m1~arGZ9KFX z0(Pn=S@6GXR56Zav3>bHNngB<92z|U0Glb|6^cHkq;dux3c;IM&o0luCySUqliwat zXh8o-?QPd{2pqBCji!4li3wRA5~-9X>c_9~cSa2VG>kDJAg@qI*CqO(_S6~w`tgvU z!CfChtjs8Rb?DG$^MZnc7qC|p<$ONYScxW+N(Qe4x^yuR@_pIV#oBcX=L3MWc!G=$`k0F^3Pu zc4NS6w|?~X@pB7nYR-uD?_?{GAYo8#NQoEiH+Hzu_%f%jgzZVI?uer1pq!>YbC3>Y zv4=&eX^iMNnIdCwsPok6(_4fx0Bsz2fez3DTy4^qLnGx@DOK5q++CfvHp^z+x&?H( zN(mEp<-N=!K`g#Exocg8DNQI{3Gm=c{)RsF+ z%^9hr<&@HTzHh0qRkX_369yWVCUU-=1M~$YL)yt)F41jOc6Ifi2<6^lY4iX7T-&7J zXIi(*NGZTYCH&q4RaMoew68$P6t05h7Qu1Lm(f%UlzK;v;#CLslx3kRNDH?N26i6s zm#eFon&(|-{HWVcBQ&42gT5YLzbXyO@}kwg!$Ic|-X7Ix-!!nHYO!QB5zp_gPmp_*(r% zQ2!s+ALMD!G2j<6gx3_|>K}&KtI3QRJsN+A!;rB!TmWA|qTl|oc6s`GUwSXuAw!r+ z%V$+NF^2Plj<~+d2O=K1kU^$~>dYHYCu#2Zqd~CxcTo)t7ohT`k~Cf9jsO0+CVkg7 zgqLkZT%2;&RTplK`z4+-8%U@@x=Bb(c&T%Gng=!GPNry}1EK)D>c6u=lPBxEJ#%49 z6f5!Q(ml>tWG-I>4M>XJ`Sw9WW25(}#QkgoL5av+k3b6s1qZwiyZg7VHqy*Dp#SJm z7F-x0K0-vr`{LlS(ZPyjbc#ucDab4spoJ%=&gMEJTIzCbp@Temm6`yAZ!fSfJnY}M z9zT}Gu7Kh~w2-u@j0^p%JG^`G$O$&GQ_|6n?7Vh|%{-T#PZ`ngcL38W6r>OvWH-YN z-qYM4s5=53v3SX_;pVOVnN|Gvk47H_zps;oii2t`7}zTOly17U%aj+FN0AtSnjc=Y zP=)h={CcC=XccX|tl}NzXlVTH{n`;WG&7vZPXIYfKsYZfy?mdBf=Ykn)TvLlSxW{f zvhWgR;uU~>8hr3EVm4&zZtL;tF<%Q`ucrfGL?wbmx$Q@-pGmw~N-+G$xC9WE^GYO@5{ZFZjRX9Hew4*Rnh9D`%43HwLuq z!y>f@jJHMyEk&aZgceikj;e(QJ)D?HD&D!+kExCum+TiNm}wmy6VStVH^stYz7CBY zEcZYtRu(KBpnB9#*%8v8Gk}KT)MR4y?krv9aIE}|965V2~vs)`2!q1cGh-*xZL{o&e%_z!C?3| 
zietxu2L!@~6RXJG|H*=|jM0sn4Yy5I%*&_m;5fmrZp}56A@cF(knCf9Unb#@aMKm; zfr7MUzMx?P!)7Z1QV1vva3C5c^XYIS(AB9qmb|*|@GfevcErSg%C0(<7iMTc8I;Tk zzV^VH+eqeM;&RS^oIBr<(Jwj^euRARU4K@p&tGP$ypKsrN*(0M#FScW87L_!FB%n1z+vq%i6yWVj~S2(;rYMQ$6Z? zQbL~Rbpk#RFQ;v9n=0;r&BTXS9hXqiZ7S;Cse4gmX~SqGB^9lTe_Jvl;Vlv})tCZL zk_L%(Sr}mz7WUvEQKfR91X)g_%<%%@uKyqxZ5+TGh&GN`JeuJggkZ-ECIhFud2r?p zidv4|L*`fcIRr4U=FcPrH`t9Ho9w@C$$VAzK_$h|uCengBET6%|Xb zS7=}aAwvZ;k7rEcvCd!(YxXR1lYX=$Q+z~y0tG1tE?6|}2niyf38iLM6Bprp3J6qt z&|XlLP#FRC`%+PoQ1JQ)rR)+)0qO*~)_`TyNxm~K0+fo?X@tEO@81^*;~n}l($<`a z#bUNgpt#g^LTbinfe188rWe3nH2Wi#xPpxae}qm39RpDqRv;WDJoo*|mp((cLhy+3 z8?LG#W7pQ)JVm?rk@0sdcM4wJ4JKkopqgq!yDf4KBHe^Ve5vm}B*Q}Fp zpx7KHo5xUwn0Me5Lg$ZU2!f{u=)kMusq=Y-ph~DVpz>};T*@xKW_IF*8TU#AZzKgm zfrJj_7Im_0L;~$e`i5Rj1JEl7a)mp?Z_j0749Qajh!A+~%(jnnMA=2?67?|Hpm5|k zGJD0{bhp~S$b~yc2e*9xJ}+Xi8&luHB9u!|Jg9SYmKK>9hU74KEYoW-cV|qU+GpUv z0{Z`SWN*Ghi)iPBKPZbm#sab+(%wZ2QVN|<<16M7nEFz4c(_y0?v$G|ELyEiM-wZ= zi~}&s%f`la%WpE-LbogwHjokY{3c4}y?O=It$g@CyISs?bhK1He{~u+j74C#va@#q zd8)GT5=R1YXQ}VN6qiu&@rCla$oGLN<4jD_{lmfr z4E6rfmomqGvMbk1H>#F~?efFVVeCs4FbvAEfy@KJqf#m*p%oB1cN$g-iWfLfMsniC zh8+;ZPy9pS9oSu=f!g?SHs_9~aXX8$aS(cUt>=g{8m8>_EP zcvZhUI5_%2Np2BYPe_k8Zro@;=MAJ+9+)`bcpo`AMy&tg=|Rv76&x8vZY+A)@+(X_ z!AaVgT~}VVoeu*oI-jemtZ5i981YPAZLS}f`hoP@gltbiafy>ay6-tiB~Nyl1ArB& zLDbeTb~|})Cue#ehkV+DCdGsA3l@nKcK0-2PXK1&(8;nn)}QRfzc1QvSr^J9%Y-Q_ zmPDIJrZ)3K6h@AG1h$u)P{RM^E2*7_Yf{Gy=m=67Qp9=^z0=cJbCKB6qXYl2no(J^jcMrZtxx8BNt z`*78Mz2$sLW3sbnPfd5t>eU|F<@mHsy%(PLd72zCzu&Q`eXFX5jp{P9U3T9-n-{aD zZX5V4jaNQQxmWA=0lkxB6irOLf|k1~46l_>O8Q&PwaKk{@VKs|4?4GxkLC>(ej4PS zhVu9UJjRpLJ@;`2!UI*v=g0_AHRLI$g(HfDbD&q0C=XihMe8qeju2{4-IQ>#1o{Tw ze=k2AN3`6H6D93HQNRm=fh`_6Tul5Oisl4qH7Xk+d=jyXerOe&4TTmQo<*gf6P5+! 
z5wsvK>Cc%SBbF|7aELA2_f8tg)S+X?#GD527v@(u8Bi~xmsUUzlgIcOb&N=;!Y_jL zwm-2EHisA@q=wG)S>KoP(%#ns zXBGWEhVe0Xa4TwDe$8zc?ITa?K47QV1RcBJ!H)H0oQUYtYLQU7@{ zQJi{?zjv)+%GWKLh`NT29<8Sz7&cLW)STGnbSZn^io|zcw+P!oDsbqHHZV8zKc6WB zYQAV%CVUI6JX-X+{?9%MRire3|D*q~LI3T;?7tNifBfD=8@=Z4U5ooUzJ1kPN&HnsELa8nP{Cj9s@E5S0+Im}Ha#;|Y0 ztrgpbFQ4HeoAFiJBtfafH!AYQn==t}|EXR3IKFFkcH+haH#LJj!$$?1Bbw?X*V`a0 z=UsWu@UI{H^lpBUz4*qI(IGG1^_c9ObHa7s(N|-azW7?<`|{aM%_mQ)mhHRpsN2Yq zA0KF~Jn{U-{@*C4A|ClC{rNiJLybgSoD;Gm;GjCGLlq#rJOW|z{N$SZh@sm)c0A+# z?dgIQJ8q2DZusyacY7cW5_u1J@}4q=Q5#==Btk1R#NvNRy*7lg-&fknWW`*Q4~Pa)AVe{I zOZAhS(1JnD^A^ptI^Djj17^giC8Vl_e6!>$k@ks8Vs|rj01W9!gaX&bANz0U3%ORf z@2_#YXoM@Am#V?y)|0{NaRhovi6X?|lx;!;b>ik5vsC>C*OI7ePwl!t#+IkoU(RZj z+HMNWE!5>6M;~&th~$&jzbfD_Z!}s0_9Db9TS~`_-`lpWue;&=zK54MJ9q2e_rA&Y z?cKG0AH8=cFve};%mT$TJ|lvE);4DDdiYS4Vsn3xmdm`g7kkxscU?5IyFu9X+7}N) z#S@@`xWZh6cto?NuI=%+#&5Nr13S38Ge9Qa^7H*=1dTZse@{AJpf_=#O@8m%jOFQn zJ;&`aAqwiX^cRsdh0N65Q(#*>kB_^9zw&TgtiYtcZE zv|itwfZ!^UD!Oy05TvOeNW+f_vA?D`~g_NYliibov~!@pz_4J;dJ z$<658En{a`MJ!&Nxy92X^qfy;|Ilp|kznH0r5nkxDM?A7ny2YckB9wiZH+Wh!tC)j zvOt;)Y#aSgof^Vm0)fBsv9l$LcJnJbWe4u$kz^8GPWfx->h@5KUlQ{dn&o{@%(X(t zIqr^kHdwi8QKIuw*{|=_73EisqlVw<)|QhyH}LK4{b$QJu6%P}YoH|6zti0yF%3$chJY_UjKxHB6_L>?RVt_?ewI8&q%DymS z7l7LFW_QU7A#h*{OmHta=YTk^e_beLnYvc2%q(8Fy3l|#C^&5NZpLL%vec|AU0qk$ zA4$BL3C}7=&j!AxVVqhRCB!OB-A)#wW6;<7vQm(uf9YCU2DHv1^dSO_L8$X2?EVvj zsy-+hg4H#$@NQgvo*c2bb!&`DeI0?r+yynlD6TmR49R+T z=sa&cwYu9e!^p?q!q@2Z*f7S-to_v3J_o<$&O*F@uWm3m3E2>Jj-$YPc+`W{UCx31 z()Kg(QrtL8&XiW|C;R{#v)P=uV)FN3F?!126qdYJGDYZsnI7-P{oEh3W`30b$B zgK~hZs=}kAb7wJ%B_uM!&df65JcNYAT0@yYJ)^(_{d?PtdGlcZ@#68zboRxkOkTYv zH(XVIi%fq79sicka}(-QE~L|&i|$WhlrBS7QE{e45}UraoO0Q^b?R2L1@s)^{cOeq z06W;c+W~rYdkRl4iLZUSX81IVvZ$t~ol0YBvKlL& z3WGN>L;}1KYmq#P5|T~=+IJ1mmvEO9gWx>oTB<`lN?si{dCpw#a4ss4=8_v5*U^bv zoo|n1eu$>yTH{7?4uYhwBZ9xRT@x~9h$%4(0K0PS+OZ=1 zPc4+^^3`G82tJMtbG%+TRA)>R>sUkzA#VvKH~;T`>W+>NQ`cFE^uur=|igoa6?SqR2hb&9t=m_x@>Ggdiv}@ 
zne}aL^4NkTc-fB~F{kX(%-8@O{_5DQR{gSz*gyPv`#B-jY=Mq$`?{1(^EPkY|KNp@ zu-{+?DhT53ifwKeCgpJIi%eg*nwCfSapQP$!ns{8#ljW$#_{y7R{!Z z7!*DKtXa24e1E<}mVeJMKVn*3ryOi(f6wVBjc{+-h^) z+h-Q-C-5>+Cbn}Nd3%4vZtaM(R8B1S?Tu7c03rD>L8#w|c)RYdzTPC)e(ey3DnR~7 zRW3|(>KJLa53Td7i!)!Lu&*gGmc;kd}0#Y!)|RprWxv(9zi*wnvw?4CV(&7K>j zmCdJb*m-9AZpG7m7OvFZ?I6|3ZtgY^at3qEExON}P7;M&hl+;u6WN$+;C*(>VSF22u))0PK=t+I`Ar^`H~sm zo;ky$=8UTr5*p&qNIwMJnEoRnOJ5kW36K@C4k8S4>otHCIOilZ^%#;2o7M`Q@VYK( zy?M~k0f(+R=j+X&5~|2DK;R4@wl_vnd=t zF$(#?QUVq&Z?_ra>iO>0x5oDJtxYu&bNs5quaCD(_&BBc%Bk#L5**pwQ#HqxOoyp2 zz1q~h_syRuLEZ16Nb=}9s-Nt1pSl*iz6-;yjUc%(Xn|-4k;ZLW79Awsn=w&w9prdz zs9u+hyeP)=wYg%8Mf+wc+S-31_CIuKQ@}j@ z$8sT{g^CQ2wcfcEg^5Vr^`}!nk+^AMG+FD5w-MP?ZXpop8SlUxVhcip<YnycEv}W_`FIpQpwja5)0^uS` z3Iq9Oy@sXQb?z|x?qIby+gc208vsRT5ArO=1oXY^BL)atvf6(K!8yr98_&l>yUa}R>hhHuDE~N4$+)>$oNg}yTVpTHZdZK4M zlS{+YD?01;GtF2%x_|qrTypfhj&16XU)UryTBhiH`V{yiz2CdCY=yWf)sqke0CdRt zYG-3jK$beBBdwtDhe>bupDlLP-rtwOB^+Q<0l#ewc9Dj!MB>F;#2O5c28C9Hw(Sir<05m@3W;%RRP~f-%qE*}Dd6sf_~+|hpEP07 z592_YWhsf*2AU;E-Vco$p|oh}s7yOwzZlsvrvB z{wZL^-n+>MIuyEaII#^*rtbCYk<;QOC6@|}{(nO-0*ps{ zAoh7-KJ+W}_Z~6-keYfPxSpnyDaZ{K3HKEg#tHLheuj+0X0# zgqx#{-DjU`-u@%P?S;I&O+la9b7!WFxS#FuAfVvg$6ERIGSc5_wi+14+dfls*U%rS z(Vp?PeTlJrS^T?-0nIBi%)eCqY;_)fxvg%0qTf@2x?}$7ZbT@vWD$ zO--cIol`e{%ExtodB;L^W#`VDrQKWK%(>crY5G~o`|k=4*ZFMr3V7(TEu&lZ{o^Ae zt9R{MsS;6^Ji)VUY#(q*c<{X??>iZtX`rpuj<^MUMAFoaB1(1ynA~5U3`C70L|Atq zF~LVR-B3%Td%$3@L}6mQ!Bo`_o({LL_35(wJhrIdC_`pMOCWdmcrn5QL`K(lj*_569g`t(l~E($xXMAp#mX({@vtId*e_G2_~gwIwEl7Dzm(};?K68Ih=2sJOn6+E zy3a=kIv(wVJq(LoDAo2smLmbnV}bY*F|nu-QIrUWdg|q3?{UqcZQC54w;uQ9UfqeW z&^aPTPyy!v(5|Ehw0It!V*huo#?96XdWV(A^`CKK@;}F?WM<}GsK{;q({=uQ`De&? 
zJa05S>}Ynm%ZjHXKg>Mc1^L=~yV+BtQk?hPxZ!qx`Hp3y`;`3wpYZBkabPBp6azoWBAF3&O6o~)ju3$t>57FG`VZ&c&UY#;>R8sI>ElHzN^#y z`#DMD0&jKc?EhQcVdwf*lM9uVdoD};n6kS-*RUkO^5v`5Pg<8LEmY2Nc=Dy@>dWr^ z^73YlS#`%h>*>>pTdoewog-AuK>_ zZ$3bKA}j!rLy7vF7LkJ5i1yRq%*=p!!e>BwocSyq-n}f4R4_|09XN{qD3LuCpwd7y z!q}z0HFch7OlJQ2peoFKVDFhbE?YLJ7BdB=fQ6_N%)nOSuD5>OV_~c$GB+>HwdOQ5 zB%@nkAhyH?ZfqCICx$$kD>Jx5-;`4KXbITub~Wp zUWR(=Cn+f(Fjn&~G3ZlYuz3v915t)kL? zA6l%tMMYw>Br2O^l>Pv@cfrkn;;Yc?X!dON?&mk=hB^$|+pFKfYExBRK9o4MG*YFp zhJQY;bMxp|lB*ANN6DKDkdJvut zI1-p&b?7ThkeTz(vYT-A*8*fN!sLdxjUNa)a5p!%LnS*$Hu*EnDBLq1#Oo9@M84ZH zp&r?b4OMH#pVs#P0iuX!8O{UbAaVhxw{yg3`vu1eq7km{Dn}dpy}t#jCt*`br7IvL zgQ1amL;A3=5S&N(l9TjupbC{#ec{l-94^^h7`6(BV#}qAq_wn#N|`=Q;Ml+-Vh9n8 zwQB!2vvm%9ByC1waa0$QY4~u{;|?Bn^^7!#xo5-HRJ3x#^oP+X3s^J&V`Kal#F7$* z&#r2(ljq+L zs2Y~?T8Ic3e?tFvw0*DhI^_&*tB%uUOXU*pB}-}(((+oad$xNVS}^MHoyyq5Zrb?= z`rfY!8SD1;P`B=tPZoRruK4m^<#_Aqn-4ShZ{D$|i?+ty+?OljR^|P18MVPp@gO4j zB0RrZg<)A zJKX9BR=s>^v3lU0vy(&Gy2`yg1W2qqfN;nxCgEx#a9CcU?}0_EP}uxwrHq=xEFB%6 zGKocz)~MhJov}+V%K@K`Oh1 zWIh$Dn9*veOl$TII97gX*$Gi0ym{l7oIZL*oozIa3pnxgwwC!QBf{T59y-;n1NzZa z)7Ec|MIeRj9H|)5AS4%`ps_!RIYY><^^2@ge3Co)!RCf>x&?+T> zaPJ?xBEq3ZD1{0H02{>Dhc{82G9~q%!a*OMxkgPZRw9pMCmbF%5i}xb!kJSs$88pL zo@q|uaK_<$`ri1K5MdS-2|)si{l&ne&^^GaU1;)A0`qV&A?l=Ka)GCb9pLlNyHcGz zLl!sQ4bBNzTV5!;0K5s|%oWBcu_^g{W6_$!cMFy3EUj`4CIoD#95GMlLQ{F+_1ao( zoo+rpNy{fV&7x+x-jWw^jnzi`y;Y1lYqg(A9vfD3eOE(fqFcnAt$qYc* z`%y^>ax-BD2*y!}h{eOjEQyCJWHe~LgoB)>W^rQxwFr{6tO=?5b6LPp2wT#?05N4~ z=KBH%#7M_f?0Nv8ABlZ3;ue4#5utEh)yVmkYhTcHGm&x2=!vSRDWZo5bUOWnM~Q>rQ58>IMI+}=>#BA_oG(-fUrSREho`EoWLAsq{K>!({$iIlTq zGU`n!j$lkkg2AFMm;`oyJw|Ogz_yTW_39Pk<5f7DHiLt{$Twy&qnhFoQA6#6`~;E` zXBGfOQbuOW9IM-mYlPO!<<;|rV$Bj{V9%sAFH1^D44ANdb;rTpxBl+>>kGg-@ zaJB$+-t~G-e7TvcPfE&UECVO3>h|`l{LVPz+4?=Mt{bc491wt5+otfSO79l{jl1-t zFC5dYaCD5U{_=Fq-2R`PZr3=K>#TEf95S}bVtVuMb4DYcq!^UdOrO4QOVz$e@2!2^ zPfvPz-#E~9H;P9B!-^F(J=%vecKk;8no#)zHWDcW@ct9#(zW$;<+7KY9z3SG@c*Uu&h#vrg71t2R#C2o@QnN;O!G=Va?TQ^vGh561vP{hJc&B&InK& 
zB_Jj4%#SbQ4b3eq`m&CbOQW-7{Bqq@7yKbrmY>7X4u+P9weA0L*x1-%>-Uc%oB89)cnAA$S;ITeq&g=9pyQbM1G zxtlX|K|9j2W5!Y?7Sj(FMo$C_p$z%o{NlReAgVQZRJ)$LVn;u@I{(LxNi)_rnEjr* z;lc=M#5r9skZbvj`2nT$UcgPUMabhqeW#Hc);=kq6Q{qHYxL~(y0H;&n!K7Lk`eOO zec$}8W0{zl>5%!^RjKefwQ%vB*>fL!!&2w!z0b@nFJVd$X3){a4?PqfFu-;PbXSEN z`V6+k{*5ot)37ftJ6=C^!m-P9tSBpBp?vjDY)jO&h+pHoI0$7ln_;?u4C zRLCS@x_4&9FsoF5NjyI`m@19y|G9J4pB^>a$4*z9|1BjikcFP3sIn>4PAL9BPsUzr zkV+6vCbln5lhR@aVRNW7oX@&FQW8OJWw+4Hwp;LL%CFVfp5zl}dD&uV}xB z)P$cb7mA6MPNpOZ^@v+iFF|q!b0d-ejy~X-U+H|Lxv-Mc;uhM9JCNod7k&Xw?5do2tBOpk_DX3{~;~@`ZhVZw9!vuReXg z?A$%YQTwXXcC-z@Ql{=I9XwM~vVyVR@M#{Sm{?}BpFl%}lp14T9w;udC9==OEIly_ z7TMBJvz+;SsA}dv+-f4{ONN|)t!Me;Z_W2?l^}l-~nF8zyVkeZSGAZtxR!6A65#jBm_<3$#(1Z-;ms$v6nOc znuS>i=gbbv7mk3Wl+;{|07gz4$$J~`SB_RnnA{?a6^qfR;&5xIN=EaaEKnlBd4owU zQ;%w*I`!$^Pb>-JQ{^HhV_ukA?OsD3Z;AGAH_G6LH@&yYsJm=i>KR=gzb8iVP?+L7 z`E~2W6cMbRu;M+Eyk*A8lYb1XLJwrDf5cxWt_gKL4V|FGDJ(is1L71Y%y`9GX%SNy zM3vBKW9C5Y_uh^NJ{{o4&+)~ti+-EmFxz%@>HZ~Y&Rw(G7*Ww~Hi(vdovYQ}vpKVC z*WR~&HRqNt`p56vM$O#GGDhuNQc?|VBi{OrOF6sk3#)y5%BcA>M&{A{0V4l|Z!>l^ z0(E+doucq{;Sr#6F<*U=IzW7S)cHPV2Y-2v-A-RltFX}J>5y=Q%dN8I)mxZpt&K;r zx0l4W{>xh1% z-O1@5is;G2-t+GlTa<>oPdHj6CZMq;>x+y@NN%uSVpItmv!2+&aju208Hai$9Fn?8 zk7S5mLZe;Yu}gQJJv)v(?0Mp0cdw-6IX(x`XBO!3(dQxvinjPRgZ-5{-7ZC%?i=c^ z@7vX-%xyU{dze%40pyPA)}R^mue8 zn4zTCr~htMkdvSs5kmuv*=D*di%V$1YtCW|bA-2+eX(WGol18o<9}2hAHRH2i~qcl z=S|gn+_(F<`IgVJyz>!9=ub?n% zTZWs0tMrh}Z&BaA?X^huIrcUF^1iYcw`SF5oImG($sk(3;`gupjWRvqS5_`_pM;dU z_{4tWw{6=V8GlCzF)t>5#A9zG+*=!(yeN;L*X|V?4h&AR&IBsgbwIscNdxGwb+J|Et?S`I>M1HZCvk zR@={&KL@+|$6mZKqo<65?6aEf{dPM{cHZYcciw*g#>PALYWwobJr+zKVts9xtnJ_v zSBzfX$X>0qwM{Z;3YH*s1d}|=<3kR|=7Cm0N+T6|LsOo?#@kT2BXnDN-Z9qI(TJQ7 zyNhk+pmf51ik!~cq>NQfnXMBKofw<9ZisB3$Et_iUUj3PGrn*@CwdffeN6vZS5*Cp ziz%#GTAtnUk7W zc~)PluXkqnB3rebhSJF5g)8L`$d0*L31mR9#>7r$z>J|_*kdH% z0s-Kh?A`OM$x4-Qoax4uZkeYYse%;Q{QRFA=Q^jl`1T&sR5SCB)blB-E+>x-n7;np zhwa+hHqWkJx!kLJU&lL2A#Q8Kl=~fiJoxBCn~B{NRVMX4K6y{~em=<=Ude8YYqs^% 
zo-^0`Lt#ix_9e4~i%)`=PbfPWd?fGJ^^LxhoqasmmR>j0RwbbOrOD1ny$|2FHvUU- zAsU6qrtjZtBDQX-z4xs0A4Rjc{*{~7OM2c)O?`c)|FwQS_qNM*faCeSwx+jbLQiMs zu6Jz(b5uZwfa)#$oWmF+M=EmWD~G^`p>$~>fD`F1%Z?E>x(>bi&)^tO@#G)|k!Y})0sZ#kr3dhQ;w6+w< zuRIaj#URvQ@Btp1RfZMmie%ld^rK7V1kj zrzH0?dh*WzACJnj9a+w68f^F7|8D2FxWujMAD)ku{q(}7pm6v0-+6OlRM8DYN10Ub zh;En{0tzc^4TTH1fy7+xx?Svf$OoC*jMbyNr)SC+H;4cQxzXQGz8V!)G<163D4)&+ zdg?hfS~&%W-?I1shYFmh3?vr(_iyLf+n@hAT|HdKrzH1fcC*#^EY&q%E!%$kuKyUk zC^TV_=B9;L%e@*?uH1iDyg0u0)TS4|nr-jP)?Tx>3wUhy`OS)~z?jpLm6w;y(^)WQ z&fM!Q#crDh1PvbP$jHTZ=Gl{qPs$FQy7-)O%>tREy9%*jkRyyT#fElU<< zKa7YB{FoAVeb=rRZ%3bec4mikL#m6*rcFJ~%py;3`r-CSC3fKuBagvSrhnEsrpVv+ z%IRGD<7WHK#g;A)XH6|{YLYmUe4O%`k5O0^&r-T4DPBK0 zOXk=p(hdms0#u*Nw=_hr@7`T@dgt-$!$(gKE~4L&KsK1Nd5LR4{(j@vcYyhXy_Vo= z7%#<0?_a1(* zEGqwbxz7|Qg;cMxJ8s`!SXOs%sKnvJb!P5bZl08oI{Ga+zBz|IH)JG#Kdmw5!oZD9 zKDN1elQcf0Iyh!J#0@;d;HMauVFd}2g5c;4$ z0{Nk8w_6*64=4M5yD-~$!E`L{7C&z__C(tW4W=h}flUcYr+xC49XfgP9xfV-kbX4`ND#PnDgW@WcN?6(S#4a@PiyLD2Lc88d{d>S62%MH&wvs8L50z;WbkZz#_p)dJ z8UJ{`^4WFIu|iUQ?p$8R2L>z!Fbm3>%+zP~w$BUjJ~P;`VMBUvrAd81Y&&UN(C1K* zu!jLsKiavwf|A&q11quyjRT051Fo$;?#GX zd58CpyBpD7-3XF3>+kt~pTop%Nl|nBbb5Svo+@cFx@nwLn&sqA&&Dp*dH3nL_oiRbeuYbd)|Rs#_T)s5)8{vTQI0nPRQ z_Yaqq)gUqoWzR@NvWuc+ee97+!zQv-MoLN9S;@#A5s?+qFp`xrB1J_hNk+!~czu88 zf8Xam|KB;+IoI{Qe&6ErdB0z;=kqb~9FTY@$alSlLdKW1!9QW-CTh3Y`!SU+WE}u_ zU3;_89heDT18sn-WSb$}G|?2pM7q&K5>G0$&YMsIjBkebiM!2j>?eh|Ys9I;k zt!un@!GvyCN<+AB{2~7(v*QmTVsr3+K=Ge-Wgi)3IglNFN@mc)z@>`60NxUmSM@pi z58v7fB=v|eI5N|6dv;<;#A-C|iJSnEMM`RD%FT~r3O?7B$p9g*=)80;KLXJ;PmJ6kI!TNtG;#x|-VTy?P z3_C$R5XjC{5&qFBp4o8FXQ9}?XCVC}`f%z-+aXMVkl)EX-sCra_w0XmO+0*1?+2dv zg!Shmo-d&Ija}K=qpBz5!VRuC4X|ZCP%@fdflUJx5it{Cdj$soJ2Rf8(H}pKf`|jW zO5k1K*bvxlF#nLpxx)Kqa@HTct%R~gNpdo8OjK$U^|Nzqn=`ev(ya!HtoS##ba(UC zu52)Ln-ahnPLgEEsYBM)NfO_MzIc3E(3+4Ycu29DfXIFetT%`f6bBrVKB&7h>?X6K zFWbpewdaQxKj;RxjFq$iT0C|R(g@@?U_u(fW_#kqi03k2aq|fojL2$l0L!s4YH@|l zViQ6J)aYNALx&2Dp(tp?Svci=6P|*GnH%rd+@ER7;E(K^hiT1auZV<{^p!K}NVy2cw_)f0mO$9$@m^Rs~-W 
z_WsfZPk5~0`$v@jlKP7a^S^8%h7~FdI=H3~ehaqEd;5-Y#>QMznu=kw{lmtQhQm^M z*wLs_>#p@x-f>7=ScHjbwOhEPyq#Y>Z&ZMok4UD|>CvX^gg$S30_cNOjhGkGA?J?= z%MTzGvAqSX3ZL7!cCpNKA0PGUesHo;RgfkqXy+-}vBB=Ytya%a{*1k21nFJm_67a~ zLloi$V9#a;XGzKkVdeC-dV+fPPf0Fy4OlJl2+RQPi8BM#a}zem8+HKAL1&?8s%9(! z;2Bxup^txYqGYU*R;Kc^#x)TbqvE*Ft6jy003-FIorcv*X^0yl zJ(J)A2($v8(dZ1KnjZRS(mj3sdIPAlz`zx$$9|(t40z#)$g?$NBB%QskszS>so?$7 zDDQcw0O|bP9=^=``c6Odbziacq9UK#PW#G~6fN_B`am3sxZw-mRrug$$GOQ3j}h_E z7#CTZk-uyg5Kzksq|~BuHx*n|{Ju{P&{d%JDFp2JCj*vmBCKAa4JadFR6*QK=;ae` zNRJjYZ&jHaoDFD04Xs)2;!nooKcT7~#~VS49Y`dQQrIlwGtCR@vI>zZLn<%8d`0`7 zFfsV!>vnIaBpVm?x{cw?4@+b^Km7cb{Sg-s9W~kwpFWSM{%O`(Gq}M4&nTcmWX6!1 z4^|M*Ks|8Q1B?VQ*8<3Btv9R^Eu!+gXbJ=qNW>@PCZC+efsIev+3bQxb^m{+)}SCb zBEHV13%{Q5RuXkUzp2Zuce=`|NC!ZU0!Y4l3s)^z*f==Il}d2O6DQKDrgZXq0i3$* z{b2ENWCV2whqw-okfPdiAC!ZVx9u?fBl_{PV95QzWlCm~@gCp5YgJWW!LbP4-$6`2xZaREO%gRYnZYIux_A9F?yCfa04!1Q zLvB}ElfXf9?gATW;nR2-N=_dDigiskXX_BJm6a7zV-{78w*XBdsqeTA@R<{x8UAy3 z7Q(UZ!19nnA`5R9-m61+O4=tcp{@ko+_(j(4k(fSBx1@17DQ}|GMq5w104pj06!EG zO9$O8;n=|c0)^?J!<%)J0&Mu!rBwB9Au+kL zA25EBbW@-k$VEFR#~`Th08*P@bs(jh!5K zVtVuM%Q9{fk6q0Qto>K?^}UiK=jpRI>K@8+dHnEI7@z=ZK?$p2sP!izo&e}uU|t>$ zs^fN264O&K>|a0B?t>7^j>aAf2%y2o@)cH=mMu!P{%bIYCVtqk867Zxa$jjOmM@MFX0szw)5NJ>f|&mbKJ`15i9 z#S9o@*OSj?8D@Y&27`PMo<@kpcQ71OZ907>+~}{@aOiKEH>frsq6R6c!p7%`=q>{g zsD`)58;L<{?obsNne>nxABrP+25}d<0ik>{bk8o?&#v2`Zk>84V4FgbW7F!H zrIl*$j*PUQ=UrSnK%ANihpHfC5obJUZG}d4Ehv@|wR@M8wjDL!ZJ)uTE9bym|n3$^V60;x>#*6t0ygBjT z)nJ(!haGKjNf~TXQ_}$wZKHlbw;oOgV%kKbT`Q!XxYvn;*h^VkM5J?_e(gV(bL<+B zhGI|3o8K)lZ3=}sAqUz!5R5?VfjGmF)(d*Fd8qA}dVYFkC60F? 
z1?nP=Cdw=Wd|9Y|_~4ii5d0>2lQ1=RA!&(Zi(&QV!~zh&^8yq!^4{VnjQQ4ZacTBQ z>@_>W={zV(Q=cfmxm7@**|W|=3|N2rdXM0>qoW5(KI^}7E(Ll{g!OT;_FXZ@QlZm} zBZ-C3+H6Kk7s(PoCx_gDmab>t+8U7``)%>v2A@8`j(RBS8wqK z{OeN{Z7F2+7ZDSK_$9c()q)yTYPCmvuOPh-ugPaDre`s0;T(p@gtBwLh}2PkPODS( z!Dk|%P!Tg>yahA?kr_z%IkQCZh-!DZ`gIzH!6M z#mmcIYHo&vZ~+AMTW0s89UBH;l4}WNITGSPO_e9Tef&WRZzD;AM;GFLR`Vy3h+5ua zwUeW>qbss7^MIsHIpgRjt;By<*JG7D6TMS-v!Eb7%dNc^RtDZ?Tm|V3W&8LSNPV!S zAmT0Qy)nc3^|i(SA~4Y*nWgTM7q7xi?Me@w19~QftmO}O?mXG3mmU1eRsWz>d7jL3 zo^2t)_AVRw0tC0cGZ2eu)V$N2*?oP|uYu~m`|4YZ$ctC?K=Zcrq~h<|jeC+PchpZG z$0)my>d1aQ9Lr1Jk5~!*YN5VYblGdxTwbGdK*LCk0q{h5@6n|2)vyihW;A7IpM8F{ zYRAg9g|R_q-MZ;OXdy1)&>miQn2=zttxV|8fhrmILCyIXA{2tK7A+TP7S6>`8}(&r zs!lg;hD^^5!YVxRNVSyeTl>CWlPA)HO`J<{ACf$U*)Jb9Wkxk8cF4949Nr+YGZ7|C zxM@~a_m1U1*b;6=MoL58lKAZ@uyPn4#V$l3D@_LVRsh8iPY>Cn+9rXsVE7=q(aC)} zcYWr|$9tEWR6h5J{rbb!nQdJ7wBxv9_Qi|o;~s6CA8P{t6`t$V_hmYKgw2V$XpcW) ziIlu51`C#!)8Iz6lRdkJa8MW+NdF%o=#7G7Do{6Jb30U??{(#s;GZL`tRd5ZOO=QE zesp2QsH=P2;mok%>P9s-ZQo++beD!q2{wus)nYZ*h2Qwn*m2MMkgdWMA=W59{YD?*7Z}$9I|eYWfvO+(XpuCV)4tQ?yo6~{IA=B=^GZRwg{$9O8t<_X+ zbX-VAPcdI@nI?GMvnQU732K3sH5D4viWiwveRY+^1?EoX4{iW0jhTrwnj*Y1lFeMo zad@Y1DzD=9*14;k$Kbg z>~`hVLCgF9+!dxS z|KQm7PBC}#DU5K>rr8K<0rIU$EfX?yNSFgid8}BNbzk7IR0IZv3BIA5%JA62!U@-0 zD(b7Q)1P(7z@S>d8XJdw3+tNe#iC<-qgen%n_(ab2Q|MEsjL5af77ezU(+v6zO4!H z)9>pk+41M=v`yxtm|w2bl`OJ%y@#WOPQG1+7xBFAk&sX(+ZP*Hntiv~ck|Hzp&@z4 zRvV^nKazV_Z=-l;-$nWtzXSgW#=mOv`eZK2bw0}5zw=vD=16}z|Ix<*eqY~coSRcr zy!Pt(@6idypEUvq1>-#ZZPw|$$gF)!p@BYW^^r7vinw}Hh(DodIhftO1Jx{OvIBBM zD${XunF>aXjcVnX71G~)ivX?9SLg$fF)m3O2O+sJ&VO*Gy+x%P`P41IP04i=BW|_< z=>)ze?xmZ;ur$@5>mEwgc5zAAb2KweWiB}fe=nu&b$pDnZ_vq&n~AalLQ-V%??)J_ zruz6VTbAX)^27A+oCZbH7lAu7aOEm%r_0*PTw{hyrn%O?C1!P7+rR1E^P(Oh5|5vm z_b)9Xq6k;Rb%_0wc#M$;02eBO0~GGWoBWc7XkJg{%x=erLh|ynMT?7I4aKn*oszv0 z8#{VOe*}ToBn1&+srNAhL_PBjV`IC)O&*w(zZUMzNK< z!STO{7);R;# z?}K8p^v@$Us|@iS3(hZ21G(7w}=Pu1f z+W4F$DA92Ia_wIe3hKjg^dj=2hv?nEdNjK6DD`v+#8et$Gr+a~PDR_+ 
zK3L;4)#aRn?QJ2oexa@gDN366d)kk>8w-hZa7R?Wt^apSF$bHR(fh2)C13F&F|;-( zM)IjwRI?xjIKqfaTK{?#PUIK2e4?WT*l*|<1wu3f(T{I|2tQ2IzQyJ8JvnQqrucik zx`kOHxOZ#!{1%ICG&`!xDi+;!{9^{P%;Q8(C*!v|5rbSonzbqAIAUeZB@=zK2;uK zo9`CjM>Si#Cd8|t%bqvpz86FU$($`d^?$r<-=28RX~IUPes}oh3)&18S=kX7!n=y| zCHxn%^kj}62?3@dKGEyxFCKeHDq2)+>>&kubLhODxFqtB7Y?$1QSpcFdOFuPt@-!I za7SX#quk1?LxTsF%LL!ejzEP5VX{6VjUdcLx{h6u?f)^Z{Y`a>eZf=93o3V*lHXKb z=O!TMAS`?)IVH*!1_Rgm43bDmeGFbZVx58Mj|>Fd+${itrp(tv zF&4j;TKSIo`ol(oL?(q{5Z#{K0I`9~`(QoKg`gQT8t})f0It!4L6#lt-*wVXuaUYu z{pU~9#4dy6N`I~*OF+XRBhcxj52u!wUrU|p7fRQQsvB7r0$B!j9fLAVSERkGtPo%9 zGt6v2{h1LsM3gP~u|M1?bl*n|V}R2o0~mQZuU{|m-Zek}G)G_aQ9kRx@r_a>K7|Cp z0hva6PbboYVL7-8<(^W?YsY-VkwDr<5;`~jyOW-#K3LBo#X2|nIwe%mvCtebEj{n= zE5}T}P6YQ22o0I=^<$HG44o^N2>SzD7wqR~FUF5aN2JJrYJlt_!KeuiCYTP+62$<7 z>Uj9cQVusm%D}CT_VvQ`2@DtN1aNb}q?Yx5`*p@IfDvth(2>AkXi5LWS+K$*96%FD zGzm6cwWlj6pg5bLeR&5(68V|Y zJ_zg7Wa{4la0>X`AfhY(bH1|p^XI2k7x?p@IBiv45Pak)mi5m(9HN3>$SQq;)Ho^5=;jElnvOFy~@jwYE05P$%qaS6HRL( zC^)qXzeY4LKZC17xy?QN0|?_@qJa>*og~mByriX;|DZ5aFYd@N6xo_PrIFqUY41s( z4ummM9{7OGhx|yyH3Ffo&gs*cdo|_$ztXIq@9B|HDG7YK_|e>KkCEv^&1>z*1r<1` z`Pi9XOP0T}E5pwp;D$2X1fNkV2#xU_AU5b4)qsi+r~&%r|1YCdfSV~IMI_oY+o~=! 
zp7Xk}c~q81>1o?t119dn7Z802Cyfu{{7D)IXe)w6;Y;a6#5fKPZD1`V9u#LRc`qSN zLrwT$7;7X|mcruX9Y*49Lh7DAz4GhF<3l{#)dHU3 zW@r)22!9de8*31CMB=DTO%u$PJO5v+QtJ=m2-^|u*?i)*&Nm}g_N%}qhOw4ZW!=Sv z0ZR^a9%u|8{3jM_`T9CcOi=auT{MOAgFq4>EaFsY=Tc(WOzU}X@dTS#v)syyh)4)2 z%&c43e-z0s$Xc@HvJgTmyJJ2QfZ--M4wRS?DUPJOYpjgoj9?4=F`MVU|50VOtfC6r z$aJ>+?D4Nd7MaV|LpBDD?!+gJoQl_yN6gJ5t?U(EG&k#`7+}U1atfe3XilK}Ja*l0 zZfPO?M_0T}_}|@KbrAOhAhZzCpcO=#P7ujNVH$GOi zh<&@OhX?FUHxc0n+k*(I99u!(a4NbYwBSEej6rT&c{UC-gp#9O#=X_e3LYm=@vy;2 zg!4bY*^p-N5u42Q4z1B0=r))}mr4l|nLU)OLZ_-B-vbxpjwc6wD~$lZNG>nX+7Wy+ z0MqWC?cmwA4Ytq`Xb~O{j-%+8(4*L!#_h11iL#ZuU5IuLj}(B3htzQ+yoZ@?f9MaB zECYS0j^M{4W^R0h1(sFh(jyVq$c}Bhs(KkazEVn2Te+VAnCK?ZiJg`|QE zFTfn(vP7Nl-sa_Fq%VnZiT|~P*^oqhuc<+Slnsuu*AGqn`IgS{EBl`2>Pnh1QyAsR zgI0_k?fA{xgWTCe5# z;q#Yg$$LkC_T=*a=xd$aRrz*3-McD7?=P!#c152WY3C=Uao)TuIlozG?%l*AyjBoz z0ZC(mTYj8qSU!O&sLZ|NCV+GtAi8K8BMEyR9)F}#!UwocA#sM(vEv!=m`?_3K*@$Rmf&Ww9UL9Y_m_($!Ab7Ocx#G{J)LD zkF85mxIy3Z`PG|>!W@?<(bmgrsp%ix?s*6j0$er>v_O;KxU9-5ypaE|67qa35sz;k zJ1@tLS$Enug#A%e)JUI=R-F29-N>ZFHx`va90R1Y+EOYrSv2|;hC`C~jf(@p#n&zB zRzO4`3J(7_W*>{srs9c%mb-QsmW$)J4+&M#)~&B!;{TnpyyP7CW_5|WvlDW_O)M;K z9T#=041#8TmpwTg&gb=|WHLCEn={h|zkdUT@K(sTPpvF1px%LpG@3Z-uA=rDbL;TJ z_PMFhMorWUo$7pcWb`AlVquRB0z(XUyoj=LGH#jOZleb(9N!}r3IQ)@&-l+o^L(|; z>w~TSHm5bKJTE%^T&U86*fq4Cez52{a0+y@aN-%?li@mYV~3A)vUOKq7uvOp`|$T# zi9Ku*qT*eCUv54Kqbzp($7(rGhc?1BiL>33c^ z+HfvJ%rMQ_Mp1TZYH88pI(i)8 znb4W@y?UpFp>aY@4fx}S$IAe}A8;4+E;pYJWKuN#=C;7rE{ilYh>C_MDVap5NZz zqKiKhrE2dcK0Ql(!6&kNcQjPTLwfS|9y?Fq>?SexOZ8ba(X1fbu!aN`mRg zgKu_rtGvV29IF4AgMwEB9#T`c^A+PFdb>^OK%>|2ba$)t%tU@MNiKdk)V|&+T9{mP zr0{+l>v!MW&K-#TxC$0&7n<+k(F4Ct4tSi~wn#CAz9|8kW<=qFLG((_Xx1*2Z*DqJ(P@uFy2wxpIMps-a}G;IRGcjzZdps;iDkbU94ynp z(jy5?%}rjFL}C-d__Cu_7I+fzw*VgpFaMNqMOaxd4xs}@KzVNPg2L%>Sj0f8&hoDe z2hq9?2h0xcc;R#mz%fpUPw&s(%K|JL6CV#4op@I8q*5cK6f{v7-o*rvI_@wKlkDw* ziXWafhVa3|y$t5ua-hBba*J+)nhnd_f%4$wZO#0yf4)w9JioG;Q&SIFASEXd%|{kS z;?x1Vf?4YZl8|60SmQC#a~(haG*W1Y(1?Vl;%tSx6ECI3E$fXl;|&wN;k}ark)q3k 
za$*+-W@nPOxn3|l6MgI+or{a8sGgj&gv7hY8~=XSuhG8ymv7jMuV#4<0^k(A^%a?<|G1}bAu#x-+TOgqV*T*27bK`nvW@Ez{ zwREh?8F~FTJ@Kl4ll{giqoAd<)cuKn)oA!ik_+4144#22)FY!*s0GoxUHYc4s;tTl zoV|yO%j<UW3K==4 zhzHgQkqpjil2TIV-<5pSn)6MlIhJVo!bVM-TTU^k&`Co5&jpS4lfY!Xz2~*sIVj@n zh@l&Vic{f-FJhw+olt*lQ7EUtC|QVCG3|({*Az3zjLwX320QgNHGf!@f2~z{vl;mu2w>cd?eWRECs?tFMx&`|FlN%u&W?qX(_2IFX}@N+p|EY>3RmhS>xUcI zv>j%du#lYujSw`;fuUItA;<5OyE=#MA zGaa$o%1b4`M^o!r^Nf>Xdm-T^UPga^WqiS(0QKO)n!!+pYz6eS49$k7`+VP3xuLZ| z!tE$yerPD|o-Zz@ISxiXyCoi$rUUIPx7vDTWJ|h;ObI7wBrW<0HEdS348{x!wz4wvmP(aCGs;X-I7?oyyO6J{t1NkkD#c3Z5_4PHhwV$v58C=Wt`SX&) z3hFK(6$tei9AfXtUDZ-7MTJ-|hURGHp_#)&c7XyF}B1Hzj9 z^b_)X_u41Qoe!^F{vp@GNYi%=8YyU0%PZRV$FVaNEX=UqfBuYGb>RTZMVg=_t@G!_ zoE6^Zk0Yc-bYO6can*FGzSp*ue!C?%x^(@TH& zhSfOWkmFF1)M3Rb^!v9v=(!dgDHz9yb`61HxZ*@>@p}<%&;SUH<2Yu+vg}-)OtiLFX2F3m z?D}+itLYx&516-=7piTVH-1%0=HW{hHd^*Qn9^;g@0|vdLLaGJOQ4ay|4MpkHfu_v*=HY z!c`x2`!+JqcEJga&w2W#KxB_t$0vnGc`*`22uLp;7$CVfNE(@>q+IsD`FWX>O+Y6t zQL8EqK(sKnHZw}F*KckhXy4Xgjj((N)lHTI6c&1CF&`hC8FX*wJ=fC-Nw)v2m-v^Y zeXAxylbhaI0@(xFSce`K-p^`B|H0#9)7@Rw3jT|^GG-h-YDex^1+JLgP4@Y;UZ}>@ zX+F3R&7I$+#l(Z+q}ZK&-%2*CRvqUPI&PmgYQ6I19{WMvG`aEo;@iXC!Lkw(sJ|gH zQ}l&{rtE}rn%Czu3u2uvCnZ_bT2xjGhsr}1j?WY6ORo;oU3vWI5t(9_V26Z1ZjLIH z5@$y$X7Tt&33kHhfPB)nZ%9`Fp#@b3e1YV-?5pueNliTlP}{g^ z+6qucI!cffOhra^c!p-)vAYYC2FlLuA30@Z>Cz`gu{L+R=;=jRTkEa6ZFd(UN}vG@ zf;#-l$~B3-35cI>o4O61xk%(%ZtE)}-%jq>d2E-<&wlpQ0OLPD_HkiB z1p5h40w(BgVvOEqY`BOK7*P~y*ep@EGGwf)r_1QSAT+-qW_wpz**+J+{o%AZNn4Zm zWmkT{U-SH?KAW+ELi5q-h27~9TlYH_$d(zg^B+^aaP<)i)>JR4`fc1MbIAtNvJUxq zY%P0ypUW4Qa7R41|F{3gj~_mW&CU08|6gM#tmL^D>)@1tb%!v^a5rt&fsvY>>{>Y2 zsLRdUk@#T>V*^gSn7BAHyYTvbf9TN9X^B@+V)t*VwW6x`zv>UKmM{6cYHq8&TjGKR-hv|u~Igic)Fk`@(hlKI4oe-L4YPyU*t$``4}B^Lb;f~e-d z@VzV1&^Jm_;MhX#B^w(X>;L!tS!{3&5YFL=1EOsUamb@{ok2+YAkl7^tlLUk@RK5Q z;DD3Ka=4L^Xu$G_hWFIo=Gvo$ij*#U_T4F3H`x46D=3Twch#%OIT%|7*S%~#hcdq! 
zr-7G{69YQFZ(g@8#rQ@hj>4IaPd|S~W@Ma7xjy8dUnLyci+oLV4&lCUe<8=+PCGq) zB)EE9O+^rV3V5XJ_!&syqM*(w0Q^&!5y^6d4;xuN?Fg0N+UqWeHCP$aP#ii)XpYa% zJB1Am|A8Ih?cZAV!TXZtk5Ed)j7XvjL~o9-36fPhhrXg_bil%*HG@#JPQ?BaKNd!> zgAia8oxj)c@Dz*3w3*$L~dg1(P1#f!= zU=qk4c=hVDVMgv1-cyWD<28DxUZ0DKj`hyns3NH8JNODa?lc@Eq*;(RaNZW1IN4Bf zM`l6>$;Qr3c0!n*6&Pa|E=eCJNfE;Y5WFS2B(Mq)h(Ad(`xXL_2jtS`UZ6M_8S zIgFz=)H2%qLy7Hvu+XGW00mp=J~&6v!;bUNN=72}zXk3S6qwHu@`H|1^7Fk;-n$>) zmfzp+4PfE`l-k2( zfAJL(JrvZhxckXMg)xYAb8$crd>*7L8RMRmQ?#Il@s^r}2eJ!8MMF+o#?@U70Yx>y z3G-(E#*g2>--K%g6$ZqUpq<~#%E1wa_Gmo*&ERVAaYL!I1nzAS2_=96( zhzH+M5%6!Hv012trL8AA5;3}vLjg$1F6f%=k5LKgoW!C}{uwq2%aM6eARYKIjR#f} z!Ab$nf#K?ZamA277Y~=rybYK8YT&y%E1xToyp)>Nq5GCI0z7ym0R)p+6X5wIW&>Ld zxb`ndu)z{dU}qS4V(o)fPTZoUNkbSgG*lP}U1@hR$vP!dHemaA50TW|kfP^kY${jbTWJba*$yOxKWC6Hp_i zo$6_SKevO!r2Uz{9ApDp!chHZYA9*F}hwI!WX;x_)paor+Ysiuz7mrMK;iurPd=;&G4VS+{?Fvt$0_Iz=CYyX(1S z*0i*+Z9d`U#i_b&=lx)qh%t*k2XjHZx|kF&Vg%!nj>nZ_?Hv>!f700Tyw5=5GI^AU zj0))Cu-@Ljf)aBJiy0^r(81Xbh4S?qH%K;FB!gpt+x68&&lhedqu6eU)Of6hfFVNwt*^ zw9<)cE7e6c0p!no_@|Z5W?ESWXq+@WY8IL_{31g8Wm`zDVQda3{|U{Z=cxTdFf~k^ zb=>t82%Nxk>JQ8d=ICF*{oo$RZcV*VrRt0$hdjau%wh$z?$!S*lD&9oQOq*f<*md1 zIH1G&+`abA5&m;i=)0)j^Viw=*XL-Cy%cQ#kudN>^MRTgDP)@gxFN-QICOWK4(4Bt z+$iTr@6adCo9VnC4*3=YE<)n<(pS?jSeLnfhd~nV;bsf5-vVGSN`k$*_PDo%a<>mDr2IF5`2i4 zpD0U|E`EQB=QC{8XE!%A3uqY|yj^gTUqa#rF4N%{-^p2-v$syBAZ{UC=V{0AJe9cM~QS-}`9~$2}csc$42JCAapiJ?x%it+Bm5Z}jYT z&r@#0EQC*-#LYq$g|3AejI7<5rt@u#${ra#`rV!Vv8Ke_i(lS;EC6y8l8X&r_$%nG zcD{>eHG?RTP^6){_*I~0wQRB; zKqnH?Fi%{8=MDcdc4C{^6^hx_va>%*Dz>z01oe{x43vE`)JwlGpqk7qIoR52z%NUz z$r$g~5aj|$KC8whJa%8-77ncM+beyXWklqNUIkTdZ;40l?@Nvadd9T9M3hAat&ITY zFpAZx8^hf%k3T~QpD9RNj3BtstAx#(4=4uV zj3i57Uul1P?h7{Ct&n^n1dCX2?%dG;sf|Cnu-Er6Yk^Ji&0*Tfltq;EPF%BV`Zu8%G8| z_T(6Q0jkH}gzM%kn8n^MghlTM=CXiq$qWhrsFF^@V?xd@5$V#lK)ev*=sMgYQ4nOV z0ixb{NC_>6pw9HmA7R{qxa@g2XcZ(a;HSptss@_C#`B#p!SpEe^G0tk8q#`+BA|&( zC>ZXOpu;vkJUQ@K8u#*P$i;BcYz2Kt0@g6Md$%{%beBxY8Y{gF>f2-KuAoUq=6-lj)kN3J#Ad66Dus{pI7v^EquY@!PhpPZ|^k>1v zBocRMp;&2asl78 
zsv=3ea=#b~Dp5JUO~p3`YDDA14hTvm3NKJnc)m~K1cKTKKT6*CwQu&M+u(%E2#gxDHR^XE|dg z^qo6i9_HT?_+6&h-~M(ttuipP`q6j%I*;U_7k$wIoPBX|aS$k2ZVG>e;TEMCUHEu; z{eNlHE&sJpe_uKW5@pam@<+U8ahTSP7}^do!~|Vf$zDA zv`1U9_Ar87<7j)i8V@SF8>j%LDFhR|l4tlF8;fcdA*cSqgiX?$ETy4ilC-S+RqA8y zKdl9d2c}v4W%Ex$l7Sy%&@ZY^1)e6>+mARQ6B?g`2XW2V^_E{vac6WEAdmq$-jcH6 zRRz3ETA=UU8}__gbN0*`W2P$?r8t>oTwsJ3WswWnsM^qb65cZ=>6)kbAJR$XY zHWZ29fokR=3q3p%b(z_^SJH~!;K!MC7d~~KA!6&jbld&^`kjN+U3CbBaWUA>83=!| zghzBknrL28!FE$1K{L7<8-+kMzNJy}6?3H2m zu(c>=#6BmQaWVpYoL1#VjoPeqs=(FSptpsSL-`V$jF6=KIkElP+AY$?M_yR_t-rc~ z{Y-t(6Qn-8wtJRh#jInnICpn9Zbu=0dPbVZDs z#U7@c<}GGX&Ulr%>MJ%GyCoz_gY_=VOP~Tk+z3vyxHV7Hb-8C>t^zREM71?k4L^d8 zt^4-P2VZzzMLz$gK?C3z#pgpQ`IK!Pj_;}X0y3M%H(Okz@w^Ly&DScJD_u~GWzQO3 z-$Mj4Zig}oKz*Xe=D%ltzQZ-N$3C+%LgDUD=OISGzPk-^2qhzV7kAi0uqz~fG5Wab zbx1O>KcRR_&v9P??Lp^F7r)#RfYMxh+~ z2ggm4mFi|TSRQ<7awSb0a%(J2*foRmTclgV@BTZkCKj}`-jtt-#*;gPWy3Z-kC*Fs zE05Xrp17>y$0kB|=MGS9j*h;#7)PJVk^O)-R7m11{k>ic6(l!a!22=+lH9h2L zXmPnTpe*PN7Jf+kF5u)Osh|Z%&O8fJ(x^2wEi5Pq(J4v$0R1(@ra3?C>1iG(J8nX715@8S)U)bvQR@tlHx?e?J8{yCy9hv_0$ayF+JJmNFvOik^Dy}{qd_TUB z-0hO!&w2c^6tzmhzjQ@%=DyA@wu%?n*@YJ~N6@8ur&l;gY_i;{l(OOM*)RE}^!xm_ z)YP)uQ5;h?_m~9EYcO=n1uV)J71)=D*ZS))FmHVzn!i!azq0^(97(f^o8N<|8b6Jm zgS`2O_ivp+t>iMLaV(qTUbHC#BrXZhXK$P z4*7H#)`0G=ZTV31zes^_Ouhgory*#LXdnM1HFRA%Is<-EKkgi3f z)?mOhYxh@BP>3gmwaDRsgy7q7{VJtZd=2(HQ&Uq*ecSdHgChbW$G047qIrB3=@=OO zIOXkQ77_$?Fwd1?i-R%;(yKIpdVTEn-*q{eU4NyG($3e;6}j2T3(S1fU;R^7FX>|~^nbocIVe@P(p*w605Z;HHYQq4m2=S8__IX?KX zQ9Q>Jhzf=(626Q1AI_4!?jhvAkC_!`ERk|Uj9oYWTiT8_IhuMnFkdvEnr<+p(x0k9 z2%!N4cf<#k(futi3%R;}_;vwi@m>7>uED~kRtqT?lz1z1^YHTWSvz*)!YyZBha_;) zGqN(~kBmc`30Hy*ax3#MD&qque=T&3_iiG%hk+u)CRK5VE^(lBovNs#$SHS-6VDVw>tNDMXvhG?lF zIrGqT73Vpc#bK#sA@hAR{#3u8wejR2Nm9Nec%A(G2-JbWr{vKiU~yM5D3kcJ-m#>{ zfP^-Q3+^Dww+m2z@I8EeI*`52o+fDW0mMnlf8OlJT>mfV5r*lH(0M=-Ky-~&RV-Xw zKE0)k>L|R127<%t;~hcM8qPQ=!{lV^*zK6K1hB*rV;V<_q#8W_Xr*gtZ(k4Y0H7CC zxu1ZDDEwGtX9uyFZ#?qZBX;v`a{n5ld*p_2e0)3!P9iNiuu~++*isR=9UqUTS#5jn 
z^zlG6c7nKs75f4}8~gJBcEMsLo$Zz9UXN5@^14C`li%w;HMkd8B`|6EAwdluuse{G z0UvByQ7dzK%#8{LvYS`acNfRPlOOF6DEsyO1>j!b7j<4^cU29u=JHjQ_ zLq$X!KWKZ7qN%b+j<8VxTnk?g#?nJajugaQgl`!;LR)D# zaAc(L*ut(GFSQ$I0boL9W#vPNG{9%V@wxLG%w6nv{rlxQhbQ-evc{*S2iW3M--eAF z*OBy3j!YpSurOnk6^x?c4^I#DDwU^wKy6~w!I2MHQ04dX)>hjw-LL(W_x*0YHH+ev zRbtYI*dP3SjRltgCwN0hCsBO)iA_D*z!D>*;zNm~b++>3gpi%xH#2mzG_wq>O1wVT z16#w#6lPsq{6m<2f*zPuY^)aU>jeT}sERYYKb+(?G z0VqG?*~3)3fvN+6feO>xg4x_;;uCg3C_S?(^2KuYW6B{gCsef)HGPP%3ecL2hi|Lj zWN$liyDL5^S|Ga87 z+{V#%H$!nJJ=1Q`j6x4_&H?e&BHC0uE$p8zGTDa_iwY0VQGNZ+^=PJLGe#;8{#sFK z=@&?40$+^BnfCQvnxFx8Jj~cFFqGo-Gk_}J*7m;HH8B5Y0rz7S>#fN`nC^-5=Y4uV zZaQr?4m2{Scdf^F(HM%x5Mp_fK=73nAZDXj$9UJ?jthus;k$7o_+Fqva!M+P%E9e9 z>_B&M;~H36I_?pu12guqU4gyT`qJu+)3@#HF65`Aj0#uRY(rFw5dE%}106VX7jScZ z?atl^Se)ZQy%)Wwz4?dFPCtRpUoKVf#Ek#w~ z`vkaWejEiS>WvIkh1?r{6>O2yBq~jP+i-1Q{+)8Ew&(UPxS8+Wq}Q zVMu;%|2~KRsu@S&l_WBJpda_(Cb#O$jZRz9cVKkD#SJZV-rH}`P7*$dT=aN1ci?;^ z4_D;3<-LkG9OlcyUsaWjEIwZoZCl)ZSxhHe)a zSTbMu(zv_hsZZRmUWN_EK^shJx56_-R5s`^=ptHU@6qe_XZ{53yI~$3_3YU*X?rF< zPR^Im_rl%^n#1ZGRZrqT*A_DBuAMWUNy_|g zA|oF^^MHA}?4;jR<5t*Y88;bAfp6!0XF89Vn2c*0eV<@X^&qp^*jN2`HC%KX zui7h}wHL3M<3*Tur5oJkNKR}Av9E!#l}K(8HqDMzPK6-2G z6{FP?V;6nm9!kM>+Mq>UE!w5o_>)?CQvF9S?(zS%cJRW+t2?c(4Ohn#D5loB0|Y)I zfsn^&a^ss+Wf==&zWVk}^b^F>OB8n%`8zfZ$}TPY-AQJ~Byf*ilCstfRgxFTR^;1G=$ z-RyDOUh3G(H(Bi}SWDxj+lDavg9H}Cg8 zUy@o13v~O8mJ_Bzb1hqECWMg>K3Y@JZS8IWg~v{;Zc4CFv%m<3*|Xz3G~>CwxBaR#7X?b^bCeVD>6wCndu@=b*5VdvC4m zted><>ch3SGL$Wf^M<+2s?pYBqD0cuAZ_)ctp( z!?L&5Mx0ip<}?}+6l>*)Ep@kX&oBCND#TyD^01!h{!m)Q+^1YwF8dDMPB)mCxb9v* zXeZ=iMj1-hpZF-L;Y;BOx=a=0mhqBa%YN*x&;3We$Afl@tJCgur|_9viKW@QB`r!n zG$Xfc44G^G4xzt&KyhWXq9_wZc-Z=rJI|yUUxX^w4(X!lzd!6`ovI_ z{vyi9L<3Z+Co033HN?mK`Lux6`_Eln*m;N#-T*`?g*;k?~E=T6xn$de*s_7F2QE zW16vab~Th&Ckp8@%)ec*(;hRkc$u~<;KX{}_iK!0Ih|r_*y&_mrbRt{&3Kw=%r}eT zBenj-sdX>Yc8e~r$>{Q;RGbjgxPE`am#*Y-an!z>dbUQt&GVmj(cO-j)L}K{Y|y6O zVodSLxAvA`dh6K8r(*iLjW92aoEp(E-ExUrPtFET7AL7m`j+@Zf zBkEN8kTR9sx#me;jj_1id$GBu7!HQ-Kh$0B-)zlN8cWDxwqG~)__c=$SAmp9h4JsW 
zZ|ZSRHJ_&sQpG+DjQFU;(qEv7ig|BYkZk%=x<*A_0Oe#_r*5TCtlxf zML5nBZ}VF*b>J@Uh#xrW{z#`n;nfXcvtEy>1@~&PW;>AoQQjp2byI%Ch6z0vQ8+3CgX zl9lz2)Y~p^+RpWr%CSgyps%`OM)7*F@f`y*?G{Qb3)APd z4Wb(zx3*CGtv97-UbCXU@Y1VDJe%s5I?JQA>3VE8Z|6O+qi(;<$+lLhGLQLq+Op4! z4H^DDNhZsexX)27(auu`+CAP2E`nog!iHlL}G~BQ04)8D&K!gvh?1$9bODbzlG8|J@#s>-s%7>pMQ5_vigSj@LRg$3#@VZP6`! zP*t~_+-Ekm$8MQE+4W0xsKfBOsM^URi#~@uXn5gD^D-x;1IN8nx?EN2`r&Dl{Yk28 zd%RcNT%9qgnc?s4Dt#Z8WV~OUq1p9sora~Zo11DYe4luz$KEKPx0hr4Odj~+toq&; zN-Zs8?UWZ(oI2lj>?7-nq<+gZMw(TO?|XF18FPE}N~leN_Khquec0;jdl(q8B2NQ42{yXCP-4@-nb*16}w4CEwZwq*8A=Edf!fN9`dKgn`@R+_haD0@cLgD)GZs?De_dx$j_Au0|xh6HZrfEo|$^;(}t!) zG6pu=_^9ECvtGZ0fx!B z&HR70*LfAMwR7P!i~3Q4TZ-zxy=%XruujKRx#n`2t5Uhj)okO~f!murnjD}ze27oy z0TW|<0*YICbez-K&bG~4b(6~9hr7hBo3LV7>dEGMugU_Il$&pzG{L%_;irj*>g$@7 zJ#F;(LQaxu-=1&p1=)6bsNFDZN=5E+^%aTr->d2sS*~l-a?zI43LB1k_3m`c&~DY} zz0F#Ce(>8eYrvcn$~!_cGjUfEFn$x%s6t&)>&llEqtbqzkTsSTMQECt1t4>)Ne{j|&`9~oo=uBd7-@A5v7COdX{2+&_2FaeO%B>VFYVmo z*QT6Nn?f45yI%QI(c<|H02Y)c(`GpbGb z<kH_l+r6Wcd93L_y!nPoPdbYB`n9rVPTfTdj^Xs0=pQS#RFB>Z#Gzhc4 z7;IF9+&9is8_!v0=+VD+m5}{wj8a}Lm*85$#x@j>JK^syVerYY6$h6&G>i>u)5WQM z?|YWbnjE@P-q2vv_ewMOrrUz=efZHpRdss@U;E5ax6Ly5taSfyI5@WDJ7xPJK^Izm zxjy+~hhyDO8UE$iTwj0twih|S+#WS;FnNMS)4;^Z`t=`1fd0CF-1a#>*5jmcMTqYy z$7Id4kdxbjC%x+Iv(3x9?Hj|g`%3*{wr>6S+j+P_dO_}wq@I_ORUeGoIJd&1sOWa& z5o2@D(Jx=BZn$|}Z*9-dp06&eM|3&Xx9^|o+=_~vAM-17tr{N7^$Kni)L>GVLBFC? z=dK;lqVX%;k>34t3M`^SjR(HpvuoGg_a-I>t6YW~TpTeX=f_Di?a3AIqIWk8dO2jU zN#*<3p$BZuL%KDIjW^j7QE0GXy2FgEjSWh^z2B3d{cX=4wI8p05k2n`CjW&>pNCE-hMbR!t<4ts+MZRxfuaL2?~F8 z{`fg(%=|`k7GH5X6?ve8#fFak?V6{l}8-kpA? 
zd|zO7UNc*_NVRQB|0?x!R^MKin)SSw<2TT+@6HDEbJnHR1~_})I`H~uYQWFMy;uES z78?F3GQ7k-)u?mPUd+WO+OA7}{d=8DinYZHE60QIR%oeY%=Xv%7yjz=ae>mcQ6_7m z_pa<$H7dTU`RbXef&FJsKd{Kr-!XSqhY`mj-uY_n)cmV(!|?3Q{lArbNlh=XwbzcC zUs-3-(%Ue%y8qfD73&j2HyvGIus?P2<~T2h)i!Ibe$QJvJSzNX^wLU?koE@q_58v% zbtxEAaR0SY(cT;38;y4dZ+m^x!uDR%^I<(ZN4z_vKXF8%;k=bIcZJ2h3=j7Sa}8N_ z^xZz|!j*HwZ|%F4a-t&Zz=H+8o*TOxnV+yyUt^)%U3bdnQ(n(P|W!KH@s2bTIW39-5pb<+hID5SCexg8bH^Oj%++LQKGpt=pw4!x zVtYh*jr=<4&hj_qE%)0RA2_s4LGRATPbU4FO__6U%obATFszxsZdU}4^pUV4f5xIC zaEHf%2`KmPJ&HHoWn5<1@WRr1%5x?SE=+HGY0H>B0RI4 z*SE}Q$1ipmIAn-p<^3=c@-)y(GdDRGb}S3^RCKi0xXmBdDW-;QRy-Biw6X`R58{&2 zvjC1lW-?HVe2lH;5tlDD$oX?-O_^)?$Xj-mV@x^|Hzev#hTNR14pDM7sXUO`1q*>q z%zp;`VJ`Po@hCH~z+|b#^j^sWMMs9D@Ll?!jWA$q-kkDLRa1M3QZ-?o_U5wfk{B`P z{1)8h$Tn6BektrLK|LUvd4=uoj#B8K9;_PLU&~~3P|Nc;1F+a2;TVxS2%1sM7nM!j zClqTfGda4A#FwZ~y$w zWy|jm^sa>hhGIFg$PU1D$L`$=U?CIcC0O+%d_wcy#@*q4J1RK8Iq;C5Dj7fA)xILUm@JXTf=3UOTnyEjmIzKuT zDFIZ;oJWP{T0@9!Vf0x2`t|E^b0nYUuiQZt7#A*FD2cu?1gR*Nn3#>;H6uh)F9{UZ zj``2AB#~bWfN!z#cTU}!@6EgCfFREV^sDj?wnDob;9R;YtJv*gH<0I`&WRY#PXmpR8xEF|Kl&$MUagjYHGrgR{eo|A~63>`ldgD z<{@FWsP8V^dTZNAhW~16-hDjt__1S>mH5`j#$N?)N9JUZr*Y_U!GzCw6xSRv~hJXW!BahKk1{8`l-VWiRi({ZCG@iLHW4^5ZmldZwP z(snUbw2Q>u4kzvJO+;U5bD>f`O10yH{og#AIWu=?5L-{E8 z3`tG>d81j%4w7FGxX+tz4#~vur7bowuZb0x8Ied_`S=hSbC-^M6|yfNU@Fh9u;mDoZ-!^29;;F%c+;P$ZK_R`+paQzvEfFYNmv>?rn$rQE5hm(?$LQ_+Zp}a1M z&KWY%Uj>_xyUC*gxtfHI!?prCVbWU52%;Yy#P9k^ZM@ z#5Jek&!0V8X7VGMjgl-h_T5h^w)6A?998r$Ex3f3m~*-OV8r)xvD>$AFM2ero5oS>~F+IC*+aA$Po zfmNZ#VS61rTL$EBGW0iHn)o}N$POMx(?@0<6{A^?yIeNR^;cP7nf7*f3X#Cd-@e;CAe?Nm4SSvk5pL~^Sa$w5zL8W1x=ZdV<= z@&)dh<#P{lVoCJTvX*CO42*=RmfZw75Nrk92kgk7?a#j`Dy-ZgsTw`TPD3e;z?b~s zZRnqofZ!z-)(R>Ex{j=skP8>u;h}`^k|+o@waCPX0j(alBd&y}vK&tY1OUR1QSN14 zd(9MTx68|R8#Te)JiyW%Q%W#LAA4;^_nijO4)1~(iI{=B9pg|r_&vfCO~;&e7%>*h z6=@+!m_Yv7xw(BAF+$Wc!*~W6UFTKNs*-lYNWQs}*2DfS{)*<^M#UyZ^l$ZexNNO_ z!l{Uc2trn5XDi=0wAJHxaG9D6DrOPMz^s(j??VU|$b4xrYw>*@?*twIcpyogL%TSM z+0Q5jYws%gl-*(rglH}1i6T1$0`nevuiSUH9~28)Iv7`1?7v 
zT-a;#A2SMlc&(QAzSdctJg=6^~*C2rHuqsmdKV zp}1ZDR8=QW&Lmw2gG{ecqdIvJT|m`!1Gz+AP0Os^ows5^Mvg(Atc?D{UkSV1q{o_| z8&~T?Ap52N`5lO!=f%sH*H}nWQ()_0&y##8ZXk~1e#F^etU26pwuPzb2biT;)N@>f zyz2O*q$ZLLg?Fhfg$IHmYzAiu?#=w+)Sc~+Qb`S!adK63N?sOoCL%0i77}Jw4`bkm zD-D&Ah&NZY-?Z^pt;MQ$k9UyV!j2sbj}6yYpOgnj+QhtlqJ#=PI z+@rh63#-TGR-RP4Q3kHa4zMt<6Vk7%;(N#X|r9rwpyQf6g>~bl+V{ z(R+Bv#o$AnSiZvr6@7-VY4F3yHpH7s?@5H({hzx>x;@Rq0KGlKP73ecU9_7buP@7S zSTKF>;lnqf;b+jxkiEf#V2uSgTd@!3SF`2zz>?pa4G<~V9Xb|nGPg;XOAZW3bZ6I- zjwccEwJk6}DX!eX-CAe^0yfoTrnAQOJL){Z{dZRx0TG)7d>~aCcBcA^TY*hNA3VyZg*u@;EXpH7?QJ_B+O1f|Rg&AYfloM7ltbvKQ6Zvf<7t!al8?Yfv6H4%?s{Ng;9BEK z^WNrj79VE6IFTPX68p>Yj7UU@xrC#?+%e#U4DX}zW)^Ng52F%L;@iZQ%N-Uc9gmj? zz9rs)^G(b3@u&&2Tgv*7;@2Vm0Xsn^nOxG=^Oks%*&FRRu}A>egJD@L+-+x$-TZH4 zT`Czg5M|MSa_{qiBsvwx^#k=0Y-zyB^=+^opnyM25}w#J3FeLC44c!2ZX=#l z2%Am6R#K@qoq!)s@*-HzNG^#5%Vyy1bXFLPqzRq148GxLk^4ll#2N-;)qMDHB`eEk zz=<_m@H4`0SPxh*EaRZ;#fAZDy-5w?;#(M6qdMnwmp~c{hBhd)$acz!o|huR!%e?x zJF+#EE49G=Cp+`r-jj)GxhPNR{wk_R9J^S1Pu`(1W1G=_pY}stf|@z}vNAKbgr3~k zg>4;2#agyLabhAHT*MO~^A5xaG{*S|raWI2f+E2IAFa$N+J2_l6vw z+wq>Rb{-2<(&i(M>^(AyMufAk+_Cn`l?hxufEf*nH5N0+sKYHHIM#vM=+B(_nqMls zE5l5QyzJNS?2VFz4i1xXL?YO3c;L`OPSqt%HgVtOM@W;X$ANg|!GlB8AfNtv{&U0> zJw1XS4Qb?XqTnA5%)dKf*Z^d^v1?<_1brc!^!brRC+vau7MiSTOBKyjF4J{4_-xE+ z6D6j(c?Y@#GGmI4_27i(^kwFu_<1*N+7w?LKs6m@oz#EaxU?H3uh|dS67v>Tf_zZN zBPJbMl7^F&4OTRkw3uvurY2)~SS5~(FDx4Ns;b4VC;903WwcAiWgH|uzZ^ToYtlHA zC)8>9TD<0@gqPX%n}B z{WvQ-TZ!3CZuIq|)_mwS_4N4}H|K8|{LeTqCe%r6;KUN<(~Bqw?;bShcLH*@vrP1z6nvA{`{7GY+?K%b}@FR8H`BG`|~Gh1Klxrk)1{- zbthXYC;?d7wPWa^mw7}~qB14$Hr7)!a~*{QAKFJE^n3O^+06X1@6^6n%K4^tcD-nk z1&2hs3|5iQ`PbgPtqA@Cg98U~SiE=_OD#xs7ayNAtxGpbzCLvD0}Nyl%9hM#8_H2O z&3{G4x`%{O67URONRew|Y`lEp4@!%3k&yyJ;o*$ejJ)HdsHk}E4o&86Of>jV)_8i_ z*gXdjH=uY;7{`%4i8&lJihzTR{{+$X`j|$5+R>9WuWsBt`kY<)G;Uq5`M*~l8<51$dHYmVHvlIioV1Z zJiKt!q%B1~Sw402JyM>2uZ&@>>M0Co)pv>IK0fi zVW$yG8zh4}s0VJ4bTf^DXJ}Qs>>D@A<0>pFuXeP4)#|Yg-y_=DQZMEKa}Z1AH>es5 z*qofH5-vOQn(h}DEkW-;J6FFgAdH*z&SdNj=!Pk)_vG!&t}1hh2u^bNu}$B-dmm~U 
zFuqS}f3buimT)85ld8%StbH$50FmdH-0tno(P4igEH5Xg55+WMcf20#n$bnuX_L!dzTAd2o}wH=cJGN3t}|v5r>&|w zZOzk^>GYw)1D6HL$$$nqIVFYm9VVa|C|mhw&NChaFJ^twY0G16YO0{B^^(B^+LbrO zyOvsWqbudrUMYm}xdeWF5mxAeRlqcAgXF-H!b5wFL&tsP%BVz(!OVcAq~*!x8-ZdT zz-m5WneoBHhgG|+d^}ZVe^*pk=3mq9-Mc;o0NA>8YCPJIc+|+u9a*^XA3SO&ZFddy zlw>If9v^VrEwk#?tCbUfpwrKbs8d^vjLW+jVC3^^n8*eQJzXSzGEnGlz+ z-Sz>Pv`azV`cGL{h^XEuN-jR9Ji*{lLuV?fK;wRWHo1mISB=vpHa7NL+qlDJH^3PB5_<^BF9Qy^KEuB5C)}8K06g`IJNeB(5C$x9baa?S ze&S@HoR2Q~(Y`~6sgM>dcdzGapXgW0zi{xIEw<%I^H+Js3}NmCcdp~ollmPlnJMb0 zx0D_jTxQ_l!E7an`1x)v**=NcxD)4SNVVwN(&;iS4R=o3Y7A$?LXVhlhJ*{(nVQUT znFlGxgMp7OBlY>2LYMr(Gx_~)RGx&ivC}});GCtolR2M38>Am8DDb0Xq9TI>ku;6+ z2?Y$73Z+KRocv+wmVv2vz7?!tJ+OYrwia&}dnL}(loBm${6kn-l0L{%r)%G;qR zF*`WJXZ5W;Bd& zu@cavf_*jhK?JLb`IqI8BO0Hnq4~|Fok1zFA^a0DI zG5zmgnGoP8SlYX^AFhz1yHxZkmiZWfq}*#v*RWcF7yaA_B%r?`GR=rIc#w}i!-m~i z?@GFhpcF~(*#$UmfjDR@FJIa06XYt$n`Ec>_0aZn0Q5NbWw#&JH^3SUl1Go`0E=Nw zxZ0jJyB8)J3H6gH4SZLa+LT6rs)_9b;8^&5XH{ebi&9XlR!+3@&& z8WVgW4D!~w&18?y{?dK)3Pr9z@4cPUg{19{7h%$)&V_0mdiir3=AW?a zv>;i90w)PZCL99*0gOBt`(x<2M!Aa+t>NaTK0$aHP<^H8@+NT zwS%;9Z_j7JDat|tz&Ww{=~^EYKymB7x>6o4Haay}Rke)9Nu_`PIBab$&J7zkFS|E$ zydzt`WZsT^x<>f65kQR6pyQnk`dq^;X5y9+&nT011uEfYfc4HG@n5{YtU+80x+G{` zQ&Ux{9EXj9=W;VsAoltDPvShM@=gUj6uObcbZJFJCYdvav?@BlNVFXqK9!d^S0e}s zCGG81?~a*If5xNg4WTlX)Y}p{{2g}A!*S9 z<9-1#-o@7!tW9EHkr+(IKjUI#+4}J)I7XnDK)-R{{|1U>d!0=Wm3QhUjb;w%Tx);T zJ`le@5W^lrV;?7V0Zryhc?|y{u z40tBYF11@L>LxD`xTG$q7pV62XUrJ!wfK_`N%<66%nlgFf&gBTcrbCXjUC&~iy!;M znIa=DU@xz;QRWYGtS@Po04!56Pty$wF^Bo8Dj;cQzXzYy?dk`u=wfJ?>(bMOn6sHP z|CzW)LIt6_7!ch~92}Pa61=Q~7w2Q!_U*mTo;_QvFA+o3TIC*#T}rnR#h8-%dnoyH z0{H8`8A5b{Fn_DwPUGw8xbAoEzX;?J{>XK!e(%DH74rN*#Q_+|)GnTu4*fG$w%V@~ zA!Vj=f{MOm5MKrUx-L94FnzD@&KZ)uhuOJzajQh-3g9(7i6`%neGTL!| zmZA5Sv1E36W_qk%9m=UsEd^p|d49h4#SNSi!VnpO0>*_m|%)U%60Pb3Wp#W|(f9~G3bEn+nCM~|Z6t>5H43L-kyT}0vH&?0b2W_w*){azx6^Q#9_O0w7lo~CP|nf93Y z!)WuHG0Y#v#O#|e(|zgTytRSDosIJ!Gs6V*zhQ$0GP7di#*LuN@PM>DGGhdgPp}}4 zBQNSC3|KoIUPkq2b}!Zbu8<>{HSpSTgopyy`4Dz3NtPtsTm3MPoRh)x{rAqD!l@RP 
zSOl_2OGXi1p8uPC z3irzP00SjF7lj)w>2fLr<_VsUi<^W34H1|m&dOo>nn?fzIMB4Wp>38TPL?Dk0z_~6 zpWkCxl~)<80Y*dYl8`Nwaz7b?NLkkeID(r-jN>?12W1K(#(14J4s-btpj0ViLm5F>woV%(~{JoBh1>7VAlaNPsrmVj3TUeYkmJY$?m=B3tob9$#fnzy}&;Y zs%qt<3Sj>t%P)S@pN(RX{g*1UqZ4Z0W%%FVj{fwF5Z2nD_6}2oI&Z5@^<1J3#DI*JWoJGt7h#Iuq zrl#A_x`1gxx2db&Kv+{*DM<+QHC~#7t>(_%&0lD6Ak2|S>~oZq;^AGUFD+G{s5>!0 zh!)i$L$I`7vPvIuwqyJDY&&~^br_Xf_4bgE)_hIAO>m`QI3Fw z6%V2;Z3E_LY1uhW4a$2PsdZ=0C|Cw?kO;{#D{?Sk7T_D5B-Muds#Q}NFvu0LiOC|a zxog+JlC(w%d+K%;*5~=ztfDVm09=xcqNzB02p&-ayD5LuzE=*GR1ij{*`{xVP-`R{ z$3DV>DdXVF8}@>xXOrjJwe*6Xg5y;n&aS%``@(!j$9B=L`qiuFRZoA)L{k}hN%u3& zUtt280no~*fFxcE3RW6n4-c)^?G>ZFsgMh9xrTiKD<)H)Fa1v|)b0cbIHN2FF#JhG@n|b ze6fN5HI7jp%1vufIS6X9w6yU=-35i=X6p&TVsp4l`y-Dje*NoIv{KljYRtNKuPu`W zKw^a*MBJbQ?X8PUMDorg_!+vB)z3`rPD89Nl!-~6j*4~kk^gSOd^dP!Z@L*&q!UB$ zi7FL1x~?MhBx^uIpG0%P-e2@(Y3uu>uH_I0@y}+&!y!3!4H$1y0hlMp-h8lB!S*Jfz?PfZGSJHRqgLR$t{!hG?(OQsA@2Yb{| zjte1VkOMOyH9XNxFRG8xmMYN`fV!Jm7b+<#iaLVs6>q}4>1&FdhL>Aj`#0Pq8%}h= z?;~SYtMk1z1_yNUEe{`aDIp$84LqvX-NRph_jL;XrO66gpC!TXbA@wxb!;zhgg8UYOkZdclwaBmVAdwTQ zc!dgcEw~(e&~u)Z5w<4&D$w0BRyMW9=QG0{#SB@(K8|h8TXyp!fjE^a)by zJdz5@{scw;#_SVDrP$F$(QWUOTm}h*kZdLzJjn%MI<)blB)wV6-&m#J04&$m7Fx)? 
zgf|k26$yQZQF$Z(P-*pZWDa18%ve|oSe``azoT+veI$q$h=0Ou?)5DX2Db(@gD-lr8zg4|hl6gh3L16)9Ay|*#5hqW& zy~~wCgYuw38TZ|Rwa1c2r?VTmn&`&aDCxwOQE^F;#Y$~7fl{TjuIF$LG|@Jw&58A* z4SDwN-5wZk(PJbhck+U$l}DIuTqHIU&5IBzwRp)Ig8eAgrM`GvPU}K=J%|a7d=vO6 zis*mm2r4r>ileGC$yoH?jqT9akU+tt$SY)9p?fA`rSuv}&U6)AL%AL7){U?5FR;I@ zxKKbIi1(J1rXA;_{evxlU5wdiZsY!mt1TtR1((8|{5N~XY|)~~{o&Xb83~6EI2J>( z8%fM5g~s`#CK?>rQdEFb396pCO-afyP8MSt1S6x}Yu$o4h5U5CbYklkS%qxzB9)@{ zAQ*=Je(AMwlQO1@ewbbHIM~W=$n;?I23?|lC&79)=(5UP#z4& zQYI0P0fDcefirpFfoc#C-I7!>ARMsD7rgS6h{6W9kx!vxlhY!ZTSru+Ko)dh%r`N6 zG`+22J*3*2Up?BQ(gtezj|C`M< zNcD`IM<~ZAkmtG205JwPE8GM73W#e`ec1VNB4BY*QuprNhiZH!>LDRv<qo2lek42whbJ-JydNTFp-#c~;^D6))8Z~;f`IyqH zR~ybsQ_8HIJyA7sr3^ZkH1MbEKTZGr`-dCIBPYmD_9@hUTF!S>NQ%D3x}d5Rs2y60 zdJ6@UY&5A5GIEkqLvk3^v2o?~Y(oyyK9!0sx~Kkhe_LSDhZG)UVVycL-tl$|v=1f4`kYch zQ(#m%N@YXktdSkJez2SE5Lu0|-yf+v1mqxDa5MCeHS{jtd<{3_)u5Cm+Ut9HGg3^iy+Br(v1L zMh+i-!O8~am=WDW6gFmP--8T~w~XGuzbz#!)SCi#1BK}7>`y#)s39bytw8syRK_a$ z5_m&})huh;meIQyUks=v#~=7R&jx*}%<@3~arknwZS%o{2eWfMOjz7enmMX=6lW4= zP79(w`eMVJFaGr#Hl8(S&P^%8O8@-MLddz!V11`r6Mgfn~MxzV0#z7f`34~CR#1d%4`}b4z5Lag7rj#xx^;SSf)|S5I1^C zMHySYL_Py*lJY-hgUqx;r}O(fl!6`(jA48zSZgD8NSh6{W|V(f=xV2fxdRPe48E&F zVal*8DU_MtF@_O`UFlg9i+5Y}I=74OBf={>6#<{Sj#yGgL-`u+QW8FQYE{zHiIqd} zRT3$5`KS<4$ld{Lo;0~AsNfD#vx)D4=hR%QMO(#6O*wWMF$AV} z9)-0G_MxQ7*8Tc;D@txtQ$!a{jvYT<|N6RcM0gO-{PrEhgi;=XD&(LrSwn~2iG!CL z!4dkrs>%>0pG!hX>S{OUt9Zp7iiI7Xw^3LbPb40WAS?vyNFXIR=x=()6ea z?z-~YP*FwE^K!Q|YDT)->$8V!f+bY&p}$ZiFIl=jG)h5~`zGMS0LmMMXh`2=_YAdaf8@ItW2bUim>Vz|U`s+hT0-X#SiNKfY9S zFFZL69fRPPCGF$mqvPh}qzcj+U2Wa>+@iw^cELc(y5KyKNkz0>Y^lN&@TCm_ui0;6 z`WLVvajabjV39CZ(e^^i1J9S0mrrG^3HpYAnQ5MQh)2;?$os17pwoc-s;RAwsJa*bJf=nUfASgaJ>iVv# zZ~s%lK|zuLN81_Wy!|}DE}^d+tLpCednao{C!+Sv%ei-bnxP>r~v zD_yn`c7$}qr|VOxeA&)D(BDYbF6|QHj@9>9Tl%X&xk9dF6c=ACQSk)O;ES^al{R%r zFyboHg4mH4-7&#PlW_c}sYlpl(Y-fU{WT6E7npqC@#7KzhCd|DA*myRbwCI12rqGD zWfy&aqU6cceO_I?A^ZI?^k8GgbY<<+%d1z#>boy!gztMiA?pU+63q3h%1quDV*Uy$tn0~9{K_$gS%>`~%t 
z(CzR+>RsRU8lC%pT{>v<(UDIdtVV#&LBO(yFkhQrt|5PE(xjPVLSr2+x`>`p^rtyp z;)Br%KZ7#TS{JQLuO+P=vfBQrNd}sP!~%4bxEe&AG7iJU(??Ymxuf6W9x2CYh>7E|UwT7dbPB6LNCt6TOfb@)AQ}f&?>jX zOqF88>Nsb;&ZtqOvrj%1S)|D~lMTx0j?8gXKPoc2R6=#ZN3+(eaWklx&m%=-VWI~e zNPinyU4FldOM=J&8#HVfF)$D$gf87?xN*ie^i+qV+H8yS-0--KpD=_A`4B<|Q0C0t z8-1@r;(=j%rX={eLvK;`rG6<{ER)B9CVP0k=`EZD4V23d8zqH>qJO9D%%MI~FCO_7 z;|b73glNaRvqP+>E#qp((Xu_=_+zPx;sG$U`YX@iMNeA26#en7Fn%#6U{5%MF+PTnC(82hcIpAM$ltc%t;^{(gM<(?I<_Euc zG1=6#9TzTP!%2lLE&^|$==USga90S)w4V^JYB}8wa>?Q+Ma_pw3=bc;jf@1P4<+-I zD{Exq;?tt*_Ctpr=zXFH(n6SwF5G@xfBiPl`JXEbn{iUYdPsC42PFc!0AIaQU>lTQS8)mF6wyqh^Dk}7WpxGF*^zPOE3&^vl z(1?XwS#ke4r2(^o%p+v92@DW=hoHp3e#w(B_8S}tbP@!^jl%M6gNb8#IHVR3NQ}nB z+;9v~K9uRgtijaxGH!)lX6{ET?k^h>aR=sZU#B@ISlxG7dAUW8CCgWM3^wusdJ{FU zs_L7NfJf-5p$HfQ{k4%Zd^d|*i4`q!0~u%IVTElJvF-5jhSZFrOWpXhu3rr1#2dgZ zl?k0ifys7RTXcUO)5%c#4eC^?*x$#D5@?uT;K+YSF~*hv0e<878#-+$i3>dPr}39? zQm0yN5EUo3ro1dLMMwh~CiBq~c8O6T?FS494fNbp*(T)nh}!nYXEd=}748uacn;mT zDtJ{qslJkvPP>6iA-vgLElo`U?TI4JY-HXTYT+7v&XUH99Ub++h1k>b`^6jz49unT z9;#D+MM~d)23OV=DLf>8&C>G8XLlE?s1DT4oW8Cu7iknw;($Blfw`zpE2YX61sexR zIjbv1R8P~mtNC#|!R}!C(I-6je=r6Jh_zF@s@up;&GD((3~O9fy;DV>o~*QuWh8Ad z;+w8Jo7tS6*qqT@bwD%Rj8O1f5()fO%$nVfFb<>g=jUzXjP(7<7VoR8`=nb;blQ0U zR&D_mYJ|9WkzDAkc(b?Iq$(dzj2yB!G@YJ0n3%|>)h+XRdIQE7M0QXqX*Yu*>d*aO znUPgsf3-1k8z8-`>1IZiO{Ha_nW#0lpFVv$IDj{(+>;BMe6l%sOweLXnKI@6oOF6& zf!q)gQ8!{Go}aKn-#@&1X5xhloT0{+mTR*$ckS9G{shq1(v8gTq8YhA@ zx_Hmoi!en3>eB8U_Eeod$;_-hF$E&e_jTDdFWG%F^*EDr9$IJFKY>@~m@7w}Atn*# z%}_rhr8x8}tEUe;$%H^?02^eC~~v3(ae*q{lN zz)baU{(R1SGBVjusM$4^Jk)Smy!fK~vj)Ao1po+q_Wb2I{=sc+^+UNNV`?QXbM}3Z zOp(nl?{qM$u~>Gf61x^2j!C%zfc`MDxFd)2LSzTRPjR3kT5t6CxqAmI&s2JjwN>)) z5VxNFlH$8vVj5Cu@XQ+N?(TnDedX=K!i3pLV?aCE3m->j#c@{d={X*l(qnhxPJ^?T z23kY&92J=iqaC$9w?+30y5odN*g0XTm{DnIz>~FIw&lanP~W4Gjqkq8IP7IaOj;&r z6k`J2pJg@*4-em)`--i)ZL3!9*9%5`jy#{3xRBfkiq(Xi9;eY(f!S$)KQq^miI$8D zknrR#wVmm576Gb9tA1Y@HN0w`7EXxp@Ig?1Efwwk0F|VH!)?5o4iWjW>OliGeeo(` 
z_ScScTDY*trN!Ey!l~v}v5ASUriB;fnb-(0;sGs(9`p-`9(~UYP+4ei6+UNF3x>AsKvL@R`Kyohg*h6OgiQC zW${?P4icK&l0oconb)s-vrBj;8ZaKs>||low=Nc{?V^e-8x5-{+S_;a>BE^Dnc$&< z;E`e(vB;1KQ)Ck+T!L~!sCCxMD zB@TpU7{p{qtbQvU6+lkdlyp?2B{63w`ag|Ht~X&nz5$Sl9}JV>a+WaU04urAD=+ax;9Azr&q>KrAwu?xPEERrd%hw zEG^3Zjo&KOYbWsB@@o$x|?OJ}H{n;R~Q?@2t*F3VDrnZa8bxPtRj0DD)V~>oA zLxf@NGm|P;*zU0INKM5A#@lu|mw=7|GPOez^#1w_#ep0-?|X;IhRp zu*CrtcQvUiS{zO))_R!>4tK8H`PQlxOcWM%qRj(2 z5ODB>Qh$c@FwlkEnKkGjrG`yHiz(DRg4lqsXxGN}v+6|Y)uQW&+cCE3WUZ8w4MgF! z9hx^gyCVFS%haijxdbnO?S`>5tt6L09R3v092JLVnV2{%$zGYTN?n3KP*1@}9rb+J zW{9Y-MOc9c)JXS$VR8@ae(LHixuK{|r|T}+(P1i)3$P#1#visP-RTpWN(gqKS+Uuq z;EPbuP6?NDR8$sdc4lV{>9Kv=l>L0qdoJrV4W@Z5h?sVvn^h=zME~c{wIO~lV+~*- z{`b#?X(~QuZ1Mm5Cs^-ppT%?fsdU*DFkoQ+Wn)MI*d4R0>$L~JY-1jDeUQ6AbAhJi zR^OLZ7e0c&3_6%|-$!hy^5@QiX)PBI4oU2g2ai};`F{GedPkZ!X!y+K$p9CZ2{Asq z8XX%v`1dmJZo-IN<~C_H$;F(PAfaT&z^>E%@U6rf4QmVh~8 zhCGAJ4EYk~SftWdKv0pbsn1PjrmOe;2&;d;b8ulFoyh(-?7Z%qj#|}xyYinbrF#|I z+Bz*-y|#&Qj8CN6oV0FJeH-{TZqVS$jmGy~&YkmG`)NYhwm(0cTw2lk(Z1~8-%`z+ zIXk<5AHJkSC17<$gX)ruH+Md~UuE6Cee8VixMy!}c{`W7Fbhvs^Xrc9aO zuJQBRfaYBr7Z=}7A75AZx1;8fx-lB9K6`ej-8^XEZ51`O|Ng?on|tri26PgOWnNwv zW8=nq{pU~i>(NwgNYvW0`Z})*y3R?r@jYZQq@()HzjvDwxp8#Mmgb+oJMQkbHYqH9 z;STFoWHT6t4YxU@d8GOA89wF_XT!sHqBuBxI_#fhh-2e7^?;mWqovWgUQj?h&t%9- z*v%W~oS6q;aQ1F;47LZtV!S7v>V?W2=dG++&>(mfnW_A*n+@wRgh^|xc=J|2kzAQG zPuGA5$)Biz{wwFOdrHv)tv8~JDV{S9CUnH`RiL4*VZ>f6;*P>QGG43f-Mbm6jv`z} z+0O2LOmDz|8aJCB>wjPTo7ps(m%r{*e0BG%`-fgefsLPBns@1Vq*`6YfKD0{Mg^q* z?338Nn_8cU5hFfSM(jPJ^eBn!CK3iuoyy#tm)~go_=oXLdaN^AlAG5&JmT3^W$$|> z3032NjPYo^>V1b>ANJ2=D$M*qJv|M_rc9TW&i^weGd_4v^bALj1q zrhg`R&ea>f<33j<1%fm|`-eqGODYQI2fmpwmM9_-dJNW`yGNCgW$~J~)w|qR{r0|Z z+ON3yDWw(Tn1I+K!oqfB<>j5&Kj6t7r>1Z2Wwo3+(+%AD=-VH81qGpa)}djFtfNf_ z39Q1Nf@rI4r%p~kTObgJCgu8{M&JP@*2k?f_QEx7@lnyt3BrPko+1}G-<{bMaF$wH zT5J?>(87)ad6MUYE8;pD5=LhQmZWn$a%<8;zPb$W0?QTnc|9Ty+#tR&F>aS%(HJ9* zfKwLxow(ob^p4?h1$%`J;2a>R$4%q9<)q95__Q#nmwHAW2KT1N!ytwh41YrgY5IhR zJlKEY#0)B+hzn*aR?2jv91bWqtzsO{0B%t%BfkONqfWp2F8IC?6r;YZuFQr-p_ 
zDl>yeuCq12Y?B&Y`|Y&xW^Zrz?-QmBZPB=9t(MmLsLV+S-&#-auBpJ_trlI&Dnh-t zu6f%j_4g(B(Srv++uX!1|7h-*NS~HUp)1NJv_Ajyv&pk1;ik{stgWA^B|10U@cr-M zB_&p>LpwftcW3h3P1h9NSNvR*l97hHoZQk=_!1C3VTS|qJ3YOtm|D0vkl^i_HUc#U zg{XHV()q%5_Hv4-5FmEwn@7oG5rc7E{ZE|OGgtjD22SAZ7f}q4=%25f{$#;aX^b5C zSO5JrRQfoslS5Jt_oJi5S<&_Mw~i0ILfE%iEu4!$qwU(aZ{4ol6t)$n*oA+scPM)S zb`ciW2J0!>|24>@t_0n&*Y4Du1$30RHl8=H#owjBMJ2?29ynykE0iuT>3UV$HKlN0 z^TKx|XyM4(XVa$LXVfO1r7Iw2a{BLX89{ye_1zOmg`k3Wu*&AfY%T- z%vNo8bAhJNns~L1HcXR}>bXqt+XLPeKmYnpPc45Q0nrff?ou5wOiZlI0xwd*g(M~h zxTSsDF7pQ{72MxG9456o3Tbjy+~Q9F|6r!GyWQ(eq2Z_#W@deMI@|Y#h&l7z^Evu)-USIaUoH*a{a`)tm*PtA{w9XRe!X0urxkEMK?;l8qcSp19` zeV$v*RXdikt4e#~lwm%~e+FE)=nr;NbLpkwh>jIWE9X|5dU`e;@a1c%cf-rhZFlz^ zYV75x$#5e_su5Wz7-St3;!{(FvS#@JufkmHXDA*z2qZ!puC`~}47`RE_@(W{z}(=% zratM5WwL>`&N6j|Xug5&MCnD)u_eD)0L;-V{AB$$mACTf% zS=kkU?OWXSiYF-}rO+hY0`@=|KJH#Oqrtf5_8vQyhnigmu4ZIx#;L`aCnRl?)DQLD zpUtQdG$Mh(cAP?6e0_bxd|ZuPTwKJKD=u*?=x5N|-B?MLD7+RV@Gvqu=uZAg-{_zw zvO5&w?hv-{Vt$n6qH_V#7b-97_wtddN=j{I?ExQ2G!Z>M&(@1eAfQZ0pCDW(v=)1l zAh^Q1k|#jdh`?k0Re@B2Y(<;x*}b(^W1LR+&$VppVW24mA-yI+g{dPuJH4hUX+u=Y!j1D>55?*)7~ea-riX& zuQ;iIl%k)K zt-3aIxbe5@`|I7tt5-Ski(D_ zTuzQR$`-M90i9%M6^wN_HZLH10cVhbbGXHXv&I-5yo|%Hk=^FMk{U-S-y7$upn^rR z1a2<80m2$~Nrfu_j|ICsYoS94e)Q_Do{Eam1ORrv%i$wO=6y^Zq^imb3X$Wm^G`$N zu~*=~Xj{7JdbXpALO0_L_ye4aXkRR51}!6|?COm*o)Jkrr@qA+E9Zh$(Ku;=c2R5q zHw}SKf5vvFaP7=`8CQdh9~PFVnWK~hVzfbA<6Jp;e{u`wHa;t9#6=$fQMv`D^2Kt< zcO)PAp0gi}E;PJ2_ZCYJVd6C^i+bLZCQh5C$N)m;qW#OucTYRD;m6Wv2V9@39n1b& z+HuC)9c_c(T?n3Ze{I^gkXg@{K6rO`scwzJ*aDC`lG{vfo(fIY@9IKKQc1Co&^-bVOUWNLnFIy z-nqNo>p5&pIXyS29s6`?nv45~@$Y}VQ0_GP!m>R{=hwhxGnY=HMGi$`)@Vbipf&m< z8^Znj_eDaA%A=wbEL*iWJ+Ed1sV8aAZoXRXu=Ma;^(MDfiP$9yMEZG2CFv61M|9kw_iH z-vVV!eNn@pYY`bZT+=^ekn<;WPf1Tuv~zb0DKXZ@SK?sI&Ueb&8e{dz^s{AIUN zR(G6lx@y%SziA`Uw%fB!Q#&RVyG5_8*bh+(?=JEjgfn7c1L@+;HmvcfIQL>8(-V@AbbCSZ`i{+ zYuMF8(PL_1(RILrdT*<%S-)T$Ba)xMFxWdgXX8h+`n7t%vTCAipKltim`_$z=@zQ^FM?*lLZ=S6e31rmK3u){1hLB7tgB9FJisJ+ 
zRnPau@vr;UmbGbv3aA&{!|?1hkLAlnB>oUdGo|zVoC5<}gx?yu;{T!Ry#u-4-~azk zdk>Y0c4(qfY1a`Yv=AXm3XzpW8X78+hLI5st7JB0M8il6$x2Bfgm!7D_}(6!&u{;} zf4$FnpW^j;J|E+{?(6bC9TZf^p1p72e^EpZ3S|~tg!27X(Al532SKex=F;Tliz~ar zSp&{$Z`G4+1g^~#)wH7Gt>|~tVzMrkH3>hWkXIr~u5F`q5Y3%K2Jvl}pyTwsQd4KK zx+!@%Q>8YwtzMlurF)-*mscX=BfXZ7HF2!lo9yg(CTOCWx!e=CKEns}Bg4d(k8D?- zZczM`+L@a&Ixa4fC9yd74}`nG8xWx)SlySLtn6Ij&;eaNWgPiSB8347H2Xm0FV)OOPQxa3&8U28rWdJQ@@rVd z11fpiZYnMWG3ZN0R2c(!M=C21V*8?|$r2jdFJFX)ia@i;SirgYMd%YBWyyhK$0IT~ z8#cPf3j7nwoLgoTV(Tv1J#X+xK)Zgl@+|lp4lT)BL)=w+ki_6f+1`rY{7IaVGAsRp zYvw-S%e!YD4SGJIQa;E2#AIP{!OJ)JK!-sq#u@&d;Gj-07GU7e`MUze9h;jMk@un< zvMg)@{T^pH5dM&xIyZoY-Fy{)%zpMe|7jDCJVSRzkk(>hWITS{;??ba?Zgsubo?^We80N-wQ1 z8`aLIZ{HS2>-wBHF;eZ6tr;toKp8QceWSTK_S=ILnuVVp=H+!#&A3Xd&FfMg}-fkJnE!j%dZbA_seqBBFdMCbX~Pd-|qX9 zp;&oDZWvP5JGc)5XV^&fm(yyeDxb%{xj-p3%^S>}fsp)8ZtevG>vCJkptqm%t%jAB zRy$2ZYx6XHhZ2pJm{w}?LI}&?=7lW-a(h1%Eth}PY*%O^Er~C%LQB^=yrd9p_%)8Y(P#3 zamNlkKUCJ|*))Qkf=t3$SiGgi??g%Wna$3`Bu1v_ZL(pg z*MbLO_+BY=F9IPz!8;iN3xzy) zz6kG0-ZC!={S!=v2=rqyo_+JivgYzy>pWXR5q?l%Ojc4FLsWY4VHt;hPJgTvZ_bCr z7c)CiF$bhC7&&q;;>)e@u0mNv^L+AQt!PhrH4ac$4-`UJ(exOf6=tBcG?P-jA@g_5 zL<#Zt>yx4*^xbf&qM(>vI(>CON+jAfR_I@H;0Sx4CNc}q4bA5rSW5=ZqIO4MISiI{ z#-+Mahx$-w^4na>^_-*bVyS~RtI56${2i%%zNlf%^(_V&-_uf4r?3Wu(I2LS$U#NJ zgK?uBn#{~of+Y@hyGK5+tSsQ}t(ekp2w}+OTPVs3YvZ#n?W30U-yC(Tb%M5Avfbez zrGp<=I3i&I)u^S`mLsi(1Y1Bt!MIFOR=$v!c;dnt*CV3402LQrArLcR6+7*gm^=MwP-svEmxWn*@tk;jVx}t_KSpX6-61ldqdVWH=0b~Fn!c{By^LMr;q|^s z(-9T|4S)l>E&Z8$OXM;lk4A}JSe$igVZ|J}1c9TVOhH45apq`qFcA^wKnLzhNb2jC z$;D*RAqoNV-B+rb)xc#G&V}OFZXXBZxF}M)5V^8WAEU5?{iLN)b){a`eY4o|WR5rA zSy|rXT%m{L^cge4^rkNu@!;Y4Wzr|RuQlm?xKCF2ag$5!D+X|>aHtA>2uCdRxM+to zHAVQXkoVthJ?RATJcZ%cLixsv6ar18OuLHu=!M;HV7DMynA|+swriZ8$gUGSJF5<1 z`iH0iCpgi>EF6QIyHa&Klxl!JCX`)Rt-@63YhWY_0qdrEf+Z|6H;=mfXBFyP)*H_; z*5t#tQ>w<}kEuQsAmQrOIh>I~Kt$y`FJ`TTt_kjb8 zypH+>3mc3^PzMS5{gMrdGdhF$9{$=~b};Kw$q|z?-4EKYS>nCle_~bih3uB7I?5Y? 
zISNM+!y^dh^+d`~`F69qLjjTxSd_oUDOTI+)vGfG4Cv-rhbM)6jz_fPVDz*Jnn}v!#2r)-#&FkV&j$HqB`HdRp!LO7*02@w+Ieo(79lEno_g{1@BX zCrP|QsmAD?yPw}hymc(O;FaBU7Lq2cCod=*upd(L6sItebNY*@Y2(h3Eq+u|C(XkstS8V&>cH=1X8h1 zU!eZIn`e{Sg`$&?LA){!JupaZBO@s%kr8ipjYwqabvDeG@*`-UWsRkF8+s`UL6gJw_F)E^~ z^w&?h2M?q+=(x zCKeZa7{u0f?KuQrJBn(GJQ18JwsZWRwE<%QT~It~HcLt@(?Bc$Q&8itZ*u45%XL%d z%gPweBE5xp`iA@n%Qy7Q2!2zE$Q`Vv%L3D~ii(6bo(tx;A;vOIu9)klluMf=w z;y+Ht8L6}1gWiei7lyDMrSk@`5)qAnBg>lZ zlT=PD2%H4xH94ZEU;hpq%=W~WAab-#GNfpo*vDcH5}pxZR-^13(EVWa;b*aQJ0gII z+H&*GlaJLx;{==uHzM#Sfs#I(AUarb+bG|KUsXLm={_DX(JfD_AEl0 z)xtL=4~s2+edah3vF2#XbXKHX1(pFE?azD$6x$S^IVej4w;J;X;8@6i5N#?ORP)^) z8Lh>>vtps8g;4-4E?1|`wl&EVER;*&L4AN6g|Y{fsGp={=i96zP}n?IbtTK;IUo|09nJFkJfq6d^a0vQ#C44cOHMbM6xK(g{e$bzdM5) zJ5MSc$x_Ck8iL*f@a`ZV8+*-6(VV~QRLn}J!V-2oP{UHkHOe;6vN2m7z5W2rcp%9y zEXK~SxE-pwBSm7HaF@{)3S5Q$Bf3vajtEQU@r!E(N%mhATZhyy`M>VvCCMo)SDhUi zT2-*%;DO-4aqkoU`6o=Rv#H}BTuK?w6Z zr-^llZnPOpp@`d|cD_sopMW2Q4FVZAF%;riQmrBY5`p(f^Mzq6YQBu;b3eHqh7?ZT zeBAz6UhQ8fN}1kk19TOBXAvvvy*NwSF8vM$Me#Htejr}_SQtv2c_JqUeDmoIXGJ>L zTk%smcI|0Ig@HKz0xRnVeVa&7fg->` ziH&PyyzSf-iEXn@2b6SM>*wcD`_DgZ#Ur9ifIQ5<85`&PQc^m&=6R^|k%FH;J%d`! 
zae-3h0w0!Sl5%rOhH8oO|*6F`s&`na6> z2QbDqKB4tGXWL3+<7EgRK!lK(5>%)GSjNBnk8(wL7C4p&QEx<&b5yAHnBZ;h3 z5tlW5UdxkPZN9H-Q&;Jjc`Y_87<=~K;OX5J*7p6L6mM(_CI9%J_Zj*Xb0W7koURA- z6hU6lVQ{F8adYqyidZ7*fl*AFLH;c?0zi8?^j-o>r#}*bOMq`;Ov{v=-?m`sX?5M6 z;jZfFpyqq&`gO@j?ad+HDB6J+yujm;kuGs@`7ux@9dc(d$pKqaLm-7}KErt9m>x`1 z4zB~@I2A-wl&6B_rWpjNK#w~xb?q_GGFZXaz`teh5+7`=K2%>lT6>YyRcT>T6Zt3< zB-=hzhVqn5wkn;ld_kTyD45!g(2!SZ@byoy2cp6Q)b8I$gZd zPbYf7kz;PZN10a-8~WbHq*(e19&kHm}Trz#JM zwztUl*lAc6KiYWN`N?Jl!{>Z$xohLR$<*{;*e>D_YSAILl!*NHRwkufW>Vdy?CS9^(^2-oBG5{B^yS^ zoqKh*bRRO3p>rhB47e>akD7B~h1X6emz0#;`7Ljn=XH^>?TOk6w(Ygf3 z5%OZH3aJm$;$86Cg=HQ6BZ=vrtOeE&4Ga5-W5<7}o@eLrlia`5t0j&czqB|YrX+t` zL)gvecP7<*1}jmZv3)Zk*hK7%03J}0<5})cv49s4{ID8o2mwZwkGBA%;Xq&rSCAdy zu!FrTlxyoY<~R{#gSv~qAmsB;pAP<&GJZgJx?Fa)&VerH^GzR{Qym3F|K@ zIeB+Wb6zP5XYE^3!Ft%n{v^qg0Eyo z6^@YiK2DV0xV`TUj{ExU+g+>%;8Ltt+^im-f*l+k`JD%l$#(v(Z)BuNhs^<{ygLK0 z@KNlWVxTA^i2_|W@00RYzwrH2>NcOTW0$D*e)l#$*3xov*MXonGj~TuCT{AHlG-gN zr+UNXVbao7k2nH%{oM4Y^dF~)sLZB>MF9$784ERz{LnHPo}J_Qa(i3r=qt0f1pe@L zZWtZHbl7YC(UuNTa<6I0Qyy*D@-ZuO*6&+w2g*$P7nZjDTHx|?)2eXY`o0qfjJ^_G z1h1tsYwNRVqw36=T}X#9c!j#hOl5QP(ffaut(&~_`pSr&-BN#lGCpc3pRZ;xB)#R5 zym=ii(4$Ys`EJ;{ZRWvPkFT!#R~-r|+HuO@-L_1RON}nD4GlyewFP z=Jz!`jEMilWPi!&9<<-c8hm|=Ud?~_@S(`i%FA2UsJwe1gbcL@82lNESE;Kf#d?D~ z`TVf?Ly8(LU1U_$C!;-L8V)BFkKC@W#HMW`g)7Cr63kC2*_TA0+F_dMLkl^ zMV#RD+|m^mq;Y`@-7R>b@uqo$PyF)Odyjf{@7@!4d*`4@XP;=h9qQSyT2)rI^XtzR z<0oyic(bx=&jH_?^iJI_8V527Zwz?qee)f9#au6vTN@bfEFhH!z1^w#`2A3CZw-UefGd9|<0 z;Axh9#Uw0@e}oG7;GIM@(3iqVqOXNuJ`n03Hkms0fvTBn7^8j*4Hh( zP*mbuk)Sl(U!zvG37se!|7Xvhbxt-wbtfY?->^gDCTc$N`Tp)#L9b;AgH}E4Ki2(fVDN;Y_r@%7??2OY zzvY=TA(Lzm)zyW~Z@52J?ReU+_c054M@nqiz-4*v{I@KB|HpBs?ImutXv~aFxYc{e zkVNxdyS@blmXDaDm)@9o{g9fMyII4lb~U+P7bCUgO8*$eX62}PSgk#(B{#bJApeYi zsu%yc-J$wVjDFU$q|p-3S0*hqJ?|k`a7I4A@UmIvJ+EVu<8z&gOYHYe-j;uM0SwKV z%Ogx39a4X#EDrxSym0OIP%J%hwl3SJqk%-d+Eh9*B@~ERsNe+H4b&p$jeuzJRGeet zLg(&o*_NRdS9fldVA2QM_^18)k@QrIPbJoulF)AH;pU22fwOe9ZZ12k@P4Ob{i&-M 
zc@On0F9=gxH!8qr@|M6eE9RH0sH-n&{W3!5cv`K_pv!jeZzKNdk`k7&d(8DCgi+CT zb3GexU1PUOLc;yU*UTQHSAMkX-W|a2>5W0dcc^M5et6ku&i1a-af}G6el>EQt|!lt_a3q*93*0endy`3T>H=^ikNUQ|$2%Erl?ZEY*p zzpE%ImK`>17;`kHwk~{9bg*jIJo-Wo2WOGGQ@Nvlm^i@yN7n!~8P0_UvHB4`#$dHU zGKLB)CEFKifT881!+N1Z>oxyG{C4l1J9dV z$b~bFcK^x@nU%R?&ThkqqT%y<*4*`c`8;ab=yx^oH&i<&Dp!jPX#lVhe~98j*n8?WT!ZEUEh-hXWi(?Rg1rPE4TK``Jvpqw@U4& zPq{gd+NyKB&xX}i{j*|${IS}YgjbOP@gWTkUoWnlntVk<$;T-<@X$?LND$EaCB$=j zd8j|A{XDU?bmP?W)t!V1o6>(pOicLAA@GbMWCQdaC=!49V7g?{y1}$TgJYVAnQuYx zuYTSYD+SxTdQ+5?!eN~#vVvPC$9pPSWM><%JmC>cC);t7jt+%R@z7a58YY#i^kTc3 zRb{?|l$01Sz&+G>x<+j`zRiZlMJcklSH@Cd_Aj%(Hlv^ZRW+$AW>7f1EFX;ph9fs^ z8r}Cq(D=7UvsF#kxx06Le*clyHR~gu{kA{6xjQre^jw3DQ3ggmYTqAUAUVY1?4Aj4 z=PD`nn>KlHwx+_ZVp}wwua3=me-2-1)UoqtI;{#Y{DnwTx>p~9is0Z$Tb?#M3SR+c zA6Vi-tE94YB`A3q6G*tpQ4;m9U%#rV+NW~ zzhB|f5hKng8rOSmYwG-^uG?X1@|?W)IfK-^_F6B^xbUOqwokCdXqp$|9v_jkFl-1Aq2FG^p#+xhW_o;;cNyQN7kJtKFjTkFX7 z9sWGM7Tt?E48mfA(a|I!pfM_;pHBG%kV$|D=yH6S7@>Rv0uFy@bN#b6Uk_!A)#UYN z8aJA@XRq)34TVT+{I9b0eS-{3OEXW+wqF?ARJ3Gf?28*-uiTf9eS5S2*w%)N5(8!T zccG(CvOC*1!D+kXkj?D$uP?poqY}1jTE|+4aded_HdPPt`!9O_=ZZ0}x8sDULEwYz zRGoDBzDKf1DLpOZT!5t_d zMYx^_%wQ;iX#bRV_m(jfDa%3#Dq4P2j67~i4lHg$#6{>V3lgn_50f3ZL`fOj=Igx# z_<=cdB6kl~*t@fiLD0o1ai@EFd-f~TRaXAwTGh!R{8H0A=P75-j1G1!Gge-3voQQ% zf9KZp6wlUgr_~EHj=XbLBPI4(rG>ei?)cMEJ9kczp5-?G_mA*3QDt6VYg_-Q*~EK$ zt1rDdP$Y$NR14kcnYM?k=5KpL-q~>Qb+}A1p27S34+MV5)&6r!8V#V*`%+(Ubp~+4Y@&!mlKY_p z0!hH3-=_wxoiNGW=DGWUmSUaEao(PcEResj{N1apK7+&@H%WQd17I&h7L)<|LIp3P z?IF;jxpAkgp&a+81&tA9ULo1*lq(>SVQp>ZuD_a_a?#hb_{6+17z7SiA???~3}_w_JJmJm;3#-L6JfZ&p%MwroD>x&5-QfyIs0`aSO4hwNSU`SGI68CMdy ztg|PM`qxi+-nT%@DVr|+^l_{F-wosYPl*;$EQ>R_?xXb5iEM2lx6alR?LL%pAu>dHz3bEyg~vxYcHI6h5)e zLFROSP0dk_exdRmL#?dUI?kDAtd%(Ldx&}{%Lr#RZ&QrJ;U?OEb`W4o48UbNlEC+A ze-cR^r|(~LbJr|vwXCo}*EjNqcqd>tVq6lyYCSx8IH=kG&^Gt(5lx#>-tRVNSp} zV5%a|N6LMZKiyWl9AJgPgB$lZdKWWSvIL-CXf`=tmRW*-JkJU)x z*Y{h#WXY({mUfC;JgseZ+FDP%rrN!G_6!C6)cz8qw=L5$HrBVZfKu-9R{ zb^HCP3-~(^3Z@d1^9G=D89BMn_r@GwU%s&SdPqI!^t&N0(4wGpS)is+qoepabNMr{ 
z1MRud5O@U5a+DMKNW$(-s4a#%A(JrfQv0q~S));DlBvirRJc(I^BAyhPr3WLld76( za=ufrwS-@C8pcNupr%l$#FCGX!kt_-4;*|r8W33};=UlOfT6q{{~llSD*!n~6shh9 zXTQ@EM9yDiBO0LN{J{Km*w ze4%%b4fBHU2GF7m78KBMx8DxvvnhHI%7Z!|AZZS5G)HRfrRlNBJ+RaY#F_B9Pe>L` z!ZxtPkL_IaCs;;G3{hvl4@dET^dVjobT30ikkXy~oj8g^9tzWitD;JF^{<_jNef`OXXH0~x#fUA`PGaBMU-81OL-)l*;G*w9wVEFeTT z;2&a2x0rAiHDA@!i|(&VgwX^<*$^8DyhD}lCA4QBzkRE_)%Hhm)F_p~V)TrN4iX8l zEc`;u5_o@SZ5|!g?fxV8)iLgEJ|{n|XtZ5^%wDELd-sgT=MVIai0ywQ-C*O?^JAa% zjoPkPwXSyz&NK*4ba9E$@l;1F3^CG>TF0UF_m2Nkk9v#XP40Os`)HTh51o9vQSwMAkZcxG*miWkC+2t=_mOM+hfX7 zr9(3DX9*N~xMwWLoS1+Tkd(Y>>JXmj)Km4-lj64CIA7*9{L9u<4_ot1cQOyEY)W3z zXUd&-Z^m62@ho}JK2Z3=^LKLVOYh&a(o#t6!0q)3X3w7Ti@Tn@L9bO??t>rk7{qu) zfLiCQ#u!8hB4t4+X`7dI)07#z?)jF5v{wTB=UZ!^Oi>LlTG{LPfdg5ab`DmSLu3nO z`0B%l6~AV0HpzZutaqxf#-*#b3@>FBKf9dRcf;!B(OXl0ABocyorKNRt#OpmY~X$D zI9BTn1x38L<>dyhbrtQ`=IKTE_u0}wCV!`=&yvFG;_TL)|FWvyte&yFa0c8%kJth` z7wuoll~s>ge8ze^G+Ei5mg^vDrTSP-s7N0;RrX6XN5sZY8D5) z#+(_YT-GgWwS(th@h}K8=`x5z$+zhvVyVOs3pl_jw=!gF8eMJn|0M#r<8R0gpD7_x zrb+lI;&yXRgXUAYxn4mxo;w^`t`Zh4B3ilB)^j@`winb8a>Jwj(}7Isp6;9aTTbOl z_Vk*=^w5)Ap7vd+vVTZnp-OYcl0KG2i@RRnY0!fFWSipdufCi+{Dzg0??i%&#V{yi zyMfI!yLrk*gsG%Yllqiw*vlyS$hDUz$K=Eq&*}H?yO-;_Dr$67?&WyDfhjHKydSpY z(%XEhRLV^7kfu#Vprr zWA*e-M@KKMO)6_)#K4%Qk`bqv2?hn^fQCgBB7Wm_Y@Zr*Djd`Ty^tCuX@lZoAn8vB zZvBe)I8|DD9F{uW4bpc6dH=O4Q_PhD2v*9FT;|{W(OxExVx*?K*{9se(s^2ae

B6Q<& zT>^{`SDVzZQt-m1fAqudBRC8lAlaMK=>4Y0BYXY)X(1T><8wbt!y7INFGQxi5ROLp zjfjxUEh%X}q?z7$;{Ha}B}-1Xe3X?54$V~Q*KT*t(_2G*eZk^WzP>$asSC|1 zJ!3$kt5YBI{IB(Qy5zO%v1?Zv%e=I%NGVu8Jzv_kyd)o z&f_NRFIzWg_cF{TNf-b5NBjP29sBa54Q_)fZlt9N!KKde=HQhv2iNX2)~R^73a^-< zVQ{hj>aY?YX6+#fK_=wA_+ml%vrMIeK-Z1D=ff~Tg6KsY!SB-U|&qg#p8HbscVT;>d2qKKHY0G+lT6f!ch)+z7V zcz>1nj8Q~pAT{&xdn*&%^Ka0ADv{lrHyfd zJ%*5&;upESIwP(+g-$^_!3=>Hq~SIuOOSx974Z{q`oo1M|o;) z_Yb923m2kw*_cssN2rW{C;gklFnUt0F>gkS%h}Ll)g3q>)5?s^u#fa0NUEa7?{Uv6bU;{FXmc z|AY-0{zoXEk&x|NU0;OS17QnHAv$}W=c`czPbRz2YW*ZZTpR=_s%ed;;zg+W9*RDj z9!QK>Ad&_2r-;)1bbP0b;o3S??~-Gdu4wZcWb&~NZ1`f?L_L+TQR3|5HWbkXL}gYG zrVg{fL}zIlJ>ggdVi)>TIzK8?7b8!r%}LeDlF~iSy7eBU*~k4*uB#(StiUo2bK9p8mOt=TUcVVQ4z|q$*hyprwj+A$2`hO5_{4N$8 z>LQ>P$(HY_Q`_@?U5uABbf907Z>q{ z9JW==TdpE4K?JN(*$7QEc&<=);Kc1Ho(AX?cbHC}P}t&|ObHf@G0*#6LBW!jkF%;r zv+8k-*Fi$;`cZYbIV9i^ktXNT(qJN_xcLN$MWJ1MK_%>;#x4G|XIK+Ai5r=)_#D(# zBwc|^LSy|)ZW*#+Mo66%598#?!bvqnO=@+I9tA;-3{tY}E4a>aUu8!*Oh@-7#(cJG z`}4s~sD8u@2S~uZZz85xmrg=H;UdNzFqV#oWTk5qDN74?O3At`v(JLAHyR!dA(zAD zXKhRVCHKu2ot<^Y4IoODR^*;_i#_V|1ZxZU2VtThWe3BU;=*xq!!g7V!|70+-Ho$F zSPp20#RLoj<~}orL0aux{Gqi9-Oos;*^QNzox(iR=L$*xhV6LDgJ!)~5ORH2m)QnU z!t#r@nTaQ6WQT#)Y_PD%6{8r?8`45<_WNU~1c<9SWN?Rl=qaFjI{p zp^l=;TtrT#)CnsfFusVg6mT2TjdsLLiep%GD9}=NK2xN7c7{wSU@G__TsY#Y$G~sF z0^QbB>MabGGzhVZzQF__kvc4{UedRO)jTA=rpqSe#{b!ExJG?LJ9t|2%?<*|d=bwD z?{hl}O;KVnO}!c-7vtX-zGoqJM_$29{sU^EOJ$2=$4?$SQm*q1nJL4z!=y)^1y zTZ?45@33C0R})szcgT2+=vIAWS{X&HRTpAn4^11QaR=3$69XcchB{%12b3699c%kJR7 z7>pNyBxz0b3D6V+Bqh1g`5|J?pvdIrS1c?}{`u>dBM0tc$2AB`G2Vh>?pD~LA zTaKz^e=$|IE2<4CPDZdq zHSR9_%cjC!&lC&VB#9wImh!w&?bh%?=(re%^Z-v2kqsYY!`$_=h=k#T!(al`WW1j$ zk#ktp+Kst^4#&QxrgnlPXOft)j^gFWg>Y5;Fv7nGss4j3|UU#}|BYMRQ2#W4rd z=dQWZ@ysrXwCw|1v$8$cZ<1a1Em=lE`fzbmGo_N~I!W(agnSiZhMP=+nF$QJ-kI00 zz)OUtvvCA{n4O)CbY46PaS)*U!cRT|Qe;c3{+0_X$C0v7i;K6(c zGAa^=+>9*6k06KEFfVNdt)ck2!cjuBCck?m^dg|n1_myhm{e}E@po(5_4_pB`e}(5 zPpqn*+ufv0h?k@d#>cLsJOwax*Cq8rFz52()zif|Y6oA2QF|e)Zxx6z4|B#3~t33TVx!OJ%D(I 
z@zpZCN1+|jJTH}YoO^)sL^0@b>hSA!haKy`er@in*# zI~M%@*g=RFXg~i2Z^W1-{2+{r*p^dZ{7&lo*RLCq`j<^Rly$h~-lGr@x@obeN#E*HST0)FHy@nloybM83QGC=n(n!B^S+J;8-Mr zN-)J`yGBs0dhaQq+8sN8{%~-IN%3FpeNrY-{-DggZ18;tSth81?)u1Tc15NFFN3|Z zEN!L^LElkF z9#mXqnQ= zxk9msvUA4ZYpIfr2D5O`Fo3BSue{Q&Z8!1QO+!>TpBPn(j%y5e6?2brr%#;Jo`jBn zxVX(`UtX&)7G8@L!5IycOx3{UVKsC`XxN7&pVK|&I3KB8+7?9_$UcR^ul(`S!@Xl> zprX9|^y|u4|FVd`!DL?3QwLcSgDv_t96=KT4?5#PtCCWWrOSprP=4uBcq2Kw;E<}d z7-ns`s{?)y%9`WoZjhbfrMZ0O%1-02=TePQkvks}11yC|FhE^f=P|NxKvht^fRLj` zdwDFc@JFXBmX%L%N*j(W<=mAkPb_1oJn6Y@FX}cL&T=8d@ltupr$u6x;dmXb>O&-{ zh)5FdR;dpUg?k4TB2_TkqHC{SOdQP(ycELNOY(-o%O~V5-9F>d=`QQsdO0{G+K-y% zSRJZ5W@UjXW?fkNh^3A!H z(D17;)ZxEL+WmOO6dGJ^Wvjo`)X9yG4S49oS|!uY`0||_Bw$v9PG@Ay6Fe;XOk(o1 zwS63_W$~%vr_$z1XWLP?aTIp8n3iz3=mH~lpH($YjBKwttrxd5oGH{BBqJRgy^-Lj zIb@LNZAHStx}n29PL=5fyRY17ck0xu*I$*{HxLw0iZ7^FOWq|53pUOH@Zmod zlxRmS3WkgHKL|x4o%zeF2)6vsP=J_qasR_xdRtkWmQP)Ct!rlH`rj@tZM#$M9NpEe zzs`sI(S?>g*VL%s1(Y`s!U+>IpK*qZ5FDb9g@1{9PK3HD?vu74)XZ*>Xw(^GKJ;Um*o%ty=p zKuEM%eiV|zyg_d>VlP=U!e1voiU4V`Et9#LEw7_}K{xa8h$GvlJJyXXSB={Xd&v|R zx@w`2n>;xF$2mX(3S@@L?lMjivnYkCCSD@BH9{jU zavJ%t37Y3d&inIZ6BnB}n?#88<;9Q=>x{l@FCZSDzVPtWSJ~2!-`6 zE*E~yW)alAU>loBaqqO(1xnaOSDz&py@7PhvkM73-XqN)EH z+e6co#uvT@+=lqYdYZ8@G$?2Y9KK)yz0(R92SApl$JH;!K0|M)`eI341`*t_J4j_S zmYtO=`$CC@EWLZ3<%|H!Xs!1;;~QN)jhFy0sy^b4d5Kx{&%wJUQ8TYG$oT{TsX4w@ z!ut>&80v(84CRvY39zIWqYv*e)?Mv6W4fB!$B$W~{Y}E0Rw@>cBaA`Wb|~kuAi6{g zz~B_H0~!xXHPKEA@fT|J_-Asr!OOdrHwsONh|*x((PH1cJsRqP5&$sVX(D8);l=Z# z-`*{oH@aK5A`!TcPf`reizqho)DxHlSDe46d!EGH$-p%5FqR_#Ml9O^yD9=sgC;RE>jByWWq-&TuqxsYc zOK(Vi32xoFbLXzP+&=zscO8|mlB>LIPE+Psdqp3{>L!vbMVeDch{~JiWw*eUHPx>T zc<|s=Ou{+iIqJ(hCrP2sOT0X#`JRjdY0-6;GM*;f-Q=h#&^IVTQ5(}Kiq?o$fFN?= zXB&c~bB6p^tI303ZP-T&-WC#DEK6jVGD_RbvHI0AIyo@=-BKDlGN=Xf5H2VqhAD;Cs|fBZOo!a*NRVugj7Ui6cR4B4E?C{ z3#UOA`3v<Q@kLxCD1x}qkhDPIbEO9A#8H%1uGE2>i$yA)S9h<^dn$mg5N z%STpv&!bX>jCx+2<#WPP0$^JtXz^ZQ(S2U|WZNudX+YL<2AkYL=T)3_@1J0#hcktsgl#L);9#7 zI?=A5fpLbEkI(v+Z6{Q(p6>l0#vW4wlY58JeR*h>NM2xyj*u|0cMpJzlhwU!{u#aR 
zs}l@6IoUj*>IV+Yeg6FadRQFAYI0XcPt?4ZqN!f7oW@3+O>Fy!;TI&5=jP;0Hf^7h z8a+;-GKYg#R_%n0Oy96e)}?!W`=@`Em^nMxRW;5Run zYJB-HTEZ11EQ>L11S)e$6#?3s_1I5y#yE<5NFb343P2pfb?ukfpBGoHEX$|^sUm4S zdCk6Yo~51`8!1w`M;wS!1hsktMa%t#w2f~iuscl5{OKYB2*M@4H?P#`aACP@U-XWK zHp|VUqK~hw-hSu5luznu(v9S}75FFmUW935!FQO5e6Fa035F5O%vo02;Rd=7Fk6*~ zukAe-6Bgy`g+iKBRp+Uv#I>g4<{216>YxtQiA25bgzoCFj9 z8nlAQg52T@YGDJ#3J_NZ_Z~%O6_VIY8LOe9ixz%};KK^@cF@4rit; z+Nvjg0NPB{Vyp~-w3C3r%0~6Slx^Vk_UBsqirCnV%!cI#`Fv@_q;ca?-iP8x`Ap|L z6*G_!ebBf4WVY~AG|(&|5xmT$KNV59aCt*^3jGFBy^Ju!PdR>mT^ChWtR2vKtc=XZ zVx1L1EkiXc%!P`N(qE8EV&DPa4>QtufF^;FPz^y9hz&-8i=WOHf8O5SPN{Fp_G)zM z)~$1X=nbaJ8eO?Eat0YfWyMPr3!edtFk@MSw}>GM=qN-*B*p=YVX7G9@M?G?(q2zx zNTu}yB^j!4yYK*uVznI$+I~CgoR##{;~0Ofc-V^V@B|@d97W!tkWEdxDJNcs9B4$&EeZvY)B|!ATJ80+E|z=FSZfvy7M` zN&K)E1&T;##a1j*)W*)7nNogt^W@uw;b9earIWbln}cpl7n$4WVUY~eI4%a%7kRV| zMThC8*#;sVzfh`g^H4poUa&$-?y5?{Y06{Z$m#-&%kN~)$T3)c zOu#Zzr%I5CiSh}0?@n1+Kww~C)7CXkv+cLX-!5sMWUt(*6B*EIYh;7Zq;2~?_UtZ+ z+_r)JP{NaUyBD1I|FE0QpZ0&+&8))7o9;6riUU+T!Nzm3th~H8&9-nia*jc*1^gFb z%1iBu=8lgA>xrXM9FwfZ%4=D{r+chaqs(A>O4R&;5^;X~-#E?}5za)GN-mx#h5?`C zL~MtBN_fa|4x;CyK?2xAop#+|Sm~9iRf(03$91o|juBj_b(w-V5gCF2gbS4a3H3$g z)`C2`6SPUR9bB{0&d%Q#zce5R>r)chO9;OOk0b^xo?l6H?@@B23cG5L8^FbfDI$u1 z$gx)|GHx(MQ@$~RvY$@Ry6|cziDj4}#xkOf4$V9xb-%i-HSSvxMQ!oB4YKaj$B!c^ zBw#ZJH=Uzx{m8!qUfr?LTfO?EkI#Nq6iDyIl9g**_IY?r+VgwW-8Y3{dyJ~TVfi>g zp};f0I^p_6aEku@5weSEsIOmd1X&j~FvGWKw5mRQu-oNEj4?0{AA^qyLyQ2D*1jyH zb1H?nn0T=zSE2AX^$#Oq%42^$diXG$D^p}3V#onO78}v?*s;ZYdp|#SQpIrT?G1bj zmNbtYXlH+eF$Q8H&8k&bFLr&F+{N+BR8HsK+B2g|Ko5Zl!6T+O?I}3B;5ZM2nJ~9n zTY6A$zE5jXFdI36XP~{^x2#>ez3rB&{;Nyajg-wmyn*b9S#GVr2PPYo9amy>?5g+| zmnH-%b#eWMpgg$wYsFS>Vga=OjZ!*@QrMTtR|rMsxqihHVUk%^HVCg@jq|g$*tFo; z9EzqVHplLVe58(d_B@b6q7g-et?OfQ1CtE%sIJ7oY;mn%bo~t&j|LD*np$hIS6`2u)V? 
z)fMVXrzvv1jPrkB{^A1ZL(#gQ32J=>fQsN$B>D>7CZkI&V~{x{Cy$_<)z($W; zfCCN&bP7l`#p@w#2K^Jg&P$m%1#wrCDF7e69yOzwT1$p^^N|m?o=txb#Kth$PancY zcQ|v#^Xt2v3l<$#QR~D=t+$8Ao2r1skYF(pi=kXU^BsgxxIk``&`eOApSb6^0!+ST z(9C>^nfAzTne{ib0L%FTj$s#G{ZPgom0aoi(BC-X5wgWe$5C!Aswfwdl_*VpDi{f+n1rh8M zsTnXMu(a|-mQFxqLsgYSk=Y#Q<$m+?LdG{6{!w-s;*MUV9$K6PQjr2cwQzu#0R)lF za?)BNrH~ITM#B|vk8hZYNXdKZ0lUqcg(;HVHM!E`!2bOxpoIOAr$u?C5h(%r$lBn# zRE}t+5sz$H-OcwEIPR7jp~>Nn04W(Wc5D}VJSs+(DMvLVPIPQ+`ulOK)`sl6Hmw8x z5;wPgrv83kWP&>cX_-&E(kOR;ejTF;b5YuXSeXqQ;4w`dzT?gJ8kB&9{NIxNvhtMW zJbdKhZlK_3DxWa&N-}sVFjb6241IoupSIJ|ZfjFG#OwOPu@i)D*H|>N@Q9cn!o3N; zC_W=MKSVH==5m44WEt6D^+7LrdgtnMg z2KKH(`(fL|?SjFD4-`b+m!fgdMiGOFK6P4tL{wztKckQC0_Au8{(kq1Uk~#0Pd(Wz zqosnF8C%iauP-R1IV^~Qucq*nQBcSalte{`SL0c)u7R7o^-9Wxq&*R`+OrO}_6YUE zR%q0kEA)sop!~JbfVw;o>9U-=cR^W8)^BWZ1*H~oMuaQrobBzY=fQhrA5+yz2LQ32 zXV4kX6&`>bCn?QH&LX>~deNLwmoD`rBEkbKuTrw}MfV2u#plz~o?Vn@ECvxYxA9qy zyE~wNe={w|f_wKyDk)txiPG1(Y}RG&$9BZ1M;G{P=2sJ9#f6cLf?fC^LzIj%GzA zP0CbaD#S?*(=L1w)aC-qhiVDeixG1VsEz=b4-zBRf5?#Xak8>b)DdQ;riC)qdnK0T zw$*+7@!dufx)G?1`v$qBTh_01`+t7_o+fJSosq2@C6;LkLj^i37?WD)SOE?& zS@|<*D;@rSO#)xi9?Yr-0QzRZAE+fga7lJA zsv2o_#jwsgRO!mMF`-L+!XOAxaE=5?s~rv^ssHL3o#n6sW4_2~HW`Ag8Qqb5x{+V;DC0k=oLVZ*#NA~&!T5p0Mz z4Wt=wnB_W;%_s7GqVFbj@;%dg`SP%V7|O{B3knLGkymcu(IYZS<@Ah1o6~*uKYeBJ z&AoidOXnOM)|-vFkX}(Ox5RNb{?bp>s-NgIq{o`mVBP1R?5?dASsvkWBpMd5iUpq9 z>%-M65{G)ro45YnuvA?u>^_-h^+KUc%E#YK6cTDlV`1*=$aZj41wx#{N0CYwsqKmKgIcsKX?LVV%hy#{>n4gMbt8EW!>@lkYdLP=j?}<$ z&#x{0{XDYA8l$%U-Mo(va^mApOaa0NnMk3kG25c8K!w-j~E)DtO1z!%C}V%92l>Ag2*V$Y$L zjY>sx1;|9N2a+UG4>2L6w=~9x$AdcX{;EHNnq#}7=80M!O0rtR^xZS3P5XLxyOnL_ zt6t^Lp0z}Uhx18%vL={nL$<}wtipz3pSAmekI%VStphF-XA>NBl1(RZdI)0fUZvY*KO2vr%lihM~{+$ zVRz`Wck*Fj!-uPktUZ4?dw$~;!_^Z`w%_$BM6E2o`emAihg|;DbklhA7&KGs%^V#Y z9+Z^W|Cl_t+3$$K)$Jcyrkat%CGrk2zj0;Bck?a&O|pu8e*ek&6FHqBNckL#Le$lc zF0l###P;*HCZo{*m>uV9`zO@n14a5MQz+}Hw^UE^qOR1?p_AgD^A~YQoJ4L-phIwq(69 zo*6wm)FTDDa=yom8OrABGaOddovMFpFw?QKYx8U`S`yw^)Nmhn)vG#so$ufO_(U^m 
z_|h63k};@FU_W2O8F8KoqapmZNDZs3G?`$R+SzcEb2tKJP+zTiJ(1i08`Ej*h4ydX z4U6m=lWilWMdK1sE|sXvkGPO^a?I$+X(Ja7-qo+)iBoc6QTz6dmQTK-9Z}u6)16m$ z`VO6>y;WAL;MlMY)gvKQDJqs0uJ(NHqjN#k`a)@hzad6DE<+)&BxsEO8>cX|(!5}K zs~5p3u~~*pFGigocT52XPId)>4+<~b4pyCEq8`oXw z_Ik&M_A7im=YxA?#+^I=7BfrxRa1|5&$FIh9$l+nT`D`aOKYo9@gF^Xee&bY8b<+a zBjH-v_4tvg&FZ#2fjww8iSGNl&+HwvF8!-gS@|V{YRPlgB^w{bqL-xE*P18O&C`6W zVxQi_2l(12ONp!;q>Utyf$8VNf9&{HZXr7&q8kieDS9P7rhw*9gKSzg$~}nSg+D?;Y?3Uo(Z(iDX6?wr(G;yzwd+x@Zg~g)N}Mz8 z8!wTVyRu#0Wl5)2j_ZEnex}xLCgAa3C&RoX|rp^!ORXGVu#bxU+sex46(H zlU*9vaw4IdzlN%o^V_}s#<;gcYL748k!*Xg_b~U{tJVZwRJ3{7`ed7l=c#}3V0Y_Q zJv+L|K@8!DQXwk2tf|TAP0A3XTGx9I4>0E<(>A#~8-q6Qs&#;++Vss&WJE(~Z-1h= z>9laaYDA7kxb)`D8`s8gzPOk&)OP*KmC@L*fFi|gPU*%4QAG`ObRA&bHsHLwTU%dn z)`H~EkU7r1fy0=drNx$ue>ffxBMR6(YJReW1mxf+1f*X${)$M%r;Xt&nhCMDq@)DK z2_mo(%DX09?7R8S_3M{TTWKqfDqJI?$0>XK02obCH9!Y*AO8mYqktt2DQN1Z2g{mI zO~NEZg5GP;)|;|cdpDS=jd-#=RMqau7vZYO{vgu!et&7PF(G#1KLLf%bBlQwHJ^d+ zse~7ki-hS{xOF)l)|E8}`FGmB9Z|X_y^eIoy36j_Nh^(v9tspvORk1vxpTHm9-OZz z7(wXJCLY8BBQbPn#l>HxR;KqQTv~UG>gyaMR1~yKVmyb@#;DEK5tX%~3sO0c+_Pk7 z5eTRYqXC-aiyF4QQFuRTPbf9j75ggC0PUj>(RQ=xBS7hazzV6fg!7MPWxAdIg`)E% zRYL{2&WfJY4sZ!-ao988Y=hEuTQSY7)lGEDz z#=$scfLOvQexzZ=tKyi(IV1bUe1v zlL8E zS+O>vO{2n)n>LZD*qU!cz~Qxxjcs(>dg4$8ZV<_E+>6FV*GsJW8Taa~_xXR3_UBPO zw(tKho=Rm%#!!YLAxWrAp->t`hC-Ssq?F7`G?^ioMj5!GfRMiKj*|Z@qU}3s*J8+ziy+_3#8Mt{k53Xo# zfB`7JlP_wis|{Ya7Tdq?}H8K)40NsZx@xHgSdC51$N-O<6SZ zWce@hDOgJAP13xcLP4ApvTfUq$8dC(-#H{CWMj1M*Eh2trM{>!XXpbo`}ONrx?ITTd2HIQj+>)-i9aupT#$h{BlJ;2+$_0N=s3z2fYs=l#7i=Gw9HM-~- z`RiFdaQSeTklm%{%b=-pMe;7weR+A3xG8`uYd5PhvY~?g%;ah`YHL8^`hS*xz&s@} z)@+P47il-Gv%Bi*9{==y(JKF=4mXqSh&E(c*i@f(NsRKU39!d;RBS6O7ZC_?WJt=i zCd<3Yv=~lIC`G?Y4vZgAmoO0c4|Zk~6Uh%utAqt0!+?B`;udOx-)szGX)}p_un7Ll zJ`jo3igw2=iCGr`R?5$oAsAwZ77Z_>@#Ez#S_g-RXXT>`=`(K4HkQ_?tb?(eD-ujQ z^&YjY-$aCA2dFXWG=jTnK)$E>u+xpyly8 z7p_v!fjYqYDuI!i^^Ax<2-YHV;2_cnIPCv&pnBuQ%0q?6_n9#->B~$Z-^0bRKHtx& zukx5F-Z`GyB_Z9`Gvv8wonMi13pNdO4%csgO0Hf#CwxTdlPM_cQ!pRPJ(EIw-Irk= 
zcOA7c?-k(!%z>vR1Z27&OHZHXTD(!2hr0yc;_9h3+GE6AaVTLj5Zwo=tDk%!!ur5} z7w7eyFmBw-ckdp>iNqDLH;^E_@~q~n)~%ka{e$}T9loQnagFe<=ff}*d5P#n9G9R* zP;5(>BGa|SF8ikapUu4DU5A&&C_8)Y0^g7?I|^clF#k3y1E{a&kImH-TQFK~b%T!+k!mHoUDT zOgOGzcYKlOrQcE$0Snsp)mi)9J3hV!<`*M&`RA{ejP>ez=uoN0oKsGQx59Y zZ;gnEXkiXawn;5#nx2w2^q(Rk{`QRdm28K-&$F?y%yCqoYYqb5#n5BS!eswpf;c56 zY=HbckAtA3`lT$oXg|Bb=9s1J`tYllFTYCv1W%yg<&(<#B1gZhv12zYHa0DOq+RY^ za(C9AlQCW0-0GK&__B~BQ*I>GTCpTyD**o1P#vv2yY-Xn_tlI&UynYOqog@pb@b@N zK%9E@UraOly#M(6_P9E}m_V7t^b+fJ(jvm(*Fv;G%PkzAvR*A$UgL55UyOk3=kGSw z>I0I8Cxq<;aRco_!`4|4>40wFPibuygI6)+nRDai`Z?c!U$Q+|*IZV;@cZY_E)R0r zZ=d21kC<4xHq(lqGhP(ZME1vaojS3?cGE7>D8QoR4mg~z?keZy5m=-m@DM{_N4Gh% z&{`HbN{G=n_uziPmd)^1MZAuUjc~|D*}yES^1{B@6FKtd*f4Qx_5fN>XM;>#E+ksv)3N))a>xaw(G^- z&>N79DY-|oIdHhOu)%U9#<92XlEweaXimLpeeNuF;8gMRWIAT$R0|E^vHNw1pj7Ev zS+!EfKbRb|6nP&U6VUviGmEqwI7HB}1)9FD#dO)p=*~Lp!xumZW=SmNxQ55WmSVi% zQN`w`=66olxjY;_8E(8#p}elbHB`9f2(Bi>8KI8kJV({W+&P>JOcm<&x8-e`SAUYk z`31D|+T>p!6V^t?(?x&D_APh|0!yXO{J{~9-vdd|hjGP({s1^QH>Nxt=t#xX9Zt5;<3n@2^JeaaO?;@9YCTZDnP(-(*+BnOg~GYj1Az%a%?lZ z_`xM#U%|;|#6NB2O3l9G*JY(XZiXnue@?3+rdVaftOfAtuS=az|Cw0cPcEp|HEejG zq0XE08PW<0i~B}KwO^(cm@Elq3bI^N0Z~ma{y8Q3bQ>L`jHSjBIJKMmpduO9*ns;{;&OLg3*44k3 zXMQxbd{}20Gr%r>XIHsF26FBA{bxCsChKK)_4kv^R=xARsh!%QWp6*+xpM^D>qE-^ zAt4ssD(xWf8SEVd1@R_TknpDU`f{}=v;4(i{8+C!`I-(zp4+qS>KsK^*P7;2gJ%-2 zdh_`nljWK}2Kqj*LM#^ZP6MQ1S)pH>f1w7HKQJTiK~I^i`>Uo_=HbJoCgI!C($W%) z6n}?s7SwLd?ZS9YM=4~XZ*RSoy~GTF>2Eyagx}wut_oJteSdm=EBym?d8 zv@4Dw0=mY_S}-IsHyO+9sx{wsL z8^(LBhbd2W)$KD@MS0cIrG(s`n0~Y5!Os4r*CtFTp8Qu;Io#Cr?2_#(Gcs2xj3^kQ zY~A$sX33Gr9(jKHpyl}Y}6mZdvPer&QR*IJJT;@5M> ziR;@xOgkW7S$<+^;DwW@N5cnhee^W&=uu5`)7>lc4$eq7eB^Q^KH*l@@A!(S@@^tySk(o z*;oxy4wVV)lN~>IaEnb{U3##}DuwpppWdFDYdKrB^1``pLx+SOjh&*pJj%o8h)f_P zcALMst2dwM)ALL>tLPaEMzsm%q^4_JHh(0!OmGgM$HoeGC1~TmrF|c{=!D0{{z_ct z+%`bjl)8d)D8b9!9uz@X;|PGixw%=u(+s5kjEH2bn`UVl`q=VE@`K8oubo@}q&IgHp7Suv;?K<|;|1s;r{cX~pzwVdPYBy%8e}Vv) zVl$@FK?*9Jp2suTiJch9XAh|_I+~DUbEKzh-PZ(8>JQJdBVcvoQt3(c>h)a4uTHSpeaebwIB 
z*p=P?^e(-*G_d@(#gTK}GtB4&x3fJ?RF;+Ym;zCD0kS3f2NiG-kqrxIvFGf2Nf;O? z6XA_09*rH_#)WbdR?#xTm@G0!z_lioM%~=>ucof<=Zg91<=$@(mj!g{8BlLP9}O>T zG{xHabylgnA$|%oS>{6$UNiJ{)=jjwK1IuX?OsK<1tRbTu76|h3-1Wd-;n@7qEV^& zdu7^KuQ_|Wm38eEVk`*rt$(UoU)I!Iae^RsAnK%C>S5X&`ifroH#lgWxGi6A8vbBi z&9vn0y+_wP)*n3BYf<9N#=95bPPNKO58qYpf+6vPKCX5uF3Hg|)CJLuJ*o9x%l7S_ zUz0va`c!R6mn!X_S1m0WwAe|?drPOkf0z6@9oKN8;&S>@ZI5Rmm!gUs1-eZ2Mmp^P zk-$gsyZlauO4hg)Yh~P!tLnA1q*wXxQ_%nHmyoR1*>;SY=@cxcGV}uUX2Fkxm*ML@ zE$^hvt5-F0g@~oEbgP^+X=rELS$l%s>^Eh%T=!1<&b^8*avozVDjM6LRqpGb50Jq+ z5YUw1XH!uv(%gBmy5wGMN9)`?{Q(7kZNB}=*=<)N9bmp#M~Snr=j^j>-H(9ZO+MU| zo6+4mE$YCR)~2J4!KdX;tGT#$XMkham-nLNUYIjpvPgb;vupNh8K=Bt@D1K zeDAucK^Lp&y%L&-I!mWS=8TtXe_D0I)Aqwg9v|TH`rSIDxP5khy}KhQTS-_jkf$e- z>AsXGQ+Lho`2M2dll?_DDfd@rOqJa+sU=>nb%;v@3jngh?Jf)324so}k5%HYAw&H^ z5~P9^`fIH@)!qgsy}{P4Jv4wrSt5}nfB&Sn z84&fKP}L#Z&S|)uz@Zo8`a#PUaM2h>B#(l7_b5~TW0q7&aLdFA2Mo$=hH6UBuk;DXq?4d2Zh)%#rua=uG(M~t|p{0RFJdSJH*VIYa69#u-i=God|q(Ck-b{&Gswxx`ph`_Vup!ik_nE^oS{VK(x+onO^_NzzihA1KUGLEATY9E||CSAl~c4L$bh|64? 
zh>-5kwD=baXK>fvjQGr}hWS^YhHxt;yEVXsIw!}Wo9x9Xd#v$^F!6Ap*p%RK7BSuz zpKo|-71{9aoe8*+=J4aytAf=`;p2Uw0h~K$PQVT4DG;#m?;P#=V2G&IsZz%xVLXnEIZDjQv!%(g!Z7K zLX5jbJ~9$Ee1h~p7fLoWDtCPM|j&e23g|F)|2?-+WS%M1P}mme=U zFO0jPLecgI60Q~_{(xHw+B7?)@E!nsSOQEW=(Gs48!P5JgoTDW*8O6S(iI>iC{|ue zZ7dN#BF4B0md<$?io($9)zu&`uxQ~a2^c77;B;Z;^B1XPX|U#gT*7KzLX60xwCqYz z62N)bC!&i){xiM15Ci!jkOEO`g8a>@`jeOn#=OUAw9TD^qj8HUOFnRh-J~)I)}y$v z!jV-3fB-@pVaD}8gXCp)OA)sJFB@2BmEmpJHtVigvj$~v&-ND!2dJs5r*HbQ1YbcB z4<#I0h1CdmsBJYLL@1fTTtpNAia^0RA2Mv%uep_vFDgu#nl|x_h_&dyFIK=N}{u;P{{%=Uzb27)LarM+;qgyjS00JE7+{U-(EUm)^)P(lknaZ=I{FgN%A zuB<>TI4v7NF10;@hTh)e zIXwsRI8(EV)E1$+zjFtqf8w(ZPu<0{T3cI!i_i5NH&k_Xn@^`P(;2Fz1>W^0{MXz$ zH3ut}AMq~=9KL0`w0j7%VeS?0F9Qb&1FDg9hs=|w+u8L6{kjYd3-@?OC|vNOsc!$y z%@G-8PWp_?aNUo{z zE0TDg+uH0MGecd7?A+4wcc_^P;07&&&ChS`G!253&cMN=3Xnb8;m~6a9=t^f@f5fq z^s!rqvzUOfz`3b5KPR=5o`0K?sy7LZUSHEvQ}=S|ndOf~J_NIEbZ!Xi|K;n~1D|A7 zpDco#EEwaUlF=8bqf|}1BF6yw(OKQvfD=IzY!H+c$()baJ*8J+^zh4v%DLO zBwxIE@o}piqI0gLka;sD3x=?Vgd9fFT2G}&Ga=e#mUKI65rjDUXk4Zgb3y_h?kIPu zVZ{k|ffMDAj#n}!7y@0J+6E$Y^km;jd{b^K@|>mC-Ujx?j1}?ITYY}1V$Ysy8EW$H z!#B)oo6>>h#>>Ktjh=B*F&D>RdG=6Y_0CI)(+wC+u&~KJ;xUMg>?c7d4IIa{y&V+` z4o_pEtrQ~6w>ay9_L`ddZ}0Fj|9hNUWmImZp7*YePNL) z5;}!xL{5$p)=1OlLppQKJQA+9np71gUA!pV<}vNY4*jv2kD;oi?7uXAOk zu0NI)WhGv(VstFAFzum>&N92J4<9NjE5|NdkC(+=J{l69X^DYZS(o~)JjHo6Yf{TL zQNS^>i_QI*V;+Ad+Y~d4K@1!Mi##pfEKt+8W2}XF(lRpo@TWXPzCzaHyzmfzH?3jY zY;aadma|vUK4Cco1|PP_DaVE^$%?Ci~2=7xrCuzV;+ z?6HwB)zeL@JJ?YA>(#FMb)oFl?0|j9V3X4d%-AtRP_Rp7>fwd$E|SyhJW33;lPg#* ztf{njlENjLA+YAR+1)ZGG38=aVj+atTdK3ud)7;Lb=-I6iFor3`z*q}kDX;DXyju4yUG4@VoU9aA~yDr*BwtG-N zFxNS!1Rz0`ba3Bf^h6qw?@vGPj}s(I(L%Pd=222)3EKXxh#4N7meP_>0v*)kq z0gEyFuXQ>C&f3ft06iQidr6YY2n!PUo`NS*Wfbzg9 z(t)aS$}w$OsiDInLMncJl9iP;Hy`n1LTGff5IgpUwr!MCO5;{nb!oD&L=yQInAQpc zc{Ni}{okL9=UZ*%_MabZC0P`5#YIQh4zR^i-D2x-@H;d%A>_kl=nib8%j{|h*#}~+ zbBwSB;G@kS{gW;7?dH$FCMqZ9Mk1&j_SD6%!@Fiov3cM)oXP+Pckuq-$G1?%v8YEm zm2?!%5m-MMoHOiSTCnna-%Tm(00i)99Dv;=kC6H=&ZJvVXs^&OdlB;^=2j<9?hr~O 
z*<-sqcyz`0jlIyBqqt{7)924(CLmn51ht}p2fkL7SuykU!qpB24(q{4l@--~zE{!G z6|(~`td6-fj=IKx2LT}m(FMDG%mZTEb5d=k`CYcp3n^O`HZoL_VpMlocZTO&VLb;a z;{Jz`LT<@DG)S(X7HCeCaTeF@{CO#riG8v2`CfG~<3YfsZd8&>5nA8whm85VpbT@g zQ3n>d&NuZ#!`X2bFr>Y1Wb~CZTx38vIQ7e=bskI&*?}HZ6dfGBCUj62 zwK-)C`FKF*%M)ZK8Wu zK9)T2xwN~Sv}BRREp|K;+Cfp6@AXyRf3|v6koJPMlKuu|u3}Zbc|(|L-tY7Tw}IkS z>V=97#qva^@q)I=Xn0|cSwO7x;HO?B530D;LOqXV0Hmo4>msGc^q$hUvz&*q zTD$m?svZW#RbAr@{$6w57Ca;ARMfuuUhT|=B_)SlR1U6oQBXbBc&@I_Q>{&~%)82- z0aK3^%$#JMD61^LPVPY3>^U91TGnL0u&=Ib>wVbPIKR`Giq74oGdEaTwW(X!c&?vE zb^YMVFFrAStgbl+JZe+@q5aaUXPpLb+vb-WYrDft{_#T357P7I7sW03EH80G;_aQl znX*d5Tq--6RajcL-+s0FjFiIs&&>yxmR{c$W0T#_ZoH&MQc6JZN-vwckGli}ys5f> zr?^}?+v8$KDTNV3hWk(4d@QxovuD*0!y5C)u2nL-S@^iO&z8dHQXR*eOjH;z)xGDG zeIKh|$@EwndT?N6(M_qmEj63NqU8q9nzFM~=iY5Lbh2C||Mu?G4VgZloMZzo==Rwa z@u_cN;iIajhRP)3;>@H_a{BAeR%PE+ER|cnub==)U`|FBqF58`K3rc z8!>ms@!2&UY7D~n4Yx8d8vek>zr6j*P~Sd&F`7e?L+>^>wT+lCuRcyozN5G0c>nn- z`-iAJlGZaT>#ge0p?O1Gn>&e9L!|quzxybq_sw#ld~Q%p&X4ha`zG{?jC0MhKPJEJ z*VL5ZKPspG9G8?C&|Y?4L!Yw!D>R>LuF?LS0Og}~_U8ME@WPBoT#Ng>4 zH{04BE9e&$2Nj)pA4fMf#>E*-8lm0!+4DC}caF48Q9Pscb5;nF^O{MrHEX6$ zsi}`ooVqkJFW)KNXQjpP_=KD8#V@dQJg{%#fsWVq-uUBtX>0Y%c0JyJa@p<(ocPQu z(e`2XTF)A}UKX7z+U9K7o@4XFeenLsz?6_@cfGs~XX<@;G4PU9!(|n9W37Q-;(ZFt z+8X*-zteR1bReX0tHQ=!V=W5?w`=G5GdORH!Q_?+%gc&S-nOi;yx8TUU&=(a!G=Cl zwVF2^>wJIK-xnVqOSQ;+qg_?tW@CkOc(YXK zQ%{p!KF>Q;S*=?2`AgVRlbKOZhZ;B+=h`eBTG`wA&mfi0-~XP-nD^pE!C_4&s|&UY zfrcL^k6*w4>2i-$&4C73!QtUAvE)a;!&mN6&qdTP03DDOTuv ztk35(2?dpF(#Ixw{_x12pLD}-w85BxS7fJ#gq-QKQ);1KqHLcg2{pN4{$_=96SH^0JR6X|@#eK1ihdW`bayR# zI_15KSFX&Dw!KnzI4B)=e_~}AFS%FpoW`T4XY;4a)JUp!J+yJ>ac!?(E{?sm@3gnh zl-lg(GGBFA!yDtzpLfRZsCm=e+q3<VV6gV z0TB-!(~XBdetEu1)@pKO!lK&s(7^wyb{HbF$8SbqmGrf*p)0nPXP$Lv)2ZdGUI*8X zo#XuKMtWK+3>xg{rkZr<<*^}U6Z}VY$UMA#-6}cH9brYDN})3i4@Itg5Yi~IU3sjv z;f5aFhZfqknQG^lIDFcnZAX=lUw*MfN`14Z%*>-Z2Fzc0FuSTprd!-}t>5?JW^9+T z%Tt*uKXbWi_Ulg#pL@pT_S+lT=7q(@@56Tl9%;~jG_l>$n9qxFWxXZbYN>U2%Ta?><&Y@!^?R~Xb%}H>qZsJ z48>RODql7Gxb^z6`&{5!zrmp=W!w$sjywL&<^;Llt4xYavN 
zy6NZk-xE}HxA&38?W5#-ov#i|Su^q4#hvdupB+=!IQ49<;(nPQ`f}+y)wZ%DPF^qF zxPREQ$Nkm>$60K@;AQzqzV%k%s9AElmwOsSDdkzk$=;D_Hv3^eeAwNM=GAQvE9y9wf zL%!MgUHko#gStAKeU4Sq@LT>quCtt8VbAtu$*K!N?r$30=k$|hTHg@h{cSqhT83GY z`ab2s3UlULml`jsOif>Xeoco%w}|_A)4M_roR7 z_Q%A0)g{tdy{hYc4b6&Gex7di8!2}>{N=}uZ`?N;6~CTkc({$8=JwWx{TF>FcG&;i zFsyy+PwDm=MYp?bZJGD+#H&};#YbzLs=Ooy?DV&jN4C=`(Lb$&=AjNUBgSs&I>}aQ z(pFFNo>>zY^y~P$YO%?!DBsq%haG#gv9Hixo2$QCdq~F}KLhgH>mA+JH}OsV?E`0lczoRyaIFKoNAq*SxF!KC=yRDbEXZt06H zA6g#mRJc$@@=<{PfEPVYT!#$ny>NGvfwjpUiRXTH@^ALG-8<$%uEavswUXYF6IxZK zJ3T+$X0pm(h3~IC9LKDYt?wYM^DM2y&0U6n-)(AhLH^R1s4u#nJ_c32+QFT2eI zUH!p5(l2j4v$)l?M#1FT4!4t%hJ{CVynWcyB;N3p;a|B;1CM)_S!3)%S(DU2Fa(oO!t* zafeg)>z}3U-=|}fIQIW zTgV7&6khxK7gwZzd2y}Fx{9+NZLTEVIC1l+zO3=)A+rL1sL6F`J70I*N(GyN-QP}K zC_kh@wXnm7Glg;6U(N9{wsdOy_eJBx6pihTHZ8T@WBv0_{MMcknf}(_tgYTm%S34> z-FahzZ%tSbm>cqYM(NErVeZ={`}BQNzAE3|`j$aIyUdHNTl5S(D-d!uGBz0P{zS)o zenLBtI-#ndU&p?Os8Y-GIc9W!qJ=KqKlH5IRH~{*IUi10oja@ePp12j#Ge5TExuzP_RDrC)BSD#(DH5d zj<+?HD(5fQ+S^<8`||d6@6D}`l*{L}b`4hp-6?`ag^A>GkcF=dvliODp|1v}fnXIB zU+FYt$da3}pJso0f2VK6(Q7$*lC}N)x+=eW(|*J3HC53EWrkj;UeVS`|7F&&jxrCU z{1c@O&E(eljdtCoS7>aywbO|(+3S*i118SX%iBEhL1Dhm)>~Dw=PokZAw<& z+w3-EM6bsiv{f&4u&McUX-`$vc}M3HPng%Vv!Q1} z@{8*^vhwRo{}iA5E1zDU^*dMTo@emr#$S~`QpXRxG+0ainptDZjpY$T*Nj+Fc)g8p zwngUnz-Ezt4&Z?MxiI=*r~($5yYlj@ppFkKbb=jK$GqyG%s$oSy23EI?A5k=G-~cl z>O4|mOE+OG0xi6C&CKid7K|+bIrK#*2+kwKM?zwM=kVsoYhy$W*cPHWXzeUmS{R>1 zrFN1K5xOwqO77q91}Q^Kyo4(d?#No3XLbv0wjHYZyGP^0;Ho>DEa;IY+yO{B!Yv16mL;S%9^J5f69#1LZG5#QfglCR4a)s4(mDE+3f z@Tg*gt*3m3Mq)k$aRI}k=x0#t>+37%h`g<)ovwd%j_22Rvr!;h{6~9d@*|Rwe0+Xi z=&99tL9d7#SNJHTI;t^cgY)T;{YH^-ns7L&=+RYC=KS)hi8IrcQw+*VAva4icgNSN zL!_A)KayHe;B9xcx0iKAr(;(-*bG87n1KC?$ZMN(O6aTchl3MebNbH3!!J#LT%MWa zecE~B>he2nD@~vhkno2jnNZEP5(37)D55yS6OxzB27EVh3;tLk!_cl4-E8JMU7Kxq z$OxC{%uE>~@%j=ob7_mR15E9cB}rS{1yhCyr^R)y&pg#k2QYwS_TH39-v(z!Fc>Wx zZ{Hi$V?%N`IUN) 
zOZMRnIx{hMWnp;iVTRGbkEzM!%iz9X+KcQEqcF)=M7In^RF5M!s{o+nqhIANT1e9o!V-13;PFheh=n4Q*_9m=iXv=15XuO8^S`%&k@2zl}$&S=Y)i2(TGN>#4%B+CY=9J7{hF{Rl$&*3{rZ%1HoTWKrQM)uxygMy8nV%=XeOmT}?qq}C{ zi^ZnlP%H4nUN&${rp$h`rRSxF!{`+(n_8!*1CEEZ>MH%pkM*xz7fGg_qUM6KD`oC( z&vt#<EY^H5&H{8Ts_B3j45;W*ra$wj7|>7+>N&(8@k21NJtsuH{|*3 zF&H*1Ikx1p;xnI&(~AqZ-*X6xVq`7>mcgD6&$Pe!d<&Hy$0h}0&&Xj zS#^_3?Er+2aPl014De9kFbk@Soo-Q8_w5hf?SF2D@zcwjBQEjO^k%CR!gvU-Pd~&c z6jgO#HxybnxTmO{g>wn1Cj>co)SZTe(H7%=;U^KJ>sD5`P8@Hby}dXW|6NWGlv8^J z_kvkbm(HCFdAP9>QOwyBX91mOtY@_ZieF(-gfnj>FRPd9!!#i*V|-u_F))CQ#YS`` zM!_&P{>+)+oSgBfai2KrOf0HWrpQHdq#n_ZHGMACv`l@)11k>ot4b>kW2e2-2zWRp zS?3?ftO7<1)~wRGVA#KZ{P%Azk#d1n|G)nvC!NyuKggKh6Tc8=b=(_Z-TwT#L4WrO zmpt66$QFxeKz0Y;0khYAW(mysFPJ_+kue9U3d))G$Z#O_AGaJLLCVV#roqg|U^531 zSm)(n4&TCb#tdhEjFAsI==;u0Qqz`W>xodi7sd^-v5UI3&{Jp)JE~?1oA(@tEtC1r zU%yT}QJ(SRd^g*^H~}S|IV1AK)Svzj_YUKStK@$W^{jq(?7)G6?6?R=0!f^R_~yji zkOK;#Q2goBVn{Jh!z!rMim?ycCXq9YWD(kdZaa;6C^ndhjU#Z6nQ_k>2M!)w5q0K4N-$Is=Dd=i3}wmF>!gs)>6i=c-2stP!(UUs%kyHJkPgvD^{VqZa8yXAxxijXHR6H~tYp*9UjB?f|#_2lGyp7S4?6 zYa0lk69X|p>*W}LEoEkBcX-_FwRp}7)-c{G;^|_CC`VMqw-LUT)RuxS1a#Ui!Tq<$7j~xU0A(f<}aA0Cj-am#HplH)Qdtp<|okbtiP{j67k8Ki90na}!D1tXqwJ z$E~@d>A>S6vWMYl@O%ejV1&P)>Cs7BG{v=r4k~0_9K1r^##uM=(=Wb?V2`rNiv_}i z1+R+2F%fyRaG@tuS!iY{1z`e_7Nf9z%Id3=8jYE)3g$NaaS^7%sVvf`BsjLW;hm0) z1vH!&qi5WYYjfnWhJu8r_(-%hB3D!Jdl*^5PKsd_ z$Y2vMG9sAA)91a!T%R3m52Cv8u&~8ZBRH9{qV$CR&6|icW-To(NEQ;Ed-lA>AX)JE z=(bpYJCNaz>HRfZ)g(Z1DaKXzn5hnCL~h}Y5IE+;FLeF!Y%}K6R^c-fBbaY%l9M{4 z^u~=a)>=H5(hT}|^gLTzO{;nHOvpev{*~Ki#_GLCw#aacBl-(9G&T2AyClF6S`4-=0QbSc$4VAPV z{l0Q3E=2pVvtC$-QqP>*9$xOUpH5FTr2l05pG@4<^6f|DLm0hBrs49aPq)WZ?MGat z6&V#}23_)=l!Cs>pSP1+!^E31#je97IFdgTO%v~%a^ys!TN#jImur2mZrweG;0bM{ zWK4KkE`D|DVT%iH=2v89=9IJ%P%q+nh7W!h7Y;sg=vU;>CLDCgF~f8Ko9e`Szpt-P zpeQlD_-sv32F9UHqiI|tVTY-h&S~1>Y(A3ofjW%EQM~ODU%cU2_S~E|QH$bfKux#` z|8C9g*fFzCCnbgAj*tLv!yFcE4m;@Jg!p(9O0@k^BOJ8wmK-{0kkKAqee2J2#~a(F-eSv(7mG+~U5`>>Y2*kWe5iB(J}eA~ 
zc!F9@IMt%q`r_HIRzQTf#~Tl;sH`-HY<>^-{j^;wCyp3&JSLrHGy9%%){B9xV&qw_1#!qSGOAhAqx}1<1Wk= z6eEUHq(S;cd2D&jrZ2?QS$G@g!@3MOgcBzV18J(ysW z&L&C;DKzI?B7qk0A`mwPyY{PhenS7`YDwWZS$;y6-pV4ge%*Kh{Nic48!j@c@EOtC z{^(9cr~#Z-sIVev6hcikuoP)2MHGjp($d~?$I%mXQ&tWlJrF{TU@Kz;o)Ntgme(^p z4ak;(@w$7@p5xq7ijhtbCkCgPy~Zgx4(xCH$o}}_qwRKSPC#jMxh^m!15p;2xC2IzCf|IAs)ta z^x67KJ0a4h3TAT=0zb&(P?1R8yhZdJE@B537I0Y*iF;^0y>@LUD`LUVbM3VZj#(nI zFrYyj@nj>$<6L)v8=(i~Ymrk^Ueo(KmyKE#pGPBBKenbL(Z*Fy&y?nTrDa?VbrPA9 z@aiQnMhMcODT(5eufq>e1A$H5CBuq(J#k?HrWzvc1jZggkAhnblo$KlWOzzYyHqr6 zCs7b%e){qF7U8Ck^p$`}dA>WyKnc)|}I zy2UeqIb#WW-*(;9aqyhw<<%k|Y2NG1@LN31(FA!7l79$rMLyMSk9L&QXgYAu53o)i zggNDnm`c2g=mE@t38-;}qUHPc9XsaCpFbV`chHRuf=77Q_wW4=1q&P;1Uk0x->d!p zeXITt`g^p|FCie~x`6>#%=MoA`pu-c;`!#LTN0>#^ypD9U*vC8XVmD$Bt z;IVi)AQ1OO&%eK0)EhFe*yGw@8g4?x#Q|dRA_O8as1%(pX=3>CI&Wx$`69UJ8u(X= z5hB<|gq!BsC&N;txB{G@G3a@tr=p@Um<;RV5)*Hfn{$pCgQidF-eRDbSDWZxHgJCd>cVZ$;nosY8`P=``TiRMvUne_F4r#EJ^U48ucLYz7?C`%6_ zg8?Dv*>IEWk5lvXc3X7r(uJjqyI^dyaB^j}Ky1$ub?%Rr7Gdsy@X4>Laz6raB0#6J zjkMo>eD`LjwT;OmyRxHAIRBWiQPkR}NTR}T17D}^_`wuvJ=~C;`HyzryM^_^}>X2aY zKf2Y!hjCoK%_`f{U?i8?(?$*_(R;0?RkBe;`90$k%4^?Kew!ykMS2B=9kc8k;kJL!>kiGl8uNJ~Oqs zFprSXYU!s}nmFf_O}W_&R?`3udgIXwYMT}B1KicClQf3SYbg~o-87R7$QD5`vSoo{n}eM&+CJMsVY z&?m}G+}#_NUTPDs>v}t! 
zl6QD??uCn*Da&72BsbQlUAo84Cjy`n6W=bK-gA|Di}V;Gm>;j)Up8#w>!=TJLAhzH zndcm$5f(Nw1Qh=b8lYNr1|+d#Wn=InoB7-x&WvEL{$A=#>fll)qA~Iw8%N(5oHbJC zd+>-pNx9wUzaN-325^v*Z~3LfLFpv$ANw5(4m+QY`?`%p5&N1Us5JaWH}wAkn>FM=e3Q)Qm@^KEkj!6*@{Z?tI__`|mQx z;uG>o{|O)NFDCMYD3rvBU7qz}@gg@$I(4cDtwHF~bAbH7JbZ|l<@Z@bT*}%9Bg%aG zKb`~l@dYxukQZNS{j&-na@h+uHTS5ra=+oAG)yO`r$F-nnz>p{Nj|2gHIhYqE=|Zj%n38bs|3C|Tc*e`dBo`putSO4B!V`u zieTq>|ZNw_SlK)YuWhQwEP8pL3&t;)k>GsoL|)A+@}H^Fn32}XZzj+QvwkFM zRVIWqt_Ks?-4UI131tcx9TGouEflKVYX68soGo*S%zO1B>{1yhCL%k(J$cE<_hv#0?r z>BZ>>X}2pWmSkscz#=0_=jx_=H(#>^QPdweCOh#A{@zbF~%|C zcv8vj(K@?;pu2gowI3u3$Mf7lKlR+z0crTP8|8anr+j<|pH?rMo>Y8NhlvjWw zI-k~3_Fy9HR9)yVl%>Kdg8O&gwL*hZQ5>lp1Dd`R2t2@N7Mu%UB*9a|F_W>X>InT{ zHM3H}WGH?0zg+2N;ZxT(r6*4;HS;a8y;B5ppO=ddU}PlJDX8OFm_oox76~(|M+0s`` z63+sbFE|BKfsVuV<~8H=%ZM6SbWC$lzxEet0OK}&RuN{0F}fY%9<$9PrKSoUDg|zF zpUrV$lvRhhR)8qQq!y!?7RM1FfG4&O#39b$ZlbcPW$IBKHof0V>U5kJ8%6Zan??AN z@_>o(RtDS(I4OngY*s(cWuuEZ$DR56WuLDoaD=r$pu!|MJVK-v6!?hGD()lNS|yUP zE^Yk-P<{Jb*l=6+M% zqNebt^2ywnSI&_Ks$&ShNLcH{j#=fA-kNgp#c@XczrSBUI^azbpxF@WCgCg|YKqeuJm;vnF&ZEaUZjnLPB zrs?4Hu7SaRIvAbcdr&a+0&R<4?7kg#$oY*1PCDbZPcfvapcW8TRV$xnhA?IqNey(u zs7YBt!d{nkynElivh+e4V+4^)RzXqXb8s7>G*aX~3XieapQ!6vs zYDGvKO~Q$C+Xp2B2Mll_SccaUBL$z9H8l0TP~?EGkG@&S#?2r|1VD*Bc9#AZY1Y~3 z=}{bfOF2BnBSC#3^z&=KniG=dv+2UzNsMF$Bh1UHCc55yDK<@{~gz@QMgyTrEbpzUIKDMHL>g!hTWAPnevLM>&K|0#I5w zIpI6bgR=y$WRvhKEG%IRywH%})8LOLs0p-mYq8yt9^rE5p1dbVCm|;% z28^M}=Ato^DRx+FghWFnbFY{6Q%7w;1K;95H(E7-ceo^fR#u}m4MOS+U>#8;IN9pDWaYFc!%?$!mA*@z)yBJWmJRjrh3 z=b;6c286y4a_Op7pMt*iq%p@T52FgB)KCiVMN&OW-@bXXezw6u;}(n%MvO9ErD!*! 
zOrKGPu;6NF*i!!C*bQe4hL!W}_t6s?4ZV~4fVWJwD&P}wf--s>KH%toDx?3I@|J0I zxj*;+%>;NrM{6-Q{-5b_hS|1%DyRQ>g#hc_|MypyN&Y{-EoU?6vSW|``K$cHFj)u+ z{?E((KaQXOKfbxE`FsNj#DL6gW~k4NXkh*FtVuAK!qez7^aEB7_Xq3&@z+4pu2hH4 zOPBIIkseY++^_^`)$BAZ6@U(j&WegVP}g%xf`m+tuhLhGQHb#^K$O0Era0qi!&K1gmM_H=}vK><0F^64n(W5We zqS%7^Z~f7RXTrQn55W@wVW>l65X9bNcQcY?U34tv3={L&J4~TiHm~PVq(j^;FCiBFw zS9XOX0n{%f{z8EWdMgaUM9w-8mLNCca|maI4ebzcE#$|WoW#cbbukLX)*d=iIGfwT zpHc0ebf)G)luaf;Lh^vl8NDiVpSxYKVFEIIc$9$wif}R8rJ2cGXC)o*k5(pz)l5vH zQq5wg-#VKqY;L)6Vx~#cB)lb=KVjW21;QpQhdGVGk3_5yb*DB5kZd*HYr-smmc0{q zklIhQvf#S7uXgq0oc&nv%R_=%UP9QpFbMpiP_R2@tO{ZUf7T!mCT~ z-o)a_*`z$YcX5Vi1o(v+*QVPW+Y{wwSk*LDafXY;M}lPdRl(~-sygkMB@4Tl(iL1w zf#UKRwqIysfxoe!j-x>mmQ|}>1XGzXo;XR3$b6hYqN@|XKbwkiFm$bJ&Z9)gHKYg1 zpxVr*9-~`@0GEL)d?GoHoqHE8JR?L#AydbgO?RneVxH!!nX-Ee^)Q{Dfq;A_dh75& z0S?j9gG&oHIsrgZb-d=X%{fIU-j-ttXiQ`UjvxQTC!}NzhtgD@M8MXlf*t{S!pZ^w z(j?sZMJ>X4W)6bK2!*mTxeku z$_v4o8ii-SIAAa@0?*NPduYjbqX8u(P~`NmGiWnK!aS0+lk~4CT{1?BI+DXF!kx>B zYm_kZU`rCg>9p`Th^{6xi55jfbhh6v@*KGW1cq(~$m9J)NdnwW_#MX!+YptWpj8Rz znuj@KMK|WH!Q?f7m;Y&454$IGfG8sbRTtNsyOi6zrW~uL6s0VubC+O;K)M`feRH4I z#EE@5teFPgdc!6*d(k2{J}Dpw@N$CKZ41Ki?vLp>xS3=-ri(nRXI*c-T0fEBlnOpIkfHj<0N4+Zh#2eU0~ ztKayx{^}3adk{}!T%3yHaGNV22tm}5>}rBzLJ`VJyCp*G>0}6=EUv0LD%`)t*c`-` zYbJO`%;Cko0%x+vnM3&q5VAi+%{ZJliZh=Ac%6NW=P ze!F>|#vi`(?Cw(}^r*9er47Yug{FitN)JXsd_N*?Zi_7w2(s6>HSr0nLgGTcfI{s9 zfp7X}0V*O=Mdb${=QzAFHqQHSW}qSo%N!VO{Nl+O2QT9fbsfUUE!wb9wcX761H77Q zN78Nz*AOJeh8##-?}2!m{|`lhYb{1wtSVK0$lzkZPwNHUS!`W7YQ3N?ynAq}8*Opi z<6Rlp)r7~ZpNKZb80I1~L6&_UhJhB}=37}E46*euyhl^_nHXgZXqfQnGwT{>N2~Q;+wM`ssJkgzHlB;8SPx2W{0WYNj zvo`Sv&Y?~vWd&O{gRKC}iI(nwy?471<6@jMuJJ*4SSJgf%^?U^0*__qjDYdwv~OYw zX5P19zqBziGh4#Wz`Kfr0GyY!RdzlalxuX;yM!gCSWHASbQkH95~6_uCl>3H3i|b_ z#3Lg8O>oD-7NThfX44E(eD}iFfkU9?%^MJ$Y!HZp`d*_ba9AnA6jt~|uu+NNkg@DF zRM#S35w9j~X3@~?S$fG9(qEHf=pl=rkpskm|9o}xAPM` zsxuCck7U_)JS*)sj>N_t3}C8+_&-6Sk@|s`HeXg?)S^3K zd|5&dCp=E5cAjCU0zs*LV8y*2ov){cG7Zt7nB%8CnZ+~}Foog}6=p`LLr7d;C-tA1 
zDCNA-0fRk8ysU3oM=n0B{5|j_7TD#2KehulUU0$+BEEUFS zM<@MBzqWJ7j#0=Nm;l~-O1pt;HttX_KYr|F0Dw~FqCC&a_s&Ah^$ANOP?XT@E zZa(u>Mp0>LpNtMFLcle2YR!WstEgprokeFw<2y^FhefF=c= zi(s;~-!6%6fN@hc-cBex8SN%{mLbuWJ33sgL%TCI_ijW=*nGacRf9Y@)UNFL5@_bldubIbP+P zB^p(hz5ZE=Y@5Lr*SvN^Ro33zFzR_)zh1ho-G%*+o}TrlCvqOWIHEx{cWT+?kN|#W z*}nBq@*y|g(2F!%U(s8Zg~;$LGvj)|B~^@=;9dxuQW5b7h?chL&h~ybpgKyz&_M)s zGsNNki7fH+JMj|g;+B>s$@RjJK~k&~ttTIAYoSFfqYDP~NZajHfBFzi^PoEKg;cA^2phb6IDjR*q$LP0fz@Vjm_o zs79nOvnLnozY=&7rIl9W?{HFP#2gAX9 zn0xhaBDtSPxdK0f_&pAY?dB-Kz&zez?sW|>8HrIio;e;+IjUU9LS@Vr$yI>&M~*xK@<5#Z?$@`Tp1&cZ;TVWSqV=GRgDXcJ4GifK zi24B_Rjdrgq+?8i3J_p0idz$}{N4o+2AJ8&owR`yH*3#*$09mRc?VvB!zJXoA3wk5 zQ)Qkiz}WkdY!njj(xpp41uZRDLsIdc)&aJp+k7(0emd-sPbV2d5B1>p(NVioMhBKI zQeA%$jSi5ETtLvW^~a~$L@IC9u$Ab~o}axk=(b~3h`xx2#Pr?Wu#8hDu0#&k!^lEP zYzlx+qR7Y3qleM}ZZd3!ce%0KsRY5IokPI@_4vasSqT!g#DK9rKbNuL((t!rkjqRO z6)5abe?EZYgE=opKZ{&g%NRVeJQc9kf)gl6GVpsjYS0!Sxx@#B73W;&+7!uc8=iw` zG73bisT&yVU21_hm5fs1QguweNbqGuBB44tfRRS{z{B`rm3&b*0casXY#W4XM=7_l zD?pkV9bJz&moHo=Z+&m(<6Z~+* zqLyE^dHoE!#6Y!5gPfh7E{OQI7#MiA{f5g5avKZ*AS4Vb zT?_`oL_f2{T5S%Spmni6Nw^`R}o-%1@!k0v<@(U=V5;FSgf%xmNQP#7a`Wzke$BNU&~!j|G$R$N(^+zlYu!3qK8q z+6dL?h7;-?$*c4U5M8w54Auuxt^)*^-I|!4o!LDOCc7a0&52lBH;>k3p z2<&_d>;!Unhe9Fn>QJj<1;0hRjwH+!{B&Yf!aSZEL_Z#rTZ|wqNTba^hHoT18`6Q$ ziFu8s#S%48cXu3=>O@tExJ{?AXS5Usiu(GUiY}&EpCYjaNs9x1kYogrA|3Z7W;{oO z5}QLeVk-8f?D&T-By<(&(yoO$==09Zq_CG+>^^O)T7@2+Zy zenWNXRCpJC_YAavWC}YFdXlh-m$8EM0Bb~ZkyZmh^mE+o7=`IF=F3fQ%8cUPHmZIR z3L!z$(t8N~3&p#~-+u#eMh)%9fAt9TtZ@->#$5_T`bz3Fk? 
zy~g`MC>zO-Ut3+21IT^~>r62OukHkqjJ4|#{|{a@QNAIp4C=aRKo1uoak__`MBD() z1eBL>yq}gOLT?&@*G(n@&Gi9&Ln+J%UIyhJT3_+Jnr&OQ*g$4TC|48?@o2l$4bDzW zrYdgHCr3GQbpZCDQmhRXslJWCNN_iAu$5Ob7_k=u(T96ZeS^dC0^UFiYc?PvR5<;PJf&PP z=x}f(vd})mgx>i1LEur&Uy6e9vK|4qL6H=9$RV;@V1%$xSeocZ%sbCvWIKv?>Jp1u z#s3r*Hc^K?Q_4iE*p6f94S9c)y=4FbpJV6+q(K-bAp+nIyMl0Z=rcr|@;i}<939O) zn?fsGXMf1j&d#&`5Vpfph3ft-VJ#yp$?Rgg#Z?z-?K*(OLd)!re;0Z-j(f;&5!{i z=*|7Q-Er8UT0+`doijy|1TaES`lw&t5ho*H(0d69r*JpqnZ&{24Fn216J;?aK0v{3 zXH=&Ul|&OG)jdiFlmeEAh4B`^$5jp)M-p8>F=JuLLD!g~=}(7O@B=hXyhL+Lb2GFj z=OZo0jd4^H!5h#d#?vwu?{j+FAt)CilZ=re`=1*$O|`lWFX8zKA1UYs{=+XJtm^1^ z4x=Nw8A3Rz8w}pW4FL^RU2E%9ISKEC31ttA*3WO7g2(BT8nIlr?&>;Kg%i3YFBe|y z2Yr30Rmf%E9E+|1rvxhW!NOtUVaCQFDxbq;uOWzs=J2EKTv5CR4;iJsxF|XLqJ|xK z1i(0yZFUf3utW&qd?rRYv@8$Lyu)l0arYsW=hbNYufgw)PAOK17crGUo9Hp`+4^Nh zj+VLkDGQ6FZd(VZ{}0AS;@+iyo1v;A&@=ES$TNq|3O87Geudnc1P>?r%?5khemp1o zM2qijfB+K>N?MS>)y`63-pW-~Rb}uud>K-AhZzJHwXw&IGH!CK!e0QodcHgwS{j14 zU_Y}&WdZoZ>2VWhMoacfD6B5(p1!xA^_lK7+n!Q9#W!z=ApvP?ry+$A+FmH z#a#8yN0p4=j*h-$Hunb0_aa(i2%OIy+$bU`8B69z_g-wp5OD0wkspu4NJ`!zR)}slU@2@_^# z4jY>mOJKH7Z;_OU#0mwgC z4$iYxnLw0@n^gIE18_V6)+8Lg`44y%)IW4~9fqg)+dWUO>F>3$uox?(1Snd)Ypm4gziCpCTo!pfaTE}V;c5rjd>#DRKU41Q`7 zn_c2M2>!<9%Q-#%XqWS4@*odz^jYH4)3k&1IqUlA|*$ZDnD#!_^te32I{s2Tl zbu62d%oH!}sk!&&@HgX21uOw`8a#|BQEHo-#yGo@a6;frQbE4K-thBp8X6Gf{$KR< z$^mFdfX94Re4e~B5xuC!eD|&(C^^8IJGJ&g=!;Q0S##0bJj`=QrNBya7Y6F3J-ef;As7aU)cQ3RM(F z)igkczG+BeoGhjJbPQ5p!lHy>TpqhgxNx8XmRf)oh*0Mh7mp%Sy@25ZJ`=i~?r~2Ebv>7)xLdL|=g#;WI#Oj5;P$ z#}Uq=xHB6Wk|}MCSe7E0Sx^V!`2B@&Z#q+|2??C!UD?19DtPYnNkjxj|w@m|!LlKn^ zy~U0&W0XVFsII2bQsjfYCqpX(kg9a}mm0TyJOYS`+7j}=w>@YWQJ+cU!1u#25-djo zfS_;wmTs0w;Sz;;b7$Sn(t%I|Q_D3nfK`qr+8cyi>$o3|0cKbLQjU?lYI{qKN=;MIu&9!T zMEWu+h0SmhM8uxHaAx+y$!%ut6)>;?BeBIPK&lp5*`Ub%rDO0?IO-b&gD4D>{018; zogF4HH;eGm8wLVghpIDLeew*3?%eJKoQ*YN_()%B2(tjZ%;rp~DRQ=;cSeq~klB`N z-)}zA<C+5Uxk1IulNz9j1u-=M*B6&65!k+)ZNC6{85(WYuyj?0b^*+8g)R3rh=F!as`K*D0bmK@SHZ*-mD)u}T{}C- zvu{CKFCajRQ=4!ui1~$vQUn$SY!C<;;WBd3DaP9a@{W27w*Vl}m+#--McM&in3H&Q 
zsNl&oHFOH_5B^UnjJ5^%O)a1lJV!(ZWW9^$`5#w)AqUp#50+RagmQp6TmgC$Ol$`M zeQ|by7MB(;J`BXvT&hDOBWT3lf@3R%!wFQVllvX~P(T&g*@7f*-qw}}zNVz&;utj8 zhQ=PXnowxxcJm3r;0LJ@#IX;s`cTIax(a+5E{yf=Hx$E^u7je6Py}V7_?15%Fv7lV z8pi4OCu3X`S_&k%zQn-^;EK$qLE(BRIN!<2%F5gzwQItt#{WvZx~<6nzRic*;Q0e~ zK}m|o4t#yY1^gi1GoCeGALRV^Fb)&05>O@h#>ClzrH4nFhG=bk4463JoY8ESWDP?K zFar}4);JHU8tjz2fUKc~SJl^#>3%>oH3--q9#(_|1=Tp%rB+F!=Ue z7vujF2}{ye45ZSZYvuUbE!>-dx<_t9i=|lH+g9k@6JRCrfeG zPTa_s0xnf7)OB+|r{3+2I(}gv_HEIn)z1c@hlTBPib`vrbxm}G&L?JCbo;oD#gK7* z@mwU+82~_b4sB z&&WF7`T8II@r(`(3_cqL8WvU#WXeF5O$2x(a{-rV=*OyJ7DQ9vdBjwQy~D$VPs0j0 z4I$5O-=-qgErc$>4nk&8qZf|Ad_mNVaZo?PwG@+kIrJjDHpFF%9f%F5l=aCHn~jbx z6{{Z^1AuZdxrmC`nE}DU;id#)2O*I#YGf;JYc8s^ZS6v z{bz~B^|1fgG4>$&T5LGME9&Q2fytnSAuJ5eQxX-%?l*k`iW}g?L?HlKD2PCqIW5{# zRdFF0mw(w%tA_#eZ_9~02Y{U=tZ%kvq+7wCy)ZkZ3WqmgR(|xB&23qHNe?du#OlOB zjP4Qy0@}Up0G8?zy^iY+?W_@k>0#j@-~*X^y?wjom?Q0KdePHbepNEybO_*-7+WTh zyrHWEoW56eyKCjlX3!@@Ko3#gEH0D6bG3m2O7AL-?4hL=jv#?F{3Q(R33pObWHIeoG~fa3N~n~#fg$7sZwm4nNZC@X+}kil zM+0dsJF(hAn|qJ9AVk;W&>i+I*RIWqJGitgg81?PS03!c|LgRkK&A%R{3o)i3I|3&VqAAF! 
z0=x>ow%6cpAai(vzPz-w6tw9z>`?e@Ak`sN|D{XyZEXeJ*Y4x=BtuRUyhnDSPFq_H zShLH${=^#%BH#!JFz#Vy7~!aaQ`92AyN-zaiJ1@_FeH$>(fATOHyAN=nfva0*J@tl zQpM5q-%uH3G*Lp$`#e_+M=+P_0Co@9mT6FJWSTh{D^4t#8&=P!d=HG@j+haY0u87; z<3NJ|C?L=^kP8&a@<2!s?M#FA*aD>s;XZ+Wknu1@-e!s)jykmI8=xvfjnaaVVPr_e zdg1~Dn4gZlgQ;J*0=-LEXhyj!U)WZ2$|GY3dI^R#~k<`XdpI|;~zG<=l(S{w1CPpW& z$gWJB{XVrVT$X2Qs%Hl(m!{U!ig~Sd9pb@K7k4+kpS%_uyOEJM=DCLhA2Qu1?(C-K z=I)z=Ir)KHBaXK{@C;3*U<8}8{Mp zgq~ja5;rx8wI?3Zk*j(ePB^9GoQ5Ab0duhuF=BWx5Sh1+X-~_^-i3^I8X$uOjC$xwVR@IX`In9a>)hwj%ujXCS#@~aocnwbgV?-p zF4j`oTrW!BeCl!FTa6?ByKioZLUfwLgM%ll*JAhYTYqeI=Qn?o1O@}Xp*YLOTYY}t zr`$d7Q##DD?4=q>$@xAZdbogs$i`q`9(kgRBGWGM3JY9{47e`JBwv5`Skej)$aOV5x_izH!!$V1!w z#K?fmBEp3U2rmKRdE9-}baWI$jbCB>sySaCa{NLHSxOK-5ue1sfaN+|p}xi6`JdM=NQP`1XO;|F`%fK0ZXN$udjpvIx^J^7Z&8*q@D5>&&kdv5-_O$ zFr&xZ{5{6aVcz;rn~UddZ6JjyJkj|S1AICD|l5jy(Y&t-|wfRJ7jJ_#U0jTRnyzdx}MH1w&Ea zUghigCXDSC{shi>gR~*a&2>M!^DkzIlb9LAa@3+p07@CiEbz0b@t)n|jT>vf?inB7 z%PGI=&}V*Fe%tPQEuuTW|68<}KIa`%p1YOj)QyU%m^=xOr!vyzwp-|Su8W;}voaMh zJoB9thn6 z0s;uv=~D8QNZKH!7purBFE1a!s4LtM@j~J#wTapXm?y416WxR?;teLc2!NEhLC3yT zN#m><4(3fLwLF)d1(Ox)X?4xG5*-4DmAG;MW1OGrtEtb;>KH?fPUa{5zfvFat_A?~ zWV#Ks2oAH^VB27)VPau9Y%+;72ngSJczBdbA5p(0R0bGnmMU9#`-}{_;d?}HMl>_{ zbEt~&OE840v+@o(7|~84C?caJru&Cl09;^lk0;eWT!vnnOpQ5Ww$m_m6C^oLkcL7g zULeX6-8sT0tYMD7DpUIr@9g5)pCHiz@k6kSX)<|TgxvsA9^_Q{QUg{ikSkSujE#wD z^)EIcz6+4KH)c0$Lxqmti(5AlNj#&4qdyk@*tU55Z{$3*h~^`H-Zwkm{qnS89x|%4 zONg+euzSDq)n@UBcK4FQqZbwTR{sqfbrgzqFZC8{Cb%ukL@WwmxM;aGb>%XT-{NEE zAC52b18QVUXlL(OB_5ll8~(E}TW+A-;B)NH(;HvDbd;8TvT$8v?MRnmU`#2q|7D;V zCi8cxiFe`0!rD1-HiV#tQw@^d>DXB@ex|~Re>H2hDd9?FIvGSdK-AlQeiLHrqzRI# z$kb|FTxewIHmt$fR2{~?<>bxAn-y7Em6P*0;`T3Yh-bUE<<#~5MPIkf(UB>U)<*NY z39=SrI^8Zu4(TTBYJC^9bE+wQsn?CE_%oeYS0(#lVk!>wi;7N@Dmlyb*=}3N*_u`U)}ijtjp81t6)ESJ z_vhx$W{)rSz1q0#<(shSfl$pZt;>^iiC1`94;!1bJP)C!p8fMKV!w^Smyf#MVZ*l> zd=9%FdylS_XYcnTtE-Uft|O|xjMKjTKlnJ-ltV)H%IvPJa?#2R^!|R0GPh+8^OknC zW(zVC(i`=5O6qIWHQs$3Q%x=u8DI z4^6i^dzOAyo9VNYRqKEfUwk5Y?FoaY=b}LIW43E(#-Xr&)E)K@z}^7PEd=F_6posz 
z8bHi%AI&ZnD0kY}m=!%0iLFDGfM8F^P#}e6#jgW*gjv8~fuI%M(rA7YHzcvb;L1cz zL>#SX^r4AQt+#>s&IUdtC_Kq4Mp1$iO7r<oyV;6A2oD_7hMxkYvC*o|^k3M_XTIBsL3I$HqS@A7%Fz z!kErkItyb~jmJLAV#v<1vz7L^x206BJnN`b?fh6_&vc7O%fu1sOOavvRqT#VS)Xfa zXkPr0h6HaXun#CH>3tUc*CPr9cP#vJkK{y@|5e>z)N(6uZu~|itCw`O$~Wl@FJ``m zJlgG@KCcs1OcUhHckJ)ashi`8`;P29wsbrpXm!w2X8*=qp1|_h*vjHk^&QKLWQ7#{ zoxc`B^GUIFRB3Q@RQcSwC*CnV@854lkqko=o)v+6Fv$~0F*+CA0qrQ0V{(Qhv=F`k z))Ikb-;sb-iPi)=)1=%ukVQU6AEiq7P#lV*_~;@aR}xJDj~+gBJf^(26pbwA;tX@~ z@}6-o(|*UMkdoV8s>J-VrG@r|vF88qG$Jgy#0+3cO)cbstP2wClz`ioN`77^d{;M ztz!`Z^#NXqoIP-%501sU-(sXPzRDtDSXEg`{DII7Kmb5-6}T?(jpNQi-DC%21@ztz zHeMq33KR%SB4@>qnV@kfjZmaUTN!H_TUs zdaPNPlES#sOGw(2eeh?uZ=7@qWn0_gARJ9i-4@utJip;LZxVkc$(g3EK#^_!?{DAO z*!N#T%Iej-11>eiRaN<|g@!z2&6bdu)mY5RGcUXI``55tp?}rSXHa&_tzVz-{ak#H z^mv|W<0UTWe}QJ~`h!C;1d0#>nd7E~Mb~%T5&xAVkO}1+ z3n={l=L`x!Kp*hEa6zL+k{y-yHaZoEJc<($6ePoIcViG4UJq(sVYc`ICD@;Ev`B;g zh3s|bRdl!bq~Lx?g8~o+6e5}b2~T zr)ciyVv#Px$r*@ygZOm-B|>V7^dex9LCBG{x5jgwY zb*LBnMAl5H1q8f*Pd;xQGU`-MYQyAj_p6bK{$Jjr2~<&ML%vQf#pY8m@Ya=1dE<&1GR~+&mcM+{2TRRmR})lP_;iPHxQpofTzG;;$tw4|g@IZGAc4kS4cjll#kO zS!L;AVa|_)jX0I<&!@Vc{amDa^2(JK9>*)VDZD^yc;hgIFJ-T=@Kzvu1jq#-0-L1` z7zy%hVSph($WhbWeJ@0TbJ{_6LB#P8<01qC;1u9;Vh)7$kI2m-9*oI-_EH{+5YyoS zE7!rlfYb}Y4gnd~4~<_Ex?x-bvmlJ-@Xz4*IB8;u)Dg@jk+2-adpqglQ|^73j7^51 z0{fT%K3Gp^6M;XYux)^y8NU}a7vx5`=7|OkLM$mMDfq;3zbFGFgt8m|BIf(s8S66h z*|P!QBImW2;lgS~>Kd71_+KX!mk3i0j_A^g3XpQ|&_Xq%tbPOi2?`7guqrYBhXOZ4 zrj3Lm@X>`({lS3}FZdX2eFk<(jy`+oL@gI-uR4+_dI($6~%Tn`5$}ar>fNSwB~tPj?wb-E0#aKYI%afA^9oQ zZIvCV1}C>j9N)l|x|ra+w)((&-~D^XBhz{pF;=l`aq?|1Z0fCXQb8#3J|rvB!sale zwYAr$ipqUxYHhkApuT@fBv9gOs|_6KfESjbz9d`RwQ7MyU;onQ{Mu{fmux&HO$&V< z9=djqzW=O=igUJBeoG6@nIw9KFmbUPb>*&!TGcIC!XhFJ`D4G=S!QMqc&QHNc}xmk z?r?4WH~y3Eh0|zqhhDZ^5R_17{bn|yC;X1k5hAfdJ2rr3tQZ*@ zusp)?-1fv?98 z3G$1Y!M12~V!rC6FwD+P?GZ>zCNttmI2aN8xeqrJj1`Y$eVwNtG$ktlAu$V??6`PM z{fK+(#5b8?)*j>INya}Ur<#xjvkVfcQM~(PcB3!xduz)K_-sAm9SBeQq-%m`Z#2~w z9~``}z|SH%5OIuLkBzl|rg&OnB8FSqnAJ6Y;l`?MO 
z+-A=C&#(J#;@d*g40=t=Kdgo_sBQ0*4LeeDp6aC0U@_?j6_1_5-A|%I@h!0zqY@Ka zfr%2;K5@qY@rR9rV+FIxarmgvJmu1+C^J2U+SZ}BbVj!IQ;tb&dC4eEl$RyC(tWtw zAW?+s1q|q32?5@@4kqk3@Q>um@+e~)Z^UE({C^$Hh4!vmz{o6JXw4o9DB+Zkp+~ON1Sh>h_Lq zRKDoYY=b`_%~HD>RTQux#1fNXmH>7TCh#7OHu7H^Iy$0YF=|iOWJ6PjY8wXT%({Q7 z+zCS1eAoo&dJw>PU}_E}f0^I1*bB!`b?xnuC@#=^lNY!yLI*=|&B%81OuOASQU69U zNyc9Ew(~aDFse5WqPG*D+bLdv=2EekGEIl?p;+8!YN7W7E$8TIf8E+fYfv;~y{{ru zx6bG1kZQrx^A`KY|Bdl#H;B|=j+7Yt!NFTMUY}__laF-KwshUzwxweH(LG~;GYa^lJaY-$+9dgoc4CQOe?dY;_u&| z8d){w;$nA7Pz%z_V3fYZY(uTHbMqCP2n6m!1lw0h13d?`OM-$=%w-QIb>;~0^dpUO zvS&G8H*NDv`LOC=-CYP-cWKanrJ=?4z2BDHsahXGn z3>G$Bslim7-9?%COwztAzbAu^T{+jzRbYtq!=%Q>Hd7<-)>5OJ!!kP*M4NBBrO3tC zHL>Gt0JBZapJ!BB-n^ZiA2Jy+So3qRtlI{2KkbiZRYwmwBy)?k_x9br@p|yxnM|y> zVCwYbl^^)@vXS??c(^?V+~*GE!kG=f?)Fc_io;9#svKtCgNgZQE2MmV_ z$)CBiTWVH#Znny|e?T$fx~!-c-{X@vZ?Bz{e{z!2!ZAK(v$$>JQxngW{HgE z&RrW~jL`1)M5jg%oCetWZjjCZxlXK6|<8gT8KF`U!o&vtXO)H#s%Za>Ib zcp3nf41kh437`a64gJoYw<~vZ%9ND_KP`I~|Ejmlk~?2!gX-72ItMNvottg05vCsz z+ej(DIBDtm;(}PLmj{{-L-=c92oQ1kQbYoCa39217=b0VIgYR!w0WR0E{?Tx!-NM) zcQbS#KoFXlHm$&NM?%CS3;DH^&?ma1xg@nIWF;`;Fhui9oEuQsq4e)q@h1Ag*hN=xGcODh)*z7+pzh9Ut|nW*;dd++g+>N+Lu zrUJptGs=R`5|0U8Hl~&G-kzK?rSCJP`08ez9fI-w3Jp|LWZua{hrj*(tIn@D8)XW) zCcxOb|NDatB}hbh#4NI`cb_tvzjml;__M`gl99Lti>dZ(m>Rc#YU7wq3+f*!7~K*B zR;565jGVuF=K18=u>vuE%?XO>E2J7GFL`8kB-(=Ay>F3qN z@_Tr;y&_0eMJDszUg-`03J3*`Am4)oS_}?&-J!PhMYtQXPO@!7Bs4#ARwM-3X zaq*zRzn=?i`hG)^jX654$p3QqQgi^y!N%d)jLxZsS0}UCV+{U0%3uZrfFLIXI(lfp zem5}@s+K07*L895%@)viGxPJQ_>sef;(3rkz&S^%NN^D`m-V=t?|>afuMbUMx)l&- z-CWaMAdjJ(0-6o^3c1(Z-F+NKn?i^*1DlKt%E8Q%Q2C@M05(ZX33?XJs3r|LNYEkY zBoXOEcMX+prqOdAeSPNms6VPa5Cp)?%GofwjGkO1P8Hnzxl-hknE6$-qe|oXc{mTmh$IYgl zh9;r+ZzTF{KwR;PX^At#v4Yc_XtuD*S)@k#`z>h7^|cE_UPs+J#T)qWK=g&Vbg#*k zf1wXsIAm!O1B4U8eXU;#NHFK-!(teX?1vq@=7HxDUk%h{#5F@U3LqOq2hedGdj$SF zi9e|`Rg3E4y>^9fU1J`29hMvlaJrJ}bqgyg*_F0~n*x z0FHrpZ!sXj>ebCH*(N00fRXHDZ!gSM*yRop1t?Q3Yi&dm z9@U5w+k~}>c>_dmAQpEA1qkle8@HN!mIa~30?hXsrf+fK 
zqyc|4iEM??s10N5&_i5qeN?u6(GF63;!RQfwm`HYgqO!xhn$mqb+nK8>crCot>%40 zeeXz7*81~J3lp5`*VMQvKF!bX9(lXGN2?>(YzxAtR1ia*shb(8>Ma*wzLZwI$WB3V z-=%ZCuP@cs7zk!)9J{KAGmOm}tvRoDO!sp(>exTMJu{PP+#P^A9bj~NqvZhPfP~q_ zw;{n%3lm*XbZHi!D5AEc-NJZwCH?80~I(5QZ?b6Xb(5aF2iC}qRX+vo{<@>My)h)uIqmjz{AuP_eb?bm~vIj(- zNJef!6qH89HX%Vl5b7g@9D-g6y7Km1GiegPh5rJx&&deCDw|sH+wXzDU{hz7RVWM` z*e@ow9kwJeY<5RS(Axn!L`;`03PjSy;iwl!%T7)L^vaWLvS!JANRBWPvVc(GUU~ukGR$6U$HVFC?afLse-SPX<*++2Z7zX z3joMTmI1C2aG$nF6?yweDhi-ebXhj1_Caggkb`Ut`>cI;?C9sn)w-IPIGt0?;rQbA z{VUUZb#wR=#*$9fi+5F3l-e%48x~QS`x^~G&%)Abm!c$U6T)`t)SpO+`C~HkgE<{z z_nNHJ^$sDz2Am-Am^w}PZoW*~A!rzh{2tjY7=aY=sRe{SN$tinMTQ%oro+U;0`>QH zfo!!jEq#t60WawALC5hz`bJt^tjO}bI-)OwcNYU2>h@nMe*OFeG^tPlL(Pih4ei$& zKLLRqvj1=uwG4;=O@O2%#DdIjY55mfDN7MVD+b2ykf*1Lj13TRv}af{Tfssr=5_7x z-bN-#;A5W2*jw`o95G6vt5HX?Z3^Xxd&t*R$(CUKpX)Qi#c)!Aqd6D3=QN_P94aCN(>>c%>ZbigiHr#{W04p z3UsKf9*xuGVwLX5#BCh@eC{cT$i=}h@^>n>qa=@as*RSr%Y`Kv;&?U|O<|E)B4 zxpsVFPCqU3SkKHz)Ghf(Y)EcNHmrIG68(U!5VcVM%W8!@6~~XafpS^7xV*zd?*_q& z_*#;1IdZ=Gwae2_T0b6J*>JzH@hB4Qmi171Jul_1;QZs#1W!WTkhR4>dRG2pj7(hq z_*f-uPn7m+__7_MAA9mlziM$n3xyN=(l5z~kc*vyZl;+xu5BD_e5Vn-0$HD;u|IGa zK%`R!6iqxQAG#m3#cK@6q=q&DslE_dVR<8oX`p6p6=_Ss=pEV$ZPcSKy-6Wp7>Vz| znPJK^EGs}rrRZoy{6#ztct1Ky-K>m@?62u$4&l%R<{M`!1XzfQhKBRdp^v?q0rqs8 zRaDp8+3h>0q#XK{DTTq@SCi^FK&KVNu#k`@j8I>lT1&tUga;;H5Ox;~owm8sW^wJ*D4&ef;odvAx1z5(-va6jTA`+uF|$7g!9Wv1`J^`n+6b0~O=Fk8sNK{$yV z$PTCWU8D&TmoypZ%iDS?LRxzl*jexFLnUDNe(jrC{ zp;#Kx{CY=_!3U3m9rD9<%Y|fgZEbk~*bo38D-4Dw68=MaBNDTRh!4PaQQHth^AdDg z?n_F~NLUQ9@rilRAnyxM`k6(or-#CYZodU;sstvs8zeM{B_*CH0*66=^0_8J4o#mG zbON5kbWvV@fPsle6h{N_-LpxTpuZz7K?t7!rIJ{qpO_Xyl1)GX3u3Z_Kpd>#NSKzn za0fF7XV5o%z>lSTxQR7PN!YX$=kQsn`7FbC;ycgh2s*vUKmS-o`9qG$OsHabNbO!f zUiXW&9dn&SnQgZ(Q`hhxEVH!>ZPL$RbYPS}nPhi*X4kgxit87LMnWby4u3Dv7toj5 zvU$dYhcW%ISoigf*Ax_Pud`!2{IW5^(&l5mB41utUTCzsh={iSA17rW&^4rOC`7CX>UdK0z`h!drN)_FbL!M79HWSP4-r~~yaQKf2fjoGk5?GR z$Kc81Ys-;20Z@xPJmeq`N&*$jcKog3XDSG`5Jx_8R9s(FBi;^TBXU0+ow~1we4%bk 
zD#FvYg_Mbeqv`U8HuB>y)xRGELjrtqyfa7ria0N8*Mws8QEBOeqiLxxGtcBZ?}9s* zqW{rui!WWz?Y6?wj=p~nc`N9QNsbN#|4ytGhYwqRTlsAgq5id5ZAk0W1b0a?CHz*k z0FMM@_8Nn>h5dshRg;qj!;}!BOSHm>mEzgI{}q!V(~WH8$C~}zi3ED=BLwYp08Mc2 z^uO}3?D7_2Q~LF`S^CV?DW^^i#Od?b-*rq=OEb}Ra}Sj4W6Bh_vb^E?tg7mEV3k*a z=z8I%tY?>9l9s-7pKy2I4(9=RVgTO5fq=-iUx`(-&{S6^7TZ)DA%s3epcv|RGRg_D zV}8&gVCT0XX%RblH5^q8i?$&WlgNlqhn`{~oNXuay5!E(V>oie5e8M?k@C-6Wo=!IJ{OnnY#*RwPpfFu6+Q^l3Y&xpesN@zTRlf_x*W zUUM$=m*Czevs+Ov?~7`}z^g&19nV37w)pQ(o8e0Hnf?vhs_uNObenNt&uP^%6h2oJ z4Q3IVd~fwAQA9&l=S+P_Pj8vL_R506!3b_%+PS=k5-oYIC9bz#I0Y3NSC!M>G&Qpr zRAN|4zT!LM*_t4~u45u-@2BLT=_(p(1|9(krx32mAX_0Lqv^j>1Diyz{1MtGz^~Rj z_}Ck=Tg59hL0V5-zuma<_hpZXACDm2?mdbj>7_>yc=1;tXY@TX-|C)f!#`dpCn#l9 z)YPE-vBn{ao=aBNV!`otIAvOUt;VBoj!j&IfQs1LAG8% zmJm%&2>^P(=~_y{KET8X~HWY3h2#9*K~?oRo|2@Ez%w22e?i$g@vc18>4!1P0Q*~in$3* zf6md*Va0;(M}Mxkxf)~k7!r?SIl`ArjNd38&_*I!Q{$L+Q1!8GY7OWpPNbB)}XfW_waUs>vAPX@hKq>oIrQgH?9&_L5%3 zWf#cH$}az6Wnp0fCl8!ZERGI*jDFUY?~bYP9|U(G!h;n$4Zwx^1zl|6w#g3{Ty0ydmv6(y;A7>Hm%6G zU?5M2dU*b)(LuNBxw~3N&gk5DGCX$RhLEdkL3>|%7=0v74TASqM52Pld>2`y*W=uV zbBE!b@G+&b@@iSc*ys5G@IU&IGXUS{dJ&E&z=6w9oR6WOhiXKc~Iyz%i zW?E$T&!??q3u|xH-`(t6U{xUwe|MFoPhi|)2J-X^PsNnl1j8O!=(yo&? zKNt9wxs**h_66glIO%#(KdNOIiV75z#9siO(&WCOk3d z!`{P-1n!5DP-{rSI~~Sp-sa`yrMom4g`#f1(jC!%dB~8&95}fB0=SHzq;1O<3p&N? 
z*Bu$9c;3^*WQP4m`=;@aWiPI8-^ur81|0)nx0?s98oL3N@Vucf@QqC~UacaFySA=w z1FU1`zf~F~Eo`QsU@EAD6OByp_QR+-5CynCsKg#DKl=U>YgvOHu%81A|6;=}Pm) z3kVLSv61X;odek@OCbE?tVqXQgbK9-^<$?Nh(pq^B6}EUIgl?dZtAFDL}J`GVj~KS#zwy1^_HS9IRdC` z1cCLM|NhOn@KPY}_Byr-&rq|*9FrjT&-qDO8ZOSS9pf`HMD>I|hMJVH3=5j^H6eCeOQwHrD=1~*>c{I$IE8~db$1e0C6lf^!sq4QtA zx61Voh6fe;|G4+yQT!o>OM7Jbnd)zD35xWZe`@HKt-mF~H&@GoDP;D#eVhExC%MLY zMW>658N3l+7ew9q&-|ELn#w^5n-I}Y)LW}xi2oe5d*VIE#6hBOMn*@Or$F_Ot>kZVfV zdBwXsn)z{?nwEQzAtNv?LeL4WZn)sHtkB&Q<@&(h`LU9b_kR_qs=!TwN)_*e10$G4 z-KNll#_{0hi`&E3qZLA93MNP6z=E|{oWptu4RzpJ;w&WD$vC2HSe*uq_VXwi9$Z?Y zifisW+sF_rOl{*)M^VtPZ~wJ3kK@j+{i_^e3JU!$AC3!tj~)G!@j^m3i{)0@FiU&K z>DlE`2b5=nKYwbVQzfM^MgqN&mAvjF!>8D?U+bBqLsAV%IKuI~j^G@Sy>YI(qisUW zN`>Ds@PcIOA*dnJU&Fsh@-uLF@Lc%U6x%cf87xZ{00Ag0Dc3)L6ZPVuqSvE-mA-_P~@ZNzYaiD z3r8_ZG-V)!Xt{-+A6@|rNE*tr-fJBhL6Jkl+)N5(rrBRG5>1O~ZBcHY{hgircp5teee}=>50)E{1&id)VdtvWJg1bw^jFjD@*LR?Kt< z`b#d$sm}#JWSJ(HPQ`KhaW+46{B$E+-}DjHf_cuvMg}T*ZI^GRD`s9bzj{LdMzpA} zB|p_u|M|xMZ_ejE`uvJDa3b8OAGrUi$Km7Cl{vZ?3b?>U=3Vf9~lp(HmFY9%ldV!-6w?5{eB;a%%n3zz<2SS)- z2F!_Q<8U8ijY2iGosG>mc1i=m;=~;X=}@|!YMM?HL~ytNM^q4CmE6!U1LHT3{YR38 z11Q5tr;!AKB>WBwbdO%XzUqNDU!O!lOKetnGbAtU71(4Pv^X#n@zW6nK}brRu{xPL z+^BKjXVQopmJnwRBf7iazohdoBq3?ody&*8>vqf!GjEF(ooq!IKd?1Nu2uCI zou*Np37!kQ=6384hfW6)vkLJ_=h9D8V|&wi((eh_$l0Kq%JTI85l zOSW$SXhnhm|13_OHZ=S!YZFcS7yKxUJR^d%H;*H_**DT{?l*S(Q*@!D^o)@v+D8P} z5ZDPGhrXhFYe&9?Ds+PlBLUQ<{{PAkWYzv^%fis2I5^Bm3}=bCR+HMAoIY_0)8-pNb=rNw~drY1EJe_9F4)- zuUEL!Kl*OQDk4-iLY7$psN8DKnM<+mC3G+lkV~S-OcAfHmsjM*P%d}onyKTdLk{Q1&pc}f!qk?A&s`39v*x7sh*Ab)L_ISN!%kstMT)2j z59)7TDNmgg0ZkLj71Rqv^$3fOT6#_EUW7N}VqmH`u3h7Q<#~pM&!dB^7Ne&u&dYh_ zU$B~;@xCzK)xI}M*3d|TH4FiUDE$Br0-Gk|ebI=4+=DxY1(q-bc)!B+oM~S5hcPz3 z;v7u4mDP0j*lPuCeN8kg6{2DU6SJheIv_l#z^tcQ`i}|HDR=h?}-w ziVBV~_;0p3#3YBGbCf9$Dj}&Ywli z{g;OujC2%oLiKxFAo zgb!m@R{FfrX%qezIt~v|Ns@edT&c)=3x!@8kJ18bABl&#DyK3UB7li$-hHK^4z3`Phd{_lg!mz+&O1PZ&Y%Eu=s#2E3@T&e zIXk(-bJ>S`*z*V_0t?B9@@7F_wXt!>~z 
zVxkK5rx#fltaLIJ&NXHWMvUij-eJNO4q9*k)toYv`M?C=SS4BA92M(ShficObs`!c zBhyJdDt7Kk;n*`(K9^7)N_kH9TIJ-AW60|6BQMbpzWvHX9qhhORu*@h3-#7qHR|>e zSh1P#YC*u=L>}CejWZ^v!KEUJJ)(QoGzJMg*Z zRjR5@jnuu^SY-$XP``uV>0o*=hkKCpp~S?r^q1gkzO9BQP79>z7;f3T5eNf_(TBwk z5tfR0C`1N)>s{0lL_dfpuD-0touH=hg;nT? zRt8^l+p(pq?6N-7@Cd2KNNo?_MAY904uloE)L4$5GTF{p!^QWMQOro>_?<x6oN>;37jt|+~8nOp=Wf%i!Q-bB}FA6FF_ z7N|)c!;h_WYinZEBh#?HfLoQ7lBCE$C6|Ovj(#`}sm048fHRpad>7*xAgF2pN0JE!6WUMtG#%OY z8!$9N*zKJ>yG2!I4^DhUk&B=lLLlC@|0WB*!rctT#uEr_37{BJ7s|_f{jr&FLPETK z*CZuZw?Z-}Dczucpkq5sM@vhK9fYb0!4a6savF66&S{^rYHNHPB6&wBp%uOxe%5~j zmyoRrboeZMX>%_n~^dutJ{Jy4@F$f@g@xJ}6I!{kK^yvScnWuVKNSe9Y z3NlYr>Q*3PP+(JlyT{#Ak1s?`Ma9>Sm-!oGkZyrLMcnsoJAI>|TUv&5b8{r(0-JdA z>t~k@4IdoX_v;t(!kJ;f@r^-9VLiZgYQip5y1qFwEsz46hGaWKj~2q0O_F;t z3517_@9FLA0mH5R@;F^c1lgMZMjWA5AD^0J77V#d!7{{)>9+VlB;;DGurkU}T}BVs zMG&F@+KL>`MAbb6GltC!#VN_M!tAmjoGm1<{i5VzQd8qq$Q3i^1cfYSt*7S&*RHC9 zpuyEc(k`)z!EECD1mc?=grcHDdcquy9k$Tks&Kxvx@X{2!M+3QLxOJn-M@eTZe)_$ zTypvBlmeWy$e28pAQN0tQZnX+0UBf>BdaM1qz;foaK?bPrEx^avi|`qvp=F`xKusD zo{Rrpe!eJi@p3khO99WARJA~6Sx7~J1Tt$N%iA88%8E0ttZZ+5>k+F_!^KbcKQnGO zrjSHfA?UOL9~z-LtdQjs`RlBAIxliQv0g}O3<-)N=MxlU zh~s+#CI{i)NRMT&O12}QBn*t$ucHzCpsPu53*@CO@2ni&uf5p@!@zFF%0dhA>;3KN=M^4W1NU#K z^^do%t}rr%T-dPjA9s*mavi}TV2zZ4hZJMzsOUHNV_XZeC2C(Be>Mry0BZu1b1F@l zI`JFuGv45%PI*_m4Zc}HvMi|!@mjET!Z=4?7z^PIqP~yE_4u;)EC0*`tq1a!P-74? 
zI4H%AG7`}-ICv7uBYZIU79fWsLEG%xbKP&UJ!1lEuQIO1E&Xwx{!Z`du(!_;|qPm=fCY@c=& z-k;pc^hV%~L^YMF$_P*WS>etdt=qTR(7A~2M8xSZ>TaTK0^Bi!!hRNFdUT7RAF8J8 z#Kpu6(SZZ~sCHtO=Hw*3S(H%<;w~|m1W>wMzO3VmKf)Ye>i*zGtAPxC_qA&~vr5uh zkWxzKbUB_1zV55$<`5hKJES~;I&lg(!F>Qy(GCA%*>Y$n9X4m5#GB`ZH{#SUU)~9z zGqUav9R|Gt<^17M{@3{V{T=Y`5obSu8)k+`+&cI^iWC6Mo*_4r9I?e3AWV#Ngcqs^ z87v}_XBotYkBZal4GQKZXe0rtT)nWv=!GNmI_PALgMKF|G+|_#a$c8#R~2j~lh*gTPRa`?{f1Mj9zv(-1{4Zj0!OGeTV zyzy3OsCvXjfVPNEYZ*E{_CKdS-dL5ZcaVFmR6nN@S5KX*7&dWAfdJ|d3>dM?{k{59?DMfodbR~%A&=Cag zk3Juyn`H|Iz}kU(N6u-lTjKW#-{_&C37!tWV7rYkjNjqaQ*Yarrxo-<^qwK~R3PT6 zTe4O0XoH_QIXDEL%BUD!@slPJIF$Bj+j~J51gy+Z0tn;p3?w|q;NUBy{}?{E{qMiy z{2awt77OexQs%Z69iKTehfd(;k9<|j6wX@yLJc{0v5$ot`VG8{lQqABXILRG3Y07yr-bE zKg_gQ5o~~Yd=%8x_L+;W$B&bMOXxdwuW`%HC-ly&ETTnw zhwAF`-1moSCl-H--+ct192CUIz}pCbo(K5pZrF99<#WM}e*T^{jtPzZHl={21ebX; zI{QbSW9vxX6}C3wxlLFv$Txp~&K@h3Ao*lKso=Kk#X_=8R<0@zm;PWYUt5Iv3A%#u_GedNV#y~8PoQSe*)3-;a zgPR|gm1SUuBSS&A9UGlxqIi=tGo7`oO|yS|FeLyIsC+2HVP4pXUd-)n86Q|RX2<>< z$is9E=WwA_ID8Q*;mZjoH8PRTCj{y82H0T)9|H!?leJ|r6p7-_dc=?e4~Nuu6nw3@ zxw(5GI411@v>+%%Fv!Ji#6yJ_K;75c4dH6Rw{H*GhZq6Yz) z9u2*A%`lvCVUv`@Bz`GzAH|WL6tax>-wT)n-aP{$cc3spS20E7sX1`c$tigCVCn!X z&tJ5>x9C~i1=FdNkD@BVl%=eys;GB%4OB3q^n=kg@AWRYE7HKQXm|qhiI1JC_*zK5Ba#^eak z!3_JlAB9s0lq0rfm~s$z?Wnq)XFV{Gh5saKxQT>Lk;AOoT3WE1>Oknz*^y#*l=}FC z&B)ekLsf05;dE)@nq1T*Ez0n%B+&}+H7R)Tt>DxegVQa0@-;84XNZ1m+qSyBB@{v!G*?-hs3z-mLfxQI*pI{2a(WZ? 
z+HmXy5EhAh4fgbu>bwue4<^pml_6!m0KYYalFAX-11nc9)Y>W#Mu|w7pTMk`gl`Z> zn;xGm`^YG=&Tu&0T$5VAYz|}1FT%c4uv1ie(4EwyxR)>_l_u^@I9P0 z4N(NdEYLDcPsd;mfnP|SflB%>gnR@6NR_Z9((7csk4LveDje7fOC|Q5yz-kn{VS0p z`vaF$fzt>=+-_{(f^r;tO$ti;VHk-$1f|a}1x)QN3MIy$_)FmqL~G1eKYTca^>-W# z64Y$ssf!lew1}a3wZ?x@6s+ct6$Mg!re^5VfMLr13B$#a1a4dhNvuN0d<^p{)&Pqc zjbaP$RsZ!YEJlMn9YTPmkc3^F_lA@T;NTr+RD^WP?R6eIk-4Vdj65@3pN?V0;Ku<8 zKGQdNeX^f5*TS@murt6;IC%7F3~`smcq_G2c~KQ`M&#t=rul_Jjs3?$6XS)|Pz#q~ zX1jU>4KvBc!PQtl@)5x#(+TdwIjM&`Zgb0G>Wf0lq4@ZseqABQN1{J5pA3Wo7zG~#Bcs27bxu7jw^|vj$|z`=LmDo1JqW-q zgZh?m!SIQMzVSO|C%@_0WE-b5cmv3(1lX=#>}~je0ZJ6BNHMb-e|F7xd~`G@G_+;s zkQGMq@IXs!7M-^mN8T5_ZSs|K`&4!4H?QY*A1vdSE}NuO6c7+)#HFBQ?<)R_^hp@% zcUH47MAE??(@OG7e5#~u*@-^xeID-ZM;~Mg&d1D`Rp?!Cl2)vW-xJ zLjZH}^U%u*UVzjfT;3?Y%n;u=H4B5sq+@(n-@T?Ey&Wc0(@%Y z`Rci$Wo6>;s~Fnbb8ls}L%SAy?|$>0gYjwTEa%1|ObnL4n-RPQ1YK*y!=js-K$1pd zdwcufLkUxrd@c(Re$acgVX+gsKMo2xSo1^eg`*!MaUv$lsBripEC`S<$0&i9Z@wnm ztm#{+`oFb&gIhK0jeE8PRZ&aLpFRt3IYfQ)TKDQGo(l_8QBm1AHugySflN}opUtzL z2&pHWj9+a%RIS?;m+`(^H3aMjBy4a)rej(mrgccsTwDTNI#Nu6wQT?~Pq6as_1LyB zc|=c1084DX*K23Zbajo4E*DLI7Zm73^mg3Mn}YlHy<#bBtt68ibWioa-kqs`YXy z>U=!niIn!nwP*0zic$=Ar$Ma`iRE_*XTT7mf zq(c~@DLP&pYBL zJycV$a6kPmaH-{22I zItaoNI3mPb0Ood8L0Jg40(p@(bgaZXP7Txv*x$&foNHx{^inV6x}2^Z7g8;IdW0og z3qa=RcjoXzM|@;Xzgu_a18A2IV{b;kdM1iH2!|8JSCWQ$jOO{yymG$EvL~zd&A!lA z-0)u&4YX^BrlV9=e!_pWM*LWc_SoIPFp;@eIdgeiGvLsIg-5dPi4`RnnfGY^?TP`o z1hhM}X|D0xPyf|}age$Yq5wkcK{sv(-vZV>!w7CALTv+~qd3lxr<{nM5ETxgVgWQ3 zJ3MzECJVYwDbic98Jt}&VAqrOdv#zd|mX=UUhYR5#zoiz$ zVyq-WmlI}9oS}hitO=m=nrUXGl%NaTdHi4=N(08HQur(JRYF`w`V z6VI7UHm26f)={7T;L%&NyXpiL72WUwuJBT+n61ZWHZlP-M(BYna4aEfK?GO9UYiLv zHyD|~dBp`gHgdknFbyn=&waT)>U1~i1Gcbi@6;}ib^P4B_GxBaWbQURZK9b0ss@%I zL<1x**}?ydAVlhbVgA3;J<)rYbakECkN-I4XFgztzl)Xr4hk8Rj)6r#I`xuo;pU^v zC8Effm>B2Mu8xk{sd=oNhCiRS5;zjBdf1^zx~daI4&a>!L$yHoLfB&4abA-V?OxaC zjevx)@ut+=el?R6wjX}VM@a1g?2`C@khE+v03e)WC}c^nBW@|F{SbNWgz+fcsWZ)4 z{-F;vqt<1pj895xfe~>8NKruTpSsDwTzVbGQ;itw6EhW(xlFQW@p#B=0k9?X1NGXm 
zgO@=bAq*(gv9)!o>&A}ze~g;jd+P0+yXMbUh8iD+hJQSF?umQLxetzF7?O=;0Rb5k zltRS6@bcQ9{|CE4cQFOtd>eiuVMCbTK5?SA^7x*>znL)Y_%LYC zJD=QTXZy%=OD8v!hAFIC zGn6*YA{D|4++<8|ZbCMschlNf8sk7BTLP$yjhnUpO;^`9$q#dA)`*n>sxx?HqPgal zb2k7%0f*W)<;$cAqh;B46?GXT>wGAi@Iz7|A22dX4SA`Pb`5~@Zj50HEK6g-@rUG$ zcw@pPI2M5$IOpB^?lYrbgE!1wOa;gzhgPju@U$NSkV&H@FFz10*yr0ic|7XuRUn)>GH;3hWXoSrj zT%2PQdR*t*a?^wHNe7-NceS`(s4=e$qnMP^OCsJrEk0%39 zf27n8sasl_Yo2X7;5RMo-c={2x0n%Gw(nhV4V}{@XLdMlQY>d7gBfx^u zlH_AqZmJf}_LNg0z$A%R^jawGIKzgK^c@h!?#nt>{UnVy-3MPzL3eGsLrtgJle z$2DQKX^?6MObVqsj`?Mi#gCIJ8X8LetCGV3-Zv&~cbU!*b{VEaX$Ehxda-}(>)<1? z1Q0;nf3XAo4LC;70tVEI7RkBO-|e@q)c83UA&7dOfC%_i7?^ZvKbT37R?gvO!2li` z^sQUBctl+coJNUtFd^b%R) zMv0Xi%nW(tp8}S!2H6y2k2X}2lZW`{r@>Tg!P>*Hy117{0U~~=cyY0){fsWR`=;UqS*rFAuX@Bi`fD+ubEE-QL0&2*Ux=F}^ZH?m zo6mi8e^+1h1Dpx(z;DzBiB2u0}0lLed{Yg@>4 z4uh(fI0GyH)E&+@3&*JORfZ}R3y=JzHS1{A=f;<+BwqdGTD|;?)5 zi=|6S-*5fLgIwxKW8G{NuPy_FwCj4Gnf0LR*}oD!JX2aU9IHktDLK9z5uSMU#;xg*~PXFQE(qoT@2?Gh0sIwtYMOUB`{nI)r>MKE_5y1klZ%O z{h+*@%mhjDA21WdlVv)WO1bk`=9|M|U?DU4a8R#_Q|<24Z(B4Oj6n+kSBp^XFzO~8 zO8w5SN72VnU|Mit(lIjUz0(V17xVFDUiI1nvJR$6#9)X=SO&(C2qcG{d@eALD)*rq zC`x}|#<+kN2{~g($plpaq+C>U2z4>j{ApI$k5^DnH%Rb#2+YYuL>TZ2f*2eOn{m7= z^?yFyB#13uC^+V!)Jr#j**zvUsiNZ^XpPb}>|kZIneejo^|Mjb5`{8msDSOBBWXb! zTPy$>x9a!&1BK=V2omY}ud5awR@%z1yM90YGwDj9@P&;VnkF z>e(;LAm>N>7L$`BsBjyjCcvYe66dctu=4SlJh}YEFc((cRCL{nAQ9UL+Q9voxv`+mx=4n5bgU2!-_sPGyik`$Uoc{1GP<(6Hjx@0dox z`d+^7=#3_{mK231~u(}m~O~z@;-Ee z_!)L4i;(asXh;wH8!P6R=UGg_1Q%mtQr)5`h4f_s#U6CMBnAK{CO_Z=Y}J_gyg+>f z^LlEmWs$Sn(c;7B?&6IUU7ch(P~hM#R*px9krCcv;*Ex8suz}F1TzH1uNIRWba~{g zM-%ZVm4BPV<;x8n#ujQ>S@UyqyyD_90A!+JgoM7#y4FjO==rcY;K0G<{3D^0fMRe~ zlhQVh)AI{d*Tmi#nEK3zK>m)d4Gm67yFzbnHMk*=_1mI&*)7=cjs-dLaPAN%A(V3D zw~-VyTot+)z#DK<4<{rT{5F)cu-h@u$1*LkqokwjNYTw2$4CJAm*IdSJyDUth7yI9 zBM3eus4NJ3AOS1@|DTfd!GLT?;9%nP zKh8s$Bd5{C5HU0$UC7euFrRfOJ&soew!+TH=0-PSus`X)Jo#NvnhKV{O$pv0Il>n? 
zTvdgVc4KUUL#KtBm_i)1)_^tqkv`Hila8AtCT3>3L3F1iXaezq zLffKwEak`M>@QzDR~2Ho4_AD{en8Y&sC!{zf$*O}v)aSDis{LEPI&pDD8#(U&~(M@ z&>7~nmR;RBgXZfC3S)H_&dFmAMTmu6Y~k+={sSdfMfRhd-FYFV*2k%Ym8|Tfxxhk zZyJoi*w4+z2UbOspx1r`cL@h0q}iayt*Q?|7yJUF<;{CtV`@EC%LN2!pda0D>M_XX z`j~lZRHLqbs*2Ox%lV1B^#OUGnjy&`#&K@F87rNl=u3!31Wnl3?CihZU{@an6Jev? zg2Q4b_{_4J^Pu}Cs}L*|$k0gK777K!ssA76JNf3nlN!`u)~>5QNRMe>N762o&nzT) z2t`=RzPvl^dO2mC$M*OrYzNe|r#N+LYAOkYL^O662Qt!?1C{XN`lgs-=WzZ$iXB!D zⅅtS5Qje{=aIKax-H0L4MhKBJ>Gp)=f;ssf(92O)(2mA2>BqVviy4!WAWq1_ll4o_e_+0 zcZYmVlUo?mruA+z?OUWrk9v<(wC;slFH!7ag$(|_fpcS%>9H>EP|7#Op;LFntdEW2 zXeU|sgbxIKRYIh&hYh_3%$~{E2!jp5N5k7v#**vY{RFZ`QC%3mEeguLIqtGw`7~!+(xV$p4eXBo6&*?4%9Vo zkk;2R=cEm7N`9mZAW^%pAKM}q>aCMA_tR)XDP_3jFWW6$IP;eGRgcfCWb*5ZXWpVt zO5L|Z?dv!IW9{)h1kf6TQL7r0G^$Mzu(-r)_Mcp9aBlWcUDmqr`OGr{Wt)qVs8kI6 zfS<0GoC4=-RVRNv525{63U0X`FYJN(0`EF-R$zB(S%W}CfT<{YD}Ctb@ceOa&al|t z_V48{f-tWsDt^oQwoWH6VR$kuwNLI#OJixTLNbl}llkxfcHnMZSSfmaM zfi<5S{L?4l)rmJnD=Py86oXd~M#5d_s`DHkKdQYc;iQGDA|6^GjqM3on&Vd`aPxDV zQpVs%;aYMq=S23D`4Tfs50OG>Iku>$gV|PSFb0Gj+P35Aqqx^FjQp7&w|pm_BJeNY z^MHwQ$HAJcrX+zoN>w)KN%#MuRZd_?9pkwJfj5$2DAGw6zxHS2h!z}Hef4b-bi0hitYV!scKlV4 zY%oPew;76h_Z3pzr3d#^UUL*4=&iGw1QRz5KT$r6Vp*ZO3rwUj6U5 zP4I8U&WH^S^HccZTWVPB2fqO@34!F$Q>0p1P&#pJ;l2@3TzuQQ=%s)Fia7i#GF%4T zl#IIYaKKs?vUUNr0zZQ^{urQjwZ0b=Wn|fLWs^hA?+Z9TfjE!^Grapr>%?^_Ft)3# zsKEFPld+qzIDmj}#qcYg^Qzs3N*_9$r~n5-b<~9z157zl7lwgG8iz;d7^)6z^ zfl?8al?qgR2rKTypFEOPvGC)Er>AH9^I^f~SDGCE*BF|Rc5v1bGpXfY-@70bKv)%N z3y8oGp2DVWmdb4D=U7(JU_KP#|MMhXUL@O@97>HgH(7B_FOB>3tbmYuojCT|qD8Nf| zxK1Pxv_K=b1m=tzX2|)2B)R?W=i9roP4As~*0rHCUkm&kXw1c31F9jcF4gd+ksyhD^H(Umh@CHp(vh9N zu(ZHpkozs6r`UoWijaU%`^Uz{qJt+gKpb2;Mn-Tg4#Qf5A$}ApaCBLGm?zE6&6)Kl zrKkVV$n{Y0rggZ;tn^1?I`8sNc5v(JFN~+%D^hB}Du&xe6Z8Xc>It`I!@frH9gM;VF=rj_$+}* zfZvI!4`J9~Oq42hq#Hfk7qB+%Yui9lGANwkO{9zfweriCNT6r`lRx{X^ndebH+_(V zmmPL`|Hs3Ft0MDhz-hrS5JJ*QbDx;uE%^ithHm^-0l!2Q@GdO)!JTiD8K%VEilSu# z)84k!{Qi>w>*Al*gfT#Shz|o`&!zBm`?hUMUkgi#BMR?E5g>vj!UP8l#*mw=gG=Fc 
z!hP+NMo2r3R!At;lv0KW4cW^9HdZN?=UD80Q5Pdgh%w9O_ zR#H#k=0KGG>w=-t6@&~9PZ3HM)K6eol=e2n3mQSF`pt_AA*5Gf{Wnd7>h#L^ev(EqiB5o%a+Al z3{#L>aB~|xoZDI%=}uY`vG=^e2UtssEUD)Zd1 zWFgv3P;0o?mL^zJUWInOiXDq45EiQ0`-%bkv!}Kk18&Q zQls3x9pttW#T?`VN>xSh*9F0WU)qG)`hA6Fba;5*hBLn;=Rbod&KwcgQ z)HSVa445(pzJZ2$$uYK~x!%hG@RLimI|;akbQ#z zq>MN_In6if%ZeguRA*C*Iv-8%#it_4NAz$0`euq1 zPt*=rY(L=Q_!8_s{)ID{MX)UccbHuN&71lvDvg)A>Q`_3?DBt{CU?YdY(}vU{TLZ` zK;x;?({e~>tZr4+!h$tR?QT5W8;R0(0jZ^P*l;(RT2P{C)&iOerZ)xdjW-zd7zmmt zyhx7s6CgRz*cnOSDu8eauM)zoJb3X<)ho^o8`Qxt1sEseTM0I%2KI6|`s8AqL8##}<*0o}v8NPp}7p&cFeFFLaViI*K36^NJ@?=VG?! zic-NheF-32B)SkRd@IbkNY*cAWV`^8Fjq~hy9-oA1CHwO3T?#n5sR2>|3Jf~i$o8O zS0B+WyHLY|PP!5HtH(B3sEwZ$!iERjgn)<;#YElO3fV64A3`}r3|r8bmSK-OIZ$%R zlC6O*6C;P3BUws&k?9ezEaEfEQf#C8;lb~oox=0S4ha{Y23HZYYOeiqnABt79fPtS z+h6_9SNhXKb=QFb5Vysw9}^RNGBQb5m=icHA{mjRMSxM%)K=gaB@EeP9TQ(@a@DBP zIqUgc_HKc9EPQD5e|foNp4|PsU#X1+Y3Ny(U?l zXn96~S%Xm(C9CS^`s^7+ZN1xMY|Q%dNvR~fd}$07m3*R%)6Z*QU)8iol|L?h zwP%>;xxg;%hve|M@v`;+j^b@Rn<8x``c*|1Q6MXe_J?$xkmFscx+Op{su!zGrGy>P}JMTJT#d zrygmayfR9CQQTk`+mwQ)%W??G&q1|l-=ky(p?JV>R?0WBmDA~5HXJ9Dv z$v632V8J!tvVFT#pO9&+9bATo&8$W)S^f;o-f57&?(*d_?_EVUg53{KP?gs6C-=#o z(%5W3N4I&eR`T5h%8`Jboo8OtKaJ)3{qRC%9_wh^rp%+-FFnL8=xJ)*?cp`_^X##E zHM2{8FORP-N1n1Q=D2kC`mWP+iU-4+>jG*&-;Ld_`Hpva`pG|y10w-_KX1Q}xZ<4G zC>(>54US6LZ%^UPF?Mw1Mg4#Ym0nY`z#U;!#l*L%e1MqcUyLx``S@vRxDMWd6xhHZMuaUrOyjI#s{TdL8(fYv zo}W%LNjy1lMQ)hF);4pqtOrX=#E97-Cu640_d$kY+vwX$;c7a<7`i@c&f_*Rn)_Dj zY-yYs)a*HV=ELxwUSHo$PY#RDzia+%6)wb@n(7?U+rMb>Oa?hGw->)fPlR9Idu?m9 z)u`{D(`ie!+XDj=mS&Ca1Umd3>=-yQa;eBxZa#Z}N$`=$+ih|)GpB}z#&Yk@D6Rea z;JagCE=cN0{fb|b(D>AOJNvQK-?I%5toP1)@A9bUyX>Cpe=c$PXVcnlyIOCg*gW=( z0lC!$F811by09?UM@jpF|M}G9RTaI!H4)DK6tUTTN0Z_yoRq$Rw)?ebsD{$!%-2iV z`H;*@Di*+>t|diG+OLf5qQ?~SEwXX&`01%q2cseW@3!_0bqv%H|;*y`%HFY6(`nfE@x*fx=J*dde6wGA~L6oJ5}!kHGdvZZ;g z%P79~BY!Ha=~@+~km(yYnGBP`)_1ixYH#xMnIe`iYWo}C5efgl0I^YKL1?L*Ru{=( 
zOmF<8`B6Y^tt|o;5a+`PTO|EyRx5ZWHrF{V`+R<_>!%2qdehPN=IT@BzM4By3hsJDPGtc%mo3Vrw66w%EMZdx{m~^I*v;8>|tvn2UsiQPcAc`bQM(yNDWF)!3$sIij=X?^jan2^5B)U)9)0NxpK5=e-b`JS+!TZ3-4NC3o^*pB zLrH!G9bKVRbQ!KMvf5dN6;V|A07L3c{`Ll``0upUF$~E2m z(*k;JeK}$;c02!%3YbpVNT?!OEk9+07SCaL;H493Vz z)cLlHG=OBNMkL&%b)X-ntE7DU;!*rEM)@^KT z5+;a83Ty%h6OZ73cJ+Q>gPLURIeIiFNS`B)6$188fLcVn$hG{MwuX?V5zX13=UYEmAY|)jqec9PA9jha{tx4LadsMUdoT(ti2hsxP?gS5fHS?)$b#Sc9Yq zgbov+Bw8(gNy+ZPRRKOzXS6Ng@SyYgff@y5a~Fk8U`IgJ2nYf#Ml$-j1(+rzTlFTy z`)vdK2?(MBNy0={3+g)sx(Ha>FZ|E+H_YMQ!Q~G561I`;e(vrw>bvruY&t3X_xIgr zo&a9p7xG3{aD!vyxN!1e;=6?zuFv(NJ>H)T2<(aTj@aJakR=0Vj+E^?c1Qt~NJIIL zR6+$Yiz1Rur`g?x;VP;k=%=>81O_FVBakz}JM9Y#O~Vat^6A)Wogq_3x%A@FHx6T7 zE_aT4WXAyqF*ML2G zl8;XvKY`K#sOkJz#10aQ2j&yW;R9$!`kPPgkHRz}iHQfK)9Ac;zI`aN!HvcA;s7(Z z%;-=^>qS6agcb@aKTzx6jTL}=AtjMWI{)co)4#P{$*%?V!zcv#OgTu01^5I8arLJL zc@%uEV(Ab&bd)f}*cb@_>mYd>$1@3VdX1lKP#Dc%cVE*(vVjj?Dv% zSqKSsKvbaue*1r7>MkwIAbR0~3=Q|%)Im_YJ5L~BUQ?VGjHd;-f( zj6LKSCIMdG2WIkW^L4+hJjm>Ur|+JvUoc`HYjKn&ItWzn2JP@!i_^*$DgROjA>xZ2 zy9Lv2n77I`KOAY^>Y?)FGKmvXfJaCN1|?79`P63=e4z~k1Vhs6D8SnUc_y!QVF}tu zgS#p5(wjj3bZrWufoy82#2fm=duYOhPGL>&AfO# zQ_P2%jU6Zg$qT5C*%zLU8Nii@6K-x$fkjnW1_p*88#lb&Bg@Tu(L!liIy8@Wc2jD4>ka=3A=J`PI~*o;-Y5uOYGk!r-2rwqW}cV zVt4QPtHdO#=#B7Sg(0y?-GgImD*`y^JofA_)gjD9WnZGNgXNv!?8X$maI9MZxuOrA zgVhX-tW!2Gtu-m!qEL{JkFwapcI@2B(h|ksA+4yGn0ufl2wiCD@8>#xe9GB{>nZ?S-yb0E)pHS={Z5L6aWbKo4REjktXtYtZaZa)-}IHl3$ycjZ%Y^R14 z0_na$&L!-7CHJ1K3w-@pM_+#@4Ex1HZnjQD0xW=yI(j(snr0ikJ1taqTR{H7_VBZJ zZBT2cT|GTY&iaD8ml-K36DyYQ5#Ky83tkMxaG2gILMA0%nBr&R2B3kLiqCEnbAK^Tc$gk0jnK zTuk))0m7w>$M7vIw_VQKKU9wnLLEkuP3qD);@C0r^ZEKZ5?a@G2YT$5tj}vLfr`C< zaAw8i<2RqOQ=9A>*26tWd;ANbUZSCcDp8od?ZpJxZ`jB5g%UV4F|{G))d1pO4H}BD zziB%*PL>-wKl6rzuQ-}_)Z_bve3c`QM?+5!t=Ec~3scRexRm4xn9*PZ###uHUc7eo zYCOkjeMMO;J%}g5w6ZBRg3>|v`Hu~U3`3;0mQWIy0kSAg4BJ8cj5W&_e<;J_fAH_G zt*3_=)gz+n135T8>Ovrm)RNsDqBs=5z0*ak(VLV{`p!UkTnpt%pRQBjFcYg!Kl z+es1^O90lbKGQf-i3crCC9nLw`=I~`c{oDr;n6{DmGu`1le;nc7g`v`OG9RgFD5vQ 
znNdDYV7I|Zw659X0I+m;l_VG30@OX}Ra+%2HUCV};z1&IpB|zIfCWz zxb~(HpQ(_~%wRh9NjRtbfCkbG_{?qSku18beZGCj=i7pe2asdaFtEXuHG_);On;Aa zD?L7>v%Z0f&hew((!93cI`MykcQ|0}LNsFpI|04s6HaU7T%|)lo?dqX(P?UAb`Hn@ z>ylj3>GxHU2x>9G256R230qFE4a5|ea$5T$-Z{+Uq6Bz(GjysHaS0gj5fKAn(GtH@ zfREUEZvF?JxCW6I&TTTj^?Cl{#Wg@CmC*cxHPk`uj9%yW?bOsHTOyGsV(54O)_W{G z&^d{gl2|WgWo0cr`M07B;g`w%qlyQ%uC3V$w|`1&9FCWKJXs zOz0@?aXn;p>C&ay6a1q%Fw%A(eg0kqw}uSVntU|^MSjk&jE1s4A5z3({p4^$nM`=N zWr-I5quCUy{|kGTR$xA1Vz8bKf8sti6rRrS6gEtYS730wAl&(WaC0SQxA zMGMvI7^LApGn(i3{r$P2rY>5skXWGMPx`?IOrJPF`g))$q=lyHKlN|9km9z6B{hw8 zEc8L`ir)fdz%68%Q(z1`QMZ^x_>>{&NL!5(x>?M`?e4rVI`?3lapqF|z$O zo_sGbCd|`e=oXnv{l7oC#WASs`q-Qk%#hrM?ey#qQB%cJSyAk1tVlbxNHZYHsB}Ek zs-#;&{xi~XFneba8nZZXEMWfDu< z9;W0=EHOKJ%otsMzI$_6aeWF`p83Prt=GnDii@SSF@5B#R<*NrGrpUG4-lo{M&+&xFS7>YMl&Wp) zPRT@_=h8Cm>g4#?^)ZWCFDFlVm$n`Jt~*n^+IC;l5^#D@%{3LH1^CBBB<0~Uzbx0D zJ+0LxC0Q@Dj=pRtd${RIwav@8hrh#{C?X@rdCr_}eD`Ojp{$JUH=X9n@EeQdY>VhU z$$F+3ea|GAf5P)7ZdBfK6Kk8-j~{qg#W^w4J#U@XH0dzW=g2nj)km{^@cTriXFUaC zLcEUcnZHzMo^_{4{OtGd6AQoQ&6cLo0C)tRXZ$glWfBzHGSpKL+EP5V+zml5=VE@p z<~Ol3M~)pIdGyPnry$3rqalfo_wgEbI~lWJw^19(Jn^xDb$|Ot-I}OB9;aqvH95(* zEHs$6j+V)9ixuV1u88eHicIz1jg{PXFN5p2B{v$BQHroliFRv?@U}D72VGGS9=%D^ ze!59G)W$mgD@93dJb%QXxk*7rg<$J@^^cr2pG7;rQ)sN4Q8_I%=tIH8EF;~hE9RC$ zsm&;){m`K7;gxMno^_N7UcCxxf!bDdKVP*pC_byEx_LU!h50yV@A88^f0K9%Io_V+ z6Jt$eGCQ=ME_PkOTjipPHBOxZ zxnI%{|pY(TsCFp(bLzT^X|De<~u8@|e5>n{{m_ z_qarpD&3d;@mA9Acj+$E_w?9Z0-!I``IqvJ-!$( zaqn4%gJkRbkU*h(w$11C%U-E7Q3+V7q;SfQi6(bwH6G58^xR3;{UM~9=h5adWx?0R zSI*Gu%Y%=MRmn7P4Bp8jH5_Z{dXs6=cn~Yj>uo!m@2V>(Yn|Mu)ed=pG8aHe)Q23 zPls;kBMl|2%n@=W=~{jAd5A{Gfp8ShkesG%IJI{`#=O7)TO;>E|Hzy`GwrLi{@82IO?4f@> zq??voChQkg(|*>5BUDr4P3p9>aqJ`d{|V7uQaU-XaWVMw@5gi$Th?zEuo!lBc5E7m z&!tk{@%dff;Lg(LwH^=RrcdvFDlXB~t}X20vpF;t_PyoCNB|Sp;}ewqMyk*{LT8{ImUgb*#`VCwk*y2Di`hu6z_aI zP+8C)b$n`4xA(njPYK&=?nz*)2LvWV9@5^gm+E{pPzm$KuIuVPqHk@RKPNoA)8GFj z`qWLi;2W|#E*NygXl1_~q*^(THWZ~fNuhGsm`% zgT3%kDox-;s@*g@%Cw6FPO+)!>NU;ZXxtK(kwi7gXA!|vnjhnLVTwO2*r1i^9mPGa 
zV}X{aeJ*_w;Q#y^%yEZLoGs_v#av!F(A+d&`Pm6M@l8K&}yzJ}}dRFKt6i%o+! zU79Xkp!h;F^_Qvh+`1o3GnI2n{A}y&o8cYp z_ix5GKfEuIaMN}=61J|IY1z=$bBBWacY1-1e@oU0P^2)%o?v|MYUydrLa%z*q4!7! zefX~O45`yhu~QqXa_81gw&Xv~HQ8ro_C#}a&$gVnEAU`Py$JRt=H+2vSp9Im^#gRX zCDd6L(Esw<(uGQf7_km-k-jhDHr+U*u&seM-Dq@*I(W@7H@J5%+wDVh{2uG%#2fjz z*7mcWl<+&Cvfvt@9r5>AjdhugJ98hCFV znkywB+!p;Y#D>1EiD&dQlh^hz&8rXc#WPZs1;qGva!I;e-SgF2RaQ#prK;7sdnZ*+ zgl+8Sbzo7D-S(W~lmvBWWBbF(sXV);emas2}~&;N53(0ZR4$IH+MGWVnL98sA$AW2XfZLZj?x$yMZ%D${K69I zO%k$=4*iPNGh0SWxzBm;KKIxpPGvFx}gKAeV?2imMn&jx79TU953}Fk zB9qTr{Vhm=_54goksaUj-`^Rv^mPzcrQKRhDci=c?as69AoVCaHP#;>k_xhY$10GO+Mm84U3B3F_Y7*h$;m-IBlH93$sJy;j{? z`RD@Ev16f+w^y;!9yngJg)$;;Q;hKuxAz9Z#TR|m?Go9#A5~GV=w)<;F9uTWnft+Z ziLWwfB7$%HzHqk7&OC>$KPtvXh8*KnRVYq4l$?I|;F{^btu8Z8_SERlG1>7&eA{@mb>-}D*_t#;o{hZ^9$5*@^;{XU9=|m5 zjz|2oicrLK0DWK7Y;G;rw(21F#uD-QhW^gX3faMAbwlRqR}4+RsRaKzF@y%GpS-$H zMR3dO%CN^?n+qZio-Dup(t`4UL}ml~tJj_-j=Sde>%T1sQlhmg-N>lUykFjy;T^9E z(?!OOCP%7*BsTW;Qe1zXu`k3$_0si^quHVA+-J`OGRb@JZPYiKY+Ebn^VP~8h zXi~ivU&k%Nt4UMs;;GJT>OWspZPDaHBO}xNE5v^I(;l4)YVG}QvYwfaOi!XYSPDlC zcLtw`DE~Jp8J1UYVY*sar@*`?T^ifjo3~z(tk?Q^CkNYssj_ef=6Pm zKXCAT#c|_PGvni$2Px~kirNNUZ_~B^RCE8^O50%B$a}Cz@-6$%q|i8tTe51lvbWwu z^;92@+&!$y%J;{!kK)0e&#t>B|JW$JEo6_f6w@zv{O%aGzS`>b`BYU_u4z*hs)M)8 zqGuo7Sa&Q{%{}&IBlGR}sk*$@kR!fSf0L!e?YKMmv}sbk;!lj8Zd#ho4f(FO*@?3! 
zy_-UEPx+R3uc^wN5w|URpVPCSyei^nQD@skxxd~$sFdkW|0l0!WgNdV`>tM2>KeVo zWR(?WN^4cf^~3PY&bKxU2d925KjKs@RHCiP3_h0Kd&DN6_b?@of>km5?;S+~W6s>B ztJ!XE9==nLYi~DlIvB7dt-Q9hbNBAu0A2Q;d@YQreq`IFZP#batgiE)Hwvi95kE&~ zv@xKOX?N2UuLE0nr=jJjR0bDY_)^w3c0F_XUTS%YV>>NwO|Tsa%N+F&nhIj3S^Tao zY|_sbVxlxk$N5a;HMJxa<<+5zw2(&y86rL=nxzxAiExiw=Bv(~CPv+($6#heR?xqBP zyBb^|S#!O);Vrd~_~+>6l>uH(3zsI}Y?WzhJ1ybPv8&h5bcL)$O9@Ne8{Ml)-&SYQ zbjfvR@(1p#w>KJ(>Zn@37=3Bo^!e5>`4Cb)pa`mt@(j#7y?*yt-Dr3d5$2*(KuV2HO?8!GhJ!xwuv zt!_}s+GR$_T6MIo>}Xwi8*Iz;`qx$8x=T&FcdGSvYdoh4kN&LSE-}c>?j2h{TEkdQ zZ>+k7);<0Fh!Kx7OSIZ^(-d~E`0a77ao3OZmV|{osO6uG)E2DFkQ!(6uGp)#@q)P- z`yR(J%P(g?jIc&O%3yV7^L|3h7rKr(RfG5nU^qo}qKg(&9gJp{Fw6nN@hMp3VA{Px zKEul8`&XNnJFJ)X@8Fhi((x#(+VI1j(b+(Ox1m(A^{wRK^Rwy!x%;lpC))Q~h94cQ z9Nm1?aHoCKV)$0mUo>|me59T=6dN$E^=QRxF>La6vYo44Da}_(8w~Nu7Sc&(hG_s7rJjv-CDTS_PgZY%#3#W zo34>xhqT<1xnEbD8f#kI{MCb=LWLe|&UV6t0aa!bhUr9MNi0?BSATsN_3bhKMm1m+ z71n-eY<21VfkJUg-%qVE1H1J?+UqKNm^Htc`q~o;*Ajj@@!ulGCD5Ry^&Y%cE~S5F z=I=$e)S5%@8Z12?9*AC9GFS^-vu2*#^u}J2MdRoX_o0W+gB#jxS(|Im$3<=ZJbOMK z^hRRi0#uBSM7?0pwgX!4c3k6ZX~~aQevSubp0M95RPL*|{^a|t()z&@L!O?ylt)$^ z4h^1IViB15_jl{(52n7m*_TxAW?c&p{v!Rz()SPF=CGcp(zb%~P``ir^oh73937q>GXQ>@meZ)vqRbb#}DqT~}e(7dnYD#(HcR|xmJ0~YT@UGKLcdrYk06kn- zTzuQtLtFPirU%)?QEF!Yy!MSp${!2r{!z?TjRKtOvK9;{b4g5MA%IJaL zY-+HsG_|#5qPQeEJzw?+o-owwC^3k+vTRPP>FvFj%sj2E27A(SR&veqdj|*CC(Y!v zMNpPot%u`+>)1Nz0n*aaGQf2I&!;7Im>fDp#S*cXf58vs9R@iocD-Xx7I;Qr<_-?T zza7c0tb}3Incnr;XV~9=_)z_87&?^@y&N_8C6vzcg{g>d%)WpB{=ohq);bu}GSc-9*DfOjLf#VPNPGI`pQlT4BA?bdCFq|__eYGKG?h*k)B`*D{VUNZ*}hc`Ed}) zp~=NP*}@6^F?=;a7Yzw*X}p#z5SH=a!N#bWV$Ca&7r%*uNGrW>-^|{)Q!?B#GBTq` zF5CITL6Vcv)WTw)^Y7F%9v<4z6JvNwS9RyzZ7H6g!%9WmMVDwkY~2`56RZ*x60!~^ zq%qI7Qu0wRE-qf$caq07j19wAzP)?BRg@2jrXWqA>FFoRDaJ3VM=soIpJ?jnU_jw3 zEho1l;lf}?#|aJAXwWyv&+Ao$eM9e?H?&H3m8U&2!s3;-?!gs~&ii2|w0$nNa17AC zi%RJ~Pe}c-Hs}G|@s7F$G_uwY+Coo)%k|&OCZ{@J1R(W-G+XmzH zrDybo3*bai0%~I1lk>ALbWK|ZAD`pOF=ME<8JeP^u(Y(i(Vnlmx9HBhmE~nM{)CL2 zkKJotdN~|$yhZxe4zg8Hva_*KWSTr%|L?@fli&llT-`onxLR9gC5}9g3IE^sKT?Q_ 
ziZ;4dYX9^Eh39{CcIII{u5Y{lnv+U~3?Y?R8KR^R(wre=D6OQC21TI^$!b(9MM9Dx zk|7x%w&_pZm%Es9zYt=Q3_85vW)tzr)Jk*uuPCswH{+H3(DDdeLHu~(rM zBaDClec+k1XSKX5+dq6VnqS}m{}hh7rLFDO?c3!rZpm3$y@(u*H?Qd1e9dM|Kf(Tm zlbW27A*UKEMPWX~K$+1f1uo;3ZS}WAF2LQx!w?by$}_Ma!$AOS7boGFwOz#%d-LYa z`}gk+sN?cL-~Va0;+D)wm3)l84f2lqKU!TxRo^H{NIxA`7_#EVUYh$@gRITp)=72l zxX8-N2@c5VGyXGl-I7!FSrcgT1B7DLX{S!u%DTF`&v!;=MnB!Ssq9b;gn!g?w)UL< zU7ot&7E5&#dIeaQ@U}Ab6$WtdhoN;0^E;I8vr(f*i)CBTb=iOn2P(+hjvP_luyNzG z34I=ntZNaE%?(_wCz<4RLgIl&~E;b?P)C=j6#Ava+%} zW~IFHnZ%DpHp6gST!yLj?*7#4f;Dl-1jTdb&tpeBFzPcrJe=d8SJUb9mFi((2FQc+ z!0EM{sx+_uY+98mMvc$oRkoQeaq~Xq-eh#lw5FaaoyDeGv4Rs4LY!Cm<&tf>+3eXp zxEfBkle@cAzxuq*i0YQ(sN4cRe*9>xIMnawzWw{1xYDulC!5L(j?^O zO*8elJ4)r3^2RS1nw_0}|LM~|pFFXaH*$1$PxX^q|E_9!Z-=#OzwEeaUNO{2N>NeK zchAD!k)1ISoZ?qs{+X7snlIlK*pPFxASvTv`yhp=LdPr{+m?|=Qb!kz>~n}~v>y0V z^9JunE|t$lu0)qbT)FEMY0{B} zm3x;rNHjM6X`+f+_WS60h!wDa+!w32`DW3ZHRts=ot9z5*bfUzw>ZOS$lY3*BjaX? zEiIH!pFe;8>fJjAPnRQ}KUAU|H>(jgPYsexTskdb$Hm`jVjM&uxY999}mpEo~&_+%Otsn%(8+8_!EwAU*+Nh7G~)7k_i}Zn;0& zJke&AmWXS3m$*p|J0b}>c5H;0-r1!~7u?TH;`h(-dEdJb>qvz3$Y4ERDK3yM*c(*$ z<$Lu?y0||W;SSh@Q81$ivN0*eUxS0dz`UMpSL6$Xtlp%JO=?mq_`2&<>=Pa@Jrd{aM&w^!UVrm*(Q1x|*S)H+`=V)X_ zR-;EXL#;Oy6;%iu>>Y~$cxcA!*6H#A!UDo@*KSl$P^h{V+(~`hxCnae(2j}}8vKp4 zM3Zax!a52iH97&_Kk+Nry=YXSXJ$?~#cgNgjf(5B_#*Db&`R?NDS#@E7}52YZ;_^D z2AZk~Q4BS3Ar>RVYZnhp>j@3^y3oTAQjHu+9`%N|32uAi5>a?bXhoEfN%1j|{DSYk zF=Egap~NI|e~zTU2DxMxD$5oD1xaJgDh}cO1(SNi$MfaaY8oXkJc`jvs)DnTnwc3v zD?Uw^TWA0dEp$#I?h*8S!f0qFLTD0MKRIl+q;1@=W|b*Z2GOQ$vV8gQr(U|WxQPn} z8Z%0MfK2NUICf?$S88BUg>(uwSG%ztMyJ3h(LtyCGlEv~{KCR65`qUYu7^%EVQs`< z7x@3(rt2HkT3J~s&^0>@C%p`55?c?r7|W2!5PjAr>V0=|&|(>2f56w*S3Yi*sZbF- zgVF)gGKRRaTj=22rj2wC(n5mTcN;?0O!8-1R8{Dg8G!>Bw&t>(cb3(nMLT$aJ5Z7~9jxo7W9*pN_v^xDBKSR-+0MI; zS)aTuI%VJ^m<#O{0&rBuItpR~EjEZ^$BH|~j_kxq(Su+lUYH9N5Gl?`n=vBmcz4_B{@cYb_m!2G?-FWU%)UHG=YKdV9S9Be5_i0Y-M_%j z&O(T33gulG0<|ALs4?G0Ma;US5r`Q8aZFk~BpPBOy~RW=2#MlR{Hw}iyfrE+qV3Fr z^|7;3V1IUUbGDK6zpWka1f` 
zWzCr*JDGN2T1ja6m>jkv_wJ0%LS-_6#%8e}baZrHqnED+|7kdUDU-4ByHN0mg~um| z1GM+vrr}q#fH7I|Xim;SA*du8i5D$iyo=tdJ&5J(!ekjqsUQ%k27@f1`SD_L7qEh2 z&4+GYl-W4H-E-y2;kYod@q`-o(KBauGOS2VjRBHgFD}I{vbNsI{K`m7h*nPI}*Z4)b%iNbz!&f4ofVV~3d{>g2Nmn@kr zvYs zn%Tq<*4~%xD!s8aMoBx+*4atFc!C&PMoV2R1h1X+CTZ5v*7TaU|Hf;C$z4XH#LQ#t~6DJ_eW)o5rIJ&vN~B~)qX z!GaTR=v)^-yxLo3xqkM{nfXvxG~p4_&jaCGcv^LJ^&reB?RkW&Q^mXiX2sa#222)6 zp41j2;RMHz?~S?s{N;-n24QJq^JiV%ik!kd@$ucrO}{s11`36Hd`38aPX(=B3C*C= zw33JsOOf+N#i>r8K9r~kLgNiig(eIhTIH2!`7T0XmV`5kjJ+Bc#rU)!X0g(N63HNX zG!i?^N?DUSZglg|1!;YScVXMnn0W3X2V7tq@87$lqa}+#3w@}@G(7P@PMkbBpE!Z- zz#rvNqoh%{F(vuryfJ%GhKvX)^NtAp3B&=i%8taujEk+}8FPqTpuh@GighC_jmB8L zt$j+EEqGQW3d6>Zm7@(fH7(7MV*ho>o{BOc5Me+Tns>x5JM#X4P;4WUH`i z#0*9H&14V@6?|)hm&=dla_nCb%$UOig}b%|hH1Ys{d$<1nx>vSY3%Ac0ud-gEW^+R zs*W{RGKBp|s7hNPfh0y^o+F5+%_ScdG_Dgi8IaNz%=WF5%K+&f(zBUA$Nm`CDQT?hKVz2_oKHZYPN#Y$M@}kdaVP`|MFd-x)gp_lDppSz`7!pB)nW*%O5>Xc)h5Z|vn3%}iGUw{G>(;qeFJUJF zmjEdrUXT`Ux-`1(CFe_M1RVTltcHfH2v`VCgmUNfpI@dEnX{UmB|=abz|RmmG0a)v z!JC@aaO9F-5(|^*u?ktpJ|fuZNGF+y?baT``h~*hQrh%EYlOa1Owsqs(nHf@X!R>T zFu6}??1lJ#?ZFq5pm+Rk9N zJCmH*8auEE+6_5VavPjFvA%3l1Nf6{Y7E|aXOYnnkytKX+%3tHK@xJJNvxJ+gkek( z^lUDPUi1>oLP%c-k%TQwP_f=e0<(H`))SpXqGZBok=SNu>z%uB!I>#L7XI)a@iKu3}b{=q62MLjyQ2L^mHm9(^ynGN5f}Q94s97l;qYBAd>UufQnnZ4ZIFqG=L(JQ^ zuIW_*T;f?nn=}WfqCGmnX(ExWl#EQ7XTYMLkUSfaX{ct5We7p8wJm#Mg{LQKrFX9O zQ&ijvse4As0(`A3oK<+oT_iq}23yd?fBEL-xY#zqzW=2w97xMU5Ou7Gq~vOXgVB584PvE)Tse-Xl>rX=n%}+z`8^@p%-o?&b6dpY(a1??sgU8jq6|^&Nr!Kk{0M#QE6AO8;HR3JN1g%b0Vs!w1_O|S zS{?o8jzgS?97#<}y0>a*Xpq}Djc6hM4=sHsb{cktRI_-FKmVeaHm=`*gG_70M4yC+; zhE{g2GPyg^#IW3k!u=+yy)M~}?K3o@0B+_%`E#aPPL4JPlzuTyZNoCN*<9^>Tf;3~QQqMjkEG}iXg3a_98ddmkphL@nwc6U+ zd-v^oI90#+w9P5|FB{gUd~KLJ1<;QKP%AY>=>mtaT=;r$*bPimaay*lGs*R! 
zL5)~z7mSU(ROjN2U;8_>WOr;YoY>b->qKD1#;X^zCmkwv>-VVU%Riu)&UeztrqeyW znAbk$VVMIThgTLxmHmY!z{#y9t~|E-5AOhul5OMpbLZR-=8-T-saOOI88Sq#29eZI z#r{ig*QnfI9BPsL;OoA*QUiD1v?Fg24H*ZfZR#hcRbs`u4bS~iIeXe*rLT%7AFs$c zw3bs!gUnVA`PAmJg^-ACV^r=}w0#>QSJavs+J!@D9%tw7?G zBbS$VrR9gypD~IX+LzYkK8!e$p&B(dOT2EKCi^qk^co!qrD)UM{oom=_$ z%gSzyNt{4bK$F5B_W?{~`~%MA>qVb_<~3sB`spcZVVWmjpRpN}E|a`y6bF+fy>a#8 zevD2`QGM3R5=YfAWo_*~G-Ih)SXhW|ac9=qV&3rm_gu#}x8HQ*5$GT4nP74AYd^o1 zwiAnlI`aKD)k9TV>|OpvNdYdj%dOL`wR64z90HCT={vnk`@88sBe!mqcJ1<^!D{Q3FUzfHIcbZo-?*{teoeA*NJK1M9)xQeB@T?0zzX@W>w|Y+ehQ#<3sPhsq;%~1<7}$?^?JZltW@rC`otH zAtWP8a_Tp8g}b}K!i9Y(B7izcp?#CyidS)3wMq`*H_%58l#zPh@wJ3i*!Gl7S^8=$ z89w`5Q!UAuBae<=Wx7fjcR0&4W1gX52Yfw9QdNC@k1U&Fne?4hHUJCwQ|TKC9t{u#&WNO4h2 zH@ahkv6ku_qSV;Eb4OSEWZWL0L(r6Ie}iJ$Hg;#7-RycUw{JoXI02hoxt)Wql(p?-T*J2$^!carQA zC()otG0s`~8yO#|mYCB^eL`#n((<6c|6T{)gF5&)v26%2g>FBotElNDCwHPNJhnl>`y7a~%Rs=Y8n^ipnF#fnJ}!lyiWrL)j<<|6m}RL>@T@@@&(^G)$DJ zdk`coTC_-W<9Dh?uhv}VPSD{Pf5Ta;&mrJ^k$EwwgxH}PPs%q0%}_HUV*uR|&Xlph z6LfS`CQj^+Z!}7>Bu(yFLTQL5Ic7#qAg02#iB%;a#e6vuqv5s1+y)>RGF>0!?%yi5 zSg}_SLK4UzEfoT2I&~`gb`6lMg}pvQq4^vxsD!|l>8Z#_IfGt_yCq`~pHpAY08s_H ziZmPFynS0u6a^GaPmLjGnA)sUFxDsy> zS*4}=4N|8zjbY76$Bum2wAvvp)5o3C73+y?+YhG%5|Mo2g7xT_^+k*4&krU606y9s7gt4;WZSruL<%&KLE`(AY9VO;lLvXvWlnhH9coBDe-(Tmxnj z>C5xhn+98ed-E_;Q&X=2a-4t2;z!Uj=g}^QfuI~JCe0?Jc>H(^ z7*AgySF$)Uj7v!6k{>3YIweD<10a9h(m}6IpXySTci^`{y55!1B>z>z~!$UPmkNFDdg>I< zy&F;CY1)YfC@NmRcTW~1@dksjg6JD02kb;BCDo%xCyDF+6M$0r6%|7Ox`fm9L9qCG za*X~~F*Y9UY(p?ople0xJ2f^_v3z>G1htFiugt>5Os!slsxE$ zGqtpo7vt5*cKd><;s%4!R0A^F+e-2)qLb@?{WXZxL`2uxy`56)vDH0fWK2*275Y&^ zh67I_y0Nr-FJlj&#aGDWEH1gn?%lgzJ##y>JbW7mT~7DB?KXBbp8H^Zk$BMD97e~@ zEK;F=f0rkxelISOqZBXNGpGM(>`4(i>C&YEA`5?{45p_c7JuJ9MT!O3Zs6xt_iMD3 z+Y-B2bda3g=iKt}{k4OMu?>~cZEgMn7b1Tt-eGJcg^>Ua>R=>A4Y8`a`g*a#3J;Hb za4b=_Q&QSa(^GA_*6|1KRDJe38LW3X`;3zC^6J_1=9v?HGG5?SRaO47rUs`;z`N9p zxigpQPCvdG7nhWT6T|WfN93d4ot87MIH^Bq;6M-AJ}O~{)6%*F+tI7YJ(er3N&Q8D z2*&eZ##G0Qk)fZqSjQT3#Qkb4^SvIHE4nn1M=qxuH>*%)ac(@( 
zBB_+2R6e9k0T}h^(2#ss2z!C|&>#p%_?)+g)xRPeh z>Z5wQ-?sWcjWkic+H_|$&S^m1!lYC~xHJsW${oU^Hr_m{uZ}tnKg$j2F}0mZ`f-Nl zgHDxw8UZ$yknpVj=qf${&m=l@pN-pj#B5(i##p-?|F~z`aYxJ)MLj;7c#YcH&yjVb z&gK8_6141W-=R+o*VI4Jpz_Zwt4?{$+rSS4deZyoe4~itx~|+L!w5l#KmO?=v~A)U(J1ezaD;UMGQuvsXLM0)TUd*+PhP{fIXOcDe|~$}ZR%Vq z>XdJKiwWuWj3!Lad7c`p0VFK`fpkiflHqQU83cMm`C2k2LfCC$J;Z=Y0=((UMZ0e= zo5bJ+01*~c{2DAxTn5IAi39^aPcl{QRk^ukX@?;Gv%{uqJ_ zXB-l`?_C9$c@1<5B+qBfn(J&uV4AIg?ez|w*4Sm)M6*Q)yOkUej&ALX#r7M!913Xa z&P8?|UP&Fvi6}+XRJ*3H-KA*H&IetPmI#IP7Znwa%-O4Ym@->zU7G9xHyE-jgE?ew z+!}-JORTK6{QQ>pGn9ESER!ZgYk}>;W)a%*;P+jpTC-BvFa~#Z#5NuKcI3xQ{F#Uj zfLcV_i~U2~IY`jQ%F6l@hKQI(WK#@+!?7(l*gksKZ6P)n3L6V>3aq#IzzUcV9j27z zWHtPCu_smBVP`Uomh=xLJCmirX$G*&jgZ5w=3G<%65dhN_{9(d?Ds+0Cq$s6-Ukn@ zT(Lr$y79$;nA)WRYoXU3Gv%E$m}MWVu4 zgGPPvHVNT_k#XVp7&Ads; z$IWUgK@*j=UhufMgF*`CmiG1;XY(-!@LYT|jug3`Wis}o68~uPKmtylPsYf!QY%;Cu61qpL#(k>_f?80eODh)THzPRvZ6Th$2D{%<(Z0E zKPpfaG)+WmXk_!)2x){&7V~$8-~iuqm$09xs`%=v0j`n``TcPrHntZO2)|<}$P#>r zRVNu0i-{P*7Rt^_MM&NM%kp{n&+kMVn%dg+jsENPS9y9Ch%l9SOK^R~j{7t#^cLri zlL(y!ieQ9+a?pabWuqE6!kol9ols!p{i!ETJWSA~tTz?Hv)&_im`G|0OO1j+tM0=F4==A9Co^+TtbNcQS8zPhUm7agw1+Y>GKe3U99!)OxMOZ!WFPNW zO5{p!TIQE<7p78+So`kX+4X)(F%ejNvBg^1s#z2b4L6-JmG$P_SuDSF#bq`9{uYxb zkt~Ds9~?3cY)B}5`;Kix&N5hI$pzHXFPhu7JW6l4dCEZf{n|-y$#oNcTi&u}-MX7d zAp;N|ZPtq_>==L76zfI$uOcsnOx@JyHtoCMiGFTriC{Aa{e0Vqj|67q=hEfNRU9(C zyHBxLRh*`>f1XhsXf4nCk#e@?0j|A%{p;G=?wvY?+TU#y^fwA*dXK~qdWZwH(XtYO znG$h1Q@P)Z(s}yH@3$cy6~M{AT?zw;r!2H6f~LIrh~M0l9u z((f9!!*zle-ya&L%`_qO#|}S9Xqhy4dr6bRr7y34HvHo^rr*iZ-;5}7>kZ1icJMbz z_S@OLgs0qr;0N;XH3C|Nt|X@Ol{pM*mWPuaM475&0%T34A;T*!*=1NKHe@ENQJqBF z=V8K=zudLC7Z5RGS7D8CK>4t9h?DhxeZudQ4oO;%Yo}+{_Q|drl@nJ3IK?#ecm6|6 z?=PufXw+f2h4^=oAI1(dm+#Z3Q?P+%nZsW}$)H=&MK#UYY2OT*L)QO1>_2GilO@JM zMKw<3>6ETa{QPtnT=u7+p1*hzkea7kry}g2Qq1wS!P|11DnAusQQh62dL|o|H+EMM zC0v2^l7C!nt&ZYn+-F8~^?Z6nrtDUWc~j-k{3;*+d-3fW4OvXL6GRR&9id=!+NbKe zeZ-~tWjRV}pW2n|#$GJ1`Fef>M4Vp8aD-8gOgeL>H*BG>yu7@Us<^%>cZc5Fl(*PB 
zufN!<_20wfqm`y9lDbM_iV8?1|76Ch$RdZV_EVrvbW`}-B+FE74Y*SDy7UZ)-m|Bl z!0W+Rrai=Y65RQ2KR=NI7wHU<#OGq|Qb(476ROVhniH{mev7GE?<(ew8sZ#cCa^20 zeuL6{431d{Qh-&h9o*ZC!wb{l>i{EQd?zpr@(&S3W(Ky87X1z6KYtHUoi{%x4fl*P zyx;cSS`#N00kQs7|0wNc4Mw++#5`4djY{xgx3n9ngCua`zWZIQy){5nGLH@&sMozh`J`YF=}RnHG+G;@24@W60kG z5vd3Y+`5mpP;|Np)Ps4JmVthqwq6($o)vHJKhv#rs;1Ahcq&8`h|M#jNDS zK0^TGOu$X%W){3quUm!V`1Oipsn&oWL4$wO)Rbp5>#>rls~jDD92CBAdaH@oZ!k#@ z!mP#UFJ+qt5?1|a3H;d-+*H{vKm>&C{Em_Qe#>^CQ*9W>M~Ht3z8$^&_sxb+G{v&Rv&eDv#37VLk01+XQN z#NW0<>sGqA=O9VugC_6bsT<@;BNgZSDf~@5yZo543RtRfT# z6o0|r1fH0j>=IM@KJjN#y@U7k-o!+p2p;#`7Cz%rG5Hi6zSUW^PiTt>{4C&#r~Psxr$@dnI082~DnB#i`)jlDq4yzS{AtPV+P416$6$O^J z=dY}ubih95LC&~~88uuj^LaZ!{=~7vtO;F^2FUKzu(0fw-}>CT#4k_U7ma7e=EzV5 z=`M6(kdXNAzRPD5RdAcm!bO_eB|iYk0OU9r=cN==UccJEe#Zl=#f^O2F9vZh7=jUP zy)8peO>sarEi%EI+Lsfqc;>DVT7k@10kuAP_D7p}ll}UC{MRo{`Id;N{xg@C4Gpy$ z^N#Cxevx(()JWh;#T(mNTMt88WJ?ZP;UCeV_TvM^|Djy>HI-G#?xpa*{;D(PLD$j$ zUw><#o;vWso7rzh|CC51!+lL{e3v=-F4uEevz-4(#%qq(($Jisp{cuQytbZ>w%&NH z(VCj*hll?4_~n1RVWsyn57&+V_ct_!uXN!Js^S}bmbvQ9@K~^Xt&z9is%0Ll+>E?8 zNVK#k>u%7}8m~QC6Yb7Tvut>EpZ|Ha_3|}qJ-k;*EKSTbw8l=%nKb1izg8E&-mua~ z&-mBXn*7HV>nzgJ(bLw^)0(8Aslyc?j>}%odvyQfJ>F{;d3$?F%q_-SE>zYUJ7Mhj zG0N)3BbBv)h&8x*O=aV;%I3@c{GB{jEti~&_4_}+b Date: Fri, 28 May 2021 14:49:57 -0500 Subject: [PATCH 020/226] Added a new 'gemmlike' sandbox. Details: - Added a new sandbox called 'gemmlike', which implements sequential and multithreaded gemm in the style of gemmsup but also unconditionally employs packing. The purpose of this sandbox is to (1) avoid select abstractions, such as objects and control trees, in order to allow readers to better understand how a real-world implementation of high-performance gemm can be constructed; (2) provide a starting point for expert users who wish to build something that is gemm-like without "reinventing the wheel." Thanks to Jeff Diamond, Tze Meng Low, Nicholai Tukanov, and Devangi Parikh for requesting and inspiring this work. 
- The functions defined in this sandbox currently use the "bls_" prefix instead of "bli_" in order to avoid any symbol collisions in the main library. - The sandbox contains two variants, each of which implements gemm via a block-panel algorithm. The only difference between the two is that variant 1 calls the microkernel directly while variant 2 calls the microkernel indirectly, via a function wrapper, which allows the edge case handling to be abstracted away from the classic five loops. - This sandbox implementation utilizes the conventional gemm microkernel (not the skinny/unpacked gemmsup kernels). - Updated some typos in the comments of a few files in the main framework. --- frame/3/bli_l3_sup_packm_a.c | 2 +- frame/3/bli_l3_sup_packm_b.c | 8 +- frame/include/bli_genarray_macro_defs.h | 14 + frame/include/bli_obj_macro_defs.h | 4 +- sandbox/gemmlike/bli_gemmnat.c | 88 +++ sandbox/gemmlike/bli_sandbox.h | 56 ++ sandbox/gemmlike/bls_gemm.c | 304 +++++++++ sandbox/gemmlike/bls_gemm.h | 101 +++ sandbox/gemmlike/bls_gemm_bp_var1.c | 518 +++++++++++++++ sandbox/gemmlike/bls_gemm_bp_var2.c | 590 ++++++++++++++++++ sandbox/gemmlike/bls_gemm_var.h | 124 ++++ sandbox/gemmlike/bls_l3_packm_a.c | 328 ++++++++++ sandbox/gemmlike/bls_l3_packm_a.h | 122 ++++ sandbox/gemmlike/bls_l3_packm_b.c | 328 ++++++++++ sandbox/gemmlike/bls_l3_packm_b.h | 122 ++++ sandbox/gemmlike/bls_l3_packm_var.c | 198 ++++++ sandbox/gemmlike/bls_l3_packm_var.h | 63 ++ sandbox/gemmlike/thread/bls_l3_decor.h | 73 +++ sandbox/gemmlike/thread/bls_l3_decor_openmp.c | 138 ++++ sandbox/gemmlike/thread/bls_l3_decor_openmp.h | 44 ++ .../gemmlike/thread/bls_l3_decor_pthreads.c | 213 +++++++ .../gemmlike/thread/bls_l3_decor_pthreads.h | 47 ++ sandbox/gemmlike/thread/bls_l3_decor_single.c | 141 +++++ sandbox/gemmlike/thread/bls_l3_decor_single.h | 44 ++ sandbox/power10/bli_gemmnat.c | 9 +- 25 files changed, 3671 insertions(+), 8 deletions(-) create mode 100644 sandbox/gemmlike/bli_gemmnat.c create mode 100644 
sandbox/gemmlike/bli_sandbox.h create mode 100644 sandbox/gemmlike/bls_gemm.c create mode 100644 sandbox/gemmlike/bls_gemm.h create mode 100644 sandbox/gemmlike/bls_gemm_bp_var1.c create mode 100644 sandbox/gemmlike/bls_gemm_bp_var2.c create mode 100644 sandbox/gemmlike/bls_gemm_var.h create mode 100644 sandbox/gemmlike/bls_l3_packm_a.c create mode 100644 sandbox/gemmlike/bls_l3_packm_a.h create mode 100644 sandbox/gemmlike/bls_l3_packm_b.c create mode 100644 sandbox/gemmlike/bls_l3_packm_b.h create mode 100644 sandbox/gemmlike/bls_l3_packm_var.c create mode 100644 sandbox/gemmlike/bls_l3_packm_var.h create mode 100644 sandbox/gemmlike/thread/bls_l3_decor.h create mode 100644 sandbox/gemmlike/thread/bls_l3_decor_openmp.c create mode 100644 sandbox/gemmlike/thread/bls_l3_decor_openmp.h create mode 100644 sandbox/gemmlike/thread/bls_l3_decor_pthreads.c create mode 100644 sandbox/gemmlike/thread/bls_l3_decor_pthreads.h create mode 100644 sandbox/gemmlike/thread/bls_l3_decor_single.c create mode 100644 sandbox/gemmlike/thread/bls_l3_decor_single.h diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c index 13c6905e74..30f19ec4db 100644 --- a/frame/3/bli_l3_sup_packm_a.c +++ b/frame/3/bli_l3_sup_packm_a.c @@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \ } \ else /* if ( will_pack == TRUE ) */ \ { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional + /* NOTE: This "rounding up" of the last upanel is actually optional for the rrc/crc cases, but absolutely necessary for the other cases since we NEED that last micropanel to have the same ldim (cs_p) as the other micropanels. Why? 
So that millikernels can use the same diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c index 024ad21794..07b05d61a1 100644 --- a/frame/3/bli_l3_sup_packm_b.c +++ b/frame/3/bli_l3_sup_packm_b.c @@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \ } \ else /* if ( will_pack == TRUE ) */ \ { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional + /* NOTE: This "rounding up" of the last upanel is actually optional for the rrc/crc cases, but absolutely necessary for the other cases since we NEED that last micropanel to have the same ldim (cs_p) as the other micropanels. Why? So that millikernels can use the same @@ -280,15 +280,15 @@ void PASTEMAC(ch,opname) \ } \ else \ { \ - /* All other stor3_t ids: pack A to column-stored row-panels. */ \ + /* All other stor3_t ids: pack B to row-stored column-panels. */ \ *rs_p = nr; \ *cs_p = 1; \ \ *pd_p = nr; \ *ps_p = k * nr; \ \ - /* Set the schema to "packed row panels" to indicate packing to - conventional column-stored row panels. */ \ + /* Set the schema to "packed column panels" to indicate packing to + conventional row-stored column panels. 
*/ \ *schema = BLIS_PACKED_COL_PANELS; \ } \ \ diff --git a/frame/include/bli_genarray_macro_defs.h b/frame/include/bli_genarray_macro_defs.h index 23cee1064b..eb932c5582 100644 --- a/frame/include/bli_genarray_macro_defs.h +++ b/frame/include/bli_genarray_macro_defs.h @@ -128,6 +128,20 @@ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ +// -- One-operand macro (with custom prefix) -- + +#define GENARRAY_PREF(arrayname,prefix,op) \ +\ +arrayname[BLIS_NUM_FP_TYPES] = \ +{ \ + PASTECH2(prefix,s,op), \ + PASTECH2(prefix,c,op), \ + PASTECH2(prefix,d,op), \ + PASTECH2(prefix,z,op) \ +} + + + // -- Two-operand macros -- diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 2b3ac35ae0..855384425e 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1190,7 +1190,7 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer -// (e.g. BLIS_OBJECT_PREINITIALIZER) +// (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. @@ -1223,7 +1223,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t } // Finish the initialization started by the 1x1-specific static initializer -// (e.g. BLIS_OBJECT_PREINITIALIZER_1X1) +// (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. diff --git a/sandbox/gemmlike/bli_gemmnat.c b/sandbox/gemmlike/bli_gemmnat.c new file mode 100644 index 0000000000..37fb701859 --- /dev/null +++ b/sandbox/gemmlike/bli_gemmnat.c @@ -0,0 +1,88 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the +// entry point to any sandbox implementation. + +// NOTE: This function is implemented identically to the function that it +// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are +// forgoing the option of customizing the implementations that underlie +// bli_gemm() and bli_?gemm(). 
Any new code defined in this sandbox +// directory, however, will be included in the BLIS. + +#include "blis.h" + +#undef GENFRONT +#define GENFRONT( opname, cname, imeth ) \ +\ +void PASTEMAC(opname,imeth) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ +\ + /* A switch to easily toggle whether we use the sandbox implementation + of bls_gemm() as the implementation for bli_gemm(). (This allows for + easy testing of bls_gemm() via the testsuite.) */ \ + if ( 1 ) \ + { \ + bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm ); \ + return; \ + } \ +\ + bli_init_once(); \ +\ + /* Obtain a valid (native) context from the gks if necessary. */ \ + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ +\ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ + rntm_t rntm_l; \ + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ +\ + /* Invoke the operation's front end. */ \ + PASTEMAC(opname,_front) \ + ( \ + alpha, a, b, beta, c, cntx, rntm, NULL \ + ); \ +} + +GENFRONT( gemm, gemm, nat ) diff --git a/sandbox/gemmlike/bli_sandbox.h b/sandbox/gemmlike/bli_sandbox.h new file mode 100644 index 0000000000..d6e6522e8c --- /dev/null +++ b/sandbox/gemmlike/bli_sandbox.h @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of copyright holder(s) nor the names + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SANDBOX_H +#define BLIS_SANDBOX_H + +// NOTE: This header is the only header required to be present in the sandbox +// implementation directory. + +// This header should contain (or #include) any definitions that must be +// folded into blis.h. Typically, it will remain empty since any header +// definitions specific to the sandbox implementation will not need to be +// made available to applications (or the framework) during compilation. 
+ +#include "bls_gemm.h" +#include "bls_gemm_var.h" + +#include "bls_l3_packm_a.h" +#include "bls_l3_packm_b.h" +#include "bls_l3_packm_var.h" + +#include "bls_l3_decor.h" + + +#endif diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c new file mode 100644 index 0000000000..3e4c9b2a33 --- /dev/null +++ b/sandbox/gemmlike/bls_gemm.c @@ -0,0 +1,304 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// +// -- Define the gemm-like operation's object API ------------------------------ +// + +void bls_gemm + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bls_gemm_ex + ( + alpha, + a, + b, + beta, + c, + NULL, + NULL + ); +} + +void bls_gemm_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // -- bli_gemmnat() -------------------------------------------------------- + + // Obtain a valid (native) context from the gks if necessary. + // NOTE: This must be done before calling the _check() function, since + // that function assumes the context pointer is valid. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // -- bli_gemm_front() ----------------------------------------------------- + + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + { + bli_gemm_check( alpha, a, b, beta, c, cntx ); + } + + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + + // Induce a transposition of A if it has its transposition property set. + // Then clear the transposition bit in the object. 
+ if ( bli_obj_has_trans( &a_local ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } + + // Induce a transposition of B if it has its transposition property set. + // Then clear the transposition bit in the object. + if ( bli_obj_has_trans( &b_local ) ) + { + bli_obj_induce_trans( &b_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local ); + } + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + + // NOTE: This is probably not needed within the sandbox. + // We must also swap the pack schemas, which were set by bli_gemm_md() + // or the inlined code above. + //bli_obj_swap_pack_schemas( &a_local, &b_local ); + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + // Spawn threads (if applicable), where bls_gemm_int() is the thread entry + // point function for each thread. This also begins the process of creating + // the thrinfo_t tree, which contains thread communicators. 
+ bls_l3_thread_decorator + ( + bls_gemm_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + rntm + ); +} + +// +// -- Define the gemm-like operation's thread entry point ---------------------- +// + +void bls_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + // In this function, we choose the gemm implementation that is executed + // on each thread. + +#if 1 + // Call the block-panel algorithm that calls the kernel directly, which + // exposes edge-case handling. + bls_gemm_bp_var1 + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm, + thread + ); +#else + // Call the block-panel algorithm that calls the kernel indirectly via a + // wrapper function, which hides edge-case handling. + bls_gemm_bp_var2 + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm, + thread + ); +#endif +} + +// +// -- Define the gemm-like operation's typed API ------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + bli_init_once(); \ +\ + /* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on + the macro parameter 'ch' (e.g. s, d, etc). */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + /* Adjust the dimensions of matrices A and B according to the transa and + transb parameters. */ \ + bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ + bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ +\ + /* Create bufferless scalar objects and attach the provided scalar pointers + to those scalar objects. 
*/ \ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + /* Create bufferless matrix objects and attach the provided matrix pointers + to those matrix objects. */ \ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + /* Set the transposition/conjugation properties of the objects for matrices + A and B. */ \ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + /* Call the object interface. */ \ + PASTECH(bls_,opname) \ + ( \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co \ + ); \ +} + +//INSERT_GENTFUNC_BASIC0( gemm ) +GENTFUNC( float, s, gemm ) +GENTFUNC( double, d, gemm ) +GENTFUNC( scomplex, c, gemm ) +GENTFUNC( dcomplex, z, gemm ) + diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h new file mode 100644 index 0000000000..b296ac1c0f --- /dev/null +++ b/sandbox/gemmlike/bls_gemm.h @@ -0,0 +1,101 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// -- Prototype the gemm-like operation's object API --------------------------- +// + +void bls_gemm + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +void bls_gemm_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// +// -- Prototype the gemm-like operation's thread entry point ------------------- +// + +void bls_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// +// -- Prototype the gemm-like operation's typed API ---------------------------- +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +//INSERT_GENTPROT_BASIC0( gemm ) +GENTPROT( float, s, gemm ) +GENTPROT( double, d, gemm ) +GENTPROT( scomplex, c, gemm ) +GENTPROT( dcomplex, z, gemm ) + diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c 
b/sandbox/gemmlike/bls_gemm_bp_var1.c new file mode 100644 index 0000000000..ae695ce34f --- /dev/null +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -0,0 +1,518 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+// Signature shared by the typed bls_?gemm_bp_var1() functions that ftypes[] dispatches to.
+typedef void (*FUNCPTR_T)
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
+       void*      restrict alpha,
+       void*      restrict a, inc_t rs_a, inc_t cs_a,
+       void*      restrict b, inc_t rs_b, inc_t cs_b,
+       void*      restrict beta,
+       void*      restrict c, inc_t rs_c, inc_t cs_c,
+       cntx_t*    restrict cntx,
+       rntm_t*    restrict rntm,
+       thrinfo_t* restrict thread
+     );
+
+//
+// -- gemm-like block-panel algorithm (object interface) -----------------------
+// Unpacks the obj_t operands into raw buffers, strides, and scalars, then dispatches on the datatype of c.
+
+// Define a function pointer array named ftypes and initialize its contents with
+// the addresses of the typed functions defined below, bls_?gemm_bp_var1().
+static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var1);
+
+void bls_gemm_bp_var1
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     )
+{
+    const num_t  dt        = bli_obj_dt( c );
+
+    const conj_t conja     = bli_obj_conj_status( a );
+    const conj_t conjb     = bli_obj_conj_status( b );
+
+    const dim_t  m         = bli_obj_length( c );
+    const dim_t  n         = bli_obj_width( c );
+    const dim_t  k         = bli_obj_width( a );
+
+    void* restrict buf_a = bli_obj_buffer_at_off( a );
+    const inc_t  rs_a      = bli_obj_row_stride( a );
+    const inc_t  cs_a      = bli_obj_col_stride( a );
+
+    void* restrict buf_b = bli_obj_buffer_at_off( b );
+    const inc_t  rs_b      = bli_obj_row_stride( b );
+    const inc_t  cs_b      = bli_obj_col_stride( b );
+
+    void* restrict buf_c = bli_obj_buffer_at_off( c );
+    const inc_t  rs_c      = bli_obj_row_stride( c );
+    const inc_t  cs_c      = bli_obj_col_stride( c );
+
+    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+
+    // Index into the function pointer array to extract the correct
+    // typed function pointer based on the chosen datatype.
+    FUNCPTR_T f = ftypes[dt];
+
+    // Invoke the typed function with the values queried above.
+    f
+    (
+      conja,
+      conjb,
+      m,
+      n,
+      k,
+      buf_alpha,
+      buf_a, rs_a, cs_a,
+      buf_b, rs_b, cs_b,
+      buf_beta,
+      buf_c, rs_c, cs_c,
+      cntx,
+      rntm,
+      thread
+    );
+}
+
+//
+// -- gemm-like block-panel algorithm (typed interface) ------------------------
+// GENTFUNC is instantiated below for the s, d, c, and z datatypes.
+
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(bls_,ch,varname) \
+     ( \
+       conj_t              conja, \
+       conj_t              conjb, \
+       dim_t               m, \
+       dim_t               n, \
+       dim_t               k, \
+       void*      restrict alpha, \
+       void*      restrict a, inc_t rs_a, inc_t cs_a, \
+       void*      restrict b, inc_t rs_b, inc_t cs_b, \
+       void*      restrict beta, \
+       void*      restrict c, inc_t rs_c, inc_t cs_c, \
+       cntx_t*    restrict cntx, \
+       rntm_t*    restrict rntm, \
+       thrinfo_t* restrict thread \
+     ) \
+{ \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    /* Query the context for the default NR, MR, NC, MC, and KC blocksizes. */ \
+    const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+    const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+    const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
+    const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
+    const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+    /* Query the context for the microkernel address and cast it to its
+       function pointer type. */ \
+    PASTECH(ch,gemm_ukr_ft) \
+               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+    /* Temporary C buffer for edge cases. Note that the strides of this
+       temporary buffer are set according to the storage preference of the
+       microkernel (col_pref), not the storage of C. For example, if the
+       microkernel prefers columns, ct is column-stored (rs_ct = 1, cs_ct = MR). */ \
+    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
+              / sizeof( ctype ) ] \
+              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+    const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
+    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
+\
+    /* Compute partitioning step values for each matrix of each loop. */ \
+    const inc_t jcstep_c = cs_c; \
+    const inc_t jcstep_b = cs_b; \
+\
+    const inc_t pcstep_a = cs_a; \
+    const inc_t pcstep_b = rs_b; \
+\
+    const inc_t icstep_c = rs_c; \
+    const inc_t icstep_a = rs_a; \
+\
+    const inc_t jrstep_c = cs_c * NR; \
+\
+    const inc_t irstep_c = rs_c * MR; \
+\
+    ctype* restrict a_00 = a; \
+    ctype* restrict b_00 = b; \
+    ctype* restrict c_00 = c; \
+    ctype* restrict alpha_cast = alpha; \
+    ctype* restrict beta_cast = beta; \
+\
+    /* Make local copies of the scalars to prevent any unnecessary sharing of
+       cache lines between the cores' caches. */ \
+    ctype alpha_local = *alpha_cast; \
+    ctype beta_local = *beta_cast; \
+    ctype one_local = *PASTEMAC(ch,1); \
+    ctype zero_local = *PASTEMAC(ch,0); \
+\
+    auxinfo_t aux; \
+\
+    /* Initialize a mem_t entry for A and B. Strictly speaking, this is only
+       needed for the matrix we will be packing (if any), but we do it
+       unconditionally to be safe. */ \
+    mem_t mem_a = BLIS_MEM_INITIALIZER; \
+    mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+    /* Define an array of bszid_t ids, which will act as our substitute for
+       the cntl_t tree. */ \
+    bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
+                          BLIS_KC,      /* 4th loop */ \
+                          BLIS_NO_PART, /* pack B */ \
+                          BLIS_MC,      /* 3rd loop */ \
+                          BLIS_NO_PART, /* pack A */ \
+                          BLIS_NR,      /* 2nd loop */ \
+                          BLIS_MR,      /* 1st loop */ \
+                          BLIS_KR };    /* microkernel loop */ \
+\
+    bszid_t* restrict bszids_jc = &bszids[0]; \
+    bszid_t* restrict bszids_pc = &bszids[1]; \
+    /*bszid_t* restrict bszids_pb = &bszids[2];*/ \
+    bszid_t* restrict bszids_ic = &bszids[3]; \
+    /*bszid_t* restrict bszids_pa = &bszids[4];*/ \
+    bszid_t* restrict bszids_jr = &bszids[5]; \
+    /*bszid_t* restrict bszids_ir = &bszids[6];*/ \
+\
+    thrinfo_t* restrict thread_jc = NULL; \
+    thrinfo_t* restrict thread_pc = NULL; \
+    thrinfo_t* restrict thread_pb = NULL; \
+    thrinfo_t* restrict thread_ic = NULL; \
+    thrinfo_t* restrict thread_pa = NULL; \
+    thrinfo_t* restrict thread_jr = NULL; \
+    thrinfo_t* restrict thread_ir = NULL; \
+\
+    /* Identify the current thrinfo_t node and then grow the tree. */ \
+    thread_jc = thread; \
+    bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+    /* Compute the JC loop thread range for the current thread. */ \
+    dim_t jc_start, jc_end; \
+    bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
+    const dim_t n_local = jc_end - jc_start; \
+\
+    /* Compute number of primary and leftover components of the JC loop. */ \
+    /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
+    const dim_t jc_left = n_local % NC; \
+\
+    /* Loop over the n dimension (NC rows/columns at a time). */ \
+    for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
+    { \
+        /* Calculate the thread's current JC block dimension. */ \
+        const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
+\
+        ctype* restrict b_jc = b_00 + jj * jcstep_b; \
+        ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+        /* Identify the current thrinfo_t node and then grow the tree. */ \
+        thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+        bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
+\
+        /* The PC loop range is not computed via bli_thread_range_sub(); every thread spans the full range [0,k). */ \
+        const dim_t pc_start = 0, pc_end = k; \
+        const dim_t k_local = k; \
+\
+        /* Compute number of primary and leftover components of the PC loop. */ \
+        /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
+        const dim_t pc_left = k_local % KC; \
+\
+        /* Loop over the k dimension (KC rows/columns at a time). */ \
+        for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
+        { \
+            /* Calculate the thread's current PC block dimension. */ \
+            const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
+\
+            ctype* restrict a_pc = a_00 + pp * pcstep_a; \
+            ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+\
+            /* Only apply beta to the first iteration of the pc loop; later iterations use a scalar of one. */ \
+            ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+\
+            ctype* b_use; \
+            inc_t rs_b_use, cs_b_use, ps_b_use; \
+\
+            /* Identify the current thrinfo_t node. Note that the thrinfo_t
+               node will have already been created by a previous call to
+               bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+               cause the tree to grow by two (i.e. to the next bszid that is
+               a normal bszid_t value). */ \
+            thread_pb = bli_thrinfo_sub_node( thread_pc ); \
+            /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
+\
+            /* Determine the packing buffer and related parameters for matrix
+               B. Then call the packm implementation. */ \
+            PASTECH2(bls_,ch,packm_b) \
+            ( \
+              conjb, \
+              KC,     NC, \
+              kc_cur, nc_cur, NR, \
+              &one_local, \
+              b_pc,   rs_b,      cs_b, \
+              &b_use, &rs_b_use, &cs_b_use, \
+                                 &ps_b_use, \
+              cntx, \
+              rntm, \
+              &mem_b, \
+              thread_pb \
+            ); \
+\
+            /* Alias b_use so that it's clear this is our current block of
+               matrix B. */ \
+            ctype* restrict b_pc_use = b_use; \
+\
+            /* Identify the current thrinfo_t node and then grow the tree. */ \
+            thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+            bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+            /* Compute the IC loop thread range for the current thread. */ \
+            dim_t ic_start, ic_end; \
+            bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
+            const dim_t m_local = ic_end - ic_start; \
+\
+            /* Compute number of primary and leftover components of the IC loop. */ \
+            /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
+            const dim_t ic_left = m_local % MC; \
+\
+            /* Loop over the m dimension (MC rows at a time). */ \
+            for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
+            { \
+                /* Calculate the thread's current IC block dimension. */ \
+                const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
+\
+                ctype* restrict a_ic = a_pc + ii * icstep_a; \
+                ctype* restrict c_ic = c_jc + ii * icstep_c; \
+\
+                ctype* a_use; \
+                inc_t rs_a_use, cs_a_use, ps_a_use; \
+\
+                /* Identify the current thrinfo_t node. Note that the thrinfo_t
+                   node will have already been created by a previous call to
+                   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+                   cause the tree to grow by two (i.e. to the next bszid that is
+                   a normal bszid_t value). */ \
+                thread_pa = bli_thrinfo_sub_node( thread_ic ); \
+                /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
+\
+                /* Determine the packing buffer and related parameters for matrix
+                   A. Then call the packm implementation. */ \
+                PASTECH2(bls_,ch,packm_a) \
+                ( \
+                  conja, \
+                  MC,     KC, \
+                  mc_cur, kc_cur, MR, \
+                  &one_local, \
+                  a_ic,   rs_a,      cs_a, \
+                  &a_use, &rs_a_use, &cs_a_use, \
+                                     &ps_a_use, \
+                  cntx, \
+                  rntm, \
+                  &mem_a, \
+                  thread_pa \
+                ); \
+\
+                /* Alias a_use so that it's clear this is our current block of
+                   matrix A. */ \
+                ctype* restrict a_ic_use = a_use; \
+\
+                /* Identify the current thrinfo_t node and then grow the tree. */ \
+                thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+                bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+                /* Query the number of threads and thread ids for the JR loop.
+                   NOTE: These values are only needed when computing the next
+                   micropanel of B. */ \
+                const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
+                const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+\
+                /* Compute number of primary and leftover components of the JR loop. */ \
+                dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+                dim_t jr_left =   nc_cur % NR; \
+\
+                /* Compute the JR loop thread range for the current thread. */ \
+                dim_t jr_start, jr_end; \
+                bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+                /* Loop over the n dimension (NR columns at a time). */ \
+                for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
+                { \
+                    const dim_t nr_cur \
+                    = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
+\
+                    ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
+                    ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+\
+                    /* Assume for now that the next panel of B is the current panel
+                       of B. */ \
+                    ctype* restrict b2 = b_jr; \
+\
+                    /* Identify the current thrinfo_t node. */ \
+                    thread_ir = bli_thrinfo_sub_node( thread_jr ); \
+\
+                    /* Query the number of threads and thread ids for the IR loop.
+                       NOTE: These values are only needed when computing the next
+                       micropanel of A. */ \
+                    const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
+                    const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+\
+                    /* Compute number of primary and leftover components of the IR loop. */ \
+                    dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
+                    dim_t ir_left =   mc_cur % MR; \
+\
+                    /* Compute the IR loop thread range for the current thread. */ \
+                    dim_t ir_start, ir_end; \
+                    bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
+\
+                    /* Loop over the m dimension (MR rows at a time). */ \
+                    for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
+                    { \
+                        const dim_t mr_cur \
+                        = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
+\
+                        ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
+                        ctype* restrict c_ir = c_jr     + i * irstep_c; \
+\
+                        ctype* restrict a2; \
+\
+                        /* Compute the addresses of the next micropanels of A and B. */ \
+                        a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
+                        if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+                        { \
+                            a2 = a_ic_use; \
+                            b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
+                            if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+                                b2 = b_pc_use; \
+                        } \
+\
+                        /* Save the addresses of next micropanels of A and B to the
+                           auxinfo_t object. */ \
+                        bli_auxinfo_set_next_a( a2, &aux ); \
+                        bli_auxinfo_set_next_b( b2, &aux ); \
+\
+                        /* Handle interior and edge cases separately. */ \
+                        if ( mr_cur == MR && nr_cur == NR ) \
+                        { \
+                            /* Invoke the gemm microkernel. */ \
+                            gemm_ukr \
+                            ( \
+                              kc_cur, \
+                              &alpha_local, \
+                              a_ir, \
+                              b_jr, \
+                              beta_use, \
+                              c_ir, rs_c, cs_c, \
+                              &aux, \
+                              cntx \
+                            ); \
+                        } \
+                        else \
+                        { \
+                            /* Invoke the gemm microkernel, writing to the temporary buffer ct with a beta of zero. */ \
+                            gemm_ukr \
+                            ( \
+                              kc_cur, \
+                              &alpha_local, \
+                              a_ir, \
+                              b_jr, \
+                              &zero_local, \
+                              ct, rs_ct, cs_ct, \
+                              &aux, \
+                              cntx \
+                            ); \
+\
+                            /* Scale the mr_cur x nr_cur edge block of C by beta_use and add the result held in ct. */ \
+                            PASTEMAC(ch,xpbys_mxn) \
+                            ( \
+                              mr_cur, \
+                              nr_cur, \
+                              ct, rs_ct, cs_ct, \
+                              beta_use, \
+                              c_ir, rs_c, cs_c \
+                            ); \
+                        } \
+                    } \
+                } \
+            } \
+\
+            /* This barrier is needed to prevent threads from starting to pack
+               the next row panel of B before the current row panel is fully
+               computed upon. */ \
+            bli_thread_barrier( thread_pb ); \
+        } \
+    } \
+\
+    /* Release any memory that was acquired for packing matrices A and B. */ \
+    PASTECH2(bls_,ch,packm_finalize_mem_a) \
+    ( \
+      rntm, \
+      &mem_a, \
+      thread_pa \
+    ); \
+    PASTECH2(bls_,ch,packm_finalize_mem_b) \
+    ( \
+      rntm, \
+      &mem_b, \
+      thread_pb \
+    ); \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
+*/ \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemm_bp_var1 )
+GENTFUNC( float,    s, gemm_bp_var1 )
+GENTFUNC( double,   d, gemm_bp_var1 )
+GENTFUNC( scomplex, c, gemm_bp_var1 )
+GENTFUNC( dcomplex, z, gemm_bp_var1 )
+
diff --git a/sandbox/gemmlike/bls_gemm_bp_var2.c b/sandbox/gemmlike/bls_gemm_bp_var2.c
new file mode 100644
index 0000000000..957cd57944
--- /dev/null
+++ b/sandbox/gemmlike/bls_gemm_bp_var2.c
@@ -0,0 +1,590 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemm_fp
+// Signature shared by the typed bls_?gemm_bp_var2() functions that ftypes[] dispatches to.
+typedef void (*FUNCPTR_T)
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m,
+       dim_t               n,
+       dim_t               k,
+       void*      restrict alpha,
+       void*      restrict a, inc_t rs_a, inc_t cs_a,
+       void*      restrict b, inc_t rs_b, inc_t cs_b,
+       void*      restrict beta,
+       void*      restrict c, inc_t rs_c, inc_t cs_c,
+       cntx_t*    restrict cntx,
+       rntm_t*    restrict rntm,
+       thrinfo_t* restrict thread
+     );
+
+//
+// -- gemm-like block-panel algorithm (object interface) -----------------------
+//
+
+// Define a function pointer array named ftypes and initialize its contents with
+// the addresses of the typed functions defined below, bls_?gemm_bp_var2().
+static FUNCPTR_T GENARRAY_PREF(ftypes,bls_,gemm_bp_var2);
+
+void bls_gemm_bp_var2
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     )
+{
+    const num_t  dt        = bli_obj_dt( c );
+
+    const conj_t conja     = bli_obj_conj_status( a );
+    const conj_t conjb     = bli_obj_conj_status( b );
+
+    const dim_t  m         = bli_obj_length( c );
+    const dim_t  n         = bli_obj_width( c );
+    const dim_t  k         = bli_obj_width( a );
+
+    void* restrict buf_a = bli_obj_buffer_at_off( a );
+    const inc_t  rs_a      = bli_obj_row_stride( a );
+    const inc_t  cs_a      = bli_obj_col_stride( a );
+
+    void* restrict buf_b = bli_obj_buffer_at_off( b );
+    const inc_t  rs_b      = bli_obj_row_stride( b );
+    const inc_t  cs_b      = bli_obj_col_stride( b );
+
+    void* restrict buf_c = bli_obj_buffer_at_off( c );
+    const inc_t  rs_c      = bli_obj_row_stride( c );
+    const inc_t  cs_c      = bli_obj_col_stride( c );
+
+    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
+
+    // Index into the function pointer array to extract the correct
+    // typed function pointer based on the chosen datatype.
+    FUNCPTR_T f = ftypes[dt];
+
+    // Invoke the typed function with the values queried above.
+    f
+    (
+      conja,
+      conjb,
+      m,
+      n,
+      k,
+      buf_alpha,
+      buf_a, rs_a, cs_a,
+      buf_b, rs_b, cs_b,
+      buf_beta,
+      buf_c, rs_c, cs_c,
+      cntx,
+      rntm,
+      thread
+    );
+}
+
+//
+// -- gemm-like block-panel algorithm (typed interface) ------------------------
+// GENTFUNC is instantiated below for the s, d, c, and z datatypes.
+
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(bls_,ch,varname) \
+     ( \
+       conj_t              conja, \
+       conj_t              conjb, \
+       dim_t               m, \
+       dim_t               n, \
+       dim_t               k, \
+       void*      restrict alpha, \
+       void*      restrict a, inc_t rs_a, inc_t cs_a, \
+       void*      restrict b, inc_t rs_b, inc_t cs_b, \
+       void*      restrict beta, \
+       void*      restrict c, inc_t rs_c, inc_t cs_c, \
+       cntx_t*    restrict cntx, \
+       rntm_t*    restrict rntm, \
+       thrinfo_t* restrict thread \
+     ) \
+{ \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    /* Query the context for the default NR, MR, NC, MC, and KC blocksizes. */ \
+    const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
+    const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
+    const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
+    const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
+    const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
+\
+    /* The microkernel address is not queried here (the code below is disabled);
+       the bls_?gemm_kernel() wrapper defined later in this file queries it. */ \
+    /*
+    PASTECH(ch,gemm_ukr_ft) \
+               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+    */ \
+\
+    /* Temporary C buffer for edge cases. NOTE: This buffer is not declared in
+       this function (the code below is disabled). The bls_?gemm_kernel()
+       wrapper defined later in this file declares its own ct buffer and
+       handles the edge cases. */ \
+    /*
+    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
+              / sizeof( ctype ) ] \
+              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+    const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
+    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
+    */ \
+\
+    /* Compute partitioning step values for each matrix of each loop. */ \
+    const inc_t jcstep_c = cs_c; \
+    const inc_t jcstep_b = cs_b; \
+\
+    const inc_t pcstep_a = cs_a; \
+    const inc_t pcstep_b = rs_b; \
+\
+    const inc_t icstep_c = rs_c; \
+    const inc_t icstep_a = rs_a; \
+\
+    const inc_t jrstep_c = cs_c * NR; \
+\
+    const inc_t irstep_c = rs_c * MR; \
+\
+    ctype* restrict a_00 = a; \
+    ctype* restrict b_00 = b; \
+    ctype* restrict c_00 = c; \
+    ctype* restrict alpha_cast = alpha; \
+    ctype* restrict beta_cast = beta; \
+\
+    /* Make local copies of the scalars to prevent any unnecessary sharing of
+       cache lines between the cores' caches. */ \
+    ctype alpha_local = *alpha_cast; \
+    ctype beta_local = *beta_cast; \
+    ctype one_local = *PASTEMAC(ch,1); \
+    /*ctype zero_local = *PASTEMAC(ch,0);*/ \
+\
+    auxinfo_t aux; \
+\
+    /* Initialize a mem_t entry for A and B. Strictly speaking, this is only
+       needed for the matrix we will be packing (if any), but we do it
+       unconditionally to be safe. */ \
+    mem_t mem_a = BLIS_MEM_INITIALIZER; \
+    mem_t mem_b = BLIS_MEM_INITIALIZER; \
+\
+    /* Define an array of bszid_t ids, which will act as our substitute for
+       the cntl_t tree. */ \
+    bszid_t bszids[8] = { BLIS_NC,      /* 5th loop */ \
+                          BLIS_KC,      /* 4th loop */ \
+                          BLIS_NO_PART, /* pack B */ \
+                          BLIS_MC,      /* 3rd loop */ \
+                          BLIS_NO_PART, /* pack A */ \
+                          BLIS_NR,      /* 2nd loop */ \
+                          BLIS_MR,      /* 1st loop */ \
+                          BLIS_KR };    /* microkernel loop */ \
+\
+    bszid_t* restrict bszids_jc = &bszids[0]; \
+    bszid_t* restrict bszids_pc = &bszids[1]; \
+    /*bszid_t* restrict bszids_pb = &bszids[2];*/ \
+    bszid_t* restrict bszids_ic = &bszids[3]; \
+    /*bszid_t* restrict bszids_pa = &bszids[4];*/ \
+    bszid_t* restrict bszids_jr = &bszids[5]; \
+    /*bszid_t* restrict bszids_ir = &bszids[6];*/ \
+\
+    thrinfo_t* restrict thread_jc = NULL; \
+    thrinfo_t* restrict thread_pc = NULL; \
+    thrinfo_t* restrict thread_pb = NULL; \
+    thrinfo_t* restrict thread_ic = NULL; \
+    thrinfo_t* restrict thread_pa = NULL; \
+    thrinfo_t* restrict thread_jr = NULL; \
+    thrinfo_t* restrict thread_ir = NULL; \
+\
+    /* Identify the current thrinfo_t node and then grow the tree. */ \
+    thread_jc = thread; \
+    bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
+\
+    /* Compute the JC loop thread range for the current thread. */ \
+    dim_t jc_start, jc_end; \
+    bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
+    const dim_t n_local = jc_end - jc_start; \
+\
+    /* Compute number of primary and leftover components of the JC loop. */ \
+    /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
+    const dim_t jc_left = n_local % NC; \
+\
+    /* Loop over the n dimension (NC rows/columns at a time). */ \
+    for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
+    { \
+        /* Calculate the thread's current JC block dimension. */ \
+        const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
+\
+        ctype* restrict b_jc = b_00 + jj * jcstep_b; \
+        ctype* restrict c_jc = c_00 + jj * jcstep_c; \
+\
+        /* Identify the current thrinfo_t node and then grow the tree. */ \
+        thread_pc = bli_thrinfo_sub_node( thread_jc ); \
+        bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
+\
+        /* The PC loop range is not computed via bli_thread_range_sub(); every thread spans the full range [0,k). */ \
+        const dim_t pc_start = 0, pc_end = k; \
+        const dim_t k_local = k; \
+\
+        /* Compute number of primary and leftover components of the PC loop. */ \
+        /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
+        const dim_t pc_left = k_local % KC; \
+\
+        /* Loop over the k dimension (KC rows/columns at a time). */ \
+        for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
+        { \
+            /* Calculate the thread's current PC block dimension. */ \
+            const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
+\
+            ctype* restrict a_pc = a_00 + pp * pcstep_a; \
+            ctype* restrict b_pc = b_jc + pp * pcstep_b; \
+\
+            /* Only apply beta to the first iteration of the pc loop; later iterations use a scalar of one. */ \
+            ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
+\
+            ctype* b_use; \
+            inc_t rs_b_use, cs_b_use, ps_b_use; \
+\
+            /* Identify the current thrinfo_t node. Note that the thrinfo_t
+               node will have already been created by a previous call to
+               bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+               cause the tree to grow by two (i.e. to the next bszid that is
+               a normal bszid_t value). */ \
+            thread_pb = bli_thrinfo_sub_node( thread_pc ); \
+            /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
+\
+            /* Determine the packing buffer and related parameters for matrix
+               B. Then call the packm implementation. */ \
+            PASTECH2(bls_,ch,packm_b) \
+            ( \
+              conjb, \
+              KC,     NC, \
+              kc_cur, nc_cur, NR, \
+              &one_local, \
+              b_pc,   rs_b,      cs_b, \
+              &b_use, &rs_b_use, &cs_b_use, \
+                                 &ps_b_use, \
+              cntx, \
+              rntm, \
+              &mem_b, \
+              thread_pb \
+            ); \
+\
+            /* Alias b_use so that it's clear this is our current block of
+               matrix B. */ \
+            ctype* restrict b_pc_use = b_use; \
+\
+            /* Identify the current thrinfo_t node and then grow the tree. */ \
+            thread_ic = bli_thrinfo_sub_node( thread_pb ); \
+            bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
+\
+            /* Compute the IC loop thread range for the current thread. */ \
+            dim_t ic_start, ic_end; \
+            bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
+            const dim_t m_local = ic_end - ic_start; \
+\
+            /* Compute number of primary and leftover components of the IC loop. */ \
+            /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
+            const dim_t ic_left = m_local % MC; \
+\
+            /* Loop over the m dimension (MC rows at a time). */ \
+            for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
+            { \
+                /* Calculate the thread's current IC block dimension. */ \
+                const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
+\
+                ctype* restrict a_ic = a_pc + ii * icstep_a; \
+                ctype* restrict c_ic = c_jc + ii * icstep_c; \
+\
+                ctype* a_use; \
+                inc_t rs_a_use, cs_a_use, ps_a_use; \
+\
+                /* Identify the current thrinfo_t node. Note that the thrinfo_t
+                   node will have already been created by a previous call to
+                   bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
+                   cause the tree to grow by two (i.e. to the next bszid that is
+                   a normal bszid_t value). */ \
+                thread_pa = bli_thrinfo_sub_node( thread_ic ); \
+                /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
+\
+                /* Determine the packing buffer and related parameters for matrix
+                   A. Then call the packm implementation. */ \
+                PASTECH2(bls_,ch,packm_a) \
+                ( \
+                  conja, \
+                  MC,     KC, \
+                  mc_cur, kc_cur, MR, \
+                  &one_local, \
+                  a_ic,   rs_a,      cs_a, \
+                  &a_use, &rs_a_use, &cs_a_use, \
+                                     &ps_a_use, \
+                  cntx, \
+                  rntm, \
+                  &mem_a, \
+                  thread_pa \
+                ); \
+\
+                /* Alias a_use so that it's clear this is our current block of
+                   matrix A. */ \
+                ctype* restrict a_ic_use = a_use; \
+\
+                /* Identify the current thrinfo_t node and then grow the tree. */ \
+                thread_jr = bli_thrinfo_sub_node( thread_pa ); \
+                bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
+\
+                /* Query the number of threads and thread ids for the JR loop.
+                   NOTE: These values are only needed when computing the next
+                   micropanel of B. */ \
+                const dim_t jr_nt  = bli_thread_n_way( thread_jr ); \
+                const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
+\
+                /* Compute number of primary and leftover components of the JR loop. */ \
+                dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
+                dim_t jr_left =   nc_cur % NR; \
+\
+                /* Compute the JR loop thread range for the current thread. */ \
+                dim_t jr_start, jr_end; \
+                bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
+\
+                /* Loop over the n dimension (NR columns at a time). */ \
+                for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
+                { \
+                    const dim_t nr_cur \
+                    = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
+\
+                    ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
+                    ctype* restrict c_jr = c_ic     + j * jrstep_c; \
+\
+                    /* Assume for now that the next panel of B is the current panel
+                       of B. */ \
+                    ctype* restrict b2 = b_jr; \
+\
+                    /* Identify the current thrinfo_t node. */ \
+                    thread_ir = bli_thrinfo_sub_node( thread_jr ); \
+\
+                    /* Query the number of threads and thread ids for the IR loop.
+                       NOTE: These values are only needed when computing the next
+                       micropanel of A. */ \
+                    const dim_t ir_nt  = bli_thread_n_way( thread_ir ); \
+                    const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
+\
+                    /* Compute number of primary and leftover components of the IR loop. */ \
+                    dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
+                    dim_t ir_left =   mc_cur % MR; \
+\
+                    /* Compute the IR loop thread range for the current thread. */ \
+                    dim_t ir_start, ir_end; \
+                    bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
+\
+                    /* Loop over the m dimension (MR rows at a time). */ \
+                    for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
+                    { \
+                        const dim_t mr_cur \
+                        = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
+\
+                        ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
+                        ctype* restrict c_ir = c_jr     + i * irstep_c; \
+\
+                        ctype* restrict a2; \
+\
+                        /* Compute the addresses of the next micropanels of A and B. */ \
+                        a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
+                        if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+                        { \
+                            a2 = a_ic_use; \
+                            b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
+                            if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+                                b2 = b_pc_use; \
+                        } \
+\
+                        /* Save the addresses of next micropanels of A and B to the
+                           auxinfo_t object. */ \
+                        bli_auxinfo_set_next_a( a2, &aux ); \
+                        bli_auxinfo_set_next_b( b2, &aux ); \
+\
+                        /* Call a wrapper to the kernel (which handles edge cases). */ \
+                        PASTECH2(bls_,ch,gemm_kernel) \
+                        ( \
+                          MR, \
+                          NR, \
+                          mr_cur, \
+                          nr_cur, \
+                          kc_cur, \
+                          &alpha_local, \
+                          a_ir, rs_a_use, cs_a_use, \
+                          b_jr, rs_b_use, cs_b_use, \
+                          beta_use, \
+                          c_ir, rs_c,     cs_c, \
+                          &aux, \
+                          cntx \
+                        ); \
+                    } \
+                } \
+            } \
+\
+            /* This barrier is needed to prevent threads from starting to pack
+               the next row panel of B before the current row panel is fully
+               computed upon. */ \
+            bli_thread_barrier( thread_pb ); \
+        } \
+    } \
+\
+    /* Release any memory that was acquired for packing matrices A and B. */ \
+    PASTECH2(bls_,ch,packm_finalize_mem_a) \
+    ( \
+      rntm, \
+      &mem_a, \
+      thread_pa \
+    ); \
+    PASTECH2(bls_,ch,packm_finalize_mem_b) \
+    ( \
+      rntm, \
+      &mem_b, \
+      thread_pb \
+    ); \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
+*/ \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemm_bp_var2 )
+GENTFUNC( float,    s, gemm_bp_var2 )
+GENTFUNC( double,   d, gemm_bp_var2 )
+GENTFUNC( scomplex, c, gemm_bp_var2 )
+GENTFUNC( dcomplex, z, gemm_bp_var2 )
+
+//
+// -- gemm-like microkernel wrapper --------------------------------------------
+// Calls the native gemm microkernel directly for full MR x NR tiles and via a temporary buffer for edge tiles.
+
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(bls_,ch,varname) \
+     ( \
+       const dim_t         MR, \
+       const dim_t         NR, \
+       dim_t               mr_cur, \
+       dim_t               nr_cur, \
+       dim_t               kc_cur, \
+       ctype*     restrict alpha, \
+       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
+       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
+       ctype*     restrict beta, \
+       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       auxinfo_t* restrict aux, \
+       cntx_t*    restrict cntx \
+     ) \
+{ \
+    /* Infer the datatype from the ctype. */ \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    /* Query the context for the microkernel address and cast it to its
+       function pointer type. */ \
+    PASTECH(ch,gemm_ukr_ft) \
+               gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+    /* Temporary C buffer for edge cases. Note that the strides of this
+       temporary buffer are set according to the storage preference of the
+       microkernel (col_pref), not the storage of C. For example, if the
+       microkernel prefers columns, ct is column-stored (rs_ct = 1, cs_ct = MR). */ \
+    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
+              / sizeof( ctype ) ] \
+              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+    const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
+    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
+\
+    ctype zero = *PASTEMAC(ch,0); \
+\
+    /* Handle interior and edge cases separately. */ \
+    if ( mr_cur == MR && nr_cur == NR ) \
+    { \
+        /* Invoke the gemm microkernel. */ \
+        gemm_ukr \
+        ( \
+          kc_cur, \
+          alpha, \
+          a, \
+          b, \
+          beta, \
+          c, rs_c, cs_c, \
+          aux, \
+          cntx \
+        ); \
+    } \
+    else \
+    { \
+        /* Invoke the gemm microkernel, writing to the temporary buffer ct with a beta of zero. */ \
+        gemm_ukr \
+        ( \
+          kc_cur, \
+          alpha, \
+          a, \
+          b, \
+          &zero, \
+          ct, rs_ct, cs_ct, \
+          aux, \
+          cntx \
+        ); \
+\
+        /* Scale the mr_cur x nr_cur edge block of C by beta and add the result held in ct. */ \
+        PASTEMAC(ch,xpbys_mxn) \
+        ( \
+          mr_cur, \
+          nr_cur, \
+          ct, rs_ct, cs_ct, \
+          beta, \
+          c, rs_c, cs_c \
+        ); \
+    } \
+}
+
+//INSERT_GENTFUNC_BASIC0( gemm_kernel )
+GENTFUNC( float,    s, gemm_kernel )
+GENTFUNC( double,   d, gemm_kernel )
+GENTFUNC( scomplex, c, gemm_kernel )
+GENTFUNC( dcomplex, z, gemm_kernel )
+
diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h
new file mode 100644
index 0000000000..025b54a06f
--- /dev/null
+++ b/sandbox/gemmlike/bls_gemm_var.h
@@ -0,0 +1,124 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype the object-based variant interfaces. +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTECH(bls_,opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ); + +GENPROT( gemm_bp_var1 ) +GENPROT( gemm_bp_var2 ) + + +// +// Prototype the typed variant interfaces. 
+// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( gemm_bp_var1 ) +GENTPROT( float, s, gemm_bp_var1 ) +GENTPROT( double, d, gemm_bp_var1 ) +GENTPROT( scomplex, c, gemm_bp_var1 ) +GENTPROT( dcomplex, z, gemm_bp_var1 ) + +//INSERT_GENTPROT_BASIC0( gemm_bp_var2 ) +GENTPROT( float, s, gemm_bp_var2 ) +GENTPROT( double, d, gemm_bp_var2 ) +GENTPROT( scomplex, c, gemm_bp_var2 ) +GENTPROT( dcomplex, z, gemm_bp_var2 ) + + +// +// Prototype the typed kernel interfaces. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ); + +//INSERT_GENTPROT_BASIC0( gemm_kernel ) +GENTPROT( float, s, gemm_kernel ) +GENTPROT( double, d, gemm_kernel ) +GENTPROT( scomplex, c, gemm_kernel ) +GENTPROT( dcomplex, z, gemm_kernel ) + diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c new file mode 100644 index 0000000000..c55a19c7b7 --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -0,0 +1,328 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + /* Set the pack buffer type so that we are obtaining memory blocks from + the pool dedicated to blocks of A. 
*/ \ + const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \ +\ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ + const dim_t k_pack = k; \ +\ + /* Barrier to make sure all threads are caught up and ready to begin the + packm stage. */ \ + bli_thread_barrier( thread ); \ +\ + /* Compute the size of the memory block eneded. */ \ + siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ +\ + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the memory broker. */ \ + if ( bli_mem_is_unalloc( mem ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Acquire directly to the chief thread's mem_t that was passed in. + It needs to be that mem_t struct, and not a local (temporary) + mem_t, since there is no barrier until after packing is finished, + which could allow a race condition whereby the chief thread exits + the current function before the other threads have a chance to + copy from it. (A barrier would fix that race condition, but then + again, I prefer to keep barriers to a minimum.) */ \ + bli_membrk_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t to all + threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) 
*/ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else /* if ( bli_mem_is_alloc( mem ) ) */ \ + { \ + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the memory + broker and cached by the caller. */ \ +\ + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ \ + siz_t mem_size = bli_mem_size( mem ); \ +\ + if ( mem_size < size_needed ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See coment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t. */ \ + bli_membrk_release \ + ( \ + rntm, \ + mem \ + ); \ + bli_membrk_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else \ + { \ + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. 
*/ \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_mem_a ) +GENTFUNC( float, s, packm_init_mem_a ) +GENTFUNC( double, d, packm_init_mem_a ) +GENTFUNC( scomplex, c, packm_init_mem_a ) +GENTFUNC( dcomplex, z, packm_init_mem_a ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + if ( thread != NULL ) \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. */ \ + if ( bli_mem_is_alloc( mem ) ) \ + { \ + bli_membrk_release \ + ( \ + rntm, \ + mem \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a ) +GENTFUNC( float, s, packm_finalize_mem_a ) +GENTFUNC( double, d, packm_finalize_mem_a ) +GENTFUNC( scomplex, c, packm_finalize_mem_a ) +GENTFUNC( dcomplex, z, packm_finalize_mem_a ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* restrict m_max, \ + dim_t* restrict k_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ) \ +{ \ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ + *k_max = k; \ +\ + /* Determine the dimensions and strides for the packed matrix A. */ \ + { \ + /* Pack A to column-stored row-panels. */ \ + *rs_p = 1; \ + *cs_p = mr; \ +\ + *pd_p = mr; \ + *ps_p = mr * k; \ +\ + /* Set the schema to "packed row panels" to indicate packing to + conventional column-stored row panels. 
*/ \ + *schema = BLIS_PACKED_ROW_PANELS; \ + } \ +\ + /* Set the buffer address provided by the caller to point to the memory + associated with the mem_t entry acquired from the memory pool. */ \ + *p = bli_mem_buffer( mem ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_a ) +GENTFUNC( float, s, packm_init_a ) +GENTFUNC( double, d, packm_init_a ) +GENTFUNC( scomplex, c, packm_init_a ) +GENTFUNC( dcomplex, z, packm_init_a ) + + +// +// Define BLAS-like interfaces to the variant chooser. +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + pack_t schema; \ + dim_t m_max; \ + dim_t k_max; \ + dim_t pd_p; \ +\ + /* Prepare the packing destination buffer. */ \ + PASTECH2(bls_,ch,packm_init_mem_a) \ + ( \ + m_alloc, k_alloc, mr, \ + cntx, \ + rntm, \ + mem, \ + thread \ + ); \ +\ + /* Determine the packing buffer and related parameters for matrix A. */ \ + PASTECH2(bls_,ch,packm_init_a) \ + ( \ + &schema, \ + m, k, mr, \ + &m_max, &k_max, \ + p, rs_p, cs_p, \ + &pd_p, ps_p, \ + mem \ + ); \ +\ + /* Pack matrix A to the destination buffer chosen above. Here, the packed + matrix is stored to column-stored MR x k micropanels. */ \ + PASTECH2(bls_,ch,packm_var1) \ + ( \ + conj, \ + schema, \ + m, \ + k, \ + m_max, \ + k_max, \ + kappa, \ + a, rs_a, cs_a, \ + *p, *rs_p, *cs_p, \ + pd_p, *ps_p, \ + cntx, \ + thread \ + ); \ +\ + /* Barrier so that packing is done before computation. 
*/ \ + bli_thread_barrier( thread ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_a ) +GENTFUNC( float, s, packm_a ) +GENTFUNC( double, d, packm_a ) +GENTFUNC( scomplex, c, packm_a ) +GENTFUNC( dcomplex, z, packm_a ) + diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h new file mode 100644 index 0000000000..201a24efae --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_a.h @@ -0,0 +1,122 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_mem_a ) +GENTPROT( float, s, packm_init_mem_a ) +GENTPROT( double, d, packm_init_mem_a ) +GENTPROT( scomplex, c, packm_init_mem_a ) +GENTPROT( dcomplex, z, packm_init_mem_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a ) +GENTPROT( float, s, packm_finalize_mem_a ) +GENTPROT( double, d, packm_finalize_mem_a ) +GENTPROT( scomplex, c, packm_finalize_mem_a ) +GENTPROT( dcomplex, z, packm_finalize_mem_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* restrict m_max, \ + dim_t* restrict k_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_a ) +GENTPROT( float, s, packm_init_a ) +GENTPROT( double, d, packm_init_a ) +GENTPROT( scomplex, c, packm_init_a ) +GENTPROT( dcomplex, z, packm_init_a ) + 
+ +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_a ) +GENTPROT( float, s, packm_a ) +GENTPROT( double, d, packm_a ) +GENTPROT( scomplex, c, packm_a ) +GENTPROT( dcomplex, z, packm_a ) + diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c new file mode 100644 index 0000000000..cae93df012 --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -0,0 +1,328 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + /* Set the pack buffer type so that we are obtaining memory blocks from + the pool dedicated to panels of B. */ \ + const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \ +\ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + const dim_t k_pack = k; \ + const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ +\ + /* Barrier to make sure all threads are caught up and ready to begin the + packm stage. */ \ + bli_thread_barrier( thread ); \ +\ + /* Compute the size of the memory block eneded. */ \ + siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ +\ + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the memory broker. */ \ + if ( bli_mem_is_unalloc( mem ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Acquire directly to the chief thread's mem_t that was passed in. 
+ It needs to be that mem_t struct, and not a local (temporary) + mem_t, since there is no barrier until after packing is finished, + which could allow a race condition whereby the chief thread exits + the current function before the other threads have a chance to + copy from it. (A barrier would fix that race condition, but then + again, I prefer to keep barriers to a minimum.) */ \ + bli_membrk_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t to all + threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else /* if ( bli_mem_is_alloc( mem ) ) */ \ + { \ + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the memory + broker and cached by the caller. */ \ +\ + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ \ + siz_t mem_size = bli_mem_size( mem ); \ +\ + if ( mem_size < size_needed ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See coment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t. 
*/ \ + bli_membrk_release \ + ( \ + rntm, \ + mem \ + ); \ + bli_membrk_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else \ + { \ + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. */ \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_mem_b ) +GENTFUNC( float, s, packm_init_mem_b ) +GENTFUNC( double, d, packm_init_mem_b ) +GENTFUNC( scomplex, c, packm_init_mem_b ) +GENTFUNC( dcomplex, z, packm_init_mem_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + if ( thread != NULL ) \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. 
*/ \ + if ( bli_mem_is_alloc( mem ) ) \ + { \ + bli_membrk_release \ + ( \ + rntm, \ + mem \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b ) +GENTFUNC( float, s, packm_finalize_mem_b ) +GENTFUNC( double, d, packm_finalize_mem_b ) +GENTFUNC( scomplex, c, packm_finalize_mem_b ) +GENTFUNC( dcomplex, z, packm_finalize_mem_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* restrict k_max, \ + dim_t* restrict n_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ) \ +{ \ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + *k_max = k; \ + *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ +\ + /* Determine the dimensions and strides for the packed matrix B. */ \ + { \ + /* Pack B to row-stored column-panels. */ \ + *rs_p = nr; \ + *cs_p = 1; \ +\ + *pd_p = nr; \ + *ps_p = k * nr; \ +\ + /* Set the schema to "packed column panels" to indicate packing to + conventional row-stored column panels. */ \ + *schema = BLIS_PACKED_COL_PANELS; \ + } \ +\ + /* Set the buffer address provided by the caller to point to the memory + associated with the mem_t entry acquired from the memory pool. */ \ + *p = bli_mem_buffer( mem ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_b ) +GENTFUNC( float, s, packm_init_b ) +GENTFUNC( double, d, packm_init_b ) +GENTFUNC( scomplex, c, packm_init_b ) +GENTFUNC( dcomplex, z, packm_init_b ) + + +// +// Define BLAS-like interfaces to the variant chooser. 
+// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* restrict kappa, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + pack_t schema; \ + dim_t k_max; \ + dim_t n_max; \ + dim_t pd_p; \ +\ + /* Prepare the packing destination buffer. */ \ + PASTECH2(bls_,ch,packm_init_mem_b) \ + ( \ + k_alloc, n_alloc, nr, \ + cntx, \ + rntm, \ + mem, \ + thread \ + ); \ +\ + /* Determine the packing buffer and related parameters for matrix B. */ \ + PASTECH2(bls_,ch,packm_init_b) \ + ( \ + &schema, \ + k, n, nr, \ + &k_max, &n_max, \ + p, rs_p, cs_p, \ + &pd_p, ps_p, \ + mem \ + ); \ +\ + /* Pack matrix B to the destination buffer chosen above. Here, the packed + matrix is stored to row-stored k x NR micropanels. */ \ + PASTECH2(bls_,ch,packm_var1) \ + ( \ + conj, \ + schema, \ + k, \ + n, \ + k_max, \ + n_max, \ + kappa, \ + b, rs_b, cs_b, \ + *p, *rs_p, *cs_p, \ + pd_p, *ps_p, \ + cntx, \ + thread \ + ); \ +\ + /* Barrier so that packing is done before computation. */ \ + bli_thread_barrier( thread ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_b ) +GENTFUNC( float, s, packm_b ) +GENTFUNC( double, d, packm_b ) +GENTFUNC( scomplex, c, packm_b ) +GENTFUNC( dcomplex, z, packm_b ) + diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h new file mode 100644 index 0000000000..728d21aed5 --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_b.h @@ -0,0 +1,122 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_mem_b ) +GENTPROT( float, s, packm_init_mem_b ) +GENTPROT( double, d, packm_init_mem_b ) +GENTPROT( scomplex, c, packm_init_mem_b ) +GENTPROT( dcomplex, z, packm_init_mem_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b ) +GENTPROT( float, s, packm_finalize_mem_b ) +GENTPROT( double, d, packm_finalize_mem_b ) +GENTPROT( scomplex, c, packm_finalize_mem_b ) +GENTPROT( dcomplex, z, packm_finalize_mem_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* restrict k_max, \ + dim_t* restrict n_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_b ) +GENTPROT( float, s, packm_init_b ) +GENTPROT( double, d, packm_init_b ) +GENTPROT( scomplex, c, packm_init_b ) +GENTPROT( dcomplex, z, packm_init_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* restrict kappa, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_b ) 
+GENTPROT( float, s, packm_b ) +GENTPROT( double, d, packm_b ) +GENTPROT( scomplex, c, packm_b ) +GENTPROT( dcomplex, z, packm_b ) + diff --git a/sandbox/gemmlike/bls_l3_packm_var.c b/sandbox/gemmlike/bls_l3_packm_var.c new file mode 100644 index 0000000000..8a4c1d0206 --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_var.c @@ -0,0 +1,198 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// +// Define BLAS-like interfaces to the variants. +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len_full; \ + dim_t panel_len_i; \ + dim_t panel_len_max; \ + dim_t panel_len_max_i; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + inc_t vs_c; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len_full = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + vs_c = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels.
*/ \ + iter_dim = m; \ + panel_len_full = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + vs_c = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_begin = c_cast + (ic )*vs_c; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + panel_len_i = panel_len_full; \ + panel_len_max_i = panel_len_max; \ +\ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.) 
*/ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + PASTEMAC(ch,packm_cxk) \ + ( \ + conjc, \ + schema, \ + panel_dim_i, \ + panel_dim_max, \ + panel_len_i, \ + panel_len_max_i, \ + kappa_cast, \ + c_use, vs_c, ldc, \ + p_use, ldp, \ + cntx \ + ); \ + } \ +\ + p_begin += ps_p; \ +\ +/* +if ( row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var1 ) +GENTFUNC( float, s, packm_var1 ) +GENTFUNC( double, d, packm_var1 ) +GENTFUNC( scomplex, c, packm_var1 ) +GENTFUNC( dcomplex, z, packm_var1 ) + diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h new file mode 100644 index 0000000000..0e8eb9ee8a --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_var.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// Prototype BLAS-like interfaces to the variants. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( packm_var1 ) +GENTPROT( float, s, packm_var1 ) +GENTPROT( double, d, packm_var1 ) +GENTPROT( scomplex, c, packm_var1 ) +GENTPROT( dcomplex, z, packm_var1 ) + diff --git a/sandbox/gemmlike/thread/bls_l3_decor.h b/sandbox/gemmlike/thread/bls_l3_decor.h new file mode 100644 index 0000000000..bb8a95bb46 --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor.h @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_H +#define BLIS_SBX_L3_DECOR_H + +// -- sup definitions ---------------------------------------------------------- + +// Level-3 sup internal function type. +typedef void (*l3sbxint_t) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Level-3 sup thread decorator prototype. 
+void bls_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// Include definitions specific to the method of multithreading. +#include "bls_l3_decor_single.h" +#include "bls_l3_decor_openmp.h" +#include "bls_l3_decor_pthreads.h" + +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c new file mode 100644 index 0000000000..851a29e52b --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c @@ -0,0 +1,138 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_OPENMP + +// Define a dummy thread entry function, which is needed in the pthreads +// version, so that when building Windows DLLs (with OpenMP enabled or with +// no multithreading) we don't risk having an unresolved symbol. +void* bls_l3_thread_entry( void* data_void ) { return NULL; } + +//#define PRINT_THRINFO + +void bls_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // Query the total number of threads from the rntm_t object. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. 
+ bli_membrk_rntm_set_membrk( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + + _Pragma( "omp parallel num_threads(n_threads)" ) + { + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); + + // Check for a somewhat obscure OpenMP thread-mismatch issue. + // NOTE: This calls the same function used for the conventional/large + // code path. + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion.
+ bli_sba_checkin_array( array ); +} + +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h new file mode 100644 index 0000000000..9c956d7c36 --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SBX_L3_DECOR_OPENMP_H +#define BLIS_SBX_L3_DECOR_OPENMP_H + +// Definitions specific to situations when OpenMP multithreading is enabled. +#ifdef BLIS_ENABLE_OPENMP + +#endif + +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c new file mode 100644 index 0000000000..f87d79fd6c --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c @@ -0,0 +1,213 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS + +// A data structure to assist in passing operands to additional threads. +typedef struct thread_data +{ + l3sbxint_t func; + opid_t family; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; +} thread_data_t; + +// Entry point function for additional threads. +void* bls_l3_thread_entry( void* data_void ) +{ + thread_data_t* data = data_void; + + l3sbxint_t func = data->func; + opid_t family = data->family; + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; + + ( void )family; + + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. 
+ bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + + return NULL; +} + +void bls_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // Query the total number of threads from the rntm_t object. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_membrk_rntm_set_membrk( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions.
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) + { + // Set up thread data for additional threads (beyond thread 0). + datas[tid].func = func; + datas[tid].family = family; + datas[tid].alpha = alpha; + datas[tid].a = a; + datas[tid].b = b; + datas[tid].beta = beta; + datas[tid].c = c; + datas[tid].cntx = cntx; + datas[tid].rntm = rntm; + datas[tid].tid = tid; + datas[tid].gl_comm = gl_comm; + datas[tid].array = array; + + // Spawn additional threads for ids greater than 0. + if ( tid != 0 ) + bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] ); + else + bls_l3_thread_entry( ( void* )(&datas[0]) ); + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). + + // Thread 0 waits for additional threads to finish. + for ( dim_t tid = 1; tid < n_threads; tid++ ) + { + bli_pthread_join( pthreads[tid], NULL ); + } + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion.
+ bli_sba_checkin_array( array ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( pthreads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( datas ); +} + +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h new file mode 100644 index 0000000000..ef5c3bad45 --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H +#define BLIS_SBX_L3_DECOR_PTHREADS_H + +// Definitions specific to situations when POSIX multithreading is enabled. +#ifdef BLIS_ENABLE_PTHREADS + +// Thread entry point prototype. +void* bls_l3_thread_entry( void* data_void ); + +#endif + +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c new file mode 100644 index 0000000000..7d9017dcd5 --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.c @@ -0,0 +1,141 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifndef BLIS_ENABLE_MULTITHREADING + +#define SKIP_THRINFO_TREE + +void bls_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // For sequential execution, we use only one thread. + const dim_t n_threads = 1; + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. + bli_membrk_rntm_set_membrk( rntm ); + +#ifndef SKIP_THRINFO_TREE + // Allocate a global communicator for the root thrinfo_t structures.
+ thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); +#endif + + + { + // NOTE: We don't need to create another copy of the rntm_t since + // it was already copied in one of the high-level oapi functions. + rntm_t* restrict rntm_p = rntm; + + // There is only one thread id (for the chief thread). + const dim_t tid = 0; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + // NOTE: This is commented out because, in the single-threaded case, + // this is redundant since it's already been done above. + //bli_sba_rntm_set_pool( tid, array, rntm_p ); + +#ifndef SKIP_THRINFO_TREE + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); +#else + // This optimization allows us to use one of the global thrinfo_t + // objects for single-threaded execution rather than grow one from + // scratch. The key is that bli_thrinfo_sup_grow(), which is called + // from within the variants, will immediately return if it detects + // that the thrinfo_t* passed into it is either + // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. + thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; + + ( void )tid; +#endif + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + +#ifndef SKIP_THRINFO_TREE + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); +#endif + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion.
+ bli_sba_checkin_array( array ); +} + +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.h b/sandbox/gemmlike/thread/bls_l3_decor_single.h new file mode 100644 index 0000000000..211a43a894 --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SBX_L3_DECOR_SINGLE_H +#define BLIS_SBX_L3_DECOR_SINGLE_H + +// Definitions specific to situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +#endif + +#endif + diff --git a/sandbox/power10/bli_gemmnat.c b/sandbox/power10/bli_gemmnat.c index b2dabd29aa..846ccd35a8 100644 --- a/sandbox/power10/bli_gemmnat.c +++ b/sandbox/power10/bli_gemmnat.c @@ -32,7 +32,14 @@ */ -// This file is needed for the BLIS build system. +// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the +// entry point to any sandbox implementation. + +// NOTE: This function is implemented identically to the function that it +// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are +// forgoing the option of customizing the implementations that underlie +// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox +// directory, however, will be included in the BLIS. #include "blis.h" From 7fabd896af773623ed01820a71bbff432e8a7d25 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 16:28:03 +0900 Subject: [PATCH 021/226] Asm Flag Mingling for Darwin_Aarch64 Apple+Arm64 requires additional "tagging" of local symbols. --- kernels/armv8a/3/armv8a_asm_utils.h | 49 ++++++++ kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 125 ++++++++++---------- 2 files changed, 112 insertions(+), 62 deletions(-) create mode 100644 kernels/armv8a/3/armv8a_asm_utils.h diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h new file mode 100644 index 0000000000..7bf97d555c --- /dev/null +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +// Apple's local label requirements. +#if defined(__APPLE__) +#define LABEL(str) " L" #str": \n\t" +#define BEQ(str) "b.eq L" #str" \n\t" +#define BNE(str) "b.ne L" #str" \n\t" +#define BRANCH(str) "b L" #str" \n\t" +#else +#define LABEL(str) " ." #str": \n\t" +#define BEQ(str) "b.eq ." #str" \n\t" +#define BNE(str) "b.ne ." #str" \n\t" +#define BRANCH(str) "b ." 
#str" \n\t" +#endif + diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index c01c67f5a0..251931f7c5 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -34,6 +34,7 @@ */ #include "blis.h" +#include "armv8a_asm_utils.h" /* o 4x4 Single precision micro-kernel fully functional. @@ -155,7 +156,7 @@ __asm__ volatile " dup v31.4s, wzr \n\t" // Vector for accummulating column 11 " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. -" beq .SCONSIDERKLEFT \n\t" +BEQ(SCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" " ldr q1, [x0, #16] \n\t" // Load a @@ -168,9 +169,9 @@ __asm__ volatile " add x1, x1, #48 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -" beq .SLASTITER \n\t" // (as loop is do-while-like). +BEQ(SLASTITER) // (as loop is do-while-like). " \n\t" -" .SLOOPKITER: \n\t" // Body of the k_iter loop. +LABEL(SLOOPKITER) // Body of the k_iter loop. " \n\t" " ldr q5, [x0] \n\t" " fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. @@ -316,9 +317,9 @@ __asm__ volatile " \n\t" //End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -" bne .SLOOPKITER \n\t" +BNE(SLOOPKITER) " \n\t" -" .SLASTITER: \n\t" // Last iteration of k_iter loop. +LABEL(SLASTITER) // Last iteration of k_iter loop. " \n\t" " \n\t" " ldr q5, [x0] \n\t" @@ -454,11 +455,11 @@ __asm__ volatile " add x0, x0, #96 \n\t" " \n\t" //End It 4 " \n\t" -" .SCONSIDERKLEFT: \n\t" +LABEL(SCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. -" beq .SPOSTACCUM \n\t" // else, we enter the k_left loop. +BEQ(SPOSTACCUM) // else, we enter the k_left loop. 
" \n\t" -" .SLOOPKLEFT: \n\t" // Body of the left iterations +LABEL(SLOOPKLEFT) // Body of the left iterations " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a @@ -497,17 +498,17 @@ __asm__ volatile " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " cmp x6,0 \n\t" // Iterate again. -" bne .SLOOPKLEFT \n\t" // if i!=0. +BNE(SLOOPKLEFT) // if i!=0. " \n\t" -" .SPOSTACCUM: \n\t" +LABEL(SPOSTACCUM) " \n\t" " ld1r {v6.4s},[x7] \n\t" // Load alpha. " ld1r {v7.4s},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) -" bne .SGENSTORED \n\t" +BNE(SGENSTORED) " \n\t" -" .SCOLSTORED: \n\t" // C is column-major. +LABEL(SCOLSTORED) // C is column-major. " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -517,7 +518,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" @@ -533,7 +534,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS1: \n\t" +LABEL(SBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha @@ -557,7 +558,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x18] \n\t" //Load column 3 of C " ldr q9, [x18, #16] \n\t" @@ -573,7 +574,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS2: \n\t" +LABEL(SBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha @@ -597,7 +598,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x21] \n\t" //Load column 6 of C " ldr q1, [x21, #16] \n\t" @@ -613,7 +614,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS3: \n\t" +LABEL(SBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha @@ -637,7 +638,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x24] \n\t" //Load column 9 of C " ldr q9, [x24, #16] \n\t" @@ -653,7 +654,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS4: \n\t" +LABEL(SBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -673,10 +674,10 @@ __asm__ volatile " str q13, [x26, #16] \n\t" " \n\t" " \n\t" -" b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +BRANCH(SEND) // Done. " \n\t" " \n\t" -" .SGENSTORED: \n\t" // C is general-stride stored. +LABEL(SGENSTORED) // C is general-stride stored. 
" \n\t" " \n\t" " dup v0.4s, wzr \n\t" @@ -687,7 +688,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" " mov x27, x2 \n\t" " \n\t" @@ -729,7 +730,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS1: \n\t" +LABEL(SBETAZEROGENSTOREDS1) " \n\t" " fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha @@ -779,7 +780,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. " \n\t" " mov x27, x18 \n\t" " \n\t" @@ -821,7 +822,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS2: \n\t" +LABEL(SBETAZEROGENSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha @@ -871,7 +872,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" " mov x27, x21 \n\t" " \n\t" @@ -913,7 +914,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS3: \n\t" +LABEL(SBETAZEROGENSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha @@ -963,7 +964,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" " mov x27, x24 \n\t" " \n\t" @@ -1005,7 +1006,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS4: \n\t" +LABEL(SBETAZEROGENSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -1050,7 +1051,7 @@ __asm__ volatile " st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. " st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c. " \n\t" -" .SEND: \n\t" // Done! +LABEL(SEND) // Done! " \n\t" :// output operands (none) :// input operands @@ -1203,7 +1204,7 @@ __asm__ volatile " \n\t" " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. -" beq .DCONSIDERKLEFT \n\t" +BEQ(DCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" // Load a " ldr q1, [x0, #16] \n\t" @@ -1218,9 +1219,9 @@ __asm__ volatile " add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -" beq .DLASTITER \n\t" // (as loop is do-while-like). +BEQ(DLASTITER) // (as loop is do-while-like). " \n\t" -" DLOOP: \n\t" // Body +LABEL(DLOOP) // Body " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 @@ -1394,9 +1395,9 @@ __asm__ volatile " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -" bne DLOOP \n\t" +BNE(DLOOP) " \n\t" -".DLASTITER: \n\t" +LABEL(DLASTITER) " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate @@ -1554,11 +1555,11 @@ __asm__ volatile " \n\t" //End it 4 " add x0, x0, #144 \n\t" " \n\t" -" .DCONSIDERKLEFT: \n\t" +LABEL(DCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. -" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +BEQ(DPOSTACCUM) // else, we enter the k_left loop. 
" \n\t" -".DLOOPKLEFT: \n\t" +LABEL(DLOOPKLEFT) " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a @@ -1605,17 +1606,17 @@ __asm__ volatile " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " \n\t" " cmp x6,0 \n\t" // Iterate again. -" bne .DLOOPKLEFT \n\t" // if i!=0. +BNE(DLOOPKLEFT) // if i!=0. " \n\t" -" .DPOSTACCUM: \n\t" +LABEL(DPOSTACCUM) " \n\t" " ld1r {v6.2d},[x7] \n\t" // Load alpha. " ld1r {v7.2d},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) -" bne .DGENSTORED \n\t" +BNE(DGENSTORED) " \n\t" -" .DCOLSTORED: \n\t" // C is column-major. +LABEL(DCOLSTORED) // C is column-major. " \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" @@ -1625,7 +1626,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" @@ -1642,7 +1643,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS1: \n\t" +LABEL(DBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha @@ -1667,7 +1668,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x21] \n\t" //Load column 2 of C " ldr q9, [x21, #16] \n\t" @@ -1684,7 +1685,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS2: \n\t" +LABEL(DBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha @@ -1709,7 +1710,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x23] \n\t" //Load column 4 of C " ldr q1, [x23, #16] \n\t" @@ -1726,7 +1727,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS3: \n\t" +LABEL(DBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha @@ -1751,7 +1752,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x25] \n\t" //Load column 6 of C " ldr q9, [x25, #16] \n\t" @@ -1768,7 +1769,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS4: \n\t" +LABEL(DBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -1788,9 +1789,9 @@ __asm__ volatile " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" -" b .DEND \n\t" +BRANCH(DEND) " \n\t" -" .DGENSTORED: \n\t" // C is general-stride stored. +LABEL(DGENSTORED) // C is general-stride stored. 
" \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" @@ -1800,7 +1801,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" " mov x27, x2 \n\t" " \n\t" // Load address of C. @@ -1827,7 +1828,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS1: \n\t" +LABEL(DBETAZEROGENSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha @@ -1862,7 +1863,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. " \n\t" " mov x27, x21 \n\t" // Load address of C. " \n\t" @@ -1889,7 +1890,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS2: \n\t" +LABEL(DBETAZEROGENSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha @@ -1924,7 +1925,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" " mov x27, x23 \n\t" // Load address of C. 
" \n\t" @@ -1951,7 +1952,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS3: \n\t" +LABEL(DBETAZEROGENSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha @@ -1986,7 +1987,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" " mov x27, x25 \n\t" " \n\t" @@ -2013,7 +2014,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS4: \n\t" +LABEL(DBETAZEROGENSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -2043,7 +2044,7 @@ __asm__ volatile " st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. " st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. " \n\t" -" .DEND: \n\t" // Done! +LABEL(DEND) // Done! " \n\t" :// output operands (none) :// input operands From 916e1fa8be3cea0e3e2a4a7e8b00027ac2ee7780 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 16:46:52 +0900 Subject: [PATCH 022/226] Armv8A Rename Regs for Clang Compile: FP64 Part - x7, x8: Used to store address for Alpha and Beta. As Alpha & Beta was not used in k-loops, use x0, x1 to load Alpha & Beta's addresses after k-loops are completed, since A & B's addresses are no longer needed there. This "ldr [addr]; -> ldr val, [addr]" would not cause much performance drawback since it is done outside k-loops and there are plenty of instructions between Alpha & Beta's loading and usage. - x9: Used to store cs_c. x9 is multiplied by 8 into x10 and not used any longer. Directly loading cs_c and into x10 and scale by 8 spares x9 straightforwardly. - x11, x12: Not used at all. 
Simply remove from clobber list. - x13: Like x9, loaded and scaled by 8 into x14, except that x13 is also used in a conditional branch so that "cmp x13, #1" needs to be modified into "cmp x14, #8" to completely free x13. - x3, x4: Used to store next_a & next_b. Untouched in k-loops. Load these addresses into x0 and x1 after Alpha & Beta are both loaded, since then neither address of A/B nor address of Alpha/Beta is needed. --- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 44 ++++++++++----------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 251931f7c5..279b61b796 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -1135,20 +1135,14 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" -" ldr x3,%[a_next] \n\t" // Move pointer -" ldr x4,%[b_next] \n\t" // Move pointer -" \n\t" " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) " ldr x6,%[k_left] \n\t" // Init guard (k_iter) " \n\t" -" ldr x7,%[alpha] \n\t" // Alpha address -" ldr x8,%[beta] \n\t" // Beta address -" \n\t" -" ldr x9,%[cs_c] \n\t" // Load cs_c -" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) +" ldr x10,%[cs_c] \n\t" // Load cs_c +" lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) " \n\t" -" ldr x13,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). +" ldr x14,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). " \n\t" " add x20,x2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C @@ -1610,10 +1604,16 @@ BNE(DLOOPKLEFT) // if i!=0. " \n\t" LABEL(DPOSTACCUM) " \n\t" -" ld1r {v6.2d},[x7] \n\t" // Load alpha. -" ld1r {v7.2d},[x8] \n\t" // Load beta +" ldr x0,%[alpha] \n\t" // Alpha address +" ldr x1,%[beta] \n\t" // Beta address +" \n\t" +" ld1r {v6.2d},[x0] \n\t" // Load alpha. 
+" ld1r {v7.2d},[x1] \n\t" // Load beta " \n\t" -" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" ldr x0,%[a_next] \n\t" // Next A address for later use. +" ldr x1,%[b_next] \n\t" // Next B address for later use. +" \n\t" +" cmp x14,#8 \n\t" // If rs_c != 1 (column-major) BNE(DGENSTORED) " \n\t" LABEL(DCOLSTORED) // C is column-major. @@ -1771,8 +1771,8 @@ BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(DBETAZEROCOLSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha @@ -2016,8 +2016,8 @@ BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(DBETAZEROGENSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha @@ -2060,12 +2060,10 @@ LABEL(DEND) // Done! [a_next] "m" (a_next), // 8 [b_next] "m" (b_next) // 9 :// Register clobber list - "x0","x1","x2","x3", - "x4","x5","x6", - "x7","x8","x9", - "x10","x11","x12","x13","x14","x16","x17", - "x20","x21","x22","x23","x24","x25","x26", - "x27", + "x0","x1","x2", + "x5","x6","x10", + "x14","x16","x17", + "x20","x21","x22","x23","x24","x25","x26","x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", From 9f4a4a3cfb2244e4024445e127dafd2a11f39fc5 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 17:21:28 +0900 Subject: [PATCH 023/226] Armv8A Rename Regs for Clang Compile: FP32 Part Roughly the same as 916e1fa , additionally with x15 clobbering removed. - x15: Not used at all. Compilation w/ Clang shows warning about x18 reservation, but compilation itself is OK and all tests got passed. 
--- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 41 ++++++++++----------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 279b61b796..be5e20ae78 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -82,20 +82,14 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B. " ldr x2,%[caddr] \n\t" // Load address of C. " \n\t" -" ldr x3,%[a_next] \n\t" // Pointer to next block of A. -" ldr x4,%[b_next] \n\t" // Pointer to next pointer of B. -" \n\t" " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). " \n\t" -" ldr x7,%[alpha] \n\t" // Alpha address. -" ldr x8,%[beta] \n\t" // Beta address. -" \n\t" -" ldr x9,%[cs_c] \n\t" // Load cs_c. -" lsl x10,x9,#2 \n\t" // cs_c * sizeof(float) -- AUX. +" ldr x10,%[cs_c] \n\t" // Load cs_c. +" lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. " \n\t" -" ldr x13,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x13,#2 \n\t" // rs_c * sizeof(float). +" ldr x14,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C @@ -502,10 +496,16 @@ BNE(SLOOPKLEFT) // if i!=0. " \n\t" LABEL(SPOSTACCUM) " \n\t" -" ld1r {v6.4s},[x7] \n\t" // Load alpha. -" ld1r {v7.4s},[x8] \n\t" // Load beta +" ldr x0,%[alpha] \n\t" // Alpha address. +" ldr x1,%[beta] \n\t" // Beta address. +" \n\t" +" ld1r {v6.4s},[x0] \n\t" // Load alpha. +" ld1r {v7.4s},[x1] \n\t" // Load beta +" \n\t" +" ldr x0,%[a_next] \n\t" // Pointer to next block of A. +" ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. " \n\t" -" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" cmp x14,#4 \n\t" // If rs_c != 1 (column-major) BNE(SGENSTORED) " \n\t" LABEL(SCOLSTORED) // C is column-major. 
@@ -656,8 +656,8 @@ BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(SBETAZEROCOLSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha @@ -1008,8 +1008,8 @@ BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(SBETAZEROGENSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha @@ -1067,10 +1067,9 @@ LABEL(SEND) // Done! [a_next] "m" (a_next), // 9 [b_next] "m" (b_next) // 10 :// Register clobber list - "x0", "x1", "x2","x3","x4", - "x5", "x6", "x7", "x8", - "x9", "x10","x11","x12", - "x13","x14","x15", + "x0", "x1", "x2", + "x5", "x6", "x10", + "x14", "x16","x17","x18","x19", "x20","x21","x22","x23", "x24","x25","x26","x27", From 5fc93e280614b4a21a9cff36cf873b4b9407285b Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 18:44:47 +0900 Subject: [PATCH 024/226] Armv8A Rename Regs for Safe Darwin Compile Avoid x18 use in FP32 kernel: - C address lines x[18-26] renamed to x[19-27] (reg index +1) - Original role of x27 fulfilled by x5 which is free after k-loop part. FP64 does not require changing since x18 is not used there. 
--- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 613 ++++++++++---------- 1 file changed, 306 insertions(+), 307 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index be5e20ae78..dfdda863b1 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -93,20 +93,19 @@ __asm__ volatile " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C -" add x18,x17,x10 \n\t" //Load address Column 3 of C -" add x19,x18,x10 \n\t" //Load address Column 4 of C -" add x20,x19,x10 \n\t" //Load address Column 5 of C -" add x21,x20,x10 \n\t" //Load address Column 6 of C -" add x22,x21,x10 \n\t" //Load address Column 7 of C -" add x23,x22,x10 \n\t" //Load address Column 8 of C -" add x24,x23,x10 \n\t" //Load address Column 9 of C -" add x25,x24,x10 \n\t" //Load address Column 10 of C -" add x26,x25,x10 \n\t" //Load address Column 11 of C +" add x19,x17,x10 \n\t" //Load address Column 3 of C +" add x20,x19,x10 \n\t" //Load address Column 4 of C +" add x21,x20,x10 \n\t" //Load address Column 5 of C +" add x22,x21,x10 \n\t" //Load address Column 6 of C +" add x23,x22,x10 \n\t" //Load address Column 7 of C +" add x24,x23,x10 \n\t" //Load address Column 8 of C +" add x25,x24,x10 \n\t" //Load address Column 9 of C +" add x26,x25,x10 \n\t" //Load address Column 10 of C +" add x27,x26,x10 \n\t" //Load address Column 11 of C " \n\t" " prfm pldl1keep,[x2] \n\t" // Prefetch c. " prfm pldl1keep,[x16] \n\t" // Prefetch c. " prfm pldl1keep,[x17] \n\t" // Prefetch c. -" prfm pldl1keep,[x18] \n\t" // Prefetch c. " prfm pldl1keep,[x19] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. @@ -115,6 +114,7 @@ __asm__ volatile " prfm pldl1keep,[x24] \n\t" // Prefetch c. " prfm pldl1keep,[x25] \n\t" // Prefetch c. " prfm pldl1keep,[x26] \n\t" // Prefetch c. 
+" prfm pldl1keep,[x27] \n\t" // Prefetch c. " \n\t" " dup v8.4s, wzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #192] \n\t" @@ -560,12 +560,12 @@ LABEL(SBETAZEROCOLSTOREDS1) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. " \n\t" -" ldr q8, [x18] \n\t" //Load column 3 of C -" ldr q9, [x18, #16] \n\t" -" ldr q10, [x19] \n\t" //Load column 4 of C -" ldr q11, [x19, #16] \n\t" -" ldr q12, [x20] \n\t" //Load column 5 of C -" ldr q13, [x20, #16] \n\t" +" ldr q8, [x19] \n\t" //Load column 3 of C +" ldr q9, [x19, #16] \n\t" +" ldr q10, [x20] \n\t" //Load column 4 of C +" ldr q11, [x20, #16] \n\t" +" ldr q12, [x21] \n\t" //Load column 5 of C +" ldr q13, [x21, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -583,12 +583,12 @@ LABEL(SBETAZEROCOLSTOREDS2) " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" str q8, [x18] \n\t" //Store column 3 of C -" str q9, [x18, #16] \n\t" -" str q10, [x19] \n\t" //Store column 4 of C -" str q11, [x19, #16] \n\t" -" str q12, [x20] \n\t" //Store column 5 of C -" str q13, [x20, #16] \n\t" +" str q8, [x19] \n\t" //Store column 3 of C +" str q9, [x19, #16] \n\t" +" str q10, [x20] \n\t" //Store column 4 of C +" str q11, [x20, #16] \n\t" +" str q12, [x21] \n\t" //Store column 5 of C +" str q13, [x21, #16] \n\t" " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -600,12 +600,12 @@ LABEL(SBETAZEROCOLSTOREDS2) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" -" ldr q0, [x21] \n\t" //Load column 6 of C -" ldr q1, [x21, #16] \n\t" -" ldr q2, [x22] \n\t" //Load column 7 of C -" ldr q3, [x22, #16] \n\t" -" ldr q4, [x23] \n\t" //Load column 8 of C -" ldr q5, [x23, #16] \n\t" +" ldr q0, [x22] \n\t" //Load column 6 of C +" ldr q1, [x22, #16] \n\t" +" ldr q2, [x23] \n\t" //Load column 7 of C +" ldr q3, [x23, #16] \n\t" +" ldr q4, [x24] \n\t" //Load column 8 of C +" ldr q5, [x24, #16] \n\t" " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta @@ -623,12 +623,12 @@ LABEL(SBETAZEROCOLSTOREDS3) " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" str q0, [x21] \n\t" //Store column 6 of C -" str q1, [x21, #16] \n\t" -" str q2, [x22] \n\t" //Store column 7 of C -" str q3, [x22, #16] \n\t" -" str q4, [x23] \n\t" //Store column 8 of C -" str q5, [x23, #16] \n\t" +" str q0, [x22] \n\t" //Store column 6 of C +" str q1, [x22, #16] \n\t" +" str q2, [x23] \n\t" //Store column 7 of C +" str q3, [x23, #16] \n\t" +" str q4, [x24] \n\t" //Store column 8 of C +" str q5, [x24, #16] \n\t" " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" @@ -640,12 +640,12 @@ LABEL(SBETAZEROCOLSTOREDS3) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" -" ldr q8, [x24] \n\t" //Load column 9 of C -" ldr q9, [x24, #16] \n\t" -" ldr q10, [x25] \n\t" //Load column 10 of C -" ldr q11, [x25, #16] \n\t" -" ldr q12, [x26] \n\t" //Load column 11 of C -" ldr q13, [x26, #16] \n\t" +" ldr q8, [x25] \n\t" //Load column 9 of C +" ldr q9, [x25, #16] \n\t" +" ldr q10, [x26] \n\t" //Load column 10 of C +" ldr q11, [x26, #16] \n\t" +" ldr q12, [x27] \n\t" //Load column 11 of C +" ldr q13, [x27, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -666,12 +666,12 @@ LABEL(SBETAZEROCOLSTOREDS4) " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" str q8, [x24] \n\t" //Store column 9 of C -" str q9, [x24, #16] \n\t" -" str q10, [x25] \n\t" //Store column 10 of C -" str q11, [x25, #16] \n\t" -" str q12, [x26] \n\t" //Store column 11 of C -" str q13, [x26, #16] \n\t" +" str q8, [x25] \n\t" //Store column 9 of C +" str q9, [x25, #16] \n\t" +" str q10, [x26] \n\t" //Store column 10 of C +" str q11, [x26, #16] \n\t" +" str q12, [x27] \n\t" //Store column 11 of C +" str q13, [x27, #16] \n\t" " \n\t" " \n\t" BRANCH(SEND) // Done. @@ -690,38 +690,38 @@ LABEL(SGENSTORED) // C is general-stride stored " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" -" mov x27, x2 \n\t" -" \n\t" -" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. -" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. -" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. -" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c06 into quad and increment by rs_c. 
-" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c07 into quad and increment by rs_c. -" \n\t" -" mov x27, x16 \n\t" -" \n\t" -" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. -" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. -" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. -" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c16 into quad and increment by rs_c. -" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c17 into quad and increment by rs_c. -" \n\t" -" mov x27, x17 \n\t" -" \n\t" -" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c20 into quad and increment by rs_c. -" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c21 into quad and increment by rs_c. -" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c22 into quad and increment by rs_c. -" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c23 into quad and increment by rs_c. -" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. -" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. -" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c26 into quad and increment by rs_c. -" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c27 into quad and increment by rs_c. +" mov x5, x2 \n\t" +" \n\t" +" ld1 {v0.s}[0],[x5],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x5],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x5],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x5],x14 \n\t" // Load c03 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x5],x14 \n\t" // Load c04 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x5],x14 \n\t" // Load c05 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x5],x14 \n\t" // Load c06 into quad and increment by rs_c. 
+" ld1 {v1.s}[3],[x5],x14 \n\t" // Load c07 into quad and increment by rs_c. +" \n\t" +" mov x5, x16 \n\t" +" \n\t" +" ld1 {v2.s}[0],[x5],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x5],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x5],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x5],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x5],x14 \n\t" // Load c14 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x5],x14 \n\t" // Load c15 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x5],x14 \n\t" // Load c16 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x5],x14 \n\t" // Load c17 into quad and increment by rs_c. +" \n\t" +" mov x5, x17 \n\t" +" \n\t" +" ld1 {v4.s}[0],[x5],x14 \n\t" // Load c20 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x5],x14 \n\t" // Load c21 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x5],x14 \n\t" // Load c22 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x5],x14 \n\t" // Load c23 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x5],x14 \n\t" // Load c24 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x5],x14 \n\t" // Load c25 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x5],x14 \n\t" // Load c26 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x5],x14 \n\t" // Load c27 into quad and increment by rs_c. " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta @@ -739,38 +739,38 @@ LABEL(SBETAZEROGENSTOREDS1) " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x2 \n\t" -" \n\t" -" st1 {v0.s}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v0.s}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v0.s}[2],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. 
-" st1 {v0.s}[3],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. -" st1 {v1.s}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. -" st1 {v1.s}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. -" st1 {v1.s}[2],[x27],x14 \n\t" // Store c06 into quad and increment by rs_c. -" st1 {v1.s}[3],[x27],x14 \n\t" // Store c07 into quad and increment by rs_c. -" \n\t" -" mov x27, x16 \n\t" -" \n\t" -" st1 {v2.s}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v2.s}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v2.s}[2],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v2.s}[3],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. -" st1 {v3.s}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. -" st1 {v3.s}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. -" st1 {v3.s}[2],[x27],x14 \n\t" // Store c16 into quad and increment by rs_c. -" st1 {v3.s}[3],[x27],x14 \n\t" // Store c17 into quad and increment by rs_c. -" \n\t" -" mov x27, x17 \n\t" -" \n\t" -" st1 {v4.s}[0],[x27],x14 \n\t" // Store c20 into quad and increment by rs_c. -" st1 {v4.s}[1],[x27],x14 \n\t" // Store c21 into quad and increment by rs_c. -" st1 {v4.s}[2],[x27],x14 \n\t" // Store c22 into quad and increment by rs_c. -" st1 {v4.s}[3],[x27],x14 \n\t" // Store c23 into quad and increment by rs_c. -" st1 {v5.s}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. -" st1 {v5.s}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. -" st1 {v5.s}[2],[x27],x14 \n\t" // Store c26 into quad and increment by rs_c. -" st1 {v5.s}[3],[x27],x14 \n\t" // Store c27 into quad and increment by rs_c. +" mov x5, x2 \n\t" +" \n\t" +" st1 {v0.s}[0],[x5],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v0.s}[1],[x5],x14 \n\t" // Store c01 into quad and increment by rs_c. 
+" st1 {v0.s}[2],[x5],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v0.s}[3],[x5],x14 \n\t" // Store c03 into quad and increment by rs_c. +" st1 {v1.s}[0],[x5],x14 \n\t" // Store c04 into quad and increment by rs_c. +" st1 {v1.s}[1],[x5],x14 \n\t" // Store c05 into quad and increment by rs_c. +" st1 {v1.s}[2],[x5],x14 \n\t" // Store c06 into quad and increment by rs_c. +" st1 {v1.s}[3],[x5],x14 \n\t" // Store c07 into quad and increment by rs_c. +" \n\t" +" mov x5, x16 \n\t" +" \n\t" +" st1 {v2.s}[0],[x5],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v2.s}[1],[x5],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v2.s}[2],[x5],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v2.s}[3],[x5],x14 \n\t" // Store c13 into quad and increment by rs_c. +" st1 {v3.s}[0],[x5],x14 \n\t" // Store c14 into quad and increment by rs_c. +" st1 {v3.s}[1],[x5],x14 \n\t" // Store c15 into quad and increment by rs_c. +" st1 {v3.s}[2],[x5],x14 \n\t" // Store c16 into quad and increment by rs_c. +" st1 {v3.s}[3],[x5],x14 \n\t" // Store c17 into quad and increment by rs_c. +" \n\t" +" mov x5, x17 \n\t" +" \n\t" +" st1 {v4.s}[0],[x5],x14 \n\t" // Store c20 into quad and increment by rs_c. +" st1 {v4.s}[1],[x5],x14 \n\t" // Store c21 into quad and increment by rs_c. +" st1 {v4.s}[2],[x5],x14 \n\t" // Store c22 into quad and increment by rs_c. +" st1 {v4.s}[3],[x5],x14 \n\t" // Store c23 into quad and increment by rs_c. +" st1 {v5.s}[0],[x5],x14 \n\t" // Store c24 into quad and increment by rs_c. +" st1 {v5.s}[1],[x5],x14 \n\t" // Store c25 into quad and increment by rs_c. +" st1 {v5.s}[2],[x5],x14 \n\t" // Store c26 into quad and increment by rs_c. +" st1 {v5.s}[3],[x5],x14 \n\t" // Store c27 into quad and increment by rs_c. " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" @@ -782,38 +782,38 @@ LABEL(SBETAZEROGENSTOREDS1) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" -" mov x27, x18 \n\t" -" \n\t" -" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. -" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c36 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c37 into quad and increment by rs_c. -" \n\t" -" mov x27, x19 \n\t" -" \n\t" -" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. -" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. -" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c46 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c47 into quad and increment by rs_c. -" \n\t" -" mov x27, x20 \n\t" -" \n\t" -" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. -" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. -" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. -" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. -" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. -" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. -" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c56 into quad and increment by rs_c. 
-" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c57 into quad and increment by rs_c. +" mov x5, x19 \n\t" +" \n\t" +" ld1 {v8.s}[0],[x5],x14 \n\t" // Load c30 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x5],x14 \n\t" // Load c31 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x5],x14 \n\t" // Load c32 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x5],x14 \n\t" // Load c33 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x5],x14 \n\t" // Load c34 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x5],x14 \n\t" // Load c35 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x5],x14 \n\t" // Load c36 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x5],x14 \n\t" // Load c37 into quad and increment by rs_c. +" \n\t" +" mov x5, x20 \n\t" +" \n\t" +" ld1 {v10.s}[0],[x5],x14 \n\t" // Load c40 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x5],x14 \n\t" // Load c41 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x5],x14 \n\t" // Load c42 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x5],x14 \n\t" // Load c43 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x5],x14 \n\t" // Load c44 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x5],x14 \n\t" // Load c45 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x5],x14 \n\t" // Load c46 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x5],x14 \n\t" // Load c47 into quad and increment by rs_c. +" \n\t" +" mov x5, x21 \n\t" +" \n\t" +" ld1 {v12.s}[0],[x5],x14 \n\t" // Load c50 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x5],x14 \n\t" // Load c51 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x5],x14 \n\t" // Load c52 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x5],x14 \n\t" // Load c53 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x5],x14 \n\t" // Load c54 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x5],x14 \n\t" // Load c55 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x5],x14 \n\t" // Load c56 into quad and increment by rs_c. 
+" ld1 {v13.s}[3],[x5],x14 \n\t" // Load c57 into quad and increment by rs_c. " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -831,38 +831,38 @@ LABEL(SBETAZEROGENSTOREDS2) " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x18 \n\t" -" \n\t" -" st1 {v8.s}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. -" st1 {v8.s}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. -" st1 {v8.s}[2],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. -" st1 {v8.s}[3],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. -" st1 {v9.s}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. -" st1 {v9.s}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. -" st1 {v9.s}[2],[x27],x14 \n\t" // Store c36 into quad and increment by rs_c. -" st1 {v9.s}[3],[x27],x14 \n\t" // Store c37 into quad and increment by rs_c. -" \n\t" -" mov x27, x19 \n\t" -" \n\t" -" st1 {v10.s}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. -" st1 {v10.s}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. -" st1 {v10.s}[2],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. -" st1 {v10.s}[3],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. -" st1 {v11.s}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. -" st1 {v11.s}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. -" st1 {v11.s}[2],[x27],x14 \n\t" // Store c46 into quad and increment by rs_c. -" st1 {v11.s}[3],[x27],x14 \n\t" // Store c47 into quad and increment by rs_c. -" \n\t" -" mov x27, x20 \n\t" -" \n\t" -" st1 {v12.s}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. -" st1 {v12.s}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. -" st1 {v12.s}[2],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. 
-" st1 {v12.s}[3],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. -" st1 {v13.s}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. -" st1 {v13.s}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. -" st1 {v13.s}[2],[x27],x14 \n\t" // Store c56 into quad and increment by rs_c. -" st1 {v13.s}[3],[x27],x14 \n\t" // Store c57 into quad and increment by rs_c. +" mov x5, x19 \n\t" +" \n\t" +" st1 {v8.s}[0],[x5],x14 \n\t" // Store c30 into quad and increment by rs_c. +" st1 {v8.s}[1],[x5],x14 \n\t" // Store c31 into quad and increment by rs_c. +" st1 {v8.s}[2],[x5],x14 \n\t" // Store c32 into quad and increment by rs_c. +" st1 {v8.s}[3],[x5],x14 \n\t" // Store c33 into quad and increment by rs_c. +" st1 {v9.s}[0],[x5],x14 \n\t" // Store c34 into quad and increment by rs_c. +" st1 {v9.s}[1],[x5],x14 \n\t" // Store c35 into quad and increment by rs_c. +" st1 {v9.s}[2],[x5],x14 \n\t" // Store c36 into quad and increment by rs_c. +" st1 {v9.s}[3],[x5],x14 \n\t" // Store c37 into quad and increment by rs_c. +" \n\t" +" mov x5, x20 \n\t" +" \n\t" +" st1 {v10.s}[0],[x5],x14 \n\t" // Store c40 into quad and increment by rs_c. +" st1 {v10.s}[1],[x5],x14 \n\t" // Store c41 into quad and increment by rs_c. +" st1 {v10.s}[2],[x5],x14 \n\t" // Store c42 into quad and increment by rs_c. +" st1 {v10.s}[3],[x5],x14 \n\t" // Store c43 into quad and increment by rs_c. +" st1 {v11.s}[0],[x5],x14 \n\t" // Store c44 into quad and increment by rs_c. +" st1 {v11.s}[1],[x5],x14 \n\t" // Store c45 into quad and increment by rs_c. +" st1 {v11.s}[2],[x5],x14 \n\t" // Store c46 into quad and increment by rs_c. +" st1 {v11.s}[3],[x5],x14 \n\t" // Store c47 into quad and increment by rs_c. +" \n\t" +" mov x5, x21 \n\t" +" \n\t" +" st1 {v12.s}[0],[x5],x14 \n\t" // Store c50 into quad and increment by rs_c. +" st1 {v12.s}[1],[x5],x14 \n\t" // Store c51 into quad and increment by rs_c. 
+" st1 {v12.s}[2],[x5],x14 \n\t" // Store c52 into quad and increment by rs_c. +" st1 {v12.s}[3],[x5],x14 \n\t" // Store c53 into quad and increment by rs_c. +" st1 {v13.s}[0],[x5],x14 \n\t" // Store c54 into quad and increment by rs_c. +" st1 {v13.s}[1],[x5],x14 \n\t" // Store c55 into quad and increment by rs_c. +" st1 {v13.s}[2],[x5],x14 \n\t" // Store c56 into quad and increment by rs_c. +" st1 {v13.s}[3],[x5],x14 \n\t" // Store c57 into quad and increment by rs_c. " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -874,38 +874,38 @@ LABEL(SBETAZEROGENSTOREDS2) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" -" mov x27, x21 \n\t" -" \n\t" -" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c60 into quad and increment by rs_c. -" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c61 into quad and increment by rs_c. -" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c62 into quad and increment by rs_c. -" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c63 into quad and increment by rs_c. -" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. -" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. -" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c66 into quad and increment by rs_c. -" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c67 into quad and increment by rs_c. -" \n\t" -" mov x27, x22 \n\t" -" \n\t" -" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. -" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. -" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. -" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. -" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. -" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. -" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c76 into quad and increment by rs_c. -" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c77 into quad and increment by rs_c. 
-" \n\t" -" mov x27, x23 \n\t" -" \n\t" -" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c80 into quad and increment by rs_c. -" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c81 into quad and increment by rs_c. -" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c82 into quad and increment by rs_c. -" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c83 into quad and increment by rs_c. -" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c84 into quad and increment by rs_c. -" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c85 into quad and increment by rs_c. -" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c86 into quad and increment by rs_c. -" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c87 into quad and increment by rs_c. +" mov x5, x22 \n\t" +" \n\t" +" ld1 {v0.s}[0],[x5],x14 \n\t" // Load c60 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x5],x14 \n\t" // Load c61 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x5],x14 \n\t" // Load c62 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x5],x14 \n\t" // Load c63 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x5],x14 \n\t" // Load c64 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x5],x14 \n\t" // Load c65 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x5],x14 \n\t" // Load c66 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x5],x14 \n\t" // Load c67 into quad and increment by rs_c. +" \n\t" +" mov x5, x23 \n\t" +" \n\t" +" ld1 {v2.s}[0],[x5],x14 \n\t" // Load c70 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x5],x14 \n\t" // Load c71 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x5],x14 \n\t" // Load c72 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x5],x14 \n\t" // Load c73 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x5],x14 \n\t" // Load c74 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x5],x14 \n\t" // Load c75 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x5],x14 \n\t" // Load c76 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x5],x14 \n\t" // Load c77 into quad and increment by rs_c. 
+" \n\t" +" mov x5, x24 \n\t" +" \n\t" +" ld1 {v4.s}[0],[x5],x14 \n\t" // Load c80 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x5],x14 \n\t" // Load c81 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x5],x14 \n\t" // Load c82 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x5],x14 \n\t" // Load c83 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x5],x14 \n\t" // Load c84 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x5],x14 \n\t" // Load c85 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x5],x14 \n\t" // Load c86 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x5],x14 \n\t" // Load c87 into quad and increment by rs_c. " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta @@ -923,38 +923,38 @@ LABEL(SBETAZEROGENSTOREDS3) " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x21 \n\t" -" \n\t" -" st1 {v0.s}[0],[x27],x14 \n\t" // Store c60 into quad and increment by rs_c. -" st1 {v0.s}[1],[x27],x14 \n\t" // Store c61 into quad and increment by rs_c. -" st1 {v0.s}[2],[x27],x14 \n\t" // Store c62 into quad and increment by rs_c. -" st1 {v0.s}[3],[x27],x14 \n\t" // Store c63 into quad and increment by rs_c. -" st1 {v1.s}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. -" st1 {v1.s}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. -" st1 {v1.s}[2],[x27],x14 \n\t" // Store c66 into quad and increment by rs_c. -" st1 {v1.s}[3],[x27],x14 \n\t" // Store c67 into quad and increment by rs_c. -" \n\t" -" mov x27, x22 \n\t" -" \n\t" -" st1 {v2.s}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. -" st1 {v2.s}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. -" st1 {v2.s}[2],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. -" st1 {v2.s}[3],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. 
-" st1 {v3.s}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. -" st1 {v3.s}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. -" st1 {v3.s}[2],[x27],x14 \n\t" // Store c76 into quad and increment by rs_c. -" st1 {v3.s}[3],[x27],x14 \n\t" // Store c77 into quad and increment by rs_c. -" \n\t" -" mov x27, x23 \n\t" -" \n\t" -" st1 {v4.s}[0],[x27],x14 \n\t" // Store c80 into quad and increment by rs_c. -" st1 {v4.s}[1],[x27],x14 \n\t" // Store c81 into quad and increment by rs_c. -" st1 {v4.s}[2],[x27],x14 \n\t" // Store c82 into quad and increment by rs_c. -" st1 {v4.s}[3],[x27],x14 \n\t" // Store c83 into quad and increment by rs_c. -" st1 {v5.s}[0],[x27],x14 \n\t" // Store c84 into quad and increment by rs_c. -" st1 {v5.s}[1],[x27],x14 \n\t" // Store c85 into quad and increment by rs_c. -" st1 {v5.s}[2],[x27],x14 \n\t" // Store c86 into quad and increment by rs_c. -" st1 {v5.s}[3],[x27],x14 \n\t" // Store c87 into quad and increment by rs_c. +" mov x5, x22 \n\t" +" \n\t" +" st1 {v0.s}[0],[x5],x14 \n\t" // Store c60 into quad and increment by rs_c. +" st1 {v0.s}[1],[x5],x14 \n\t" // Store c61 into quad and increment by rs_c. +" st1 {v0.s}[2],[x5],x14 \n\t" // Store c62 into quad and increment by rs_c. +" st1 {v0.s}[3],[x5],x14 \n\t" // Store c63 into quad and increment by rs_c. +" st1 {v1.s}[0],[x5],x14 \n\t" // Store c64 into quad and increment by rs_c. +" st1 {v1.s}[1],[x5],x14 \n\t" // Store c65 into quad and increment by rs_c. +" st1 {v1.s}[2],[x5],x14 \n\t" // Store c66 into quad and increment by rs_c. +" st1 {v1.s}[3],[x5],x14 \n\t" // Store c67 into quad and increment by rs_c. +" \n\t" +" mov x5, x23 \n\t" +" \n\t" +" st1 {v2.s}[0],[x5],x14 \n\t" // Store c70 into quad and increment by rs_c. +" st1 {v2.s}[1],[x5],x14 \n\t" // Store c71 into quad and increment by rs_c. +" st1 {v2.s}[2],[x5],x14 \n\t" // Store c72 into quad and increment by rs_c. +" st1 {v2.s}[3],[x5],x14 \n\t" // Store c73 into quad and increment by rs_c. 
+" st1 {v3.s}[0],[x5],x14 \n\t" // Store c74 into quad and increment by rs_c. +" st1 {v3.s}[1],[x5],x14 \n\t" // Store c75 into quad and increment by rs_c. +" st1 {v3.s}[2],[x5],x14 \n\t" // Store c76 into quad and increment by rs_c. +" st1 {v3.s}[3],[x5],x14 \n\t" // Store c77 into quad and increment by rs_c. +" \n\t" +" mov x5, x24 \n\t" +" \n\t" +" st1 {v4.s}[0],[x5],x14 \n\t" // Store c80 into quad and increment by rs_c. +" st1 {v4.s}[1],[x5],x14 \n\t" // Store c81 into quad and increment by rs_c. +" st1 {v4.s}[2],[x5],x14 \n\t" // Store c82 into quad and increment by rs_c. +" st1 {v4.s}[3],[x5],x14 \n\t" // Store c83 into quad and increment by rs_c. +" st1 {v5.s}[0],[x5],x14 \n\t" // Store c84 into quad and increment by rs_c. +" st1 {v5.s}[1],[x5],x14 \n\t" // Store c85 into quad and increment by rs_c. +" st1 {v5.s}[2],[x5],x14 \n\t" // Store c86 into quad and increment by rs_c. +" st1 {v5.s}[3],[x5],x14 \n\t" // Store c87 into quad and increment by rs_c. " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" @@ -966,38 +966,38 @@ LABEL(SBETAZEROGENSTOREDS3) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" -" mov x27, x24 \n\t" -" \n\t" -" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c90 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c91 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c92 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c93 into quad and increment by rs_c. -" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c94 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c95 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c96 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c97 into quad and increment by rs_c. -" \n\t" -" mov x27, x25 \n\t" -" \n\t" -" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c100 into quad and increment by rs_c. 
-" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c101 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c102 into quad and increment by rs_c. -" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c103 into quad and increment by rs_c. -" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c104 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c105 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c106 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c107 into quad and increment by rs_c. -" \n\t" -" mov x27, x26 \n\t" -" \n\t" -" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c110 into quad and increment by rs_c. -" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c111 into quad and increment by rs_c. -" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c112 into quad and increment by rs_c. -" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c113 into quad and increment by rs_c. -" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c114 into quad and increment by rs_c. -" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c115 into quad and increment by rs_c. -" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c116 into quad and increment by rs_c. -" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c117 into quad and increment by rs_c. +" mov x5, x25 \n\t" +" \n\t" +" ld1 {v8.s}[0],[x5],x14 \n\t" // Load c90 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x5],x14 \n\t" // Load c91 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x5],x14 \n\t" // Load c92 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x5],x14 \n\t" // Load c93 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x5],x14 \n\t" // Load c94 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x5],x14 \n\t" // Load c95 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x5],x14 \n\t" // Load c96 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x5],x14 \n\t" // Load c97 into quad and increment by rs_c. 
+" \n\t" +" mov x5, x26 \n\t" +" \n\t" +" ld1 {v10.s}[0],[x5],x14 \n\t" // Load c100 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x5],x14 \n\t" // Load c101 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x5],x14 \n\t" // Load c102 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x5],x14 \n\t" // Load c103 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x5],x14 \n\t" // Load c104 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x5],x14 \n\t" // Load c105 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x5],x14 \n\t" // Load c106 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x5],x14 \n\t" // Load c107 into quad and increment by rs_c. +" \n\t" +" mov x5, x27 \n\t" +" \n\t" +" ld1 {v12.s}[0],[x5],x14 \n\t" // Load c110 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x5],x14 \n\t" // Load c111 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x5],x14 \n\t" // Load c112 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x5],x14 \n\t" // Load c113 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x5],x14 \n\t" // Load c114 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x5],x14 \n\t" // Load c115 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x5],x14 \n\t" // Load c116 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x5],x14 \n\t" // Load c117 into quad and increment by rs_c. " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -1018,38 +1018,38 @@ LABEL(SBETAZEROGENSTOREDS4) " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x24 \n\t" -" \n\t" -" st1 {v8.s}[0],[x27],x14 \n\t" // Store c90 into quad and increment by rs_c. -" st1 {v8.s}[1],[x27],x14 \n\t" // Store c91 into quad and increment by rs_c. -" st1 {v8.s}[2],[x27],x14 \n\t" // Store c92 into quad and increment by rs_c. -" st1 {v8.s}[3],[x27],x14 \n\t" // Store c93 into quad and increment by rs_c. 
-" st1 {v9.s}[0],[x27],x14 \n\t" // Store c94 into quad and increment by rs_c. -" st1 {v9.s}[1],[x27],x14 \n\t" // Store c95 into quad and increment by rs_c. -" st1 {v9.s}[2],[x27],x14 \n\t" // Store c96 into quad and increment by rs_c. -" st1 {v9.s}[3],[x27],x14 \n\t" // Store c97 into quad and increment by rs_c. -" \n\t" -" mov x27, x25 \n\t" -" \n\t" -" st1 {v10.s}[0],[x27],x14 \n\t" // Store c100 into quad and increment by rs_c. -" st1 {v10.s}[1],[x27],x14 \n\t" // Store c101 into quad and increment by rs_c. -" st1 {v10.s}[2],[x27],x14 \n\t" // Store c102 into quad and increment by rs_c. -" st1 {v10.s}[3],[x27],x14 \n\t" // Store c103 into quad and increment by rs_c. -" st1 {v11.s}[0],[x27],x14 \n\t" // Store c104 into quad and increment by rs_c. -" st1 {v11.s}[1],[x27],x14 \n\t" // Store c105 into quad and increment by rs_c. -" st1 {v11.s}[2],[x27],x14 \n\t" // Store c106 into quad and increment by rs_c. -" st1 {v11.s}[3],[x27],x14 \n\t" // Store c107 into quad and increment by rs_c. -" \n\t" -" mov x27, x26 \n\t" -" \n\t" -" st1 {v12.s}[0],[x27],x14 \n\t" // Store c110 into quad and increment by rs_c. -" st1 {v12.s}[1],[x27],x14 \n\t" // Store c111 into quad and increment by rs_c. -" st1 {v12.s}[2],[x27],x14 \n\t" // Store c112 into quad and increment by rs_c. -" st1 {v12.s}[3],[x27],x14 \n\t" // Store c113 into quad and increment by rs_c. -" st1 {v13.s}[0],[x27],x14 \n\t" // Store c114 into quad and increment by rs_c. -" st1 {v13.s}[1],[x27],x14 \n\t" // Store c115 into quad and increment by rs_c. -" st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. -" st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c. +" mov x5, x25 \n\t" +" \n\t" +" st1 {v8.s}[0],[x5],x14 \n\t" // Store c90 into quad and increment by rs_c. +" st1 {v8.s}[1],[x5],x14 \n\t" // Store c91 into quad and increment by rs_c. +" st1 {v8.s}[2],[x5],x14 \n\t" // Store c92 into quad and increment by rs_c. 
+" st1 {v8.s}[3],[x5],x14 \n\t" // Store c93 into quad and increment by rs_c. +" st1 {v9.s}[0],[x5],x14 \n\t" // Store c94 into quad and increment by rs_c. +" st1 {v9.s}[1],[x5],x14 \n\t" // Store c95 into quad and increment by rs_c. +" st1 {v9.s}[2],[x5],x14 \n\t" // Store c96 into quad and increment by rs_c. +" st1 {v9.s}[3],[x5],x14 \n\t" // Store c97 into quad and increment by rs_c. +" \n\t" +" mov x5, x26 \n\t" +" \n\t" +" st1 {v10.s}[0],[x5],x14 \n\t" // Store c100 into quad and increment by rs_c. +" st1 {v10.s}[1],[x5],x14 \n\t" // Store c101 into quad and increment by rs_c. +" st1 {v10.s}[2],[x5],x14 \n\t" // Store c102 into quad and increment by rs_c. +" st1 {v10.s}[3],[x5],x14 \n\t" // Store c103 into quad and increment by rs_c. +" st1 {v11.s}[0],[x5],x14 \n\t" // Store c104 into quad and increment by rs_c. +" st1 {v11.s}[1],[x5],x14 \n\t" // Store c105 into quad and increment by rs_c. +" st1 {v11.s}[2],[x5],x14 \n\t" // Store c106 into quad and increment by rs_c. +" st1 {v11.s}[3],[x5],x14 \n\t" // Store c107 into quad and increment by rs_c. +" \n\t" +" mov x5, x27 \n\t" +" \n\t" +" st1 {v12.s}[0],[x5],x14 \n\t" // Store c110 into quad and increment by rs_c. +" st1 {v12.s}[1],[x5],x14 \n\t" // Store c111 into quad and increment by rs_c. +" st1 {v12.s}[2],[x5],x14 \n\t" // Store c112 into quad and increment by rs_c. +" st1 {v12.s}[3],[x5],x14 \n\t" // Store c113 into quad and increment by rs_c. +" st1 {v13.s}[0],[x5],x14 \n\t" // Store c114 into quad and increment by rs_c. +" st1 {v13.s}[1],[x5],x14 \n\t" // Store c115 into quad and increment by rs_c. +" st1 {v13.s}[2],[x5],x14 \n\t" // Store c116 into quad and increment by rs_c. +" st1 {v13.s}[3],[x5],x14 \n\t" // Store c147 into quad and increment by rs_c. " \n\t" LABEL(SEND) // Done! " \n\t" @@ -1068,11 +1068,10 @@ LABEL(SEND) // Done! 
[b_next] "m" (b_next) // 10 :// Register clobber list "x0", "x1", "x2", - "x5", "x6", "x10", - "x14", - "x16","x17","x18","x19", - "x20","x21","x22","x23", - "x24","x25","x26","x27", + "x5", "x6", "x10","x14", + "x16","x17","x19","x20", + "x21","x22","x23","x24", + "x25","x26","x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11", From 7f7d72610c25f511ba8cd2a53be7b59bdb80f3f3 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 31 May 2021 16:50:18 -0500 Subject: [PATCH 025/226] Fixed bugs in cpackm kernels, gemmlike code. Details: - Fixed intermittent bugs in bli_packm_haswell_asm_c3xk.c and bli_packm_haswell_asm_c8xk.c whereby the imaginary component of the kappa scalar was incorrectly loaded at an offset of 8 bytes (instead of 4 bytes) from the real component. This was almost certainly a copy-paste bug carried over from the corresponding zpackm kernels. Thanks to Devin Matthews for bringing this to my attention. - Added missing code to gemmlike sandbox files bls_gemm_bp_var1.c and bls_gemm_bp_var2.c that initializes the elements of the temporary microtile to zero. (This bug was never observed in output but rather noticed analytically. It probably would have also manifested as intermittent failures, this time involving edge cases.) - Minor commented-out/disabled changes to testsuite/src/test_gemm.c relating to debugging.
--- kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c | 2 +- kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c | 2 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 3 +++ sandbox/gemmlike/bls_gemm_bp_var2.c | 6 ++++++ sandbox/gemmlike/bls_l3_packm_var.c | 14 +++++++------- testsuite/src/test_gemm.c | 15 ++++++++------- 6 files changed, 26 insertions(+), 16 deletions(-) diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c index c31384cc45..273caeb3db 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c @@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_3xk mov(var(kappa), rcx) // load address of kappa vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate - vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate + vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate // now branch on kappa == 1.0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c index 02c894a393..be6877e71a 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c @@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_8xk mov(var(kappa), rcx) // load address of kappa vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate - vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate + vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate // now branch on kappa == 1.0 diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index ae695ce34f..330a94801b 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -230,6 +230,9 @@ void PASTECH2(bls_,ch,varname) \ thrinfo_t* restrict thread_pa = NULL; \ thrinfo_t* restrict thread_jr = NULL; \ thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_jc = thread; \ diff --git a/sandbox/gemmlike/bls_gemm_bp_var2.c b/sandbox/gemmlike/bls_gemm_bp_var2.c index 957cd57944..22df767aea 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var2.c +++ b/sandbox/gemmlike/bls_gemm_bp_var2.c @@ -538,6 +538,12 @@ void PASTECH2(bls_,ch,varname) \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype zero = *PASTEMAC(ch,0); \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. + NOTE: This initialization should really be done statically since + var2 executes this microkernel wrapper many times, and the overhead + of touching the temporary microtile adds up. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \ \ /* Handle interior and edge cases separately. */ \ if ( mr_cur == MR && nr_cur == NR ) \ diff --git a/sandbox/gemmlike/bls_l3_packm_var.c b/sandbox/gemmlike/bls_l3_packm_var.c index 8a4c1d0206..3265ef834d 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.c +++ b/sandbox/gemmlike/bls_l3_packm_var.c @@ -176,17 +176,17 @@ void PASTECH2(bls_,ch,varname) \ cntx \ ); \ } \ -\ - p_begin += ps_p; \ \ /* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ if ( !row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ */ \ +\ + p_begin += ps_p; \ } \ } diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index a414b3404a..f485829a1a 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -254,18 +254,17 @@ void 
libblis_test_gemm_experiment bli_setsc( 0.9, 1.0, &beta ); } + #if 0 + //bli_setm( &BLIS_ONE, &a ); + bli_setsc( 1.0, 0.0, &alpha ); + bli_setsc( 1.0, 0.0, &beta ); + #endif + // Randomize A, B, and C, and save C. libblis_test_mobj_randomize( params, TRUE, &a ); libblis_test_mobj_randomize( params, TRUE, &b ); libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -//bli_setm( &BLIS_ONE, &a ); -//bli_setsc( 1.0, 0.0, &alpha ); -//bli_setsc( 0.0, 0.0, &beta ); - -//bli_setm( &BLIS_ONE, &a ); -//bli_setsc( 1.0, 0.0, &alpha ); -//bli_setsc( 0.0, 0.0, &beta ); // Apply the parameters. bli_obj_set_conjtrans( transa, &a ); @@ -456,11 +455,13 @@ bli_printm( "c", c, "%5.2f", "" ); // bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR ) //bli_printm( "c before", c, "%6.3f", "" ); bli_gemm( alpha, a, b, beta, c ); + //bls_gemm( alpha, a, b, beta, c ); #if 0 if ( bli_obj_length( c ) == 12 && bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR ) bli_printm( "c after", c, "%6.3f", "" ); #endif +//bli_printm( "c after", c, "%5.2f", "" ); break; default: From 7c3eb44efaa762088c190bb820ef6a3c87db8f65 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 2 Jun 2021 11:28:22 -0500 Subject: [PATCH 026/226] Add vhsubpd/vhsubps. Horizontal subtraction instructions added to bli_x86_asm_macros.h, currently unused [ci skip].
--- frame/include/bli_x86_asm_macros.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index a4987b4c5f..b470d320d9 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -885,6 +885,8 @@ #define VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2) #define VHADDPD(_0, _1, _2) INSTR_(vhaddpd, _0, _1, _2) #define VHADDPS(_0, _1, _2) INSTR_(vhaddps, _0, _1, _2) +#define VHSUBPD(_0, _1, _2) INSTR_(vhsubpd, _0, _1, _2) +#define VHSUBPS(_0, _1, _2) INSTR_(vhsubps, _0, _1, _2) #define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2) #define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2) #define VSUBPS(_0, _1, _2) INSTR_(vsubps, _0, _1, _2) @@ -1015,6 +1017,8 @@ #define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2) #define vhaddpd(_0, _1, _2) VHADDPD(_0, _1, _2) #define vhaddps(_0, _1, _2) VHADDPS(_0, _1, _2) +#define vhsubpd(_0, _1, _2) VHSUBPD(_0, _1, _2) +#define vhsubps(_0, _1, _2) VHSUBPS(_0, _1, _2) #define vaddps(_0, _1, _2) VADDPS(_0, _1, _2) #define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2) #define vsubps(_0, _1, _2) VSUBPS(_0, _1, _2) From d10e05bbd1ce45ce2c0dfe5c64daae2633357b3f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 13 Jun 2021 19:36:16 -0500 Subject: [PATCH 027/226] Sandbox header edits trigger full library rebuild. Details: - Adjusted the top-level Makefile so that any change to a sandbox header file will result in blis.h being regenerated along with a full recompilation of the library. Previously, sandbox files were omitted from the list of header files that, when touched, could trigger a full rebuild. Why was it like that previously? Because originally we only envisioned using sandboxes to *replace* gemm, not augment the library with new functionality. When replacing gemm, blis.h does not need to contain any local sandbox definitions in order for the user to be able to (indirectly) use that sandbox.
But if you are adding functions to the library, those functions need to be prototyped so the compiler can perform type checking against the user's invocation of those new functions. Thanks to Jeff Diamond for helping us discover this deficiency in the build system. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e8793964f4..9a69fca8b2 100644 --- a/Makefile +++ b/Makefile @@ -461,7 +461,7 @@ endif flat-header: check-env $(BLIS_H_FLAT) -$(BLIS_H_FLAT): $(FRAME_H99_FILES) +$(BLIS_H_FLAT): $(ALL_H99_FILES) ifeq ($(ENABLE_VERBOSE),yes) $(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" else From 56ffca6a9bc67432a7894298739895f406e5f467 Mon Sep 17 00:00:00 2001 From: nicholai Date: Tue, 15 Jun 2021 18:17:39 -0500 Subject: [PATCH 028/226] Fix asm warning --- kernels/power9/3/bli_gemm_power9_asm_d12x6.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c index 187182a095..ec09f8e380 100644 --- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c +++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c @@ -95,8 +95,8 @@ void bli_dgemm_power9_asm_12x6 " \n\t" DPREFETCH " \n\t" - "cmpwi %%r0, %%r11, 0 \n\t" // if k_iter == 0, - "beq %%r0, DCONSIDERKLEFT \n\t" // then jmp to k_left + "cmpwi %%r11, 0 \n\t" // if k_iter == 0, + "beq DCONSIDERKLEFT \n\t" // then jmp to k_left "mtctr %%r11 \n\t" // else, do k_iter loop " \n\t" "DLOOPKITER: \n\t" // k_iter loop @@ -107,8 +107,8 @@ void bli_dgemm_power9_asm_12x6 " \n\t" "DCONSIDERKLEFT: \n\t" " \n\t" - "cmpwi %%r0, %%r12, 0 \n\t" // if k_left == 0, - "beq %%r0, DPOSTACCUM \n\t" // then jmp to post accum + "cmpwi %%r12, 0 \n\t" // if k_left == 0, + "beq DPOSTACCUM \n\t" // then jmp to post accum "mtctr %%r12 \n\t" // else, do k_left loop " \n\t" "DLOOPKLEFT: \n\t" // k_left loop @@ -121,10 +121,10 @@ void bli_dgemm_power9_asm_12x6 " 
\n\t" DSCALE_ALPHA " \n\t" - "cmpdi %%r0, %%r26, 0 \n\t" // if beta == 0, - "beq %%r0, DBETAZERO \n\t" // then jmp to BZ + "cmpdi %%r26, 0 \n\t" // if beta == 0, + "beq DBETAZERO \n\t" // then jmp to BZ " \n\t" - "cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8 + "cmpwi %%r9, 8 \n\t" // if rs_c == 8 "beq DCOLSTOREDBNZ \n\t" // then jmp to col store " \n\t" "DGENSTOREDBNZ: \n\t" // BNZ gen stored case @@ -143,7 +143,7 @@ void bli_dgemm_power9_asm_12x6 " \n\t" "DBETAZERO: \n\t" // BZ case " \n\t" - "cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8, + "cmpwi %%r9, 8 \n\t" // if rs_c == 8, "beq DCOLSTORED \n\t" // C is col stored " \n\t" "DGENSTORED: \n\t" // BZ gen stored case From aaa10c87e19449674a4ca30fa3b6392bb22c3a66 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 21 Jun 2021 17:53:52 -0500 Subject: [PATCH 029/226] Skip clearing temp microtile in gemmlike sandbox. Details: - Removed code from gemmlike sandbox files bls_gemm_bp_var1.c and bls_gemm_bp_var2.c that initializes the elements of the temporary microtile to zero. This code, introduced recently in 7f7d726, did not actually fix any bug (despite that commit's log entry). The microtile does not need to be initialized because it is completely overwritten by a "beta = 0" invocation of gemm prior to it being read. Any NaNs or Infs present at the outset would have no impact on the output matrix C. Thanks to Devin Matthews for reminding me of this. --- sandbox/gemmlike/bls_gemm_bp_var1.c | 3 --- sandbox/gemmlike/bls_gemm_bp_var2.c | 6 ------ 2 files changed, 9 deletions(-) diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 330a94801b..ae695ce34f 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -230,9 +230,6 @@ void PASTECH2(bls_,ch,varname) \ thrinfo_t* restrict thread_pa = NULL; \ thrinfo_t* restrict thread_jr = NULL; \ thrinfo_t* restrict thread_ir = NULL; \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_jc = thread; \ diff --git a/sandbox/gemmlike/bls_gemm_bp_var2.c b/sandbox/gemmlike/bls_gemm_bp_var2.c index 22df767aea..957cd57944 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var2.c +++ b/sandbox/gemmlike/bls_gemm_bp_var2.c @@ -538,12 +538,6 @@ void PASTECH2(bls_,ch,varname) \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype zero = *PASTEMAC(ch,0); \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. - NOTE: This initialization should really be done statically since - var2 executes this microkernel wrapper many times, and the overhead - of touching the temporary microtile adds up. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \ \ /* Handle interior and edge cases separately. */ \ if ( mr_cur == MR && nr_cur == NR ) \ From 907226c0af4afb6323b4e02be4f73f5fb89cddaf Mon Sep 17 00:00:00 2001 From: nicholaiTukanov Date: Fri, 2 Jul 2021 19:47:18 -0500 Subject: [PATCH 030/226] Rework POWER10 sandbox - Add a testsuite for gathering performance (in GFLOPs) and measuring correctness for the POWER10 GEMM reduced precision/integer kernels. - Reworked GENERIC_GEMM template to hardcode the cache parameters. - Remove kernel wrapper that checked that only allowed matrices that weren't transposed or conjugated. However, the kernels still assume the matrices are not transposed. This wrapper was removed for performance reasons. - Renamed and restructured files and functions for clarity. - Editted the POWER10 document to reflect new changes. 
--- build/gen-make-frags/ignore_list | 1 + sandbox/power10/POWER10.md | 30 +- sandbox/power10/bli_sandbox.h | 37 +- sandbox/power10/gemm.c | 128 +++ sandbox/power10/gemm_api.c | 77 -- sandbox/power10/gemm_pack.c | 889 ------------------ sandbox/power10/gemm_pack.h | 64 -- .../power10/{gemm_api.h => gemm_prototypes.h} | 27 +- .../{generic_gemm.c => gemm_template.h} | 114 ++- sandbox/power10/p10_testsuite/Makefile | 31 + sandbox/power10/p10_testsuite/cast_funcs.c | 180 ++++ sandbox/power10/p10_testsuite/cast_funcs.h | 62 ++ sandbox/power10/p10_testsuite/common.h | 16 + sandbox/power10/p10_testsuite/correctness.c | 337 +++++++ sandbox/power10/p10_testsuite/correctness.h | 176 ++++ sandbox/power10/p10_testsuite/performance.c | 103 ++ sandbox/power10/p10_testsuite/performance.h | 58 ++ sandbox/power10/pack_a_templates.h | 426 +++++++++ sandbox/power10/pack_b_templates.h | 403 ++++++++ .../{generic_gemm.h => pack_kernels.c} | 70 +- 20 files changed, 2091 insertions(+), 1138 deletions(-) create mode 100644 sandbox/power10/gemm.c delete mode 100644 sandbox/power10/gemm_api.c delete mode 100644 sandbox/power10/gemm_pack.c delete mode 100644 sandbox/power10/gemm_pack.h rename sandbox/power10/{gemm_api.h => gemm_prototypes.h} (78%) rename sandbox/power10/{generic_gemm.c => gemm_template.h} (60%) create mode 100644 sandbox/power10/p10_testsuite/Makefile create mode 100644 sandbox/power10/p10_testsuite/cast_funcs.c create mode 100644 sandbox/power10/p10_testsuite/cast_funcs.h create mode 100644 sandbox/power10/p10_testsuite/common.h create mode 100644 sandbox/power10/p10_testsuite/correctness.c create mode 100644 sandbox/power10/p10_testsuite/correctness.h create mode 100644 sandbox/power10/p10_testsuite/performance.c create mode 100644 sandbox/power10/p10_testsuite/performance.h create mode 100644 sandbox/power10/pack_a_templates.h create mode 100644 sandbox/power10/pack_b_templates.h rename sandbox/power10/{generic_gemm.h => pack_kernels.c} (55%) diff --git 
a/build/gen-make-frags/ignore_list index ccdd18f644..3561710b4f 100644 --- a/build/gen-make-frags/ignore_list +++ b/build/gen-make-frags/ignore_list @@ -5,3 +5,4 @@ other temp tmp test +p10_testsuite \ No newline at end of file diff --git a/sandbox/power10/POWER10.md b/sandbox/power10/POWER10.md index a9b19c5a85..501f93f9f2 100644 --- a/sandbox/power10/POWER10.md +++ b/sandbox/power10/POWER10.md @@ -1,24 +1,20 @@ ### Low Precision POWER10 Kernels -This is a special BLIS Sandbox that allows users to call low precision POWER10 `gemm` kernels. +This is a special BLIS Sandbox that allows users to call reduced precision or reduced integer POWER10 `GEMM` kernels. + +Supported kernels: `IEEE float16 (bli_shgemm), bfloat16 (bli_sbgemm), int16 (bli_i16gemm), int8 (bli_i8gemm), int4 (bli_i4gemm)`. #### Introduction -This document describes how the low precision POWER10 `gemm` kernels are implemented. The document will also demonstrate how to call the `gemm` kernels. +This document describes how the low precision POWER10 `gemm` kernels are implemented and explains how to call the POWER10 `GEMM` kernels. -**Important: This sandbox does not have the full functionality of BLIS. This sandbox can only perform single threaded, no transpose, GEMM. At this time, full functioning POWER10 hardware has not be released. Once hardware has been released, the kernels will be further optimized in areas such as prefetching and cache blocksizes.** +**Important: These kernels do not have the full functionality of BLIS. This sandbox can only perform single threaded, no transpose, GEMM.** #### Implementation -The kernels are implemented in `generic_gemm.c`. They are instantiated with macro templates. The main template is called `GENERIC_GEMM`. This template is used to create the 5-loop `gemm` function. - -The API points are created in `gemm_api.c`.
In this file, the API points are wrappers for the functions that are created by the templates in `generic_gemm.c`. - -#### Kernels - -The following low precision datatypes have POWER10 `gemm` kernels: `IEEE float16, bfloat16, int16, int8, int4`. +The kernels are implemented in `gemm.c`. They are instantiated with macro templates. The main template is called `GENERIC_GEMM`. This template is used to create the 5-loop `gemm` function. -#### Low Precision Types +#### Reduced precision/integer Types | BLIS type | BLIS char | Type definition | Used to represent... | |:-----------|:----------|:---------------------------------------|:-------------------------------------| @@ -28,9 +24,9 @@ The following low precision datatypes have POWER10 `gemm` kernels: `IEEE float16 | `int8` | `i8` | `int8_t` | 8 bit integers | | `int4` | `i4` | `typedef union{ uint8_t v; struct { uint8_t nib1:4; uint8_t nib2:4; } bits; }` | 4 bit integers | -#### Low Precision API +#### Reduced Precision/Integer API -The API that is used for the low precision POWER10 `gemm` kernels is similar to the existing [BLIS basic typed API](https://github.com/flame/blis/blob/master/docs/BLISTypedAPI.md). The main difference between the two is that in the existing BLIS typed API, there is only one type for the input and output matrices. However in the low precision API, there is a input and output type. +The API that is used for the reduced precision/integer POWER10 `GEMM` kernels is similar to the existing [BLIS basic typed API](https://github.com/flame/blis/blob/master/docs/BLISTypedAPI.md). The main difference is the POWER10 kernels expect two types: `ctype_in` and `ctype_out`. Thus the new `gemm` call looks like the following: @@ -50,10 +46,7 @@ void bli_??gemm ); ``` -The first `?` is for the output type. The second `?` is for the input type. - -At this time for IEEE float16 and bfloat16, the only output type is single precision float. For int16, int8, and int4, the only output type is 32 bit int. 
- +`??` is meant to be replaced with the kernel prefix. #### How To Build The Sandbox @@ -64,6 +57,9 @@ Add the following flags when running the configure script to build BLIS correctl Ensure that you have GCC 10.2 or greater. +#### P10 Testsuite + +In `p10_testsuite`, there are performance gathering and correctness checking programs for the POWER10 `GEMM` kernels. By default, the performance gathering and correctness checking is done over square matrices ranging from 80 to 4000 in increments of 80. Performance is measured in GFLOPS, and correctness is measured using the BLIS method. #### References diff --git a/sandbox/power10/bli_sandbox.h b/sandbox/power10/bli_sandbox.h index 77c5fe2cb5..22d293d130 100644 --- a/sandbox/power10/bli_sandbox.h +++ b/sandbox/power10/bli_sandbox.h @@ -36,14 +36,12 @@ #define BLIS_SANDBOX_H #include "blis.h" -#include "gemm_api.h" +#include "gemm_prototypes.h" // NOTE: This header is the only header required to be present in the sandbox // implementation directory.
-// This header is used to create the typedefs needed for low precision - -// int4 type +// int4 typedef union { uint8_t v; @@ -54,7 +52,7 @@ typedef union } bits; } nibbles; -// bfloat16 +// brain float16 typedef union { uint16_t v; @@ -80,36 +78,25 @@ typedef union #define P10_PG_SIZE 4096 +// microkernel prototypes GEMM_UKR_PROT2( bfloat16, float, sb, gemm_power10_mma_8x16 ) GEMM_UKR_PROT2( float16, float, sh, gemm_power10_mma_8x16 ) GEMM_UKR_PROT2( int16_t, int32_t, i16, gemm_power10_mma_8x16 ) GEMM_UKR_PROT2( int8_t, int32_t, i8, gemm_power10_mma_8x16 ) GEMM_UKR_PROT2( nibbles, int32_t, i4, gemm_power10_mma_8x16 ) -/* Creates a function that initializes a matrix of type ctype with random vals */ -#define RandomMatrixMacro(ch, ctype, rand_func) \ - RM_PROT(ch, ctype) \ - { \ - for ( int i=0; i0)), // innermost loop iterations + i8_pack_a, // pack kernel for A + i8_pack_b, // pack kernel for B + bli_i8gemm_power10_mma_8x16, // microkernel function name + 4, // K_MMA + 8, // MR + 16, // NR + 384, // MC + 6656, // KC + 4096, // NC + 0, // A_ALIGN + 0 // B_ALIGN +); + +GENERIC_GEMM( + i4, // kernel name prefix + nibbles, // input type + int, // output type + (pb/8 + (pb%8>0)), // innermost loop iterations + i4_pack_a, // pack kernel for A + i4_pack_b, // pack kernel for B + bli_i4gemm_power10_mma_8x16, // microkernel function name + 8, // K_MMA + 8, // MR + 16, // NR + 384, // MC + 6656, // KC + 4096, // NC + 0, // A_ALIGN + 0 // B_ALIGN +); + diff --git a/sandbox/power10/gemm_api.c b/sandbox/power10/gemm_api.c deleted file mode 100644 index c0e33932f1..0000000000 --- a/sandbox/power10/gemm_api.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -// This file contains the API points for the low precision POWER10 GEMM kernels - -#include "generic_gemm.h" -#include "gemm_api.h" - -#define GEMM_FUNC(ch, DTYPE_IN, DTYPE_OUT, A_ALIGNMENT, B_ALIGNMENT, MR, NR, MC, KC, NC) \ -\ -void GEMM_FUNC_NAME(ch) \ - ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - DTYPE_OUT* alpha, \ - DTYPE_IN* a, inc_t rsa, inc_t csa, \ - DTYPE_IN* b, inc_t rsb, inc_t csb, \ - DTYPE_OUT* beta, \ - DTYPE_OUT* c, inc_t rsc, inc_t csc \ - ) \ -{ \ -\ - if (transa != BLIS_NO_TRANSPOSE || transb != BLIS_NO_TRANSPOSE) { \ - printf("Transpose functionality not implemented yet.\n"); \ - } \ -\ - GEMM_PASTEMAC(ch) \ - ( \ - MR, NR, MC, KC, NC, \ - m, n, k, \ - a, rsa, csa, A_ALIGNMENT, \ - b, rsb, csb, B_ALIGNMENT, \ - c, rsc, csc, \ - alpha, beta \ - ); \ -} \ - -// ch dt_in dt_out MR NR MC KC NC -GEMM_FUNC( sb, bfloat16, float, 0, 0, 8, 16, 1664, 1026, 4096); -GEMM_FUNC( sh, float16, float, 0, 0, 8, 16, 1664, 1026, 4096); -GEMM_FUNC( i16, int16_t, int32_t, 0, 0, 8, 16, 1664, 1026, 4096); -GEMM_FUNC( i8, int8_t, int32_t, 0, 0, 8, 16, 1664, 1026, 4096); -GEMM_FUNC( i4, nibbles, int32_t, 0, 0, 8, 16, 1664, 1026, 4096); diff --git a/sandbox/power10/gemm_pack.c b/sandbox/power10/gemm_pack.c deleted file mode 100644 index 3834b6d7ce..0000000000 --- a/sandbox/power10/gemm_pack.c +++ /dev/null @@ -1,889 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -// Templates for different packing routine - -#include "gemm_pack.h" - -/* - - Details on bit16_dt vector data structure - - Vector X = [ X[0,0] X[0,1] X[1,0] X[1,1] X[2,0] X[2,1] X[3,0] X[3,1] ] - Vector Y = [ Y[0,0] Y[0,1] Y[1,0] Y[1,1] Y[2,0] Y[2,1] Y[3,0] Y[3,1] ] - - These bit16_dt vectors represent a 4x2 matrix. Hence, in matrix form it - looks like the following: - - X = [ X[0,0] X[0,1] - X[1,0] X[1,1] - X[2,0] X[2,1] - X[3,0] X[3,1] ] - - The outer product instruction: xvbf16ger2 (bfloat16 outer product) - - Syntax: - - xvbf16ger2 ACCUMULATOR A, VECTOR X, VECTOR Y - - Semantics: - - A = X * Y^T - - The generic packing routine would load 8 elements from the same column. 
- This causes an issue since the instruction expects the vector to be a - 4x2 matrix where the data is packed in contiguous order. Thus, we must make - a packing routine that will interleave the matrix data. Making it so - that when we load the 8 contiguous elements from A, it will represent - a 4x2 section of the matrix. - -*/ - -#define k_even_apack_16(ir) \ - *adest++ = ap[ (i+ir)*rs_a + p_idx*cs_a ]; \ - *adest++ = ap[ (i+ir)*rs_a + (p_idx+1)*cs_a ]; - -#define k_odd_apack_16(ir) \ - *adest++ = ap[ (i+ir)*rs_a + (k-1)*cs_a ]; \ - memset(adest, 0, 2); \ - adest++; - -#define pad_macro_16(dest_matrix) \ - memset(dest_matrix, 0, 4); \ - dest_matrix+=2; - -#define BIT16_PACK_A(ch, DTYPE_IN) \ -\ -void PACK_FUNC_NAME(ch, A) \ - ( \ - dim_t MR, \ - int m, int k, \ - DTYPE_IN* ap, int rs_a, int cs_a, \ - DTYPE_IN* apack \ - ) \ -{ \ - int k_odd = k%2; \ - int p_idx; \ -\ - DTYPE_IN* adest = apack; \ - for (int i=0; i0)), 4, bli_i8gemm_power10_mma_8x16); -GENERIC_GEMM( i4, nibbles, int, (pb/8 + (pb%8>0)), 8, bli_i4gemm_power10_mma_8x16); diff --git a/sandbox/power10/p10_testsuite/Makefile b/sandbox/power10/p10_testsuite/Makefile new file mode 100644 index 0000000000..a817496db2 --- /dev/null +++ b/sandbox/power10/p10_testsuite/Makefile @@ -0,0 +1,31 @@ +BLIS_PATH := ../../.. 
+ +BLIS_INC := $(BLIS_PATH)/include/power10 +BLIS_LIB := $(BLIS_PATH)/lib/power10/libblis.a + +CC := gcc +LINKER := $(CC) + +CFLAGS := -I $(BLIS_INC) +LDFLAGS := -lpthread -lm + +OBJS := $(patsubst %.c,%.o, $(wildcard *.c)) +PERF_OBJS := performance.o +COR_OBJS := correctness.o cast_funcs.o + +all: performance correctness + +$(OBJS): %.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +performance: $(PERF_OBJS) + $(LINKER) $(PERF_OBJS) $(BLIS_LIB) -o ./gather_perf.x $(LDFLAGS) + +correctness: $(COR_OBJS) + $(LINKER) $(COR_OBJS) $(BLIS_LIB) -o ./test_correctness.x $(LDFLAGS) + +csv_clean: + rm -rf *.csv + +clean: + rm -rf *.x *.o \ No newline at end of file diff --git a/sandbox/power10/p10_testsuite/cast_funcs.c b/sandbox/power10/p10_testsuite/cast_funcs.c new file mode 100644 index 0000000000..8108602c53 --- /dev/null +++ b/sandbox/power10/p10_testsuite/cast_funcs.c @@ -0,0 +1,180 @@ +#include "cast_funcs.h" +#include "../bli_sandbox.h" + +// bit map used for casting float to bfloat16 +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_s; + + +// cast float16 into float +float cast_f16_to_f32(float16 val) +{ + uint16_t in = val.v; + float out; + uint32_t t1; + uint32_t t2; + uint32_t t3; + + t1 = in & 0x7fff; // Non-sign bits + t2 = in & 0x8000; // Sign bit + t3 = in & 0x7c00; // Exponent + + t1 <<= 13; // Align mantissa on MSB + t2 <<= 16; // Shift sign bit into position + + t1 += 0x38000000; // Adjust bias + + t1 = (t3 == 0 ? 
0 : t1); // Denormals-as-zero + + t1 |= t2; // Re-insert sign bit + + *((uint32_t*)&out) = t1; + return out; +} + +// cast float to float16 +float16 cast_f32_to_f16(const float in) +{ + float16 f16_out; + + uint32_t inu = *((uint32_t*)&in); + uint32_t t1; + uint32_t t2; + uint32_t t3; + + t1 = inu & 0x7fffffff; // Non-sign bits + t2 = inu & 0x80000000; // Sign bit + t3 = inu & 0x7f800000; // Exponent + + t1 >>= 13; // Align mantissa on MSB + t2 >>= 16; // Shift sign bit into position + + t1 -= 0x1c000; // Adjust bias + + t1 = (t3 < 0x38800000) ? 0 : t1; + t1 = (t3 > 0x47000000) ? 0x7bff : t1; + t1 = (t3 == 0 ? 0 : t1); // Denormals-as-zero + + t1 |= t2; // Re-insert sign bit + + f16_out.v = t1; + return f16_out; +} + + +// cast float to bfloat16 +bfloat16 cast_f32_to_bf16 (float val) +{ + bfloat16 bf16; + float32_s f32; + f32.v = val; + bf16.bits.s = f32.bits.s; + bf16.bits.e = f32.bits.e; + bf16.bits.m = f32.bits.m >> 16; + return bf16; +} + +// cast bfloat16 to float +float cast_bf16_to_f32(bfloat16 val) +{ + float32_s f32; + f32.bits.s = val.bits.s; + f32.bits.e = val.bits.e; + f32.bits.m = val.bits.m << 16; + return f32.v; +} + +// cast a nibbles struct to a float array +void cast_i4_to_f32(float *fvals, nibbles vals) +{ + int8_t val0 = vals.bits.nib1; + int8_t val1 = vals.bits.nib2; + + val0 = (val0 >= 8 ? val0 - 16 : val0); + val1 = (val1 >= 8 ? 
val1 - 16 : val1); + + fvals[0] = (float) val0; + fvals[1] = (float) val1; +} + +// condense two float vals to a nibbles struct +nibbles cast_f32_to_i4(float val0, float val1) +{ + nibbles vals; + + int8_t val0_ = ((int8_t)val0) & 0xf0; + int8_t val1_ = ((int8_t)val1) & 0xf0; + + vals.bits.nib1 = val0_; + vals.bits.nib2 = val1_; + + return vals; +} + +// cast float matrix to float nibbles +void cast_f32_to_i4m(float *a_float, nibbles *a, int num_elems) +{ + int j=0; + for(int i=0; i +// print kernel name +const char* get_kernel_name(int kernel_id) +{ + switch (kernel_id) + { + case FLOAT16 : return "bli_shgemm"; + case BFLOAT16: return "bli_sbgemm"; + case INT16 : return "bli_i16gemm"; + case INT8 : return "bli_i8gemm"; + case INT4 : return "bli_i4gemm"; + default: printf("INCORRECT KERNEL ID\n"); exit(-1); + } +} + +// normalize the vector using the forbenious norm +void normalize_vec(float *t, int n) +{ + // normalize t + float norm_factor; + bli_snormfv(n, t, 1, &norm_factor); + // round up to closest power of 2 + norm_factor = 1 / (pow( 2.0, ceil( log2( norm_factor ) ) )); + bli_sscalv(BLIS_NO_CONJUGATE, n, &norm_factor, t, 1); +} + + // Pre-conditions: + // - a is randomized. + // - b is randomized. + // - c_orig is randomized. + // Note: + // - alpha and beta should have non-zero imaginary components in the + // complex cases in order to more fully exercise the implementation. 
+ // + // Under these conditions, we assume that the implementation for + // + // C := beta * C_orig + alpha * transa(A) * transb(B) + // + // is functioning correctly if + // + // normfv( v - z ) + // + // is negligible, where + // + // v = C * t + // z = ( beta * C_orig + alpha * transa(A) * transb(B) ) * t + // = beta * C_orig * t + alpha * transa(A) * transb(B) * t + // = beta * C_orig * t + alpha * transa(A) * w + // = beta * C_orig * t + z +float get_resid( + int m, int n, int k, + float *a, int rsa, int csa, + float *b, int rsb, int csb, + float *c, int rsc, int csc, + float *c_orig, + float *alpha, float *beta +) +{ + + float t[n], v[m], w[k], z[m]; + float one = 1.0, zero = 0.0; + + bli_srandv(n, t, 1); + + // normalize so that the values are at the same precision of the input values + normalize_vec(t, n); + + // v = C * t + bli_sgemv( + BLIS_NO_TRANSPOSE, + BLIS_NO_CONJUGATE, + m, + n, + &one, + c, rsc, csc, + t, 1, + &zero, + v, 1 + ); + + // w = B * t + bli_sgemv( + BLIS_NO_TRANSPOSE, + BLIS_NO_CONJUGATE, + k, + n, + &one, + b, rsb, csb, + t, 1, + &zero, + w, 1 + ); + + // z = alpha * A * w + bli_sgemv( + BLIS_NO_TRANSPOSE, + BLIS_NO_CONJUGATE, + m, + k, + alpha, + a, rsa, csa, + w, 1, + &zero, + z, 1 + ); + + // z += beta * C_orig * t + bli_sgemv( + BLIS_NO_TRANSPOSE, + BLIS_NO_CONJUGATE, + m, + n, + beta, + c_orig, rsc, csc, + t, 1, + &one, + z, 1 + ); + + // v = v - z + bli_ssubv ( + BLIS_NO_CONJUGATE, + m, + z, 1, + v, 1 + ); + + // norm = normfv(v) + float norm; + bli_snormfv ( + m, + v, 1, + &norm + ); + + return norm; +} + + +// test to see if the result from a BLIS GEMM kernel is correct for a given m x n x k mat-mul +// assumes the matrices are of type float +// assumes the matrices were randomized and normalized +void correctness_checker( + int m, int n, int k, + float *a, int rsa, int csa, + float *b, int rsb, int csb, + float *c_orig, int rsc, int csc, + float *c_ans, + float alpha, float beta +) +{ + double start, end; + + start = 
bli_clock(); + float resid = get_resid ( + m, n, k, + a, rsa, csa, + b, rsb, csb, + c_ans, rsc, csc, + c_orig, + &alpha, &beta + ); + end = bli_clock(); + + printf("%d, %d, %d, %8.4le\n", m,n,k, resid); +} + + +// create all the correctness checking functions for each kernel +GEN_FP_COR_KERNEL(sb, bli_sbgemm, bfloat16, cast_f32_to_bf16m, cast_bf16_to_f32m); +GEN_FP_COR_KERNEL(sh, bli_shgemm, float16, cast_f32_to_f16m, cast_f16_to_f32m); +GEN_I_COR_KERNEL(i16, bli_i16gemm, int16_t, cast_f32_to_i16m, cast_i16_to_f32m); +GEN_I_COR_KERNEL(i8, bli_i8gemm, int8_t, cast_f32_to_i8m, cast_i8_to_f32m); + +// correctness template for int types +void i4correctness_kernel (int m, int n, int k) +{ + if(n%2 != 0) + { + printf("int4 can't handle odd sizes in the data-order dimension"); + exit(-1); + } + + int rsa = k, csa = 1, + rsb = n, csb = 1, + rsc = n, csc = 1; + + nibbles *a, *b; + + int32_t *c_ans, *c_orig, alpha, beta; + + float *a_float, *b_float, + *c_ans_float, *c_orig_float; + + /* buffers that will be passed into the kernel */ + // int4 buffers only need half the space to store all the elements + a = (nibbles *) malloc (m * (k/2) * sizeof(nibbles)); + b = (nibbles *) malloc (k * (n/2) * sizeof(nibbles)); + + c_ans = (int32_t *) malloc (m * n * sizeof(int32_t)); + c_orig = (int32_t *) malloc (m * n * sizeof(int32_t)); + + /* std format buffers that will be used by the correctness checker */ + a_float = (float *) malloc (m * k * sizeof(float)); + b_float = (float *) malloc (k * n * sizeof(float)); + c_ans_float = (float *) malloc (m * n * sizeof(float)); + c_orig_float = (float *) malloc (m * n * sizeof(float)); + + /* randomize matrices with float vals */ + bli_srandv(m*k, a_float, 1); + bli_srandv(k*n, b_float, 1); + bli_srandv(m*n, c_orig_float, 1); + + /* normalize the matrices */ + normalize_vec(a_float, m*k); + normalize_vec(b_float, k*n); + normalize_vec(c_orig_float, m*n); + + /* cast the float buffers into the buffers for the kernel */ + cast_f32_to_i4m 
(a_float, a, m*k); + cast_f32_to_i4m (b_float, b, k*n); + + /* cast float buffers to support int values */ + cast_f32_to_i32m(c_orig_float, c_orig, m*n); + cast_i32_to_f32m(c_orig, c_orig_float, m*n); + + /* cast the kernel buffers into the float buffers to ensure that the values match */ + cast_i4_to_f32m (a, a_float, m*k); + cast_i4_to_f32m (b, b_float, k*n); + + /* init alpha and beta */ + alpha = 1; + beta = 1; + + /* run kernel to get result in c_ans */ + // strides need to be adjusted since 1 element stores 2 values + memcpy(c_ans, c_orig, m * n * sizeof(int)); + bli_i4gemm( + BLIS_NO_TRANSPOSE, + BLIS_NO_TRANSPOSE, + m, + n, + k, + &alpha, + a, rsa/2, csa, + b, rsb/2, csb, + &beta, + c_ans, rsc, csc + ); + + /* cast integer result into float buffer since float is our std format for correctness checking */ + cast_i32_to_f32m(c_ans, c_ans_float, m*n); + + /* using the BLIS GEMM correctness check method, get the resid */ + correctness_checker( + m, n, k, + a_float, rsa, csa, + b_float, rsb, csb, + c_orig_float, rsc, csc, + c_ans_float, + (float) alpha, (float) beta + ); + + free(a); + free(b); + free(c_ans); + free(c_orig); + free(a_float); + free(b_float); + free(c_ans_float); + free(c_orig_float); +} + +// using the DATATYPE enum, gather test the correctness of the respective GEMM kernel +void run_correctness_kernel(int kernel_id, int m, int n, int k) +{ + switch (kernel_id) + { + case FLOAT16 : shcorrectness_kernel(m, n, k); break; + case BFLOAT16: sbcorrectness_kernel(m, n, k); break; + case INT16 : i16correctness_kernel(m, n, k); break; + case INT8 : i8correctness_kernel(m, n, k); break; + case INT4 : i4correctness_kernel(m, n, k); break; + default: break; + } +} + +void test_correctness(int kernel_id, int start, int end, int inc) +{ + printf("%s correctness test\n", get_kernel_name(kernel_id)); + printf("m, n, k, resid\n"); + int m,n,k; + for (int p=start; p<=end; p+=inc) + { + m=n=k=p; + run_correctness_kernel(kernel_id, m, n, k); + } +} + +// 
correctness test for bfloat16 gemm +int main(int argc, char *argv[]) +{ + + test_correctness(FLOAT16, 80, 4000, 80); + test_correctness(BFLOAT16, 80, 4000, 80); + test_correctness(INT16, 80, 4000, 80); + test_correctness(INT8, 80, 4000, 80); + test_correctness(INT4, 80, 4000, 80); +} diff --git a/sandbox/power10/p10_testsuite/correctness.h b/sandbox/power10/p10_testsuite/correctness.h new file mode 100644 index 0000000000..aea647848a --- /dev/null +++ b/sandbox/power10/p10_testsuite/correctness.h @@ -0,0 +1,176 @@ +// templates for generating correctness checking functions that check the correctness of GEMM kernels +// using the BLIS GEMM correctness method + +#define COR_KERNEL_NAME_(ch) ch ## correctness_kernel +#define COR_KERNEL_NAME(ch) COR_KERNEL_NAME_(ch) + + +// correctness template for float types +#define GEN_FP_COR_KERNEL(ch, kernel, input_t, DOWN_CAST, UP_CAST) \ +void COR_KERNEL_NAME(ch) (int m, int n, int k) \ +{ \ + int rsa = k, csa = 1, \ + rsb = n, csb = 1, \ + rsc = n, csc = 1; \ +\ + input_t *a, *b; \ +\ + float *a_float, *b_float, \ + *c_ans_float, *c_orig_float, \ + alpha, beta; \ +\ + /* buffers that will be passed into the kernel */ \ + a = (input_t *) malloc (m * k * sizeof(input_t)); \ + b = (input_t *) malloc (k * n * sizeof(input_t)); \ +\ + /* std format buffers that will be used by the correctness checker */ \ + a_float = (float *) malloc (m * k * sizeof(float)); \ + b_float = (float *) malloc (k * n * sizeof(float)); \ + c_ans_float = (float *) malloc (m * n * sizeof(float)); \ + c_orig_float = (float *) malloc (m * n * sizeof(float)); \ +\ + /* randomize matrices with float vals */ \ + bli_srandv(m*k, a_float, 1); \ + bli_srandv(k*n, b_float, 1); \ + bli_srandv(m*n, c_orig_float, 1); \ +\ + /* normalize the matrices */ \ + normalize_vec(a_float, m*k); \ + normalize_vec(b_float, k*n); \ + normalize_vec(c_orig_float, m*n); \ +\ + /* cast the float buffers into the buffers for the kernel */ \ + DOWN_CAST (a_float, a, m*k); \ + DOWN_CAST 
(b_float, b, k*n); \ +\ + /* cast the kernel buffers into the float buffers to ensure that the values match */ \ + UP_CAST (a, a_float, m*k); \ + UP_CAST (b, b_float, k*n); \ +\ + /* init alpha and beta */ \ + alpha = 1; \ + beta = 1; \ +\ + memcpy(c_ans_float, c_orig_float, m * n * sizeof(float)); \ + kernel( \ + BLIS_NO_TRANSPOSE, \ + BLIS_NO_TRANSPOSE, \ + m, \ + n, \ + k, \ + &alpha, \ + a, rsa, csa, \ + b, rsb, csb, \ + &beta, \ + c_ans_float, rsc, csc \ + ); \ +\ + correctness_checker( \ + m, n, k, \ + a_float, rsa, csa, \ + b_float, rsb, csb, \ + c_orig_float, rsc, csc, \ + c_ans_float, \ + alpha, beta \ + ); \ +\ + free(a); \ + free(b); \ + free(a_float); \ + free(b_float); \ + free(c_ans_float); \ + free(c_orig_float); \ +\ +} + +// correctness template for int types +#define GEN_I_COR_KERNEL(ch, kernel, input_t, DOWN_CAST, UP_CAST) \ +void COR_KERNEL_NAME(ch) (int m, int n, int k) \ +{ \ + int rsa = k, csa = 1, \ + rsb = n, csb = 1, \ + rsc = n, csc = 1; \ +\ + input_t *a, *b; \ +\ + int32_t *c_ans, *c_orig, alpha, beta; \ +\ + float *a_float, *b_float, \ + *c_ans_float, *c_orig_float; \ +\ + /* buffers that will be passed into the kernel */ \ + a = (input_t *) malloc (m * k * sizeof(input_t)); \ + b = (input_t *) malloc (k * n * sizeof(input_t)); \ + c_ans = (int32_t *) malloc (m * n * sizeof(int32_t)); \ + c_orig = (int32_t *) malloc (m * n * sizeof(int32_t)); \ +\ + /* std format buffers that will be used by the correctness checker */ \ + a_float = (float *) malloc (m * k * sizeof(float)); \ + b_float = (float *) malloc (k * n * sizeof(float)); \ + c_ans_float = (float *) malloc (m * n * sizeof(float)); \ + c_orig_float = (float *) malloc (m * n * sizeof(float)); \ +\ + /* randomize matrices with float vals */ \ + bli_srandv(m*k, a_float, 1); \ + bli_srandv(k*n, b_float, 1); \ + bli_srandv(m*n, c_orig_float, 1); \ +\ + /* normalize the matrices */ \ + normalize_vec(a_float, m*k); \ + normalize_vec(b_float, k*n); \ + normalize_vec(c_orig_float, m*n); \ 
+\ + /* cast the float buffers into the buffers for the kernel */ \ + DOWN_CAST (a_float, a, m*k); \ + DOWN_CAST (b_float, b, k*n); \ +\ + /* cast float buffers to support int values */ \ + cast_f32_to_i32m(c_orig_float, c_orig, m*n); \ + cast_i32_to_f32m(c_orig, c_orig_float, m*n); \ +\ + /* cast the kernel buffers into the float buffers to ensure that the values match */ \ + UP_CAST (a, a_float, m*k); \ + UP_CAST (b, b_float, k*n); \ +\ + /* init alpha and beta */ \ + alpha = 1; \ + beta = 1; \ +\ + /* run kernel to get result in c_ans */ \ + memcpy(c_ans, c_orig, m * n * sizeof(int)); \ + kernel( \ + BLIS_NO_TRANSPOSE, \ + BLIS_NO_TRANSPOSE, \ + m, \ + n, \ + k, \ + &alpha, \ + a, rsa, csa, \ + b, rsb, csb, \ + &beta, \ + c_ans, rsc, csc \ + ); \ +\ + /* cast integer result into float buffer since float is our std format for correctness checking */ \ + cast_i32_to_f32m(c_ans, c_ans_float, m*n); \ +\ + /* using the BLIS GEMM correctness check method, get the resid */ \ + correctness_checker( \ + m, n, k, \ + a_float, rsa, csa, \ + b_float, rsb, csb, \ + c_orig_float, rsc, csc, \ + c_ans_float, \ + (float) alpha, (float) beta \ + ); \ +\ + free(a); \ + free(b); \ + free(c_ans); \ + free(c_orig); \ + free(a_float); \ + free(b_float); \ + free(c_ans_float); \ + free(c_orig_float); \ +\ +} diff --git a/sandbox/power10/p10_testsuite/performance.c b/sandbox/power10/p10_testsuite/performance.c new file mode 100644 index 0000000000..25f1c3ff2a --- /dev/null +++ b/sandbox/power10/p10_testsuite/performance.c @@ -0,0 +1,103 @@ +/* + + This program is designed to gather the performance data of the POWER10 + GEMM kernels in `blis/sandbox/power10`. + + By default, the performance of the kernels is gather over a set of square + matrices. The perfromance results are reported in GFLOPS, and outputted in + CSV format. 
+ +*/ + +#include "performance.h" +#include "blis.h" +#include "../bli_sandbox.h" +#include "common.h" + +#include +// print kernel name +const char* get_kernel_name(int kernel_id) +{ + switch (kernel_id) + { + case FLOAT16 : return "bli_shgemm"; + case BFLOAT16: return "bli_sbgemm"; + case INT16 : return "bli_i16gemm"; + case INT8 : return "bli_i8gemm"; + case INT4 : return "bli_i4gemm"; + default: printf("INCORRECT KERNEL ID\n"); exit(-1); + } +} + +// create all the performance gathering functions for each kernel +GET_PERF_API_TEMP(sb, bli_sbgemm, bfloat16, float); +GET_PERF_API_TEMP(sh, bli_shgemm, float16, float); +GET_PERF_API_TEMP(i16, bli_i16gemm, int16_t, int); +GET_PERF_API_TEMP(i8, bli_i8gemm, int8_t, int); +GET_PERF_API_TEMP(i4, bli_i4gemm, nibbles, int); + + +// using the DATATYPE enum, gather the performance of the respective GEMM kernel +double run_kernel(int kernel_id, int nreps, int m, int n, int k) +{ + switch (kernel_id) + { + case FLOAT16 : return test_shapi(nreps, m, n, k); + case BFLOAT16: return test_sbapi(nreps, m, n, k); + case INT16 : return test_i16api(nreps, m, n, k); + case INT8 : return test_i8api(nreps, m, n, k); + case INT4 : return test_i4api(nreps, m, n, k); + default: return -1.0; + } +} + +// print the performance data in CSV format +// performance is measured in terms of GFLOPs +void print_perf_data(int m, int n, int k, double best_time) +{ + double GFLOPS = (2.0 * m * n * k) / (1e9 * best_time); + printf("%d, %d, %d, %.2f\n", m, n, k, GFLOPS); +} + +// get performance data +void get_perf(int kernel_id, int nreps, int start, int end, int inc) +{ + // csv header + printf("%s performance\n", get_kernel_name(kernel_id)); + printf("m, n, k, GFLOPS\n"); + + int m,n,k; + + // run over all problem sizes + for (int p=start; p<=end; p+=inc) + { + // change here to adjust problem size + m = p, + n = p, + k = p; + + double best_run_time = run_kernel(kernel_id, nreps, m, n, k); + + print_perf_data(m, n, k, best_run_time); + } +} + +int 
main(int argc, char *argv[]) +{ + // initialize a square problem set range + int start = 80; + int end = 4000; + int inc = 80; + + // number of times the kernel will be run + int nreps = 5; + + // run a respective kernel + get_perf( FLOAT16, nreps, start, end, inc); + get_perf(BFLOAT16, nreps, start, end, inc); + get_perf( INT16, nreps, start, end, inc); + get_perf( INT8, nreps, start, end, inc); + get_perf( INT4, nreps, start, end, inc); + + return 0; +} diff --git a/sandbox/power10/p10_testsuite/performance.h b/sandbox/power10/p10_testsuite/performance.h new file mode 100644 index 0000000000..26c36f6155 --- /dev/null +++ b/sandbox/power10/p10_testsuite/performance.h @@ -0,0 +1,58 @@ + +// function name template +// each function that will gather perform will be named test_api +#define GEN_PERF_FUNC_NAME_(ch) test_ ## ch ## api +#define GEN_PERF_FUNC_NAME(ch) GEN_PERF_FUNC_NAME_(ch) + +/* + Macro template for getting the best GEMM kernel runtime out of `num_runs` + for matrices of size (m x n x k). +*/ +#define GET_PERF_API_TEMP(ch, kernel, input_t, output_t) \ +double GEN_PERF_FUNC_NAME(ch) ( \ + int num_runs, \ + int m, \ + int n, \ + int k \ +) \ +{ \ + input_t *A,*B; \ + output_t *C; \ + output_t alpha,beta; \ +\ + A = (input_t*) malloc(m*k*sizeof(input_t)); \ + B = (input_t*) malloc(n*k*sizeof(input_t)); \ + C = (output_t*) malloc(m*n*sizeof(output_t)); \ + \ + alpha = 1; \ + beta = 1; \ + \ + double best = 1e9; \ + \ + for (int irep=0; irep Date: Fri, 2 Jul 2021 19:54:33 -0500 Subject: [PATCH 031/226] Update POWER10.md --- sandbox/power10/POWER10.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sandbox/power10/POWER10.md b/sandbox/power10/POWER10.md index 501f93f9f2..cdfb09e7d2 100644 --- a/sandbox/power10/POWER10.md +++ b/sandbox/power10/POWER10.md @@ -1,6 +1,6 @@ ### Low Precision POWER10 Kernels -This is a special BLIS Sandbox that allows users to call reduced preicison or reduced integer POWER10 `GEMM` kernels. 
+This is a special BLIS Sandbox that allows users to call POWER10 reduced precision/integer `GEMM` kernels. Supported kernels: `IEEE float16 (bli_shgemm), bfloat16 (bli_sbgemm), int16 (bli_i16gemm), int8 (bli_i8gemm), int4 (bli_i4gemm)`. @@ -8,7 +8,7 @@ Supported kernels: `IEEE float16 (bli_shgemm), bfloat16 (bli_sbgemm), int16 (bli This document describes how the low precision POWER10 `gemm` kernels are implemented and explains how to call the POWER10 `GEMM` kernels. -**Important: These kernels does not have the full functionality of BLIS. This sandbox can only perform single threaded, no transpose, GEMM.** +**Important: These kernels does not have the full functionality of BLIS. The kernels can only perform single threaded, no transpose, GEMM.** #### Implementation @@ -59,7 +59,7 @@ Ensure that you have GCC 10.2 or greater. #### P10 Testsuite -In `p10_testsuite`, their are performance gathering and correctness checking programs for the POWER10 `GEMM` kernels. By default, the performance gathering and correctness checking is done over square matrices ranging from 80 to 4000 in increments of 80. Performance is measured in GFLOPS, and correctness is measured using the BLIS method. +In `p10_testsuite`, there are performance gathering and correctness checking programs for the POWER10 reduced precision/integer `GEMM` kernels. By default, the performance gathering and correctness checking is done over square matrices ranging from 80 to 4000 in increments of 80. Performance is measured in GFLOPs, and correctness is measured using the BLIS method (detailed in `blis/testsuite/test_gemm.c`). #### References From ad6231cca3fc1e477752ecd31b1ee2323398a642 Mon Sep 17 00:00:00 2001 From: sunchengguo Date: Tue, 6 Jul 2021 07:30:00 -0400 Subject: [PATCH 032/226] Fixed configure script bug. Details: - Fixed kernel list string substitution error by adding function substitute_words in configure script. 
if the string contains zen and zen2, and zen need to be replaced with another string, then zen2 also be incorrectly replaced. --- configure | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 92a6c229c8..a3f98ba9aa 100755 --- a/configure +++ b/configure @@ -775,7 +775,8 @@ read_registry_file() # canonicalize whitespace, and then remove duplicate kernel # set names, if they exist. Finally, update the kernel registry # with the new kernel list. - newklist=$(echo -e "${klisttmp}" | sed -e "s/${ker}/${kers_ker}/g") + #newklist=$(echo -e "${klisttmp}" | sed -e "s/${ker}/${kers_ker}/g") + newklist=$(substitute_words "${ker}" "${kers_ker}" "${klisttmp}") newklist=$(canonicalize_ws "${newklist}") newklist=$(rm_duplicate_words "${newklist}") @@ -797,6 +798,26 @@ read_registry_file() done } +substitute_words() +{ + local word new_words list newlist + + word="$1" + new_words="$2" + list="$3" + + for str in ${list}; do + + if [ "${str}" == "${word}" ]; then + newlist="${newlist} ${new_words}" + else + newlist="${newlist} ${str}" + fi + done + + echo "${newlist}" +} + build_kconfig_registry() { local familyname clist config kernels kernel cur_configs newvalue From a201a53440c51244739aaee20e3309b50121cc68 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 5 Jul 2021 21:39:18 -0500 Subject: [PATCH 033/226] Always run `make check`. I'm concerned that problems may lurk for `x86_64` builds on Windows which may be uncovered by a fuller `make check`. 
--- .appveyor.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 87aee9c974..2423eb2310 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -43,8 +43,7 @@ build_script: - ps: Push-AppveyorArtifact C:\blis.zip test_script: -- if [%LIB_TYPE%]==[shared] set "TEST_TARGET=checkblis-fast" -- if [%LIB_TYPE%]==[static] set "TEST_TARGET=check" +- set "TEST_TARGET=check" - bash -lc "cd /c/projects/blis && mingw32-make %TEST_TARGET% -j4 V=1" # Enable this to be able to login to the build worker. You can use the From 78eac6a0ab78c995c3f4e46a9e87388b5c3e1af6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 6 Jul 2021 11:05:43 -0500 Subject: [PATCH 034/226] Revert "Always run `make check`." This reverts commit a201a53440c51244739aaee20e3309b50121cc68. --- .appveyor.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 2423eb2310..87aee9c974 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -43,7 +43,8 @@ build_script: - ps: Push-AppveyorArtifact C:\blis.zip test_script: -- set "TEST_TARGET=check" +- if [%LIB_TYPE%]==[shared] set "TEST_TARGET=checkblis-fast" +- if [%LIB_TYPE%]==[static] set "TEST_TARGET=check" - bash -lc "cd /c/projects/blis && mingw32-make %TEST_TARGET% -j4 V=1" # Enable this to be able to login to the build worker. You can use the From f648df4e5588f069b2db96f8be320ead0c1967ef Mon Sep 17 00:00:00 2001 From: Andrew Wildman Date: Tue, 6 Jul 2021 16:35:12 -0700 Subject: [PATCH 035/226] Add symlink to blis.pc.in for out-of-tree builds --- configure | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/configure b/configure index a3f98ba9aa..150bd6a857 100755 --- a/configure +++ b/configure @@ -3629,6 +3629,23 @@ main() exit 1 fi + # If 'blis.pc.in' symlink does not already exist in the current + # directory, create a symbolic link to it. If one does exist, we + # use -f to force creation of a new link. + if [ ! 
-e "./blis.pc.in" ]; then + + echo "${script_name}: creating symbolic link to blis.pc.in." + ln -s "${dist_path}/blis.pc.in" + + elif [ -h "./blis.pc.in" ]; then + echo "${script_name}: symbolic link to blis.pc.in already exists; forcing creation of new link." + ln -sf "${dist_path}/blis.pc.in" + else + echo "${script_name}: Non-symbolic link file or directory 'blis.pc.in' blocks creation of symlink." + echo "${script_name}: *** Please remove this entity and re-run configure." + exit 1 + fi + # If 'common.mk' symlink does not already exist in the current # directory, create a symbolic link to it. If one does exist, we # use -f to force creation of a new link. From 174f7fc9a11712c7bd1a61510bdc5c262b3e8e1f Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 6 Jul 2021 19:35:55 -0500 Subject: [PATCH 036/226] Test installation in Travis CI --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 34e7aa74b6..51e9cf75fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -75,11 +75,12 @@ script: - pwd - if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi - pwd -- $DIST_PATH/configure -t $THR CC=$CC $CONF +- $DIST_PATH/configure -p `pwd`/../install -t $THR CC=$CC $CONF - pwd - ls -l - $CC --version - make -j 2 +- make install # Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx). - if [ "$CONF" = "armsve" ]; then sed -i 's/.*\.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi From 69205ac266947723ad4d7bb028b7521fe5c76991 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 6 Jul 2021 20:39:22 -0500 Subject: [PATCH 037/226] CREDITS file update. Details: - Thanks to Chengguo Sun for submitting #515 (5ef7f68). - Thanks to Andrew Wildman for submitting #519 (551c6b4). - Whitespace update to configure (spaces to tabs). 
--- CREDITS | 2 ++ configure | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/CREDITS b/CREDITS index caad6f6dd7..626874faf1 100644 --- a/CREDITS +++ b/CREDITS @@ -90,6 +90,7 @@ but many others have contributed code and feedback, including Paul Springer @springer13 (RWTH Aachen University) Adam J. Stewart @adamjstewart (University of Illinois at Urbana-Champaign) Vladimir Sukarev + Chengguo Sun @chengguosun Santanu Thangaraj (AMD) Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin) Rhys Ulerich @RhysU (The University of Texas at Austin) @@ -97,6 +98,7 @@ but many others have contributed code and feedback, including Meghana Vankadari @Meghana-vankadari (AMD) Kiran Varaganti @kvaragan (AMD) Natalia Vassilieva (Hewlett Packard Enterprise) + Andrew Wildman @awild82 (University of Washington) Zhang Xianyi @xianyi (Chinese Academy of Sciences) Benda Xu @heroxbd Guodong Xu @docularxu (Linaro.org) diff --git a/configure b/configure index 150bd6a857..27986204a2 100755 --- a/configure +++ b/configure @@ -776,7 +776,7 @@ read_registry_file() # set names, if they exist. Finally, update the kernel registry # with the new kernel list. 
#newklist=$(echo -e "${klisttmp}" | sed -e "s/${ker}/${kers_ker}/g") - newklist=$(substitute_words "${ker}" "${kers_ker}" "${klisttmp}") + newklist=$(substitute_words "${ker}" "${kers_ker}" "${klisttmp}") newklist=$(canonicalize_ws "${newklist}") newklist=$(rm_duplicate_words "${newklist}") @@ -800,22 +800,22 @@ read_registry_file() substitute_words() { - local word new_words list newlist + local word new_words list newlist - word="$1" - new_words="$2" - list="$3" + word="$1" + new_words="$2" + list="$3" - for str in ${list}; do + for str in ${list}; do - if [ "${str}" == "${word}" ]; then - newlist="${newlist} ${new_words}" - else - newlist="${newlist} ${str}" - fi - done + if [ "${str}" == "${word}" ]; then + newlist="${newlist} ${new_words}" + else + newlist="${newlist} ${str}" + fi + done - echo "${newlist}" + echo "${newlist}" } build_kconfig_registry() From 75f03907c58385b656c8bd35d111db245814a9f3 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 7 Jul 2021 15:44:11 -0500 Subject: [PATCH 038/226] Add comment about make checkblas on Windows [ci skip] --- .appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.appveyor.yml b/.appveyor.yml index 87aee9c974..d90d4ba724 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -43,6 +43,7 @@ build_script: - ps: Push-AppveyorArtifact C:\blis.zip test_script: +# "make checkblas" does not work with shared linking Windows due to inability to override xerbla_ - if [%LIB_TYPE%]==[shared] set "TEST_TARGET=checkblis-fast" - if [%LIB_TYPE%]==[static] set "TEST_TARGET=check" - bash -lc "cd /c/projects/blis && mingw32-make %TEST_TARGET% -j4 V=1" From 9a8e649c5ac89eba951bbee7136ca28aeb24d731 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 7 Jul 2021 15:23:57 -0500 Subject: [PATCH 039/226] Fix Win64 AVX512 bug. Use `-march=haswell` for kernels. Fixes #514. 
--- config/skx/make_defs.mk | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk index 3098f85ad6..00ae94a364 100644 --- a/config/skx/make_defs.mk +++ b/config/skx/make_defs.mk @@ -71,7 +71,15 @@ ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xCORE-AVX512 else ifeq ($(CC_VENDOR),clang) +# NOTE: We have to use -march=haswell on Windows because apparently AVX512 +# uses an alternate calling convention where xmm registers are not callee-saved +# on the stack. When this is mixed with framework code compiled for general +# x86_64 mode then chaos ensues (e.g. #514). +ifeq ($(IS_WIN),yes) +CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=haswell +else CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 +endif else $(error gcc, icc, or clang is required for this configuration.) endif @@ -97,7 +105,15 @@ ifeq ($(CC_VENDOR),icc) CRVECFLAGS := -xCORE-AVX2 else ifeq ($(CC_VENDOR),clang) +# NOTE: We have to use -march=haswell on Windows because apparently AVX512 +# uses an alternate calling convention where xmm registers are not callee-saved +# on the stack. When this is mixed with framework code compiled for general +# x86_64 mode then chaos ensues (e.g. #514). +ifeq ($(IS_WIN),yes) +CRVECFLAGS := -march=haswell -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast +endif else $(error gcc, icc, or clang is required for this configuration.) endif From 17729cf449919d1db9777cea5b65d2efc77e2692 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 9 Jul 2021 14:59:48 -0500 Subject: [PATCH 040/226] Add vzeroupper to Haswell microkernels. 
(#524) Details: - Added vzeroupper instruction to the end of all 'gemm' and 'gemmtrsm' microkernels so as to avoid a performance penalty when mixing AVX and SSE instructions. These vzeroupper instructions were once part of the haswell kernels, but were inadvertently removed during a source code shuffle some time ago when we were managing duplicate 'haswell' and 'zen' kernel sets. Thanks to Devin Matthews for tracking this down and re-inserting the missing instructions. --- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 1734 +++++++++-------- .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 966 ++++----- .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 957 ++++----- 3 files changed, 1838 insertions(+), 1819 deletions(-) diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 6e16287dc7..7907bd9018 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -100,23 +100,23 @@ void bli_sgemm_haswell_asm_6x16 uint64_t cs_c = cs_c0; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. - + add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - + lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c @@ -125,46 +125,46 @@ void bli_sgemm_haswell_asm_6x16 prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*4)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) - + // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) @@ -172,51 +172,51 @@ void bli_sgemm_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 76*4)) - + vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 14*4), ymm2) vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 16*4), ymm2) 
vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) @@ -224,91 +224,91 @@ void bli_sgemm_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*4)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - - - + + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -321,222 +321,222 @@ void bli_sgemm_haswell_asm_6x16 vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - - + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. 
jz(.SROWSTORED) // jump to row storage case - - + + cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - - + + + label(.SGENSTORED) - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm4, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm6, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm8, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm10, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm12, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm14, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ //add(rdi, rcx) // c += rs_c; - - + + mov(rdx, rcx) // rcx = c + 8*cs_c - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm5, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm7, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm9, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm11, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm13, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + SGEMM_INPUT_GS_BETA_NZ vfmadd213ps(ymm15, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ //add(rdi, rcx) // c += rs_c; - - - + + + jmp(.SDONE) // jump to end. 
- - - + + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rdx), ymm3, ymm5) vmovups(ymm5, mem(rdx)) add(rdi, rdx) - - + + vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rdx), ymm3, ymm7) vmovups(ymm7, mem(rdx)) add(rdi, rdx) - - + + vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rdx), ymm3, ymm9) vmovups(ymm9, mem(rdx)) add(rdi, rdx) - - + + vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rdx), ymm3, ymm11) vmovups(ymm11, mem(rdx)) add(rdi, rdx) - - + + vfmadd231ps(mem(rcx), ymm3, ymm12) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rdx), ymm3, ymm13) vmovups(ymm13, mem(rdx)) add(rdi, rdx) - - + + vfmadd231ps(mem(rcx), ymm3, ymm14) vmovups(ymm14, mem(rcx)) //add(rdi, rcx) vfmadd231ps(mem(rdx), ymm3, ymm15) vmovups(ymm15, mem(rdx)) //add(rdi, rdx) - - - + + + jmp(.SDONE) // jump to end. - - - + + + label(.SCOLSTORED) - - + + vbroadcastss(mem(rbx), ymm3) - + vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - + vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - + + vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, 
r13, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - + vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14), xmm1, xmm1) @@ -549,7 +549,7 @@ void bli_sgemm_haswell_asm_6x16 vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - + vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) @@ -562,50 +562,50 @@ void bli_sgemm_haswell_asm_6x16 vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - + + + vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - + vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - + + vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) 
vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - + vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14), xmm1, xmm1) @@ -618,7 +618,7 @@ void bli_sgemm_haswell_asm_6x16 vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - + vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) @@ -631,262 +631,264 @@ void bli_sgemm_haswell_asm_6x16 vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - + + + jmp(.SDONE) // jump to end. - - - + + + label(.SBETAZERO) - + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. jz(.SROWSTORBZ) // jump to row storage case - + cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - - - + + + label(.SGENSTORBZ) - - + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ //add(rdi, rcx) // c += rs_c; - - + + mov(rdx, rcx) // rcx = c + 8*cs_c - - + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ //add(rdi, rcx) // c += rs_c; - - - + + + jmp(.SDONE) // jump to end. - - - + + + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm5, mem(rdx)) add(rdi, rdx) - + vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm7, mem(rdx)) add(rdi, rdx) - - + + vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm9, mem(rdx)) add(rdi, rdx) - - + + vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm11, mem(rdx)) add(rdi, rdx) - - + + vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm13, mem(rdx)) add(rdi, rdx) - - + + vmovups(ymm14, mem(rcx)) //add(rdi, rcx) vmovups(ymm15, mem(rdx)) //add(rdi, rdx) - - - + + + jmp(.SDONE) // jump to end. 
- - - + + + label(.SCOLSTORBZ) - - + + vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - + vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - + + vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - + vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - + vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - + + + vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) 
// store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - + vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - + + vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - + vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - + vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - - - + + + + label(.SDONE) - - - end_asm( + + vzeroupper() + + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), 
// 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -946,23 +948,23 @@ void bli_dgemm_haswell_asm_6x8 uint64_t cs_c = cs_c0; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. - + add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - + lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c @@ -971,98 +973,100 @@ void bli_dgemm_haswell_asm_6x8 prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) - + // iteration 1 + prefetch(0, mem(rax, 72*8)) + vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) - + // iteration 2 - prefetch(0, mem(rax, 76*8)) - + prefetch(0, mem(rax, 80*8)) + vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, 
ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) @@ -1070,91 +1074,91 @@ void bli_dgemm_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - - - + + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1167,179 +1171,179 @@ void bli_dgemm_haswell_asm_6x8 vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - - + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
jz(.DROWSTORED) // jump to row storage case - - + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - - + + + label(.DGENSTORED) - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm4, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm6, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm8, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm10, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm12, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm14, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - + + mov(rdx, rcx) // rcx = c + 4*cs_c - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm5, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm7, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm9, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm11, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm13, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + DGEMM_INPUT_GS_BETA_NZ vfmadd213pd(ymm15, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - - + + + jmp(.DDONE) // jump to end. 
- - - + + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm5) vmovupd(ymm5, mem(rdx)) add(rdi, rdx) - - + + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm7) vmovupd(ymm7, mem(rdx)) add(rdi, rdx) - - + + vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm9) vmovupd(ymm9, mem(rdx)) add(rdi, rdx) - - + + vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm11) vmovupd(ymm11, mem(rdx)) add(rdi, rdx) - - + + vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm13) vmovupd(ymm13, mem(rdx)) add(rdi, rdx) - - + + vfmadd231pd(mem(rcx), ymm3, ymm14) vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm15) vmovupd(ymm15, mem(rdx)) //add(rdi, rdx) - - - + + + jmp(.DDONE) // jump to end. - - - + + + label(.DCOLSTORED) - - + + vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) @@ -1348,9 +1352,9 @@ void bli_dgemm_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) - + vbroadcastsd(mem(rbx), ymm3) - + vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) @@ -1359,14 +1363,14 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) - + lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - + vfmadd231pd(mem(r14), xmm3, xmm0) vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) @@ -1375,10 +1379,10 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, 
mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) - + lea(mem(r14, rsi, 4), r14) - - + + vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) @@ -1387,9 +1391,9 @@ void bli_dgemm_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) - + vbroadcastsd(mem(rbx), ymm3) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) @@ -1398,14 +1402,14 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) - + //lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - + vfmadd231pd(mem(r14), xmm3, xmm0) vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) @@ -1414,139 +1418,139 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) - + //lea(mem(r14, rsi, 4), r14) - - - + + + jmp(.DDONE) // jump to end. - - - + + + label(.DBETAZERO) - + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.DROWSTORBZ) // jump to row storage case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - - + + + label(.DGENSTORBZ) - - + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - + + mov(rdx, rcx) // rcx = c + 4*cs_c - - + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c += rs_c; - - + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - - + + + jmp(.DDONE) // jump to end. - - - + + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm7, mem(rdx)) add(rdi, rdx) - - + + vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm9, mem(rdx)) add(rdi, rdx) - - + + vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm11, mem(rdx)) add(rdi, rdx) - - + + vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vmovupd(ymm13, mem(rdx)) add(rdi, rdx) - - + + vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) vmovupd(ymm15, mem(rdx)) //add(rdi, rdx) - - + + jmp(.DDONE) // jump to end. 
- - - + + + label(.DCOLSTORBZ) - - + + vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) @@ -1555,27 +1559,27 @@ void bli_dgemm_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) - + vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) - + lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - + vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) - + lea(mem(r14, rsi, 4), r14) - - + + vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) @@ -1584,48 +1588,52 @@ void bli_dgemm_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) - + vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) - + //lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - + vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) - + //lea(mem(r14, rsi, 4), r14) - - - + + + + label(.DDONE) - - - end_asm( + + vzeroupper() + + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" 
(alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1665,7 +1673,7 @@ void bli_dgemm_haswell_asm_6x8 vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) - + #define CGEMM_OUTPUT_RS \ vmovups(ymm0, mem(rcx)) \ @@ -1692,69 +1700,69 @@ void bli_cgemm_haswell_asm_3x8 uint64_t cs_c = cs_c0; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. - + add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(scomplex) - + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*rs_c; - + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r11, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, 7*8)) // prefetch c + 2*rs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.CLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 32*8)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) - + // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) @@ -1762,51 +1770,51 @@ void bli_cgemm_haswell_asm_3x8 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 38*8)) - + vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 14*4), ymm2) vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) 
vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) @@ -1814,84 +1822,84 @@ void bli_cgemm_haswell_asm_3x8 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(4*3*8), rax) // a += 4*3 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.CLOOPKITER) // iterate again if i != 0. - - - - - - - label(.CCONSIDKLEFT) - - mov(var(k_left), rsi) // i = k_left; - test(rsi, rsi) // check i via logical AND. - je(.CPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. - - + + + + + + + label(.CCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.CPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + label(.CLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 32*8)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(1*3*8), rax) // a += 1*3 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.CLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.CPOSTACCUM) - - + + // permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilps(imm(0xb1), ymm6, ymm6) @@ -1900,76 +1908,76 @@ void bli_cgemm_haswell_asm_3x8 vpermilps(imm(0xb1), ymm11, ymm11) vpermilps(imm(0xb1), ymm14, ymm14) vpermilps(imm(0xb1), ymm15, ymm15) - - + + // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm7, ymm5, ymm5) - + vaddsubps(ymm10, ymm8, ymm8) vaddsubps(ymm11, ymm9, ymm9) - + vaddsubps(ymm14, ymm12, ymm12) vaddsubps(ymm15, ymm13, ymm13) - - - - + + + + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate - - + + vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) - + vpermilps(imm(0xb1), ymm5, ymm3) vmulps(ymm0, ymm5, ymm5) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm5, ymm5) - - + + vpermilps(imm(0xb1), ymm8, ymm3) vmulps(ymm0, ymm8, ymm8) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm8, ymm8) - + vpermilps(imm(0xb1), ymm9, ymm3) vmulps(ymm0, 
ymm9, ymm9) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm9, ymm9) - - + + vpermilps(imm(0xb1), ymm12, ymm3) vmulps(ymm0, ymm12, ymm12) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm12, ymm12) - + vpermilps(imm(0xb1), ymm13, ymm3) vmulps(ymm0, ymm13, ymm13) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm13, ymm13) - - - - - + + + + + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate - - - - + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex) lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. @@ -1978,186 +1986,187 @@ void bli_cgemm_haswell_asm_3x8 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.CROWSTORED) // jump to row storage case - - - + + + label(.CGENSTORED) - - + + CGEMM_INPUT_SCALE_GS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_GS add(rdx, rcx) // c += 4*cs_c; - - + + CGEMM_INPUT_SCALE_GS_BETA_NZ vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_GS mov(r11, rcx) // rcx = c + 1*rs_c - - - + + + CGEMM_INPUT_SCALE_GS_BETA_NZ vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_GS add(rdx, rcx) // c += 4*cs_c; - - + + CGEMM_INPUT_SCALE_GS_BETA_NZ vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_GS mov(r12, rcx) // rcx = c + 2*rs_c - - - + + + CGEMM_INPUT_SCALE_GS_BETA_NZ vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_GS add(rdx, rcx) // c += 4*cs_c; - - + + CGEMM_INPUT_SCALE_GS_BETA_NZ vaddps(ymm13, ymm0, ymm0) CGEMM_OUTPUT_GS - - - + + + jmp(.CDONE) // jump to end. 
- - - + + + label(.CROWSTORED) - - + + CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS add(rdx, rcx) // c += 4*cs_c; - - + + CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_RS mov(r11, rcx) // rcx = c + 1*rs_c - - - + + + CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_RS add(rdx, rcx) // c += 4*cs_c; - - + + CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_RS mov(r12, rcx) // rcx = c + 2*rs_c - - - + + + CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_RS add(rdx, rcx) // c += 4*cs_c; - - + + CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm13, ymm0, ymm0) CGEMM_OUTPUT_RS - - - + + + jmp(.CDONE) // jump to end. - - - + + + label(.CBETAZERO) - + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.CROWSTORBZ) // jump to row storage case - - - + + + label(.CGENSTORBZ) - - + + vmovaps(ymm4, ymm0) CGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + vmovaps(ymm5, ymm0) CGEMM_OUTPUT_GS mov(r11, rcx) // rcx = c + 1*rs_c - - - + + + vmovaps(ymm8, ymm0) CGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + vmovaps(ymm9, ymm0) CGEMM_OUTPUT_GS mov(r12, rcx) // rcx = c + 2*rs_c - - - + + + vmovaps(ymm12, ymm0) CGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + vmovaps(ymm13, ymm0) CGEMM_OUTPUT_GS - - - + + + jmp(.CDONE) // jump to end. 
- - - + + + label(.CROWSTORBZ) - - + + vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rdx, 1)) - + vmovups(ymm8, mem(r11)) vmovups(ymm9, mem(r11, rdx, 1)) - + vmovups(ymm12, mem(r12)) vmovups(ymm13, mem(r12, rdx, 1)) - - - - - - + + + + label(.CDONE) - - - end_asm( + + vzeroupper() + + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2180,7 +2189,7 @@ void bli_cgemm_haswell_asm_3x8 vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) - + // assumes values to output are in ymm0 #define ZGEMM_OUTPUT_GS \ vextractf128(imm(1), ymm0, xmm3) \ @@ -2193,7 +2202,7 @@ void bli_cgemm_haswell_asm_3x8 vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) - + #define ZGEMM_OUTPUT_RS \ vmovupd(ymm0, mem(rcx)) \ @@ -2220,122 +2229,124 @@ void bli_zgemm_haswell_asm_3x4 uint64_t cs_c = cs_c0; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
- + add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) - + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*rs_c; - + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r11, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, 7*8)) // prefetch c + 2*rs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.ZLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 32*16)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) - + // iteration 1 + prefetch(0, mem(rax, 36*16)) + vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, 
ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) - + // iteration 2 - prefetch(0, mem(rax, 38*16)) - + prefetch(0, mem(rax, 40*16)) + vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) @@ -2343,83 +2354,83 @@ void bli_zgemm_haswell_asm_3x4 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(4*3*16), rax) // a += 4*3 (unroll x mr) add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.ZLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.ZCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.ZLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 32*16)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(1*3*16), rax) // a += 1*3 (unroll x mr) add(imm(1*4*16), rbx) // b += 1*4 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.ZLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.ZPOSTACCUM) - + // permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) @@ -2428,76 +2439,76 @@ void bli_zgemm_haswell_asm_3x4 vpermilpd(imm(0x5), ymm11, ymm11) vpermilpd(imm(0x5), ymm14, ymm14) vpermilpd(imm(0x5), ymm15, ymm15) - - + + // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm7, ymm5, ymm5) - + vaddsubpd(ymm10, ymm8, ymm8) vaddsubpd(ymm11, ymm9, ymm9) - + vaddsubpd(ymm14, ymm12, ymm12) vaddsubpd(ymm15, ymm13, ymm13) - - - - + + + + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate - - + + vpermilpd(imm(0x5), ymm4, ymm3) vmulpd(ymm0, ymm4, ymm4) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) - + vpermilpd(imm(0x5), ymm5, ymm3) vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) - - + + vpermilpd(imm(0x5), ymm8, ymm3) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm9, ymm3) vmulpd(ymm0, ymm9, 
ymm9) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) - - + + vpermilpd(imm(0x5), ymm12, ymm3) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) - + vpermilpd(imm(0x5), ymm13, ymm3) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm13, ymm13) - - - - - + + + + + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate - - - - + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c; - - - + + + // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. @@ -2506,186 +2517,187 @@ void bli_zgemm_haswell_asm_3x4 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. jz(.ZROWSTORED) // jump to row storage case - - - + + + label(.ZGENSTORED) - - + + ZGEMM_INPUT_SCALE_GS_BETA_NZ vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + ZGEMM_INPUT_SCALE_GS_BETA_NZ vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_GS mov(r11, rcx) // rcx = c + 1*rs_c - - - + + + ZGEMM_INPUT_SCALE_GS_BETA_NZ vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + ZGEMM_INPUT_SCALE_GS_BETA_NZ vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_GS mov(r12, rcx) // rcx = c + 2*rs_c - - - + + + ZGEMM_INPUT_SCALE_GS_BETA_NZ vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + ZGEMM_INPUT_SCALE_GS_BETA_NZ vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_GS - - - + + + jmp(.ZDONE) // jump to end. 
- - - + + + label(.ZROWSTORED) - - + + ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_RS add(rdx, rcx) // c += 2*cs_c; - - + + ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_RS mov(r11, rcx) // rcx = c + 1*rs_c - - - + + + ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_RS add(rdx, rcx) // c += 2*cs_c; - - + + ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_RS mov(r12, rcx) // rcx = c + 2*rs_c - - - + + + ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_RS add(rdx, rcx) // c += 2*cs_c; - - + + ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_RS - - - + + + jmp(.ZDONE) // jump to end. - - - + + + label(.ZBETAZERO) - + cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. jz(.ZROWSTORBZ) // jump to row storage case - - - + + + label(.ZGENSTORBZ) - - + + vmovapd(ymm4, ymm0) ZGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + vmovapd(ymm5, ymm0) ZGEMM_OUTPUT_GS mov(r11, rcx) // rcx = c + 1*rs_c - - - + + + vmovapd(ymm8, ymm0) ZGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + vmovapd(ymm9, ymm0) ZGEMM_OUTPUT_GS mov(r12, rcx) // rcx = c + 2*rs_c - - - + + + vmovapd(ymm12, ymm0) ZGEMM_OUTPUT_GS add(rdx, rcx) // c += 2*cs_c; - - + + vmovapd(ymm13, ymm0) ZGEMM_OUTPUT_GS - - - + + + jmp(.ZDONE) // jump to end. 
- - - + + + label(.ZROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx, rdx, 1)) - + vmovupd(ymm8, mem(r11)) vmovupd(ymm9, mem(r11, rdx, 1)) - + vmovupd(ymm12, mem(r12)) vmovupd(ymm13, mem(r12, rdx, 1)) - - - - - - + + + + label(.ZDONE) - - - end_asm( + + vzeroupper() + + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index a6edf8c492..aead3ea9f8 100644 --- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -82,22 +82,22 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 float* beta = bli_sm1; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. 
- + add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - + mov(var(b11), rcx) // load address of b11 mov(imm(16), rdi) // set rs_b = PACKNR = 16 lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) - + // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. @@ -106,45 +106,45 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*4)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) - + // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) @@ -152,51 +152,51 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, 
ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 76*4)) - + vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 14*4), ymm2) vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) @@ -204,144 +204,144 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*4)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + // ymm4..ymm15 = -a10 * b01 - - - + + + mov(var(alpha), rbx) // load address of alpha vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate - - - - + + + + mov(imm(1), rsi) // load cs_b = 1 lea(mem(, rsi, 4), rsi) // cs_b *= sizeof(float) - + lea(mem(rcx, rsi, 8), rdx) // load address of b11 + 8*cs_b - + mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+8*cs_b for later - - + + // b11 := alpha * b11 - a10 * b01 vfmsub231ps(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm5) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm7) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm9) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm11) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm13) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm15) //add(rdi, rdx) - - - + + + // prefetch 
c11 - + #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(float) - + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; - + prefetch(0, mem(rcx, 0*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 0*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 0*8)) // prefetch c11 + 2*rs_c @@ -349,12 +349,12 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 prefetch(0, mem(rdx, r9, 1, 0*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 0*8)) // prefetch c11 + 5*rs_c #endif - - - - + + + + // trsm computation begins here - + // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) @@ -362,18 +362,18 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 // ymm10 ymm11 = ( beta30..37 ) ( beta38..3F ) // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) - - + + mov(var(a11), rax) // load address of a11 - + mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+8*cs_b // Note: rdi = rs_b - + // iteration 0 ------------- - + vbroadcastss(mem(0+0*6)*4(rax), ymm0) // ymm0 = (1/alpha00) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulps(ymm0, ymm5, ymm5) // ymm5 *= (1/alpha00) @@ -381,23 +381,23 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vdivps(ymm0, ymm4, ymm4) // ymm4 /= alpha00 vdivps(ymm0, ymm5, ymm5) // ymm5 /= alpha00 #endif - + vmovups(ymm4, mem(rcx)) // store ( beta00..beta07 ) = ymm4 vmovups(ymm5, mem(rdx)) // store ( beta08..beta0F ) = ymm5 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 1 ------------- - + vbroadcastss(mem(1+0*6)*4(rax), ymm0) // ymm0 = alpha10 vbroadcastss(mem(1+1*6)*4(rax), ymm1) // ymm1 = (1/alpha11) - + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha10 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha10 * ymm5 - + vsubps(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubps(ymm3, ymm7, ymm7) // ymm7 -= 
ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulps(ymm1, ymm7, ymm7) // ymm7 *= (1/alpha11) @@ -405,28 +405,28 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vdivps(ymm1, ymm6, ymm6) // ymm6 /= alpha11 vdivps(ymm1, ymm7, ymm7) // ymm7 /= alpha11 #endif - + vmovups(ymm6, mem(rcx)) // store ( beta10..beta17 ) = ymm6 vmovups(ymm7, mem(rdx)) // store ( beta18..beta1F ) = ymm7 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 2 ------------- - + vbroadcastss(mem(2+0*6)*4(rax), ymm0) // ymm0 = alpha20 vbroadcastss(mem(2+1*6)*4(rax), ymm1) // ymm1 = alpha21 - + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha20 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha20 * ymm5 - + vbroadcastss(mem(2+2*6)*4(rax), ymm0) // ymm0 = (1/alpha22) - + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha21 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha21 * ymm7 - + vsubps(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubps(ymm3, ymm9, ymm9) // ymm9 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulps(ymm0, ymm9, ymm9) // ymm9 *= (1/alpha22) @@ -434,33 +434,33 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vdivps(ymm0, ymm8, ymm8) // ymm8 /= alpha22 vdivps(ymm0, ymm9, ymm9) // ymm9 /= alpha22 #endif - + vmovups(ymm8, mem(rcx)) // store ( beta20..beta27 ) = ymm8 vmovups(ymm9, mem(rdx)) // store ( beta28..beta2F ) = ymm9 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 3 ------------- - + vbroadcastss(mem(3+0*6)*4(rax), ymm0) // ymm0 = alpha30 vbroadcastss(mem(3+1*6)*4(rax), ymm1) // ymm1 = alpha31 - + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha30 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha30 * ymm5 - + vbroadcastss(mem(3+2*6)*4(rax), ymm0) // ymm0 = alpha32 - + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha31 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha31 * ymm7 - + vbroadcastss(mem(3+3*6)*4(rax), ymm1) // ymm0 = (1/alpha33) - + vfmadd231ps(ymm0, ymm8, ymm2) // 
ymm2 += alpha32 * ymm8 vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha32 * ymm9 - + vsubps(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubps(ymm3, ymm11, ymm11) // ymm11 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulps(ymm1, ymm11, ymm11) // ymm11 *= (1/alpha33) @@ -468,38 +468,38 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vdivps(ymm1, ymm10, ymm10) // ymm10 /= alpha33 vdivps(ymm1, ymm11, ymm11) // ymm11 /= alpha33 #endif - + vmovups(ymm10, mem(rcx)) // store ( beta30..beta37 ) = ymm10 vmovups(ymm11, mem(rdx)) // store ( beta38..beta3F ) = ymm11 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 4 ------------- - + vbroadcastss(mem(4+0*6)*4(rax), ymm0) // ymm0 = alpha40 vbroadcastss(mem(4+1*6)*4(rax), ymm1) // ymm1 = alpha41 - + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha40 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha40 * ymm5 - + vbroadcastss(mem(4+2*6)*4(rax), ymm0) // ymm0 = alpha42 - + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha41 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha41 * ymm7 - + vbroadcastss(mem(4+3*6)*4(rax), ymm1) // ymm1 = alpha43 - + vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha42 * ymm8 vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha42 * ymm9 - + vbroadcastss(mem(4+4*6)*4(rax), ymm0) // ymm0 = (1/alpha44) - + vfmadd231ps(ymm1, ymm10, ymm2) // ymm2 += alpha43 * ymm10 vfmadd231ps(ymm1, ymm11, ymm3) // ymm3 += alpha43 * ymm11 - + vsubps(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubps(ymm3, ymm13, ymm13) // ymm13 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulps(ymm0, ymm13, ymm13) // ymm13 *= (1/alpha44) @@ -507,43 +507,43 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vdivps(ymm0, ymm12, ymm12) // ymm12 /= alpha44 vdivps(ymm0, ymm13, ymm13) // ymm13 /= alpha44 #endif - + vmovups(ymm12, mem(rcx)) // store ( beta40..beta47 ) = ymm12 vmovups(ymm13, mem(rdx)) // store ( beta48..beta4F ) = ymm13 add(rdi, rcx) // rcx += rs_b 
add(rdi, rdx) // rdx += rs_b - + // iteration 5 ------------- - + vbroadcastss(mem(5+0*6)*4(rax), ymm0) // ymm0 = alpha50 vbroadcastss(mem(5+1*6)*4(rax), ymm1) // ymm1 = alpha51 - + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha50 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha50 * ymm5 - + vbroadcastss(mem(5+2*6)*4(rax), ymm0) // ymm0 = alpha52 - + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha51 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha51 * ymm7 - + vbroadcastss(mem(5+3*6)*4(rax), ymm1) // ymm1 = alpha53 - + vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha52 * ymm8 vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha52 * ymm9 - + vbroadcastss(mem(5+4*6)*4(rax), ymm0) // ymm0 = alpha54 - + vfmadd231ps(ymm1, ymm10, ymm2) // ymm2 += alpha53 * ymm10 vfmadd231ps(ymm1, ymm11, ymm3) // ymm3 += alpha53 * ymm11 - + vbroadcastss(mem(5+5*6)*4(rax), ymm1) // ymm1 = (1/alpha55) - + vfmadd231ps(ymm0, ymm12, ymm2) // ymm2 += alpha54 * ymm12 vfmadd231ps(ymm0, ymm13, ymm3) // ymm3 += alpha54 * ymm13 - + vsubps(ymm2, ymm14, ymm14) // ymm14 -= ymm2 vsubps(ymm3, ymm15, ymm15) // ymm15 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulps(ymm1, ymm15, ymm15) // ymm15 *= (1/alpha55) @@ -551,189 +551,189 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vdivps(ymm1, ymm14, ymm14) // ymm14 /= alpha55 vdivps(ymm1, ymm15, ymm15) // ymm15 /= alpha55 #endif - + vmovups(ymm14, mem(rcx)) // store ( beta50..beta57 ) = ymm14 vmovups(ymm15, mem(rdx)) // store ( beta58..beta5F ) = ymm15 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - - - - - + + + + + mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 - + lea(mem(rcx, rsi, 8), rdx) // load address of c11 + 8*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; - + // These are used in the macros below. 
lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - - - + + + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. jz(.SROWSTORED) // jump to row storage case - - - + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - - + + + // if neither row- or column- // stored, use general case. label(.SGENSTORED) - - + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - - + + mov(rdx, rcx) // rcx = c11 + 8*cs_c - - + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - - - + + + jmp(.SDONE) - - - + + + label(.SROWSTORED) - - + + vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm5, mem(rdx)) add(rdi, rdx) - + vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm7, mem(rdx)) add(rdi, rdx) - + vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm9, mem(rdx)) add(rdi, rdx) - + vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm11, mem(rdx)) add(rdi, rdx) - + vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm13, mem(rdx)) add(rdi, rdx) - + vmovups(ymm14, mem(rcx)) //add(rdi, rcx) vmovups(ymm15, mem(rdx)) //add(rdi, rdx) - - + + jmp(.SDONE) - - - + + + 
label(.SCOLSTORED) - - + + vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - + + vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm14, ymm12, ymm0) vunpckhps(ymm14, ymm12, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) @@ -742,46 +742,46 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - + + vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx)) // store ( gamma08..gamma38 ) 
vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma09..gamma39 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma0C..gamma3C ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma0D..gamma3D ) - + vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma0A..gamma3A ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma0B..gamma3B ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma0E..gamma3E ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma0F..gamma3F ) - + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm15, ymm13, ymm0) vunpckhps(ymm15, ymm13, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovlpd(xmm0, mem(r14)) // store ( gamma48..gamma58 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma49..gamma59 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma4A..gamma5A ) @@ -790,33 +790,34 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma4D..gamma5D ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma4E..gamma5E ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma4F..gamma5F ) - + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - - + + + + label(.SDONE) - + vzeroupper() - + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a10] "m" (a10), // 2 - [b01] "m" (b01), // 3 - [beta] "m" (beta), // 4 - [alpha] "m" (alpha), // 5 - [a11] "m" (a11), // 6 - [b11] "m" (b11), // 7 - [c11] "m" (c11), // 8 - [rs_c] "m" (rs_c), // 9 - [cs_c] "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + [alpha] "m" (alpha), // 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" 
(rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -841,17 +842,17 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemmtrsm_l_haswell_asm_6x8 -( - dim_t k0, - double* restrict alpha, - double* restrict a10, - double* restrict a11, - double* restrict b01, - double* restrict b11, - double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx -) + ( + dim_t k0, + double* restrict alpha, + double* restrict a10, + double* restrict a11, + double* restrict b01, + double* restrict b11, + double* restrict c11, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); @@ -866,22 +867,22 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 double* beta = bli_dm1; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. - + add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - + mov(var(b11), rcx) // load address of b11 mov(imm(8), rdi) // set rs_b = PACKNR = 8 lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) - + // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. @@ -890,97 +891,99 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) - + // iteration 1 + prefetch(0, mem(rax, 72*8)) + vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) - + // iteration 2 - prefetch(0, mem(rax, 76*8)) - + prefetch(0, mem(rax, 80*8)) + vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, 
ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) @@ -988,145 +991,145 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + // ymm4..ymm15 = -a10 * b01 - - - - + + + + mov(var(alpha), rbx) // load address of alpha vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate - - - - + + + + mov(imm(1), rsi) // set cs_b = 1 lea(mem(, rsi, 8), rsi) // cs_b *= sizeof(double) - + lea(mem(rcx, rsi, 4), rdx) // load address of b11 + 4*cs_b - + mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+4*cs_b for later - - + + // b11 := alpha * b11 - a10 * b01 vfmsub231pd(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm5) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm7) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm9) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm11) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm13) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm15) //add(rdi, rdx) - - - + + + // 
prefetch c11 - + #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(double) - + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; - + prefetch(0, mem(rcx, 7*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 7*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 7*8)) // prefetch c11 + 2*rs_c @@ -1134,12 +1137,12 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 prefetch(0, mem(rdx, r9, 1, 7*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 7*8)) // prefetch c11 + 5*rs_c #endif - - - - + + + + // trsm computation begins here - + // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) @@ -1147,18 +1150,18 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 // ymm10 ymm11 = ( beta30..33 ) ( beta34..37 ) // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) - - + + mov(var(a11), rax) // load address of a11 - + mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+4*cs_b // Note: rdi = rs_b - + // iteration 0 ------------- - + vbroadcastsd(mem(0+0*6)*8(rax), ymm0) // ymm0 = (1/alpha00) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulpd(ymm0, ymm5, ymm5) // ymm5 *= (1/alpha00) @@ -1166,23 +1169,23 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vdivpd(ymm0, ymm4, ymm4) // ymm4 /= alpha00 vdivpd(ymm0, ymm5, ymm5) // ymm5 /= alpha00 #endif - + vmovupd(ymm4, mem(rcx)) // store ( beta00..beta03 ) = ymm4 vmovupd(ymm5, mem(rdx)) // store ( beta04..beta07 ) = ymm5 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 1 ------------- - + vbroadcastsd(mem(1+0*6)*8(rax), ymm0) // ymm0 = alpha10 vbroadcastsd(mem(1+1*6)*8(rax), ymm1) // ymm1 = (1/alpha11) - + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha10 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha10 * ymm5 - + vsubpd(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubpd(ymm3, ymm7, 
ymm7) // ymm7 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulpd(ymm1, ymm7, ymm7) // ymm7 *= (1/alpha11) @@ -1190,28 +1193,28 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vdivpd(ymm1, ymm6, ymm6) // ymm6 /= alpha11 vdivpd(ymm1, ymm7, ymm7) // ymm7 /= alpha11 #endif - + vmovupd(ymm6, mem(rcx)) // store ( beta10..beta13 ) = ymm6 vmovupd(ymm7, mem(rdx)) // store ( beta14..beta17 ) = ymm7 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 2 ------------- - + vbroadcastsd(mem(2+0*6)*8(rax), ymm0) // ymm0 = alpha20 vbroadcastsd(mem(2+1*6)*8(rax), ymm1) // ymm1 = alpha21 - + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha20 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha20 * ymm5 - + vbroadcastsd(mem(2+2*6)*8(rax), ymm0) // ymm0 = (1/alpha22) - + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha21 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha21 * ymm7 - + vsubpd(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubpd(ymm3, ymm9, ymm9) // ymm9 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulpd(ymm0, ymm9, ymm9) // ymm9 *= (1/alpha22) @@ -1219,33 +1222,33 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vdivpd(ymm0, ymm8, ymm8) // ymm8 /= alpha22 vdivpd(ymm0, ymm9, ymm9) // ymm9 /= alpha22 #endif - + vmovupd(ymm8, mem(rcx)) // store ( beta20..beta23 ) = ymm8 vmovupd(ymm9, mem(rdx)) // store ( beta24..beta27 ) = ymm9 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 3 ------------- - + vbroadcastsd(mem(3+0*6)*8(rax), ymm0) // ymm0 = alpha30 vbroadcastsd(mem(3+1*6)*8(rax), ymm1) // ymm1 = alpha31 - + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha30 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha30 * ymm5 - + vbroadcastsd(mem(3+2*6)*8(rax), ymm0) // ymm0 = alpha32 - + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha31 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha31 * ymm7 - + vbroadcastsd(mem(3+3*6)*8(rax), ymm1) // ymm1 = (1/alpha33) - + 
vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha32 * ymm8 vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha32 * ymm9 - + vsubpd(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubpd(ymm3, ymm11, ymm11) // ymm11 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulpd(ymm1, ymm11, ymm11) // ymm11 *= (1/alpha33) @@ -1253,38 +1256,38 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vdivpd(ymm1, ymm10, ymm10) // ymm10 /= alpha33 vdivpd(ymm1, ymm11, ymm11) // ymm11 /= alpha33 #endif - + vmovupd(ymm10, mem(rcx)) // store ( beta30..beta33 ) = ymm10 vmovupd(ymm11, mem(rdx)) // store ( beta34..beta37 ) = ymm11 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 4 ------------- - + vbroadcastsd(mem(4+0*6)*8(rax), ymm0) // ymm0 = alpha40 vbroadcastsd(mem(4+1*6)*8(rax), ymm1) // ymm1 = alpha41 - + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha40 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha40 * ymm5 - + vbroadcastsd(mem(4+2*6)*8(rax), ymm0) // ymm0 = alpha42 - + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha41 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha41 * ymm7 - + vbroadcastsd(mem(4+3*6)*8(rax), ymm1) // ymm1 = alpha43 - + vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha42 * ymm8 vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha42 * ymm9 - + vbroadcastsd(mem(4+4*6)*8(rax), ymm0) // ymm4 = (1/alpha44) - + vfmadd231pd(ymm1, ymm10, ymm2) // ymm2 += alpha43 * ymm10 vfmadd231pd(ymm1, ymm11, ymm3) // ymm3 += alpha43 * ymm11 - + vsubpd(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubpd(ymm3, ymm13, ymm13) // ymm13 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulpd(ymm0, ymm13, ymm13) // ymm13 *= (1/alpha44) @@ -1292,43 +1295,43 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vdivpd(ymm0, ymm12, ymm12) // ymm12 /= alpha44 vdivpd(ymm0, ymm13, ymm13) // ymm13 /= alpha44 #endif - + vmovupd(ymm12, mem(rcx)) // store ( beta40..beta43 ) = ymm12 vmovupd(ymm13, mem(rdx)) // store ( beta44..beta47 ) = 
ymm13 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - + // iteration 5 ------------- - + vbroadcastsd(mem(5+0*6)*8(rax), ymm0) // ymm0 = alpha50 vbroadcastsd(mem(5+1*6)*8(rax), ymm1) // ymm1 = alpha51 - + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha50 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha50 * ymm5 - + vbroadcastsd(mem(5+2*6)*8(rax), ymm0) // ymm0 = alpha52 - + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha51 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha51 * ymm7 - + vbroadcastsd(mem(5+3*6)*8(rax), ymm1) // ymm1 = alpha53 - + vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha52 * ymm8 vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha52 * ymm9 - + vbroadcastsd(mem(5+4*6)*8(rax), ymm0) // ymm0 = alpha54 - + vfmadd231pd(ymm1, ymm10, ymm2) // ymm2 += alpha53 * ymm10 vfmadd231pd(ymm1, ymm11, ymm3) // ymm3 += alpha53 * ymm11 - + vbroadcastsd(mem(5+5*6)*8(rax), ymm1) // ymm1 = (1/alpha55) - + vfmadd231pd(ymm0, ymm12, ymm2) // ymm2 += alpha54 * ymm12 vfmadd231pd(ymm0, ymm13, ymm3) // ymm3 += alpha54 * ymm13 - + vsubpd(ymm2, ymm14, ymm14) // ymm14 -= ymm2 vsubpd(ymm3, ymm15, ymm15) // ymm15 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulpd(ymm1, ymm15, ymm15) // ymm15 *= (1/alpha55) @@ -1336,150 +1339,150 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vdivpd(ymm1, ymm14, ymm14) // ymm14 /= alpha55 vdivpd(ymm1, ymm15, ymm15) // ymm15 /= alpha55 #endif - + vmovupd(ymm14, mem(rcx)) // store ( beta50..beta53 ) = ymm14 vmovupd(ymm15, mem(rdx)) // store ( beta54..beta57 ) = ymm15 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b - - - - + + + + mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 - + lea(mem(rcx, rsi, 4), rdx) // load address of c11 + 4*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; - + // These are used in the macros below. 
lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - - - + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.DROWSTORED) // jump to row storage case - - - + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - - + + + // if neither row- or column- // stored, use general case. label(.DGENSTORED) - - + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - + + mov(rdx, rcx) // rcx = c11 + 4*cs_c - - + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - + + jmp(.DDONE) - - - + + + label(.DROWSTORED) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm7, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm9, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm11, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vmovupd(ymm13, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) vmovupd(ymm15, mem(rdx)) //add(rdi, rdx) - - + + jmp(.DDONE) - - - + + + 
label(.DCOLSTORED) - - + + vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) @@ -1488,27 +1491,27 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) - + vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) - + lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) - + lea(mem(r14, rsi, 4), r14) - - + + vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) @@ -1517,50 +1520,49 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) - + vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) - + //lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) - + //lea(mem(r14, rsi, 4), r14) - - - - - + + + + label(.DDONE) - + vzeroupper() - + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a10] "m" (a10), // 2 - [b01] "m" (b01), // 3 - [beta] "m" (beta), // 4 - [alpha] "m" (alpha), // 5 - [a11] "m" (a11), // 6 - [b11] "m" (b11), // 7 - [c11] "m" (c11), // 8 - [rs_c] "m" (rs_c), // 9 - [cs_c] "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + 
[alpha] "m" (alpha), // 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" (rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index b14fb11775..2849e6994d 100644 --- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -82,22 +82,22 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 float* beta = bli_sm1; begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. - + add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - + mov(var(b11), rcx) // load address of b11 mov(imm(16), rdi) // set rs_b = PACKNR = 16 lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) - + // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. @@ -106,45 +106,45 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*4)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) - + // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) @@ -152,51 +152,51 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 76*4)) - + vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 14*4), ymm2) vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, 
ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) @@ -204,144 +204,144 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*4)) - + vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) - + vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + // ymm4..ymm15 = -a10 * b01 - - - + + + mov(var(alpha), rbx) // load address of alpha vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate - - - - + + + + mov(imm(1), rsi) // load cs_b = 1 lea(mem(, rsi, 4), rsi) // cs_b *= sizeof(float) - + lea(mem(rcx, rsi, 8), rdx) // load address of b11 + 8*cs_b - + mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+8*cs_b for later - - + + // b11 := alpha * b11 - a10 * b01 vfmsub231ps(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm5) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm7) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm9) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm11) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm13) add(rdi, rdx) - + vfmsub231ps(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm15) //add(rdi, rdx) - - - + + + // prefetch 
c11 - + #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(float) - + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; - + prefetch(0, mem(rcx, 0*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 0*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 0*8)) // prefetch c11 + 2*rs_c @@ -349,12 +349,12 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 prefetch(0, mem(rdx, r9, 1, 0*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 0*8)) // prefetch c11 + 5*rs_c #endif - - - - + + + + // trsm computation begins here - + // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) @@ -362,23 +362,23 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 // ymm10 ymm11 = ( beta30..37 ) ( beta38..3F ) // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) - - + + mov(var(a11), rax) // load address of a11 - + mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+8*cs_b - + lea(mem(rcx, rdi, 4), rcx) // rcx = b11 + (6-1)*rs_b lea(mem(rcx, rdi, 1), rcx) lea(mem(rdx, rdi, 4), rdx) // rdx = b11 + (6-1)*rs_b + 8*cs_b lea(mem(rdx, rdi, 1), rdx) - - + + // iteration 0 ------------- - + vbroadcastss(mem(5+5*6)*4(rax), ymm0) // ymm0 = (1/alpha55) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulps(ymm0, ymm15, ymm15) // ymm15 *= (1/alpha55) @@ -386,23 +386,23 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vdivps(ymm0, ymm14, ymm14) // ymm14 /= alpha55 vdivps(ymm0, ymm15, ymm15) // ymm15 /= alpha55 #endif - + vmovups(ymm14, mem(rcx)) // store ( beta50..beta57 ) = ymm14 vmovups(ymm15, mem(rdx)) // store ( beta58..beta5F ) = ymm15 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 1 ------------- - + vbroadcastss(mem(4+5*6)*4(rax), ymm0) // ymm0 = alpha45 vbroadcastss(mem(4+4*6)*4(rax), ymm1) // ymm1 = (1/alpha44) - + 
vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha45 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha45 * ymm15 - + vsubps(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubps(ymm3, ymm13, ymm13) // ymm13 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulps(ymm1, ymm13, ymm13) // ymm13 *= (1/alpha44) @@ -410,28 +410,28 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vdivps(ymm1, ymm12, ymm12) // ymm12 /= alpha44 vdivps(ymm1, ymm13, ymm13) // ymm13 /= alpha44 #endif - + vmovups(ymm12, mem(rcx)) // store ( beta40..beta47 ) = ymm12 vmovups(ymm13, mem(rdx)) // store ( beta48..beta4F ) = ymm13 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 2 ------------- - + vbroadcastss(mem(3+5*6)*4(rax), ymm0) // ymm0 = alpha35 vbroadcastss(mem(3+4*6)*4(rax), ymm1) // ymm1 = alpha34 - + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha35 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha35 * ymm15 - + vbroadcastss(mem(3+3*6)*4(rax), ymm0) // ymm0 = (1/alpha33) - + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha34 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha34 * ymm13 - + vsubps(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubps(ymm3, ymm11, ymm11) // ymm11 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulps(ymm0, ymm11, ymm11) // ymm11 *= (1/alpha33) @@ -439,33 +439,33 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vdivps(ymm0, ymm10, ymm10) // ymm10 /= alpha33 vdivps(ymm0, ymm11, ymm11) // ymm11 /= alpha33 #endif - + vmovups(ymm10, mem(rcx)) // store ( beta30..beta37 ) = ymm10 vmovups(ymm11, mem(rdx)) // store ( beta38..beta3F ) = ymm11 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 3 ------------- - + vbroadcastss(mem(2+5*6)*4(rax), ymm0) // ymm0 = alpha25 vbroadcastss(mem(2+4*6)*4(rax), ymm1) // ymm1 = alpha24 - + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha25 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha25 * ymm15 - + vbroadcastss(mem(2+3*6)*4(rax), 
ymm0) // ymm0 = alpha23 - + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha24 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha24 * ymm13 - + vbroadcastss(mem(2+2*6)*4(rax), ymm1) // ymm1 = (1/alpha22) - + vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha23 * ymm10 vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha23 * ymm11 - + vsubps(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubps(ymm3, ymm9, ymm9) // ymm9 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulps(ymm1, ymm9, ymm9) // ymm9 *= (1/alpha22) @@ -473,38 +473,38 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vdivps(ymm1, ymm8, ymm8) // ymm8 /= alpha22 vdivps(ymm1, ymm9, ymm9) // ymm9 /= alpha22 #endif - + vmovups(ymm8, mem(rcx)) // store ( beta20..beta27 ) = ymm8 vmovups(ymm9, mem(rdx)) // store ( beta28..beta2F ) = ymm9 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 4 ------------- - + vbroadcastss(mem(1+5*6)*4(rax), ymm0) // ymm0 = alpha15 vbroadcastss(mem(1+4*6)*4(rax), ymm1) // ymm1 = alpha14 - + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha15 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha15 * ymm15 - + vbroadcastss(mem(1+3*6)*4(rax), ymm0) // ymm0 = alpha13 - + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha14 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha14 * ymm13 - + vbroadcastss(mem(1+2*6)*4(rax), ymm1) // ymm1 = alpha12 - + vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha13 * ymm10 vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha13 * ymm11 - + vbroadcastss(mem(1+1*6)*4(rax), ymm0) // ymm4 = (1/alpha11) - + vfmadd231ps(ymm1, ymm8, ymm2) // ymm2 += alpha12 * ymm8 vfmadd231ps(ymm1, ymm9, ymm3) // ymm3 += alpha12 * ymm9 - + vsubps(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubps(ymm3, ymm7, ymm7) // ymm7 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulps(ymm0, ymm7, ymm7) // ymm7 *= (1/alpha11) @@ -512,43 +512,43 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vdivps(ymm0, ymm6, ymm6) // ymm6 
/= alpha11 vdivps(ymm0, ymm7, ymm7) // ymm7 /= alpha11 #endif - + vmovups(ymm6, mem(rcx)) // store ( beta10..beta17 ) = ymm6 vmovups(ymm7, mem(rdx)) // store ( beta18..beta1F ) = ymm7 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 5 ------------- - + vbroadcastss(mem(0+5*6)*4(rax), ymm0) // ymm0 = alpha05 vbroadcastss(mem(0+4*6)*4(rax), ymm1) // ymm1 = alpha04 - + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha05 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha05 * ymm15 - + vbroadcastss(mem(0+3*6)*4(rax), ymm0) // ymm0 = alpha03 - + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha04 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha04 * ymm13 - + vbroadcastss(mem(0+2*6)*4(rax), ymm1) // ymm1 = alpha02 - + vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha03 * ymm10 vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha03 * ymm11 - + vbroadcastss(mem(0+1*6)*4(rax), ymm0) // ymm0 = alpha01 - + vfmadd231ps(ymm1, ymm8, ymm2) // ymm2 += alpha02 * ymm8 vfmadd231ps(ymm1, ymm9, ymm3) // ymm3 += alpha02 * ymm9 - + vbroadcastss(mem(0+0*6)*4(rax), ymm1) // ymm1 = (1/alpha00) - + vfmadd231ps(ymm0, ymm6, ymm2) // ymm2 += alpha01 * ymm6 vfmadd231ps(ymm0, ymm7, ymm3) // ymm3 += alpha01 * ymm7 - + vsubps(ymm2, ymm4, ymm4) // ymm4 -= ymm2 vsubps(ymm3, ymm5, ymm5) // ymm5 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulps(ymm1, ymm5, ymm5) // ymm5 *= (1/alpha00) @@ -556,189 +556,189 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vdivps(ymm1, ymm4, ymm4) // ymm4 /= alpha00 vdivps(ymm1, ymm5, ymm5) // ymm5 /= alpha00 #endif - + vmovups(ymm4, mem(rcx)) // store ( beta00..beta07 ) = ymm4 vmovups(ymm5, mem(rdx)) // store ( beta08..beta0F ) = ymm5 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - - - - - + + + + + mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 - + lea(mem(rcx, rsi, 8), rdx) // load address of c11 + 8*cs_c; 
lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; - + // These are used in the macros below. lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - - - + + + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. jz(.SROWSTORED) // jump to row storage case - - - + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - - + + + // if neither row- or column- // stored, use general case. label(.SGENSTORED) - - + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - - + + mov(rdx, rcx) // rcx = c11 + 8*cs_c - - + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - - - + + + jmp(.SDONE) - - - + + + label(.SROWSTORED) - - + + vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm5, mem(rdx)) add(rdi, rdx) - + vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm7, mem(rdx)) add(rdi, rdx) - + vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm9, mem(rdx)) add(rdi, rdx) - + vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm11, mem(rdx)) add(rdi, rdx) - + vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm13, mem(rdx)) add(rdi, rdx) - + vmovups(ymm14, mem(rcx)) 
//add(rdi, rcx) vmovups(ymm15, mem(rdx)) //add(rdi, rdx) - - + + jmp(.SDONE) - - - + + + label(.SCOLSTORED) - - + + vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - + + vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm14, ymm12, ymm0) vunpckhps(ymm14, ymm12, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) @@ -747,46 +747,46 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - + + vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) 
vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx)) // store ( gamma08..gamma38 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma09..gamma39 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma0C..gamma3C ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma0D..gamma3D ) - + vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma0A..gamma3A ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma0B..gamma3B ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma0E..gamma3E ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma0F..gamma3F ) - + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + vunpcklps(ymm15, ymm13, ymm0) vunpckhps(ymm15, ymm13, ymm1) - + vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovlpd(xmm0, mem(r14)) // store ( gamma48..gamma58 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma49..gamma59 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma4A..gamma5A ) @@ -795,32 +795,34 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma4D..gamma5D ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma4E..gamma5E ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma4F..gamma5F ) - + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - - + + + + label(.SDONE) - + vzeroupper() - + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a10] "m" (a10), // 2 - [b01] "m" (b01), // 3 - [beta] "m" (beta), // 4 - [alpha] "m" (alpha), // 5 - [a11] "m" (a11), // 6 - [b11] "m" (b11), // 7 - [c11] "m" (c11), // 8 - [rs_c] "m" (rs_c), // 9 - [cs_c] "m" (cs_c) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a10] "m" (a10), // 2 + [b01] "m" (b01), // 3 + [beta] "m" (beta), // 4 + [alpha] "m" (alpha), 
// 5 + [a11] "m" (a11), // 6 + [b11] "m" (b11), // 7 + [c11] "m" (c11), // 8 + [rs_c] "m" (rs_c), // 9 + [cs_c] "m" (cs_c) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -845,17 +847,17 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemmtrsm_u_haswell_asm_6x8 -( - dim_t k0, - double* restrict alpha, - double* restrict a10, - double* restrict a11, - double* restrict b01, - double* restrict b11, - double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx -) + ( + dim_t k0, + double* restrict alpha, + double* restrict a10, + double* restrict a11, + double* restrict b01, + double* restrict b11, + double* restrict c11, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); @@ -869,23 +871,23 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 double* beta = bli_dm1; - begin_asm() - + begin_asm() + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. - + add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - + mov(var(b11), rcx) // load address of b11 mov(imm(8), rdi) // set rs_b = PACKNR = 8 lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) - + // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. @@ -894,97 +896,99 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) - + // iteration 1 + prefetch(0, mem(rax, 72*8)) + vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) - + // iteration 2 - prefetch(0, mem(rax, 76*8)) - + prefetch(0, mem(rax, 80*8)) + vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + 
vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) - + // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) @@ -992,145 +996,145 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) - + vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + // ymm4..ymm15 = -a10 * b01 - - - - + + + + mov(var(alpha), rbx) // load address of alpha vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate - - - - + + + + mov(imm(1), rsi) // set cs_b = 1 lea(mem(, rsi, 8), rsi) // cs_b *= sizeof(double) - + lea(mem(rcx, rsi, 4), rdx) // load address of b11 + 4*cs_b - + mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+4*cs_b for later - - + + // b11 := alpha * b11 - a10 * b01 vfmsub231pd(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm5) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm7) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm9) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm11) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm13) add(rdi, rdx) - + vfmsub231pd(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm15) //add(rdi, rdx) - - - + + + // 
prefetch c11 - + #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(double) - + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; - + prefetch(0, mem(rcx, 7*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 7*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 7*8)) // prefetch c11 + 2*rs_c @@ -1138,12 +1142,12 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 prefetch(0, mem(rdx, r9, 1, 7*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 7*8)) // prefetch c11 + 5*rs_c #endif - - - - + + + + // trsm computation begins here - + // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) @@ -1151,23 +1155,23 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 // ymm10 ymm11 = ( beta30..33 ) ( beta34..37 ) // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) - - + + mov(var(a11), rax) // load address of a11 - + mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+4*cs_b - + lea(mem(rcx, rdi, 4), rcx) // rcx = b11 + (6-1)*rs_b lea(mem(rcx, rdi, 1), rcx) lea(mem(rdx, rdi, 4), rdx) // rdx = b11 + (6-1)*rs_b + 4*cs_b lea(mem(rdx, rdi, 1), rdx) - - + + // iteration 0 ------------- - + vbroadcastsd(mem(5+5*6)*8(rax), ymm0) // ymm0 = (1/alpha55) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulpd(ymm0, ymm15, ymm15) // ymm15 *= (1/alpha55) @@ -1175,23 +1179,23 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vdivpd(ymm0, ymm14, ymm14) // ymm14 /= alpha55 vdivpd(ymm0, ymm15, ymm15) // ymm15 /= alpha55 #endif - + vmovupd(ymm14, mem(rcx)) // store ( beta50..beta53 ) = ymm14 vmovupd(ymm15, mem(rdx)) // store ( beta54..beta57 ) = ymm15 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 1 ------------- - + vbroadcastsd(mem(4+5*6)*8(rax), ymm0) // ymm0 = alpha45 vbroadcastsd(mem(4+4*6)*8(rax), ymm1) // ymm1 = 
(1/alpha44) - + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha45 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha45 * ymm15 - + vsubpd(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubpd(ymm3, ymm13, ymm13) // ymm13 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulpd(ymm1, ymm13, ymm13) // ymm13 *= (1/alpha44) @@ -1199,28 +1203,28 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vdivpd(ymm1, ymm12, ymm12) // ymm12 /= alpha44 vdivpd(ymm1, ymm13, ymm13) // ymm13 /= alpha44 #endif - + vmovupd(ymm12, mem(rcx)) // store ( beta40..beta43 ) = ymm12 vmovupd(ymm13, mem(rdx)) // store ( beta44..beta47 ) = ymm13 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 2 ------------- - + vbroadcastsd(mem(3+5*6)*8(rax), ymm0) // ymm0 = alpha35 vbroadcastsd(mem(3+4*6)*8(rax), ymm1) // ymm1 = alpha34 - + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha35 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha35 * ymm15 - + vbroadcastsd(mem(3+3*6)*8(rax), ymm0) // ymm0 = (1/alpha33) - + vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha34 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha34 * ymm13 - + vsubpd(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubpd(ymm3, ymm11, ymm11) // ymm11 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulpd(ymm0, ymm11, ymm11) // ymm11 *= (1/alpha33) @@ -1228,33 +1232,33 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vdivpd(ymm0, ymm10, ymm10) // ymm10 /= alpha33 vdivpd(ymm0, ymm11, ymm11) // ymm11 /= alpha33 #endif - + vmovupd(ymm10, mem(rcx)) // store ( beta30..beta33 ) = ymm10 vmovupd(ymm11, mem(rdx)) // store ( beta34..beta37 ) = ymm11 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 3 ------------- - + vbroadcastsd(mem(2+5*6)*8(rax), ymm0) // ymm0 = alpha25 vbroadcastsd(mem(2+4*6)*8(rax), ymm1) // ymm1 = alpha24 - + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha25 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha25 * ymm15 - + 
vbroadcastsd(mem(2+3*6)*8(rax), ymm0) // ymm0 = alpha23 - + vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha24 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha24 * ymm13 - + vbroadcastsd(mem(2+2*6)*8(rax), ymm1) // ymm1 = (1/alpha22) - + vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha23 * ymm10 vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha23 * ymm11 - + vsubpd(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubpd(ymm3, ymm9, ymm9) // ymm9 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulpd(ymm1, ymm9, ymm9) // ymm9 *= (1/alpha22) @@ -1262,38 +1266,38 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vdivpd(ymm1, ymm8, ymm8) // ymm8 /= alpha22 vdivpd(ymm1, ymm9, ymm9) // ymm9 /= alpha22 #endif - + vmovupd(ymm8, mem(rcx)) // store ( beta20..beta23 ) = ymm8 vmovupd(ymm9, mem(rdx)) // store ( beta24..beta27 ) = ymm9 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 4 ------------- - + vbroadcastsd(mem(1+5*6)*8(rax), ymm0) // ymm0 = alpha15 vbroadcastsd(mem(1+4*6)*8(rax), ymm1) // ymm1 = alpha14 - + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha15 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha15 * ymm15 - + vbroadcastsd(mem(1+3*6)*8(rax), ymm0) // ymm0 = alpha13 - + vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha14 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha14 * ymm13 - + vbroadcastsd(mem(1+2*6)*8(rax), ymm1) // ymm1 = alpha12 - + vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha13 * ymm10 vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha13 * ymm11 - + vbroadcastsd(mem(1+1*6)*8(rax), ymm0) // ymm4 = (1/alpha11) - + vfmadd231pd(ymm1, ymm8, ymm2) // ymm2 += alpha12 * ymm8 vfmadd231pd(ymm1, ymm9, ymm3) // ymm3 += alpha12 * ymm9 - + vsubpd(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubpd(ymm3, ymm7, ymm7) // ymm7 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulpd(ymm0, ymm7, ymm7) // ymm7 *= (1/alpha11) @@ -1301,43 +1305,43 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 
vdivpd(ymm0, ymm6, ymm6) // ymm6 /= alpha11 vdivpd(ymm0, ymm7, ymm7) // ymm7 /= alpha11 #endif - + vmovupd(ymm6, mem(rcx)) // store ( beta10..beta13 ) = ymm6 vmovupd(ymm7, mem(rdx)) // store ( beta14..beta17 ) = ymm7 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - + // iteration 5 ------------- - + vbroadcastsd(mem(0+5*6)*8(rax), ymm0) // ymm0 = alpha05 vbroadcastsd(mem(0+4*6)*8(rax), ymm1) // ymm1 = alpha04 - + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha05 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha05 * ymm15 - + vbroadcastsd(mem(0+3*6)*8(rax), ymm0) // ymm0 = alpha03 - + vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha04 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha04 * ymm13 - + vbroadcastsd(mem(0+2*6)*8(rax), ymm1) // ymm1 = alpha02 - + vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha03 * ymm10 vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha03 * ymm11 - + vbroadcastsd(mem(0+1*6)*8(rax), ymm0) // ymm0 = alpha01 - + vfmadd231pd(ymm1, ymm8, ymm2) // ymm2 += alpha02 * ymm8 vfmadd231pd(ymm1, ymm9, ymm3) // ymm3 += alpha02 * ymm9 - + vbroadcastsd(mem(0+0*6)*8(rax), ymm1) // ymm1 = (1/alpha00) - + vfmadd231pd(ymm0, ymm6, ymm2) // ymm2 += alpha01 * ymm6 vfmadd231pd(ymm0, ymm7, ymm3) // ymm3 += alpha01 * ymm7 - + vsubpd(ymm2, ymm4, ymm4) // ymm4 -= ymm2 vsubpd(ymm3, ymm5, ymm5) // ymm5 -= ymm3 - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulpd(ymm1, ymm5, ymm5) // ymm5 *= (1/alpha00) @@ -1345,150 +1349,150 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vdivpd(ymm1, ymm4, ymm4) // ymm4 /= alpha00 vdivpd(ymm1, ymm5, ymm5) // ymm5 /= alpha00 #endif - + vmovupd(ymm4, mem(rcx)) // store ( beta00..beta03 ) = ymm4 vmovupd(ymm5, mem(rdx)) // store ( beta04..beta07 ) = ymm5 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b - - - - + + + + mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 - + lea(mem(rcx, rsi, 4), rdx) // load 
address of c11 + 4*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; - + // These are used in the macros below. lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - - - + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.DROWSTORED) // jump to row storage case - - - + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - - + + + // if neither row- or column- // stored, use general case. label(.DGENSTORED) - - + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - + + mov(rdx, rcx) // rcx = c11 + 4*cs_c - - + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; - - + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - - + + jmp(.DDONE) - - - + + + label(.DROWSTORED) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm7, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm9, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm11, mem(rdx)) add(rdi, rdx) - + vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vmovupd(ymm13, mem(rdx)) add(rdi, rdx) - + 
vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) vmovupd(ymm15, mem(rdx)) //add(rdi, rdx) - - + + jmp(.DDONE) - - - + + + label(.DCOLSTORED) - - + + vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) @@ -1497,27 +1501,27 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) - + vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) - + lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) - + lea(mem(r14, rsi, 4), r14) - - + + vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) @@ -1526,34 +1530,35 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) - + vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) - + //lea(mem(rcx, rsi, 4), rcx) - + vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) - + vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) - + //lea(mem(r14, rsi, 4), r14) - - - - - + + + + + label(.DDONE) - + vzeroupper() - + + end_asm( : // output operands (none) From 84f9dcd449fa7a4cf4087fca8ec4ca0d10e9b801 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 13 Jul 2021 16:45:44 -0500 Subject: [PATCH 041/226] Remove unnecessary windows/zen2 directory.
--- windows/zen2/bli_config.h | 180 - windows/zen2/blis.h | 44714 ------------------------------------ 2 files changed, 44894 deletions(-) delete mode 100644 windows/zen2/bli_config.h delete mode 100644 windows/zen2/blis.h diff --git a/windows/zen2/bli_config.h b/windows/zen2/bli_config.h deleted file mode 100644 index 600fc24f9c..0000000000 --- a/windows/zen2/bli_config.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_CONFIG_H -#define BLIS_CONFIG_H - -// Enabled configuration "family" (config_name) -#define BLIS_FAMILY_ZEN2 - - -// Enabled sub-configurations (config_list) -#define BLIS_CONFIG_ZEN2 - - -// Enabled kernel sets (kernel_list) -#define BLIS_KERNELS_ZEN2 -#define BLIS_KERNELS_ZEN -#define BLIS_KERNELS_HASWELL - - -//This macro is enabled only for ZEN family configurations. -//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes. 
-#if 1 -#define AOCL_BLIS_ZEN -#endif - -#if 0 -#define BLIS_ENABLE_OPENMP -#endif - -#if 0 -#define BLIS_ENABLE_PTHREADS -#endif - -#if 1 -#define BLIS_ENABLE_JRIR_SLAB -#endif - -#if 0 -#define BLIS_ENABLE_JRIR_RR -#endif - -#if 1 -#define BLIS_ENABLE_PBA_POOLS -#else -#define BLIS_DISABLE_PBA_POOLS -#endif - -#if 1 -#define BLIS_ENABLE_SBA_POOLS -#else -#define BLIS_DISABLE_SBA_POOLS -#endif - -#if 0 -#define BLIS_ENABLE_MEM_TRACING -#else -#define BLIS_DISABLE_MEM_TRACING -#endif - -#if 0 == 64 -#define BLIS_INT_TYPE_SIZE 64 -#elif 0 == 32 -#define BLIS_INT_TYPE_SIZE 32 -#else -// determine automatically -#endif - -#if 32 == 64 -#define BLIS_BLAS_INT_TYPE_SIZE 64 -#elif 32 == 32 -#define BLIS_BLAS_INT_TYPE_SIZE 32 -#else -// determine automatically -#endif - -#ifndef BLIS_ENABLE_BLAS -#ifndef BLIS_DISABLE_BLAS -#if 1 -#define BLIS_ENABLE_BLAS -#else -#define BLIS_DISABLE_BLAS -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_CBLAS -#ifndef BLIS_DISABLE_CBLAS -#if 0 -#define BLIS_ENABLE_CBLAS -#else -#define BLIS_DISABLE_CBLAS -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_MIXED_DT -#ifndef BLIS_DISABLE_MIXED_DT -#if 1 -#define BLIS_ENABLE_MIXED_DT -#else -#define BLIS_DISABLE_MIXED_DT -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#if 1 -#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#else -#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#endif -#endif -#endif - -#if 1 -#define BLIS_ENABLE_SUP_HANDLING -#else -#define BLIS_DISABLE_SUP_HANDLING -#endif - -#if 0 -#define BLIS_ENABLE_MEMKIND -#else -#define BLIS_DISABLE_MEMKIND -#endif - -#if 1 -#define BLIS_ENABLE_PRAGMA_OMP_SIMD -#else -#define BLIS_DISABLE_PRAGMA_OMP_SIMD -#endif - -#if 0 -#define BLIS_ENABLE_SANDBOX -#else -#define BLIS_DISABLE_SANDBOX -#endif - -#if 1 -#define BLIS_ENABLE_SHARED -#else -#define BLIS_DISABLE_SHARED -#endif - - -#endif diff --git a/windows/zen2/blis.h b/windows/zen2/blis.h deleted file mode 100644 index b38b39468c..0000000000 --- 
a/windows/zen2/blis.h +++ /dev/null @@ -1,44714 +0,0 @@ - - -#ifndef BLIS_H -#define BLIS_H - - -// Allow C++ users to include this header file in their source code. However, -// we make the extern "C" conditional on whether we're using a C++ compiler, -// since regular C compilers don't understand the extern "C" construct. -#ifdef __cplusplus -extern "C" { -#endif - -// NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS -// YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. - -// -- System headers -- -// NOTE: This header must be included before bli_config_macro_defs.h. - -// begin bli_system.h - - -#ifndef BLIS_SYSTEM_H -#define BLIS_SYSTEM_H - -// NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that -// various parts of POSIX are defined and made available. -#ifndef _POSIX_C_SOURCE -#define _POSIX_C_SOURCE 200809L -#endif - -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped - -// Determine the compiler (hopefully) and define conveniently named macros -// accordingly. -#if defined(__ICC) || defined(__INTEL_COMPILER) - #define BLIS_ICC -#elif defined(__clang__) - #define BLIS_CLANG -#elif defined(__GNUC__) - #define BLIS_GCC -#endif - -// Determine if we are on a 64-bit or 32-bit architecture. -#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ - defined(_ARCH_PPC64) - #define BLIS_ARCH_64 -#else - #define BLIS_ARCH_32 -#endif - -// Determine the target operating system. 
-#if defined(_WIN32) || defined(__CYGWIN__) - #define BLIS_VERSION_STRING "2.2.1" - #define BLIS_OS_WINDOWS 1 -#elif defined(__gnu_hurd__) - #define BLIS_OS_GNU 1 -#elif defined(__APPLE__) || defined(__MACH__) - #define BLIS_OS_OSX 1 -#elif defined(__ANDROID__) - #define BLIS_OS_ANDROID 1 -#elif defined(__linux__) - #define BLIS_OS_LINUX 1 -#elif defined(__bgq__) - #define BLIS_OS_BGQ 1 -#elif defined(__bg__) - #define BLIS_OS_BGP 1 -#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__bsdi__) || defined(__DragonFly__) || \ - defined(__FreeBSD_kernel__) || defined(__HAIKU__) - #define BLIS_OS_BSD 1 -#elif defined(EMSCRIPTEN) - #define BLIS_OS_EMSCRIPTEN -#else - #error "Cannot determine operating system" -#endif - -// A few changes that may be necessary in Windows environments. -#if BLIS_OS_WINDOWS - - // Include Windows header file. - #define WIN32_LEAN_AND_MEAN - #define VC_EXTRALEAN -#include // skipped - - #if !defined(__clang__) && !defined(__GNUC__) - // Undefine attribute specifiers in Windows. - #define __attribute__(x) - - // Undefine restrict. - #define restrict - #endif - -#endif - -// time.h provides clock_gettime(). -#if BLIS_OS_WINDOWS -#include // skipped -#elif BLIS_OS_OSX -#include // skipped -#else - //#include - -#include // skipped -#endif - -// POSIX threads are unconditionally required, regardless of whether -// multithreading is enabled via pthreads or OpenMP (or disabled). -// If pthreads is not available (Windows), then fake it. 
-//#include "bli_pthread_wrap.h" - - -#endif -// end bli_system.h - - -// -- configure definitions -- - -// begin bli_config.h - - -#ifndef BLIS_CONFIG_H -#define BLIS_CONFIG_H - -// Enabled configuration "family" (config_name) -#define BLIS_FAMILY_ZEN2 - - -// Enabled sub-configurations (config_list) -#define BLIS_CONFIG_ZEN2 - - -// Enabled kernel sets (kernel_list) -#define BLIS_KERNELS_ZEN2 -#define BLIS_KERNELS_ZEN -#define BLIS_KERNELS_HASWELL - - -//This macro is enabled only for ZEN family configurations. -//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes. -#if 1 -#define AOCL_BLIS_ZEN -#endif - -#if 0 -#define BLIS_ENABLE_OPENMP -#endif - -#if 0 -#define BLIS_ENABLE_PTHREADS -#endif - -#if 1 -#define BLIS_ENABLE_JRIR_SLAB -#endif - -#if 0 -#define BLIS_ENABLE_JRIR_RR -#endif - -#if 1 -#define BLIS_ENABLE_PBA_POOLS -#else -#define BLIS_DISABLE_PBA_POOLS -#endif - -#if 1 -#define BLIS_ENABLE_SBA_POOLS -#else -#define BLIS_DISABLE_SBA_POOLS -#endif - -#if 0 -#define BLIS_ENABLE_MEM_TRACING -#else -#define BLIS_DISABLE_MEM_TRACING -#endif - -#if 0 == 64 -#define BLIS_INT_TYPE_SIZE 64 -#elif 0 == 32 -#define BLIS_INT_TYPE_SIZE 32 -#else -// determine automatically -#endif - -#if 32 == 64 -#define BLIS_BLAS_INT_TYPE_SIZE 64 -#elif 32 == 32 -#define BLIS_BLAS_INT_TYPE_SIZE 32 -#else -// determine automatically -#endif - -#ifndef BLIS_ENABLE_BLAS -#ifndef BLIS_DISABLE_BLAS -#if 1 -#define BLIS_ENABLE_BLAS -#else -#define BLIS_DISABLE_BLAS -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_CBLAS -#ifndef BLIS_DISABLE_CBLAS -#if 0 -#define BLIS_ENABLE_CBLAS -#else -#define BLIS_DISABLE_CBLAS -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_MIXED_DT -#ifndef BLIS_DISABLE_MIXED_DT -#if 1 -#define BLIS_ENABLE_MIXED_DT -#else -#define BLIS_DISABLE_MIXED_DT -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#if 1 -#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#else 
-#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#endif -#endif -#endif - -#if 1 -#define BLIS_ENABLE_SUP_HANDLING -#else -#define BLIS_DISABLE_SUP_HANDLING -#endif - -#if 0 -#define BLIS_ENABLE_MEMKIND -#else -#define BLIS_DISABLE_MEMKIND -#endif - -#if 1 -#define BLIS_ENABLE_PRAGMA_OMP_SIMD -#else -#define BLIS_DISABLE_PRAGMA_OMP_SIMD -#endif - -#if 0 -#define BLIS_ENABLE_SANDBOX -#else -#define BLIS_DISABLE_SANDBOX -#endif - -#if 1 -#define BLIS_ENABLE_SHARED -#else -#define BLIS_DISABLE_SHARED -#endif - - -#endif -// end bli_config.h -// begin bli_config_macro_defs.h - - -#ifndef BLIS_CONFIG_MACRO_DEFS_H -#define BLIS_CONFIG_MACRO_DEFS_H - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#ifndef BLIS_INT_TYPE_SIZE - #ifdef BLIS_ARCH_64 - #define BLIS_INT_TYPE_SIZE 64 - #else - #define BLIS_INT_TYPE_SIZE 32 - #endif -#endif - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. -#ifdef BLIS_ENABLE_C99_COMPLEX - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - - -// -- MULTITHREADING ----------------------------------------------------------- - -// Enable multithreading via POSIX threads. -#ifdef BLIS_ENABLE_PTHREADS - // No additional definitions needed. 
-#else - // Default behavior is disabled. -#endif - -// Enable multithreading via OpenMP. -#ifdef BLIS_ENABLE_OPENMP - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - -// Perform a sanity check to make sure the user doesn't try to enable -// both OpenMP and pthreads. -#if defined ( BLIS_ENABLE_OPENMP ) && \ - defined ( BLIS_ENABLE_PTHREADS ) - #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." -#endif - -// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP -// or pthreads are enabled. This macro is useful in situations when -// we want to detect use of either OpenMP or pthreads (as opposed -// to neither being used). -#if defined ( BLIS_ENABLE_OPENMP ) || \ - defined ( BLIS_ENABLE_PTHREADS ) - #define BLIS_ENABLE_MULTITHREADING -#endif - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Enable mixed datatype support? -#ifdef BLIS_DISABLE_MIXED_DT - #undef BLIS_ENABLE_GEMM_MD -#else - // Default behavior is enabled. - #define BLIS_ENABLE_GEMM_MD -#endif - -// Enable memory-intensive optimizations for mixed datatype support? -#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM - #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM -#else - // Default behavior is enabled. - #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM -#endif - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Do NOT require the cross-blocksize constraints. That is, do not enforce -// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY -// needed when implementing trsm_r by allowing the right-hand matrix B to -// be triangular. -#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS - #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS -#endif - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). 
-#ifdef BLIS_DISABLE_STAY_AUTO_INITIALIZED - #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED -#else - // Default behavior is enabled. - #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED // In case user explicitly enabled. - #define BLIS_ENABLE_STAY_AUTO_INITIALIZED -#endif - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#ifdef BLIS_DISABLE_BLAS - #undef BLIS_ENABLE_BLAS -#else - // Default behavior is enabled. - #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. - #define BLIS_ENABLE_BLAS -#endif - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#ifndef BLIS_BLAS_INT_TYPE_SIZE - #define BLIS_BLAS_INT_TYPE_SIZE 32 -#endif - -// By default, the level-3 BLAS routines are implemented by directly calling -// the BLIS object API. Alternatively, they may first call the typed BLIS -// API, which will then call the object API. -//#define BLIS_BLAS3_CALLS_TAPI -#ifdef BLIS_BLAS3_CALLS_TAPI - #undef BLIS_BLAS3_CALLS_OAPI -#else - // Default behavior is to call object API directly. - #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. - #define BLIS_BLAS3_CALLS_OAPI -#endif - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. 
Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). -#ifdef BLIS_ENABLE_CBLAS - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - - -// -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- - -// When building shared libraries, we can control which symbols are exported for -// linking by external applications. BLIS annotates all function prototypes that -// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing -// a similar role for BLAS compatibility routines). Which symbols are exported -// is controlled by the default symbol visibility, as specifed by the gcc option -// -fvisibility=[default|hidden]. The default for this option is 'default', or, -// "public", which, if allowed to stand, causes all symbols in BLIS to be -// linkable from the outside. But when compiling with -fvisibility=hidden, all -// symbols start out hidden (that is, restricted only for internal use by BLIS), -// with that setting overridden only for function prototypes or variable -// declarations that are annotated with BLIS_EXPORT_BLIS. 
- -#ifndef BLIS_EXPORT - #if !defined(BLIS_ENABLE_SHARED) - #define BLIS_EXPORT - #else - #if defined(_WIN32) || defined(__CYGWIN__) - #ifdef BLIS_IS_BUILDING_LIBRARY - #define BLIS_EXPORT __declspec(dllexport) - #else - #define BLIS_EXPORT __declspec(dllimport) - #endif - #elif defined(__GNUC__) && __GNUC__ >= 4 - #define BLIS_EXPORT __attribute__ ((visibility ("default"))) - #else - #define BLIS_EXPORT - #endif - #endif -#endif - -#define BLIS_EXPORT_BLIS BLIS_EXPORT -#define BLIS_EXPORT_BLAS BLIS_EXPORT - - -#endif - -// end bli_config_macro_defs.h - - -// -- Common BLIS definitions -- - -// begin bli_type_defs.h - - -#ifndef BLIS_TYPE_DEFS_H -#define BLIS_TYPE_DEFS_H - - -// -// -- BLIS basic types --------------------------------------------------------- -// - -#ifdef __cplusplus - // For C++, include stdint.h. -#include // skipped -#elif __STDC_VERSION__ >= 199901L - // For C99 (or later), include stdint.h. -#include // skipped -#else - // When stdint.h is not available, manually typedef the types we will use. - #ifdef _WIN32 - typedef __int32 int32_t; - typedef unsigned __int32 uint32_t; - typedef __int64 int64_t; - typedef unsigned __int64 uint64_t; - #else - #error "Attempting to compile on pre-C99 system without stdint.h." - #endif -#endif - -// -- General-purpose integers -- - -// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. -// NOTE: This cpp guard will only meaningfully change BLIS's behavior on -// systems where the BLIS integer size would have been automatically selected -// to be 32 bits, since explicit selection of 32 bits is prohibited at -// configure-time (and explicit or automatic selection of 64 bits is fine -// and would have had the same result). -#if BLIS_BLAS_INT_SIZE == 64 - #undef BLIS_INT_TYPE_SIZE - #define BLIS_INT_TYPE_SIZE 64 -#endif - -// Define integer types depending on what size integer was requested. 
-#if BLIS_INT_TYPE_SIZE == 32 -typedef int32_t gint_t; -typedef uint32_t guint_t; -#elif BLIS_INT_TYPE_SIZE == 64 -typedef int64_t gint_t; -typedef uint64_t guint_t; -#else -typedef signed long int gint_t; -typedef unsigned long int guint_t; -#endif - -// -- Boolean type -- - -typedef gint_t bool_t; - - -// -- Boolean values -- - -#ifndef TRUE - #define TRUE 1 -#endif - -#ifndef FALSE - #define FALSE 0 -#endif - - -// -- Special-purpose integers -- - -// This cpp guard provides a temporary hack to allow libflame -// interoperability with BLIS. -#ifndef _DEFINED_DIM_T -#define _DEFINED_DIM_T -typedef gint_t dim_t; // dimension type -#endif -typedef gint_t inc_t; // increment/stride type -typedef gint_t doff_t; // diagonal offset type -typedef guint_t siz_t; // byte size type -typedef uint32_t objbits_t; // object information bit field - -// -- Real types -- - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// There are some places where we need to use sizeof() inside of a C -// preprocessor #if conditional, and so here we define the various sizes -// for those purposes. -#define BLIS_SIZEOF_S 4 // sizeof(float) -#define BLIS_SIZEOF_D 8 // sizeof(double) -#define BLIS_SIZEOF_C 8 // sizeof(scomplex) -#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) - -// -- Complex types -- - -#ifdef BLIS_ENABLE_C99_COMPLEX - - #if __STDC_VERSION__ >= 199901L -#include // skipped - - // Typedef official complex types to BLIS complex type names. - typedef float complex scomplex; - typedef double complex dcomplex; - #else - #error "Configuration requested C99 complex types, but C99 does not appear to be supported." - #endif - -#else // ifndef BLIS_ENABLE_C99_COMPLEX - - // This cpp guard provides a temporary hack to allow libflame - // interoperability with BLIS. 
- #ifndef _DEFINED_SCOMPLEX - #define _DEFINED_SCOMPLEX - typedef struct - { - float real; - float imag; - } scomplex; - #endif - - // This cpp guard provides a temporary hack to allow libflame - // interoperability with BLIS. - #ifndef _DEFINED_DCOMPLEX - #define _DEFINED_DCOMPLEX - typedef struct - { - double real; - double imag; - } dcomplex; - #endif - -#endif // BLIS_ENABLE_C99_COMPLEX - -// -- Atom type -- - -// Note: atom types are used to hold "bufferless" scalar object values. Note -// that it needs to be as large as the largest possible scalar value we might -// want to hold. Thus, for now, it is a dcomplex. -typedef dcomplex atom_t; - -// -- Fortran-77 types -- - -// Note: These types are typically only used by BLAS compatibility layer, but -// we must define them even when the compatibility layer isn't being built -// because they also occur in bli_slamch() and bli_dlamch(). - -// Define f77_int depending on what size of integer was requested. -#if BLIS_BLAS_INT_TYPE_SIZE == 32 -typedef int32_t f77_int; -#elif BLIS_BLAS_INT_TYPE_SIZE == 64 -typedef int64_t f77_int; -#else -typedef long int f77_int; -#endif - -typedef char f77_char; -typedef float f77_float; -typedef double f77_double; -typedef scomplex f77_scomplex; -typedef dcomplex f77_dcomplex; - -// -- Void function pointer types -- - -// Note: This type should be used in any situation where the address of a -// *function* will be conveyed or stored prior to it being typecast back -// to the correct function type. It does not need to be used when conveying -// or storing the address of *data* (such as an array of float or double). 
- -//typedef void (*void_fp)( void ); -typedef void* void_fp; - - -// -// -- BLIS info bit field offsets ---------------------------------------------- -// - - - -// info -#define BLIS_DATATYPE_SHIFT 0 -#define BLIS_DOMAIN_SHIFT 0 -#define BLIS_PRECISION_SHIFT 1 -#define BLIS_CONJTRANS_SHIFT 3 -#define BLIS_TRANS_SHIFT 3 -#define BLIS_CONJ_SHIFT 4 -#define BLIS_UPLO_SHIFT 5 -#define BLIS_UPPER_SHIFT 5 -#define BLIS_DIAG_SHIFT 6 -#define BLIS_LOWER_SHIFT 7 -#define BLIS_UNIT_DIAG_SHIFT 8 -#define BLIS_INVERT_DIAG_SHIFT 9 -#define BLIS_TARGET_DT_SHIFT 10 -#define BLIS_TARGET_DOMAIN_SHIFT 10 -#define BLIS_TARGET_PREC_SHIFT 11 -#define BLIS_EXEC_DT_SHIFT 13 -#define BLIS_EXEC_DOMAIN_SHIFT 13 -#define BLIS_EXEC_PREC_SHIFT 14 -#define BLIS_PACK_SCHEMA_SHIFT 16 -#define BLIS_PACK_RC_SHIFT 16 -#define BLIS_PACK_PANEL_SHIFT 17 -#define BLIS_PACK_FORMAT_SHIFT 18 -#define BLIS_PACK_SHIFT 22 -#define BLIS_PACK_REV_IF_UPPER_SHIFT 23 -#define BLIS_PACK_REV_IF_LOWER_SHIFT 24 -#define BLIS_PACK_BUFFER_SHIFT 25 -#define BLIS_STRUC_SHIFT 27 -#define BLIS_COMP_DT_SHIFT 29 -#define BLIS_COMP_DOMAIN_SHIFT 29 -#define BLIS_COMP_PREC_SHIFT 30 - -// info2 -#define BLIS_SCALAR_DT_SHIFT 0 -#define BLIS_SCALAR_DOMAIN_SHIFT 0 -#define BLIS_SCALAR_PREC_SHIFT 1 - -// -// -- BLIS info bit field masks ------------------------------------------------ -// - -// info -#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) -#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) -#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) -#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) -#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) -#define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) -#define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) -#define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) -#define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) -#define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) -#define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) -#define BLIS_INVERT_DIAG_BIT ( 0x1 << 
BLIS_INVERT_DIAG_SHIFT ) -#define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) -#define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) -#define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) -#define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) -#define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) -#define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) -#define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) -#define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) -#define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) -#define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) -#define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) -#define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) -#define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) -#define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) -#define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) -#define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) - -// info2 -#define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) -#define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) -#define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) - - -// -// -- BLIS enumerated type value definitions ----------------------------------- -// - -#define BLIS_BITVAL_REAL 0x0 -#define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT -#define BLIS_BITVAL_SINGLE_PREC 0x0 -#define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT -#define BLIS_BITVAL_FLOAT_TYPE 0x0 -#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT -#define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT -#define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) -#define BLIS_BITVAL_INT_TYPE 0x04 -#define BLIS_BITVAL_CONST_TYPE 0x05 -#define BLIS_BITVAL_NO_TRANS 0x0 -#define BLIS_BITVAL_TRANS BLIS_TRANS_BIT -#define 
BLIS_BITVAL_NO_CONJ 0x0 -#define BLIS_BITVAL_CONJ BLIS_CONJ_BIT -#define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) -#define BLIS_BITVAL_ZEROS 0x0 -#define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) -#define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) -#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS -#define BLIS_BITVAL_NONUNIT_DIAG 0x0 -#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT -#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT -#define BLIS_BITVAL_NOT_PACKED 0x0 -#define BLIS_BITVAL_4MI ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_3MI ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_4MS ( 0x3 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_3MS ( 0x4 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) -#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) -#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | 
BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 -#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT -#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 -#define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT -#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 -#define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_BITVAL_GENERAL 0x0 -#define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) -#define BLIS_BITVAL_SYMMETRIC ( 0x2 << 
BLIS_STRUC_SHIFT ) -#define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) - - -// -// -- BLIS enumerated type definitions ----------------------------------------- -// - -// -- Operational parameter types -- - -typedef enum -{ - BLIS_NO_TRANSPOSE = 0x0, - BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, - BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, - BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS -} trans_t; - -typedef enum -{ - BLIS_NO_CONJUGATE = 0x0, - BLIS_CONJUGATE = BLIS_BITVAL_CONJ -} conj_t; - -typedef enum -{ - BLIS_ZEROS = BLIS_BITVAL_ZEROS, - BLIS_LOWER = BLIS_BITVAL_LOWER, - BLIS_UPPER = BLIS_BITVAL_UPPER, - BLIS_DENSE = BLIS_BITVAL_DENSE -} uplo_t; - -typedef enum -{ - BLIS_LEFT = 0x0, - BLIS_RIGHT -} side_t; - -typedef enum -{ - BLIS_NONUNIT_DIAG = 0x0, - BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG -} diag_t; - -typedef enum -{ - BLIS_NO_INVERT_DIAG = 0x0, - BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG -} invdiag_t; - -typedef enum -{ - BLIS_GENERAL = BLIS_BITVAL_GENERAL, - BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, - BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, - BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR -} struc_t; - - -// -- Data type -- - -typedef enum -{ - BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, - BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, - BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, - BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, - BLIS_INT = BLIS_BITVAL_INT_TYPE, - BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, - BLIS_DT_LO = BLIS_FLOAT, - BLIS_DT_HI = BLIS_DCOMPLEX -} num_t; - -typedef enum -{ - BLIS_REAL = BLIS_BITVAL_REAL, - BLIS_COMPLEX = BLIS_BITVAL_COMPLEX -} dom_t; - -typedef enum -{ - BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, - BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC -} prec_t; - - -// -- Pack schema type -- - -typedef enum -{ - BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, - BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, - BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, - BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, - BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, - 
BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, - BLIS_PACKED_ROW_PANELS_4MI = BLIS_BITVAL_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI = BLIS_BITVAL_PACKED_COL_PANELS_4MI, - BLIS_PACKED_ROW_PANELS_3MI = BLIS_BITVAL_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI = BLIS_BITVAL_PACKED_COL_PANELS_3MI, - BLIS_PACKED_ROW_PANELS_4MS = BLIS_BITVAL_PACKED_ROW_PANELS_4MS, - BLIS_PACKED_COL_PANELS_4MS = BLIS_BITVAL_PACKED_COL_PANELS_4MS, - BLIS_PACKED_ROW_PANELS_3MS = BLIS_BITVAL_PACKED_ROW_PANELS_3MS, - BLIS_PACKED_COL_PANELS_3MS = BLIS_BITVAL_PACKED_COL_PANELS_3MS, - BLIS_PACKED_ROW_PANELS_RO = BLIS_BITVAL_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO = BLIS_BITVAL_PACKED_COL_PANELS_RO, - BLIS_PACKED_ROW_PANELS_IO = BLIS_BITVAL_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, - BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI, - BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, - BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, - BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R -} pack_t; - -// We combine row and column packing into one "type", and we start -// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the -// schema pair for "4ms" (4m separated), because its bit value has -// been reserved, even though we don't use it. 
-#define BLIS_NUM_PACK_SCHEMA_TYPES 10 - - -// -- Pack order type -- - -typedef enum -{ - BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, - BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, - - BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, - BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER -} packord_t; - - -// -- Pack buffer type -- - -typedef enum -{ - BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, - BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, - BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, - BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE -} packbuf_t; - - -// -- Partitioning direction -- - -typedef enum -{ - BLIS_FWD, - BLIS_BWD -} dir_t; - - -// -- Subpartition type -- - -typedef enum -{ - BLIS_SUBPART0, - BLIS_SUBPART1, - BLIS_SUBPART2, - BLIS_SUBPART1AND0, - BLIS_SUBPART1AND2, - BLIS_SUBPART1A, - BLIS_SUBPART1B, - BLIS_SUBPART00, - BLIS_SUBPART10, - BLIS_SUBPART20, - BLIS_SUBPART01, - BLIS_SUBPART11, - BLIS_SUBPART21, - BLIS_SUBPART02, - BLIS_SUBPART12, - BLIS_SUBPART22 -} subpart_t; - - -// -- Matrix dimension type -- - -typedef enum -{ - BLIS_M = 0, - BLIS_N = 1 -} mdim_t; - - -// -- Machine parameter types -- - -typedef enum -{ - BLIS_MACH_EPS = 0, - BLIS_MACH_SFMIN, - BLIS_MACH_BASE, - BLIS_MACH_PREC, - BLIS_MACH_NDIGMANT, - BLIS_MACH_RND, - BLIS_MACH_EMIN, - BLIS_MACH_RMIN, - BLIS_MACH_EMAX, - BLIS_MACH_RMAX, - BLIS_MACH_EPS2 -} machval_t; - -#define BLIS_NUM_MACH_PARAMS 11 -#define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS -#define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 - - -// -- Induced method types -- - -typedef enum -{ - BLIS_3MH = 0, - BLIS_3M1, - BLIS_4MH, - BLIS_4M1B, - BLIS_4M1A, - BLIS_1M, - BLIS_NAT, - BLIS_IND_FIRST = 0, - BLIS_IND_LAST = BLIS_NAT -} ind_t; - -#define BLIS_NUM_IND_METHODS (BLIS_NAT+1) - -// These are used in bli_*_oapi.c to construct the ind_t values from -// the induced method substrings that go into function names. 
-#define bli_3mh BLIS_3MH -#define bli_3m1 BLIS_3M1 -#define bli_4mh BLIS_4MH -#define bli_4mb BLIS_4M1B -#define bli_4m1 BLIS_4M1A -#define bli_1m BLIS_1M -#define bli_nat BLIS_NAT - - -// -- Kernel ID types -- - -typedef enum -{ - BLIS_ADDV_KER = 0, - BLIS_AMAXV_KER, - BLIS_AXPBYV_KER, - BLIS_AXPYV_KER, - BLIS_COPYV_KER, - BLIS_DOTV_KER, - BLIS_DOTXV_KER, - BLIS_INVERTV_KER, - BLIS_SCALV_KER, - BLIS_SCAL2V_KER, - BLIS_SETV_KER, - BLIS_SUBV_KER, - BLIS_SWAPV_KER, - BLIS_XPBYV_KER -} l1vkr_t; - -#define BLIS_NUM_LEVEL1V_KERS 14 - - -typedef enum -{ - BLIS_AXPY2V_KER = 0, - BLIS_DOTAXPYV_KER, - BLIS_AXPYF_KER, - BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER -} l1fkr_t; - -#define BLIS_NUM_LEVEL1F_KERS 5 - - -typedef enum -{ - BLIS_PACKM_0XK_KER = 0, - BLIS_PACKM_1XK_KER = 1, - BLIS_PACKM_2XK_KER = 2, - BLIS_PACKM_3XK_KER = 3, - BLIS_PACKM_4XK_KER = 4, - BLIS_PACKM_5XK_KER = 5, - BLIS_PACKM_6XK_KER = 6, - BLIS_PACKM_7XK_KER = 7, - BLIS_PACKM_8XK_KER = 8, - BLIS_PACKM_9XK_KER = 9, - BLIS_PACKM_10XK_KER = 10, - BLIS_PACKM_11XK_KER = 11, - BLIS_PACKM_12XK_KER = 12, - BLIS_PACKM_13XK_KER = 13, - BLIS_PACKM_14XK_KER = 14, - BLIS_PACKM_15XK_KER = 15, - BLIS_PACKM_16XK_KER = 16, - BLIS_PACKM_17XK_KER = 17, - BLIS_PACKM_18XK_KER = 18, - BLIS_PACKM_19XK_KER = 19, - BLIS_PACKM_20XK_KER = 20, - BLIS_PACKM_21XK_KER = 21, - BLIS_PACKM_22XK_KER = 22, - BLIS_PACKM_23XK_KER = 23, - BLIS_PACKM_24XK_KER = 24, - BLIS_PACKM_25XK_KER = 25, - BLIS_PACKM_26XK_KER = 26, - BLIS_PACKM_27XK_KER = 27, - BLIS_PACKM_28XK_KER = 28, - BLIS_PACKM_29XK_KER = 29, - BLIS_PACKM_30XK_KER = 30, - BLIS_PACKM_31XK_KER = 31, - - BLIS_UNPACKM_0XK_KER = 0, - BLIS_UNPACKM_1XK_KER = 1, - BLIS_UNPACKM_2XK_KER = 2, - BLIS_UNPACKM_3XK_KER = 3, - BLIS_UNPACKM_4XK_KER = 4, - BLIS_UNPACKM_5XK_KER = 5, - BLIS_UNPACKM_6XK_KER = 6, - BLIS_UNPACKM_7XK_KER = 7, - BLIS_UNPACKM_8XK_KER = 8, - BLIS_UNPACKM_9XK_KER = 9, - BLIS_UNPACKM_10XK_KER = 10, - BLIS_UNPACKM_11XK_KER = 11, - BLIS_UNPACKM_12XK_KER = 12, - BLIS_UNPACKM_13XK_KER = 
13, - BLIS_UNPACKM_14XK_KER = 14, - BLIS_UNPACKM_15XK_KER = 15, - BLIS_UNPACKM_16XK_KER = 16, - BLIS_UNPACKM_17XK_KER = 17, - BLIS_UNPACKM_18XK_KER = 18, - BLIS_UNPACKM_19XK_KER = 19, - BLIS_UNPACKM_20XK_KER = 20, - BLIS_UNPACKM_21XK_KER = 21, - BLIS_UNPACKM_22XK_KER = 22, - BLIS_UNPACKM_23XK_KER = 23, - BLIS_UNPACKM_24XK_KER = 24, - BLIS_UNPACKM_25XK_KER = 25, - BLIS_UNPACKM_26XK_KER = 26, - BLIS_UNPACKM_27XK_KER = 27, - BLIS_UNPACKM_28XK_KER = 28, - BLIS_UNPACKM_29XK_KER = 29, - BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31 - -} l1mkr_t; - -#define BLIS_NUM_PACKM_KERS 32 -#define BLIS_NUM_UNPACKM_KERS 32 - - -typedef enum -{ - BLIS_GEMM_UKR = 0, - BLIS_GEMMTRSM_L_UKR, - BLIS_GEMMTRSM_U_UKR, - BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR -} l3ukr_t; - -#define BLIS_NUM_LEVEL3_UKRS 5 - - -typedef enum -{ - BLIS_REFERENCE_UKERNEL = 0, - BLIS_VIRTUAL_UKERNEL, - BLIS_OPTIMIZED_UKERNEL, - BLIS_NOTAPPLIC_UKERNEL -} kimpl_t; - -#define BLIS_NUM_UKR_IMPL_TYPES 4 - - -#if 0 -typedef enum -{ - // RV = row-stored, contiguous vector-loading - // RG = row-stored, non-contiguous gather-loading - // CV = column-stored, contiguous vector-loading - // CG = column-stored, non-contiguous gather-loading - - // RD = row-stored, dot-based - // CD = col-stored, dot-based - - // RC = row-stored, column-times-column - // CR = column-stored, row-times-row - - // GX = general-stored generic implementation - - BLIS_GEMMSUP_RV_UKR = 0, - BLIS_GEMMSUP_RG_UKR, - BLIS_GEMMSUP_CV_UKR, - BLIS_GEMMSUP_CG_UKR, - - BLIS_GEMMSUP_RD_UKR, - BLIS_GEMMSUP_CD_UKR, - - BLIS_GEMMSUP_RC_UKR, - BLIS_GEMMSUP_CR_UKR, - - BLIS_GEMMSUP_GX_UKR, -} l3sup_t; - -#define BLIS_NUM_LEVEL3_SUP_UKRS 9 -#endif - - -typedef enum -{ - // 3-operand storage combinations - BLIS_RRR = 0, - BLIS_RRC, // 1 - BLIS_RCR, // 2 - BLIS_RCC, // 3 - BLIS_CRR, // 4 - BLIS_CRC, // 5 - BLIS_CCR, // 6 - BLIS_CCC, // 7 - BLIS_XXX, // 8 - -#if 0 - BLIS_RRG, - BLIS_RCG, - BLIS_RGR, - BLIS_RGC, - BLIS_RGG, - BLIS_CRG, - BLIS_CCG, - BLIS_CGR, - 
BLIS_CGC, - BLIS_CGG, - BLIS_GRR, - BLIS_GRC, - BLIS_GRG, - BLIS_GCR, - BLIS_GCC, - BLIS_GCG, - BLIS_GGR, - BLIS_GGC, - BLIS_GGG, -#endif -} stor3_t; - -#define BLIS_NUM_3OP_RC_COMBOS 9 -//#define BLIS_NUM_3OP_RCG_COMBOS 27 - - -#if 0 -typedef enum -{ - BLIS_JC_IDX = 0, - BLIS_PC_IDX, - BLIS_IC_IDX, - BLIS_JR_IDX, - BLIS_IR_IDX, - BLIS_PR_IDX -} thridx_t; -#endif - -#define BLIS_NUM_LOOPS 6 - - -// -- Operation ID type -- - -typedef enum -{ -// -// NOTE: If/when additional type values are added to this enum, -// you must either: -// - keep the level-3 values (starting with _GEMM) beginning at -// index 0; or -// - if the value range is moved such that it does not begin at -// index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START -// value that can be subtracted from the opid_t value to map it -// to a zero-based range. -// This is needed because these level-3 opid_t values are used in -// bli_l3_ind.c to index into arrays. -// - BLIS_GEMM = 0, - BLIS_HEMM, - BLIS_HERK, - BLIS_HER2K, - BLIS_SYMM, - BLIS_SYRK, - BLIS_SYR2K, - BLIS_TRMM3, - BLIS_TRMM, - BLIS_TRSM, - BLIS_GEMMT, - BLIS_NOID -} opid_t; - -#define BLIS_NUM_LEVEL3_OPS 11 - - -// -- Blocksize ID type -- - -typedef enum -{ - // NOTE: the level-3 blocksizes MUST be indexed starting at zero. - // At one point, we made this assumption in bli_cntx_set_blkszs() - // and friends. - - BLIS_KR = 0, - BLIS_MR, - BLIS_NR, - BLIS_MC, - BLIS_KC, - BLIS_NC, - - BLIS_M2, // level-2 blocksize in m dimension - BLIS_N2, // level-2 blocksize in n dimension - - BLIS_AF, // level-1f axpyf fusing factor - BLIS_DF, // level-1f dotxf fusing factor - BLIS_XF, // level-1f dotxaxpyf fusing factor - - BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
-} bszid_t; - -#define BLIS_NUM_BLKSZS 11 - - -// -- Threshold ID type -- - -typedef enum -{ - BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension - BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension - BLIS_KT // level-3 small/unpacked matrix threshold in k dimension - -} threshid_t; - -#define BLIS_NUM_THRESH 3 - - -// -- Architecture ID type -- - -// NOTE: This typedef enum must be kept up-to-date with the arch_t -// string array in bli_arch.c. Whenever values are added/inserted -// OR if values are rearranged, be sure to update the string array -// in bli_arch.c. - -typedef enum -{ - // Intel - BLIS_ARCH_SKX = 0, - BLIS_ARCH_KNL, - BLIS_ARCH_KNC, - BLIS_ARCH_HASWELL, - BLIS_ARCH_SANDYBRIDGE, - BLIS_ARCH_PENRYN, - - // AMD - BLIS_ARCH_ZEN2, - BLIS_ARCH_ZEN, - BLIS_ARCH_EXCAVATOR, - BLIS_ARCH_STEAMROLLER, - BLIS_ARCH_PILEDRIVER, - BLIS_ARCH_BULLDOZER, - - // ARM - BLIS_ARCH_THUNDERX2, - BLIS_ARCH_CORTEXA57, - BLIS_ARCH_CORTEXA53, - BLIS_ARCH_CORTEXA15, - BLIS_ARCH_CORTEXA9, - - // IBM/Power - BLIS_ARCH_POWER9, - BLIS_ARCH_POWER7, - BLIS_ARCH_BGQ, - - // Generic architecture/configuration - BLIS_ARCH_GENERIC - -} arch_t; - -// NOTE: This value must be updated to reflect the number of enum values -// listed above for arch_t! -#define BLIS_NUM_ARCHS (BLIS_ARCH_GENERIC+1) - - -// -// -- BLIS misc. structure types ----------------------------------------------- -// - -// These headers must be included here (or earlier) because definitions they -// provide are needed in the pool_t and related structs. -// begin bli_pthread.h - - -#ifndef BLIS_PTHREAD_H -#define BLIS_PTHREAD_H - -#if defined(_MSC_VER) - -// This branch defines a pthread-like API, bli_pthread_*(), and implements it -// in terms of Windows API calls. 
- -// -- pthread_mutex_*() -- - -typedef SRWLOCK bli_pthread_mutex_t; -typedef void bli_pthread_mutexattr_t; - -#define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT - -BLIS_EXPORT_BLIS int bli_pthread_mutex_init - ( - bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_lock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock - ( - bli_pthread_mutex_t* mutex - ); - -// -- pthread_once_*() -- - -typedef INIT_ONCE bli_pthread_once_t; - -#define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT - -BLIS_EXPORT_BLIS void bli_pthread_once - ( - bli_pthread_once_t* once, - void (*init)(void) - ); - -// -- pthread_cond_*() -- - -typedef CONDITION_VARIABLE bli_pthread_cond_t; -typedef void bli_pthread_condattr_t; - -#define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT - -BLIS_EXPORT_BLIS int bli_pthread_cond_init - ( - bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_destroy - ( - bli_pthread_cond_t* cond - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_wait - ( - bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast - ( - bli_pthread_cond_t* cond - ); - -// -- pthread_create(), pthread_join() -- - -typedef struct -{ - HANDLE handle; - void* retval; -} bli_pthread_t; - -typedef void bli_pthread_attr_t; - -BLIS_EXPORT_BLIS int bli_pthread_create - ( - bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg - ); - -BLIS_EXPORT_BLIS int bli_pthread_join - ( - bli_pthread_t thread, - void** retval - ); - -// -- pthread_barrier_*() -- - -typedef void bli_pthread_barrierattr_t; - -typedef struct -{ - bli_pthread_mutex_t mutex; - bli_pthread_cond_t cond; 
- int count; - int tripCount; -} bli_pthread_barrier_t; - -BLIS_EXPORT_BLIS int bli_pthread_barrier_init - ( - bli_pthread_barrier_t* barrier, - const bli_pthread_barrierattr_t* attr, - unsigned int count - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy - ( - bli_pthread_barrier_t* barrier - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_wait - ( - bli_pthread_barrier_t* barrier - ); - -#else // !defined(_MSC_VER) - -#include // skipped - -// This branch defines a pthreads-like API, bli_pthreads_*(), and implements it -// in terms of the corresponding pthreads_*() types, macros, and function calls. - -// -- pthread types -- - -typedef pthread_t bli_pthread_t; -typedef pthread_attr_t bli_pthread_attr_t; -typedef pthread_mutex_t bli_pthread_mutex_t; -typedef pthread_mutexattr_t bli_pthread_mutexattr_t; -typedef pthread_cond_t bli_pthread_cond_t; -typedef pthread_condattr_t bli_pthread_condattr_t; -typedef pthread_once_t bli_pthread_once_t; - -#if defined(__APPLE__) - -// For OS X, we must define the barrier types ourselves since Apple does -// not implement barriers in their variant of pthreads. - -typedef void bli_pthread_barrierattr_t; - -typedef struct -{ - bli_pthread_mutex_t mutex; - bli_pthread_cond_t cond; - int count; - int tripCount; -} bli_pthread_barrier_t; - -#else - -// For other non-Windows OSes (primarily Linux), we can define the barrier -// types in terms of existing pthreads barrier types since we expect they -// will be provided by the pthreads implementation. 
- -typedef pthread_barrier_t bli_pthread_barrier_t; -typedef pthread_barrierattr_t bli_pthread_barrierattr_t; - -#endif - -// -- pthreads macros -- - -#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -#define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER -#define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT - -// -- pthread_create(), pthread_join() -- - -BLIS_EXPORT_BLIS int bli_pthread_create - ( - bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg - ); - -BLIS_EXPORT_BLIS int bli_pthread_join - ( - bli_pthread_t thread, - void** retval - ); - -// -- pthread_mutex_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_mutex_init - ( - bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_lock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock - ( - bli_pthread_mutex_t* mutex - ); - -// -- pthread_cond_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_cond_init - ( - bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_destroy - ( - bli_pthread_cond_t* cond - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_wait - ( - bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast - ( - bli_pthread_cond_t* cond - ); - -// -- pthread_once_*() -- - -BLIS_EXPORT_BLIS void bli_pthread_once - ( - bli_pthread_once_t* once, - void (*init)(void) - ); - -// -- pthread_barrier_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_barrier_init - ( - bli_pthread_barrier_t* barrier, - const bli_pthread_barrierattr_t* attr, - unsigned int count - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy - ( - bli_pthread_barrier_t* barrier - ); - -BLIS_EXPORT_BLIS int 
bli_pthread_barrier_wait - ( - bli_pthread_barrier_t* barrier - ); - -#endif // _MSC_VER - -#endif // BLIS_PTHREAD_H -// end bli_pthread.h -// begin bli_malloc.h - - -// Typedef function pointer types for malloc() and free() substitutes. -typedef void* (*malloc_ft) ( size_t size ); -typedef void (*free_ft) ( void* p ); - -// ----------------------------------------------------------------------------- - -#if 0 -BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); -BLIS_EXPORT_BLIS void bli_free_pool( void* p ); -#endif - -void* bli_malloc_intl( size_t size ); -void* bli_calloc_intl( size_t size ); -void bli_free_intl( void* p ); - -BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size ); -BLIS_EXPORT_BLIS void bli_free_user( void* p ); - -// ----------------------------------------------------------------------------- - -void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size ); -void bli_ffree_align( free_ft f, void* p ); - -void* bli_fmalloc_noalign( malloc_ft f, size_t size ); -void bli_ffree_noalign( free_ft f, void* p ); - -void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); -void bli_fmalloc_post_check( void* p ); - -// end bli_malloc.h - -// -- Pool block type -- - -typedef struct -{ - void* buf; - siz_t block_size; - -} pblk_t; - - -// -- Pool type -- - -typedef struct -{ - void* block_ptrs; - dim_t block_ptrs_len; - - dim_t top_index; - dim_t num_blocks; - - siz_t block_size; - siz_t align_size; - siz_t offset_size; - - malloc_ft malloc_fp; - free_ft free_fp; - -} pool_t; - - -// -- Array type -- - -typedef struct -{ - void* buf; - - siz_t num_elem; - siz_t elem_size; - -} array_t; - - -// -- Locked pool-of-arrays-of-pools type -- - -typedef struct -{ - bli_pthread_mutex_t mutex; - pool_t pool; - - siz_t def_array_len; - -} apool_t; - - -// -- packing block allocator: Locked set of pools type -- - -typedef struct membrk_s -{ - pool_t pools[3]; - bli_pthread_mutex_t mutex; - - // These fields are used for 
general-purpose allocation. - siz_t align_size; - malloc_ft malloc_fp; - free_ft free_fp; - -} membrk_t; - - -// -- Memory object type -- - -typedef struct mem_s -{ - pblk_t pblk; - packbuf_t buf_type; - pool_t* pool; - siz_t size; -} mem_t; - - -// -- Control tree node type -- - -struct cntl_s -{ - // Basic fields (usually required). - opid_t family; - bszid_t bszid; - void_fp var_func; - struct cntl_s* sub_prenode; - struct cntl_s* sub_node; - - // Optional fields (needed only by some operations such as packm). - // NOTE: first field of params must be a uint64_t containing the size - // of the struct. - void* params; - - // Internal fields that track "cached" data. - mem_t pack_mem; -}; -typedef struct cntl_s cntl_t; - - -// -- Blocksize object type -- - -typedef struct blksz_s -{ - // Primary blocksize values. - dim_t v[BLIS_NUM_FP_TYPES]; - - // Blocksize extensions. - dim_t e[BLIS_NUM_FP_TYPES]; - -} blksz_t; - - -// -- Function pointer object type -- - -typedef struct func_s -{ - // Kernel function address. - void_fp ptr[BLIS_NUM_FP_TYPES]; - -} func_t; - - -// -- Multi-boolean object type -- - -typedef struct mbool_s -{ - bool_t v[BLIS_NUM_FP_TYPES]; - -} mbool_t; - - -// -- Auxiliary kernel info type -- - -// Note: This struct is used by macro-kernels to package together extra -// parameter values that may be of use to the micro-kernel without -// cluttering up the micro-kernel interface itself. - -typedef struct -{ - // The pack schemas of A and B. - pack_t schema_a; - pack_t schema_b; - - // Pointers to the micro-panels of A and B which will be used by the - // next call to the micro-kernel. - void* a_next; - void* b_next; - - // The imaginary strides of A and B. - inc_t is_a; - inc_t is_b; - - // The panel strides of A and B. - // NOTE: These are only used in situations where iteration over the - // micropanels takes place in part within the kernel code (e.g. sup - // millikernels). - inc_t ps_a; - inc_t ps_b; - - // The type to convert to on output. 
- //num_t dt_on_output; - -} auxinfo_t; - - -// -- Global scalar constant data struct -- - -// Note: This struct is used only when statically initializing the -// global scalar constants in bli_const.c. -typedef struct constdata_s -{ - float s; - double d; - scomplex c; - dcomplex z; - gint_t i; - -} constdata_t; - - -// -// -- BLIS object type definitions --------------------------------------------- -// - -typedef struct obj_s -{ - // Basic fields - struct obj_s* root; - - dim_t off[2]; - dim_t dim[2]; - doff_t diag_off; - - objbits_t info; - objbits_t info2; - siz_t elem_size; - - void* buffer; - inc_t rs; - inc_t cs; - inc_t is; - - // Bufferless scalar storage - atom_t scalar; - - // Pack-related fields - dim_t m_padded; // m dimension of matrix, including any padding - dim_t n_padded; // n dimension of matrix, including any padding - inc_t ps; // panel stride (distance to next panel) - inc_t pd; // panel dimension (the "width" of a panel: - // usually MR or NR) - dim_t m_panel; // m dimension of a "full" panel - dim_t n_panel; // n dimension of a "full" panel -} obj_t; - -// Pre-initializors. 
Things that must be set afterwards: -// - root object pointer -// - info bitfields: dt, target_dt, exec_dt, comp_dt -// - info2 bitfields: scalar_dt -// - elem_size -// - dims, strides -// - buffer -// - internal scalar buffer (must always set imaginary component) - -#define BLIS_OBJECT_INITIALIZER \ -{ \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 0, 0 }, \ - .diag_off = 0, \ -\ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), \ -\ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ -\ - .scalar = { 0.0, 0.0 }, \ -\ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0 \ -} - -#define BLIS_OBJECT_INITIALIZER_1X1 \ -{ \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ -\ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), \ -\ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ -\ - .scalar = { 0.0, 0.0 }, \ -\ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0 \ -} - -// Define these macros here since they must be updated if contents of -// obj_t changes. 
- -static void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) -{ - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - b->dim[0] = a->dim[0]; - b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; -} - -static void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) -{ - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - // Avoid copying m and n since they will be overwritten. - //b->dim[0] = a->dim[0]; - //b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - // Avoid copying pack_mem entry. - // FGVZ: You should probably make sure this is right. - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; -} - -// Initializors for global scalar constants. -// NOTE: These must remain cpp macros since they are initializor -// expressions, not functions. 
- -#define bli_obj_init_const( buffer0 ) \ -{ \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ -\ - .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ - BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( constdata_t ), \ -\ - .buffer = buffer0, \ - .rs = 1, \ - .cs = 1, \ - .is = 1 \ -} - -#define bli_obj_init_constdata( val ) \ -{ \ - .s = ( float )val, \ - .d = ( double )val, \ - .c = { .real = ( float )val, .imag = 0.0f }, \ - .z = { .real = ( double )val, .imag = 0.0 }, \ - .i = ( gint_t )val, \ -} - - -// -- Context type -- - -typedef struct cntx_s -{ - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - bszid_t bmults[ BLIS_NUM_BLKSZS ]; - - blksz_t trsm_blkszs[ BLIS_NUM_BLKSZS ]; - - func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; - - blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; - void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; - blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; - func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; - mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; - - func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; - func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - - func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; - func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; - - ind_t method; - pack_t schema_a_block; - pack_t schema_b_panel; - pack_t schema_c_panel; - -} cntx_t; - - -// -- Runtime type -- - -// NOTE: The order of these fields must be kept consistent with the definition -// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. - -typedef struct rntm_s -{ - // "External" fields: these may be queried by the end-user. - bool_t auto_factor; - - dim_t num_threads; - dim_t thrloop[ BLIS_NUM_LOOPS ]; - bool_t pack_a; // enable/disable packing of left-hand matrix A. - bool_t pack_b; // enable/disable packing of right-hand matrix B. - bool_t l3_sup; // enable/disable small matrix handling in level-3 ops. 
- - // "Internal" fields: these should not be exposed to the end-user. - - // The small block pool, which is attached in the l3 thread decorator. - pool_t* sba_pool; - - // The packing block allocator, which is attached in the l3 thread decorator. - membrk_t* membrk; - -} rntm_t; - - -// -- Error types -- - -typedef enum -{ - BLIS_NO_ERROR_CHECKING = 0, - BLIS_FULL_ERROR_CHECKING -} errlev_t; - -typedef enum -{ - // Generic error codes - BLIS_SUCCESS = ( -1), - BLIS_FAILURE = ( -2), - - BLIS_ERROR_CODE_MIN = ( -9), - - // General errors - BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), - BLIS_UNDEFINED_ERROR_CODE = ( -11), - BLIS_NULL_POINTER = ( -12), - BLIS_NOT_YET_IMPLEMENTED = ( -13), - - // Parameter-specific errors - BLIS_INVALID_SIDE = ( -20), - BLIS_INVALID_UPLO = ( -21), - BLIS_INVALID_TRANS = ( -22), - BLIS_INVALID_CONJ = ( -23), - BLIS_INVALID_DIAG = ( -24), - BLIS_INVALID_MACHVAL = ( -25), - BLIS_EXPECTED_NONUNIT_DIAG = ( -26), - - // Datatype-specific errors - BLIS_INVALID_DATATYPE = ( -30), - BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), - BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), - BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), - BLIS_EXPECTED_REAL_DATATYPE = ( -34), - BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), - BLIS_INCONSISTENT_DATATYPES = ( -36), - BLIS_EXPECTED_REAL_PROJ_OF = ( -37), - BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), - BLIS_INCONSISTENT_PRECISIONS = ( -39), - - // Dimension-specific errors - BLIS_NONCONFORMAL_DIMENSIONS = ( -40), - BLIS_EXPECTED_SCALAR_OBJECT = ( -41), - BLIS_EXPECTED_VECTOR_OBJECT = ( -42), - BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), - BLIS_EXPECTED_SQUARE_OBJECT = ( -44), - BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), - BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), - BLIS_UNEXPECTED_VECTOR_DIM = ( -47), - BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), - BLIS_NEGATIVE_DIMENSION = ( -49), - - // Stride-specific errors - BLIS_INVALID_ROW_STRIDE = ( -50), - BLIS_INVALID_COL_STRIDE = ( -51), - BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), - - // 
Structure-specific errors - BLIS_EXPECTED_GENERAL_OBJECT = ( -60), - BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), - BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), - BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), - - // Storage-specific errors - BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), - - // Partitioning-specific errors - BLIS_INVALID_3x1_SUBPART = ( -80), - BLIS_INVALID_1x3_SUBPART = ( -81), - BLIS_INVALID_3x3_SUBPART = ( -82), - - // Control tree-specific errors - BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), - - // Packing-specific errors - BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), - - // Buffer-specific errors - BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), - - // Memory errors - BLIS_MALLOC_RETURNED_NULL = (-120), - - // Internal memory pool errors - BLIS_INVALID_PACKBUF = (-130), - BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), - BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), - BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), - BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), - - // Object-related errors - BLIS_EXPECTED_OBJECT_ALIAS = (-140), - - // Architecture-related errors - BLIS_INVALID_ARCH_ID = (-150), - - // Blocksize-related errors - BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), - BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), - BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), - BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), - BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), - BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), - - BLIS_ERROR_CODE_MAX = (-170) -} err_t; - -#endif -// end bli_type_defs.h -// begin bli_macro_defs.h - - -#ifndef BLIS_MACRO_DEFS_H -#define BLIS_MACRO_DEFS_H - - -// -- Undefine restrict for C++ and C89/90 -- - -#ifdef __cplusplus - // Language is C++; define restrict as nothing. - #ifndef restrict - #define restrict - #endif -#elif __STDC_VERSION__ >= 199901L - // Language is C99 (or later); do nothing since restrict is recognized. -#else - // Language is pre-C99; define restrict as nothing. 
- #ifndef restrict - #define restrict - #endif -#endif - - -// -- Define typeof() operator if using non-GNU compiler -- - -#ifndef __GNUC__ - #define typeof __typeof__ -#else - #ifndef typeof - #define typeof __typeof__ - #endif -#endif - - -// -- BLIS Thread Local Storage Keyword -- - -// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. -// There is a small risk here as __GNUC__ can also be defined by some other -// compiler (other than ICC and CLANG which we know define it) that -// doesn't support __thread, as __GNUC__ is not quite unique to GCC. -// But the possibility of someone using such non-main-stream compiler -// for building BLIS is low. -#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) - #define BLIS_THREAD_LOCAL __thread -#else - #define BLIS_THREAD_LOCAL -#endif - - -// -- BLIS constructor/destructor function attribute -- - -// __attribute__((constructor/destructor)) is supported by GCC only. -// There is a small risk here as __GNUC__ can also be defined by some other -// compiler (other than ICC and CLANG which we know define it) that -// doesn't support this, as __GNUC__ is not quite unique to GCC. -// But the possibility of someone using such non-main-stream compiler -// for building BLIS is low. - -#if defined(__ICC) || defined(__INTEL_COMPILER) - // ICC defines __GNUC__ but doesn't support this - #define BLIS_ATTRIB_CTOR - #define BLIS_ATTRIB_DTOR -#elif defined(__clang__) - // CLANG supports __attribute__, but its documentation doesn't - // mention support for constructor/destructor. Compiling with - // clang and testing shows that it does support. 
- #define BLIS_ATTRIB_CTOR __attribute__((constructor)) - #define BLIS_ATTRIB_DTOR __attribute__((destructor)) -#elif defined(__GNUC__) - #define BLIS_ATTRIB_CTOR __attribute__((constructor)) - #define BLIS_ATTRIB_DTOR __attribute__((destructor)) -#else - #define BLIS_ATTRIB_CTOR - #define BLIS_ATTRIB_DTOR -#endif - - -// -- Concatenation macros -- - -#define BLIS_FUNC_PREFIX_STR "bli" - -// We add an extra layer the definitions of these string-pasting macros -// because sometimes it is needed if, for example, one of the PASTE -// macros is invoked with an "op" argument that is itself a macro. - -#define PASTEMAC0_(op) bli_ ## op -#define PASTEMAC0(op) PASTEMAC0_(op) - -#define PASTEMAC_(ch,op) bli_ ## ch ## op -#define PASTEMAC(ch,op) PASTEMAC_(ch,op) - -#define PASTEMAC2_(ch1,ch2,op) bli_ ## ch1 ## ch2 ## op -#define PASTEMAC2(ch1,ch2,op) PASTEMAC2_(ch1,ch2,op) - -#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op -#define PASTEMAC3(ch1,ch2,ch3,op) PASTEMAC3_(ch1,ch2,ch3,op) - -#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op -#define PASTEMAC4(ch1,ch2,ch3,ch4,op) PASTEMAC4_(ch1,ch2,ch3,ch4,op) - -#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op -#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op) PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) - -#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op -#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op) PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) - -#define PASTEBLACHK_(op) bla_ ## op ## _check -#define PASTEBLACHK(op) PASTEBLACHK_(op) - -#define PASTECH0_(op) op -#define PASTECH0(op) PASTECH0_(op) - -#define PASTECH_(ch,op) ch ## op -#define PASTECH(ch,op) PASTECH_(ch,op) - -#define PASTECH2_(ch1,ch2,op) ch1 ## ch2 ## op -#define PASTECH2(ch1,ch2,op) PASTECH2_(ch1,ch2,op) - -#define PASTECH3_(ch1,ch2,ch3,op) ch1 ## ch2 ## ch3 ## op -#define PASTECH3(ch1,ch2,ch3,op) PASTECH3_(ch1,ch2,ch3,op) - -#define MKSTR(s1) #s1 
-#define STRINGIFY_INT( s ) MKSTR( s ) - -#define PASTEMACT(ch1, ch2, ch3, ch4) bli_ ## ch1 ## ch2 ## _ ## ch3 ## _ ## ch4 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - -// -- Include other groups of macros - -// begin bli_genarray_macro_defs.h - - -#ifndef BLIS_GENARRAY_MACRO_DEFS_H -#define BLIS_GENARRAY_MACRO_DEFS_H - - -// -- Macros to generate function arrays --------------------------------------- - -// -- "Smart" one-operand macro -- - -#define GENARRAY_FPA(tname,opname) \ -\ -static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \ -{ \ - ( tname )PASTEMAC(s,opname), \ - ( tname )PASTEMAC(c,opname), \ - ( tname )PASTEMAC(d,opname), \ - ( tname )PASTEMAC(z,opname) \ -} - -// -- "Smart" one-operand macro (with integer support) -- - -#define GENARRAY_FPA_I(tname,opname) \ -\ -static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \ -{ \ - ( tname )PASTEMAC(s,opname), \ - ( tname )PASTEMAC(c,opname), \ - ( tname )PASTEMAC(d,opname), \ - ( tname )PASTEMAC(z,opname), \ - ( tname )PASTEMAC(i,opname) \ -} - -// -- "Smart" two-operand macro -- - -#define GENARRAY_FPA2(tname,op) \ -\ -static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \ - { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \ - { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \ - { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) } \ -} - -// -- "Smart" two-operand macro -- - - - - - -// -- One-operand macro -- - -#define GENARRAY(arrayname,op) \ -\ 
-arrayname[BLIS_NUM_FP_TYPES] = \ -{ \ - PASTEMAC(s,op), \ - PASTEMAC(c,op), \ - PASTEMAC(d,op), \ - PASTEMAC(z,op) \ -} - - -#define GENARRAY_T(arrayname,opname,varname) \ -\ -arrayname[BLIS_NUM_FP_TYPES][2] = \ -{ \ - {PASTEMACT(s,opname,l,varname),PASTEMACT(s,opname,u,varname)}, \ - {PASTEMACT(c,opname,l,varname),PASTEMACT(c,opname,u,varname)}, \ - {PASTEMACT(d,opname,l,varname),PASTEMACT(d,opname,u,varname)}, \ - {PASTEMACT(z,opname,l,varname),PASTEMACT(z,opname,u,varname)}, \ -} - -#define GENARRAY_I(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES+1] = \ -{ \ - PASTEMAC(s,op), \ - PASTEMAC(c,op), \ - PASTEMAC(d,op), \ - PASTEMAC(z,op), \ - PASTEMAC(i,op) \ -} - - - - - -// -- Two-operand macros -- - - -#define GENARRAY2_ALL(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \ - { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \ - { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ - { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ -} - - -#define GENARRAY2_EXT(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL, NULL, }, \ - { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL, NULL, }, \ - { NULL, NULL, PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ - { NULL, NULL, PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ -} - - -#define GENARRAY2_MIN(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { PASTEMAC2(s,s,op), NULL, NULL, NULL, }, \ - { NULL, PASTEMAC2(c,c,op), NULL, NULL, }, \ - { NULL, NULL, PASTEMAC2(d,d,op), NULL, }, \ - { NULL, NULL, NULL, PASTEMAC2(z,z,op) } \ -} - - -// -- Three-operand macros -- - - -#define GENARRAY3_ALL(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { \ - { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), 
PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \ - { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \ - { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \ - { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) } \ - }, \ - { \ - { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \ - { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \ - { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \ - { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) } \ - }, \ - { \ - { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \ - { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \ - { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ - { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ - }, \ - { \ - { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \ - { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \ - { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ - { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ - } \ -} - - -#define GENARRAY3_EXT(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { \ - { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL, NULL, }, \ - { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, } \ - }, \ - { \ - { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL, NULL, }, \ - { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, } \ - }, 
\ - { \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ - { NULL, NULL, PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ - }, \ - { \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ - { NULL, NULL, PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ - } \ -} - - -#define GENARRAY3_MIN(arrayname,op) \ -\ -arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ -{ \ - { \ - { PASTEMAC3(s,s,s,op), NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, } \ - }, \ - { \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, PASTEMAC3(c,c,c,op), NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, } \ - }, \ - { \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, PASTEMAC3(d,d,d,op), NULL, }, \ - { NULL, NULL, NULL, NULL, } \ - }, \ - { \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, NULL, }, \ - { NULL, NULL, NULL, PASTEMAC3(z,z,z,op) } \ - } \ -} - - -#endif -// end bli_genarray_macro_defs.h -// begin bli_gentdef_macro_defs.h - - -#ifndef BLIS_GENTDEF_MACRO_DEFS_H -#define BLIS_GENTDEF_MACRO_DEFS_H - -// -// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS ------------------------------- -// - - -// -- function typedef macro (both typed and void) -- - -#define INSERT_GENTDEF( opname ) \ -\ -GENTDEF( float, s, opname, _ft ) \ -GENTDEF( double, d, opname, _ft ) \ -GENTDEF( scomplex, c, opname, _ft ) \ -GENTDEF( dcomplex, z, opname, _ft ) \ -\ -GENTDEF( void, s, opname, _vft ) \ -GENTDEF( void, d, opname, _vft ) \ -GENTDEF( void, c, opname, _vft ) \ -GENTDEF( void, z, opname, _vft ) \ -\ -GENTDEF( void, , opname, _vft ) - -// -- function typedef macro (both typed and void) with real projection -- - -#define INSERT_GENTDEFR( opname ) \ -\ -GENTDEFR( float, float, s, s, 
opname, _ft ) \ -GENTDEFR( double, double, d, d, opname, _ft ) \ -GENTDEFR( scomplex, float, c, s, opname, _ft ) \ -GENTDEFR( dcomplex, double, z, d, opname, _ft ) \ -\ -GENTDEFR( void, void, s, s, opname, _vft ) \ -GENTDEFR( void, void, d, d, opname, _vft ) \ -GENTDEFR( void, void, c, s, opname, _vft ) \ -GENTDEFR( void, void, z, d, opname, _vft ) \ -\ -GENTDEFR( void, void, , , opname, _vft ) - - -#endif -// end bli_gentdef_macro_defs.h -// begin bli_gentfunc_macro_defs.h - - - -#ifndef BLIS_GENTFUNC_MACRO_DEFS_H -#define BLIS_GENTFUNC_MACRO_DEFS_H - -// -// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------ -// - - - -// -- Macros for generating BLAS routines -------------------------------------- - - -// -- Basic one-operand macro -- - - -#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ -\ -GENTFUNC( float, s, blasname, blisname ) \ -GENTFUNC( double, d, blasname, blisname ) \ -GENTFUNC( scomplex, c, blasname, blisname ) \ -GENTFUNC( dcomplex, z, blasname, blisname ) - - -#define INSERT_GENTFUNC_BLAS_CZ( blasname, blisname ) \ -\ -GENTFUNC( scomplex, c, blasname, blisname ) \ -GENTFUNC( dcomplex, z, blasname, blisname ) - -// -- Basic one-operand macro with real domain only -- - - -#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ -\ -GENTFUNCRO( float, s, blasname, blisname ) \ -GENTFUNCRO( double, d, blasname, blisname ) - - -// -- Basic one-operand macro with complex domain only and real projection -- - - -#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ -\ -GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ -GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) - - -// -- Basic one-operand macro with conjugation (used only for dot, ger) -- - -#define INSERT_GENTFUNCDOT_BLAS_CZ( blasname, blisname ) \ -\ -GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname 
) \ -GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) - -#define INSERT_GENTFUNCDOT_BLAS_CZ_F2C( blasname, blisname ) \ -\ -GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) - -#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ -\ -GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) - -#ifdef AOCL_F2C - -#define INSERT_GENTFUNCDOT_BLAS_SDC( blasname, blisname ) \ -\ -GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ -GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) - -#endif - -// -- Basic one-operand macro with real projection -- - - -#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \ -\ -GENTFUNCR( float, float, s, s, rblasname, blisname ) \ -GENTFUNCR( double, double, d, d, rblasname, blisname ) \ -GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ -GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) - - -// -- Alternate two-operand macro (one char for complex, one for real proj) -- - - -#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \ -\ -GENTFUNCR2( float, float, s, , blasname, blisname ) \ -GENTFUNCR2( double, double, d, , blasname, blisname ) \ -GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \ -GENTFUNCR2( dcomplex, double, z, d, 
blasname, blisname ) - - -// -- Extended two-operand macro (used only for scal) -- - - -#define INSERT_GENTFUNCSCAL_BLAS_CZ( blasname, blisname ) \ -\ -GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \ -GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \ -GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \ -GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname ) - - -#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \ -\ -GENTFUNCSCAL( float, float, s, , blasname, blisname ) \ -GENTFUNCSCAL( double, double, d, , blasname, blisname ) \ -GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \ -GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \ -GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \ -GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname ) - -// --GEMMT specific kernels ---------------------------------------------------- - -#define INSERT_GENTFUNC_L( opname, funcname ) \ -\ -GENTFUNC(float, s, opname, l, funcname) \ -GENTFUNC(double, d, opname, l, funcname) \ -GENTFUNC(scomplex, c, opname, l, funcname) \ -GENTFUNC(dcomplex, z, opname, l, funcname) - - -#define INSERT_GENTFUNC_U( opname, funcname ) \ -\ -GENTFUNC(float, s, opname, u, funcname) \ -GENTFUNC(double, d, opname, u, funcname) \ -GENTFUNC(scomplex, c, opname, u, funcname) \ -GENTFUNC(dcomplex, z, opname, u, funcname) - - - -// -- Macros for functions with one operand ------------------------------------ - - -// -- Basic one-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC_BASIC0( tfuncname ) \ -\ -GENTFUNC( float, s, tfuncname ) \ -GENTFUNC( double, d, tfuncname ) \ -GENTFUNC( scomplex, c, tfuncname ) \ -GENTFUNC( dcomplex, z, tfuncname ) - - -#define INSERT_GENTFUNC_BASIC0_SD( tfuncname ) \ -\ -GENTFUNC( float, s, tfuncname ) \ -GENTFUNC( double, d, tfuncname ) - - -#define INSERT_GENTFUNC_BASIC0_CZ( tfuncname ) \ -\ -GENTFUNC( scomplex, c, tfuncname ) \ -GENTFUNC( dcomplex, z, tfuncname ) - -// -- 
(one auxiliary argument) -- - -#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \ -\ -GENTFUNC( float, s, tfuncname, varname ) \ -GENTFUNC( double, d, tfuncname, varname ) \ -GENTFUNC( scomplex, c, tfuncname, varname ) \ -GENTFUNC( dcomplex, z, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC( float, s, tfuncname, varname1, varname2 ) \ -GENTFUNC( double, d, tfuncname, varname1, varname2 ) \ -GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \ -GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 ) - - -// -- (three auxiliary arguments) -- - -#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \ -\ -GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 ) - -// -- (four auxiliary arguments) -- - -#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ -\ -GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) - - - -// -- Basic one-operand with real projection -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \ -\ -GENTFUNCR( float, float, s, s, tfuncname ) \ -GENTFUNCR( double, double, d, d, tfuncname ) \ -GENTFUNCR( scomplex, float, c, s, tfuncname ) \ -GENTFUNCR( dcomplex, double, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \ -\ -GENTFUNCR( float, float, s, s, tfuncname, varname ) \ -GENTFUNCR( double, double, d, d, tfuncname, varname ) \ -GENTFUNCR( scomplex, 
float, c, s, tfuncname, varname ) \ -GENTFUNCR( dcomplex, double, z, d, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \ -GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ -GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) - -// -- (three auxiliary arguments) -- - -#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ -\ -GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) - -// -- (four auxiliary arguments) -- - -#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ -\ -GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) - - - -// -- Basic one-operand macro with real domain only -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ -\ -GENTFUNCRO( float, s, tfuncname ) \ -GENTFUNCRO( double, d, tfuncname ) \ - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ -\ -GENTFUNCRO( float, s, tfuncname, varname ) \ -GENTFUNCRO( double, d, tfuncname, varname ) \ - - - -// -- Basic one-operand macro with complex domain only and real projection -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ -\ 
-GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ -GENTFUNCCO( dcomplex, double, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ -\ -GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ -GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ -GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) - -// -- (three auxiliary arguments) -- - -#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ -\ -GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ -GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) - - - -// -- Basic one-operand macro with integer instance -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ -\ -GENTFUNC( float, s, tfuncname ) \ -GENTFUNC( double, d, tfuncname ) \ -GENTFUNC( scomplex, c, tfuncname ) \ -GENTFUNC( dcomplex, z, tfuncname ) \ -GENTFUNC( gint_t, i, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ -\ -GENTFUNC( float, s, tfuncname, varname ) \ -GENTFUNC( double, d, tfuncname, varname ) \ -GENTFUNC( scomplex, c, tfuncname, varname ) \ -GENTFUNC( dcomplex, z, tfuncname, varname ) \ -GENTFUNC( gint_t, i, tfuncname, varname ) - - - -// -- Basic one-operand with integer projection -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ -\ -GENTFUNCI( float, gint_t, s, i, tfuncname ) \ -GENTFUNCI( double, gint_t, d, i, tfuncname ) \ -GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ -GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ -\ -GENTFUNCI( float, gint_t, s, i, tfuncname, varname 
) \ -GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ -GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ -GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) - - - -// -- Basic one-operand with real and integer projections -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ -\ -GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ -GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ -GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ -GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) - - - - -// -- Macros for functions with two primary operands --------------------------- - - -// -- Basic two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ -\ -GENTFUNC2( float, float, s, s, tfuncname ) \ -GENTFUNC2( double, double, d, d, tfuncname ) \ -GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ -GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ -\ -GENTFUNC2( float, float, s, s, tfuncname, varname ) \ -GENTFUNC2( double, double, d, d, tfuncname, varname ) \ -GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ -GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) - - - -// -- Mixed domain two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ -\ -GENTFUNC2( float, scomplex, s, c, tfuncname ) \ -GENTFUNC2( scomplex, float, c, s, tfuncname ) \ -\ -GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ -GENTFUNC2( dcomplex, double, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ -\ -GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ -GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ -\ -GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ -GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) - - - -// 
-- Mixed precision two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ -\ -GENTFUNC2( float, double, s, d, tfuncname ) \ -GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ -\ -GENTFUNC2( double, float, d, s, tfuncname ) \ -GENTFUNC2( double, scomplex, d, c, tfuncname ) \ -\ -GENTFUNC2( scomplex, double, c, d, tfuncname ) \ -GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ -\ -GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ -GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ -\ -GENTFUNC2( float, double, s, d, tfuncname, varname ) \ -GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ -\ -GENTFUNC2( double, float, d, s, tfuncname, varname ) \ -GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ -\ -GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ -GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ -\ -GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ -GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ - - - -// -- Mixed domain/precision (all) two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ -\ -GENTFUNC2( float, double, s, d, tfuncname ) \ -GENTFUNC2( float, scomplex, s, c, tfuncname ) \ -GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ -\ -GENTFUNC2( double, float, d, s, tfuncname ) \ -GENTFUNC2( double, scomplex, d, c, tfuncname ) \ -GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ -\ -GENTFUNC2( scomplex, float, c, s, tfuncname ) \ -GENTFUNC2( scomplex, double, c, d, tfuncname ) \ -GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ -\ -GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ -GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ -GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) - - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ -\ -GENTFUNC2( float, double, s, d, 
tfuncname, varname ) \ -GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ -GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ -\ -GENTFUNC2( double, float, d, s, tfuncname, varname ) \ -GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ -GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ -\ -GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ -GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ -GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ -\ -GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ -GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ -GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) - - - -// -- Basic two-operand with real projection of second operand -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ -\ -GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ -GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ -GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ -GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ -\ -GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ -GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \ -GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ -GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) - - - -// -- Mixed domain two-operand with real projection of second operand -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ -\ -GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ -GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ -\ -GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ -GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ -\ -GENTFUNC2R( float, 
scomplex, float, s, c, s, tfuncname, varname ) \ -GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ -\ -GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ -GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) - - - -// -- Mixed precision two-operand with real projection of second operand -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ -\ -GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ -GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ -\ -GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ -GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ -\ -GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ -GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ -\ -GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ -GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ -\ -GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ -GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ -\ -GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ -GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ -\ -GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ -GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ -\ -GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ -GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) - - - -// -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ -\ -GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ -GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ -GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ -\ -GENTFUNC2R( double, float, 
float, d, s, s, tfuncname ) \ -GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ -GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ -\ -GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ -GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ -GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ -\ -GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ -GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ -GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ -\ -GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ -GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ -GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ -\ -GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ -GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ -GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ -\ -GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ -GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ -GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ -\ -GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ -GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ -GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ - - - - -// -- Macros for functions with three primary operands ------------------------- - - -// -- Basic three-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ -\ -GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ -GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) - -// -- (one auxiliary argument) -- - -#define 
INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ -\ -GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ -GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) - - - -// -- Mixed domain three-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ -\ -GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ -GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ -GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ -\ -GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ -GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ -GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ -\ -GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ -GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ -GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ -\ -GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \ -GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ -GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ -\ -GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ -GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ -GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ -\ -GENTFUNC3( double, double, dcomplex, d, d, z, 
tfuncname, varname ) \ -GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ -GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ -\ -GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ -GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ -GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ -\ -GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ -GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ -GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) - - - -// -- Mixed precision three-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ -\ -GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ -GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ -\ -GENTFUNC3( float, double, float, s, d, s, tfuncname 
) \ -GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ -GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ -GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ -\ -GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ -GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ -\ -GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ -GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ -GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ -GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ -\ -\ -GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ -GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ -GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ -GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ -\ -GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ -GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ -\ -GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ -GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ -GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ -GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ -\ -GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ -GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ -\ -\ -GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ -GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ -\ -GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ -GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ -GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \ -GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ -\ -GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ -GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ -\ -GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ -GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ -GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ 
-GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ -\ -\ -GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ -GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ -GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ -GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ -\ -GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ -GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ -\ -GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ -GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ -GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ -GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ -\ -GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ -GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ -\ -GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ -GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ -\ -GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ -GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ -GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ -GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ -\ -GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ -GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ -\ -GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ -GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ -GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \ -GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ -\ -\ -GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ -GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ -GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ -GENTFUNC3( double, float, 
dcomplex, d, s, z, tfuncname, varname ) \ -\ -GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ -GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ -\ -GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ -GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ -GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ -GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ -\ -GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ -GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ -\ -\ -GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ -GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ -\ -GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ -GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ -GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ -GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ -\ -GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ -GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ -\ -GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ -GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ -GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ -GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ -\ -\ -GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ -GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ -GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \ -GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ -\ -GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ -GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ -\ -GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ 
-GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ -GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ -GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ -\ -GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ -GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ -\ -\ -GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, 
varname2 ) \ -GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ -\ -\ -GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ -\ -\ -GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, 
varname2 ) \ -GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) - - - -// -- Basic three-operand with union of operands 1 and 2 -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ -\ -GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ -GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ -GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ -GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ -\ -GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ -GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 ) - - - -// -- Mixed domain three-operand with union of operands 1 and 2 -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \ -\ 
-GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \ -GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \ -GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \ -\ -GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \ -GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \ -GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \ -\ -GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \ -GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \ -GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \ -\ -GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ -\ -GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ -GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ -GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ -\ -GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ -GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ -GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ -\ -GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ -\ -GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ -GENTFUNC3U12( 
dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) - - - -// -- Mixed precision three-operand with union of operands 1 and 2 -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ -\ -GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ -GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ -\ -GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ -GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ -GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ -GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname ) \ -\ -GENTFUNC3U12( float, scomplex, 
double, scomplex, s, c, d, c, tfuncname ) \ -GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ -\ -GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ -GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ -GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ -GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ -\ -\ -GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ -GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ -GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ -GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ -\ -GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ -GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ -\ -GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ -GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ -GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ -GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ -\ -GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ -GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ -\ -\ -GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ -GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ -\ -GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ -GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ -GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ -GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ -\ -GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ -GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname ) \ -\ 
-GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ -GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ -GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ -GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ -\ -\ -GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ -\ -GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ -\ -GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ -\ -GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ -GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ -\ -GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ -GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ -\ -GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ -GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ -GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ -GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ -\ -GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ -GENTFUNC3U12( float, scomplex, dcomplex, 
scomplex, s, c, z, c, tfuncname, varname ) \ -\ -GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ -GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ -GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ -GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ -\ -\ -GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ -GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ -GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ -GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ -\ -GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ -GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ -\ -GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ -GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ -GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ -GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ -\ -GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ -GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ -\ -\ -GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ -\ -GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ -\ -GENTFUNC3U12( scomplex, scomplex, double, 
scomplex, c, c, d, c, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ -\ -GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ -GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ -\ -\ -GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ -\ -GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ -\ -GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ -\ -GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ -GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, double, double, double, s, d, d, 
d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ -\ -\ -GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ -\ -\ -GENTFUNC3U12( 
scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ -\ -\ -GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, 
scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ -\ -GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ -GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) - - -#endif -// end bli_gentfunc_macro_defs.h -// begin bli_gentprot_macro_defs.h - - - -#ifndef BLIS_GENTPROT_MACRO_DEFS_H -#define BLIS_GENTPROT_MACRO_DEFS_H - -// -// -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- -// - - - -// -- Macros for generating BLAS routines -------------------------------------- - - -// -- Basic one-operand macro -- - - -#define INSERT_GENTPROT_BLAS( blasname ) \ -\ -GENTPROT( float, s, blasname ) \ -GENTPROT( double, d, blasname ) \ -GENTPROT( scomplex, c, blasname ) \ -GENTPROT( dcomplex, z, blasname ) - - -// -- Basic one-operand macro with real domain only -- - - -#define INSERT_GENTPROTRO_BLAS( blasname ) \ -\ -GENTPROTRO( float, s, blasname ) \ -GENTPROTRO( double, d, blasname ) - - -// -- Basic one-operand macro with complex domain only and real projection -- - - -#define INSERT_GENTPROTCO_BLAS( blasname ) \ -\ -GENTPROTCO( scomplex, float, c, s, blasname ) \ -GENTPROTCO( dcomplex, double, z, d, blasname ) - - -// -- Basic one-operand macro with conjugation (used only for dot, ger) -- - - -#define INSERT_GENTPROTDOT_BLAS( blasname ) \ -\ -GENTPROTDOT( float, s, , blasname ) \ -GENTPROTDOT( double, d, , blasname ) \ -GENTPROTDOT( scomplex, c, c, blasname ) \ -GENTPROTDOT( scomplex, c, u, blasname ) \ -GENTPROTDOT( dcomplex, z, c, blasname ) \ -GENTPROTDOT( dcomplex, z, u, blasname ) - -#ifdef AOCL_F2C - -#define INSERT_GENTPROTDOT_BLAS_SDC( blasname ) \ -\ -GENTPROTDOT( float, s, , blasname ) \ -GENTPROTDOT( double, d, , blasname ) \ -GENTPROTDOT( scomplex, c, c, blasname ) \ -GENTPROTDOT( scomplex, c, u, blasname ) \ 
-GENTPROTDOT( dcomplex, z, u, blasname ) - -#endif - -// -- Basic one-operand macro with real projection -- - - -#define INSERT_GENTPROTR_BLAS( rblasname, cblasname ) \ -\ -GENTPROTR( float, float, s, s, rblasname ) \ -GENTPROTR( double, double, d, d, rblasname ) \ -GENTPROTR( scomplex, float, c, s, cblasname ) \ -GENTPROTR( dcomplex, double, z, d, cblasname ) - - -// -- Alternate two-operand macro (one char for complex, one for real proj) -- - - -#define INSERT_GENTPROTR2_BLAS( blasname ) \ -\ -GENTPROTR2( float, float, , s, blasname ) \ -GENTPROTR2( double, double, , d, blasname ) \ -GENTPROTR2( scomplex, float, c, s, blasname ) \ -GENTPROTR2( dcomplex, double, z, d, blasname ) - - -// -- Extended two-operand macro (used only for scal) -- - - -#define INSERT_GENTPROTSCAL_BLAS( blasname ) \ -\ -GENTPROTSCAL( float, float, , s, blasname ) \ -GENTPROTSCAL( double, double, , d, blasname ) \ -GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ -GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ -GENTPROTSCAL( float, scomplex, s, c, blasname ) \ -GENTPROTSCAL( double, dcomplex, d, z, blasname ) - -// -- GEMMT specific function -------------------------------------------------- -#define INSERT_GENTPROT_GEMMT(opname, funcname) \ -\ -GENTPROT( float, s, opname, l, funcname ) \ -GENTPROT( double, d, opname, l, funcname ) \ -GENTPROT( float, s, opname, u, funcname ) \ -GENTPROT( double, d, opname, u, funcname ) \ -GENTPROT( scomplex, c, opname, l, funcname ) \ -GENTPROT( dcomplex, z, opname, l, funcname ) \ -GENTPROT( scomplex, c, opname, u, funcname ) \ -GENTPROT( dcomplex, z, opname, u, funcname ) - -// -- Macros for functions with one operand ------------------------------------ - - -// -- Basic one-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT_BASIC0( tfuncname ) \ -\ -GENTPROT( float, s, tfuncname ) \ -GENTPROT( double, d, tfuncname ) \ -GENTPROT( scomplex, c, tfuncname ) \ -GENTPROT( dcomplex, z, tfuncname ) - -// -- (one auxiliary 
argument) -- - -#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ -\ -GENTPROT( float, s, tfuncname, varname ) \ -GENTPROT( double, d, tfuncname, varname ) \ -GENTPROT( scomplex, c, tfuncname, varname ) \ -GENTPROT( dcomplex, z, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTPROT( float, s, tfuncname, varname1, varname2 ) \ -GENTPROT( double, d, tfuncname, varname1, varname2 ) \ -GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ -GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) - -// -- (three auxiliary arguments) -- - -#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ -\ -GENTPROT( float, s, tfuncname, varname1, varname2, varname3 ) \ -GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ -GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ -GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) - -// -- (four auxiliary arguments) -- - -#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ -\ -GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) - - - -// -- Basic one-operand with real projection -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROTR_BASIC0( tfuncname ) \ -\ -GENTPROTR( float, float, s, s, tfuncname ) \ -GENTPROTR( double, double, d, d, tfuncname ) \ -GENTPROTR( scomplex, float, c, s, tfuncname ) \ -GENTPROTR( dcomplex, double, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ -\ -GENTPROTR( float, float, s, s, tfuncname, varname ) \ -GENTPROTR( double, double, d, d, tfuncname, varname ) \ -GENTPROTR( scomplex, float, c, s, 
tfuncname, varname ) \ -GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ -GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ -GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ -GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) - -// -- (three auxiliary arguments) -- - -#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ -\ -GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ -GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ -GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ -GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) - -// -- (four auxiliary arguments) -- - -#define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ -\ -GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ -GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) - - - -// -- Basic one-operand macro with complex domain only and real projection -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ -\ -GENTPROTCO( scomplex, float, c, s, tfuncname ) \ -GENTPROTCO( dcomplex, double, z, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ -\ -GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ -GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) - -// -- (two auxiliary arguments) -- - -#define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ -\ -GENTPROTCO( 
scomplex, float, c, s, tfuncname, varname1, varname2 ) \ -GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) - - - -// -- Basic one-operand macro with integer instance -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT_BASIC0_I( funcname ) \ -\ -GENTPROT( float, s, funcname ) \ -GENTPROT( double, d, funcname ) \ -GENTPROT( scomplex, c, funcname ) \ -GENTPROT( dcomplex, z, funcname ) \ -GENTPROT( gint_t, i, funcname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ -\ -GENTPROT( float, s, tfuncname, varname ) \ -GENTPROT( double, d, tfuncname, varname ) \ -GENTPROT( scomplex, c, tfuncname, varname ) \ -GENTPROT( dcomplex, z, tfuncname, varname ) \ -GENTPROT( gint_t, i, tfuncname, varname ) - - - -// -- Basic one-operand with integer projection -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROTI_BASIC0( funcname ) \ -\ -GENTPROTI( float, gint_t, s, i, funcname ) \ -GENTPROTI( double, gint_t, d, i, funcname ) \ -GENTPROTI( scomplex, gint_t, c, i, funcname ) \ -GENTPROTI( dcomplex, gint_t, z, i, funcname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ -\ -GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ -GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ -GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ -GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) - - - -// -- Basic one-operand with real and integer projections -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROTRI_BASIC( funcname ) \ -\ -GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ -GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ -GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ -GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) - - - - -// -- Macros for functions with two primary operands --------------------------- - - -// -- Basic two-operand macro -- - -// -- (no auxiliary arguments) -- 
- -#define INSERT_GENTPROT2_BASIC0( funcname ) \ -\ -GENTPROT2( float, float, s, s, funcname ) \ -GENTPROT2( double, double, d, d, funcname ) \ -GENTPROT2( scomplex, scomplex, c, c, funcname ) \ -GENTPROT2( dcomplex, dcomplex, z, z, funcname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ -\ -GENTPROT2( float, float, s, s, tfuncname, varname ) \ -GENTPROT2( double, double, d, d, tfuncname, varname ) \ -GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ -GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) - - - -// -- Mixed domain two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT2_MIX_D0( funcname ) \ -\ -GENTPROT2( float, scomplex, s, c, funcname ) \ -GENTPROT2( scomplex, float, c, s, funcname ) \ -\ -GENTPROT2( double, dcomplex, d, z, funcname ) \ -GENTPROT2( dcomplex, double, z, d, funcname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2_MIX_D( tfuncname, varname ) \ -\ -GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ -GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ -\ -GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ -GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) - - - -// -- Mixed precision two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT2_MIX_P0( funcname ) \ -\ -GENTPROT2( float, double, s, d, funcname ) \ -GENTPROT2( float, dcomplex, s, z, funcname ) \ -\ -GENTPROT2( double, float, d, s, funcname ) \ -GENTPROT2( double, scomplex, d, c, funcname ) \ -\ -GENTPROT2( scomplex, double, c, d, funcname ) \ -GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ -\ -GENTPROT2( dcomplex, float, z, s, funcname ) \ -GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ -\ -GENTPROT2( float, double, s, d, tfuncname, varname ) \ -GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ -\ 
-GENTPROT2( double, float, d, s, tfuncname, varname ) \ -GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ -\ -GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ -GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ -\ -GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ -GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ - - - -// -- Mixed domain/precision (all) two-operand macro -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT2_MIXDP0( funcname ) \ -\ -GENTPROT2( float, double, s, d, funcname ) \ -GENTPROT2( float, scomplex, s, c, funcname ) \ -GENTPROT2( float, dcomplex, s, z, funcname ) \ -\ -GENTPROT2( double, float, d, s, funcname ) \ -GENTPROT2( double, scomplex, d, c, funcname ) \ -GENTPROT2( double, dcomplex, d, z, funcname ) \ -\ -GENTPROT2( scomplex, float, c, s, funcname ) \ -GENTPROT2( scomplex, double, c, d, funcname ) \ -GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ -\ -GENTPROT2( dcomplex, float, z, s, funcname ) \ -GENTPROT2( dcomplex, double, z, d, funcname ) \ -GENTPROT2( dcomplex, scomplex, z, c, funcname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ -\ -GENTPROT2( float, double, s, d, tfuncname, varname ) \ -GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ -GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ -\ -GENTPROT2( double, float, d, s, tfuncname, varname ) \ -GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ -GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ -\ -GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ -GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ -GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ -\ -GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ -GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ -GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) - - - -// -- Basic two-operand with real projection of first operand -- - -// -- (no 
auxiliary arguments) -- - -#define INSERT_GENTPROT2R_BASIC0( funcname ) \ -\ -GENTPROT2R( float, float, float, s, s, s, funcname ) \ -GENTPROT2R( double, double, double, d, d, d, funcname ) \ -GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ -GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ -\ -GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ -GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ -GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ -GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) - - - -// -- Mixed domain two-operand with real projection of first operand -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ -\ -GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ -GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ -\ -GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ -GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ -\ -GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ -GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ -\ -GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ -GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) - - - -// -- Mixed precision two-operand with real projection of first operand -- - -// -- (no auxiliary arguments) -- - -#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ -\ -GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ -GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ -\ -GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ -GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ -\ -GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ -GENTPROT2R( scomplex, dcomplex, 
float, c, z, s, tfuncname ) \ -\ -GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ -GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) - -// -- (one auxiliary argument) -- - -#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ -\ -GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ -GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ -\ -GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ -GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ -\ -GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ -GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ -\ -GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ -GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) - - - -// -- Macros for functions with three primary operands ------------------------- - - -// -- Basic three-operand macro -- - - -#define INSERT_GENTPROT3_BASIC( funcname ) \ -\ -GENTPROT3( float, float, float, s, s, s, funcname ) \ -GENTPROT3( double, double, double, d, d, d, funcname ) \ -GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ -GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname ) - - -// -- Mixed domain three-operand macro -- - - -#define INSERT_GENTPROT3_MIX_D( funcname ) \ -\ -GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ -GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ -GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ -\ -GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ -GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ -GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ -\ -GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ -GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ -GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ -\ -GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ -GENTPROT3( dcomplex, double, dcomplex, z, d, z, 
funcname ) \ -GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) - - -// -- Mixed precision three-operand macro -- - - -#define INSERT_GENTPROT3_MIX_P( funcname ) \ -\ -GENTPROT3( float, float, double, s, s, d, funcname ) \ -GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ -\ -GENTPROT3( float, double, float, s, d, s, funcname ) \ -GENTPROT3( float, double, double, s, d, d, funcname ) \ -GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ -GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ -\ -GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ -GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ -\ -GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ -GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ -GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ -GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ -\ -\ -GENTPROT3( double, float, float, d, s, s, funcname ) \ -GENTPROT3( double, float, double, d, s, d, funcname ) \ -GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ -GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ -\ -GENTPROT3( double, double, float, d, d, s, funcname ) \ -GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ -\ -GENTPROT3( double, scomplex, float, d, c, s, funcname ) \ -GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ -GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ -GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ -\ -GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ -GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ -\ -\ -GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ -GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ -\ -GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ -GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ -GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ -GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ -\ 
-GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ -GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ -\ -GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ -GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ -GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ -GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ -\ -\ -GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ -GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ -GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ -GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ -\ -GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ -GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ -\ -GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ -GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ -GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ -GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ -\ -GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ -GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ - - - -// -- Basic three-operand with union of operands 1 and 2 -- - - -#define INSERT_GENTPROT3U12_BASIC( funcname ) \ -\ -GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ -GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ -GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ -GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) - - -// -- Mixed domain three-operand with union of operands 1 and 2 -- - - -#define INSERT_GENTPROT3U12_MIX_D( funcname ) \ -\ -GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ -GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ -GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ -\ -GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ -GENTPROT3U12( 
double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ -GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ -\ -GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ -GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ -GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ -\ -GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ -GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ -GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) - - -// -- Mixed precision three-operand with union of operands 1 and 2 -- - - -#define INSERT_GENTPROT3U12_MIX_P( funcname ) \ -\ -GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ -GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ -\ -GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ -GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ -GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ -GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ -\ -GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ -GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ -\ -GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, z, funcname ) \ -GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ -GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ -GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ -\ -\ -GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ -GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ -GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ -GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ -\ -GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ 
-GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ -\ -GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ -GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ -GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ -GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ -\ -GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ -GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ -\ -\ -GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ -GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ -\ -GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ -GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ -GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ -GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ -\ -GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ -GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ -\ -GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ -GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ -GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ -GENTPROT3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ -\ -\ -GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ -GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ -GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ -GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ -\ -GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ -GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ -\ -GENTPROT3U12( dcomplex, scomplex, float, 
dcomplex, z, c, s, z, funcname ) \ -GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ -GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ -GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ -\ -GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ -GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) - - -#endif -// end bli_gentprot_macro_defs.h - -// begin bli_misc_macro_defs.h - - -#ifndef BLIS_MISC_MACRO_DEFS_H -#define BLIS_MISC_MACRO_DEFS_H - - -// -- Miscellaneous macros -- - -// min, max, abs -// NOTE: These must remain macros since we don't know the types of a and b. - -#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) -#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) -#define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) - -// fmin, fmax, fabs -// NOTE: These must remain macros since we don't know the types of a and b. - -#define bli_fmin( a, b ) bli_min( a, b ) -#define bli_fmax( a, b ) bli_max( a, b ) -#define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) - -// fminabs, fmaxabs -// NOTE: These must remain macros since we don't know the types of a and b. - -#define bli_fminabs( a, b ) \ -\ - bli_fmin( bli_fabs( a ), \ - bli_fabs( b ) ) - -#define bli_fmaxabs( a, b ) \ -\ - bli_fmax( bli_fabs( a ), \ - bli_fabs( b ) ) - -// round - -static double bli_round( double a ) -{ - return round( a ); -} - -// round_to_mult - -static guint_t bli_round_to_mult( guint_t val, guint_t mult ) -{ - return ( guint_t ) - ( ( ( ( guint_t )val + - ( guint_t )mult / 2 - ) / mult - ) * mult - ); -} - -// isnan, isinf -// NOTE: These must remain macros, since isinf() and isnan() are macros -// (defined in math.h) that likely depend on the type of the argument 'a' -// below. 
- -#define bli_isinf( a ) isinf( a ) -#define bli_isnan( a ) isnan( a ) - -// is_odd, is_even - -static bool_t bli_is_odd( gint_t a ) -{ - return ( a % 2 == 1 ); -} - -static bool_t bli_is_even( gint_t a ) -{ - return ( a % 2 == 0 ); -} - -// swap_dims - -static void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) -{ - dim_t temp = *dim1; - *dim1 = *dim2; - *dim2 = temp; -} - -// swap_incs - -static void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) -{ - inc_t temp = *inc1; - *inc1 = *inc2; - *inc2 = temp; -} - -// toggle_bool - -static void bli_toggle_bool( bool_t* b ) -{ - if ( *b == TRUE ) *b = FALSE; - else *b = TRUE; -} - -// return datatype for char - -#define bli_stype ( BLIS_FLOAT ) -#define bli_dtype ( BLIS_DOUBLE ) -#define bli_ctype ( BLIS_SCOMPLEX ) -#define bli_ztype ( BLIS_DCOMPLEX ) - -// return C type for char - -#define bli_sctype float -#define bli_dctype double -#define bli_cctype scomplex -#define bli_zctype dcomplex - -// return real proj of C type for char - -#define bli_sctyper float -#define bli_dctyper double -#define bli_cctyper float -#define bli_zctyper double - - -// return default format specifier for char - -// NOTE: These must remain macros due to the way they are used to initialize -// local char arrays. 
- -#define bli_sformatspec() "%9.2e" -#define bli_dformatspec() "%9.2e" -#define bli_cformatspec() "%9.2e + %9.2e " -#define bli_zformatspec() "%9.2e + %9.2e " -#define bli_iformatspec() "%6d" - - -#endif - -// end bli_misc_macro_defs.h -// begin bli_param_macro_defs.h - - -#ifndef BLIS_PARAM_MACRO_DEFS_H -#define BLIS_PARAM_MACRO_DEFS_H - - -// -- Parameter query macros -- - -// buffer - -static bool_t bli_is_aligned_to( siz_t p, siz_t size ) -{ - return ( bool_t ) - ( p % size == 0 ); -} - -static bool_t bli_is_unaligned_to( siz_t p, siz_t size ) -{ - return ( bool_t ) - ( p % size != 0 ); -} - -static siz_t bli_offset_past_alignment( siz_t p, siz_t size ) -{ - return ( siz_t ) - ( p % size ); -} - - -// datatype - -static bool_t bli_is_float( num_t dt ) -{ - return ( bool_t ) - ( dt == BLIS_FLOAT ); -} - -static bool_t bli_is_double( num_t dt ) -{ - return ( bool_t ) - ( dt == BLIS_DOUBLE ); -} - -static bool_t bli_is_scomplex( num_t dt ) -{ - return ( bool_t ) - ( dt == BLIS_SCOMPLEX ); -} - -static bool_t bli_is_dcomplex( num_t dt ) -{ - return ( bool_t ) - ( dt == BLIS_DCOMPLEX ); -} - -static bool_t bli_is_constant( num_t dt ) -{ - return ( bool_t ) - ( dt == BLIS_CONSTANT ); -} - -static bool_t bli_is_int( num_t dt ) -{ - return ( bool_t ) - ( dt == BLIS_INT ); -} - -static bool_t bli_is_real( num_t dt ) -{ - return ( bool_t ) - ( bli_is_float( dt ) || - bli_is_double( dt ) ); -} - -static bool_t bli_is_complex( num_t dt ) -{ - return ( bool_t ) - ( bli_is_scomplex( dt ) || - bli_is_dcomplex( dt ) ); -} - -static bool_t bli_is_single_prec( num_t dt ) -{ - return ( bool_t ) - ( bli_is_float( dt ) || - bli_is_scomplex( dt ) ); -} - -static bool_t bli_is_double_prec( num_t dt ) -{ - return ( bool_t ) - ( bli_is_double( dt ) || - bli_is_dcomplex( dt ) ); -} - -static dom_t bli_dt_domain( num_t dt ) -{ - return ( dom_t ) - ( dt & BLIS_DOMAIN_BIT ); -} - -static bool_t bli_dt_dom_is_real( num_t dt ) -{ - return ( bool_t ) - ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL 
); -} - -static bool_t bli_dt_dom_is_complex( num_t dt ) -{ - return ( bool_t ) - ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); -} - -static prec_t bli_dt_prec( num_t dt ) -{ - return ( prec_t ) - ( dt & BLIS_PRECISION_BIT ); -} - -static bool_t bli_dt_prec_is_single( num_t dt ) -{ - return ( bool_t ) - ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); -} - -static bool_t bli_dt_prec_is_double( num_t dt ) -{ - return ( bool_t ) - ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); -} - -static num_t bli_dt_proj_to_real( num_t dt ) -{ - return ( num_t ) - ( dt & ~BLIS_BITVAL_COMPLEX ); -} - -static num_t bli_dt_proj_to_complex( num_t dt ) -{ - return ( num_t ) - ( dt | BLIS_BITVAL_COMPLEX ); -} - -static num_t bli_dt_proj_to_single_prec( num_t dt ) -{ - return ( num_t ) - ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); -} - -static num_t bli_dt_proj_to_double_prec( num_t dt ) -{ - return ( num_t ) - ( dt | BLIS_BITVAL_DOUBLE_PREC ); -} - - -// trans - -static bool_t bli_is_notrans( trans_t trans ) -{ - return ( bool_t ) - ( trans == BLIS_NO_TRANSPOSE ); -} - -static bool_t bli_is_trans( trans_t trans ) -{ - return ( bool_t ) - ( trans == BLIS_TRANSPOSE ); -} - -static bool_t bli_is_conjnotrans( trans_t trans ) -{ - return ( bool_t ) - ( trans == BLIS_CONJ_NO_TRANSPOSE ); -} - -static bool_t bli_is_conjtrans( trans_t trans ) -{ - return ( bool_t ) - ( trans == BLIS_CONJ_TRANSPOSE ); -} - -static bool_t bli_does_notrans( trans_t trans ) -{ - return ( bool_t ) - ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); -} - -static bool_t bli_does_trans( trans_t trans ) -{ - return ( bool_t ) - ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); -} - -static bool_t bli_does_noconj( trans_t trans ) -{ - return ( bool_t ) - ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); -} - -static bool_t bli_does_conj( trans_t trans ) -{ - return ( bool_t ) - ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); -} - -static trans_t bli_extract_trans( trans_t trans ) -{ - return ( trans_t ) - ( 
trans & BLIS_TRANS_BIT ); -} - -static conj_t bli_extract_conj( trans_t trans ) -{ - return ( conj_t ) - ( trans & BLIS_CONJ_BIT ); -} - -static trans_t bli_trans_toggled( trans_t trans ) -{ - return ( trans_t ) - ( trans ^ BLIS_TRANS_BIT ); -} - -static trans_t bli_trans_toggled_conj( trans_t trans ) -{ - return ( trans_t ) - ( trans ^ BLIS_CONJ_BIT ); -} - -static void bli_toggle_trans( trans_t* trans ) -{ - *trans = bli_trans_toggled( *trans ); -} - - -// side - -static bool_t bli_is_left( side_t side ) -{ - return ( bool_t ) - ( side == BLIS_LEFT ); -} - -static bool_t bli_is_right( side_t side ) -{ - return ( bool_t ) - ( side == BLIS_RIGHT ); -} - -static side_t bli_side_toggled( side_t side ) -{ - return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); -} - -static void bli_toggle_side( side_t* side ) -{ - *side = bli_side_toggled( *side ); -} - - -// uplo - -static bool_t bli_is_lower( uplo_t uplo ) -{ - return ( bool_t ) - ( uplo == BLIS_LOWER ); -} - -static bool_t bli_is_upper( uplo_t uplo ) -{ - return ( bool_t ) - ( uplo == BLIS_UPPER ); -} - -static bool_t bli_is_upper_or_lower( uplo_t uplo ) -{ - return ( bool_t ) - ( bli_is_upper( uplo ) || - bli_is_lower( uplo ) ); -} - -static bool_t bli_is_dense( uplo_t uplo ) -{ - return ( bool_t ) - ( uplo == BLIS_DENSE ); -} - -static bool_t bli_is_zeros( uplo_t uplo ) -{ - return ( bool_t ) - ( uplo == BLIS_ZEROS ); -} - -static uplo_t bli_uplo_toggled( uplo_t uplo ) -{ - return ( uplo_t ) - ( bli_is_upper_or_lower( uplo ) ? 
- ( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo - ); -} - -static void bli_toggle_uplo( uplo_t* uplo ) -{ - *uplo = bli_uplo_toggled( *uplo ); -} - - -// structure - -static bool_t bli_is_general( struc_t struc ) -{ - return ( bool_t ) - ( struc == BLIS_GENERAL ); -} - -static bool_t bli_is_hermitian( struc_t struc ) -{ - return ( bool_t ) - ( struc == BLIS_HERMITIAN ); -} - -static bool_t bli_is_symmetric( struc_t struc ) -{ - return ( bool_t ) - ( struc == BLIS_SYMMETRIC ); -} - -static bool_t bli_is_triangular( struc_t struc ) -{ - return ( bool_t ) - ( struc == BLIS_TRIANGULAR ); -} - -static bool_t bli_is_herm_or_symm( struc_t struc ) -{ - return ( bool_t ) - ( bli_is_hermitian( struc ) || - bli_is_symmetric( struc ) ); -} - - -// conj - -static bool_t bli_is_noconj( conj_t conj ) -{ - return ( bool_t ) - ( conj == BLIS_NO_CONJUGATE ); -} - -static bool_t bli_is_conj( conj_t conj ) -{ - return ( bool_t ) - ( conj == BLIS_CONJUGATE ); -} - -static conj_t bli_conj_toggled( conj_t conj ) -{ - return ( conj_t ) - ( conj ^ BLIS_CONJ_BIT ); -} - -static conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) -{ - return ( conj_t ) - ( conj ^ conjapp ); -} - -static void bli_toggle_conj( conj_t* conj ) -{ - *conj = bli_conj_toggled( *conj ); -} - - -// diag - -static bool_t bli_is_nonunit_diag( diag_t diag ) -{ - return ( bool_t ) - ( diag == BLIS_NONUNIT_DIAG ); -} - -static bool_t bli_is_unit_diag( diag_t diag ) -{ - return ( bool_t ) - ( diag == BLIS_UNIT_DIAG ); -} - - -// dimension-related - -static bool_t bli_zero_dim1( dim_t m ) -{ - return ( bool_t ) - ( m == 0 ); -} - -static bool_t bli_zero_dim2( dim_t m, dim_t n ) -{ - return ( bool_t ) - ( m == 0 || n == 0 ); -} - -static bool_t bli_zero_dim3( dim_t m, dim_t n, dim_t k ) -{ - return ( bool_t ) - ( m == 0 || n == 0 || k == 0 ); -} - -static bool_t bli_nonzero_dim( dim_t m ) -{ - return ( bool_t ) - ( m > 0 ); -} - -static bool_t bli_vector_dim( dim_t m, dim_t n ) -{ - return ( bool_t ) - ( m == 1 ? 
n : m ); -} - -static bool_t bli_is_vector( dim_t m, dim_t n ) -{ - return ( bool_t ) - ( m == 1 || n == 1 ); -} - -static bool_t bli_is_row_vector( dim_t m, dim_t n ) -{ - return ( bool_t ) - ( m == 1 ); -} - -static bool_t bli_is_col_vector( dim_t m, dim_t n ) -{ - return ( bool_t ) - ( n == 1 ); -} - -static void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) -{ - if ( bli_is_left( side ) ) *dim = m; - else *dim = n; -} - -static void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) -{ - if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } - else { *mt = n; *nt = m; } -} - -static void bli_set_dims_incs_with_trans( trans_t trans, - dim_t m, dim_t n, inc_t rs, inc_t cs, - dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) -{ - if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } - else { *mt = n; *nt = m; *rst = cs; *cst = rs; } -} - - -// blocksize-related - -static dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) -{ - return ( dim_t ) - ( bli_min( b_alg, dim - i ) ); -} - -static dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) -{ - return ( dim_t ) - ( i == 0 && dim % b_alg != 0 ? dim % b_alg - : b_alg ); -} - - -// stride-related - -static inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) -{ - return ( inc_t ) - ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) - : ( m == 1 ? 
rs : cs ) ); -} - -static bool_t bli_is_row_stored( inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( bli_abs( cs ) == 1 ); -} - -static bool_t bli_is_col_stored( inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( bli_abs( rs ) == 1 ); -} - -static bool_t bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( cs == 1 && ( rs > 1 || n == 1 ) ); -} - -static bool_t bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( rs == 1 && ( cs > 1 || m == 1 ) ); -} - -static bool_t bli_is_gen_stored( inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( bli_abs( rs ) != 1 && - bli_abs( cs ) != 1 ); -} - -static bool_t bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( bli_abs( cs ) == bli_abs( rs ) - ? n < m - : bli_abs( cs ) < bli_abs( rs ) ); -} - -static bool_t bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) -{ - return ( bool_t ) - ( bli_abs( rs ) == bli_abs( cs ) - ? m < n - : bli_abs( rs ) < bli_abs( cs ) ); -} - -static bool_t bli_has_nonunit_inc1( inc_t s1 ) -{ - return ( bool_t ) - ( s1 != 1 ); -} - -static bool_t bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) -{ - return ( bool_t ) - ( s1 != 1 || s2 != 1 ); -} - -static bool_t bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) -{ - return ( bool_t ) - ( s1 != 1 || s2 != 1 || s3 != 1 ); -} - -// offset-relate - -static bool_t bli_gemmt_is_strictly_below_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( n_off + n - 1 ) < m_off ); -} - -static bool_t bli_gemmt_is_strictly_above_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( m_off + m - 1 ) < n_off ); -} -// diag offset-related - -static void bli_negate_diag_offset( doff_t* diagoff ) -{ - *diagoff = -(*diagoff); -} - -static void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) -{ - if ( bli_is_upper( uplo ) ) *diagoff -= 1; - else if ( bli_is_lower( uplo ) ) *diagoff += 1; -} - -static void 
bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) -{ - if ( bli_is_upper( uplo ) ) *diagoff += 1; - else if ( bli_is_lower( uplo ) ) *diagoff -= 1; -} - -static bool_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) -{ - return ( bool_t ) - ( bli_does_trans( trans ) ? -diagoff : diagoff ); -} - -static bool_t bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( bli_does_trans( trans ) - ? ( ( doff_t )n <= -diagoff ) - : ( ( doff_t )m <= -diagoff ) ); -} - -static bool_t bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( bli_does_trans( trans ) - ? ( ( doff_t )m <= diagoff ) - : ( ( doff_t )n <= diagoff ) ); -} - -static bool_t bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || - bli_is_strictly_below_diag( diagoff, trans, m, n ) ); -} - -static bool_t bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || - ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); -} - -static bool_t bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || - ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); -} - -static bool_t bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( doff_t )m <= -diagoff ); -} - -static bool_t bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( doff_t )n <= diagoff ); -} - -static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( !bli_is_strictly_above_diag_n( 
diagoff, m, n ) && - !bli_is_strictly_below_diag_n( diagoff, m, n ) ); -} - -static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( bli_is_strictly_above_diag_n( diagoff, m, n ) || - bli_is_strictly_below_diag_n( diagoff, m, n ) ); -} - -static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || - ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); -} - -static bool_t bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) -{ - return ( bool_t ) - ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || - ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); -} - - -// pruning-related - -static void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) -{ - *offm_inc = 0; - - // If the diagonal intersects the left side of the matrix, - // ignore the area above that intersection. - if ( *diagoff < 0 ) - { - *m = *m + *diagoff; - *offm_inc = - *diagoff; - *diagoff = 0; - } -} - -static void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) -{ - *offn_inc = 0; - - // If the diagonal intersects the bottom side of the matrix, - // ignore the area to the right of that intersection. - if ( *n > *diagoff + *m ) - { - *n = *diagoff + *m; - } -} - -static void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) -{ - *offn_inc = 0; - - // If the diagonal intersects the top side of the matrix, - // ignore the area to the left of that intersection. 
- if ( *diagoff > 0 ) - { - *n = *n - *diagoff; - *offn_inc = + *diagoff; - *diagoff = 0; - } -} - -static void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) -{ - *offm_inc = 0; - - // If the diagonal intersects the right side of the matrix, - // ignore the area below that intersection. - if ( *m > -(*diagoff) + *n ) - { - *m = -(*diagoff) + *n; - } -} - - -// thread range-related - -static void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) -{ - *diagoff = *n - *diagoff - *m; - bli_toggle_uplo( uplo ); -} - -static void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) -{ - bli_swap_dims( m, n ); - bli_negate_diag_offset( diagoff ); - bli_toggle_uplo( uplo ); -} - -static void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) -{ - dim_t start2 = n - *start; - dim_t end2 = n - *end; - *start = end2; - *end = start2; -} - - -// mdim_t-related - -static bool_t bli_is_m_dim( mdim_t mdim ) -{ - return ( bool_t ) - ( mdim == BLIS_M ); -} - -static bool_t bli_is_n_dim( mdim_t mdim ) -{ - return ( bool_t ) - ( mdim == BLIS_N ); -} - -static mdim_t bli_dim_toggled( mdim_t mdim ) -{ - return ( mdim == BLIS_M ? BLIS_N : BLIS_M ); -} - -static void bli_toggle_dim( mdim_t* mdim ) -{ - *mdim = bli_dim_toggled( *mdim ); -} - - -// stor3_t-related - -static stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, - inc_t rs_a, inc_t cs_a, - inc_t rs_b, inc_t cs_b ) -{ - // If any matrix is general-stored, return the stor3_t id for the - // general-purpose sup microkernel. - if ( bli_is_gen_stored( rs_c, cs_c ) || - bli_is_gen_stored( rs_a, cs_a ) || - bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; - - // Otherwise, compute and return the stor3_t id as follows. 
- const bool_t c_is_col = bli_is_col_stored( rs_c, cs_c ); - const bool_t a_is_col = bli_is_col_stored( rs_a, cs_a ); - const bool_t b_is_col = bli_is_col_stored( rs_b, cs_b ); - - return ( stor3_t )( 4 * c_is_col + - 2 * a_is_col + - 1 * b_is_col ); -} - -static stor3_t bli_stor3_trans( stor3_t id ) -{ -#if 1 - stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] - = - { - ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 - ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 - ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 - ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 - ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 - ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 - ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 - ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 - }; - - return map[id]; -#else - return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit - ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position - ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position -#endif -} - -static stor3_t bli_stor3_transa( stor3_t id ) -{ -#if 0 - stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] - = - { - ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 - ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 - ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 - ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 - ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 - ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 - ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 - ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 - }; - - return map[id]; -#else - return ( stor3_t )( id ^ 0x1 ); -#endif -} - -static stor3_t bli_stor3_transb( stor3_t id ) -{ -#if 0 - stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] - = - { - ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 - ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 - ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 - ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 - ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 - ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 - ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 - ( 
stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 - }; - - return map[id]; -#else - return ( stor3_t )( id ^ 0x2 ); -#endif -} - - - -// index-related - -static bool_t bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) -{ - return ( bool_t ) - ( i == n_iter - 1 && n_left != 0 ); -} - -static bool_t bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) -{ - return ( bool_t ) - ( i != n_iter - 1 || n_left == 0 ); -} - -static bool_t bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) -{ - return ( bool_t ) - ( i == 0 && n_left != 0 ); -} - -static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) -{ - return ( bool_t ) - ( i != 0 || n_left == 0 ); -} - -static bool_t bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) -{ - return ( bool_t ) - ( i == end_iter - 1 ); -} - -static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) -{ - return ( bool_t ) - ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); -} - -static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) -{ -#ifdef BLIS_ENABLE_JRIR_SLAB - return bli_is_last_iter_sl( i, end_iter, tid, nth ); -#else // BLIS_ENABLE_JRIR_RR - return bli_is_last_iter_rr( i, end_iter, tid, nth ); -#endif -} - - -// packbuf_t-related - -static guint_t bli_packbuf_index( packbuf_t buf_type ) -{ - return ( guint_t ) - ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); -} - -// pack_t-related - -static bool_t bli_is_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_BIT ); -} - -static bool_t bli_is_row_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_ROWS ); -} - -static bool_t bli_is_col_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_COLUMNS ); -} - -static bool_t bli_is_panel_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & 
BLIS_PACK_PANEL_BIT ); -} - -static bool_t bli_is_4mi_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_4MI; -} - -static bool_t bli_is_3mi_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3MI; -} - -static bool_t bli_is_3ms_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3MS; -} - -static bool_t bli_is_ro_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO; -} - -static bool_t bli_is_io_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_IO; -} - -static bool_t bli_is_rpi_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RPI; -} - -static bool_t bli_is_rih_packed( pack_t schema ) -{ - return ( bool_t ) - ( bli_is_ro_packed( schema ) || - bli_is_io_packed( schema ) || - bli_is_rpi_packed( schema ) ); -} - -static bool_t bli_is_1r_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R; -} - -static bool_t bli_is_1e_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E; -} - -static bool_t bli_is_1m_packed( pack_t schema ) -{ - return ( bool_t ) - ( bli_is_1r_packed( schema ) || - bli_is_1e_packed( schema ) ); -} - -static bool_t bli_is_nat_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) == 0; -} - -static bool_t bli_is_ind_packed( pack_t schema ) -{ - return ( bool_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) != 0; -} - -static guint_t bli_pack_schema_index( pack_t schema ) -{ - return ( guint_t ) - ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT; -} - - - -// pointer-related - -// Increment a pointer by an integer fraction: -// p0 + (num/dem) -// where p0 is a pointer to a datatype of size sizeof_p0. 
-static void_fp bli_ptr_inc_by_frac( void_fp p0, siz_t sizeof_p0, dim_t num, dim_t den ) -{ - return ( void_fp ) - ( ( char* )p0 + ( ( num * ( dim_t )sizeof_p0 ) / den ) ); -} - - - -// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix -// argument. - -static -void bli_set_dims_incs_uplo_1m - ( - doff_t diagoffa, diag_t diaga, - uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, - uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, - dim_t* ij0, dim_t* n_shift - ) -{ - // This is to prevent the compiler from warning about uninitialized - // variables. - *ij0 = 0; - *n_shift = 0; - - // If matrix A is entirely "unstored", that is, if either: - // - A is lower-stored and entirely above the diagonal, or - // - A is upper-stored and entirely below the diagonal - // then we mark the storage as implicitly zero. - if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) - { - *uplo_eff = BLIS_ZEROS; - } - else - { - doff_t diagoffa_use_ = diagoffa; - doff_t diagoff_eff_; - dim_t n_iter_max_; - - if ( bli_is_unit_diag( diaga ) ) - bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); - - // If matrix A is entirely "stored", that is, if either: - // - A is upper-stored and entirely above the diagonal, or - // - A is lower-stored and entirely below the diagonal - // then we mark the storage as dense. 
- if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) - uploa = BLIS_DENSE; - - n_iter_max_ = n; - *n_elem_max = m; - *inca = rs_a; - *lda = cs_a; - *uplo_eff = uploa; - diagoff_eff_ = diagoffa_use_; - - if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) - { - bli_swap_dims( &n_iter_max_, n_elem_max ); - bli_swap_incs( inca, lda ); - bli_toggle_uplo( uplo_eff ); - bli_negate_diag_offset( &diagoff_eff_ ); - } - - if ( bli_is_dense( *uplo_eff ) ) - { - *n_iter = n_iter_max_; - } - else if ( bli_is_upper( *uplo_eff ) ) - { - if ( diagoff_eff_ < 0 ) - { - *ij0 = 0; - *n_shift = -diagoff_eff_; - *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); - *n_iter = n_iter_max_; - } - else - { - *ij0 = diagoff_eff_; - *n_shift = 0; - *n_iter = n_iter_max_ - diagoff_eff_; - } - } - else // if ( bli_is_lower( *uplo_eff ) ) - { - if ( diagoff_eff_ < 0 ) - { - *ij0 = -diagoff_eff_; - *n_shift = 0; - *n_elem_max = *n_elem_max + diagoff_eff_; - *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); - } - else - { - *ij0 = 0; - *n_shift = diagoff_eff_; - *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); - } - } - } -} - -// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix -// argument (without column-wise stride optimization). - -static -void bli_set_dims_incs_uplo_1m_noswap - ( - doff_t diagoffa, diag_t diaga, - uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, - uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, - dim_t* ij0, dim_t* n_shift - ) -{ - // This is to prevent the compiler from warning about uninitialized - // variables. - *ij0 = 0; - *n_shift = 0; - - // If matrix A is entirely "unstored", that is, if either: - // - A is lower-stored and entirely above the diagonal, or - // - A is upper-stored and entirely below the diagonal - // then we mark the storage as implicitly zero. 
- if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) - { - *uplo_eff = BLIS_ZEROS; - } - else - { - doff_t diagoffa_use_ = diagoffa; - doff_t diagoff_eff_; - dim_t n_iter_max_; - - if ( bli_is_unit_diag( diaga ) ) - bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); - - // If matrix A is entirely "stored", that is, if either: - // - A is upper-stored and entirely above the diagonal, or - // - A is lower-stored and entirely below the diagonal - // then we mark the storage as dense. - if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) - uploa = BLIS_DENSE; - - n_iter_max_ = n; - *n_elem_max = m; - *inca = rs_a; - *lda = cs_a; - *uplo_eff = uploa; - diagoff_eff_ = diagoffa_use_; - - if ( bli_is_dense( *uplo_eff ) ) - { - *n_iter = n_iter_max_; - } - else if ( bli_is_upper( *uplo_eff ) ) - { - if ( diagoff_eff_ < 0 ) - { - *ij0 = 0; - *n_shift = -diagoff_eff_; - *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); - *n_iter = n_iter_max_; - } - else - { - *ij0 = diagoff_eff_; - *n_shift = 0; - *n_iter = n_iter_max_ - diagoff_eff_; - } - } - else // if ( bli_is_lower( *uplo_eff ) ) - { - if ( diagoff_eff_ < 0 ) - { - *ij0 = -diagoff_eff_; - *n_shift = 0; - *n_elem_max = *n_elem_max + diagoff_eff_; - *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); - } - else - { - *ij0 = 0; - *n_shift = diagoff_eff_; - *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); - } - } - } -} - -// Set dimensions and increments for TWO matrix arguments. 
- -static -void bli_set_dims_incs_2m - ( - trans_t transa, - dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, - inc_t rs_b, inc_t cs_b, - dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, - inc_t* incb, inc_t* ldb - ) -{ - { - *n_iter = n; - *n_elem = m; - *inca = rs_a; - *lda = cs_a; - *incb = rs_b; - *ldb = cs_b; - - if ( bli_does_trans( transa ) ) - { - bli_swap_incs( inca, lda ); - } - - if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && - bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) - { - bli_swap_dims( n_iter, n_elem ); - bli_swap_incs( inca, lda ); - bli_swap_incs( incb, ldb ); - } - } -} - -// Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix -// arguments. - -static -void bli_set_dims_incs_uplo_2m - ( - doff_t diagoffa, diag_t diaga, trans_t transa, - uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, - inc_t rs_b, inc_t cs_b, - uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, - inc_t* incb, inc_t* ldb, - dim_t* ij0, dim_t* n_shift - ) -{ - // This is to prevent the compiler from warning about uninitialized - // variables. - *ij0 = 0; - *n_shift = 0; - - // If matrix A is entirely "unstored", that is, if either: - // - A is lower-stored and entirely above the diagonal, or - // - A is upper-stored and entirely below the diagonal - // then we mark the storage as implicitly zero. - if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) - { - *uplo_eff = BLIS_ZEROS; - } - else - { - doff_t diagoffa_use_ = diagoffa; - doff_t diagoff_eff_; - dim_t n_iter_max_; - - if ( bli_is_unit_diag( diaga ) ) - bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); - - // If matrix A is entirely "stored", that is, if either: - // - A is upper-stored and entirely above the diagonal, or - // - A is lower-stored and entirely below the diagonal - // then we mark the storage as dense. 
- if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) - uploa = BLIS_DENSE; - - n_iter_max_ = n; - *n_elem_max = m; - *inca = rs_a; - *lda = cs_a; - *incb = rs_b; - *ldb = cs_b; - *uplo_eff = uploa; - diagoff_eff_ = diagoffa_use_; - - if ( bli_does_trans( transa ) ) - { - bli_swap_incs( inca, lda ); - bli_toggle_uplo( uplo_eff ); - bli_negate_diag_offset( &diagoff_eff_ ); - } - - if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && - bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) - { - bli_swap_dims( &n_iter_max_, n_elem_max ); - bli_swap_incs( inca, lda ); - bli_swap_incs( incb, ldb ); - bli_toggle_uplo( uplo_eff ); - bli_negate_diag_offset( &diagoff_eff_ ); - } - - if ( bli_is_dense( *uplo_eff ) ) - { - *n_iter = n_iter_max_; - } - else if ( bli_is_upper( *uplo_eff ) ) - { - if ( diagoff_eff_ < 0 ) - { - *ij0 = 0; - *n_shift = -diagoff_eff_; - *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); - *n_iter = n_iter_max_; - } - else - { - *ij0 = diagoff_eff_; - *n_shift = 0; - *n_iter = n_iter_max_ - diagoff_eff_; - } - } - else // if ( bli_is_lower( *uplo_eff ) ) - { - if ( diagoff_eff_ < 0 ) - { - *ij0 = -diagoff_eff_; - *n_shift = 0; - *n_elem_max = *n_elem_max + diagoff_eff_; - *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); - } - else - { - *ij0 = 0; - *n_shift = diagoff_eff_; - *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); - } - } - } -} - -// Set dimensions, increments, etc for ONE matrix argument when operating -// on the diagonal. 
- -static -void bli_set_dims_incs_1d - ( - doff_t diagoffx, - dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, - dim_t* offx, dim_t* n_elem, inc_t* incx - ) -{ - if ( diagoffx < 0 ) - { - *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); - *offx = ( dim_t )(-diagoffx) * rs_x; - } - else - { - *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); - *offx = ( dim_t )( diagoffx) * cs_x; - } - - *incx = rs_x + cs_x; \ -} - -// Set dimensions, increments, etc for TWO matrix arguments when operating -// on diagonals. -static -void bli_set_dims_incs_2d - ( - doff_t diagoffx, trans_t transx, - dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, - inc_t rs_y, inc_t cs_y, - dim_t* offx, dim_t* offy, dim_t* n_elem, - inc_t* incx, inc_t* incy - ) -{ - doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); - - if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; - else *offx = diagoffx * cs_x; - - if ( diagoffy_ < 0 ) - { - *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); - *offy = -diagoffy_ * rs_y; - } - else - { - *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); - *offy = diagoffy_ * cs_y; - } - - *incx = rs_x + cs_x; - *incy = rs_y + cs_y; -} - - -#endif -// end bli_param_macro_defs.h -// begin bli_obj_macro_defs.h - - -#ifndef BLIS_OBJ_MACRO_DEFS_H -#define BLIS_OBJ_MACRO_DEFS_H - - -// -- Object query/modification macros -- - -// Info query - -static num_t bli_obj_dt( obj_t* obj ) -{ - return ( num_t ) - ( obj->info & BLIS_DATATYPE_BITS ); -} - -static bool_t bli_obj_is_float( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); -} - -static bool_t bli_obj_is_double( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); -} - -static bool_t bli_obj_is_scomplex( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); -} - -static bool_t bli_obj_is_dcomplex( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); -} - -static bool_t bli_obj_is_int( 
obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); -} - -static bool_t bli_obj_is_const( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); -} - -static dom_t bli_obj_domain( obj_t* obj ) -{ - return ( dom_t ) - ( obj->info & BLIS_DOMAIN_BIT ); -} - -static prec_t bli_obj_prec( obj_t* obj ) -{ - return ( prec_t ) - ( obj->info & BLIS_PRECISION_BIT ); -} - -static bool_t bli_obj_is_single_prec( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); -} - -static bool_t bli_obj_is_double_prec( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); -} - -static num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) -{ - return ( num_t ) - ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); -} - -static num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) -{ - return ( num_t ) - ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); -} - -static bool_t bli_obj_is_real( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && - !bli_obj_is_const( obj ) ); -} - -static bool_t bli_obj_is_complex( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && - !bli_obj_is_const( obj ) ); -} - -static num_t bli_obj_dt_proj_to_real( obj_t* obj ) -{ - return ( num_t ) - ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); -} - -static num_t bli_obj_dt_proj_to_complex( obj_t* obj ) -{ - return ( num_t ) - ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); -} - -static num_t bli_obj_target_dt( obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); -} - -static dom_t bli_obj_target_domain( obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); -} - -static prec_t bli_obj_target_prec( obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); -} - -static num_t bli_obj_exec_dt( 
obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); -} - -static dom_t bli_obj_exec_domain( obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); -} - -static prec_t bli_obj_exec_prec( obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); -} - -static num_t bli_obj_comp_dt( obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); -} - -static dom_t bli_obj_comp_domain( obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); -} - -static prec_t bli_obj_comp_prec( obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); -} - -// NOTE: This function queries info2. -static num_t bli_obj_scalar_dt( obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); -} - -// NOTE: This function queries info2. -static dom_t bli_obj_scalar_domain( obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); -} - -// NOTE: This function queries info2. 
-static prec_t bli_obj_scalar_prec( obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); -} - -static trans_t bli_obj_conjtrans_status( obj_t* obj ) -{ - return ( trans_t ) - ( obj->info & BLIS_CONJTRANS_BITS ); -} - -static trans_t bli_obj_onlytrans_status( obj_t* obj ) -{ - return ( trans_t ) - ( obj->info & BLIS_TRANS_BIT ); -} - -static bool_t bli_obj_has_trans( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); -} - -static bool_t bli_obj_has_notrans( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); -} - -static conj_t bli_obj_conj_status( obj_t* obj ) -{ - return ( conj_t ) - ( obj->info & BLIS_CONJ_BIT ); -} - -static bool_t bli_obj_has_conj( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); -} - -static bool_t bli_obj_has_noconj( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); -} - -static uplo_t bli_obj_uplo( obj_t* obj ) -{ - return ( uplo_t ) - ( obj->info & BLIS_UPLO_BITS ); -} - -static bool_t bli_obj_is_upper( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); -} - -static bool_t bli_obj_is_lower( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); -} - -static bool_t bli_obj_is_upper_or_lower( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_is_upper( obj ) || - bli_obj_is_lower( obj ) ); -} - -static bool_t bli_obj_is_dense( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_uplo( obj ) == BLIS_BITVAL_DENSE ); -} - -static bool_t bli_obj_is_zeros( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); -} - -static diag_t bli_obj_diag( obj_t* obj ) -{ - return ( diag_t ) - ( obj->info & BLIS_UNIT_DIAG_BIT ); -} - -static bool_t bli_obj_has_nonunit_diag( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_diag( obj ) == 
BLIS_BITVAL_NONUNIT_DIAG ); -} - -static bool_t bli_obj_has_unit_diag( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); -} - -static bool_t bli_obj_has_inverted_diag( obj_t* obj ) -{ - return ( bool_t ) - ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); -} - -static bool_t bli_obj_is_pack_rev_if_upper( obj_t* obj ) -{ - return ( bool_t ) - ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); -} - -static bool_t bli_obj_is_pack_rev_if_lower( obj_t* obj ) -{ - return ( bool_t ) - ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); -} - -static pack_t bli_obj_pack_schema( obj_t* obj ) -{ - return ( pack_t ) - ( obj->info & BLIS_PACK_SCHEMA_BITS ); -} - -static bool_t bli_obj_is_packed( obj_t* obj ) -{ - return ( bool_t ) - ( obj->info & BLIS_PACK_BIT ); -} - -static bool_t bli_obj_is_row_packed( obj_t* obj ) -{ - return ( bool_t ) - ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_ROWS ); -} - -static bool_t bli_obj_is_col_packed( obj_t* obj ) -{ - return ( bool_t ) - ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_COLUMNS ); -} - -static bool_t bli_obj_is_panel_packed( obj_t* obj ) -{ - return ( bool_t ) - ( obj->info & BLIS_PACK_PANEL_BIT ); -} - -static packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) -{ - return ( packbuf_t ) - ( obj->info & BLIS_PACK_BUFFER_BITS ); -} - -static struc_t bli_obj_struc( obj_t* obj ) -{ - return ( struc_t ) - ( obj->info & BLIS_STRUC_BITS ); -} - -static bool_t bli_obj_is_general( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); -} - -static bool_t bli_obj_is_hermitian( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); -} - -static bool_t bli_obj_is_symmetric( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); -} - -static bool_t 
bli_obj_is_triangular( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); -} - -// Info modification - -static void bli_obj_apply_trans( trans_t trans, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info ^ trans ); -} - -static void bli_obj_apply_conj( conj_t conj, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info ^ conj ); -} - -static void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans; -} - -static void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_TRANS_BIT ) | trans; -} - -static void bli_obj_set_conj( conj_t conj, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_CONJ_BIT ) | conj; -} - -static void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_UPLO_BITS ) | uplo; -} - -static void bli_obj_set_diag( diag_t diag, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag; -} - -static void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag; -} - -static void bli_obj_set_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_DATATYPE_BITS ) | dt; -} - -static void bli_obj_set_target_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_TARGET_DT_BITS ) | - ( dt << BLIS_TARGET_DT_SHIFT ); -} - -static void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | - ( dt << BLIS_TARGET_DT_SHIFT ); -} - -static void bli_obj_set_target_prec( prec_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_TARGET_PREC_BIT ) | - ( dt << BLIS_TARGET_DT_SHIFT ); -} - -static void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & 
~BLIS_EXEC_DT_BITS ) | - ( dt << BLIS_EXEC_DT_SHIFT ); -} - -static void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | - ( dt << BLIS_EXEC_DT_SHIFT ); -} - -static void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_EXEC_PREC_BIT ) | - ( dt << BLIS_EXEC_DT_SHIFT ); -} - -static void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_COMP_DT_BITS ) | - ( dt << BLIS_COMP_DT_SHIFT ); -} - -static void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | - ( dt << BLIS_COMP_DT_SHIFT ); -} - -static void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_COMP_PREC_BIT ) | - ( dt << BLIS_COMP_DT_SHIFT ); -} - -// NOTE: This function queries and modifies info2. -static void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) -{ - obj->info2 = ( objbits_t ) - ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | - ( dt << BLIS_SCALAR_DT_SHIFT ); -} - -// NOTE: This function queries and modifies info2. -static void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) -{ - obj->info2 = ( objbits_t ) - ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | - ( dt << BLIS_SCALAR_DT_SHIFT ); -} - -// NOTE: This function queries and modifies info2. 
-static void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) -{ - obj->info2 = ( objbits_t ) - ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | - ( dt << BLIS_SCALAR_DT_SHIFT ); -} - -static void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema; -} - -static void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif; -} - -static void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif; -} - -// NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, -// packbuf_t is stored/used from the context in order to support various -// induced methods. (Though ideally the packbuf_t field would only be -// present in the control tree). -static void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type; -} - -static void bli_obj_set_struc( struc_t struc, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info & ~BLIS_STRUC_BITS ) | struc; -} - -static void bli_obj_toggle_trans( obj_t* obj ) -{ - bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); -} - -static void bli_obj_toggle_conj( obj_t* obj ) -{ - bli_obj_apply_conj( BLIS_CONJUGATE, obj ); -} - -static void bli_obj_toggle_uplo( obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; -} - -// Root matrix query - -static obj_t* bli_obj_root( obj_t* obj ) -{ - return ( obj->root ); -} - -static bool_t bli_obj_root_is_general( obj_t* obj ) -{ - return bli_obj_is_general( bli_obj_root( obj ) ); -} - -static bool_t bli_obj_root_is_hermitian( obj_t* obj ) -{ - return bli_obj_is_hermitian( bli_obj_root( obj ) ); -} - -static bool_t bli_obj_root_is_symmetric( obj_t* obj ) -{ - return bli_obj_is_symmetric( bli_obj_root( obj ) ); 
-} - -static bool_t bli_obj_root_is_triangular( obj_t* obj ) -{ - return bli_obj_is_triangular( bli_obj_root( obj ) ); -} - -static bool_t bli_obj_root_is_herm_or_symm( obj_t* obj ) -{ - return bli_obj_is_hermitian( bli_obj_root( obj ) ) || - bli_obj_is_symmetric( bli_obj_root( obj ) ); -} - -static bool_t bli_obj_root_is_upper( obj_t* obj ) -{ - return bli_obj_is_upper( bli_obj_root( obj ) ); -} - -static bool_t bli_obj_root_is_lower( obj_t* obj ) -{ - return bli_obj_is_lower( bli_obj_root( obj ) ); -} - -// Root matrix modification - -static void bli_obj_set_as_root( obj_t* obj ) -{ - obj->root = obj; -} - -// Diagonal offset query - -static doff_t bli_obj_diag_offset( obj_t* obj ) -{ - return ( doff_t ) - ( obj->diag_off ); -} - -static doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) -{ - return ( doff_t ) - ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) - : bli_obj_diag_offset( obj ) ); -} - -// Diagonal offset modification - -static void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) -{ - obj->diag_off = ( doff_t )offset; -} - -static void bli_obj_negate_diag_offset( obj_t* obj ) -{ - obj->diag_off = -(obj->diag_off); -} - -static void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) -{ - obj->diag_off += ( doff_t )offset; -} - -// Dimension query - -static dim_t bli_obj_length( obj_t* obj ) -{ - return ( obj->dim[ BLIS_M ] ); -} - -static dim_t bli_obj_width( obj_t* obj ) -{ - return ( obj->dim[ BLIS_N ] ); -} - -static dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) -{ - return ( obj->dim[ mdim ] ); -} - -static dim_t bli_obj_min_dim( obj_t* obj ) -{ - return bli_min( bli_obj_length( obj ), - bli_obj_width( obj ) ); -} - -static dim_t bli_obj_max_dim( obj_t* obj ) -{ - return bli_max( bli_obj_length( obj ), - bli_obj_width( obj ) ); -} - -static dim_t bli_obj_length_after_trans( obj_t* obj ) -{ - return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) - : bli_obj_length( obj ) ); -} - -static dim_t bli_obj_width_after_trans( obj_t* obj ) -{ - return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) - : bli_obj_width( obj ) ); -} - -static bool_t bli_obj_is_1x1( obj_t* x ) -{ - return ( bool_t ) - ( bli_obj_length( x ) == 1 && - bli_obj_width( x ) == 1 ); -} - -// Stride/increment query - -static inc_t bli_obj_row_stride( obj_t* obj ) -{ - return ( obj->rs ); -} - -static inc_t bli_obj_col_stride( obj_t* obj ) -{ - return ( obj->cs ); -} - -static inc_t bli_obj_imag_stride( obj_t* obj ) -{ - return ( obj->is ); -} - -static inc_t bli_obj_row_stride_mag( obj_t* obj ) -{ - return ( bli_abs( obj->rs ) ); -} - -static inc_t bli_obj_col_stride_mag( obj_t* obj ) -{ - return ( bli_abs( obj->cs ) ); -} - -static inc_t bli_obj_imag_stride_mag( obj_t* obj ) -{ - return ( bli_abs( obj->is ) ); -} - -// Note: The purpose of these functions is to obtain the length and width -// of the smallest submatrices of an object that could still encompass -// the stored data above (if obj is upper) or below (if obj is lower) -// the diagonal. -static dim_t bli_obj_length_stored( obj_t* obj ) -{ - return ( dim_t ) - ( bli_obj_is_upper( obj ) - ? bli_min( bli_obj_length( obj ), - bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) - : bli_min( bli_obj_length( obj ), - bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) - ); -} - -static dim_t bli_obj_width_stored( obj_t* obj ) -{ - return ( dim_t ) - ( bli_obj_is_lower( obj ) - ? bli_min( bli_obj_width( obj ), - bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) - : bli_min( bli_obj_width( obj ), - bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) - ); -} - -static dim_t bli_obj_length_stored_after_trans( obj_t* obj ) -{ - return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) - : bli_obj_length_stored( obj ) ); -} - -static dim_t bli_obj_width_stored_after_trans( obj_t* obj ) -{ - return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) - : bli_obj_width_stored( obj ) ); -} - -static dim_t bli_obj_vector_dim( obj_t* x ) -{ - return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) - : bli_obj_length( x ) ); -} - -static inc_t bli_obj_vector_inc( obj_t* x ) -{ - return ( bli_obj_is_1x1( x ) ? 1 : \ - ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) - : bli_obj_row_stride( x ) ) - ); -} - -static bool_t bli_obj_is_vector( obj_t* x ) -{ - return ( bool_t ) - ( bli_obj_length( x ) == 1 || - bli_obj_width( x ) == 1 ); -} - -static bool_t bli_obj_is_row_vector( obj_t* x ) -{ - return ( bool_t ) - ( bli_obj_length( x ) == 1 ); -} - -static bool_t bli_obj_is_col_vector( obj_t* x ) -{ - return ( bool_t ) - ( bli_obj_width( x ) == 1 ); -} - -static bool_t bli_obj_has_zero_dim( obj_t* x ) -{ - return ( bool_t ) - ( bli_obj_length( x ) == 0 || - bli_obj_width( x ) == 0 ); -} - -// Dimension modification - -static void bli_obj_set_length( dim_t m, obj_t* obj ) -{ - obj->dim[ BLIS_M ] = m; -} - -static void bli_obj_set_width( dim_t n, obj_t* obj ) -{ - obj->dim[ BLIS_N ] = n; -} - -static void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) -{ - obj->dim[ mdim ] = dim_val; -} - -static void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) -{ - bli_obj_set_length( m, obj ); - bli_obj_set_width( n, obj ); -} - -static void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) -{ - //if ( bli_does_notrans( trans ) ) - if ( ( ~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ) - { - bli_obj_set_length( m, obj ); - bli_obj_set_width( n, obj ); - } - else - { - bli_obj_set_length( n, obj ); - bli_obj_set_width( m, obj ); - } -} - -// Stride/increment predicates - -// -// NOTE: The following two macros differ from their non-obj counterparts -// in that they do not identify m x 1 and 1 x n objects as row-stored and -// column-stored, respectively, which is needed when considering packed -// objects. 
But this is okay, since none of the invocations of these -// "obj" macros are used on packed matrices. -// - -static bool_t bli_obj_is_row_stored( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_col_stride_mag( obj ) == 1 ); -} - -static bool_t bli_obj_is_col_stored( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_row_stride_mag( obj ) == 1 ); -} - -static bool_t bli_obj_is_gen_stored( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_row_stride_mag( obj ) != 1 && - bli_obj_col_stride_mag( obj ) != 1 ); -} - -static bool_t bli_obj_is_row_tilted( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); -} - -static bool_t bli_obj_is_col_tilted( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); -} - -// Stride/increment modification - -static void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) -{ - obj->rs = rs; -} - -static void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) -{ - obj->cs = cs; -} - -static void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) -{ - bli_obj_set_row_stride( rs, obj ); - bli_obj_set_col_stride( cs, obj ); -} - -static void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) -{ - obj->is = is; -} - -// Offset query - -static dim_t bli_obj_row_off( obj_t* obj ) -{ - return ( obj->off[ BLIS_M ] ); -} - -static dim_t bli_obj_col_off( obj_t* obj ) -{ - return ( obj->off[ BLIS_N ] ); -} - -static dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) -{ - return ( obj->off[ mdim ] ); -} - -// Offset modification - -static void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) -{ - obj->off[ mdim ] = offset; -} - -static void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) -{ - bli_obj_set_off( BLIS_M, offm, obj ); - bli_obj_set_off( BLIS_N, offn, obj ); -} - -static void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) -{ - obj->off[ mdim ] += offset; -} - -static void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) -{ - 
bli_obj_inc_off( BLIS_M, offm, obj ); - bli_obj_inc_off( BLIS_N, offn, obj ); -} - -// Diagonal offset predicates - -static bool_t bli_obj_is_strictly_above_diag( obj_t* obj ) -{ - return ( bool_t ) - ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); -} - -static bool_t bli_obj_is_strictly_below_diag( obj_t* obj ) -{ - return ( bool_t ) - ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); -} - -static bool_t bli_obj_is_outside_diag( obj_t* obj ) -{ - return ( bool_t ) - ( bli_obj_is_strictly_above_diag( obj ) || - bli_obj_is_strictly_below_diag( obj ) ); -} - -static bool_t bli_obj_intersects_diag( obj_t* obj ) -{ - return ( bool_t ) - ( !bli_obj_is_strictly_above_diag( obj ) && - !bli_obj_is_strictly_below_diag( obj ) ); -} - -static bool_t bli_obj_is_unstored_subpart( obj_t* obj ) -{ - return ( bool_t ) - ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || - ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); -} - -// Buffer address query - -static void* bli_obj_buffer( obj_t* obj ) -{ - return ( obj->buffer ); -} - -// Buffer address modification - -static void bli_obj_set_buffer( void* p, obj_t* obj ) -{ - obj->buffer = p; -} - -// Bufferless scalar field query - -static void* bli_obj_internal_scalar_buffer( obj_t* obj ) -{ - return ( void* ) - ( &( obj->scalar ) ); -} - -// Bufferless scalar field modification - -static void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) -{ - b->scalar = a->scalar; -} - -// Element size query - -static siz_t bli_obj_elem_size( obj_t* obj ) -{ - return ( obj->elem_size ); -} - -// Element size modification - -static void bli_obj_set_elem_size( siz_t size, obj_t* obj ) -{ - obj->elem_size = size; -} - -// Packed matrix info query - -static dim_t bli_obj_padded_length( obj_t* obj ) -{ - return ( obj->m_padded ); -} - -static dim_t bli_obj_padded_width( obj_t* obj ) -{ - return ( obj->n_padded ); -} - -// Packed matrix info modification - 
-static void bli_obj_set_padded_length( dim_t m, obj_t* obj ) -{ - obj->m_padded = m; -} - -static void bli_obj_set_padded_width( dim_t n, obj_t* obj ) -{ - obj->n_padded = n; -} - -static void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) -{ - bli_obj_set_padded_length( m, obj ); - bli_obj_set_padded_width( n, obj ); -} - -// Packed panel info query - -static dim_t bli_obj_panel_length( obj_t* obj ) -{ - return ( obj->m_panel ); -} - -static dim_t bli_obj_panel_width( obj_t* obj ) -{ - return ( obj->n_panel ); -} - -static inc_t bli_obj_panel_dim( obj_t* obj ) -{ - return ( obj->pd ); -} - -static inc_t bli_obj_panel_stride( obj_t* obj ) -{ - return ( obj->ps ); -} - -// Packed panel info modification - -static void bli_obj_set_panel_length( dim_t m, obj_t* obj ) -{ - obj->m_panel = m; -} - -static void bli_obj_set_panel_width( dim_t n, obj_t* obj ) -{ - obj->n_panel = n; -} - -static void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) -{ - bli_obj_set_panel_length( m, obj ); - bli_obj_set_panel_width( n, obj ); -} - -static void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) -{ - obj->pd = pd; -} - -static void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) -{ - obj->ps = ps; -} - -// stor3_t-related - -static stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) -{ - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); - - inc_t rs_a, cs_a; - inc_t rs_b, cs_b; - - if ( bli_obj_has_notrans( a ) ) - { - rs_a = bli_obj_row_stride( a ); - cs_a = bli_obj_col_stride( a ); - } - else - { - rs_a = bli_obj_col_stride( a ); - cs_a = bli_obj_row_stride( a ); - } - - if ( bli_obj_has_notrans( b ) ) - { - rs_b = bli_obj_row_stride( b ); - cs_b = bli_obj_col_stride( b ); - } - else - { - rs_b = bli_obj_col_stride( b ); - cs_b = bli_obj_row_stride( b ); - } - - return bli_stor3_from_strides( rs_c, cs_c, - rs_a, cs_a, - rs_b, cs_b ); -} - - -// -- Initialization-related macros -- - -// Finish the 
initialization started by the matrix-specific static initializer -// (e.g. BLIS_OBJECT_PREINITIALIZER) -// NOTE: This is intended only for use in the BLAS compatibility API and typed -// BLIS API. - -static void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) -{ - bli_obj_set_as_root( obj ); - - bli_obj_set_dt( dt, obj ); - bli_obj_set_target_dt( dt, obj ); - bli_obj_set_exec_dt( dt, obj ); - bli_obj_set_comp_dt( dt, obj ); - - bli_obj_set_dims( m, n, obj ); - bli_obj_set_strides( rs, cs, obj ); - - siz_t elem_size = sizeof( float ); - if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; - if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; - bli_obj_set_elem_size( elem_size, obj ); - - bli_obj_set_buffer( p, obj ); - - bli_obj_set_scalar_dt( dt, obj ); - void* restrict s = bli_obj_internal_scalar_buffer( obj ); - - if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; - (( scomplex* )s)->imag = 0.0F; } - else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; - (( dcomplex* )s)->imag = 0.0; } -} - -// Finish the initialization started by the 1x1-specific static initializer -// (e.g. BLIS_OBJECT_PREINITIALIZER_1X1) -// NOTE: This is intended only for use in the BLAS compatibility API and typed -// BLIS API. - -static void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) -{ - bli_obj_set_as_root( obj ); - - bli_obj_set_dt( dt, obj ); - - bli_obj_set_buffer( p, obj ); -} - -// -- Miscellaneous object macros -- - -// Toggle the region referenced (or "stored"). 
- -static void bli_obj_toggle_region_ref( obj_t* obj ) -{ - if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); - else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); - - bli_obj_toggle_uplo( obj ); -} - -static void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) -{ - //if ( bli_does_trans( trans ) && - if ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS && - bli_obj_is_upper_or_lower( obj ) ) - { - bli_obj_toggle_uplo( obj ); - bli_obj_negate_diag_offset( obj ); - } -} - -// Initialize object with default properties (info field). - -static void bli_obj_set_defaults( obj_t* obj ) -{ - obj->info = 0x0; - obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; -} - -// Acquire buffer at object's submatrix offset (offset-aware buffer query). - -static void* bli_obj_buffer_at_off( obj_t* obj ) -{ - return ( void* ) - ( - ( ( char* )( bli_obj_buffer ( obj ) ) + - ( dim_t )( bli_obj_elem_size( obj ) ) * - ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + - bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) - ) - ) - ); -} - -// Acquire buffer from BLIS_CONSTANT object. - -static void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) -{ - void* p; - - if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); - else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); - else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); - else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); - else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); - - return p; -} - -// Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. - -static void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) -{ - return ( void* ) - ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) - : bli_obj_buffer_at_off( obj ) - ); -} - -// Make a full alias (shallow copy). 
- -static void bli_obj_alias_to( obj_t* a, obj_t* b ) -{ - bli_obj_init_full_shallow_copy_of( a, b ); -} - -// Check if two objects are aliases of one another. - -static bool_t bli_obj_is_alias_of( obj_t* a, obj_t* b ) -{ - return ( bool_t ) - ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); -} - - -// Create an alias with a trans value applied. -// (Note: trans may include a conj component.) - -static void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) -{ - bli_obj_alias_to( a, b ); - bli_obj_apply_trans( trans, b ); -} - -// Create an alias with a conj value applied. - -static void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) -{ - bli_obj_alias_to( a, b ); - bli_obj_apply_conj( conja, b ); -} - -// Alias only the real part. - -static void bli_obj_real_part( obj_t* c, obj_t* r ) -{ - bli_obj_alias_to( c, r ); - - if ( bli_obj_is_complex( c ) ) - { - // Change the datatypes. - const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); - const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); - const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); - const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); - bli_obj_set_dt( dt_stor_r, r ); - bli_obj_set_target_dt( dt_targ_r, r ); - bli_obj_set_exec_dt( dt_exec_r, r ); - bli_obj_set_comp_dt( dt_comp_r, r ); - - // Don't touch the attached scalar datatype. - - // Update the element size. - siz_t es_c = bli_obj_elem_size( c ); - bli_obj_set_elem_size( es_c/2, r ); - - // Update the strides. - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); - - // Buffer is left unchanged. - } -} - -// Alias only the imaginary part. - -static void bli_obj_imag_part( obj_t* c, obj_t* i ) -{ - if ( bli_obj_is_complex( c ) ) - { - bli_obj_alias_to( c, i ); - - // Change the datatype. 
- const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); - const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); - const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); - const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); - bli_obj_set_dt( dt_stor_r, i ); - bli_obj_set_target_dt( dt_targ_r, i ); - bli_obj_set_exec_dt( dt_exec_r, i ); - bli_obj_set_comp_dt( dt_comp_r, i ); - - // Don't touch the attached scalar datatype. - - // Update the element size. - siz_t es_c = bli_obj_elem_size( c ); - bli_obj_set_elem_size( es_c/2, i ); - - // Update the strides. - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); - - // Update the buffer. - inc_t is_c = bli_obj_imag_stride( c ); - char* p = ( char* )bli_obj_buffer_at_off( c ); - bli_obj_set_buffer( p + is_c * es_c/2, i ); - } -} - -// Given a 1x1 object, acquire an address to the buffer depending on whether -// the object is a BLIS_CONSTANT, and also set a datatype associated with the -// chosen buffer (possibly using an auxiliary datatype if the object is -// BLIS_CONSTANT). - -static void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) -{ - if ( bli_obj_is_const( obj ) ) - { - *dt = dt_aux; - *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); - } - else - { - *dt = bli_obj_dt( obj ); - *buf = bli_obj_buffer_at_off( obj ); - } -} - -// Swap all object fields (metadata/properties). - -static void bli_obj_swap( obj_t* a, obj_t* b ) -{ - obj_t t = *b; *b = *a; *a = t; -} - -// Swap object pack schemas. - -static void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) -{ - const pack_t schema_a = bli_obj_pack_schema( a ); - const pack_t schema_b = bli_obj_pack_schema( b ); - - bli_obj_set_pack_schema( schema_b, a ); - bli_obj_set_pack_schema( schema_a, b ); -} - -// Induce a transposition on an object: swap dimensions, increments, and -// offsets, then clear the trans bit. 
- -static void bli_obj_induce_trans( obj_t* obj ) -{ - // Induce transposition among basic fields. - dim_t m = bli_obj_length( obj ); - dim_t n = bli_obj_width( obj ); - inc_t rs = bli_obj_row_stride( obj ); - inc_t cs = bli_obj_col_stride( obj ); - dim_t offm = bli_obj_row_off( obj ); - dim_t offn = bli_obj_col_off( obj ); - doff_t diag_off = bli_obj_diag_offset( obj ); - - bli_obj_set_dims( n, m, obj ); - bli_obj_set_strides( cs, rs, obj ); - bli_obj_set_offs( offn, offm, obj ); - bli_obj_set_diag_offset( -diag_off, obj ); - - if ( bli_obj_is_upper_or_lower( obj ) ) - bli_obj_toggle_uplo( obj ); - - // Induce transposition among packed fields. - dim_t m_padded = bli_obj_padded_length( obj ); - dim_t n_padded = bli_obj_padded_width( obj ); - dim_t m_panel = bli_obj_panel_length( obj ); - dim_t n_panel = bli_obj_panel_width( obj ); - - bli_obj_set_padded_dims( n_padded, m_padded, obj ); - bli_obj_set_panel_dims( n_panel, m_panel, obj ); - - // Note that this macro DOES NOT touch the transposition bit! If - // the calling code is using this function to handle an object whose - // transposition bit is set prior to computation, that code needs - // to manually clear or toggle the bit, via - // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), - // respectively. -} - -static void bli_obj_induce_fast_trans( obj_t* obj ) -{ - // NOTE: This function is only used in situations where the matrices - // are guaranteed to not have structure or be packed. - - // Induce transposition among basic fields. - dim_t m = bli_obj_length( obj ); - dim_t n = bli_obj_width( obj ); - inc_t rs = bli_obj_row_stride( obj ); - inc_t cs = bli_obj_col_stride( obj ); - dim_t offm = bli_obj_row_off( obj ); - dim_t offn = bli_obj_col_off( obj ); - - bli_obj_set_dims( n, m, obj ); - bli_obj_set_strides( cs, rs, obj ); - bli_obj_set_offs( offn, offm, obj ); - - // Note that this macro DOES NOT touch the transposition bit! 
If - // the calling code is using this function to handle an object whose - // transposition bit is set prior to computation, that code needs - // to manually clear or toggle the bit, via - // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), - // respectively. -} - -// Sometimes we need to "reflect" a partition because the data we want is -// actually stored on the other side of the diagonal. The nuts and bolts of -// this macro look a lot like an induced transposition, except that the row -// and column strides are left unchanged (which, of course, drastically -// changes the effect of the macro). - -static void bli_obj_reflect_about_diag( obj_t* obj ) -{ - dim_t m = bli_obj_length( obj ); - dim_t n = bli_obj_width( obj ); - dim_t offm = bli_obj_row_off( obj ); - dim_t offn = bli_obj_col_off( obj ); - doff_t diag_off = bli_obj_diag_offset( obj ); - - bli_obj_set_dims( n, m, obj ); - bli_obj_set_offs( offn, offm, obj ); - bli_obj_set_diag_offset( -diag_off, obj ); - - bli_obj_toggle_trans( obj ); -} - - -#endif -// end bli_obj_macro_defs.h -// begin bli_complex_macro_defs.h - - -#ifndef BLIS_COMPLEX_MACRO_DEFS_H -#define BLIS_COMPLEX_MACRO_DEFS_H - - -// -- Real and imaginary accessor macros -- - - -#define bli_sreal( x ) ( x ) -#define bli_simag( x ) ( 0.0F ) -#define bli_dreal( x ) ( x ) -#define bli_dimag( x ) ( 0.0 ) - - -#ifndef BLIS_ENABLE_C99_COMPLEX - - -#define bli_creal( x ) ( (x).real ) -#define bli_cimag( x ) ( (x).imag ) -#define bli_zreal( x ) ( (x).real ) -#define bli_zimag( x ) ( (x).imag ) - - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - - -#define bli_creal( x ) ( crealf(x) ) -#define bli_cimag( x ) ( cimagf(x) ) -#define bli_zreal( x ) ( creal(x) ) -#define bli_zimag( x ) ( cimag(x) ) - - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#endif - -// end bli_complex_macro_defs.h -// begin bli_scalar_macro_defs.h - - -#ifndef BLIS_SCALAR_MACRO_DEFS_H -#define BLIS_SCALAR_MACRO_DEFS_H - - - -// -- Assignment/Accessor macros -- - -// NOTE: This macro is 
defined first since some of the other scalar macros -// use it to abstract away the method used to assign complex values (ie: -// whether fields of a struct are set directly or whether native C99 -// assignment is used). - -// begin bli_sets.h - - -#ifndef BLIS_SETS_H -#define BLIS_SETS_H - -// sets - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. - -#define bli_sssets( xr, xi, y ) { (y) = (xr); } -#define bli_dssets( xr, xi, y ) { (y) = (xr); } -#define bli_cssets( xr, xi, y ) { (y) = (xr); } -#define bli_zssets( xr, xi, y ) { (y) = (xr); } -#define bli_issets( xr, xi, y ) { (y) = (xr); } - -#define bli_sdsets( xr, xi, y ) { (y) = (xr); } -#define bli_ddsets( xr, xi, y ) { (y) = (xr); } -#define bli_cdsets( xr, xi, y ) { (y) = (xr); } -#define bli_zdsets( xr, xi, y ) { (y) = (xr); } -#define bli_idsets( xr, xi, y ) { (y) = (xr); } - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } -#define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } -#define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } -#define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } -#define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } - -#define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } -#define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } -#define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } -#define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } -#define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } -#define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } -#define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } -#define bli_zcsets( xr, xi, y ) { (y) = 
(xr) + (xi) * (I); } - -#define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } -#define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } -#define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } -#define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } - -#endif // BLIS_ENABLE_C99_COMPLEX - -#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } -#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } -#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } -#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } -#define bli_iisets( xr, xi, y ) { (y) = (xr); } - - -#define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) -#define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) -#define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) -#define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) -#define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) - - -#endif - -// end bli_sets.h - -// NOTE: These macros are not used by other scalar macros, but they are -// related to those defined in bli_sets.h, and so we #include them here. - -// begin bli_setrs.h - - -#ifndef BLIS_SETRS_H -#define BLIS_SETRS_H - -// setrs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sssetrs( xr, y ) { (y) = (xr); } -#define bli_dssetrs( xr, y ) { (y) = (xr); } - -#define bli_sdsetrs( xr, y ) { (y) = (xr); } -#define bli_ddsetrs( xr, y ) { (y) = (xr); } - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); } -#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); } - -#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); } -#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); } - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); } -#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); } - -#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); } -#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y ) -#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y ) -#define bli_csetrs( xr, y ) bli_scsetrs( xr, y ) -#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y ) - - -#endif - -// end bli_setrs.h -// begin bli_setis.h - - -#ifndef BLIS_SETIS_H -#define BLIS_SETIS_H - -// setis - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sssetis( xi, y ) { ; } -#define bli_dssetis( xi, y ) { ; } - -#define bli_sdsetis( xi, y ) { ; } -#define bli_ddsetis( xi, y ) { ; } - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); } -#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); } - -#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); } -#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); } - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); } -#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); } - -#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); } -#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_ssetis( xi, y ) bli_sssetis( xi, y ) -#define bli_dsetis( xi, y ) bli_ddsetis( xi, y ) -#define bli_csetis( xi, y ) bli_scsetis( xi, y ) -#define bli_zsetis( xi, y ) bli_dzsetis( xi, y ) - - -#endif - -// end bli_setis.h - -// NOTE: This macro also needs to be defined early on since it determines -// how real and imaginary components are accessed (ie: whether the fields -// of a struct are read directly or whether native C99 functions are used.) - -// begin bli_gets.h - - -#ifndef BLIS_GETS_H -#define BLIS_GETS_H - -// gets - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- - -#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } -#define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } -#define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } -#define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } -#define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } - -#define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } -#define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } -#define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } -#define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } -#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } - -#define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } -#define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } -#define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } -#define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } -#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } - -#define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } -#define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } -#define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } -#define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } -#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } - -#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } -#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } -#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } -#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } -#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } - - -#define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) -#define bli_dgets( x, yr, yi ) bli_ddgets( x, 
yr, yi ) -#define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi ) -#define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) -#define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) - - -#endif -// end bli_gets.h - - -// -- Scalar constant initialization macros -- - -// begin bli_constants.h - - -#ifndef BLIS_CONSTANTS_H -#define BLIS_CONSTANTS_H - -// return pointers to constants - -// 1 - -#define bli_s1 \ -\ - ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) - -#define bli_d1 \ -\ - ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) - -#define bli_c1 \ -\ - ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) - -#define bli_z1 \ -\ - ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) - -#define bli_i1 \ -\ - ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) - -// 0 - -#define bli_s0 \ -\ - ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) - -#define bli_d0 \ -\ - ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) - -#define bli_c0 \ -\ - ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) - -#define bli_z0 \ -\ - ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) - -#define bli_i0 \ -\ - ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) - -// -1 - -#define bli_sm1 \ -\ - ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) - -#define bli_dm1 \ -\ - ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) - -#define bli_cm1 \ -\ - ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) - -#define bli_zm1 \ -\ - ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) - -#define bli_im1 \ -\ - ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) - - -#endif - -// end bli_constants.h - - -// -- Separated scalar macros (separated real/imaginary values) -- - -// begin bli_absq2ris.h - - -#ifndef BLIS_ABSQ2RIS_H 
-#define BLIS_ABSQ2RIS_H - -// absq2ris - -#define bli_sabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar); \ -} - -#define bli_dabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar); \ -} - -#define bli_cabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar) + (ai) * (ai); \ - (bi) = 0.0F; \ -} - -#define bli_zabsq2ris( ar, ai, br, bi ) \ -{ \ - (br) = (ar) * (ar) + (ai) * (ai); \ - (bi) = 0.0; \ -} - -#endif - -// end bli_absq2ris.h - -// begin bli_abval2ris.h - - -#ifndef BLIS_ABVAL2RIS_H -#define BLIS_ABVAL2RIS_H - -// abval2ris - -#define bli_sabval2ris( xr, xi, ar, ai ) \ -{ \ - (ar) = fabsf(xr); \ -} - -#define bli_dabval2ris( xr, xi, ar, ai ) \ -{ \ - (ar) = fabs(xr); \ -} - -#define bli_cabval2ris( xr, xi, ar, ai ) \ -{ \ - float s = bli_fmaxabs( (xr), (xi) ); \ - float mag; \ - if ( s == 0.0F ) mag = 0.0F; \ - else \ - { \ - mag = sqrtf( s ) * \ - sqrtf( ( (xr) / s ) * (xr) + \ - ( (xi) / s ) * (xi) ); \ - } \ - (ar) = mag; \ - (ai) = 0.0F; \ -} - -#define bli_zabval2ris( xr, xi, ar, ai ) \ -{ \ - double s = bli_fmaxabs( (xr), (xi) ); \ - double mag; \ - if ( s == 0.0 ) mag = 0.0; \ - else \ - { \ - mag = sqrt( s ) * \ - sqrt( ( (xr) / s ) * (xr) + \ - ( (xi) / s ) * (xi) ); \ - } \ - (ar) = mag; \ - (ai) = 0.0; \ -} - -#endif -// end bli_abval2ris.h - -// begin bli_addris.h - - -#ifndef BLIS_ADDRIS_H -#define BLIS_ADDRIS_H - -// addris - -#define bli_saddris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) + (ar); \ -} - -#define bli_daddris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) + (ar); \ -} - -#define bli_caddris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) + (ar); \ - (xi) = (xi) + (ai); \ -} - -#define bli_zaddris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) + (ar); \ - (xi) = (xi) + (ai); \ -} - -#endif - -// end bli_addris.h -// begin bli_addjris.h - - -#ifndef BLIS_ADDJRIS_H -#define BLIS_ADDJRIS_H - -// addjris - -#define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) -#define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), 
(xi) ) -#define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) -#define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) - -#endif - -// end bli_addjris.h - -// begin bli_add3ris.h - - -#ifndef BLIS_ADD3RIS_H -#define BLIS_ADD3RIS_H - -// add3ris - -#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ -{ \ - (cr) = (ar) + (br); \ -} - -#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \ -{ \ - (cr) = (ar) + (br); \ -} - -#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ -{ \ - (cr) = (ar) + (br); \ - (ci) = (ai) + (bi); \ -} - -#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ -{ \ - (cr) = (ar) + (br); \ - (ci) = (ai) + (bi); \ -} - -#endif - -// end bli_add3ris.h - -// begin bli_axpbyris.h - - -#ifndef BLIS_AXPBYRIS_H -#define BLIS_AXPBYRIS_H - -// axpbyris - -#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr) + (br) * (yr); \ -} - -#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ -{ \ - const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ - const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ - (yr) = yt_r; \ - (yi) = yt_i; \ -} - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of b. -// - The fourth char encodes the type of y. 
- -// -- (axby) = (??ss) ---------------------------------------------------------- - -#define bli_ssssxpbyris bli_rxxpbyris -#define bli_dsssxpbyris bli_rxxpbyris -#define bli_csssxpbyris bli_rxxpbyris -#define bli_zsssxpbyris bli_rxxpbyris - -#define bli_sdssxpbyris bli_rxxpbyris -#define bli_ddssxpbyris bli_rxxpbyris -#define bli_cdssxpbyris bli_rxxpbyris -#define bli_zdssxpbyris bli_rxxpbyris - -#define bli_scssxpbyris bli_rxxpbyris -#define bli_dcssxpbyris bli_rxxpbyris -#define bli_ccssxpbyris bli_rxxpbyris -#define bli_zcssxpbyris bli_rxxpbyris - -#define bli_szssxpbyris bli_rxxpbyris -#define bli_dzssxpbyris bli_rxxpbyris -#define bli_czssxpbyris bli_rxxpbyris -#define bli_zzssxpbyris bli_rxxpbyris - -// NOTE: This series needs to be finished for all other char values for (by), but -// not until something in BLIS actually needs mixed-datatype axpbyris. - - -#define bli_saxpbyris bli_ssssaxpbyris -#define bli_daxpbyris bli_ddddaxpbyris -#define bli_caxpbyris bli_ccccaxpbyris -#define bli_zaxpbyris bli_zzzzaxpbyris - -#endif - -// end bli_axpbyris.h -// begin bli_axpbyjris.h - - -#ifndef BLIS_AXPBYJRIS_H -#define BLIS_AXPBYJRIS_H - -// axpbyjris - -#define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr) + (br) * (yr); \ -} - -#define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ -{ \ - const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ - const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ - (yr) = yt_r; \ - (yi) = yt_i; \ -} - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of b. -// - The fourth char encodes the type of y. 
- -// -- (axby) = (??ss) ---------------------------------------------------------- - -#define bli_ssssxpbyjris bli_rxxpbyjris -#define bli_dsssxpbyjris bli_rxxpbyjris -#define bli_csssxpbyjris bli_rxxpbyjris -#define bli_zsssxpbyjris bli_rxxpbyjris - -#define bli_sdssxpbyjris bli_rxxpbyjris -#define bli_ddssxpbyjris bli_rxxpbyjris -#define bli_cdssxpbyjris bli_rxxpbyjris -#define bli_zdssxpbyjris bli_rxxpbyjris - -#define bli_scssxpbyjris bli_rxxpbyjris -#define bli_dcssxpbyjris bli_rxxpbyjris -#define bli_ccssxpbyjris bli_rxxpbyjris -#define bli_zcssxpbyjris bli_rxxpbyjris - -#define bli_szssxpbyjris bli_rxxpbyjris -#define bli_dzssxpbyjris bli_rxxpbyjris -#define bli_czssxpbyjris bli_rxxpbyjris -#define bli_zzssxpbyjris bli_rxxpbyjris - -// NOTE: This series needs to be finished for all other char values for (by), but -// not until something in BLIS actually needs mixed-datatype axpbyjris. - - -#define bli_saxpbyjris bli_ssssaxpbyjris -#define bli_daxpbyjris bli_ddddaxpbyjris -#define bli_caxpbyjris bli_ccccaxpbyjris -#define bli_zaxpbyjris bli_zzzzaxpbyjris - -#endif - -// end bli_axpbyjris.h - -// begin bli_axpyris.h - - -#ifndef BLIS_AXPYRIS_H -#define BLIS_AXPYRIS_H - -// axpyris - -#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr); \ -} - -#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr) - (ai) * (xi); \ - (yi) += (ai) * (xr) + (ar) * (xi); \ -} - -#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr) - (ai) * (xi); \ -} - -#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr); \ - (yi) += (ar) * (xi); \ -} - -#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr); \ - (yi) += (ai) * (xr); \ -} - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. 
- -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssaxpyris bli_rxaxpyris -#define bli_dssaxpyris bli_rxaxpyris -#define bli_cssaxpyris bli_rxaxpyris -#define bli_zssaxpyris bli_rxaxpyris - -#define bli_sdsaxpyris bli_rxaxpyris -#define bli_ddsaxpyris bli_rxaxpyris -#define bli_cdsaxpyris bli_rxaxpyris -#define bli_zdsaxpyris bli_rxaxpyris - -#define bli_scsaxpyris bli_rxaxpyris -#define bli_dcsaxpyris bli_rxaxpyris -#define bli_ccsaxpyris bli_roaxpyris -#define bli_zcsaxpyris bli_roaxpyris - -#define bli_szsaxpyris bli_rxaxpyris -#define bli_dzsaxpyris bli_rxaxpyris -#define bli_czsaxpyris bli_roaxpyris -#define bli_zzsaxpyris bli_roaxpyris - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdaxpyris bli_rxaxpyris -#define bli_dsdaxpyris bli_rxaxpyris -#define bli_csdaxpyris bli_rxaxpyris -#define bli_zsdaxpyris bli_rxaxpyris - -#define bli_sddaxpyris bli_rxaxpyris -#define bli_dddaxpyris bli_rxaxpyris -#define bli_cddaxpyris bli_rxaxpyris -#define bli_zddaxpyris bli_rxaxpyris - -#define bli_scdaxpyris bli_rxaxpyris -#define bli_dcdaxpyris bli_rxaxpyris -#define bli_ccdaxpyris bli_roaxpyris -#define bli_zcdaxpyris bli_roaxpyris - -#define bli_szdaxpyris bli_rxaxpyris -#define bli_dzdaxpyris bli_rxaxpyris -#define bli_czdaxpyris bli_roaxpyris -#define bli_zzdaxpyris bli_roaxpyris - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxpyris bli_rxaxpyris -#define bli_dscaxpyris bli_rxaxpyris -#define bli_cscaxpyris bli_rcaxpyris -#define bli_zscaxpyris bli_rcaxpyris - -#define bli_sdcaxpyris bli_rxaxpyris -#define bli_ddcaxpyris bli_rxaxpyris -#define bli_cdcaxpyris bli_rcaxpyris -#define bli_zdcaxpyris bli_rcaxpyris - -#define bli_sccaxpyris bli_craxpyris -#define bli_dccaxpyris bli_craxpyris -#define bli_cccaxpyris bli_cxaxpyris -#define bli_zccaxpyris bli_cxaxpyris - -#define bli_szcaxpyris bli_craxpyris -#define 
bli_dzcaxpyris bli_craxpyris -#define bli_czcaxpyris bli_cxaxpyris -#define bli_zzcaxpyris bli_cxaxpyris - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxpyris bli_rxaxpyris -#define bli_dszaxpyris bli_rxaxpyris -#define bli_cszaxpyris bli_rcaxpyris -#define bli_zszaxpyris bli_rcaxpyris - -#define bli_sdzaxpyris bli_rxaxpyris -#define bli_ddzaxpyris bli_rxaxpyris -#define bli_cdzaxpyris bli_rcaxpyris -#define bli_zdzaxpyris bli_rcaxpyris - -#define bli_sczaxpyris bli_craxpyris -#define bli_dczaxpyris bli_craxpyris -#define bli_cczaxpyris bli_cxaxpyris -#define bli_zczaxpyris bli_cxaxpyris - -#define bli_szzaxpyris bli_craxpyris -#define bli_dzzaxpyris bli_craxpyris -#define bli_czzaxpyris bli_cxaxpyris -#define bli_zzzaxpyris bli_cxaxpyris - - - -#define bli_saxpyris bli_sssaxpyris -#define bli_daxpyris bli_dddaxpyris -#define bli_caxpyris bli_cccaxpyris -#define bli_zaxpyris bli_zzzaxpyris - -#endif - -// end bli_axpyris.h -// begin bli_axpyjris.h - - -#ifndef BLIS_AXPYJRIS_H -#define BLIS_AXPYJRIS_H - -// axpyjris - -#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr); \ -} - -#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr) + (ai) * (xi); \ - (yi) += (ai) * (xr) - (ar) * (xi); \ -} - -#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr) + (ai) * (xi); \ -} - -#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr); \ - (yi) += (ar) * -(xi); \ -} - -#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) += (ar) * (xr); \ - (yi) += (ai) * (xr); \ -} - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. 
- -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssaxpyjris bli_rxaxpyjris -#define bli_dssaxpyjris bli_rxaxpyjris -#define bli_cssaxpyjris bli_rxaxpyjris -#define bli_zssaxpyjris bli_rxaxpyjris - -#define bli_sdsaxpyjris bli_rxaxpyjris -#define bli_ddsaxpyjris bli_rxaxpyjris -#define bli_cdsaxpyjris bli_rxaxpyjris -#define bli_zdsaxpyjris bli_rxaxpyjris - -#define bli_scsaxpyjris bli_rxaxpyjris -#define bli_dcsaxpyjris bli_rxaxpyjris -#define bli_ccsaxpyjris bli_roaxpyjris -#define bli_zcsaxpyjris bli_roaxpyjris - -#define bli_szsaxpyjris bli_rxaxpyjris -#define bli_dzsaxpyjris bli_rxaxpyjris -#define bli_czsaxpyjris bli_roaxpyjris -#define bli_zzsaxpyjris bli_roaxpyjris - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdaxpyjris bli_rxaxpyjris -#define bli_dsdaxpyjris bli_rxaxpyjris -#define bli_csdaxpyjris bli_rxaxpyjris -#define bli_zsdaxpyjris bli_rxaxpyjris - -#define bli_sddaxpyjris bli_rxaxpyjris -#define bli_dddaxpyjris bli_rxaxpyjris -#define bli_cddaxpyjris bli_rxaxpyjris -#define bli_zddaxpyjris bli_rxaxpyjris - -#define bli_scdaxpyjris bli_rxaxpyjris -#define bli_dcdaxpyjris bli_rxaxpyjris -#define bli_ccdaxpyjris bli_roaxpyjris -#define bli_zcdaxpyjris bli_roaxpyjris - -#define bli_szdaxpyjris bli_rxaxpyjris -#define bli_dzdaxpyjris bli_rxaxpyjris -#define bli_czdaxpyjris bli_roaxpyjris -#define bli_zzdaxpyjris bli_roaxpyjris - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxpyjris bli_rxaxpyjris -#define bli_dscaxpyjris bli_rxaxpyjris -#define bli_cscaxpyjris bli_rcaxpyjris -#define bli_zscaxpyjris bli_rcaxpyjris - -#define bli_sdcaxpyjris bli_rxaxpyjris -#define bli_ddcaxpyjris bli_rxaxpyjris -#define bli_cdcaxpyjris bli_rcaxpyjris -#define bli_zdcaxpyjris bli_rcaxpyjris - -#define bli_sccaxpyjris bli_craxpyjris -#define bli_dccaxpyjris bli_craxpyjris -#define bli_cccaxpyjris bli_cxaxpyjris 
-#define bli_zccaxpyjris bli_cxaxpyjris - -#define bli_szcaxpyjris bli_craxpyjris -#define bli_dzcaxpyjris bli_craxpyjris -#define bli_czcaxpyjris bli_cxaxpyjris -#define bli_zzcaxpyjris bli_cxaxpyjris - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxpyjris bli_rxaxpyjris -#define bli_dszaxpyjris bli_rxaxpyjris -#define bli_cszaxpyjris bli_rcaxpyjris -#define bli_zszaxpyjris bli_rcaxpyjris - -#define bli_sdzaxpyjris bli_rxaxpyjris -#define bli_ddzaxpyjris bli_rxaxpyjris -#define bli_cdzaxpyjris bli_rcaxpyjris -#define bli_zdzaxpyjris bli_rcaxpyjris - -#define bli_sczaxpyjris bli_craxpyjris -#define bli_dczaxpyjris bli_craxpyjris -#define bli_cczaxpyjris bli_cxaxpyjris -#define bli_zczaxpyjris bli_cxaxpyjris - -#define bli_szzaxpyjris bli_craxpyjris -#define bli_dzzaxpyjris bli_craxpyjris -#define bli_czzaxpyjris bli_cxaxpyjris -#define bli_zzzaxpyjris bli_cxaxpyjris - - - -#define bli_saxpyjris bli_sssaxpyjris -#define bli_daxpyjris bli_dddaxpyjris -#define bli_caxpyjris bli_cccaxpyjris -#define bli_zaxpyjris bli_zzzaxpyjris - -#endif - -// end bli_axpyjris.h - -// begin bli_axmyris.h - - -#ifndef BLIS_AXMYRIS_H -#define BLIS_AXMYRIS_H - -// axmyris - -#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) -= (ar) * (xr); \ -} - -#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) -= (ar) * (xr); \ -} - -#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) -= (ar) * (xr) - (ai) * (xi); \ - (yi) -= (ai) * (xr) + (ar) * (xi); \ -} - -#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) -= (ar) * (xr) - (ai) * (xi); \ - (yi) -= (ai) * (xr) + (ar) * (xi); \ -} - -#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) -= (ar) * (xr); \ - (yi) -= (ar) * (xi); \ -} - -#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) -= (ar) * (xr); \ - (yi) -= (ar) * (xi); \ -} - -#endif - -// end bli_axmyris.h - -// begin bli_conjris.h - - -#ifndef BLIS_CONJRIS_H -#define 
BLIS_CONJRIS_H - -// conjris - -#define bli_sconjris( xr, xi ) \ -{ \ - ; \ -} - -#define bli_dconjris( xr, xi ) \ -{ \ - ; \ -} - -#define bli_cconjris( xr, xi ) \ -{ \ - (xi) = -(xi); \ -} - -#define bli_zconjris( xr, xi ) \ -{ \ - (xi) = -(xi); \ -} - -#endif - -// end bli_conjris.h - -// begin bli_copyris.h - - -#ifndef BLIS_COPYRIS_H -#define BLIS_COPYRIS_H - -// copyris - -#define bli_scopyris( ar, ai, br, bi ) \ -{ \ - (br) = (ar); \ -} - -#define bli_dcopyris( ar, ai, br, bi ) \ -{ \ - (br) = (ar); \ -} - -#define bli_ccopyris( ar, ai, br, bi ) \ -{ \ - (br) = (ar); \ - (bi) = (ai); \ -} - -#define bli_zcopyris( ar, ai, br, bi ) \ -{ \ - (br) = (ar); \ - (bi) = (ai); \ -} - -#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) -#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) -#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) -#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) - -#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) -#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) -#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) -#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) - -#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) -#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) -#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) -#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) - -#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) -#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) -#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) -#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) - -#endif -// end bli_copyris.h -// begin bli_copyjris.h - - -#ifndef BLIS_COPYJRIS_H -#define BLIS_COPYJRIS_H - -// copyjris - -#define 
bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) -#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) -#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) -#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) - -#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) -#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) -#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) -#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) - -#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) -#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) -#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) -#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) - -#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) -#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) -#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) -#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) - -#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) -#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) -#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) -#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) - -#endif - -// end bli_copyjris.h -// begin bli_copycjris.h - - -#ifndef BLIS_COPYCJRIS_H -#define BLIS_COPYCJRIS_H - -// copycjris - -#define bli_scopycjris( conj, xr, xi, yr, yi ) \ -{ \ - bli_scopyris( (xr), (xi), (yr), (yi) ); \ -} - -#define bli_dcopycjris( conj, xr, xi, yr, yi ) \ -{ \ - bli_dcopyris( (xr), (xi), (yr), (yi) ); \ -} - -#define bli_ccopycjris( conj, xr, xi, yr, yi ) \ -{ \ - (yr) = (xr); \ - (yi) = ( bli_is_conj( conj ) ? 
-(xi) \ - : (xi) ); \ -} - -#define bli_zcopycjris( conj, xr, xi, yr, yi ) \ -{ \ - (yr) = (xr); \ - (yi) = ( bli_is_conj( conj ) ? -(xi) \ - : (xi) ); \ -} - -#define bli_icopycjris( conj, xr, xi, yr, yi ) \ -{ \ - bli_icopyris( (xr), (xi), (yr), (yi) ); \ -} - -#endif -// end bli_copycjris.h - -// begin bli_eqris.h - - -#ifndef BLIS_EQRIS_H -#define BLIS_EQRIS_H - - -// eqris (passed by value) - -#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) ) -#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) ) -#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) -#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) -#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) ) - - -// eq1ris - -#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F ) -#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 ) -#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F ) -#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 ) -#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 ) - - -// eq0ris - -#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F ) -#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 ) -#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F ) -#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 ) -#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 ) - - -// eqm1ris - -#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F ) -#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 ) -#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F ) -#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 ) -#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 ) - - - -#endif -// end bli_eqris.h - -// begin bli_invertris.h - - -#ifndef BLIS_INVERTRIS_H -#define BLIS_INVERTRIS_H - -// invertris - -#define bli_sinvertris( xr, xi ) \ -{ \ - (xr) = 1.0F / (xr); \ -} - -#define bli_dinvertris( xr, xi ) 
\ -{ \ - (xr) = 1.0 / (xr); \ -} - -#define bli_cinvertris( xr, xi ) \ -{ \ - float s = bli_fmaxabs( (xr), (xi) ); \ - float xr_s = (xr) / s; \ - float xi_s = (xi) / s; \ - float temp = ( xr_s * (xr) + xi_s * (xi) ); \ - (xr) = xr_s / temp; \ - (xi) = -xi_s / temp; \ -} - -#define bli_zinvertris( xr, xi ) \ -{ \ - double s = bli_fmaxabs( (xr), (xi) ); \ - double xr_s = (xr) / s; \ - double xi_s = (xi) / s; \ - double temp = ( xr_s * (xr) + xi_s * (xi) ); \ - (xr) = xr_s / temp; \ - (xi) = -xi_s / temp; \ -} - -#endif -// end bli_invertris.h - -// begin bli_invscalris.h - - -#ifndef BLIS_INVSCALRIS_H -#define BLIS_INVSCALRIS_H - -// invscalris - -#define bli_sinvscalris( ar, ai, xr, xi ) \ -{ \ - (xr) /= (ar); \ -} - -#define bli_dinvscalris( ar, ai, xr, xi ) \ -{ \ - (xr) /= (ar); \ -} - -#define bli_cinvscalris( ar, ai, xr, xi ) \ -{ \ - float s = bli_fmaxabs( (ar), (ai) ); \ - float ar_s = (ar) / s; \ - float ai_s = (ai) / s; \ - float xrt = (xr); \ - float temp = ( ar_s * (ar) + ai_s * (ai) ); \ - (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ - (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ -} - -#define bli_zinvscalris( ar, ai, xr, xi ) \ -{ \ - double s = bli_fmaxabs( (ar), (ai) ); \ - double ar_s = (ar) / s; \ - double ai_s = (ai) / s; \ - double xrt = (xr); \ - double temp = ( ar_s * (ar) + ai_s * (ai) ); \ - (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ - (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ -} - -#define bli_scinvscalris( ar, ai, xr, xi ) \ -{ \ - (xr) /= (ar); \ - (xi) /= (ar); \ -} - -#define bli_dzinvscalris( ar, ai, xr, xi ) \ -{ \ - (xr) /= (ar); \ - (xi) /= (ar); \ -} - -#endif -// end bli_invscalris.h -// begin bli_invscaljris.h - - -#ifndef BLIS_INVSCALJRIS_H -#define BLIS_INVSCALJRIS_H - -// invscaljris - -#define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) -#define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) -#define bli_cinvscaljris( ar, ai, xr, xi ) 
bli_cinvscalris( (ar), -(ai), (xr), (xi) ) -#define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), (xi) ) - -#define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) -#define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) - -#endif - -// end bli_invscaljris.h - -// begin bli_neg2ris.h - - -#ifndef BLIS_NEG2RIS_H -#define BLIS_NEG2RIS_H - -// neg2ris - -#define bli_sneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ -} - -#define bli_dneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ -} - -#define bli_cneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ - (bi) = -(ai); \ -} - -#define bli_zneg2ris( ar, ai, br, bi ) \ -{ \ - (br) = -(ar); \ - (bi) = -(ai); \ -} - -#endif - -// end bli_neg2ris.h - -// begin bli_scalris.h - - -#ifndef BLIS_SCALRIS_H -#define BLIS_SCALRIS_H - -// scalris - -#define bli_sscalris( ar, ai, xr, xi ) \ -{ \ - (xr) = (ar) * (xr); \ -} - -#define bli_dscalris( ar, ai, xr, xi ) \ -{ \ - (xr) = (ar) * (xr); \ -} - -#define bli_cscalris( ar, ai, xr, xi ) \ -{ \ - float yr = (ar) * (xr) - (ai) * (xi); \ - float yi = (ai) * (xr) + (ar) * (xi); \ - (xr) = yr; \ - (xi) = yi; \ -} - -#define bli_zscalris( ar, ai, xr, xi ) \ -{ \ - double yr = (ar) * (xr) - (ai) * (xi); \ - double yi = (ai) * (xr) + (ar) * (xi); \ - (xr) = yr; \ - (xi) = yi; \ -} - -#define bli_scscalris( ar, ai, xr, xi ) \ -{ \ - (xr) = (ar) * (xr); \ - (xi) = (ar) * (xi); \ -} - -#define bli_dzscalris( ar, ai, xr, xi ) \ -{ \ - (xr) = (ar) * (xr); \ - (xi) = (ar) * (xi); \ -} - -#endif - -// end bli_scalris.h -// begin bli_scaljris.h - - -#ifndef BLIS_SCALJRIS_H -#define BLIS_SCALJRIS_H - -// scaljris - -#define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) -#define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) -#define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) -#define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( 
(ar), -(ai), (xr), (xi) ) - -#define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) -#define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) - -#endif - -// end bli_scaljris.h -// begin bli_scalcjris.h - - -#ifndef BLIS_SCALCJRIS_H -#define BLIS_SCALCJRIS_H - -// scalcjris - -#define bli_sscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - bli_sscalris( (ar), (ai), (xr), (xi) ); \ -} - -#define bli_dscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - bli_dscalris( (ar), (ai), (xr), (xi) ); \ -} - -#define bli_cscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ - else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ -} - -#define bli_zscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ - else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ -} - -#define bli_iscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - bli_iscalris( (ar), (xi), (xr), (xi) ); \ -} - -#define bli_scscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - bli_scscalris( (ar), (ai), (xr), (xi) ); \ -} - -#define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ -{ \ - bli_dzscalris( (ar), (ai), (xr), (xi) ); \ -} - -#endif -// end bli_scalcjris.h - -// begin bli_scal2ris.h - - -#ifndef BLIS_SCAL2RIS_H -#define BLIS_SCAL2RIS_H - -// scal2ris - -#define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr); \ -} - -#define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr) - (ai) * (xi); \ - (yi) = (ai) * (xr) + (ar) * (xi); \ -} - -#define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr) - (ai) * (xi); \ -} - -#define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ar) * (xi); \ -} - -#define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ai) * (xr); \ -} - -// Notes: -// - The first char encodes the type of a. 
-// - The second char encodes the type of x. -// - The third char encodes the type of y. - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssscal2ris bli_rxscal2ris -#define bli_dssscal2ris bli_rxscal2ris -#define bli_cssscal2ris bli_rxscal2ris -#define bli_zssscal2ris bli_rxscal2ris - -#define bli_sdsscal2ris bli_rxscal2ris -#define bli_ddsscal2ris bli_rxscal2ris -#define bli_cdsscal2ris bli_rxscal2ris -#define bli_zdsscal2ris bli_rxscal2ris - -#define bli_scsscal2ris bli_rxscal2ris -#define bli_dcsscal2ris bli_rxscal2ris -#define bli_ccsscal2ris bli_roscal2ris -#define bli_zcsscal2ris bli_roscal2ris - -#define bli_szsscal2ris bli_rxscal2ris -#define bli_dzsscal2ris bli_rxscal2ris -#define bli_czsscal2ris bli_roscal2ris -#define bli_zzsscal2ris bli_roscal2ris - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdscal2ris bli_rxscal2ris -#define bli_dsdscal2ris bli_rxscal2ris -#define bli_csdscal2ris bli_rxscal2ris -#define bli_zsdscal2ris bli_rxscal2ris - -#define bli_sddscal2ris bli_rxscal2ris -#define bli_dddscal2ris bli_rxscal2ris -#define bli_cddscal2ris bli_rxscal2ris -#define bli_zddscal2ris bli_rxscal2ris - -#define bli_scdscal2ris bli_rxscal2ris -#define bli_dcdscal2ris bli_rxscal2ris -#define bli_ccdscal2ris bli_roscal2ris -#define bli_zcdscal2ris bli_roscal2ris - -#define bli_szdscal2ris bli_rxscal2ris -#define bli_dzdscal2ris bli_rxscal2ris -#define bli_czdscal2ris bli_roscal2ris -#define bli_zzdscal2ris bli_roscal2ris - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2ris bli_rxscal2ris -#define bli_dscscal2ris bli_rxscal2ris -#define bli_cscscal2ris bli_rcscal2ris -#define bli_zscscal2ris bli_rcscal2ris - -#define bli_sdcscal2ris bli_rxscal2ris -#define bli_ddcscal2ris bli_rxscal2ris -#define bli_cdcscal2ris bli_rcscal2ris -#define bli_zdcscal2ris bli_rcscal2ris - -#define bli_sccscal2ris 
bli_crscal2ris -#define bli_dccscal2ris bli_crscal2ris -#define bli_cccscal2ris bli_cxscal2ris -#define bli_zccscal2ris bli_cxscal2ris - -#define bli_szcscal2ris bli_crscal2ris -#define bli_dzcscal2ris bli_crscal2ris -#define bli_czcscal2ris bli_cxscal2ris -#define bli_zzcscal2ris bli_cxscal2ris - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2ris bli_rxscal2ris -#define bli_dszscal2ris bli_rxscal2ris -#define bli_cszscal2ris bli_rcscal2ris -#define bli_zszscal2ris bli_rcscal2ris - -#define bli_sdzscal2ris bli_rxscal2ris -#define bli_ddzscal2ris bli_rxscal2ris -#define bli_cdzscal2ris bli_rcscal2ris -#define bli_zdzscal2ris bli_rcscal2ris - -#define bli_sczscal2ris bli_crscal2ris -#define bli_dczscal2ris bli_crscal2ris -#define bli_cczscal2ris bli_cxscal2ris -#define bli_zczscal2ris bli_cxscal2ris - -#define bli_szzscal2ris bli_crscal2ris -#define bli_dzzscal2ris bli_crscal2ris -#define bli_czzscal2ris bli_cxscal2ris -#define bli_zzzscal2ris bli_cxscal2ris - - - -#define bli_sscal2ris bli_sssscal2ris -#define bli_dscal2ris bli_dddscal2ris -#define bli_cscal2ris bli_cccscal2ris -#define bli_zscal2ris bli_zzzscal2ris - -#endif - -// end bli_scal2ris.h -// begin bli_scal2jris.h - - -#ifndef BLIS_SCAL2JRIS_H -#define BLIS_SCAL2JRIS_H - -// scal2jris - -#define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr); \ -} - -#define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr) + (ai) * (xi); \ - (yi) = (ai) * (xr) - (ar) * (xi); \ -} - -#define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr) + (ai) * (xi); \ -} - -#define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ar) * -(xi); \ -} - -#define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ai) * (xr); \ -} - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. 
-// - The third char encodes the type of y. - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, 
yr, yi ) -#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) -#define 
bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zczscal2jris( ar, ai, 
xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) - -#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) - - - -#define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) -#define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) - -#endif - -// end bli_scal2jris.h - -// begin bli_set0ris.h - - -#ifndef BLIS_SET0RIS_H -#define BLIS_SET0RIS_H - -// set0ris - -#define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) -#define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) -#define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) -#define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) - -#endif - -// end bli_set0ris.h - -// begin bli_sqrt2ris.h - - -#ifndef BLIS_SQRT2RIS_H -#define BLIS_SQRT2RIS_H - -// sqrt2ris - -#define bli_ssqrt2ris( xr, xi, ar, ai ) \ -{ \ - (ar) = sqrtf( (xr) ); \ -} - -#define bli_dsqrt2ris( xr, xi, ar, ai ) \ -{ \ - (ar) = sqrt( (xr) ); \ -} - -#define bli_csqrt2ris( xr, xi, ar, ai ) \ -{ \ - float s = bli_fmaxabs( (xr), (xi) ); \ - float mag; \ - if ( s == 0.0F ) mag = 0.0F; \ - else \ - { \ - mag = sqrtf( s ) * \ - sqrtf( ( (xr) / s ) * (xr) + \ - ( (xi) / s ) * (xi) ); \ - } \ -\ - (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ - (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ -} - -#define bli_zsqrt2ris( xr, xi, ar, ai ) \ -{ \ - double s = bli_fmaxabs( (xr), (xi) ); \ - double mag; \ - if ( s == 0.0 ) mag = 0.0; \ - else \ - { \ - mag = 
sqrt( s ) * \ - sqrt( ( (xr) / s ) * (xr) + \ - ( (xi) / s ) * (xi) ); \ - } \ -\ - (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ - (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ -} - -#define bli_scsqrt2ris( xr, xi, ar, ai ) \ -{ \ - (ar) = sqrtf( (xr) ); \ - (ai) = 0.0F; \ -} - -#define bli_dzsqrt2ris( xr, xi, ar, ai ) \ -{ \ - (ar) = sqrt( (xr) ); \ - (ai) = 0.0; \ -} - -#endif - -// end bli_sqrt2ris.h - -// begin bli_subris.h - - -#ifndef BLIS_SUBRIS_H -#define BLIS_SUBRIS_H - -// subris - -#define bli_ssubris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) - (ar); \ -} - -#define bli_dsubris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) - (ar); \ -} - -#define bli_csubris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) - (ar); \ - (xi) = (xi) - (ai); \ -} - -#define bli_zsubris( ar, ai, xr, xi ) \ -{ \ - (xr) = (xr) - (ar); \ - (xi) = (xi) - (ai); \ -} - -#endif - -// end bli_subris.h -// begin bli_subjris.h - - -#ifndef BLIS_SUBJRIS_H -#define BLIS_SUBJRIS_H - -// subjris - -#define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) -#define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) -#define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) -#define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) - -#endif - -// end bli_subjris.h - -// begin bli_swapris.h - - -#ifndef BLIS_SWAPRIS_H -#define BLIS_SWAPRIS_H - -// swapris - -#define bli_sswapris( ar, ai, br, bi ) \ -{ \ - float tr, ti; \ -\ - bli_scopyris( (br), (bi), (tr), (ti) ); \ - bli_scopyris( (ar), (ai), (br), (bi) ); \ - bli_scopyris( (tr), (ti), (ar), (ai) ); \ -} - -#define bli_dswapris( ar, ai, br, bi ) \ -{ \ - double tr, ti; \ -\ - bli_dcopyris( (br), (bi), (tr), (ti) ); \ - bli_dcopyris( (ar), (ai), (br), (bi) ); \ - bli_dcopyris( (tr), (ti), (ar), (ai) ); \ -} - -#define bli_cswapris( ar, ai, br, bi ) \ -{ \ - scomplex tr, ti; \ -\ - bli_ccopyris( (br), (bi), (tr), (ti) ); \ - bli_ccopyris( (ar), (ai), (br), (bi) ); \ - bli_ccopyris( (tr), (ti), 
(ar), (ai) ); \ -} - -#define bli_zswapris( ar, ai, br, bi ) \ -{ \ - dcomplex tr, ti; \ -\ - bli_zcopyris( (br), (bi), (tr), (ti) ); \ - bli_zcopyris( (ar), (ai), (br), (bi) ); \ - bli_zcopyris( (tr), (ti), (ar), (ai) ); \ -} - -#endif - -// end bli_swapris.h - -// begin bli_xpbyris.h - - -#ifndef BLIS_XPBYRIS_H -#define BLIS_XPBYRIS_H - -// xpbyris - -#define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ -{ \ - (yr) = (xr) + (br) * (yr); \ -} - -#define bli_cxxpbyris( xr, xi, br, bi, yr, yi ) \ -{ \ - const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ - const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ - (yr) = yt_r; \ - (yi) = yt_i; \ -} - -#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ -{ \ - const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ - const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ - (yr) = yt_r; \ - (yi) = yt_i; \ -} - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of b. -// - The third char encodes the type of y. 
- -// -- (xby) = (??s) ------------------------------------------------------------ - -#define bli_sssxpbyris bli_rxxpbyris -#define bli_dssxpbyris bli_rxxpbyris -#define bli_cssxpbyris bli_rxxpbyris -#define bli_zssxpbyris bli_rxxpbyris - -#define bli_sdsxpbyris bli_rxxpbyris -#define bli_ddsxpbyris bli_rxxpbyris -#define bli_cdsxpbyris bli_rxxpbyris -#define bli_zdsxpbyris bli_rxxpbyris - -#define bli_scsxpbyris bli_rxxpbyris -#define bli_dcsxpbyris bli_rxxpbyris -#define bli_ccsxpbyris bli_rxxpbyris -#define bli_zcsxpbyris bli_rxxpbyris - -#define bli_szsxpbyris bli_rxxpbyris -#define bli_dzsxpbyris bli_rxxpbyris -#define bli_czsxpbyris bli_rxxpbyris -#define bli_zzsxpbyris bli_rxxpbyris - -// -- (xby) = (??d) ------------------------------------------------------------ - -#define bli_ssdxpbyris bli_rxxpbyris -#define bli_dsdxpbyris bli_rxxpbyris -#define bli_csdxpbyris bli_rxxpbyris -#define bli_zsdxpbyris bli_rxxpbyris - -#define bli_sddxpbyris bli_rxxpbyris -#define bli_dddxpbyris bli_rxxpbyris -#define bli_cddxpbyris bli_rxxpbyris -#define bli_zddxpbyris bli_rxxpbyris - -#define bli_scdxpbyris bli_rxxpbyris -#define bli_dcdxpbyris bli_rxxpbyris -#define bli_ccdxpbyris bli_rxxpbyris -#define bli_zcdxpbyris bli_rxxpbyris - -#define bli_szdxpbyris bli_rxxpbyris -#define bli_dzdxpbyris bli_rxxpbyris -#define bli_czdxpbyris bli_rxxpbyris -#define bli_zzdxpbyris bli_rxxpbyris - -// -- (xby) = (??c) ------------------------------------------------------------ - -#define bli_sscxpbyris bli_rxxpbyris -#define bli_dscxpbyris bli_rxxpbyris -#define bli_cscxpbyris bli_crxpbyris -#define bli_zscxpbyris bli_crxpbyris - -#define bli_sdcxpbyris bli_rxxpbyris -#define bli_ddcxpbyris bli_rxxpbyris -#define bli_cdcxpbyris bli_crxpbyris -#define bli_zdcxpbyris bli_crxpbyris - -#define bli_sccxpbyris bli_cxxpbyris -#define bli_dccxpbyris bli_cxxpbyris -#define bli_cccxpbyris bli_cxxpbyris -#define bli_zccxpbyris bli_cxxpbyris - -#define bli_szcxpbyris bli_cxxpbyris -#define 
bli_dzcxpbyris bli_cxxpbyris -#define bli_czcxpbyris bli_cxxpbyris -#define bli_zzcxpbyris bli_cxxpbyris - -// -- (xby) = (??z) ------------------------------------------------------------ - -#define bli_sszxpbyris bli_rxxpbyris -#define bli_dszxpbyris bli_rxxpbyris -#define bli_cszxpbyris bli_crxpbyris -#define bli_zszxpbyris bli_crxpbyris - -#define bli_sdzxpbyris bli_rxxpbyris -#define bli_ddzxpbyris bli_rxxpbyris -#define bli_cdzxpbyris bli_crxpbyris -#define bli_zdzxpbyris bli_crxpbyris - -#define bli_sczxpbyris bli_cxxpbyris -#define bli_dczxpbyris bli_cxxpbyris -#define bli_cczxpbyris bli_cxxpbyris -#define bli_zczxpbyris bli_cxxpbyris - -#define bli_szzxpbyris bli_cxxpbyris -#define bli_dzzxpbyris bli_cxxpbyris -#define bli_czzxpbyris bli_cxxpbyris -#define bli_zzzxpbyris bli_cxxpbyris - - - -#define bli_sxpbyris bli_sssxpbyris -#define bli_dxpbyris bli_dddxpbyris -#define bli_cxpbyris bli_cccxpbyris -#define bli_zxpbyris bli_zzzxpbyris - -#endif - -// end bli_xpbyris.h -// begin bli_xpbyjris.h - - -#ifndef BLIS_XPBYJRIS_H -#define BLIS_XPBYJRIS_H - -// xpbyjris - -#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ -{ \ - (yr) = (xr) + (br) * (yr); \ -} - -#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ -{ \ - const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ - const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ - (yr) = yt_r; \ - (yi) = yt_i; \ -} - -#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ -{ \ - const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ - const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ - (yr) = yt_r; \ - (yi) = yt_i; \ -} - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of b. -// - The third char encodes the type of y. 
- -// -- (xby) = (??s) ------------------------------------------------------------ - -#define bli_sssxpbyjris bli_rxxpbyjris -#define bli_dssxpbyjris bli_rxxpbyjris -#define bli_cssxpbyjris bli_rxxpbyjris -#define bli_zssxpbyjris bli_rxxpbyjris - -#define bli_sdsxpbyjris bli_rxxpbyjris -#define bli_ddsxpbyjris bli_rxxpbyjris -#define bli_cdsxpbyjris bli_rxxpbyjris -#define bli_zdsxpbyjris bli_rxxpbyjris - -#define bli_scsxpbyjris bli_rxxpbyjris -#define bli_dcsxpbyjris bli_rxxpbyjris -#define bli_ccsxpbyjris bli_rxxpbyjris -#define bli_zcsxpbyjris bli_rxxpbyjris - -#define bli_szsxpbyjris bli_rxxpbyjris -#define bli_dzsxpbyjris bli_rxxpbyjris -#define bli_czsxpbyjris bli_rxxpbyjris -#define bli_zzsxpbyjris bli_rxxpbyjris - -// -- (xby) = (??d) ------------------------------------------------------------ - -#define bli_ssdxpbyjris bli_rxxpbyjris -#define bli_dsdxpbyjris bli_rxxpbyjris -#define bli_csdxpbyjris bli_rxxpbyjris -#define bli_zsdxpbyjris bli_rxxpbyjris - -#define bli_sddxpbyjris bli_rxxpbyjris -#define bli_dddxpbyjris bli_rxxpbyjris -#define bli_cddxpbyjris bli_rxxpbyjris -#define bli_zddxpbyjris bli_rxxpbyjris - -#define bli_scdxpbyjris bli_rxxpbyjris -#define bli_dcdxpbyjris bli_rxxpbyjris -#define bli_ccdxpbyjris bli_rxxpbyjris -#define bli_zcdxpbyjris bli_rxxpbyjris - -#define bli_szdxpbyjris bli_rxxpbyjris -#define bli_dzdxpbyjris bli_rxxpbyjris -#define bli_czdxpbyjris bli_rxxpbyjris -#define bli_zzdxpbyjris bli_rxxpbyjris - -// -- (xby) = (??c) ------------------------------------------------------------ - -#define bli_sscxpbyjris bli_rxxpbyjris -#define bli_dscxpbyjris bli_rxxpbyjris -#define bli_cscxpbyjris bli_crxpbyjris -#define bli_zscxpbyjris bli_crxpbyjris - -#define bli_sdcxpbyjris bli_rxxpbyjris -#define bli_ddcxpbyjris bli_rxxpbyjris -#define bli_cdcxpbyjris bli_crxpbyjris -#define bli_zdcxpbyjris bli_crxpbyjris - -#define bli_sccxpbyjris bli_cxxpbyjris -#define bli_dccxpbyjris bli_cxxpbyjris -#define bli_cccxpbyjris bli_cxxpbyjris 
-#define bli_zccxpbyjris bli_cxxpbyjris - -#define bli_szcxpbyjris bli_cxxpbyjris -#define bli_dzcxpbyjris bli_cxxpbyjris -#define bli_czcxpbyjris bli_cxxpbyjris -#define bli_zzcxpbyjris bli_cxxpbyjris - -// -- (xby) = (??z) ------------------------------------------------------------ - -#define bli_sszxpbyjris bli_rxxpbyjris -#define bli_dszxpbyjris bli_rxxpbyjris -#define bli_cszxpbyjris bli_crxpbyjris -#define bli_zszxpbyjris bli_crxpbyjris - -#define bli_sdzxpbyjris bli_rxxpbyjris -#define bli_ddzxpbyjris bli_rxxpbyjris -#define bli_cdzxpbyjris bli_crxpbyjris -#define bli_zdzxpbyjris bli_crxpbyjris - -#define bli_sczxpbyjris bli_cxxpbyjris -#define bli_dczxpbyjris bli_cxxpbyjris -#define bli_cczxpbyjris bli_cxxpbyjris -#define bli_zczxpbyjris bli_cxxpbyjris - -#define bli_szzxpbyjris bli_cxxpbyjris -#define bli_dzzxpbyjris bli_cxxpbyjris -#define bli_czzxpbyjris bli_cxxpbyjris -#define bli_zzzxpbyjris bli_cxxpbyjris - - - -#define bli_sxpbyjris bli_sssxpbyjris -#define bli_dxpbyjris bli_dddxpbyjris -#define bli_cxpbyjris bli_cccxpbyjris -#define bli_zxpbyjris bli_zzzxpbyjris - -#endif - -// end bli_xpbyjris.h - -// Inlined scalar macros in loops -// begin bli_scal2ris_mxn.h - - -#ifndef BLIS_SCAL2RIS_MXN_H -#define BLIS_SCAL2RIS_MXN_H - -// scal2ris_mxn - -static void bli_cscal2ris_mxn - ( - const conj_t conjx, - const dim_t m, - const dim_t n, - scomplex* restrict alpha, - scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y - ) -{ - float* restrict alpha_r = ( float* )alpha; \ - float* restrict alpha_i = ( float* )alpha + 1; \ - float* restrict x_r = ( float* )x; \ - float* restrict x_i = ( float* )x + 1; \ - float* restrict y_r = ( float* )y; \ - float* restrict y_i = ( float* )y + is_y; \ - const dim_t incx2 = 2*rs_x; \ - const dim_t ldx2 = 2*cs_x; \ - - \ - - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - float* 
restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - - bli_cscal2jris - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - - bli_cscal2ris - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i - ); - } - } -} - -static void bli_zscal2ris_mxn - ( - const conj_t conjx, - const dim_t m, - const dim_t n, - dcomplex* restrict alpha, - dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y - ) -{ - double* restrict alpha_r = ( double* )alpha; \ - double* restrict alpha_i = ( double* )alpha + 1; \ - double* restrict x_r = ( double* )x; \ - double* restrict x_i = ( double* )x + 1; \ - double* restrict y_r = ( double* )y; \ - double* restrict y_i = ( double* )y + is_y; \ - const dim_t incx2 = 2*rs_x; \ - const dim_t ldx2 = 2*cs_x; \ - - \ - - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - - bli_zscal2jris - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - 
double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - - bli_zscal2ris - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i - ); - } - } -} - - -#endif -// end bli_scal2ris_mxn.h -// begin bli_scalris_mxn_uplo.h - - -#ifndef BLIS_SCALRIS_MXN_UPLO_H -#define BLIS_SCALRIS_MXN_UPLO_H - -// scalris_mxn_u - -#define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_cscalris( *(ar), \ - *(ai), \ - *((xr) + _i*rs_x + _j*cs_x), \ - *((xi) + _i*rs_x + _j*cs_x) ); \ - } \ - } \ -} - -#define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_zscalris( *(ar), \ - *(ai), \ - *((xr) + _i*rs_x + _j*cs_x), \ - *((xi) + _i*rs_x + _j*cs_x) ); \ - } \ - } \ -} - -// scalris_mxn_l - -#define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_cscalris( *(ar), \ - *(ai), \ - *((xr) + _i*rs_x + _j*cs_x), \ - *((xi) + _i*rs_x + _j*cs_x) ); \ - } \ - } \ -} - -#define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_zscalris( *(ar), \ - *(ai), \ - *((xr) + _i*rs_x + _j*cs_x), \ - *((xi) + _i*rs_x + _j*cs_x) ); \ - } \ - } \ -} - -#endif -// end bli_scalris_mxn_uplo.h - - -// -- Conventional scalar macros (paired real/imaginary values) -- - -// begin bli_absq2s.h - - -#ifndef BLIS_ABSQR2_H -#define BLIS_ABSQR2_H - -// absq2s - -// Notes: 
-// - The first char encodes the type of x. -// - The second char encodes the type of a. - -#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) -#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) -#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } - -#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) -#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) -#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } -#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) - -#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) -#define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) -#define bli_ccabsq2s( x, a 
) bli_ccsets( bli_creal(x) * bli_creal(x) + \ - bli_cimag(x) * bli_cimag(x), 0.0, (a) ) -#define bli_zcabsq2s( x, a ) bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ - bli_zimag(x) * bli_zimag(x), 0.0, (a) ) - -#define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) -#define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) -#define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ - bli_cimag(x) * bli_cimag(x), 0.0, (a) ) -#define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ - bli_zimag(x) * bli_zimag(x), 0.0, (a) ) - -#endif // BLIS_ENABLE_C99_COMPLEX - -#define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) -#define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) -#define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) -#define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) - - -#endif -// end bli_absq2s.h - -// begin bli_abval2s.h - - -#ifndef BLIS_ABVAL2S_H -#define BLIS_ABVAL2S_H - -// abval2s - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of a. 
- -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) -#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) -#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } - -#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) -#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) -#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } -#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } - -#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) - -#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) -#define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) -#define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) -#define 
bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) - -#define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) -#define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) -#define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) -#define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) - -#define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) -#define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) -#define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) -#define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) - -#define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) -#define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) -#define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) -#define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) -#define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) -#define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) -#define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) - - -#endif -// end bli_abval2s.h - -// begin bli_adds.h - - -#ifndef BLIS_ADDS_H -#define BLIS_ADDS_H - -// adds - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. 
- -#define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scadds( a, y ) { (y) += (a); } -#define bli_dcadds( a, y ) { (y) += (a); } -#define bli_ccadds( a, y ) { (y) += (a); } -#define bli_zcadds( a, y ) { (y) += (a); } - -#define bli_szadds( a, y ) { (y) += (a); } -#define bli_dzadds( a, y ) { (y) += (a); } -#define bli_czadds( a, y ) { (y) += (a); } -#define bli_zzadds( a, y ) { (y) 
+= (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sadds( a, y ) bli_ssadds( a, y ) -#define bli_dadds( a, y ) bli_ddadds( a, y ) -#define bli_cadds( a, y ) bli_ccadds( a, y ) -#define bli_zadds( a, y ) bli_zzadds( a, y ) - - -#endif - -// end bli_adds.h -// begin bli_addjs.h - - -#ifndef BLIS_ADDJS_H -#define BLIS_ADDJS_H - -// addjs - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. - -#define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) 
-#define bli_zzaddjs( a, y ) bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scaddjs( a, y ) { (y) += (a); } -#define bli_dcaddjs( a, y ) { (y) += (a); } -#define bli_ccaddjs( a, y ) { (y) += conjf(a); } -#define bli_zcaddjs( a, y ) { (y) += conj (a); } - -#define bli_szaddjs( a, y ) { (y) += (a); } -#define bli_dzaddjs( a, y ) { (y) += (a); } -#define bli_czaddjs( a, y ) { (y) += conjf(a); } -#define bli_zzaddjs( a, y ) { (y) += conj (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_saddjs( a, y ) bli_ssaddjs( a, y ) -#define bli_daddjs( a, y ) bli_ddaddjs( a, y ) -#define bli_caddjs( a, y ) bli_ccaddjs( a, y ) -#define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) - - -#endif - -// end bli_addjs.h - -// begin bli_add3s.h - - -#ifndef BLIS_ADD3S_H -#define BLIS_ADD3S_H - -// add3s - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of b. -// - The third char encodes the type of c. 
- - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) -#define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) -#define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) -#define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) - -#define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) - -#define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) - -#define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_czsadd3s( a, b, c ) bli_sadd3ris( 
bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) -#define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) - -#define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) - -#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) - -#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), 
bli_dreal(c), bli_dimag(c) ) -#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) -#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) -#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) -#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) -#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) - -#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) - -#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) 
) -#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) - -#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) -#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) - -#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) - -#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_dczadd3s( a, b, c ) bli_zadd3ris( 
bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) - -#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) -#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); } - -#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } - -#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } - -#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszadd3s( a, b, c ) { 
(c) = (a) + (b); } -#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } - -#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } - -#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } - -#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } -#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) -#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) -#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) -#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) - - -#endif - -// end bli_add3s.h - -// begin bli_axpbys.h - - -#ifndef BLIS_AXPBYS_H -#define BLIS_AXPBYS_H - -// axpbys - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of b. -// - The fourth char encodes the type of y. 
- -// -- (axby) = (???s) ---------------------------------------------------------- - -#define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), 
bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( 
bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zscsaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzcsaxpbys( a, x, b, 
y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zczsaxpbys( a, 
x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) - -// -- (axby) = (???d) ---------------------------------------------------------- - -#define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), 
bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_csddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), 
bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), 
bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cccdaxpbys( a, x, b, y ) bli_rxaxpbyris( 
bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axby) = (???c) ---------------------------------------------------------- - -#define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), 
bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( 
bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dcdcaxpbys( a, x, b, y ) 
bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddccaxpbys( a, x, 
b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dszcaxpbys( 
a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define 
bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) - -// -- (axby) = (???z) ---------------------------------------------------------- - -#define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), 
bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), 
bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( 
bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szczaxpbys( a, x, b, y ) 
bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sczzaxpbys( a, x, 
b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axby) = (???c) ---------------------------------------------------------- - -#define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); 
} -#define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ccscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } 
-#define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -// -- 
(axby) = (???z) ---------------------------------------------------------- - -#define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } 
-#define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } 
-#define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) -#define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) -#define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) -#define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) - - -#endif - -// end bli_axpbys.h -// begin bli_axpbyjs.h - - -#ifndef BLIS_AXPBYJS_H -#define BLIS_AXPBYJS_H - -// axpbyjs - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of b. -// - The fourth char encodes the type of y. 
- -// -- (axby) = (???s) ---------------------------------------------------------- - -#define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zcssaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define 
bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_sreal(y), bli_simag(y) ) -#define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), 
bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) - -// -- (axby) = (???d) ---------------------------------------------------------- - -#define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), 
bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_scddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define 
bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), 
bli_dreal(y), bli_dimag(y) ) -#define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axby) = (???c) 
---------------------------------------------------------- - -#define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), 
bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zddcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define 
bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), 
bli_creal(y), bli_cimag(y) ) -#define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), 
bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) - -// -- (axby) = (???z) ---------------------------------------------------------- - -#define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), 
bli_zreal(y), bli_zimag(y) ) -#define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dcczaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define 
bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axby) = (???c) ---------------------------------------------------------- - -#define 
bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -#define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define 
bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -#define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -#define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } 
-#define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -// -- (axby) = (???z) ---------------------------------------------------------- - -#define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzszaxpbyjs( a, x, b, y ) { (y) 
= (a) * conj(x) + (b) * (y); } - -#define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -#define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_ccczaxpbyjs( a, x, b, y ) { (y) = 
(a) * conjf(x) + (b) * (y); } -#define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -#define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } -#define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } -#define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } -#define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) -#define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) -#define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) -#define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) - - -#endif - -// end bli_axpbyjs.h - -// begin bli_axpys.h - - -#ifndef BLIS_AXPYS_H 
-#define BLIS_AXPYS_H - -// axpys - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. - - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define 
bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), 
bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) - 
-#define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } - -#define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } - -#define bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } - -#define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_czcaxpys( a, x, y ) 
{ (y) += (a) * (x); } -#define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } - -#define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } - -#define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } - -#define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } -#define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) -#define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) -#define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) -#define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) - - -#endif - -// end bli_axpys.h -// begin bli_axpyjs.h - - -#ifndef BLIS_AXPYJS_H -#define BLIS_AXPYJS_H - -// axpyjs - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. 
- - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_czsaxpyjs( a, x, 
y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( 
bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccaxpyjs( a, x, y ) 
bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), 
bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } - -#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } - -#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } -#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } -#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } -#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } - -#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } -#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } -#define bli_czcaxpyjs( a, x, y ) { (y) 
+= (a) * conj(x); } -#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } - -#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } -#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } - -#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } -#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } -#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } -#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } - -#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } -#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } -#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } -#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) -#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) -#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) -#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) - - -#endif - -// end bli_axpyjs.h - -// begin bli_axmys.h - - -#ifndef BLIS_AXMYS_H -#define BLIS_AXMYS_H - -// axmys - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. 
- - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_czsaxmys( a, x, y ) bli_saxmyris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) -#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) -#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dczaxmys( a, x, y ) 
bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } - -#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } - -#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } - -#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } - -// -- (axy) = (??z) ------------------------------------------------------------ - 
-#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } - -#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } - -#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } - -#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } -#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) -#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) -#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) -#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) - - -#endif - -// end bli_axmys.h - -// begin bli_conjs.h - - -#ifndef BLIS_CONJS_H -#define BLIS_CONJS_H - -// conjs - -#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) -#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) -#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_cconjs( x ) { (x) = conjf(x); } -#define bli_zconjs( x ) { (x) = conj (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#endif - -// end bli_conjs.h - -// begin bli_copys.h - - -#ifndef BLIS_COPYS_H -#define BLIS_COPYS_H - -// copys - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero. -#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero. 
-#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - - -#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } - - -#define bli_scopys( x, y ) bli_sscopys( x, y ) -#define bli_dcopys( x, y ) bli_ddcopys( x, y ) -#define bli_ccopys( x, y ) bli_cccopys( x, y ) -#define bli_zcopys( x, y ) bli_zzcopys( x, y ) -#define bli_icopys( x, y ) bli_iicopys( x, y ) - - -#endif - -// end bli_copys.h -// begin bli_copyjs.h - - -#ifndef BLIS_COPYJS_H -#define BLIS_COPYJS_H - -// copyjs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. - -#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define 
bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_sccopyjs( x, y ) { (y) = (x); } -#define bli_dccopyjs( x, y ) { (y) = (x); } -#define bli_cccopyjs( x, y ) { (y) = conjf(x); } -#define bli_zccopyjs( x, y ) { (y) = conj (x); } - -#define bli_szcopyjs( x, y ) { (y) = (x); } -#define bli_dzcopyjs( x, y ) { (y) = (x); } -#define bli_czcopyjs( x, y ) { (y) = conjf(x); } -#define bli_zzcopyjs( x, y ) { (y) = conj (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } - - -#define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) -#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) -#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) -#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) -#define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) - - -#endif - -// end bli_copyjs.h -// begin bli_copycjs.h - - -#ifndef BLIS_COPYCJS_H -#define BLIS_COPYCJS_H - -// copycjs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_sccopycjs( 
conjx, x, y ) { (y) = (x); } -#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } -#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } -#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } - -#define bli_szcopycjs( conjx, x, y ) { (y) = (x); } -#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } -#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } -#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } - - -#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) -#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) -#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) -#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) -#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) - - -#endif - -// end bli_copycjs.h - -// begin bli_copynzs.h - - -#ifndef BLIS_COPYNZS_H -#define BLIS_COPYNZS_H - -// copynzs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -// NOTE: Use of scopyris() is so we don't touch the imaginary part of y. -#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y. 
-#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - - -#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } - - -#define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) -#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) -#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) -#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) -#define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) - - -#endif - -// end bli_copynzs.h -// begin bli_copyjnzs.h - - -#ifndef BLIS_COPYJNZS_H -#define BLIS_COPYJNZS_H - -// copyjnzs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. - -#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we -// don't touch the imaginary part of y. 
-#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we -// don't touch the imaginary part of y. -#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - - -#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } - - -#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) -#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) -#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) -#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) -#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) - - -#endif - -// end bli_copyjnzs.h - -// begin bli_dots.h - - -#ifndef BLIS_DOTS_H -#define BLIS_DOTS_H - -// dots - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. -// - The third char encodes the type of rho. 
- - -#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) -#define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) -#define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) -#define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) - -#define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) -#define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) -#define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) -#define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) - -#define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) -#define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) -#define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) -#define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) - -#define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) -#define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) -#define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) -#define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) - - - -#define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) -#define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) -#define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) -#define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) - -#define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) -#define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) -#define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) -#define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) - -#define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) -#define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) -#define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) -#define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) - -#define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) -#define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) -#define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) -#define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) - - - -#define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) -#define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) -#define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) -#define 
bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) - -#define bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) -#define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) -#define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) -#define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) - -#define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) -#define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) -#define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) -#define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) - -#define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) -#define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) -#define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) -#define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) - - - -#define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) -#define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) -#define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) -#define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) - -#define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) -#define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) -#define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) -#define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) - -#define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) -#define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) -#define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) -#define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) - -#define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) -#define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) -#define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) -#define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) - - - -#define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) -#define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) -#define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) -#define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) - - -#endif - -// end bli_dots.h -// begin bli_dotjs.h - - -#ifndef BLIS_DOTJS_H -#define BLIS_DOTJS_H - -// dotjs - -// Notes: -// - The first char 
encodes the type of x. -// - The second char encodes the type of y. -// - The third char encodes the type of rho. -// - x is used in conjugated form. - - -#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) -#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) -#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) -#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) - -#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) -#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) -#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) -#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) - -#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) -#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) -#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) -#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) - -#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) -#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) -#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) -#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) - - -#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) -#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) -#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) -#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) - -#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) -#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) -#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) -#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) - -#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) -#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) -#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) -#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) - -#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) -#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) -#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) -#define bli_zzddotjs( x, y, a 
) bli_zzdaxpyjs( y, x, a ) - - -#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) -#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) -#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a ) -#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) - -#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) -#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) -#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) -#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) - -#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) -#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) -#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) -#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) - -#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) -#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) -#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) -#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) - - -#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) -#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) -#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) -#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) - -#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) -#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) -#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) -#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) - -#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) -#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) -#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) -#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) - -#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) -#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) -#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) -#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) - - - - - -#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) -#define bli_ddotjs( x, y, a ) 
bli_ddddotjs( x, y, a ) -#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) -#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) - - -#endif - -// end bli_dotjs.h - -// begin bli_eq.h - - -#ifndef BLIS_EQ_H -#define BLIS_EQ_H - - -// eq (passed by value) - -#define bli_seq( a, b ) ( (a) == (b) ) -#define bli_deq( a, b ) ( (a) == (b) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) -#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_ceq( a, b ) ( (a) == (b) ) -#define bli_zeq( a, b ) ( (a) == (b) ) - -#endif // BLIS_ENABLE_C99_COMPLEX - -#define bli_ieq( a, b ) ( (a) == (b) ) - - - -// eqtori (passed by value) - -#define bli_seqtori( a, br, bi ) ( (a) == (br) ) -#define bli_deqtori( a, br, bi ) ( (a) == (br) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) -#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) -#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) - -#endif // BLIS_ENABLE_C99_COMPLEX - - - -// eqa (passed by address) - -#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) -#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) -#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) -#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) -#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) - - - -// eq1 - -#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) -#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) -#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) -#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) -#define bli_ieq1( 
a ) bli_ieq ( (a), 1 ) - - - -// eq0 - -#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) -#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) -#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) -#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) -#define bli_ieq0( a ) bli_ieq ( (a), 0 ) - - - -// eqm1 - -#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F ) -#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) -#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) -#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) -#define bli_ieqm1( a ) bli_ieq ( (a), -1 ) - - - -#endif -// end bli_eq.h - -// begin bli_fprints.h - - -#ifndef BLIS_FPRINTS_H -#define BLIS_FPRINTS_H - -// prints - -#define bli_sfprints( file, spec, x ) \ -{ \ - fprintf( file, spec, (x) ); \ -} -#define bli_dfprints( file, spec, x ) \ -{ \ - fprintf( file, spec, (x) ); \ -} -#define bli_cfprints( file, spec, x ) \ -{ \ - fprintf( file, spec, bli_creal(x) ); \ - fprintf( file, " + " ); \ - fprintf( file, spec, bli_cimag(x) ); \ - fprintf( file, " " ); \ -} -#define bli_zfprints( file, spec, x ) \ -{ \ - fprintf( file, spec, bli_zreal(x) ); \ - fprintf( file, " + " ); \ - fprintf( file, spec, bli_zimag(x) ); \ - fprintf( file, " " ); \ -} -#define bli_ifprints( file, spec, x ) \ -{ \ - fprintf( file, spec, (x) ); \ -} - - -#endif -// end bli_fprints.h - -// begin bli_inverts.h - - -#ifndef BLIS_INVERTS_H -#define BLIS_INVERTS_H - -// inverts - -// Notes: -// - The first char encodes the type of x. 
- -#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) -#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) -#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_cinverts( x ) { (x) = 1.0F / (x); } -#define bli_zinverts( x ) { (x) = 1.0 / (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#endif - -// end bli_inverts.h - -// begin bli_invscals.h - - -#ifndef BLIS_INVSCALS_H -#define BLIS_INVSCALS_H - -// invscals - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. - -#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcinvscals( a, y ) bli_cinvscalris( 
bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scinvscals( a, y ) { (y) /= (a); } -#define bli_dcinvscals( a, y ) { (y) /= (a); } -#define bli_ccinvscals( a, y ) { (y) /= (a); } -#define bli_zcinvscals( a, y ) { (y) /= (a); } - -#define bli_szinvscals( a, y ) { (y) /= (a); } -#define bli_dzinvscals( a, y ) { (y) /= (a); } -#define bli_czinvscals( a, y ) { (y) /= (a); } -#define bli_zzinvscals( a, y ) { (y) /= (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sinvscals( a, y ) bli_ssinvscals( a, y ) -#define bli_dinvscals( a, y ) bli_ddinvscals( a, y ) -#define bli_cinvscals( a, y ) bli_ccinvscals( a, y ) -#define bli_zinvscals( a, y ) bli_zzinvscals( a, y ) - - -#endif - -// end bli_invscals.h -// begin bli_invscaljs.h - - -#ifndef BLIS_INVSCALJS_H -#define BLIS_INVSCALJS_H - -// invscaljs - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. 
- -#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scinvscaljs( a, y ) { (y) /= (a); } -#define bli_dcinvscaljs( a, y ) { (y) /= (a); } -#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); } -#define bli_zcinvscaljs( a, 
y ) { (y) /= conj (a); } - -#define bli_szinvscaljs( a, y ) { (y) /= (a); } -#define bli_dzinvscaljs( a, y ) { (y) /= (a); } -#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); } -#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y ) -#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y ) -#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y ) -#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y ) - - -#endif - -// end bli_invscaljs.h - -// begin bli_neg2s.h - - -#ifndef BLIS_NEG2S_H -#define BLIS_NEG2S_H - -// neg2s - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. - -#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szneg2s( x, y ) 
bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scneg2s( x, y ) { (y) = -(x); } -#define bli_dcneg2s( x, y ) { (y) = -(x); } -#define bli_ccneg2s( x, y ) { (y) = -(x); } -#define bli_zcneg2s( x, y ) { (y) = -(x); } - -#define bli_szneg2s( x, y ) { (y) = -(x); } -#define bli_dzneg2s( x, y ) { (y) = -(x); } -#define bli_czneg2s( x, y ) { (y) = -(x); } -#define bli_zzneg2s( x, y ) { (y) = -(x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) -#define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) -#define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) -#define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) - - -#endif - -// end bli_neg2s.h - -// begin bli_rands.h - - -#ifndef BLIS_RANDS_H -#define BLIS_RANDS_H - -// rands - - -#define bli_srands( a ) \ -{ \ - (a) = ( float ) ( ( double ) rand() / \ - ( ( double ) RAND_MAX / 2.0 ) \ - ) - 1.0F; \ -} -#define bli_drands( a ) \ -{ \ - (a) = ( double ) ( ( double ) rand() / \ - ( ( double ) RAND_MAX / 2.0 ) \ - ) - 1.0; \ -} -#define bli_crands( a ) \ -{ \ - float ar, ai; \ -\ - bli_srands( ar ); \ - bli_srands( ai ); \ -\ - bli_csets( ar, ai, (a) ); \ -} -#define bli_zrands( a ) \ -{ \ - double ar, ai; \ -\ - bli_drands( ar ); \ - bli_drands( ai ); \ -\ - bli_zsets( ar, ai, (a) ); \ -} - - -#endif - -// end bli_rands.h -// begin bli_randnp2s.h - - -#ifndef BLIS_RANDNP2S_H -#define BLIS_RANDNP2S_H - -// randnp2s - - -#define bli_srandnp2s( a ) \ -{ \ - bli_drandnp2s( a ); \ -} - -#if 0 -#define bli_drandnp2s_prev( a ) \ -{ \ - const double m_max = 3.0; \ - const double m_max2 = m_max + 2.0; \ - double t; \ - double r_val; \ -\ - \ 
-\ - \ - t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ -\ - \ - if ( t == m_max2 ) t = t - 1.0; \ -\ - \ - t = floor( t ); \ -\ - \ - if ( t == 0.0 ) r_val = 0.0; \ - else \ - { \ - \ -\ - double s_exp, s_val; \ -\ - \ - PASTEMAC(d,rands)( s_exp ); \ - PASTEMAC(d,rands)( s_val ); \ -\ - \ - if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ - else r_val = pow( 2.0, t - 1.0 ); \ -\ - \ - if ( s_val < 0.0 ) r_val = -r_val; \ - } \ -\ - \ - r_val = r_val / pow( 2.0, m_max ); \ -\ - \ - \ - a = r_val; \ -} -#endif - -#define bli_drandnp2s( a ) \ -{ \ - const double m_max = 6.0; \ - const double m_max2 = m_max + 2.0; \ - double t; \ - double r_val; \ -\ - \ -\ - do \ - { \ - \ - t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ -\ - \ - t = floor( t ); \ - } \ - \ - while ( m_max2 <= t ); \ -\ - \ - if ( t == 0.0 ) r_val = 0.0; \ - else \ - { \ - \ -\ - double s_val; \ -\ - \ - r_val = pow( 2.0, -(t - 1.0) ); \ -\ - \ - PASTEMAC(d,rands)( s_val ); \ -\ - \ - if ( s_val < 0.0 ) r_val = -r_val; \ - } \ -\ - \ - \ - a = r_val; \ -} -#define bli_crandnp2s( a ) \ -{ \ - float ar, ai; \ -\ - bli_srandnp2s( ar ); \ - bli_srandnp2s( ai ); \ -\ - bli_csets( ar, ai, (a) ); \ -} -#define bli_zrandnp2s( a ) \ -{ \ - double ar, ai; \ -\ - bli_drandnp2s( ar ); \ - bli_drandnp2s( ai ); \ -\ - bli_zsets( ar, ai, (a) ); \ -} - - -#endif - -// end bli_randnp2s.h - -// begin bli_scals.h - - -#ifndef BLIS_SCALS_H -#define BLIS_SCALS_H - -// scals - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. 
- -#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scscals( a, y ) { (y) *= (a); } -#define bli_dcscals( a, y ) { (y) *= (a); } -#define bli_ccscals( a, y ) { (y) *= (a); } -#define bli_zcscals( a, y ) { (y) *= (a); } - -#define bli_szscals( a, y ) { (y) *= (a); } -#define bli_dzscals( a, y ) { (y) *= (a); } -#define bli_czscals( a, y ) { (y) 
*= (a); } -#define bli_zzscals( a, y ) { (y) *= (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sscals( a, y ) bli_ssscals( a, y ) -#define bli_dscals( a, y ) bli_ddscals( a, y ) -#define bli_cscals( a, y ) bli_ccscals( a, y ) -#define bli_zscals( a, y ) bli_zzscals( a, y ) - - -#endif - -// end bli_scals.h -// begin bli_scaljs.h - - -#ifndef BLIS_SCALJS_H -#define BLIS_SCALJS_H - -// scaljs - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. - -#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define 
bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scscaljs( a, y ) { (y) *= (a); } -#define bli_dcscaljs( a, y ) { (y) *= (a); } -#define bli_ccscaljs( a, y ) { (y) *= conjf(a); } -#define bli_zcscaljs( a, y ) { (y) *= conj (a); } - -#define bli_szscaljs( a, y ) { (y) *= (a); } -#define bli_dzscaljs( a, y ) { (y) *= (a); } -#define bli_czscaljs( a, y ) { (y) *= conjf(a); } -#define bli_zzscaljs( a, y ) { (y) *= conj (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) -#define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) -#define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) -#define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) - - -#endif - -// end bli_scaljs.h -// begin bli_scalcjs.h - - -#ifndef BLIS_SCALCJS_H -#define BLIS_SCALCJS_H - -// scalcjs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define 
bli_scscalcjs( conjx, x, y ) { (y) *= (x); } -#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); } -#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } -#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } - -#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); } -#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); } -#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } -#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y ) -#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y ) -#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y ) -#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y ) - - -#endif - -// end bli_scalcjs.h - -// begin bli_scal2s.h - - -#ifndef BLIS_SCAL2S_H -#define BLIS_SCAL2S_H - -// scal2s - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. 
- -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define 
bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_szdscal2s( a, 
x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define 
bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sczscal2s( a, 
x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } - -#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } - -#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } - -#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_czcscal2s( a, x, y ) { (y) 
= (a) * (x); } -#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } - -#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } - -#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } - -#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } -#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) -#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) -#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) -#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) - - -#endif - -// end bli_scal2s.h -// begin bli_scal2js.h - - -#ifndef BLIS_SCAL2JS_H -#define BLIS_SCAL2JS_H - -// scal2js - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. 
- - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) - -#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), 
bli_simag(y) ) -#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) -#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), 
bli_dreal(y), bli_dimag(y) ) - -#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dccscal2js( a, x, y ) bli_crscal2jris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) - -#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) -#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzscal2js( a, x, y ) 
bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); } - -#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); } - -#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); } -#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); } -#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); } -#define bli_zccscal2js( a, x, y ) 
{ (y) = (a) * conjf(x); } - -#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); } -#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); } -#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); } -#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); } - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); } - -#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); } -#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); } - -#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); } -#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); } -#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); } -#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); } - -#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); } -#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); } -#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); } -#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y ) -#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y ) -#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y ) -#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y ) - - -#endif - -// end bli_scal2js.h - -// begin bli_set0s.h - - -#ifndef BLIS_SET0S_H -#define BLIS_SET0S_H - -#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) -#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) -#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) -#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) - -#endif - -// end bli_set0s.h - -// begin bli_set1s.h - - -#ifndef BLIS_SET1S_H -#define BLIS_SET1S_H 
- -#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) -#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) -#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) -#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) - -#endif - -// end bli_set1s.h - -// begin bli_seti0s.h - - -#ifndef BLIS_SETI0S_H -#define BLIS_SETI0S_H - -#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) -#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) -#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) -#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) - -#endif - -// end bli_seti0s.h - -// begin bli_sqrt2s.h - - -#ifndef BLIS_SQRT2S_H -#define BLIS_SQRT2S_H - -// sqrt2s - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of a. - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) - -#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) - -#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), 
bli_creal(a), bli_cimag(a) ) - -#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; } -#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; } -#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); } -#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); } - -#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; } -#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; } -#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); } -#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); } - -#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; } -#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; } -#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; } -#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; } - -#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; } -#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; } -#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; } -#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a ) -#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a ) -#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a ) -#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a ) - - -#endif -// end bli_sqrt2s.h - -// begin bli_subs.h - - -#ifndef BLIS_SUBS_H -#define BLIS_SUBS_H - -// subs - -// Notes: -// - The first char 
encodes the type of a. -// - The second char encodes the type of y. - -#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsubs( a, y ) { (y) -= (a); } -#define bli_dcsubs( a, y ) { (y) -= (a); } -#define bli_ccsubs( a, y ) { (y) -= (a); } -#define bli_zcsubs( a, y ) { (y) -= (a); } - -#define bli_szsubs( a, y ) { (y) -= (a); } -#define bli_dzsubs( a, y ) { (y) -= (a); } -#define 
bli_czsubs( a, y ) { (y) -= (a); } -#define bli_zzsubs( a, y ) { (y) -= (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_ssubs( a, y ) bli_sssubs( a, y ) -#define bli_dsubs( a, y ) bli_ddsubs( a, y ) -#define bli_csubs( a, y ) bli_ccsubs( a, y ) -#define bli_zsubs( a, y ) bli_zzsubs( a, y ) - - -#endif - -// end bli_subs.h -// begin bli_subjs.h - - -#ifndef BLIS_SUBJS_H -#define BLIS_SUBJS_H - -// subjs - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of y. - -#define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) -#define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) -#define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) -#define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) -#define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) -#define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) - -#define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_czsubjs( a, y ) 
bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) -#define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_scsubjs( a, y ) { (y) -= (a); } -#define bli_dcsubjs( a, y ) { (y) -= (a); } -#define bli_ccsubjs( a, y ) { (y) -= conjf(a); } -#define bli_zcsubjs( a, y ) { (y) -= conj (a); } - -#define bli_szsubjs( a, y ) { (y) -= (a); } -#define bli_dzsubjs( a, y ) { (y) -= (a); } -#define bli_czsubjs( a, y ) { (y) -= conjf(a); } -#define bli_zzsubjs( a, y ) { (y) -= conj (a); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_ssubjs( a, y ) bli_sssubjs( a, y ) -#define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) -#define bli_csubjs( a, y ) bli_ccsubjs( a, y ) -#define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) - - -#endif - -// end bli_subjs.h - -// begin bli_swaps.h - - -#ifndef BLIS_SWAPS_H -#define BLIS_SWAPS_H - -// swaps - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- - -#define bli_ssswaps( x, y ) \ -{ \ - float w; \ - bli_sscopys( (y), (w) ); \ - bli_sscopys( (x), (y) ); \ - bli_sscopys( (w), (x) ); \ -} -#define bli_dsswaps( x, y ) \ -{ \ - double w; \ - bli_sdcopys( (y), (w) ); \ - bli_dscopys( (x), (y) ); \ - bli_ddcopys( (w), (x) ); \ -} -#define bli_csswaps( x, y ) \ -{ \ - scomplex w; \ - bli_sccopys( (y), (w) ); \ - bli_cscopys( (x), (y) ); \ - bli_cccopys( (w), (x) ); \ -} -#define bli_zsswaps( x, y ) \ -{ \ - dcomplex w; \ - bli_szcopys( (y), (w) ); \ - bli_zscopys( (x), (y) ); \ - bli_zzcopys( (w), (x) ); \ -} - - -#define bli_sdswaps( x, y ) \ -{ \ - float w; \ - bli_dscopys( (y), (w) ); \ - bli_sdcopys( (x), (y) ); \ - bli_sscopys( (w), (x) ); \ -} -#define bli_ddswaps( x, y ) \ -{ \ - double w; \ - bli_ddcopys( (y), (w) ); \ - bli_ddcopys( (x), (y) ); \ - bli_ddcopys( (w), (x) ); \ -} -#define bli_cdswaps( x, y ) \ -{ \ - scomplex w; \ - bli_dccopys( (y), (w) ); \ - bli_cdcopys( (x), (y) ); \ - bli_cccopys( (w), (x) ); \ -} -#define bli_zdswaps( x, y ) \ -{ \ - dcomplex w; \ - bli_dzcopys( (y), (w) ); \ - bli_zdcopys( (x), (y) ); \ - bli_zzcopys( (w), (x) ); \ -} - - -#define bli_scswaps( x, y ) \ -{ \ - float w; \ - bli_cscopys( (y), (w) ); \ - bli_sccopys( (x), (y) ); \ - bli_sscopys( (w), (x) ); \ -} -#define bli_dcswaps( x, y ) \ -{ \ - double w; \ - bli_cdcopys( (y), (w) ); \ - bli_dccopys( (x), (y) ); \ - bli_ddcopys( (w), (x) ); \ -} -#define bli_ccswaps( x, y ) \ -{ \ - scomplex w; \ - bli_cccopys( (y), (w) ); \ - bli_cccopys( (x), (y) ); \ - bli_cccopys( (w), (x) ); \ -} -#define bli_zcswaps( x, y ) \ -{ \ - dcomplex w; \ - bli_czcopys( (y), (w) ); \ - bli_zccopys( (x), (y) ); \ - bli_zzcopys( (w), (x) ); \ -} - - -#define bli_szswaps( x, y ) \ -{ \ - float w; \ - bli_zscopys( (y), (w) ); \ - bli_szcopys( (x), (y) ); \ - bli_sscopys( (w), (x) ); \ -} -#define bli_dzswaps( x, y ) \ -{ \ - double w; \ - bli_zdcopys( (y), (w) ); \ - bli_dzcopys( (x), (y) ); \ - bli_ddcopys( (w), (x) ); \ -} -#define 
bli_czswaps( x, y ) \ -{ \ - scomplex w; \ - bli_zccopys( (y), (w) ); \ - bli_czcopys( (x), (y) ); \ - bli_cccopys( (w), (x) ); \ -} -#define bli_zzswaps( x, y ) \ -{ \ - dcomplex w; \ - bli_zzcopys( (y), (w) ); \ - bli_zzcopys( (x), (y) ); \ - bli_zzcopys( (w), (x) ); \ -} - - -#define bli_sswaps( x, y ) bli_ssswaps( x, y ) -#define bli_dswaps( x, y ) bli_ddswaps( x, y ) -#define bli_cswaps( x, y ) bli_ccswaps( x, y ) -#define bli_zswaps( x, y ) bli_zzswaps( x, y ) - - -#endif -// end bli_swaps.h - -// begin bli_xpbys.h - - -#ifndef BLIS_XPBYS_H -#define BLIS_XPBYS_H - -// xpbys - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of b. -// - The third char encodes the type of y. - -// -- (xby) = (??s) ------------------------------------------------------------ - -#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_sreal(y), bli_simag(y) ) -#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) - -// -- (xby) = (??d) ------------------------------------------------------------ - -#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define 
bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (xby) = (??c) ------------------------------------------------------------ - -#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define 
bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) - -// -- (xby) = (??z) ------------------------------------------------------------ - -#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zszxpbys( x, b, y ) bli_crxpbyris( 
bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (xby) = (??c) ------------------------------------------------------------ - -#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } 
-#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -// -- (xby) = (??z) ------------------------------------------------------------ - -#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) -#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) -#define 
bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) -#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) - - -#endif - -// end bli_xpbys.h -// begin bli_xpbyjs.h - - -#ifndef BLIS_XPBYJS_H -#define BLIS_XPBYJS_H - -// xpbyjs - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of b. -// - The third char encodes the type of y. - -// -- (xby) = (??s) ------------------------------------------------------------ - -#define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( 
bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) - -#define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_czsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) -#define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) - -// -- (xby) = (??d) ------------------------------------------------------------ - -#define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) - -#define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) -#define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) - -#ifndef BLIS_ENABLE_C99_COMPLEX - -// -- (xby) = (??c) ------------------------------------------------------------ - -#define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zdcxpbyjs( x, b, y ) 
bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) - -#define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) -#define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) - -// -- (xby) = (??z) ------------------------------------------------------------ - -#define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( 
bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) - -#define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) -#define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) - -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -// -- (xby) = (??c) ------------------------------------------------------------ - -#define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } 
-#define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -// -- (xby) = (??z) ------------------------------------------------------------ - -#define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } -#define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } - -#endif // BLIS_ENABLE_C99_COMPLEX - - -#define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) -#define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) -#define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) -#define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) - - -#endif - -// end bli_xpbyjs.h - -// Inlined scalar macros in loops -// begin bli_adds_mxn.h - - -#ifndef 
BLIS_ADDS_MXN_H -#define BLIS_ADDS_MXN_H - -// adds_mxn - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. - - -// xy = ?s - -static void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ssadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_ssadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ssadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dsadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dsadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dsadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_csadds( *(x + ii + jj*cs_x), - *(y + ii + 
jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_csadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_csadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zsadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zsadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zsadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// xy = ?d - -static void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sdadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sdadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sdadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const 
inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ddadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_ddadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ddadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cdadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cdadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cdadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zdadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zdadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zdadds( *(x + ii*rs_x + jj*cs_x), - *(y 
+ ii*rs_y + jj*cs_y) ); - } -} - -// xy = ?c - -static void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_scadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_scadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_scadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dcadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dcadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dcadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ccadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; 
++jj ) - bli_ccadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ccadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zcadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zcadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zcadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// xy = ?z - -static void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_szadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_szadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_szadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj 
) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dzadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dzadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dzadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_czadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_czadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_czadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zzadds( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zzadds( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zzadds( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - - - -static void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict 
x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} -static void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} -static void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} -static void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} - - -#endif -// end bli_adds_mxn.h -// begin bli_adds_mxn_uplo.h - - -#ifndef BLIS_ADDS_MXN_UPLO_H -#define BLIS_ADDS_MXN_UPLO_H - -// adds_mxn_u - -#define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - -#define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - -#define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ 
-} - -#define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - -// adds_mxn_l - -#define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - -#define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - -#define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - -#define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - { \ - for ( _i = 0; _i < m; ++_i ) \ - { \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ -} - - -#define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_cadds_mxn_u( 
diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} -#define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ -} - -#endif -// end bli_adds_mxn_uplo.h -// begin bli_set0s_mxn.h - - -#ifndef BLIS_SET0S_MXN_H -#define BLIS_SET0S_MXN_H - -// set0s_mxn - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -static void bli_sset0s_mxn( const dim_t m, const dim_t n, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - bli_sset0s( *(y + i*rs_y + j*cs_y) ); -} - -static void bli_dset0s_mxn( const dim_t m, const dim_t n, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - bli_dset0s( *(y + i*rs_y + j*cs_y) ); -} - -static void bli_cset0s_mxn( const dim_t m, const dim_t n, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - bli_cset0s( *(y + i*rs_y + j*cs_y) ); -} - -static void bli_zset0s_mxn( const dim_t m, const dim_t n, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - bli_zset0s( *(y + i*rs_y + j*cs_y) ); -} - -#endif -// end bli_set0s_mxn.h -// begin bli_copys_mxn.h - - -#ifndef BLIS_COPYS_MXN_H -#define BLIS_COPYS_MXN_H - -// copys_mxn - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -// xy = ?s - -static void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sscopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sscopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sscopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dscopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dscopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dscopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cscopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cscopys( *(x + 
ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cscopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zscopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zscopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zscopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// xy = ?d - -static void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sdcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sdcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sdcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; 
ii < m; ++ii ) - bli_ddcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_ddcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_ddcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cdcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cdcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cdcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zdcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zdcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zdcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// xy = ?c - -static void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, 
const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sccopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sccopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sccopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dccopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dccopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dccopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cccopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cccopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj 
< n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cccopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zccopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zccopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zccopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// xy = ?c - -static void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_szcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_szcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_szcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dzcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - 
else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dzcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dzcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_czcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_czcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_czcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zzcopys( *(x + ii + jj*cs_x), - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zzcopys( *(x + ii*rs_x + jj), - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zzcopys( *(x + ii*rs_x + jj*cs_x), - *(y + ii*rs_y + jj*cs_y) ); - } -} - - -static void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) 
-{ - bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} -static void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} -static void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} -static void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); -} - -#endif -// end bli_copys_mxn.h -// begin bli_scal2s_mxn.h - - -#ifndef BLIS_SCAL2S_MXN_H -#define BLIS_SCAL2S_MXN_H - -// scal2s_mxn - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -static void PASTEMAC(ch,opname) \ - ( \ - const conj_t conjx, \ - const dim_t m, \ - const dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ - ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ - ) \ -{ \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict xj = x + j*cs_x; \ - ctype* restrict yj = y + j*cs_y; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict xij = xj + i*rs_x; \ - ctype* restrict yij = yj + i*rs_y; \ -\ - PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ - } \ - } \ - } \ - else \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict xj = x + j*cs_x; \ - ctype* restrict yj = y + j*cs_y; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict xij = xj + i*rs_x; \ - ctype* restrict yij = yj + i*rs_y; \ -\ - PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( scal2s_mxn ) - -#endif -// end bli_scal2s_mxn.h -// begin 
bli_xpbys_mxn.h - - -#ifndef BLIS_XPBYS_MXN_H -#define BLIS_XPBYS_MXN_H - -// xpbys_mxn - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of b. -// - The third char encodes the type of y. - - -// -- (xby) = (?ss) ------------------------------------------------------------ - -static void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict beta, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_seq0( *beta ) ) - { - bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sssxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sssxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict beta, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
- if ( bli_seq0( *beta ) ) - { - bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dssxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dssxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict beta, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_seq0( *beta ) ) - { - bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cssxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cssxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict beta, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
- if ( bli_seq0( *beta ) ) - { - bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zssxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zssxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// -- (xby) = (?dd) ------------------------------------------------------------ - -static void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict beta, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
- if ( bli_deq0( *beta ) ) - { - bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sddxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sddxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict beta, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_deq0( *beta ) ) - { - bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dddxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dddxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict beta, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
- if ( bli_deq0( *beta ) ) - { - bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cddxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cddxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - double* restrict beta, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_deq0( *beta ) ) - { - bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zddxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zddxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// -- (xby) = (?cc) ------------------------------------------------------------ - -static void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict beta, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or 
NaNs). - if ( bli_ceq0( *beta ) ) - { - bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sccxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_sccxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict beta, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_ceq0( *beta ) ) - { - bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dccxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dccxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict beta, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
- if ( bli_ceq0( *beta ) ) - { - bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cccxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_cccxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict beta, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_ceq0( *beta ) ) - { - bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zccxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zccxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} - -// -- (xby) = (?zz) ------------------------------------------------------------ - -static void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict beta, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has 
infs or NaNs). - if ( bli_zeq0( *beta ) ) - { - bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_szzxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_szzxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict beta, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_zeq0( *beta ) ) - { - bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict beta, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
- if ( bli_zeq0( *beta ) ) - { - bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_czzxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_czzxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} -static void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict beta, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - // If beta is zero, overwrite y with x (in case y has infs or NaNs). - if ( bli_zeq0( *beta ) ) - { - bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); - return; - } - -#ifdef BLIS_ENABLE_CR_CASES - if ( rs_x == 1 && rs_y == 1 ) - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, - *(y + ii + jj*cs_y) ); - } - else if ( cs_x == 1 && cs_y == 1 ) - { - for ( dim_t ii = 0; ii < m; ++ii ) - for ( dim_t jj = 0; jj < n; ++jj ) - bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, - *(y + ii*rs_y + jj) ); - } - else -#endif - { - for ( dim_t jj = 0; jj < n; ++jj ) - for ( dim_t ii = 0; ii < m; ++ii ) - bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, - *(y + ii*rs_y + jj*cs_y) ); - } -} - - - -static void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, - float* restrict beta, - float* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); -} -static void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, 
const inc_t rs_x, const inc_t cs_x, - double* restrict beta, - double* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); -} -static void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict beta, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); -} -static void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict beta, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) -{ - bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); -} - - -#endif -// end bli_xpbys_mxn.h -// begin bli_xpbys_mxn_uplo.h - - -#ifndef BLIS_XPBYS_MXN_UPLO_H -#define BLIS_XPBYS_MXN_UPLO_H - -// xpbys_mxn_u - -#define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_seq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -#define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_deq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_dddxpbys( *(x + 
_i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -#define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_ceq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -#define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_zeq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ - { \ - bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -// xpbys_mxn_l - -#define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_seq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -#define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( 
bli_deq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_ceq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - -#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_zeq0( *beta ) ) \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ - { \ - bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ - } \ -} - - -#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define 
bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} -#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ -} - -#endif -// end bli_xpbys_mxn_uplo.h - -// -- "broadcast B" scalar macros -- - -// begin bli_bcastbbs_mxn.h - - -#ifndef BLIS_BCASTBBS_MXN_H -#define BLIS_BCASTBBS_MXN_H - -// bcastbbs_mxn - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -static void PASTEMAC(ch,opname) \ - ( \ - const dim_t m, \ - const dim_t n, \ - ctype* restrict y, const inc_t incy, const inc_t ldy \ - ) \ -{ \ - \ - const dim_t d = ldy; \ - const dim_t ds_y = 1; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict yi = y + i*incy; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict yij = yi + j*ldy; \ -\ - for ( dim_t p = 1; p < d; ++p ) \ - { \ - ctype* restrict yijd = yij + p*ds_y; \ -\ - PASTEMAC(ch,copys)( *yij, *yijd ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) - -#endif -// end bli_bcastbbs_mxn.h -// begin bli_scal2bbs_mxn.h - - -#ifndef BLIS_SCAL2BBS_MXN_H -#define BLIS_SCAL2BBS_MXN_H - -// scal2bbs_mxn - -#undef GENTFUNCRO -#define GENTFUNCRO( ctype, ch, opname 
) \ -\ -static void PASTEMAC(ch,opname) \ - ( \ - const conj_t conjx, \ - const dim_t m, \ - const dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, const inc_t incx, const inc_t ldx, \ - ctype* restrict y, const inc_t incy, const inc_t ldy \ - ) \ -{ \ - \ - const dim_t d = incy; \ - const dim_t ds_y = 1; \ -\ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict xj = x + j*ldx; \ - ctype* restrict yj = y + j*ldy; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict xij = xj + i*incx; \ - ctype* restrict yij = yj + i*incy; \ -\ - PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ -\ - for ( dim_t p = 1; p < d; ++p ) \ - { \ - ctype* restrict yijd = yij + p*ds_y; \ -\ - PASTEMAC(ch,copys)( *yij, *yijd ); \ - } \ - } \ - } \ - } \ - else \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict xj = x + j*ldx; \ - ctype* restrict yj = y + j*ldy; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict xij = xj + i*incx; \ - ctype* restrict yij = yj + i*incy; \ -\ - PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ -\ - for ( dim_t p = 1; p < d; ++p ) \ - { \ - ctype* restrict yijd = yij + p*ds_y; \ -\ - PASTEMAC(ch,copys)( *yij, *yijd ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ -\ -static void PASTEMAC(ch,opname) \ - ( \ - const conj_t conjx, \ - const dim_t m, \ - const dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, const inc_t incx, const inc_t ldx, \ - ctype* restrict y, const inc_t incy, const inc_t ldy \ - ) \ -{ \ - \ - const dim_t d = incy; \ - const dim_t ds_y = 1; \ -\ - const inc_t incx2 = 2 * incx; \ - const inc_t ldx2 = 2 * ldx; \ -\ - const inc_t incy2 = 2 * incy; \ - const inc_t ldy2 = 2 * ldy; \ -\ - ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ - ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ - ctype_r* restrict chi_r = ( ctype_r* )x; \ - ctype_r* 
restrict chi_i = ( ctype_r* )x + 1; \ - ctype_r* restrict psi_r = ( ctype_r* )y; \ - ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ -\ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict chij_r = chi_r + j*ldx2; \ - ctype_r* restrict chij_i = chi_i + j*ldx2; \ - ctype_r* restrict psij_r = psi_r + j*ldy2; \ - ctype_r* restrict psij_i = psi_i + j*ldy2; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype_r* restrict chiij_r = chij_r + i*incx2; \ - ctype_r* restrict chiij_i = chij_i + i*incx2; \ - ctype_r* restrict psiij_r = psij_r + i*incy2; \ - ctype_r* restrict psiij_i = psij_i + i*incy2; \ -\ - PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ - *chiij_r, *chiij_i, \ - *psiij_r, *psiij_i ); \ -\ - for ( dim_t p = 1; p < d; ++p ) \ - { \ - ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ - ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ -\ - PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ - *psiijd_r, *psiijd_i ); \ - } \ - } \ - } \ - } \ - else \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict chij_r = chi_r + j*ldx2; \ - ctype_r* restrict chij_i = chi_i + j*ldx2; \ - ctype_r* restrict psij_r = psi_r + j*ldy2; \ - ctype_r* restrict psij_i = psi_i + j*ldy2; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype_r* restrict chiij_r = chij_r + i*incx2; \ - ctype_r* restrict chiij_i = chij_i + i*incx2; \ - ctype_r* restrict psiij_r = psij_r + i*incy2; \ - ctype_r* restrict psiij_i = psij_i + i*incy2; \ -\ - PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ - *chiij_r, *chiij_i, \ - *psiij_r, *psiij_i ); \ -\ - for ( dim_t p = 1; p < d; ++p ) \ - { \ - ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ - ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ -\ - PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ - *psiijd_r, *psiijd_i ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) - -#endif -// end bli_scal2bbs_mxn.h -// begin bli_set0bbs_mxn.h - - -#ifndef BLIS_SET0BBS_MXN_H -#define 
BLIS_SET0BBS_MXN_H - -// set0bbs_mxn - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -static void PASTEMAC(ch,opname) \ - ( \ - const dim_t m, \ - const dim_t n, \ - ctype* restrict y, const inc_t incy, const inc_t ldy \ - ) \ -{ \ - \ - const dim_t d = incy; \ - const dim_t ds_y = 1; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict yj = y + j*ldy; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict yij = yj + i*incy; \ -\ - for ( dim_t p = 0; p < d; ++p ) \ - { \ - ctype* restrict yijd = yij + p*ds_y; \ -\ - PASTEMAC(ch,set0s)( *yijd ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) - -#endif -// end bli_set0bbs_mxn.h - - -// -- 3m-specific scalar macros -- - -// begin bli_copyri3s.h - - -#ifndef BLIS_COPYRI3S_H -#define BLIS_COPYRI3S_H - -// copyri3s - -#define bli_scopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ -} - -#define bli_dcopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ -} - -#define bli_ccopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ - (bi) = (ai); \ - (bri) = (ar) + (ai); \ -} - -#define bli_zcopyri3s( ar, ai, br, bi, bri ) \ -{ \ - (br) = (ar); \ - (bi) = (ai); \ - (bri) = (ar) + (ai); \ -} - -#endif - -// end bli_copyri3s.h -// begin bli_copyjri3s.h - - -#ifndef BLIS_COPYJRI3S_H -#define BLIS_COPYJRI3S_H - -// copyjri3s - -#define bli_scopyjri3s( ar, ai, br, bi, bri ) bli_scopyri3s( (ar), -(ai), (br), (bi), (bri) ) -#define bli_dcopyjri3s( ar, ai, br, bi, bri ) bli_dcopyri3s( (ar), -(ai), (br), (bi), (bri) ) -#define bli_ccopyjri3s( ar, ai, br, bi, bri ) bli_ccopyri3s( (ar), -(ai), (br), (bi), (bri) ) -#define bli_zcopyjri3s( ar, ai, br, bi, bri ) bli_zcopyri3s( (ar), -(ai), (br), (bi), (bri) ) - -#endif - -// end bli_copyjri3s.h - -// begin bli_scal2ri3s.h - - -#ifndef BLIS_SCAL2RI3S_H -#define BLIS_SCAL2RI3S_H - -// scal2ri3s - -#define bli_sscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ -} - -#define bli_dscal2ri3s( ar, ai, xr, xi, yr, 
yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ -} - -#define bli_cscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr) - (ai) * (xi); \ - (yi) = (ai) * (xr) + (ar) * (xi); \ - (yri) = (yr) + (yi); \ -} - -#define bli_zscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr) - (ai) * (xi); \ - (yi) = (ai) * (xr) + (ar) * (xi); \ - (yri) = (yr) + (yi); \ -} - -#define bli_scscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ar) * (xi); \ - (yri) = (yr) + (yi); \ -} - -#define bli_dzscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ar) * (xi); \ - (yri) = (yr) + (yi); \ -} - -#endif - -// end bli_scal2ri3s.h -// begin bli_scal2jri3s.h - - -#ifndef BLIS_SCAL2JRI3S_H -#define BLIS_SCAL2JRI3S_H - -// scal2jri3s - -#define bli_sscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ -} - -#define bli_dscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ -} - -#define bli_cscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr) + (ai) * (xi); \ - (yi) = (ai) * (xr) - (ar) * (xi); \ - (yri) = (yr) + (yi); \ -} - -#define bli_zscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr) + (ai) * (xi); \ - (yi) = (ai) * (xr) - (ar) * (xi); \ - (yri) = (yr) + (yi); \ -} - -#define bli_scscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ar) * -(xi); \ - (yri) = (yr) + (yi); \ -} - -#define bli_dzscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ -{ \ - (yr) = (ar) * (xr); \ - (yi) = (ar) * -(xi); \ - (yri) = (yr) + (yi); \ -} - -#endif - -// end bli_scal2jri3s.h - -// begin bli_scal2ri3s_mxn.h - - -#ifndef BLIS_SCAL2RI3S_MXN_H -#define BLIS_SCAL2RI3S_MXN_H - -// scal2ri3s_mxn - -static void bli_cscal2ri3s_mxn - ( - const conj_t conjx, - const dim_t m, - const dim_t n, - scomplex* restrict alpha, - scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t 
cs_y, const inc_t is_y - ) -{ - float* restrict alpha_r = ( float* )alpha; \ - float* restrict alpha_i = ( float* )alpha + 1; \ - float* restrict x_r = ( float* )x; \ - float* restrict x_i = ( float* )x + 1; \ - float* restrict y_r = ( float* )y; \ - float* restrict y_i = ( float* )y + is_y; \ - float* restrict y_rpi = ( float* )y + 2*is_y; \ - const dim_t incx2 = 2*rs_x; \ - const dim_t ldx2 = 2*cs_x; \ - - \ - - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - float* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; - - bli_cscal2jri3s - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i, - *psi11_rpi - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - float* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; - - bli_cscal2ri3s - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i, - *psi11_rpi - ); - } - } -} - -static void bli_zscal2ri3s_mxn - ( - const conj_t conjx, - const dim_t m, - const dim_t n, - dcomplex* restrict alpha, - dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y - ) -{ - double* restrict alpha_r = ( double* )alpha; \ - double* restrict alpha_i = ( double* )alpha + 1; \ - double* restrict x_r = ( double* )x; \ - double* restrict x_i = ( double* )x + 1; \ - double* restrict y_r = ( double* )y; \ - double* restrict y_i = ( double* )y + is_y; \ - double* restrict y_rpi = ( double* )y + 
2*is_y; \ - const dim_t incx2 = 2*rs_x; \ - const dim_t ldx2 = 2*cs_x; \ - - \ - - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - double* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; - - bli_zscal2jri3s - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i, - *psi11_rpi - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; - double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; - double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; - double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; - double* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; - - bli_zscal2ri3s - ( - *alpha_r, - *alpha_i, - *chi11_r, - *chi11_i, - *psi11_r, - *psi11_i, - *psi11_rpi - ); - } - } -} - - -#endif -// end bli_scal2ri3s_mxn.h - - -// -- 4mh/3mh-specific scalar macros -- - -// ro -// begin bli_scal2ros.h - - -#ifndef BLIS_SCAL2ROS_H -#define BLIS_SCAL2ROS_H - -// scal2ros - -#define bli_cscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_creal(a) * bli_creal(x) - bli_cimag(a) * bli_cimag(x); \ -} - -#define bli_zscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_zreal(a) * bli_zreal(x) - bli_zimag(a) * bli_zimag(x); \ -} - -#define bli_scscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_creal(a) * bli_creal(x); \ -} - -#define bli_dzscal2ros( a, x, yr ) \ -{ \ - (yr) = bli_zreal(a) * bli_zreal(x); \ -} - - -#endif - -// end bli_scal2ros.h -// begin bli_scal2jros.h - - -#ifndef BLIS_SCAL2JROS_H -#define BLIS_SCAL2JROS_H - -// scal2jros - -#define bli_cscal2jros( a, x, yr ) \ -{ \ - (yr) = bli_creal(a) * bli_creal(x) + bli_cimag(a) * bli_cimag(x); \ -} - -#define bli_zscal2jros( a, x, yr ) \ -{ \ - (yr) = 
bli_zreal(a) * bli_zreal(x) + bli_zimag(a) * bli_zimag(x); \ -} - -#endif - -// end bli_scal2jros.h - -// io -// begin bli_scal2ios.h - - -#ifndef BLIS_SCAL2IOS_H -#define BLIS_SCAL2IOS_H - -// scal2ios - -#define bli_cscal2ios( a, x, yi ) \ -{ \ - (yi) = bli_cimag(a) * bli_creal(x) + bli_creal(a) * bli_cimag(x); \ -} - -#define bli_zscal2ios( a, x, yi ) \ -{ \ - (yi) = bli_zimag(a) * bli_zreal(x) + bli_zreal(a) * bli_zimag(x); \ -} - -#define bli_scscal2ios( a, x, yi ) \ -{ \ - (yi) = bli_creal(a) * bli_cimag(x); \ -} - -#define bli_dzscal2ios( a, x, yi ) \ -{ \ - (yi) = bli_zreal(a) * bli_zimag(x); \ -} - -#endif - -// end bli_scal2ios.h -// begin bli_scal2jios.h - - -#ifndef BLIS_SCAL2JIOS_H -#define BLIS_SCAL2JIOS_H - -// scal2jios - -#define bli_cscal2jios( a, x, yi ) \ -{ \ - (yi) = bli_cimag(a) * bli_creal(x) - bli_creal(a) * bli_cimag(x); \ -} - -#define bli_zscal2jios( a, x, yi ) \ -{ \ - (yi) = bli_zimag(a) * bli_zreal(x) - bli_zreal(a) * bli_zimag(x); \ -} - - -#endif - -// end bli_scal2jios.h - -// rpi -// begin bli_scal2rpis.h - - -#ifndef BLIS_SCAL2RPIS_H -#define BLIS_SCAL2RPIS_H - -// scal2rpis - -#define bli_cscal2rpis( a, x, yrpi ) \ -{ \ - (yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \ - (bli_creal(a)-bli_cimag(a)) * bli_cimag(x); \ -} - -#define bli_zscal2rpis( a, x, yrpi ) \ -{ \ - (yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \ - (bli_zreal(a)-bli_zimag(a)) * bli_zimag(x); \ -} - -#define bli_scscal2rpis( a, x, yrpi ) \ -{ \ - (yrpi) = bli_creal(a) * bli_creal(x) + \ - bli_creal(a) * bli_cimag(x); \ -} - -#define bli_dzscal2rpis( a, x, yrpi ) \ -{ \ - (yrpi) = bli_zreal(a) * bli_zreal(x) + \ - bli_zreal(a) * bli_zimag(x); \ -} - - -#endif - -// end bli_scal2rpis.h -// begin bli_scal2jrpis.h - - -#ifndef BLIS_SCAL2JRPIS_H -#define BLIS_SCAL2JRPIS_H - -// scal2jrpis - -#define bli_cscal2jrpis( a, x, yrpi ) \ -{ \ - (yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \ - (bli_cimag(a)-bli_creal(a)) * bli_cimag(x); \ -} - 
-#define bli_zscal2jrpis( a, x, yrpi ) \ -{ \ - (yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \ - (bli_zimag(a)-bli_zreal(a)) * bli_zimag(x); \ -} - -#endif - -// end bli_scal2jrpis.h - -// begin bli_scal2rihs_mxn.h - - -#ifndef BLIS_SCAL2RIHS_MXN_H -#define BLIS_SCAL2RIHS_MXN_H - -// scal2rihs_mxn - -static void bli_cscal2rihs_mxn - ( - const pack_t schema, - const conj_t conjx, - const dim_t m, - const dim_t n, - scomplex* restrict alpha, - scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y - ) -{ - scomplex* restrict x_r = x; - float* restrict y_r = ( float* )y; - - if ( bli_is_ro_packed( schema ) ) - { - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_cscal2jros - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_cscal2ros - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - } - else if ( bli_is_io_packed( schema ) ) - { - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_cscal2jios - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_cscal2ios - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - } - else - { - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - scomplex* restrict chi11 = x_r + (i 
)*rs_x + (j )*cs_x; - float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_cscal2jrpis - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_cscal2rpis - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - } -} - -static void bli_zscal2rihs_mxn - ( - const pack_t schema, - const conj_t conjx, - const dim_t m, - const dim_t n, - dcomplex* restrict alpha, - dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y - ) -{ - dcomplex* restrict x_r = x; - double* restrict y_r = ( double* )y; - - if ( bli_is_ro_packed( schema ) ) - { - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_zscal2jros - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_zscal2ros - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - } - else if ( bli_is_io_packed( schema ) ) - { - if ( bli_is_conj( conjx ) ) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_zscal2jios - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_zscal2ios - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - } - else - { - if ( bli_is_conj( conjx ) 
) - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_zscal2jrpis - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - else - { - for ( dim_t j = 0; j < n; ++j ) - for ( dim_t i = 0; i < m; ++i ) - { - dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; - double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; - - bli_zscal2rpis - ( - *alpha, - *chi11, - *psi11_r - ); - } - } - } -} - - -#endif -// end bli_scal2rihs_mxn.h -// begin bli_scal2rihs_mxn_diag.h - - -#ifndef BLIS_SCAL2RIHS_MXN_DIAG_H -#define BLIS_SCAL2RIHS_MXN_DIAG_H - -// scal2rihs_mxn_diag - -#define bli_cscscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t _i; \ -\ - \ - if ( bli_is_ro_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_scscal2ros( *(x + _i*rs_x + _i*cs_x), \ - *(a), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else if ( bli_is_io_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_scscal2ios( *(x + _i*rs_x + _i*cs_x), \ - *(a), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_scscal2rpis( *(x + _i*rs_x + _i*cs_x), \ - *(a), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ -} - -#define bli_zdzscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t _i; \ -\ - \ - if ( bli_is_ro_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_dzscal2ros( *(x + _i*rs_x + _i*cs_x), \ - *(a), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else if ( bli_is_io_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_dzscal2ios( *(x + _i*rs_x + _i*cs_x), \ - *(a), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _i = 0; _i < min_m_n; 
++_i ) \ - { \ - bli_dzscal2rpis( *(x + _i*rs_x + _i*cs_x), \ - *(a), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ -} - -#endif -// end bli_scal2rihs_mxn_diag.h -// begin bli_scal2rihs_mxn_uplo.h - - -#ifndef BLIS_SCAL2RIHS_MXN_UPLO_H -#define BLIS_SCAL2RIHS_MXN_UPLO_H - -// scal2rihs_mxn_uplo - -#define bli_cscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_is_ro_packed( schema ) ) \ - { \ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_cscal2jros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_cscal2ros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_cscal2jros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_cscal2ros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_io_packed( schema ) ) \ - { \ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_cscal2jios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_cscal2ios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_cscal2jios( 
*(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_cscal2ios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_cscal2jrpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_cscal2rpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_cscal2jrpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_cscal2rpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - } \ -} - -#define bli_zscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - \ - if ( bli_is_ro_packed( schema ) ) \ - { \ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_zscal2jros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_zscal2ros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_zscal2jros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - 
*(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_zscal2ros( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_io_packed( schema ) ) \ - { \ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_zscal2jios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_zscal2ios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_zscal2jios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_zscal2ios( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_zscal2jrpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = _j; _i < m; ++_i ) \ - { \ - bli_zscal2rpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_zscal2jrpis( *(a), \ - *(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _j = 0; _j < m; ++_j ) \ - for ( _i = 0; _i < _j + 1; ++_i ) \ - { \ - bli_zscal2rpis( *(a), \ - 
*(x + _i*rs_x + _j*cs_x), \ - *(y_r + _i*rs_y + _j*cs_y) ); \ - } \ - } \ - } \ - } \ -} - -#endif -// end bli_scal2rihs_mxn_uplo.h -// begin bli_setrihs_mxn_diag.h - - -#ifndef BLIS_SETRIHS_MXN_DIAG_H -#define BLIS_SETRIHS_MXN_DIAG_H - -// setrihs_mxn_diag - -#define bli_csetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ -{ \ - const float a_r = bli_zreal( *a ); \ - const float a_i = bli_zimag( *a ); \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t _i; \ -\ - \ - if ( bli_is_ro_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_scopys( (a_r), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else if ( bli_is_io_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_scopys( (a_i), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_sadd3s( (a_r), \ - (a_i), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ -} - -#define bli_zsetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ -{ \ - const double a_r = bli_zreal( *a ); \ - const double a_i = bli_zimag( *a ); \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t _i; \ -\ - \ - if ( bli_is_ro_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_dcopys( (a_r), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else if ( bli_is_io_packed( schema ) ) \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_dcopys( (a_i), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( _i = 0; _i < min_m_n; ++_i ) \ - { \ - bli_dadd3s( (a_r), \ - (a_i), \ - *(y_r + _i*rs_y + _i*cs_y) ); \ - } \ - } \ -} - -#endif -// end bli_setrihs_mxn_diag.h - - -// -- 1m-specific scalar macros -- - -// 1e -// begin bli_copy1es.h - - -#ifndef BLIS_COPY1ES_H -#define BLIS_COPY1ES_H - -// copy1es - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sscopy1es( a, bri, bir ) {} -#define bli_dscopy1es( a, bri, bir ) {} -#define bli_cscopy1es( a, bri, bir ) {} -#define bli_zscopy1es( a, bri, bir ) {} - -#define bli_sdcopy1es( a, bri, bir ) {} -#define bli_ddcopy1es( a, bri, bir ) {} -#define bli_cdcopy1es( a, bri, bir ) {} -#define bli_zdcopy1es( a, bri, bir ) {} - -#define bli_sccopy1es( a, bri, bir ) {} -#define bli_dccopy1es( a, bri, bir ) {} -#define bli_cccopy1es( a, bri, bir ) \ -{ \ - bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ - bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ -} -#define bli_zccopy1es( a, bri, bir ) \ -{ \ - bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ - bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ -} - -#define bli_szcopy1es( a, bri, bir ) {} -#define bli_dzcopy1es( a, bri, bir ) {} -#define bli_czcopy1es( a, bri, bir ) \ -{ \ - bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ - bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ -} -#define bli_zzcopy1es( a, bri, bir ) \ -{ \ - bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ - bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ -} - - -#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir ) -#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir ) - -#endif - -// end bli_copy1es.h -// begin bli_copyj1es.h - - -#ifndef BLIS_COPYJ1ES_H -#define BLIS_COPYJ1ES_H - -// copyj1es - -// Notes: -// - The first char encodes the type of x. -// - The second char encodes the type of y. 
- -#define bli_sscopyj1es( a, bri, bir ) {} -#define bli_dscopyj1es( a, bri, bir ) {} -#define bli_cscopyj1es( a, bri, bir ) {} -#define bli_zscopyj1es( a, bri, bir ) {} - -#define bli_sdcopyj1es( a, bri, bir ) {} -#define bli_ddcopyj1es( a, bri, bir ) {} -#define bli_cdcopyj1es( a, bri, bir ) {} -#define bli_zdcopyj1es( a, bri, bir ) {} - -#define bli_sccopyj1es( a, bri, bir ) {} -#define bli_dccopyj1es( a, bri, bir ) {} -#define bli_cccopyj1es( a, bri, bir ) \ -{ \ - bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ - bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ -} -#define bli_zccopyj1es( a, bri, bir ) \ -{ \ - bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ - bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ -} - -#define bli_szcopyj1es( a, bri, bir ) {} -#define bli_dzcopyj1es( a, bri, bir ) {} -#define bli_czcopyj1es( a, bri, bir ) \ -{ \ - bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ - bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ -} -#define bli_zzcopyj1es( a, bri, bir ) \ -{ \ - bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ - bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ -} - - -#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir ) -#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir ) - -#endif - -// end bli_copyj1es.h - -// begin bli_invert1es.h - - -#ifndef BLIS_INVERT1ES_H -#define BLIS_INVERT1ES_H - -// invert1es - -#define bli_cinvert1es( bri, bir ) \ -{ \ - bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ - bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ -} - -#define bli_zinvert1es( bri, bir ) \ -{ \ - bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ - bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), 
bli_zreal(bir) ); \ -} - -#endif - -// end bli_invert1es.h - -// begin bli_scal1es.h - - -#ifndef BLIS_SCAL1ES_H -#define BLIS_SCAL1ES_H - -// scal1es - -#define bli_cscal1es( a, yri, yir ) \ -{ \ - bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ - bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ -} - -#define bli_zscal1es( a, yri, yir ) \ -{ \ - bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ - bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#endif - -// end bli_scal1es.h - -// begin bli_scal21es.h - - -#ifndef BLIS_SCAL21ES_H -#define BLIS_SCAL21ES_H - -// scal21es - -// Notes: -// - The first char encodes the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssscal21es( a, x, yri, yir ) {} -#define bli_sdsscal21es( a, x, yri, yir ) {} -#define bli_scsscal21es( a, x, yri, yir ) {} -#define bli_szsscal21es( a, x, yri, yir ) {} - -#define bli_dssscal21es( a, x, yri, yir ) {} -#define bli_ddsscal21es( a, x, yri, yir ) {} -#define bli_dcsscal21es( a, x, yri, yir ) {} -#define bli_dzsscal21es( a, x, yri, yir ) {} - -#define bli_cssscal21es( a, x, yri, yir ) {} -#define bli_cdsscal21es( a, x, yri, yir ) {} -#define bli_ccsscal21es( a, x, yri, yir ) {} -#define bli_czsscal21es( a, x, yri, yir ) {} - -#define bli_zssscal21es( a, x, yri, yir ) {} -#define bli_zdsscal21es( a, x, yri, yir ) {} -#define bli_zcsscal21es( a, x, yri, yir ) {} -#define bli_zzsscal21es( a, x, yri, yir ) {} - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdscal21es( a, x, yri, yir ) {} -#define bli_sddscal21es( a, x, yri, yir ) {} -#define bli_scdscal21es( a, x, yri, yir ) {} -#define bli_szdscal21es( a, x, yri, yir ) {} - -#define bli_dsdscal21es( a, x, yri, yir ) 
{} -#define bli_dddscal21es( a, x, yri, yir ) {} -#define bli_dcdscal21es( a, x, yri, yir ) {} -#define bli_dzdscal21es( a, x, yri, yir ) {} - -#define bli_csdscal21es( a, x, yri, yir ) {} -#define bli_cddscal21es( a, x, yri, yir ) {} -#define bli_ccdscal21es( a, x, yri, yir ) {} -#define bli_czdscal21es( a, x, yri, yir ) {} - -#define bli_zsdscal21es( a, x, yri, yir ) {} -#define bli_zddscal21es( a, x, yri, yir ) {} -#define bli_zcdscal21es( a, x, yri, yir ) {} -#define bli_zzdscal21es( a, x, yri, yir ) {} - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal21es( a, x, yri, yir ) {} -#define bli_sdcscal21es( a, x, yri, yir ) {} -#define bli_sccscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_szcscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_dscscal21es( a, x, yri, yir ) {} -#define bli_ddcscal21es( a, x, yri, yir ) {} -#define bli_dccscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_dzcscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_cscscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cdcscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cccscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_czcscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_zscscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zdcscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zccscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zzcscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( 
bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal21es( a, x, yri, yir ) {} -#define bli_sdzscal21es( a, x, yri, yir ) {} -#define bli_sczscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_szzscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_dszscal21es( a, x, yri, yir ) {} -#define bli_ddzscal21es( a, x, yri, yir ) {} -#define bli_dczscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_dzzscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_cszscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cdzscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), 
bli_zimag(yir) ); \ -} -#define bli_cczscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_czzscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_zszscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zdzscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zczscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zzzscal21es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - - - -#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir ) -#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir ) - -#endif - -// end bli_scal21es.h -// begin bli_scal2j1es.h - - -#ifndef BLIS_SCAL2J1ES_H -#define BLIS_SCAL2J1ES_H - -// scal2j1es - -// Notes: -// - The first char encodes 
the type of a. -// - The second char encodes the type of x. -// - The third char encodes the type of y. - -// -- (axy) = (??s) ------------------------------------------------------------ - -#define bli_sssscal2j1es( a, x, yri, yir ) {} -#define bli_sdsscal2j1es( a, x, yri, yir ) {} -#define bli_scsscal2j1es( a, x, yri, yir ) {} -#define bli_szsscal2j1es( a, x, yri, yir ) {} - -#define bli_dssscal2j1es( a, x, yri, yir ) {} -#define bli_ddsscal2j1es( a, x, yri, yir ) {} -#define bli_dcsscal2j1es( a, x, yri, yir ) {} -#define bli_dzsscal2j1es( a, x, yri, yir ) {} - -#define bli_cssscal2j1es( a, x, yri, yir ) {} -#define bli_cdsscal2j1es( a, x, yri, yir ) {} -#define bli_ccsscal2j1es( a, x, yri, yir ) {} -#define bli_czsscal2j1es( a, x, yri, yir ) {} - -#define bli_zssscal2j1es( a, x, yri, yir ) {} -#define bli_zdsscal2j1es( a, x, yri, yir ) {} -#define bli_zcsscal2j1es( a, x, yri, yir ) {} -#define bli_zzsscal2j1es( a, x, yri, yir ) {} - -// -- (axy) = (??d) ------------------------------------------------------------ - -#define bli_ssdscal2j1es( a, x, yri, yir ) {} -#define bli_sddscal2j1es( a, x, yri, yir ) {} -#define bli_scdscal2j1es( a, x, yri, yir ) {} -#define bli_szdscal2j1es( a, x, yri, yir ) {} - -#define bli_dsdscal2j1es( a, x, yri, yir ) {} -#define bli_dddscal2j1es( a, x, yri, yir ) {} -#define bli_dcdscal2j1es( a, x, yri, yir ) {} -#define bli_dzdscal2j1es( a, x, yri, yir ) {} - -#define bli_csdscal2j1es( a, x, yri, yir ) {} -#define bli_cddscal2j1es( a, x, yri, yir ) {} -#define bli_ccdscal2j1es( a, x, yri, yir ) {} -#define bli_czdscal2j1es( a, x, yri, yir ) {} - -#define bli_zsdscal2j1es( a, x, yri, yir ) {} -#define bli_zddscal2j1es( a, x, yri, yir ) {} -#define bli_zcdscal2j1es( a, x, yri, yir ) {} -#define bli_zzdscal2j1es( a, x, yri, yir ) {} - -// -- (axy) = (??c) ------------------------------------------------------------ - -#define bli_sscscal2j1es( a, x, yri, yir ) {} -#define bli_sdcscal2j1es( a, x, yri, yir ) {} -#define bli_sccscal2j1es( 
a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_szcscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_dscscal2j1es( a, x, yri, yir ) {} -#define bli_ddcscal2j1es( a, x, yri, yir ) {} -#define bli_dccscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_dzcscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_cscscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cdcscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cccscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} 
-#define bli_czcscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_zscscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zdcscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zccscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zzcscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -// -- (axy) = (??z) ------------------------------------------------------------ - -#define bli_sszscal2j1es( a, x, yri, yir ) {} -#define bli_sdzscal2j1es( a, x, yri, yir ) {} -#define bli_sczscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_szzscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - 
bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_dszscal2j1es( a, x, yri, yir ) {} -#define bli_ddzscal2j1es( a, x, yri, yir ) {} -#define bli_dczscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_dzzscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_cszscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cdzscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_cczscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_czzscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - -#define bli_zszscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), 
bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zdzscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zczscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} -#define bli_zzzscal2j1es( a, x, yri, yir ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ -} - - - -#define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) -#define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) - -#endif - -// end bli_scal2j1es.h - -// 1r -// begin bli_copy1rs.h - - -#ifndef BLIS_COPY1RS_H -#define BLIS_COPY1RS_H - -// copy1rs - -#define bli_ccopy1rs( a, br, bi ) \ -{ \ - bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ -} - -#define bli_zcopy1rs( a, br, bi ) \ -{ \ - bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ -} - -#endif - -// end bli_copy1rs.h -// begin bli_copyj1rs.h - - -#ifndef BLIS_COPYJ1RS_H -#define BLIS_COPYJ1RS_H - -// copyj1rs - -#define bli_ccopyj1rs( a, br, bi ) \ -{ \ - bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ -} - -#define bli_zcopyj1rs( a, br, bi ) \ -{ \ - bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ -} - -#endif - -// end bli_copyj1rs.h - -// begin bli_invert1rs.h - - -#ifndef BLIS_INVERT1RS_H -#define BLIS_INVERT1RS_H - -// invert1rs - -#define bli_cinvert1rs( xr, xi ) bli_cinvertris( 
xr, xi ) -#define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) - -#endif -// end bli_invert1rs.h - -// begin bli_scal1rs.h - - -#ifndef BLIS_SCAL1RS_H -#define BLIS_SCAL1RS_H - -// scal1rs - -#define bli_cscal1rs( a, yr, yi ) \ -{ \ - bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ -} - -#define bli_zscal1rs( a, yr, yi ) \ -{ \ - bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ -} - -#define bli_scscal1rs( a, yr, yi ) \ -{ \ - bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \ -} - -#define bli_dzscal1rs( a, yr, yi ) \ -{ \ - bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ -} - -#endif - -// end bli_scal1rs.h - -// begin bli_scal21rs.h - - -#ifndef BLIS_SCAL21RS_H -#define BLIS_SCAL21RS_H - -// scal21rs - -#define bli_cscscal21rs( a, x, yr, yi ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ -} - -#define bli_cccscal21rs( a, x, yr, yi ) \ -{ \ - bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ -} - -#define bli_zdzscal21rs( a, x, yr, yi ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ -} - -#define bli_zzzscal21rs( a, x, yr, yi ) \ -{ \ - bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ -} - - -#define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) -#define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) - -#endif - -// end bli_scal21rs.h -// begin bli_scal2j1rs.h - - -#ifndef BLIS_SCAL2J1RS_H -#define BLIS_SCAL2J1RS_H - -// scal2j1rs - -#define bli_cscscal2j1rs( a, x, yr, yi ) \ -{ \ - bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ -} - -#define bli_cccscal2j1rs( a, x, yr, yi ) \ -{ \ - bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ -} - -#define bli_zdzscal2j1rs( a, x, yr, yi ) \ -{ \ - bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ -} - 
-#define bli_zzzscal2j1rs( a, x, yr, yi ) \ -{ \ - bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ -} - - -#define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) -#define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) - -#endif - -// end bli_scal2j1rs.h - -// 1m (1e or 1r) -// begin bli_invert1ms_mxn_diag.h - - -#ifndef BLIS_INVERT1MS_MXN_DIAG_H -#define BLIS_INVERT1MS_MXN_DIAG_H - -// invert1ms_mxn_diag - -#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - scomplex* restrict y_off_ri = y + (offm )*rs_y \ - + (offn )*cs_y; \ - scomplex* restrict y_off_ir = y + (offm )*rs_y \ - + (offn )*cs_y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ - *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - float* restrict y_cast = ( float* )y; \ - float* restrict y_off_r = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2; \ - float* restrict y_off_i = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2 + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ - *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - dcomplex* restrict y_off_ri = y + (offm )*rs_y \ - + (offn )*cs_y; \ - dcomplex* restrict y_off_ir = y + (offm )*rs_y \ - + (offn )*cs_y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ - *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ 
-\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - double* restrict y_cast = ( double* )y; \ - double* restrict y_off_r = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2; \ - double* restrict y_off_i = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2 + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ - *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#endif -// end bli_invert1ms_mxn_diag.h - -// begin bli_scal1ms_mxn.h - - -#ifndef BLIS_SCAL1MS_MXN_H -#define BLIS_SCAL1MS_MXN_H - -// scal1ms_mxn - -#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t i, j; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - scomplex* restrict y_ri = y; \ - scomplex* restrict y_ir = y + ld_y/2; \ -\ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - bli_cscal1es( *(a), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - float* restrict y_cast = ( float* )y; \ - float* restrict y_r = y_cast; \ - float* restrict y_i = y_cast + ld_y; \ -\ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - bli_cscal1rs( *(a), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ -} - -#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t i, j; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - dcomplex* restrict y_ri = y; \ - dcomplex* restrict y_ir = y + ld_y/2; \ -\ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - bli_zscal1es( *(a), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - double* restrict y_cast = ( double* )y; \ - double* restrict y_r = y_cast; \ - double* 
restrict y_i = y_cast + ld_y; \ -\ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - bli_zscal1rs( *(a), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ -} - -#endif -// end bli_scal1ms_mxn.h - -// begin bli_scal21ms_mxn.h - - -#ifndef BLIS_SCAL21MS_MXN_H -#define BLIS_SCAL21MS_MXN_H - -// scal21ms_mxn - -static void bli_cscal21ms_mxn - ( - const pack_t schema, - const conj_t conjx, - const dim_t m, - const dim_t n, - scomplex* restrict alpha, - scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y - ) -{ - dim_t i, j; - - - if ( bli_is_1e_packed( schema ) ) - { - scomplex* restrict y_ri = y; - scomplex* restrict y_ir = y + ld_y/2; - - if ( bli_is_conj( conjx ) ) - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_cscal2j1es( *(alpha), - *(x + i*rs_x + j*cs_x), - *(y_ri + i*rs_y + j*cs_y), - *(y_ir + i*rs_y + j*cs_y) ); - } - } - else - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_cscal21es( *(alpha), - *(x + i*rs_x + j*cs_x), - *(y_ri + i*rs_y + j*cs_y), - *(y_ir + i*rs_y + j*cs_y) ); - } - } - } - else - { - inc_t rs_y2 = rs_y; - inc_t cs_y2 = cs_y; - - - if ( rs_y2 == 1 ) { cs_y2 *= 2; } - else { rs_y2 *= 2; } - - float* restrict y_cast = ( float* )y; - float* restrict y_r = y_cast; - float* restrict y_i = y_cast + ld_y; - - if ( bli_is_conj( conjx ) ) - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_cscal2j1rs( *(alpha), - *(x + i*rs_x + j*cs_x ), - *(y_r + i*rs_y2 + j*cs_y2), - *(y_i + i*rs_y2 + j*cs_y2) ); - } - } - else - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_cscal21rs( *(alpha), - *(x + i*rs_x + j*cs_x ), - *(y_r + i*rs_y2 + j*cs_y2), - *(y_i + i*rs_y2 + j*cs_y2) ); - } - } - } -} - -static void bli_zscal21ms_mxn - ( - const pack_t schema, - const conj_t conjx, - const dim_t m, - const dim_t n, - dcomplex* restrict alpha, - dcomplex* 
restrict x, const inc_t rs_x, const inc_t cs_x, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y - ) -{ - dim_t i, j; - - - if ( bli_is_1e_packed( schema ) ) - { - dcomplex* restrict y_ri = y; - dcomplex* restrict y_ir = y + ld_y/2; - - if ( bli_is_conj( conjx ) ) - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_zscal2j1es( *(alpha), - *(x + i*rs_x + j*cs_x), - *(y_ri + i*rs_y + j*cs_y), - *(y_ir + i*rs_y + j*cs_y) ); - } - } - else - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_zscal21es( *(alpha), - *(x + i*rs_x + j*cs_x), - *(y_ri + i*rs_y + j*cs_y), - *(y_ir + i*rs_y + j*cs_y) ); - } - } - } - else - { - inc_t rs_y2 = rs_y; - inc_t cs_y2 = cs_y; - - - if ( rs_y2 == 1 ) { cs_y2 *= 2; } - else { rs_y2 *= 2; } - - double* restrict y_cast = ( double* )y; - double* restrict y_r = y_cast; - double* restrict y_i = y_cast + ld_y; - - if ( bli_is_conj( conjx ) ) - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_zscal2j1rs( *(alpha), - *(x + i*rs_x + j*cs_x ), - *(y_r + i*rs_y2 + j*cs_y2), - *(y_i + i*rs_y2 + j*cs_y2) ); - } - } - else - { - for ( j = 0; j < n; ++j ) - for ( i = 0; i < m; ++i ) - { - bli_zscal21rs( *(alpha), - *(x + i*rs_x + j*cs_x ), - *(y_r + i*rs_y2 + j*cs_y2), - *(y_i + i*rs_y2 + j*cs_y2) ); - } - } - } -} - -#endif -// end bli_scal21ms_mxn.h -// begin bli_scal21ms_mxn_diag.h - - -#ifndef BLIS_SCAL21MS_MXN_DIAG_H -#define BLIS_SCAL21MS_MXN_DIAG_H - -// scal21ms_mxn_diag - -#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - scomplex* restrict y_off_ri = y; \ - scomplex* restrict y_off_ir = y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_cscscal21es( *(a), \ - *(x + i*rs_x + i*cs_x), \ - *(y_off_ri + i*rs_y + i*cs_y), \ - *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = 
rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - float* restrict y_cast = ( float* )y; \ - float* restrict y_off_r = y_cast; \ - float* restrict y_off_i = y_cast + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_cscscal21rs( *(a), \ - *(x + i*rs_x + i*cs_x ), \ - *(y_off_r + i*rs_y2 + i*cs_y2), \ - *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - dcomplex* restrict y_off_ri = y; \ - dcomplex* restrict y_off_ir = y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zdzscal21es( *(a), \ - *(x + i*rs_x + i*cs_x), \ - *(y_off_ri + i*rs_y + i*cs_y), \ - *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - double* restrict y_cast = ( double* )y; \ - double* restrict y_off_r = y_cast; \ - double* restrict y_off_i = y_cast + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zdzscal21rs( *(a), \ - *(x + i*rs_x + i*cs_x ), \ - *(y_off_r + i*rs_y2 + i*cs_y2), \ - *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#endif -// end bli_scal21ms_mxn_diag.h -// begin bli_scal21ms_mxn_uplo.h - - -#ifndef BLIS_SCAL21MS_MXN_UPLO_H -#define BLIS_SCAL21MS_MXN_UPLO_H - -// scal21ms_mxn_uplo - -#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t i, j; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - scomplex* restrict y_ri = y; \ - scomplex* restrict y_ir = y + ld_y/2; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_cscal2j1es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir 
+ i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_cscal21es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_cscal2j1es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_cscal21es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - float* restrict y_cast = ( float* )y; \ - float* restrict y_r = y_cast; \ - float* restrict y_i = y_cast + ld_y; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_cscal2j1rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_cscal21rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_cscal2j1rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_cscal21rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - } \ - } \ -} - -#define 
bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t i, j; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - dcomplex* restrict y_ri = y; \ - dcomplex* restrict y_ir = y + ld_y/2; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_zscal2j1es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_zscal21es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_zscal2j1es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_zscal21es( *(a), \ - *(x + i*rs_x + j*cs_x), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - double* restrict y_cast = ( double* )y; \ - double* restrict y_r = y_cast; \ - double* restrict y_i = y_cast + ld_y; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_zscal2j1rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_zscal21rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( 
conjx ) ) \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_zscal2j1rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < m; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_zscal21rs( *(a), \ - *(x + i*rs_x + j*cs_x ), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - } \ - } \ -} - -#endif -// end bli_scal21ms_mxn_uplo.h - -// begin bli_set1ms_mxn.h - - -#ifndef BLIS_SET1MS_MXN_H -#define BLIS_SET1MS_MXN_H - -// set1ms_mxn - -#define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - \ -} - -#define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - \ -} - -static void bli_cset1ms_mxn - ( - const pack_t schema, - const dim_t offm, - const dim_t offn, - const dim_t m, - const dim_t n, - scomplex* restrict alpha, - scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y - ) -{ - inc_t offm_local = offm; - inc_t offn_local = offn; - dim_t m_local = m; - dim_t n_local = n; - inc_t rs_y1 = rs_y; - inc_t cs_y1 = cs_y; - inc_t rs_y2 = rs_y; - inc_t cs_y2 = cs_y; - dim_t i, j; - - - if ( cs_y == 1 ) - { - bli_swap_incs( &offm_local, &offn_local ); - bli_swap_dims( &m_local, &n_local ); - bli_swap_incs( &rs_y1, &cs_y1 ); - bli_swap_incs( &rs_y2, &cs_y2 ); - } - - - if ( bli_is_1e_packed( schema ) ) - { - scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 - + (offn_local )*cs_y1; - scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 - + (offn_local )*cs_y1 + ld_y/2; - - for ( j = 0; j < n_local; ++j ) - for ( i = 0; i < m_local; ++i ) - { - bli_ccopy1es( *(alpha), - *(y_off_ri + i*rs_y1 + j*cs_y1), - *(y_off_ir + i*rs_y1 + j*cs_y1) ); - } - } - else - { - - if ( rs_y2 == 1 ) { cs_y2 *= 2; } - else { rs_y2 *= 2; } - - float* restrict y_cast = ( float* )y; - float* restrict y_off_r = y_cast + (offm_local )*rs_y2 - + (offn_local 
)*cs_y2; - float* restrict y_off_i = y_cast + (offm_local )*rs_y2 - + (offn_local )*cs_y2 + ld_y; - - for ( j = 0; j < n_local; ++j ) - for ( i = 0; i < m_local; ++i ) - { - bli_ccopy1rs( *(alpha), - *(y_off_r + i*rs_y2 + j*cs_y2), - *(y_off_i + i*rs_y2 + j*cs_y2) ); - } - } -} - -static void bli_zset1ms_mxn - ( - const pack_t schema, - const dim_t offm, - const dim_t offn, - const dim_t m, - const dim_t n, - dcomplex* restrict alpha, - dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y - ) -{ - inc_t offm_local = offm; - inc_t offn_local = offn; - dim_t m_local = m; - dim_t n_local = n; - inc_t rs_y1 = rs_y; - inc_t cs_y1 = cs_y; - inc_t rs_y2 = rs_y; - inc_t cs_y2 = cs_y; - dim_t i, j; - - - if ( cs_y == 1 ) - { - bli_swap_incs( &offm_local, &offn_local ); - bli_swap_dims( &m_local, &n_local ); - bli_swap_incs( &rs_y1, &cs_y1 ); - bli_swap_incs( &rs_y2, &cs_y2 ); - } - - - if ( bli_is_1e_packed( schema ) ) - { - dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 - + (offn_local )*cs_y1; - dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 - + (offn_local )*cs_y1 + ld_y/2; - - for ( j = 0; j < n_local; ++j ) - for ( i = 0; i < m_local; ++i ) - { - bli_zcopy1es( *(alpha), - *(y_off_ri + i*rs_y1 + j*cs_y1), - *(y_off_ir + i*rs_y1 + j*cs_y1) ); - } - } - else - { - - if ( rs_y2 == 1 ) { cs_y2 *= 2; } - else { rs_y2 *= 2; } - - double* restrict y_cast = ( double* )y; - double* restrict y_off_r = y_cast + (offm_local )*rs_y2 - + (offn_local )*cs_y2; - double* restrict y_off_i = y_cast + (offm_local )*rs_y2 - + (offn_local )*cs_y2 + ld_y; - - for ( j = 0; j < n_local; ++j ) - for ( i = 0; i < m_local; ++i ) - { - bli_zcopy1rs( *(alpha), - *(y_off_r + i*rs_y2 + j*cs_y2), - *(y_off_i + i*rs_y2 + j*cs_y2) ); - } - } -} - -#endif -// end bli_set1ms_mxn.h -// begin bli_set1ms_mxn_diag.h - - -#ifndef BLIS_SET1MS_MXN_DIAG_H -#define BLIS_SET1MS_MXN_DIAG_H - -// set1ms_mxn_diag - -#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, 
cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - scomplex* restrict y_off_ri = y + (offm )*rs_y \ - + (offn )*cs_y; \ - scomplex* restrict y_off_ir = y + (offm )*rs_y \ - + (offn )*cs_y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_ccopy1es( *(a), \ - *(y_off_ri + i*rs_y + i*cs_y), \ - *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - float* restrict y_cast = ( float* )y; \ - float* restrict y_off_r = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2; \ - float* restrict y_off_i = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2 + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_ccopy1rs( *(a), \ - *(y_off_r + i*rs_y2 + i*cs_y2), \ - *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - dcomplex* restrict y_off_ri = y + (offm )*rs_y \ - + (offn )*cs_y; \ - dcomplex* restrict y_off_ir = y + (offm )*rs_y \ - + (offn )*cs_y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zcopy1es( *(a), \ - *(y_off_ri + i*rs_y + i*cs_y), \ - *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - double* restrict y_cast = ( double* )y; \ - double* restrict y_off_r = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2; \ - double* restrict y_off_i = y_cast + (offm )*rs_y2 \ - + (offn )*cs_y2 + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zcopy1rs( *(a), \ - *(y_off_r + i*rs_y2 + i*cs_y2), \ - *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#endif -// end bli_set1ms_mxn_diag.h -// begin bli_set1ms_mxn_uplo.h - - -#ifndef 
BLIS_SET1MS_MXN_UPLO_H -#define BLIS_SET1MS_MXN_UPLO_H - -// set1ms_mxn_uplo - -#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - doff_t diagoff_abs = bli_abs( diagoff ); \ - inc_t offdiag_inc; \ - dim_t i, j; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - \ - if ( diagoff > 0 ) offdiag_inc = cs_y; \ - else offdiag_inc = rs_y; \ -\ - scomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ - scomplex* restrict y_ri = y0; \ - scomplex* restrict y_ir = y0 + ld_y/2; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_ccopy1es( *(a), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_ccopy1es( *(a), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - \ - if ( diagoff > 0 ) offdiag_inc = cs_y2; \ - else offdiag_inc = rs_y2; \ -\ - float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ - float* restrict y_r = y0; \ - float* restrict y_i = y0 + ld_y; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_ccopy1rs( *(a), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_ccopy1rs( *(a), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - } \ -} - -#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ -{ \ - doff_t diagoff_abs = bli_abs( diagoff ); \ - inc_t offdiag_inc; \ - dim_t i, j; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - \ - if ( diagoff > 0 ) offdiag_inc = cs_y; \ - else offdiag_inc = rs_y; \ -\ - dcomplex* 
restrict y0 = y + (diagoff_abs )*offdiag_inc; \ - dcomplex* restrict y_ri = y0; \ - dcomplex* restrict y_ir = y0 + ld_y/2; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_zcopy1es( *(a), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_zcopy1es( *(a), \ - *(y_ri + i*rs_y + j*cs_y), \ - *(y_ir + i*rs_y + j*cs_y) ); \ - } \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - \ - if ( diagoff > 0 ) offdiag_inc = cs_y2; \ - else offdiag_inc = rs_y2; \ -\ - double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ - double* restrict y_r = y0; \ - double* restrict y_i = y0 + ld_y; \ -\ - if ( bli_is_lower( uplo ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = j; i < m; ++i ) \ - { \ - bli_zcopy1rs( *(a), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < j + 1; ++i ) \ - { \ - bli_zcopy1rs( *(a), \ - *(y_r + i*rs_y2 + j*cs_y2), \ - *(y_i + i*rs_y2 + j*cs_y2) ); \ - } \ - } \ - } \ -} - -#endif -// end bli_set1ms_mxn_uplo.h -// begin bli_seti01ms_mxn_diag.h - - -#ifndef BLIS_SETI01MS_MXN_DIAG_H -#define BLIS_SETI01MS_MXN_DIAG_H - -// seti01ms_mxn_diag - -#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - scomplex* restrict y_off_ri = y; \ - scomplex* restrict y_off_ir = y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ - bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 
2; } \ -\ - float* restrict y_cast = ( float* )y; \ - float* restrict y_off_i = y_cast + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ -{ \ - dim_t min_m_n = bli_min( m, n ); \ - dim_t i; \ -\ - \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - dcomplex* restrict y_off_ri = y; \ - dcomplex* restrict y_off_ir = y + ld_y/2; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ - bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ - } \ - } \ - else \ - { \ - inc_t rs_y2 = rs_y; \ - inc_t cs_y2 = cs_y; \ -\ - \ - if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ - else { rs_y2 *= 2; } \ -\ - double* restrict y_cast = ( double* )y; \ - double* restrict y_off_i = y_cast + ld_y; \ -\ - for ( i = 0; i < min_m_n; ++i ) \ - { \ - bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ - } \ - } \ -} - -#endif -// end bli_seti01ms_mxn_diag.h - - -#endif -// end bli_scalar_macro_defs.h -// begin bli_error_macro_defs.h - - -#ifndef BLIS_ERROR_MACRO_DEFS_H -#define BLIS_ERROR_MACRO_DEFS_H - -// -- Error-related macros -- - -// Used to determine the size of the array of error strings. -#define BLIS_MAX_NUM_ERR_MSGS 200 -#define BLIS_MAX_ERR_MSG_LENGTH 200 - -// Used to insert filenames and line numbers into error-checking code. -#define bli_check_error_code( code ) \ - bli_check_error_code_helper( code, __FILE__, __LINE__ ) - - -#endif - -// end bli_error_macro_defs.h -// begin bli_blas_macro_defs.h - - -#ifndef BLIS_BLAS_MACRO_DEFS_H -#define BLIS_BLAS_MACRO_DEFS_H - -// -- Various Fortran compatibility macros -- - -// Macro to treat negative dimensions as zero. - -#define bli_convert_blas_dim1( n_blas, n_blis )\ -{ \ - if ( n_blas < 0 ) n_blis = ( dim_t )0; \ - else n_blis = ( dim_t )n_blas; \ -} - -// Macro to flip signs of increments if input increments are negative. 
- -#define bli_convert_blas_incv( n, x_blas, incx_blas, \ - x_blis, incx_blis ) \ -{ \ - if ( incx_blas < 0 ) \ - { \ - \ - x_blis = (x_blas) + (n-1)*(-incx_blas); \ - incx_blis = ( inc_t )(incx_blas); \ - } \ - else \ - { \ - x_blis = (x_blas); \ - incx_blis = ( inc_t )(incx_blas); \ - } \ -} - - - -#endif - -// end bli_blas_macro_defs.h -// begin bli_builtin_macro_defs.h - - -#ifndef BLIS_BUILTIN_MACRO_DEFS_H -#define BLIS_BUILTIN_MACRO_DEFS_H - -#if defined(__ICC) || defined(__INTEL_COMPILER) - - // icc - - #define bli_prefetch( addr, rw, loc ) - -#elif defined(__clang__) - - // clang - - #define bli_prefetch( addr, rw, loc ) - -#elif defined(__GNUC__) - - // gcc - - #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); - -#endif - - -#endif -// end bli_builtin_macro_defs.h - -// begin bli_oapi_macro_defs.h - - -// Define the suffix to add to object API function names that include -// additional "expert" parameters. -#define BLIS_OAPI_EX_SUF _ex - -// end bli_oapi_macro_defs.h -// begin bli_tapi_macro_defs.h - - -// Define the suffix to add to typed API function names that include -// additional "expert" parameters. -#define BLIS_TAPI_EX_SUF _ex - -// end bli_tapi_macro_defs.h - - -#endif -// end bli_macro_defs.h - - -// -- pragma definitions -- - -// begin bli_pragma_macro_defs.h - - - - -#ifndef BLIS_PRAGMA_MACRO_DEFS_H -#define BLIS_PRAGMA_MACRO_DEFS_H - -// Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define -// all instances of PRAGMA_SIMD as _Pragma("omp simd"). - -#ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD - #define PRAGMA_OMP_SIMD _Pragma("omp simd") -#else - #define PRAGMA_OMP_SIMD -#endif - -// Require ISO C99 or later for SIMD-related pragmas. -#if (( __STDC_VERSION__ >= 199901L )) - - #define GEN_PRAGMA(x) _Pragma(#x) - - #if defined(__ICC) || defined(__INTEL_COMPILER) - - // Intel icc. - //#define PRAGMA_SIMD GEN_PRAGMA(simd) - #define PRAGMA_SIMD PRAGMA_OMP_SIMD - - #elif defined(__clang__) - - // clang/llvm. 
- #define PRAGMA_SIMD PRAGMA_OMP_SIMD - - #elif defined(__GNUC__) - - // GNU gcc. - #define PRAGMA_SIMD PRAGMA_OMP_SIMD - - #else - - // Unknown compiler. - #define PRAGMA_SIMD - - #endif -#endif - -#endif -// end bli_pragma_macro_defs.h - - -// -- Threading definitions -- - -// begin bli_thread.h - - -#ifndef BLIS_THREAD_H -#define BLIS_THREAD_H - -// Include thread communicator (thrcomm_t) object definitions and prototypes. -// begin bli_thrcomm.h - - -#ifndef BLIS_THRCOMM_H -#define BLIS_THRCOMM_H - -// Include definitions (mostly thrcomm_t) specific to the method of -// multithreading. -// begin bli_thrcomm_single.h - - -#ifndef BLIS_THRCOMM_SINGLE_H -#define BLIS_THRCOMM_SINGLE_H - -// Define thrcomm_t for situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING - -//thread communicators may be implementation dependent -#ifdef BLIS_TREE_BARRIER -struct barrier_s -{ - int arity; - int count; - struct barrier_s* dad; - int signal; -}; -typedef struct barrier_s barrier_t; - -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - barrier_t** barriers; -}; -#else -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - bool_t barrier_sense; - dim_t barrier_threads_arrived; -}; -#endif -typedef struct thrcomm_s thrcomm_t; - -#endif - -#endif - -// end bli_thrcomm_single.h -// begin bli_thrcomm_openmp.h - - -#ifndef BLIS_THRCOMM_OPENMP_H -#define BLIS_THRCOMM_OPENMP_H - -// Define thrcomm_t for situations when OpenMP multithreading is enabled. -#ifdef BLIS_ENABLE_OPENMP - -#include // skipped - -// Define thrcomm_t for tree barriers and non-tree barriers. 
-#ifdef BLIS_TREE_BARRIER -struct barrier_s -{ - int arity; - int count; - struct barrier_s* dad; - volatile int signal; -}; -typedef struct barrier_s barrier_t; - -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - barrier_t** barriers; -}; -#else -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - //volatile bool_t barrier_sense; - bool_t barrier_sense; - dim_t barrier_threads_arrived; -}; -#endif - -typedef struct thrcomm_s thrcomm_t; - -// Prototypes specific to tree barriers. -#ifdef BLIS_TREE_BARRIER -barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); -void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); -void bli_thrcomm_tree_barrier( barrier_t* barack ); -#endif - -#endif - -#endif - -// end bli_thrcomm_openmp.h -// begin bli_thrcomm_pthreads.h - - -#ifndef BLIS_THRCOMM_PTHREADS_H -#define BLIS_THRCOMM_PTHREADS_H - -// Define thrcomm_t for situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS - -#ifdef BLIS_USE_PTHREAD_BARRIER -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - bli_pthread_barrier_t barrier; -}; -#else -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - -//#ifdef BLIS_USE_PTHREAD_MUTEX -// bli_pthread_mutex_t mutex; -//#endif - - //volatile bool_t barrier_sense; - bool_t barrier_sense; - dim_t barrier_threads_arrived; -}; -#endif - -typedef struct thrcomm_s thrcomm_t; - -#endif - -#endif - -// end bli_thrcomm_pthreads.h - - -// thrcomm_t query (field only) - -static dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) -{ - return comm->n_threads; -} - - -// Thread communicator prototypes. 
-thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); -void bli_thrcomm_cleanup( thrcomm_t* comm ); - -BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); -BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); - -void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); - -#endif - -// end bli_thrcomm.h - -// Include thread info (thrinfo_t) object definitions and prototypes. -// begin bli_thrinfo.h - - -#ifndef BLIS_THRINFO_H -#define BLIS_THRINFO_H - -// Thread info structure definition -struct thrinfo_s -{ - // The thread communicator for the other threads sharing the same work - // at this level. - thrcomm_t* ocomm; - - // Our thread id within the ocomm thread communicator. - dim_t ocomm_id; - - // The number of distinct threads used to parallelize the loop. - dim_t n_way; - - // What we're working on. - dim_t work_id; - - // When freeing, should the communicators in this node be freed? Usually, - // this is field is true, but when nodes are created that share the same - // communicators as other nodes (such as with packm nodes), this is set - // to false. - bool_t free_comm; - - // The bszid_t to help identify the node. This is mostly only useful when - // debugging or tracing the allocation and release of thrinfo_t nodes. - bszid_t bszid; - - struct thrinfo_s* sub_prenode; - struct thrinfo_s* sub_node; -}; -typedef struct thrinfo_s thrinfo_t; - -// -// thrinfo_t functions -// NOTE: The naming of these should be made consistent at some point. -// (ie: bli_thrinfo_ vs. 
bli_thread_) -// - -// thrinfo_t query (field only) - -static dim_t bli_thread_num_threads( thrinfo_t* t ) -{ - return (t->ocomm)->n_threads; -} - -static dim_t bli_thread_ocomm_id( thrinfo_t* t ) -{ - return t->ocomm_id; -} - -static dim_t bli_thread_n_way( thrinfo_t* t ) -{ - return t->n_way; -} - -static dim_t bli_thread_work_id( thrinfo_t* t ) -{ - return t->work_id; -} - -static thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) -{ - return t->ocomm; -} - -static bool_t bli_thrinfo_needs_free_comm( thrinfo_t* t ) -{ - return t->free_comm; -} - -static dim_t bli_thread_bszid( thrinfo_t* t ) -{ - return t->bszid; -} - -static thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) -{ - return t->sub_node; -} - -static thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) -{ - return t->sub_prenode; -} - -// thrinfo_t query (complex) - -static bool_t bli_thread_am_ochief( thrinfo_t* t ) -{ - return t->ocomm_id == 0; -} - -// thrinfo_t modification - -static void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) -{ - t->ocomm = ocomm; -} - -static void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) -{ - t->ocomm_id = ocomm_id; -} - -static void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) -{ - t->n_way = n_way; -} - -static void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) -{ - t->work_id = work_id; -} - -static void bli_thrinfo_set_free_comm( bool_t free_comm, thrinfo_t* t ) -{ - t->free_comm = free_comm; -} - -static void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) -{ - t->bszid = bszid; -} - -static void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) -{ - t->sub_node = sub_node; -} - -static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) -{ - t->sub_prenode = sub_prenode; -} - -// other thrinfo_t-related functions - -static void* bli_thread_broadcast( thrinfo_t* t, void* p ) -{ - return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); -} - -static void bli_thread_barrier( thrinfo_t* t ) -{ - 
bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); -} - - -// -// Prototypes for level-3 thrinfo functions not specific to any operation. -// - -thrinfo_t* bli_thrinfo_create - ( - rntm_t* rntm, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool_t free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool_t free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_thrinfo_init_single - ( - thrinfo_t* thread - ); - -void bli_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); - -// ----------------------------------------------------------------------------- - -void bli_thrinfo_grow - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_rgrow - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_create_for_cntl - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_create_for_cntl_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -void bli_thrinfo_grow_tree_ic - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); -#endif - -#endif -// end bli_thrinfo.h -// begin bli_thrinfo_sup.h - - -#ifndef BLIS_THRINFO_SUP_H -#define BLIS_THRINFO_SUP_H - -// -// Prototypes for level-3 thrinfo sup functions. 
-// - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - bszid_t* bszid_par, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_chl, - thrinfo_t* thread_par - ); - -#endif -// end bli_thrinfo_sup.h - -// Include some operation-specific thrinfo_t prototypes. -// Note that the bli_packm_thrinfo.h must be included before the others! -// begin bli_packm_thrinfo.h - - -// -// thrinfo_t macros specific to packm. -// - - - -#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ -\ - ( i % n_way == work_id % n_way ) - -#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ -\ - ( start <= i && i < end ) - -// Define a general-purpose version of bli_packm_my_iter() whose definition -// depends on whether slab or round-robin partitioning was requested at -// configure-time. -#ifdef BLIS_ENABLE_JRIR_SLAB - - #define bli_packm_my_iter bli_packm_my_iter_sl - -#else // BLIS_ENABLE_JRIR_RR - - #define bli_packm_my_iter bli_packm_my_iter_rr - -#endif - - -// -// thrinfo_t APIs specific to packm. -// - -#if 0 -thrinfo_t* bli_packm_thrinfo_create - ( - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); -#endif - -void bli_packm_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_packm_thrinfo_init_single - ( - thrinfo_t* thread - ); - -#if 0 -void bli_packm_thrinfo_free - ( - thrinfo_t* thread - ); -#endif - -// end bli_packm_thrinfo.h -// begin bli_l3_thrinfo.h - - -// -// thrinfo_t macros specific to various level-3 operations. -// - -// gemm - -// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
-#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) -#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) - -// herk - -// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. -#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) -#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) - -// trmm - -// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. -#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) -#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) - -#define bli_trmm_my_iter_rr( index, thread ) \ -\ - ( index % thread->n_way == thread->work_id % thread->n_way ) - -// trsm - -#define bli_trsm_my_iter_rr( index, thread ) \ -\ - ( index % thread->n_way == thread->work_id % thread->n_way ) - -// -// thrinfo_t APIs specific to level-3 operations. 
-// - -void bli_l3_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); - -void bli_l3_thrinfo_init_single - ( - thrinfo_t* thread - ); - -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); - -void bli_l3_sup_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); - -// ----------------------------------------------------------------------------- - -void bli_l3_thrinfo_create_root - ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread - ); - -void bli_l3_sup_thrinfo_create_root - ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread - ); - -void bli_l3_sup_thrinfo_update_root - ( - rntm_t* rntm, - thrinfo_t* thread - ); - -void bli_l3_thrinfo_print_gemm_paths - ( - thrinfo_t** threads - ); - -void bli_l3_thrinfo_print_trsm_paths - ( - thrinfo_t** threads - ); - -// ----------------------------------------------------------------------------- - -void bli_l3_thrinfo_free_paths - ( - rntm_t* rntm, - thrinfo_t** threads - ); - -// end bli_l3_thrinfo.h - -// Include the level-3 thread decorator and related definitions and prototypes -// for the conventional code path. -// begin bli_l3_decor.h - - -#ifndef BLIS_L3_DECOR_H -#define BLIS_L3_DECOR_H - -// -- conventional definitions ------------------------------------------------- - -// Level-3 internal function type. -typedef void (*l3int_t) - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// Level-3 thread decorator prototype. -void bli_l3_thread_decorator - ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -// Include definitions specific to the method of multithreading for the -// conventional code path. 
-// begin bli_l3_decor_single.h - - -#ifndef BLIS_L3_DECOR_SINGLE_H -#define BLIS_L3_DECOR_SINGLE_H - -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING - -#endif - -#endif - -// end bli_l3_decor_single.h -// begin bli_l3_decor_openmp.h - - -#ifndef BLIS_L3_DECOR_OPENMP_H -#define BLIS_L3_DECOR_OPENMP_H - -// Definitions specific to situations when OpenMP multithreading is enabled. -#ifdef BLIS_ENABLE_OPENMP - -void bli_l3_thread_decorator_thread_check - ( - dim_t n_threads, - dim_t tid, - thrcomm_t* gl_comm, - rntm_t* rntm - ); - -#endif - -#endif - -// end bli_l3_decor_openmp.h -// begin bli_l3_decor_pthreads.h - - -#ifndef BLIS_L3_DECOR_PTHREADS_H -#define BLIS_L3_DECOR_PTHREADS_H - -// Definitions specific to situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS - -// Thread entry point prototype. -void* bli_l3_thread_entry( void* data_void ); - -#endif - -#endif - -// end bli_l3_decor_pthreads.h - -#endif - -// end bli_l3_decor.h - -// Include the level-3 thread decorator and related definitions and prototypes -// for the sup code path. -// begin bli_l3_sup_decor.h - - -#ifndef BLIS_L3_SUP_DECOR_H -#define BLIS_L3_SUP_DECOR_H - -// -- sup definitions ---------------------------------------------------------- - -// Level-3 sup internal function type. -typedef err_t (*l3supint_t) - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -// Level-3 sup thread decorator prototype. -err_t bli_l3_sup_thread_decorator - ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -// Include definitions specific to the method of multithreading for the -// sup code path. 
-// begin bli_l3_sup_decor_single.h - - -#ifndef BLIS_L3_SUP_DECOR_SINGLE_H -#define BLIS_L3_SUP_DECOR_SINGLE_H - -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING - -#endif - -#endif - -// end bli_l3_sup_decor_single.h -// begin bli_l3_sup_decor_openmp.h - - -#ifndef BLIS_L3_SUP_DECOR_OPENMP_H -#define BLIS_L3_SUP_DECOR_OPENMP_H - -// Definitions specific to situations when OpenMP multithreading is enabled. -#ifdef BLIS_ENABLE_OPENMP - -#endif - -#endif - -// end bli_l3_sup_decor_openmp.h -// begin bli_l3_sup_decor_pthreads.h - - -#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H -#define BLIS_L3_SUP_DECOR_PTHREADS_H - -// Definitions specific to situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS - -// Thread entry point prototype. -void* bli_l3_sup_thread_entry( void* data_void ); - -#endif - -#endif - -// end bli_l3_sup_decor_pthreads.h - -#endif - -// end bli_l3_sup_decor.h - -// Initialization-related prototypes. -void bli_thread_init( void ); -void bli_thread_finalize( void ); - -#ifdef _MSC_VER -#define strerror_r(errno,buf,len) strerror_s(buf,len,errno) -#endif - -// Thread range-related prototypes. 
- -BLIS_EXPORT_BLIS -void bli_thread_range_sub - ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* start, - dim_t* end - ); - -#undef GENPROT -#define GENPROT( opname ) \ -\ -siz_t PASTEMAC0( opname ) \ - ( \ - dir_t direct, \ - thrinfo_t* thr, \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntl_t* cntl, \ - cntx_t* cntx, \ - dim_t* start, \ - dim_t* end \ - ); - -GENPROT( thread_range_mdim ) -GENPROT( thread_range_ndim ) - -#undef GENPROT -#define GENPROT( opname ) \ -\ -siz_t PASTEMAC0( opname ) \ - ( \ - thrinfo_t* thr, \ - obj_t* a, \ - blksz_t* bmult, \ - dim_t* start, \ - dim_t* end \ - ); - -GENPROT( thread_range_l2r ) -GENPROT( thread_range_r2l ) -GENPROT( thread_range_t2b ) -GENPROT( thread_range_b2t ) - -GENPROT( thread_range_weighted_l2r ) -GENPROT( thread_range_weighted_r2l ) -GENPROT( thread_range_weighted_t2b ) -GENPROT( thread_range_weighted_b2t ) - - -dim_t bli_thread_range_width_l - ( - doff_t diagoff_j, - dim_t m, - dim_t n_j, - dim_t j, - dim_t n_way, - dim_t bf, - dim_t bf_left, - double area_per_thr, - bool_t handle_edge_low - ); -siz_t bli_find_area_trap_l - ( - dim_t m, - dim_t n, - doff_t diagoff - ); -siz_t bli_thread_range_weighted_sub - ( - thrinfo_t* restrict thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* restrict j_start_thr, - dim_t* restrict j_end_thr - ); - -// ----------------------------------------------------------------------------- - -// Factorization and partitioning prototypes -typedef struct -{ - dim_t n; - dim_t sqrt_n; - dim_t f; -} bli_prime_factors_t; - -void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); - -dim_t bli_next_prime_factor(bli_prime_factors_t* factors); - -void bli_thread_partition_2x2 - ( - dim_t n_thread, - dim_t work1, - dim_t work2, - dim_t* restrict nt1, - dim_t* restrict nt2 - ); - -// ----------------------------------------------------------------------------- - -dim_t bli_gcd( dim_t x, dim_t 
y ); -dim_t bli_lcm( dim_t x, dim_t y ); -dim_t bli_ipow( dim_t base, dim_t power ); - -// ----------------------------------------------------------------------------- - -BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); - -BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); -BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); - -void bli_thread_init_rntm_from_env( rntm_t* rntm ); - -// ----------------------------------------------------------------------------- - -static void bli_thread_range_jrir_rr - ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Use interleaved partitioning of jr/ir loops. - *start = bli_thread_work_id( thread ); - *inc = bli_thread_n_way( thread ); - *end = n; -} - -static void bli_thread_range_jrir_sl - ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Use contiguous slab partitioning of jr/ir loops. - bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); - *inc = 1; -} - -static void bli_thread_range_jrir - ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Define a general-purpose version of bli_thread_range_jrir() whose - // definition depends on whether slab or round-robin partitioning was - // requested at configure-time. 
-#ifdef BLIS_ENABLE_JRIR_SLAB - bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); -#else - bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); -#endif -} - -#if 0 -static void bli_thread_range_weighted_jrir - ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ -#ifdef BLIS_ENABLE_JRIR_SLAB - - // Use contiguous slab partitioning for jr/ir loops. - bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, - handle_edge_low, start, end ); - - *start = *start / bf; *inc = 1; - - if ( *end % bf ) *end = *end / bf + 1; - else *end = *end / bf; - -#else - - // Use interleaved partitioning of jr/ir loops. - *start = bli_thread_work_id( thread ); - *inc = bli_thread_n_way( thread ); - *end = n; - -#endif -} -#endif - -#endif - -// end bli_thread.h -// begin bli_pthread.h - - -#ifndef BLIS_PTHREAD_H -#define BLIS_PTHREAD_H - -#if defined(_MSC_VER) - -// This branch defines a pthread-like API, bli_pthread_*(), and implements it -// in terms of Windows API calls. 
- -// -- pthread_mutex_*() -- - -typedef SRWLOCK bli_pthread_mutex_t; -typedef void bli_pthread_mutexattr_t; - -#define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT - -BLIS_EXPORT_BLIS int bli_pthread_mutex_init - ( - bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_lock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock - ( - bli_pthread_mutex_t* mutex - ); - -// -- pthread_once_*() -- - -typedef INIT_ONCE bli_pthread_once_t; - -#define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT - -BLIS_EXPORT_BLIS void bli_pthread_once - ( - bli_pthread_once_t* once, - void (*init)(void) - ); - -// -- pthread_cond_*() -- - -typedef CONDITION_VARIABLE bli_pthread_cond_t; -typedef void bli_pthread_condattr_t; - -#define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT - -BLIS_EXPORT_BLIS int bli_pthread_cond_init - ( - bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_destroy - ( - bli_pthread_cond_t* cond - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_wait - ( - bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast - ( - bli_pthread_cond_t* cond - ); - -// -- pthread_create(), pthread_join() -- - -typedef struct -{ - HANDLE handle; - void* retval; -} bli_pthread_t; - -typedef void bli_pthread_attr_t; - -BLIS_EXPORT_BLIS int bli_pthread_create - ( - bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg - ); - -BLIS_EXPORT_BLIS int bli_pthread_join - ( - bli_pthread_t thread, - void** retval - ); - -// -- pthread_barrier_*() -- - -typedef void bli_pthread_barrierattr_t; - -typedef struct -{ - bli_pthread_mutex_t mutex; - bli_pthread_cond_t cond; 
- int count; - int tripCount; -} bli_pthread_barrier_t; - -BLIS_EXPORT_BLIS int bli_pthread_barrier_init - ( - bli_pthread_barrier_t* barrier, - const bli_pthread_barrierattr_t* attr, - unsigned int count - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy - ( - bli_pthread_barrier_t* barrier - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_wait - ( - bli_pthread_barrier_t* barrier - ); - -#else // !defined(_MSC_VER) - -#include // skipped - -// This branch defines a pthreads-like API, bli_pthreads_*(), and implements it -// in terms of the corresponding pthreads_*() types, macros, and function calls. - -// -- pthread types -- - -typedef pthread_t bli_pthread_t; -typedef pthread_attr_t bli_pthread_attr_t; -typedef pthread_mutex_t bli_pthread_mutex_t; -typedef pthread_mutexattr_t bli_pthread_mutexattr_t; -typedef pthread_cond_t bli_pthread_cond_t; -typedef pthread_condattr_t bli_pthread_condattr_t; -typedef pthread_once_t bli_pthread_once_t; - -#if defined(__APPLE__) - -// For OS X, we must define the barrier types ourselves since Apple does -// not implement barriers in their variant of pthreads. - -typedef void bli_pthread_barrierattr_t; - -typedef struct -{ - bli_pthread_mutex_t mutex; - bli_pthread_cond_t cond; - int count; - int tripCount; -} bli_pthread_barrier_t; - -#else - -// For other non-Windows OSes (primarily Linux), we can define the barrier -// types in terms of existing pthreads barrier types since we expect they -// will be provided by the pthreads implementation. 
- -typedef pthread_barrier_t bli_pthread_barrier_t; -typedef pthread_barrierattr_t bli_pthread_barrierattr_t; - -#endif - -// -- pthreads macros -- - -#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -#define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER -#define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT - -// -- pthread_create(), pthread_join() -- - -BLIS_EXPORT_BLIS int bli_pthread_create - ( - bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg - ); - -BLIS_EXPORT_BLIS int bli_pthread_join - ( - bli_pthread_t thread, - void** retval - ); - -// -- pthread_mutex_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_mutex_init - ( - bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_lock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock - ( - bli_pthread_mutex_t* mutex - ); - -// -- pthread_cond_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_cond_init - ( - bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_destroy - ( - bli_pthread_cond_t* cond - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_wait - ( - bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast - ( - bli_pthread_cond_t* cond - ); - -// -- pthread_once_*() -- - -BLIS_EXPORT_BLIS void bli_pthread_once - ( - bli_pthread_once_t* once, - void (*init)(void) - ); - -// -- pthread_barrier_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_barrier_init - ( - bli_pthread_barrier_t* barrier, - const bli_pthread_barrierattr_t* attr, - unsigned int count - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy - ( - bli_pthread_barrier_t* barrier - ); - -BLIS_EXPORT_BLIS int 
bli_pthread_barrier_wait - ( - bli_pthread_barrier_t* barrier - ); - -#endif // _MSC_VER - -#endif // BLIS_PTHREAD_H -// end bli_pthread.h - - -// -- Constant definitions -- - -// begin bli_extern_defs.h - - -#ifndef BLIS_EXTERN_DEFS_H -#define BLIS_EXTERN_DEFS_H - -BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; -BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; -//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; -BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; -//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; -BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; -BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; - -BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; - -#endif -// end bli_extern_defs.h - - -// -- BLIS architecture/kernel definitions -- - -// begin bli_l1v_ker_prot.h - - - -// -// Define template prototypes for level-1v kernels. -// - -#define ADDV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); - - -#define AMAXV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - dim_t* restrict index, \ - cntx_t* restrict cntx \ - ); \ - - -#define AXPBYV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); \ - - -#define AXPYV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); \ - - -#define COPYV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - 
dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); - - -#define DOTV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict rho, \ - cntx_t* restrict cntx \ - ); \ - - -#define DOTXV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict beta, \ - ctype* restrict rho, \ - cntx_t* restrict cntx \ - ); \ - - -#define INVERTV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ - ); \ - - -#define SCALV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ - ); \ - - -#define SCAL2V_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); \ - - -#define SETV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ - ); \ - - -#define SUBV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); - - -#define SWAPV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); \ - - -#define 
XPBYV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); \ - -// end bli_l1v_ker_prot.h -// begin bli_l1f_ker_prot.h - - - -// -// Define template prototypes for level-1f kernels. -// - -#define AXPY2V_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict alphax, \ - ctype* restrict alphay, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ - ); - - -#define AXPYF_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); - - -#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict rho, \ - ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ - ); - - -#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict w, inc_t incw, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ - ); - - -#define DOTXF_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* restrict 
alpha, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); - -// end bli_l1f_ker_prot.h -// begin bli_l1m_ker_prot.h - - - -// -// Define template prototypes for level-1m kernels. -// - -// native packm kernels - -#define PACKM_KER_PROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - void* restrict kappa, \ - void* restrict a, inc_t inca, inc_t lda, \ - void* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - - -// native unpackm kernels - -#define UNPACKM_KER_PROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ); - - -// 3mis packm kernels - -#define PACKM_3MIS_KER_PROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - void* restrict kappa, \ - void* restrict a, inc_t inca, inc_t lda, \ - void* restrict p, inc_t is_p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - - -// 4mi packm kernels - -#define PACKM_4MI_KER_PROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - void* restrict kappa, \ - void* restrict a, inc_t inca, inc_t lda, \ - void* restrict p, inc_t is_p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - - -// rih packm kernels - -#define PACKM_RIH_KER_PROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - void* restrict kappa, \ - void* restrict a, inc_t inca, inc_t lda, \ - void* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - - -// 1e/1r packm kernels - -#define PACKM_1ER_KER_PROT( ctype, ch, 
varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - void* restrict kappa, \ - void* restrict a, inc_t inca, inc_t lda, \ - void* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - -// end bli_l1m_ker_prot.h -// begin bli_l2_ker_prot.h - - - -// -// Define template prototypes for level-1f kernels. -// - -#define GEMV_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t m, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t lda, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ - ); - -// end bli_l2_ker_prot.h -// begin bli_l3_ukr_prot.h - - -// -// Define template prototypes for level-3 micro-kernels. -// - -#define GEMM_UKR_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - - -#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - - -#define TRSM_UKR_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - -// end bli_l3_ukr_prot.h -// begin bli_l3_sup_ker_prot.h - - -// -// Define template prototypes for level-3 kernels on small/unpacked matrices. 
-// - -#define GEMMSUP_KER_PROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - -// end bli_l3_sup_ker_prot.h - -// begin bli_arch_config_pre.h - - -#ifndef BLIS_ARCH_CONFIG_PRE_H -#define BLIS_ARCH_CONFIG_PRE_H - - -// -- Naming-related kernel definitions ---------------------------------------- - -// The default suffix appended to reference kernels. -#define BLIS_REF_SUFFIX _ref - -// A suffix used for labeling certain induced method aware functions. -#define BLIS_IND_SUFFIX _ind - -// Add an underscore to the BLIS kernel set string, if it was defined. -#ifdef BLIS_CNAME -#define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) -#endif - -// Combine the CNAME and _ref for convenience to the code that defines -// reference kernels. -//#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) - -// -- Prototype-generating macro definitions ----------------------------------- - -// Prototype-generating macro for bli_cntx_init_*() functions. 
-#define CNTX_INIT_PROTS( archname ) \ -\ -void PASTEMAC(cntx_init_,archname) \ - ( \ - cntx_t* cntx \ - ); \ -void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ - ( \ - cntx_t* cntx \ - ); \ -void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ - ( \ - ind_t method, \ - num_t dt, \ - cntx_t* cntx \ - ); - - -#endif - -// end bli_arch_config_pre.h -// begin bli_arch_config.h - - -#ifndef BLIS_ARCH_CONFIG_H -#define BLIS_ARCH_CONFIG_H - -// -// -- Context initialization prototypes ---------------------------------------- -// - -// -- Intel64 architectures -- -#ifdef BLIS_CONFIG_SKX -CNTX_INIT_PROTS( skx ) -#endif -#ifdef BLIS_CONFIG_KNL -CNTX_INIT_PROTS( knl ) -#endif -#ifdef BLIS_CONFIG_KNC -CNTX_INIT_PROTS( knc ) -#endif -#ifdef BLIS_CONFIG_HASWELL -CNTX_INIT_PROTS( haswell ) -#endif -#ifdef BLIS_CONFIG_SANDYBRIDGE -CNTX_INIT_PROTS( sandybridge ) -#endif -#ifdef BLIS_CONFIG_PENRYN -CNTX_INIT_PROTS( penryn ) -#endif - -// -- AMD64 architectures -- -#ifdef BLIS_CONFIG_ZEN2 -CNTX_INIT_PROTS( zen2 ) -#endif -#ifdef BLIS_CONFIG_ZEN -CNTX_INIT_PROTS( zen ) -#endif -#ifdef BLIS_CONFIG_EXCAVATOR -CNTX_INIT_PROTS( excavator ) -#endif -#ifdef BLIS_CONFIG_STEAMROLLER -CNTX_INIT_PROTS( steamroller ) -#endif -#ifdef BLIS_CONFIG_PILEDRIVER -CNTX_INIT_PROTS( piledriver ) -#endif -#ifdef BLIS_CONFIG_BULLDOZER -CNTX_INIT_PROTS( bulldozer ) -#endif - -// -- ARM architectures -- - -#ifdef BLIS_CONFIG_THUNDERX2 -CNTX_INIT_PROTS( thunderx2 ) -#endif -#ifdef BLIS_CONFIG_CORTEXA57 -CNTX_INIT_PROTS( cortexa57 ) -#endif -#ifdef BLIS_CONFIG_CORTEXA53 -CNTX_INIT_PROTS( cortexa53 ) -#endif -#ifdef BLIS_CONFIG_CORTEXA15 -CNTX_INIT_PROTS( cortexa15 ) -#endif -#ifdef BLIS_CONFIG_CORTEXA9 -CNTX_INIT_PROTS( cortexa9 ) -#endif - -// -- IBM Power -- - -#ifdef BLIS_CONFIG_POWER9 -CNTX_INIT_PROTS( power9 ) -#endif -#ifdef BLIS_CONFIG_POWER7 -CNTX_INIT_PROTS( power7 ) -#endif - -// -- IBM BG/Q -- - -#ifdef BLIS_CONFIG_BGQ -CNTX_INIT_PROTS( bgq ) -#endif - -// -- Generic -- - -#ifdef 
BLIS_CONFIG_GENERIC -CNTX_INIT_PROTS( generic ) -#endif - - -// -// -- Architecture family-specific headers ------------------------------------- -// - -// -- x86_64 families -- - -#ifdef BLIS_FAMILY_INTEL64 -#include "bli_family_intel64.h" // skipped -#endif -#ifdef BLIS_FAMILY_AMD64 -#include "bli_family_amd64.h" // skipped -#endif -#ifdef BLIS_FAMILY_X86_64 -#include "bli_family_x86_64.h" // skipped -#endif - -// -- Intel64 architectures -- -#ifdef BLIS_FAMILY_SKX -#include "bli_family_skx.h" // skipped -#endif -#ifdef BLIS_FAMILY_KNL -#include "bli_family_knl.h" // skipped -#endif -#ifdef BLIS_FAMILY_KNC -#include "bli_family_knc.h" // skipped -#endif -#ifdef BLIS_FAMILY_HASWELL -#include "bli_family_haswell.h" // skipped -#endif -#ifdef BLIS_FAMILY_SANDYBRIDGE -#include "bli_family_sandybridge.h" // skipped -#endif -#ifdef BLIS_FAMILY_PENRYN -#include "bli_family_penryn.h" // skipped -#endif - -// -- AMD64 architectures -- - -#ifdef BLIS_FAMILY_ZEN2 -// begin bli_family_zen2.h - - -#ifndef BLI_FAMILY_ZEN2_ -#define BLI_FAMILY_ZEN2_ - -// By default, it is effective to parallelize the outer loops. -// Setting these macros to 1 will force JR and IR inner loops -// to be not paralleized. -#define BLIS_THREAD_MAX_IR 1 -#define BLIS_THREAD_MAX_JR 1 - - -#define BLIS_ENABLE_SMALL_MATRIX -#define BLIS_ENABLE_SMALL_MATRIX_TRSM - - -// This will select the threshold below which small matrix code will be called. 
-#define BLIS_SMALL_MATRIX_THRES 700 -#define BLIS_SMALL_M_RECT_MATRIX_THRES 160 -#define BLIS_SMALL_K_RECT_MATRIX_THRES 128 - -#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) -#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 - -#define BLIS_ENABLE_SMALL_MATRIX_ROME -#define BLIS_SMALL_MATRIX_THRES_ROME 400 - -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 - -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 - -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 - -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 - -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 -#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 - -// When running HPL with pure MPI without DGEMM threading (Single-threaded -// BLIS), defining this macro as 1 yields better performance. 
-#define AOCL_BLIS_MULTIINSTANCE 0 - -#endif - -// end bli_family_zen2.h -#endif -#ifdef BLIS_FAMILY_ZEN -#include "bli_family_zen.h" // skipped -#endif -#ifdef BLIS_FAMILY_EXCAVATOR -#include "bli_family_excavator.h" // skipped -#endif -#ifdef BLIS_FAMILY_STEAMROLLER -#include "bli_family_steamroller.h" // skipped -#endif -#ifdef BLIS_FAMILY_PILEDRIVER -#include "bli_family_piledriver.h" // skipped -#endif -#ifdef BLIS_FAMILY_BULLDOZER -#include "bli_family_bulldozer.h" // skipped -#endif - -// -- ARM architectures -- - -#ifdef BLIS_FAMILY_THUNDERX2 -#include "bli_family_thunderx2.h" // skipped -#endif -#ifdef BLIS_FAMILY_CORTEXA57 -#include "bli_family_cortexa57.h" // skipped -#endif -#ifdef BLIS_FAMILY_CORTEXA53 -#include "bli_family_cortexa53.h" // skipped -#endif -#ifdef BLIS_FAMILY_CORTEXA15 -#include "bli_family_cortexa15.h" // skipped -#endif -#ifdef BLIS_FAMILY_CORTEXA9 -#include "bli_family_cortexa9.h" // skipped -#endif - -// -- IBM Power -- - -#ifdef BLIS_FAMILY_POWER9 -#include "bli_family_power9.h" // skipped -#endif -#ifdef BLIS_FAMILY_POWER7 -#include "bli_family_power7.h" // skipped -#endif - -// -- IBM BG/Q -- - -#ifdef BLIS_FAMILY_BGQ -#include "bli_family_bgq.h" // skipped -#endif - -// -- Generic -- - -#ifdef BLIS_FAMILY_GENERIC -#include "bli_family_generic.h" // skipped -#endif - - -// -// -- kernel set prototypes ---------------------------------------------------- -// - -// -- Intel64 architectures -- -#ifdef BLIS_KERNELS_SKX -#include "bli_kernels_skx.h" // skipped -#endif -#ifdef BLIS_KERNELS_KNL -#include "bli_kernels_knl.h" // skipped -#endif -#ifdef BLIS_KERNELS_KNC -#include "bli_kernels_knc.h" // skipped -#endif -#ifdef BLIS_KERNELS_HASWELL -// begin bli_kernels_haswell.h - - -// -- level-3 ------------------------------------------------------------------ - -// gemm (asm d6x8) -GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) -GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) -GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) 
-GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) - -// gemm (asm d8x6) -GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) -GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) -GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) -GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) - -// gemmtrsm_l (asm d6x8) -GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) - -// gemmtrsm_u (asm d6x8) -GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) - - -// gemm (asm d8x6) -//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) -//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) -//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) -//GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) - - -// -- level-3 sup -------------------------------------------------------------- - -// gemmsup_rv - -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) - -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) - -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) - 
-GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) - -GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) -GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) -GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) -GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) -GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) -GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) - -// gemmsup_rv (mkernel in m dim) - -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) - -// gemmsup_rv (mkernel in n dim) - -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) - -// gemmsup_rd - -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) - -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) - -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) - -// 
gemmsup_rd (mkernel in m dim) - -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) - -// gemmsup_rd (mkernel in n dim) - -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) -GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) - -// end bli_kernels_haswell.h -#endif -#ifdef BLIS_KERNELS_SANDYBRIDGE -#include "bli_kernels_sandybridge.h" // skipped -#endif -#ifdef BLIS_KERNELS_PENRYN -#include "bli_kernels_penryn.h" // skipped -#endif - -// -- AMD64 architectures -- - -#ifdef BLIS_KERNELS_ZEN2 -// begin bli_kernels_zen2.h - - - -// -- level-1f -- -AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) -AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) - -// -- level-2 -- - -//gemv(scalar code) -GEMV_KER_PROT( double, d, gemv_zen_ref_c ) - - -// end bli_kernels_zen2.h -#endif -#ifdef BLIS_KERNELS_ZEN -// begin bli_kernels_zen.h - - -// -- level-1m -- -PACKM_KER_PROT(double, d, packm_8xk_gen_zen) -PACKM_KER_PROT(double, d, packm_6xk_gen_zen) -PACKM_KER_PROT(double, d, packm_8xk_nn_zen) -PACKM_KER_PROT(double, d, packm_6xk_nn_zen) - - -// -- level-1v -- - -// amaxv (intrinsics) -AMAXV_KER_PROT( float, s, amaxv_zen_int ) -AMAXV_KER_PROT( double, d, amaxv_zen_int ) - -// axpyv (intrinsics) -AXPYV_KER_PROT( float, s, axpyv_zen_int ) -AXPYV_KER_PROT( double, d, axpyv_zen_int ) - - // axpyv (intrinsics unrolled x10) - AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) - AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) - -// dotv (intrinsics) -DOTV_KER_PROT( float, s, dotv_zen_int ) -DOTV_KER_PROT( double, d, dotv_zen_int ) - - // dotv (intrinsics, unrolled x10) - DOTV_KER_PROT( float, s, dotv_zen_int10 ) - DOTV_KER_PROT( double, d, dotv_zen_int10 ) - -// dotxv (intrinsics) -DOTXV_KER_PROT( float, s, dotxv_zen_int ) -DOTXV_KER_PROT( double, d, 
dotxv_zen_int ) - -// scalv (intrinsics) -SCALV_KER_PROT( float, s, scalv_zen_int ) -SCALV_KER_PROT( double, d, scalv_zen_int ) - - // scalv (intrinsics unrolled x10) - SCALV_KER_PROT( float, s, scalv_zen_int10 ) - SCALV_KER_PROT( double, d, scalv_zen_int10 ) - -// swapv (intrinsics) -SWAPV_KER_PROT(float, s, swapv_zen_int8 ) -SWAPV_KER_PROT(double, d, swapv_zen_int8 ) - -// copyv (intrinsics) -COPYV_KER_PROT( float, s, copyv_zen_int ) -COPYV_KER_PROT( double, d, copyv_zen_int ) - -// -SETV_KER_PROT(float, s, setv_zen_int) -SETV_KER_PROT(double, d, setv_zen_int) - -// -- level-1f -- - -// axpyf (intrinsics) -AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) -AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) - -// dotxf (intrinsics) -DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) -DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) - -// -- level-3 sup -------------------------------------------------------------- -// semmsup_rv - -//GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) - -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) - -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) - -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) -GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_5x2 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) - -GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) -GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) -GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) -GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) -GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) -GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) - -// gemmsup_rv (mkernel in m dim) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) -// gemmsup_rv (mkernel in n dim) - -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) - -// gemmsup_rd -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) -GEMMSUP_KER_PROT( float, s, 
gemmsup_rd_zen_asm_6x2m) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) -GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) - -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) - -// gemmsup_rv (mkernel in n dim) - - -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) -GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) -GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) -// end bli_kernels_zen.h -#endif -//#ifdef BLIS_KERNELS_EXCAVATOR -//#include "bli_kernels_excavator.h" -//#endif -//#ifdef BLIS_KERNELS_STEAMROLLER -//#include "bli_kernels_steamroller.h" -//#endif -#ifdef BLIS_KERNELS_PILEDRIVER -#include "bli_kernels_piledriver.h" // 
skipped -#endif -#ifdef BLIS_KERNELS_BULLDOZER -#include "bli_kernels_bulldozer.h" // skipped -#endif - -// -- ARM architectures -- - -#ifdef BLIS_KERNELS_ARMSVE -#include "bli_kernels_armsve.h" // skipped -#endif -#ifdef BLIS_KERNELS_ARMV8A -#include "bli_kernels_armv8a.h" // skipped -#endif -#ifdef BLIS_KERNELS_ARMV7A -#include "bli_kernels_armv7a.h" // skipped -#endif - -// -- IBM Power -- - -#ifdef BLIS_KERNELS_POWER9 -#include "bli_kernels_power9.h" // skipped -#endif -#ifdef BLIS_KERNELS_POWER7 -#include "bli_kernels_power7.h" // skipped -#endif - -// -- IBM BG/Q -- - -#ifdef BLIS_KERNELS_BGQ -#include "bli_kernels_bgq.h" // skipped -#endif - - - -#endif - -// end bli_arch_config.h - -// begin bli_kernel_macro_defs.h - - -#ifndef BLIS_KERNEL_MACRO_DEFS_H -#define BLIS_KERNEL_MACRO_DEFS_H - - -// -- Define default threading parameters -------------------------------------- - -// -- Conventional (large code path) values -- - -#ifndef BLIS_THREAD_RATIO_M -#define BLIS_THREAD_RATIO_M 2 -#endif - -#ifndef BLIS_THREAD_RATIO_N -#define BLIS_THREAD_RATIO_N 1 -#endif - -#ifndef BLIS_THREAD_MAX_IR -#define BLIS_THREAD_MAX_IR 1 -#endif - -#ifndef BLIS_THREAD_MAX_JR -#define BLIS_THREAD_MAX_JR 4 -#endif - -#if 0 -// -- Skinny/small possibly-unpacked (sup code path) values -- - -#ifndef BLIS_THREAD_SUP_RATIO_M -#define BLIS_THREAD_SUP_RATIO_M 1 -#endif - -#ifndef BLIS_THREAD_SUP_RATIO_N -#define BLIS_THREAD_SUP_RATIO_N 2 -#endif - -#ifndef BLIS_THREAD_SUP_MAX_IR -#define BLIS_THREAD_SUP_MAX_IR 1 -#endif - -#ifndef BLIS_THREAD_SUP_MAX_JR -#define BLIS_THREAD_SUP_MAX_JR 8 -#endif -#endif - - -// -- Memory allocation -------------------------------------------------------- - -// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with -// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND -// was explicitly defined. 
-#ifdef BLIS_DISABLE_MEMKIND - #undef BLIS_ENABLE_MEMKIND -#endif -#ifdef BLIS_ENABLE_MEMKIND -#include // skipped -#endif - -// Memory allocation functions. These macros define the three types of -// malloc()-style functions, and their free() counterparts: one for each -// type of memory to be allocated. -// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING -// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() -// and free(): -// -// void* malloc( size_t size ); -// void free( void* p ); -// - -// This allocation function is called to allocate memory for blocks within -// BLIS's internal memory pools. -#ifndef BLIS_MALLOC_POOL - // If use of libmemkind was enabled at configure-time, the default - // memory allocation function for memory pools should be hbw_malloc() - // instead of malloc(). - #ifdef BLIS_ENABLE_MEMKIND - #define BLIS_MALLOC_POOL hbw_malloc - #else - #define BLIS_MALLOC_POOL malloc - #endif -#endif - -#ifndef BLIS_FREE_POOL - // If use of libmemkind was enabled at configure-time, the default - // memory deallocation function for memory pools should be hbw_free() - // instead of free(). - #ifdef BLIS_ENABLE_MEMKIND - #define BLIS_FREE_POOL hbw_free - #else - #define BLIS_FREE_POOL free - #endif -#endif - -// This allocation function is called to allocate memory for internally- -// used objects and structures, such as control tree nodes. -#ifndef BLIS_MALLOC_INTL -#define BLIS_MALLOC_INTL malloc -#endif - -#ifndef BLIS_FREE_INTL -#define BLIS_FREE_INTL free -#endif - -// This allocation function is called to allocate memory for objects -// created by user-level API functions, such as bli_obj_create(). -#ifndef BLIS_MALLOC_USER -#define BLIS_MALLOC_USER malloc -#endif - -#ifndef BLIS_FREE_USER -#define BLIS_FREE_USER free -#endif - -// -- Other system-related definitions ----------------------------------------- - -// Size of a virtual memory page. This is used to align blocks within the -// memory pools. 
-#ifndef BLIS_PAGE_SIZE -#define BLIS_PAGE_SIZE 4096 -#endif - -// The maximum number of named SIMD vector registers available for use. -// When configuring with umbrella configuration families, this should be -// set to the maximum number of registers across all sub-configurations in -// the family. -#ifndef BLIS_SIMD_NUM_REGISTERS -#define BLIS_SIMD_NUM_REGISTERS 32 -#endif - -// The maximum size (in bytes) of each SIMD vector. -// When configuring with umbrella configuration families, this should be -// set to the maximum SIMD size across all sub-configurations in the family. -#ifndef BLIS_SIMD_SIZE -#define BLIS_SIMD_SIZE 64 -#endif - -// Alignment size (in bytes) needed by the instruction set for aligned -// SIMD/vector instructions. -#ifndef BLIS_SIMD_ALIGN_SIZE -#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_SIZE -#endif - -// The maximum size in bytes of local stack buffers within macro-kernel -// functions. These buffers are usually used to store a temporary copy -// of a single microtile. The reason we multiply by 2 is to handle induced -// methods, where we use real domain register blocksizes in units of -// complex elements. Specifically, the macro-kernels will need this larger -// micro-tile footprint, even though the virtual micro-kernels will only -// ever be writing to half (real or imaginary part) at a time. -#ifndef BLIS_STACK_BUF_MAX_SIZE -#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_NUM_REGISTERS * \ - BLIS_SIMD_SIZE * 2 ) -#endif - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#ifndef BLIS_STACK_BUF_ALIGN_SIZE -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE -#endif - -// Alignment size used when allocating memory via BLIS_MALLOC_USER. -// To disable heap alignment, set this to 1. -#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE -#endif - -// Alignment size used when sizing leading dimensions of memory allocated -// via BLIS_MALLOC_USER. 
-#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE -#endif - -// Alignment sizes used when allocating blocks to the internal memory -// pool, via BLIS_MALLOC_POOL. -#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A -#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE -#endif - -#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B -#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE -#endif - -#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C -#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE -#endif - -#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN -#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE -#endif - -// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. -#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A -#define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 -#endif - -#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B -#define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 -#endif - -#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C -#define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 -#endif - -#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN -#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 -#endif - - -#ifdef AOCL_BLIS_ZEN -#define TRSM_BLKSZ_FUNC bli_cntx_get_trsm_blksz -#else -#define TRSM_BLKSZ_FUNC bli_cntx_get_blksz -#endif - -#endif - -// end bli_kernel_macro_defs.h - - -// -- Base operation prototypes -- - -// begin bli_init.h - - -BLIS_EXPORT_BLIS void bli_init( void ); -BLIS_EXPORT_BLIS void bli_finalize( void ); - -void bli_init_auto( void ); -void bli_finalize_auto( void ); - -void bli_init_apis( void ); -void bli_finalize_apis( void ); - -void bli_init_once( void ); -void bli_finalize_once( void ); - -// end bli_init.h -// begin bli_const.h - - -void bli_const_init( void ); -void bli_const_finalize( void ); - -// end bli_const.h -// begin bli_obj.h - - -// begin bli_obj_check.h - - -void bli_obj_create_check( num_t dt, - dim_t m, - dim_t n, - inc_t rs, - inc_t cs, - obj_t* obj ); - -void bli_obj_create_without_buffer_check( num_t dt, - dim_t m, - dim_t n, - obj_t* obj ); - -void bli_obj_alloc_buffer_check( inc_t rs, - inc_t cs, - inc_t 
is, - obj_t* obj ); - -void bli_obj_attach_buffer_check( void* p, - inc_t rs, - inc_t cs, - inc_t is, - obj_t* obj ); - -void bli_obj_create_scalar_check( num_t dt, - obj_t* obj ); - -void bli_obj_free_check( obj_t* obj ); - -void bli_obj_create_const_check( double value, obj_t* obj ); - -void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); - -void bli_dt_size_check( num_t dt ); - -void bli_dt_string_check( num_t dt ); - -void bli_dt_union_check( num_t dt1, num_t dt2 ); - -void bli_obj_print_check( char* label, obj_t* obj ); - -// end bli_obj_check.h - -BLIS_EXPORT_BLIS void bli_obj_create - ( - num_t dt, - dim_t m, - dim_t n, - inc_t rs, - inc_t cs, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer - ( - num_t dt, - dim_t m, - dim_t n, - void* p, - inc_t rs, - inc_t cs, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_create_without_buffer - ( - num_t dt, - dim_t m, - dim_t n, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_alloc_buffer - ( - inc_t rs, - inc_t cs, - inc_t is, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_attach_buffer - ( - void* p, - inc_t rs, - inc_t cs, - inc_t is, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_create_1x1 - ( - num_t dt, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer - ( - num_t dt, - void* p, - obj_t* obj - ); - -BLIS_EXPORT_BLIS void bli_obj_create_conf_to - ( - obj_t* s, - obj_t* d - ); - -BLIS_EXPORT_BLIS void bli_obj_free - ( - obj_t* obj - ); - -void bli_adjust_strides - ( - dim_t m, - dim_t n, - siz_t elem_size, - inc_t* rs, - inc_t* cs, - inc_t* is - ); - -BLIS_EXPORT_BLIS siz_t bli_dt_size - ( - num_t dt - ); - -BLIS_EXPORT_BLIS char* bli_dt_string - ( - num_t dt - ); - -BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult - ( - dim_t dim, - dim_t dim_mult - ); - -BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size - ( - dim_t dim, - siz_t elem_size, - siz_t align_size - ); - -BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size - ( - void* p, - size_t 
align_size - ); - -BLIS_EXPORT_BLIS void bli_obj_print - ( - char* label, - obj_t* obj - ); - -// end bli_obj.h -// begin bli_obj_scalar.h - - -BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached - ( - num_t dt, - obj_t* beta - ); - -BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of - ( - num_t dt, - conj_t conj, - obj_t* alpha, - obj_t* beta - ); - -BLIS_EXPORT_BLIS void bli_obj_scalar_detach - ( - obj_t* a, - obj_t* alpha - ); - -BLIS_EXPORT_BLIS void bli_obj_scalar_attach - ( - conj_t conj, - obj_t* alpha, - obj_t* a - ); - -BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to - ( - num_t dt, - obj_t* a - ); - -BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar - ( - obj_t* alpha, - obj_t* a - ); - -BLIS_EXPORT_BLIS void bli_obj_scalar_reset - ( - obj_t* a - ); - -BLIS_EXPORT_BLIS bool_t bli_obj_scalar_has_nonzero_imag - ( - obj_t* a - ); - -BLIS_EXPORT_BLIS bool_t bli_obj_scalar_equals - ( - obj_t* a, - obj_t* beta - ); - -// end bli_obj_scalar.h -// begin bli_blksz.h - - -// blksz_t query - -static dim_t bli_blksz_get_def - ( - num_t dt, - blksz_t* b - ) -{ - return b->v[ dt ]; -} - -static dim_t bli_blksz_get_max - ( - num_t dt, - blksz_t* b - ) -{ - return b->e[ dt ]; -} - - -// blksz_t modification - -static void bli_blksz_set_def - ( - dim_t val, - num_t dt, - blksz_t* b - ) -{ - b->v[ dt ] = val; -} - -static void bli_blksz_set_max - ( - dim_t val, - num_t dt, - blksz_t* b - ) -{ - b->e[ dt ] = val; -} - -static void bli_blksz_copy - ( - blksz_t* b_src, - blksz_t* b_dst - ) -{ - *b_dst = *b_src; -} - -static void bli_blksz_copy_if_pos - ( - blksz_t* b_src, - blksz_t* b_dst - ) -{ - // Copy the blocksize values over to b_dst one-by-one so that - // we can skip the ones that are non-positive. 
- - const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); - const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); - const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); - const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); - - const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); - const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); - const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); - const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); - - if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); - if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); - if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); - if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); - - if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); - if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); - if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); - if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); -} - -static void bli_blksz_copy_def_dt - ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst - ) -{ - const dim_t val = bli_blksz_get_def( dt_src, b_src ); - - bli_blksz_set_def( val, dt_dst, b_dst ); -} - -static void bli_blksz_copy_max_dt - ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst - ) -{ - const dim_t val = bli_blksz_get_max( dt_src, b_src ); - - bli_blksz_set_max( val, dt_dst, b_dst ); -} - -static void bli_blksz_copy_dt - ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst - ) -{ - bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); - bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); -} - -static void bli_blksz_scale_def - ( - dim_t num, - dim_t den, - num_t dt, - blksz_t* b - ) -{ - const dim_t val = bli_blksz_get_def( dt, b ); - - bli_blksz_set_def( ( val * num ) / den, dt, b ); -} - -static void bli_blksz_scale_max - ( - dim_t num, - dim_t den, - num_t dt, - blksz_t* b - ) -{ - const dim_t val = 
bli_blksz_get_max( dt, b ); - - bli_blksz_set_max( ( val * num ) / den, dt, b ); -} - -static void bli_blksz_scale_def_max - ( - dim_t num, - dim_t den, - num_t dt, - blksz_t* b - ) -{ - bli_blksz_scale_def( num, den, dt, b ); - bli_blksz_scale_max( num, den, dt, b ); -} - -// ----------------------------------------------------------------------------- - -BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed - ( - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z - ); - -BLIS_EXPORT_BLIS blksz_t* bli_blksz_create - ( - dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, - dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z - ); - -BLIS_EXPORT_BLIS void bli_blksz_init_ed - ( - blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z - ); - -BLIS_EXPORT_BLIS void bli_blksz_init - ( - blksz_t* b, - dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, - dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z - ); - -BLIS_EXPORT_BLIS void bli_blksz_init_easy - ( - blksz_t* b, - dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z - ); - -BLIS_EXPORT_BLIS void bli_blksz_free - ( - blksz_t* b - ); - -// ----------------------------------------------------------------------------- - -#if 0 -BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to - ( - num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz - ); -#endif - -void bli_blksz_reduce_def_to - ( - num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz - ); - -void bli_blksz_reduce_max_to - ( - num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz - ); -// ----------------------------------------------------------------------------- - -dim_t bli_determine_blocksize - ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ); - -dim_t bli_determine_blocksize_f - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ); - -dim_t bli_determine_blocksize_b - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, 
- cntx_t* cntx - ); - -#ifdef AOCL_BLIS_ZEN - -dim_t bli_determine_blocksize_trsm - ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ); - -dim_t bli_determine_blocksize_trsm_f - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ); - -dim_t bli_determine_blocksize_trsm_b - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ); - -#endif - -dim_t bli_determine_blocksize_f_sub - ( - dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max - ); - -dim_t bli_determine_blocksize_b_sub - ( - dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max - ); - -// end bli_blksz.h -// begin bli_func.h - - -// ----------------------------------------------------------------------------- - -// func_t query - -static void_fp bli_func_get_dt - ( - num_t dt, - func_t* func - ) -{ - return func->ptr[ dt ]; -} - -// func_t modification - -static void bli_func_set_dt - ( - void_fp fp, - num_t dt, - func_t* func - ) -{ - func->ptr[ dt ] = fp; -} - -static void bli_func_copy_dt - ( - num_t dt_src, func_t* func_src, - num_t dt_dst, func_t* func_dst - ) -{ - void_fp fp = bli_func_get_dt( dt_src, func_src ); - - bli_func_set_dt( fp, dt_dst, func_dst ); -} - -// ----------------------------------------------------------------------------- - -func_t* bli_func_create - ( - void_fp ptr_s, - void_fp ptr_d, - void_fp ptr_c, - void_fp ptr_z - ); - -void bli_func_init - ( - func_t* f, - void_fp ptr_s, - void_fp ptr_d, - void_fp ptr_c, - void_fp ptr_z - ); - -void bli_func_init_null - ( - func_t* f - ); - -void bli_func_free( func_t* f ); - -// ----------------------------------------------------------------------------- - -bool_t bli_func_is_null_dt( num_t dt, - func_t* f ); -bool_t bli_func_is_null( func_t* f ); - -// end bli_func.h -// begin bli_mbool.h - - -// ----------------------------------------------------------------------------- - -// mbool_t query - -static bool_t bli_mbool_get_dt( num_t dt, mbool_t* mb ) -{ - 
return mb->v[ dt ]; -} - -// mbool_t modification - -static void bli_mbool_set_dt( bool_t val, num_t dt, mbool_t* mb ) -{ - mb->v[ dt ] = val; -} - -// ----------------------------------------------------------------------------- - -mbool_t* bli_mbool_create - ( - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z - ); - -void bli_mbool_init - ( - mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z - ); - -void bli_mbool_free( mbool_t* b ); - -// end bli_mbool.h -// begin bli_cntx.h - - -#ifndef BLIS_CNTX_H -#define BLIS_CNTX_H - - -// Context object type (defined in bli_type_defs.h) - - - -// ----------------------------------------------------------------------------- - -// -// -- cntx_t query (fields only) ----------------------------------------------- -// - -static blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) -{ - return cntx->blkszs; -} -static bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) -{ - return cntx->bmults; -} -static blksz_t* bli_cntx_trsm_blkszs_buf( cntx_t* cntx ) -{ - return cntx->trsm_blkszs; -} -static func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) -{ - return cntx->l3_vir_ukrs; -} -static func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) -{ - return cntx->l3_nat_ukrs; -} -static mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) -{ - return cntx->l3_nat_ukrs_prefs; -} -static blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_thresh; -} -static void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_handlers; -} -static blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_blkszs; -} -static func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers; -} -static mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers_prefs; -} -static func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) -{ - return cntx->l1f_kers; -} -static func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) -{ - return cntx->l1v_kers; -} -static 
func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) -{ - return cntx->packm_kers; -} -static func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) -{ - return cntx->unpackm_kers; -} -static ind_t bli_cntx_method( cntx_t* cntx ) -{ - return cntx->method; -} -static pack_t bli_cntx_schema_a_block( cntx_t* cntx ) -{ - return cntx->schema_a_block; -} -static pack_t bli_cntx_schema_b_panel( cntx_t* cntx ) -{ - return cntx->schema_b_panel; -} -static pack_t bli_cntx_schema_c_panel( cntx_t* cntx ) -{ - return cntx->schema_c_panel; -} - -// ----------------------------------------------------------------------------- - -// -// -- cntx_t modification (fields only) ---------------------------------------- -// - -static void bli_cntx_set_method( ind_t method, cntx_t* cntx ) -{ - cntx->method = method; -} -static void bli_cntx_set_schema_a_block( pack_t schema, cntx_t* cntx ) -{ - cntx->schema_a_block = schema; -} -static void bli_cntx_set_schema_b_panel( pack_t schema, cntx_t* cntx ) -{ - cntx->schema_b_panel = schema; -} -static void bli_cntx_set_schema_c_panel( pack_t schema, cntx_t* cntx ) -{ - cntx->schema_c_panel = schema; -} -static void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cntx ) -{ - bli_cntx_set_schema_a_block( sa, cntx ); - bli_cntx_set_schema_b_panel( sb, cntx ); -} - -// ----------------------------------------------------------------------------- - -// -// -- cntx_t query (complex) --------------------------------------------------- -// - -static blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - // Return the address of the blksz_t identified by bs_id. - return blksz; -} - -static dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); - - // Return the main (default) blocksize value for the datatype given. 
- return bs_dt; -} - -static dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); - - // Return the auxiliary (maximum) blocksize value for the datatype given. - return bs_dt; -} - -static bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) -{ - bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); - bszid_t bm_id = bmults[ bs_id ]; - - return bm_id; -} - -static blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) -{ - bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); - blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); - - return bmult; -} - -static dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); - dim_t bm_dt = bli_blksz_get_def( dt, bmult ); - - return bm_dt; -} - -// ----------------------------------------------------------------------------- - -static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; - - return func; -} - -static void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -static func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; - - return func; -} - -static void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -static mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) -{ - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); - mbool_t* mbool = 
&mbools[ ukr_id ]; - - return mbool; -} - -static bool_t bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); - - return bli_mbool_get_dt( dt, mbool ); -} - -// ----------------------------------------------------------------------------- - -static blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - blksz_t* thresh = &threshs[ thresh_id ]; - - // Return the address of the blksz_t identified by thresh_id. - return thresh; -} - -static dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); - dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); - - // Return the main (default) threshold value for the datatype given. - return thresh_dt; -} - -static bool_t bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) -{ - if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; - if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; - if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; - - return FALSE; -} - -// ----------------------------------------------------------------------------- - -static void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) -{ - void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); - void* func = funcs[ op ]; - - return func; -} - -// ----------------------------------------------------------------------------- - -static blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - // Return the address of the blksz_t identified by bs_id. 
- return blksz; -} - -static blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_trsm_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - // Return the address of the blksz_t identified by bs_id. - return blksz; - -} - -static dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); - - // Return the main (default) blocksize value for the datatype given. - return bs_dt; -} - -static dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); - - // Return the auxiliary (maximum) blocksize value for the datatype given. - return bs_dt; -} - -// ----------------------------------------------------------------------------- - -static func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); - func_t* func = &funcs[ stor_id ]; - - return func; -} - -static void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -static mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - mbool_t* mbool = &mbools[ stor_id ]; - - return mbool; -} - -static bool_t bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); - - return bli_mbool_get_dt( dt, mbool ); -} - -// ----------------------------------------------------------------------------- - -static func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = 
bli_cntx_l1f_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -static void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -static func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -static void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -static func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested packm func_t if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) - { - func_t* funcs = bli_cntx_packm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -static void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the packm func_t (and then extract the - // datatype-specific function pointer) if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) - { - func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); - } - - return fp; -} - -static func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested unpackm func_t if the unpackm kernel being - // requested is one that is explicitly supported. 
- if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -static void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the unpackm func_t (and then extract the - // datatype-specific function pointer) if the unpackm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); - } - - return fp; -} - -// ----------------------------------------------------------------------------- - -static bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool_t prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool_t ) - ( prefs == TRUE ); -} - -static bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool_t prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool_t ) - ( prefs == FALSE ); -} - -static bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). 
- const num_t dt = bli_obj_comp_dt( obj ); - const bool_t ukr_prefers_rows - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool_t ukr_prefers_cols - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool_t r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -static bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - return ( bool_t ) - !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - -static bool_t bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); -} - -static bool_t bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); -} - -static bool_t bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). 
- const num_t dt = bli_obj_comp_dt( obj ); - const bool_t ukr_prefers_rows - = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool_t ukr_prefers_cols - = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool_t r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -static bool_t bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - return ( bool_t ) - !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - -static bool_t bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool_t prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool_t ) - ( prefs == TRUE ); -} - -static bool_t bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool_t prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool_t ) - ( prefs == FALSE ); -} - -#if 0 -// NOTE: These static functions aren't needed yet. 
- -static bool_t bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) -{ - const num_t dt = bli_obj_dt( obj ); - const bool_t ukr_prefers_rows - = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); - const bool_t ukr_prefers_cols - = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); - bool_t r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -static bool_t bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) -{ - return ( bool_t ) - !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); -} -#endif - -// ----------------------------------------------------------------------------- - -// -// -- cntx_t modification (complex) -------------------------------------------- -// - -// NOTE: The framework does not use any of the following functions. We provide -// them in order to facilitate creating/modifying custom contexts. 
- -static void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - bszid_t* bmults = bli_cntx_bmults_buf( cntx ); - - blkszs[ bs_id ] = *blksz; - bmults[ bs_id ] = mult_id; -} - -static void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - bli_blksz_set_def( bs, dt, blksz ); -} - -static void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - bli_blksz_set_max( bs, dt, blksz ); -} - -static void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); - - funcs[ ukr_id ] = *func; -} - -static void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - - funcs[ ukr_id ] = *func; -} - -static void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) -{ - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); - - mbools[ ukr_id ] = *prefs; -} - -static void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); - - funcs[ ker_id ] = *func; -} - -static void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - - funcs[ ker_id ] = *func; -} - -static void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); - - funcs[ ker_id ] = *func; -} - -static void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); -} - -static void 
bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_get_unpackm_kers( ker_id, cntx ); - - funcs[ ker_id ] = *func; -} - -static void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); -} - -// ----------------------------------------------------------------------------- - -// Function prototypes - -BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); - -BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); - -BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); - - -#endif - -// end bli_cntx.h -// begin bli_rntm.h - - -#ifndef BLIS_RNTM_H -#define BLIS_RNTM_H - - -// Runtime object type (defined in bli_type_defs.h) - - - -// -// -- rntm_t query (public API) ------------------------------------------------ -// - -static bool_t bli_rntm_auto_factor( rntm_t* rntm ) -{ - return rntm->auto_factor; -} - -static dim_t bli_rntm_num_threads( rntm_t* rntm ) -{ - return rntm->num_threads; -} - -static dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) -{ - return rntm->thrloop[ bszid ]; -} - -static dim_t bli_rntm_jc_ways( rntm_t* rntm ) -{ - return bli_rntm_ways_for( BLIS_NC, rntm ); -} -static dim_t bli_rntm_pc_ways( rntm_t* rntm ) -{ - return bli_rntm_ways_for( BLIS_KC, rntm ); -} -static dim_t bli_rntm_ic_ways( rntm_t* rntm ) -{ - return bli_rntm_ways_for( BLIS_MC, rntm ); -} -static dim_t bli_rntm_jr_ways( rntm_t* rntm ) -{ - return bli_rntm_ways_for( BLIS_NR, rntm ); -} -static dim_t bli_rntm_ir_ways( rntm_t* rntm ) -{ - return bli_rntm_ways_for( BLIS_MR, rntm ); -} -static dim_t bli_rntm_pr_ways( rntm_t* rntm ) -{ - return bli_rntm_ways_for( BLIS_KR, rntm ); -} - -static bool_t bli_rntm_pack_a( rntm_t* rntm ) -{ - return rntm->pack_a; -} -static bool_t bli_rntm_pack_b( rntm_t* rntm ) -{ - return rntm->pack_b; -} - -static bool_t bli_rntm_l3_sup( rntm_t* rntm ) -{ - return rntm->l3_sup; -} - -// -// -- rntm_t query (internal use only) ----------------------------------------- -// - -static pool_t* bli_rntm_sba_pool( rntm_t* rntm ) -{ - return rntm->sba_pool; -} - -static membrk_t* bli_rntm_membrk( rntm_t* rntm ) -{ - return rntm->membrk; -} - -#if 0 -static dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) -{ - const bool_t nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); - const bool_t jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); - const bool_t pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); - const bool_t ic = 
bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); - const bool_t jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); - const bool_t ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); - const bool_t pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); - - if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; - else return FALSE; -} -#endif - -// -// -- rntm_t modification (internal use only) ---------------------------------- -// - -static void bli_rntm_set_auto_factor_only( bool_t auto_factor, rntm_t* rntm ) -{ - rntm->auto_factor = auto_factor; -} - -static void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) -{ - rntm->num_threads = nt; -} - -static void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) -{ - rntm->thrloop[ loop ] = n_ways; -} - -static void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) -{ - bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); -} -static void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) -{ - bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); -} -static void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) -{ - bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); -} -static void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) -{ - bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); -} -static void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) -{ - bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); -} -static void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) -{ - bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); -} - -static void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) -{ - // Record the number of ways of parallelism per loop. 
- bli_rntm_set_jc_ways_only( jc, rntm ); - bli_rntm_set_pc_ways_only( pc, rntm ); - bli_rntm_set_ic_ways_only( ic, rntm ); - bli_rntm_set_jr_ways_only( jr, rntm ); - bli_rntm_set_ir_ways_only( ir, rntm ); - bli_rntm_set_pr_ways_only( 1, rntm ); -} - -static void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) -{ - rntm->sba_pool = sba_pool; -} - -static void bli_rntm_set_membrk( membrk_t* membrk, rntm_t* rntm ) -{ - rntm->membrk = membrk; -} - -static void bli_rntm_clear_num_threads_only( rntm_t* rntm ) -{ - bli_rntm_set_num_threads_only( -1, rntm ); -} -static void bli_rntm_clear_ways_only( rntm_t* rntm ) -{ - bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); -} -static void bli_rntm_clear_sba_pool( rntm_t* rntm ) -{ - bli_rntm_set_sba_pool( NULL, rntm ); -} -static void bli_rntm_clear_membrk( rntm_t* rntm ) -{ - bli_rntm_set_membrk( NULL, rntm ); -} - -// -// -- rntm_t modification (public API) ----------------------------------------- -// - -static void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) -{ - // Record the total number of threads to use. - bli_rntm_set_num_threads_only( nt, rntm ); - - // Set the individual ways of parallelism to default states. - bli_rntm_clear_ways_only( rntm ); -} - -static void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) -{ - // Record the number of ways of parallelism per loop. - bli_rntm_set_jc_ways_only( jc, rntm ); - bli_rntm_set_pc_ways_only( pc, rntm ); - bli_rntm_set_ic_ways_only( ic, rntm ); - bli_rntm_set_jr_ways_only( jr, rntm ); - bli_rntm_set_ir_ways_only( ir, rntm ); - bli_rntm_set_pr_ways_only( 1, rntm ); - - // Set the num_threads field to a default state. - bli_rntm_clear_num_threads_only( rntm ); -} - -static void bli_rntm_set_pack_a( bool_t pack_a, rntm_t* rntm ) -{ - // Set the bool_t indicating whether matrix A should be packed. 
- rntm->pack_a = pack_a; -} -static void bli_rntm_set_pack_b( bool_t pack_b, rntm_t* rntm ) -{ - // Set the bool_t indicating whether matrix B should be packed. - rntm->pack_b = pack_b; -} - -static void bli_rntm_set_l3_sup( bool_t l3_sup, rntm_t* rntm ) -{ - // Set the bool_t indicating whether level-3 sup handling is enabled. - rntm->l3_sup = l3_sup; -} -static void bli_rntm_enable_l3_sup( rntm_t* rntm ) -{ - bli_rntm_set_l3_sup( TRUE, rntm ); -} -static void bli_rntm_disable_l3_sup( rntm_t* rntm ) -{ - bli_rntm_set_l3_sup( FALSE, rntm ); -} - -// -// -- rntm_t modification (internal use only) ---------------------------------- -// - -static void bli_rntm_clear_pack_a( rntm_t* rntm ) -{ - bli_rntm_set_pack_a( FALSE, rntm ); -} -static void bli_rntm_clear_pack_b( rntm_t* rntm ) -{ - bli_rntm_set_pack_b( FALSE, rntm ); -} -static void bli_rntm_clear_l3_sup( rntm_t* rntm ) -{ - bli_rntm_set_l3_sup( TRUE, rntm ); -} - -// -// -- rntm_t initialization ---------------------------------------------------- -// - -// NOTE: Initialization is not necessary as long the user calls at least ONE -// of the public "set" accessors, each of which guarantees that the rntm_t -// will be in a good state upon return. 
- -#define BLIS_RNTM_INITIALIZER \ - { \ - .auto_factor = TRUE, \ - .num_threads = -1, \ - .thrloop = { -1, -1, -1, -1, -1, -1 }, \ - .pack_a = FALSE, \ - .pack_b = FALSE, \ - .l3_sup = TRUE, \ - .sba_pool = NULL, \ - .membrk = NULL, \ - } \ - -static void bli_rntm_init( rntm_t* rntm ) -{ - bli_rntm_set_auto_factor_only( TRUE, rntm ); - - bli_rntm_clear_num_threads_only( rntm ); - bli_rntm_clear_ways_only( rntm ); - bli_rntm_clear_pack_a( rntm ); - bli_rntm_clear_pack_b( rntm ); - bli_rntm_clear_l3_sup( rntm ); - - bli_rntm_clear_sba_pool( rntm ); - bli_rntm_clear_membrk( rntm ); -} - -// -- rntm_t total thread calculation ------------------------------------------ - -static dim_t bli_rntm_calc_num_threads - ( - rntm_t* restrict rntm - ) -{ - dim_t n_threads; - - n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); - n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); - n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); - n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); - n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); - - return n_threads; -} - -// ----------------------------------------------------------------------------- - -// Function prototypes - -BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); - -BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op - ( - opid_t l3_op, - side_t side, - dim_t m, - dim_t n, - dim_t k, - rntm_t* rntm - ); - -void bli_rntm_set_ways_from_rntm - ( - dim_t m, - dim_t n, - dim_t k, - rntm_t* rntm - ); - -void bli_rntm_set_ways_from_rntm_sup - ( - dim_t m, - dim_t n, - dim_t k, - rntm_t* rntm - ); - -void bli_rntm_print - ( - rntm_t* rntm - ); - -dim_t bli_rntm_calc_num_threads_in - ( - bszid_t* restrict bszid_cur, - rntm_t* restrict rntm - ); - -#endif - -// end bli_rntm.h -// begin bli_gks.h - - -#ifndef BLIS_GKS_H -#define BLIS_GKS_H - -void bli_gks_init( void ); -void bli_gks_finalize( void ); - -void bli_gks_init_index( void ); - -cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); -cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t 
ind ); -void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); - -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_cntx( void ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); - -cntx_t* bli_gks_query_cntx_noinit( void ); - -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); - -BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); - -bool_t bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); - -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); - -//char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); - -#endif - -// end bli_gks.h -// begin bli_ind.h - - -#ifndef BLIS_IND_H -#define BLIS_IND_H - -// level-3 induced method management -// begin bli_l3_ind.h - - -#ifndef BLIS_L3_IND_H -#define BLIS_L3_IND_H - -// ----------------------------------------------------------------------------- - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void_fp PASTEMAC(opname,ind_get_avail)( num_t dt ); - - -GENPROT( gemm ) -GENPROT( gemmt ) -GENPROT( hemm ) -GENPROT( herk ) -GENPROT( her2k ) -GENPROT( symm ) -GENPROT( syrk ) -GENPROT( syr2k ) -GENPROT( trmm3 ) -GENPROT( trmm ) -GENPROT( trsm ) - -// ----------------------------------------------------------------------------- - -//bool_t bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); - -ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); - -void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool_t status ); - -void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); -void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool_t status ); - -void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool_t status ); -bool_t bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); - -void_fp bli_l3_ind_oper_get_func( opid_t oper, ind_t 
method ); - - -#endif - -// end bli_l3_ind.h - -// level-3 object APIs -// begin bli_l3_ind_oapi.h - - - -// -// Generate object-based prototypes for induced methods that work for -// trmm and trsm (ie: two-operand operations). -// -#undef GENPROT -#define GENPROT( imeth ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(symm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syrk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syr2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(trmm3,imeth)( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(trmm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(trsm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ - -GENPROT( nat ) -GENPROT( ind ) -GENPROT( 3m1 ) -GENPROT( 4m1 ) -GENPROT( 1m ) - - -// -// Generate object-based prototypes for induced methods that do NOT work -// for trmm and trsm (ie: two-operand operations). 
-// -#undef GENPROT_NO2OP -#define GENPROT_NO2OP( imeth ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(symm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syrk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(syr2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \ -BLIS_EXPORT_BLIS void PASTEMAC(trmm3,imeth)( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); - -GENPROT_NO2OP( 3mh ) -GENPROT_NO2OP( 4mh ) -GENPROT_NO2OP( 4mb ) - - -// -// Generate object-based prototypes for 1m methods that specify an algorithm -// (e.g., block-panel or panel-block). 
-// - - - -//GENPROT( 1m, bp ) -//GENPROT( 1m, pb ) - -// end bli_l3_ind_oapi.h - -// level-3 typed APIs -// begin bli_l3_ind_tapi.h - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( gemm3mh ) -INSERT_GENTPROT_BASIC0( gemm3m1 ) -INSERT_GENTPROT_BASIC0( gemm4mh ) -INSERT_GENTPROT_BASIC0( gemm4mb ) -INSERT_GENTPROT_BASIC0( gemm4m1 ) -INSERT_GENTPROT_BASIC0( gemm1m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( hemm3mh ) -INSERT_GENTPROT_BASIC0( hemm3m1 ) -INSERT_GENTPROT_BASIC0( hemm4mh ) -INSERT_GENTPROT_BASIC0( hemm4m1 ) -INSERT_GENTPROT_BASIC0( hemm1m ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntmx \ - ); - -INSERT_GENTPROTR_BASIC0( her2k3mh ) -INSERT_GENTPROTR_BASIC0( her2k3m1 ) -INSERT_GENTPROTR_BASIC0( her2k4mh ) -INSERT_GENTPROTR_BASIC0( her2k4m1 ) -INSERT_GENTPROTR_BASIC0( her2k1m ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t 
transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntmx \ - ); - -INSERT_GENTPROTR_BASIC0( herk3mh ) -INSERT_GENTPROTR_BASIC0( herk3m1 ) -INSERT_GENTPROTR_BASIC0( herk4mh ) -INSERT_GENTPROTR_BASIC0( herk4m1 ) -INSERT_GENTPROTR_BASIC0( herk1m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( symm3mh ) -INSERT_GENTPROT_BASIC0( symm3m1 ) -INSERT_GENTPROT_BASIC0( symm4mh ) -INSERT_GENTPROT_BASIC0( symm4m1 ) -INSERT_GENTPROT_BASIC0( symm1m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( syr2k3mh ) -INSERT_GENTPROT_BASIC0( syr2k3m1 ) -INSERT_GENTPROT_BASIC0( syr2k4mh ) -INSERT_GENTPROT_BASIC0( syr2k4m1 ) -INSERT_GENTPROT_BASIC0( syr2k1m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( syrk3mh ) -INSERT_GENTPROT_BASIC0( syrk3m1 ) -INSERT_GENTPROT_BASIC0( syrk4mh ) -INSERT_GENTPROT_BASIC0( syrk4m1 ) -INSERT_GENTPROT_BASIC0( syrk1m ) - - -#undef GENTPROT 
-#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( trmm33mh ) -INSERT_GENTPROT_BASIC0( trmm33m1 ) -INSERT_GENTPROT_BASIC0( trmm34mh ) -INSERT_GENTPROT_BASIC0( trmm34m1 ) -INSERT_GENTPROT_BASIC0( trmm31m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( trmm3m1 ) -INSERT_GENTPROT_BASIC0( trmm4m1 ) -INSERT_GENTPROT_BASIC0( trmm1m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( trsm3m1 ) -INSERT_GENTPROT_BASIC0( trsm4m1 ) -INSERT_GENTPROT_BASIC0( trsm1m ) - -// end bli_l3_ind_tapi.h - -// level-3 cntx initialization -// begin bli_cntx_ind_stage.h - - -void bli_cntx_ind_stage( ind_t method, dim_t stage, cntx_t* cntx ); - -void bli_cntx_3mh_stage( dim_t stage, cntx_t* cntx ); -void bli_cntx_3m1_stage( dim_t stage, cntx_t* cntx ); -void bli_cntx_4mh_stage( dim_t stage, cntx_t* cntx ); -void bli_cntx_4mb_stage( dim_t stage, cntx_t* cntx ); -void bli_cntx_4m1_stage( dim_t stage, cntx_t* cntx ); -void bli_cntx_1m_stage( dim_t stage, cntx_t* cntx ); -void bli_cntx_nat_stage( dim_t stage, cntx_t* cntx ); - 
-// end bli_cntx_ind_stage.h - - -void bli_ind_init( void ); -void bli_ind_finalize( void ); - -BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); -BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); -BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); - -BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); - -BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); - -BLIS_EXPORT_BLIS bool_t bli_ind_oper_is_impl( opid_t oper, ind_t method ); -//bool_t bli_ind_oper_has_avail( opid_t oper, num_t dt ); -BLIS_EXPORT_BLIS void_fp bli_ind_oper_get_avail( opid_t oper, num_t dt ); -BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); -BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); - -char* bli_ind_get_impl_string( ind_t method ); -num_t bli_ind_map_cdt_to_index( num_t dt ); - - -#endif - -// end bli_ind.h -// begin bli_membrk.h - - -#ifndef BLIS_MEMBRK_H -#define BLIS_MEMBRK_H - -// membrk init - -static void bli_membrk_init_mutex( membrk_t* membrk ) -{ - bli_pthread_mutex_init( &(membrk->mutex), NULL ); -} - -static void bli_membrk_finalize_mutex( membrk_t* membrk ) -{ - bli_pthread_mutex_destroy( &(membrk->mutex) ); -} - -// membrk query - -static pool_t* bli_membrk_pool( dim_t pool_index, membrk_t* membrk ) -{ - return &(membrk->pools[ pool_index ]); -} - -static siz_t bli_membrk_align_size( membrk_t* membrk ) -{ - return membrk->align_size; -} - -static malloc_ft bli_membrk_malloc_fp( membrk_t* membrk ) -{ - return membrk->malloc_fp; -} - -static free_ft bli_membrk_free_fp( membrk_t* membrk ) -{ - return membrk->free_fp; -} - -// membrk modification - -static void bli_membrk_set_align_size( siz_t align_size, membrk_t* membrk ) -{ - membrk->align_size = align_size; -} - -static void bli_membrk_set_malloc_fp( malloc_ft malloc_fp, membrk_t* 
membrk ) -{ - membrk->malloc_fp = malloc_fp; -} - -static void bli_membrk_set_free_fp( free_ft free_fp, membrk_t* membrk ) -{ - membrk->free_fp = free_fp; -} - -// membrk action - -static void bli_membrk_lock( membrk_t* membrk ) -{ - bli_pthread_mutex_lock( &(membrk->mutex) ); -} - -static void bli_membrk_unlock( membrk_t* membrk ) -{ - bli_pthread_mutex_unlock( &(membrk->mutex) ); -} - -// ----------------------------------------------------------------------------- - -membrk_t* bli_membrk_query( void ); - -void bli_membrk_init - ( - cntx_t* cntx - ); -void bli_membrk_finalize - ( - void - ); - -void bli_membrk_acquire_m - ( - rntm_t* rntm, - siz_t req_size, - packbuf_t buf_type, - mem_t* mem - ); - -void bli_membrk_release - ( - rntm_t* rntm, - mem_t* mem - ); - -void bli_membrk_rntm_set_membrk - ( - rntm_t* rntm - ); - -siz_t bli_membrk_pool_size - ( - membrk_t* membrk, - packbuf_t buf_type - ); - -// ---------------------------------------------------------------------------- - -void bli_membrk_init_pools - ( - cntx_t* cntx, - membrk_t* membrk - ); -void bli_membrk_finalize_pools - ( - membrk_t* membrk - ); - -void bli_membrk_compute_pool_block_sizes - ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx - ); -void bli_membrk_compute_pool_block_sizes_dt - ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx - ); - -#endif - -// end bli_membrk.h -// begin bli_pool.h - - -#ifndef BLIS_POOL_H -#define BLIS_POOL_H - -// -- Pool block type -- - - - -// -- Pool type -- - - - - -// Pool block query - -static void* bli_pblk_buf( pblk_t* pblk ) -{ - return pblk->buf; -} - -static siz_t bli_pblk_block_size( pblk_t* pblk ) -{ - return pblk->block_size; -} - -// Pool block modification - -static void bli_pblk_set_buf( void* buf, pblk_t* pblk ) -{ - pblk->buf = buf; -} - -static void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) -{ - pblk->block_size = block_size; -} - -// -// -- pool block initialization 
------------------------------------------------ -// - -// NOTE: This initializer macro must be updated whenever fields are added or -// removed from the pblk_t type definition. An alternative to the initializer is -// calling bli_pblk_clear() at runtime. - -#define BLIS_PBLK_INITIALIZER \ - { \ - .buf = NULL, \ - .block_size = 0, \ - } \ - -static void bli_pblk_clear( pblk_t* pblk ) -{ - bli_pblk_set_buf( NULL, pblk ); - bli_pblk_set_block_size( 0, pblk ); -} - - -// Pool entry query - -static void* bli_pool_block_ptrs( pool_t* pool ) -{ - return pool->block_ptrs; -} - -static siz_t bli_pool_block_ptrs_len( pool_t* pool ) -{ - return pool->block_ptrs_len; -} - -static siz_t bli_pool_num_blocks( pool_t* pool ) -{ - return pool->num_blocks; -} - -static siz_t bli_pool_block_size( pool_t* pool ) -{ - return pool->block_size; -} - -static siz_t bli_pool_align_size( pool_t* pool ) -{ - return pool->align_size; -} - -static siz_t bli_pool_offset_size( pool_t* pool ) -{ - return pool->offset_size; -} - -static malloc_ft bli_pool_malloc_fp( pool_t* pool ) -{ - return pool->malloc_fp; -} - -static free_ft bli_pool_free_fp( pool_t* pool ) -{ - return pool->free_fp; -} - -static siz_t bli_pool_top_index( pool_t* pool ) -{ - return pool->top_index; -} - -static bool_t bli_pool_is_exhausted( pool_t* pool ) -{ - return ( bool_t ) - ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); -} - -// Pool entry modification - -static void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ -{ - pool->block_ptrs = block_ptrs; -} - -static void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ -{ - pool->block_ptrs_len = block_ptrs_len; -} - -static void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ -{ - pool->num_blocks = num_blocks; -} - -static void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ -{ - pool->block_size = block_size; -} - -static void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ -{ - 
pool->align_size = align_size; -} - -static void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ -{ - pool->offset_size = offset_size; -} - -static void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ -{ - pool->malloc_fp = malloc_fp; -} - -static void bli_pool_set_free_fp( free_ft free_fp, pool_t* pool ) \ -{ - pool->free_fp = free_fp; -} - -static void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ -{ - pool->top_index = top_index; -} - -// ----------------------------------------------------------------------------- - -void bli_pool_init - ( - siz_t num_blocks, - siz_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* restrict pool - ); -void bli_pool_finalize - ( - pool_t* restrict pool - ); -void bli_pool_reinit - ( - siz_t num_blocks_new, - siz_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - siz_t offset_size_new, - pool_t* restrict pool - ); - -void bli_pool_checkout_block - ( - siz_t req_size, - pblk_t* restrict block, - pool_t* restrict pool - ); -void bli_pool_checkin_block - ( - pblk_t* restrict block, - pool_t* restrict pool - ); - -void bli_pool_grow - ( - siz_t num_blocks_add, - pool_t* restrict pool - ); -void bli_pool_shrink - ( - siz_t num_blocks_sub, - pool_t* restrict pool - ); - -void bli_pool_alloc_block - ( - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - pblk_t* restrict block - ); -void bli_pool_free_block - ( - siz_t offset_size, - free_ft free_fp, - pblk_t* restrict block - ); - -void bli_pool_print - ( - pool_t* restrict pool - ); -void bli_pblk_print - ( - pblk_t* restrict pblk - ); - -#endif - -// end bli_pool.h -// begin bli_array.h - - -#ifndef BLIS_ARRAY_H -#define BLIS_ARRAY_H - -// -- Array type -- - - - - -// Array entry query - -static void* bli_array_buf( array_t* array ) -{ - return array->buf; -} - -static siz_t bli_array_num_elem( array_t* array ) -{ 
- return array->num_elem; -} - -static siz_t bli_array_elem_size( array_t* array ) -{ - return array->elem_size; -} - -// Array entry modification - -static void bli_array_set_buf( void* buf, array_t* array ) \ -{ - array->buf = buf; -} - -static void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ -{ - array->num_elem = num_elem; -} - -static void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ -{ - array->elem_size = elem_size; -} - -// ----------------------------------------------------------------------------- - -void bli_array_init - ( - const siz_t num_elem, - const siz_t elem_size, - array_t* restrict array - ); -void bli_array_resize - ( - const siz_t num_elem_new, - array_t* restrict array - ); -void bli_array_finalize - ( - array_t* restrict array - ); - -void* bli_array_elem - ( - const siz_t index, - array_t* restrict array - ); -void bli_array_set_elem - ( - void* restrict elem, - const siz_t index, - array_t* restrict array - ); - -#endif - -// end bli_array.h -// begin bli_apool.h - - -#ifndef BLIS_APOOL_H -#define BLIS_APOOL_H - -// -- Locked pool-of-arrays type -- - - - - -// apool entry query - -static pool_t* bli_apool_pool( apool_t* apool ) -{ - return &(apool->pool); -} - -static bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) -{ - return &(apool->mutex); -} - -static siz_t bli_apool_def_array_len( apool_t* pool ) -{ - return pool->def_array_len; -} - -static bool_t bli_apool_is_exhausted( apool_t* apool ) -{ - pool_t* restrict pool = bli_apool_pool( apool ); - - return bli_pool_is_exhausted( pool ); -} - -// apool action - -static void bli_apool_lock( apool_t* apool ) -{ - bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); -} - -static void bli_apool_unlock( apool_t* apool ) -{ - bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); -} - -// apool entry modification - -static void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ -{ - pool->def_array_len = def_array_len; -} - -// 
----------------------------------------------------------------------------- - -void bli_apool_init - ( - apool_t* restrict apool - ); -void bli_apool_finalize - ( - apool_t* restrict apool - ); - -array_t* bli_apool_checkout_array - ( - siz_t n_threads, - apool_t* restrict apool - ); -void bli_apool_checkin_array - ( - array_t* restrict array, - apool_t* restrict apool - ); - -pool_t* bli_apool_array_elem - ( - siz_t index, - array_t* restrict array - ); - -void bli_apool_grow - ( - siz_t num_blocks_add, - apool_t* restrict apool - ); - -void bli_apool_alloc_block - ( - siz_t num_elem, - array_t** restrict array_p - ); -void bli_apool_free_block - ( - array_t* restrict array - ); - - -#endif - -// end bli_apool.h -// begin bli_sba.h - - -#ifndef BLIS_SBA_H -#define BLIS_SBA_H - -apool_t* bli_sba_query( void ); - -// ----------------------------------------------------------------------------- - -void bli_sba_init( void ); -void bli_sba_finalize( void ); - -array_t* bli_sba_checkout_array - ( - const siz_t n_threads - ); - -void bli_sba_checkin_array - ( - array_t* restrict array - ); - -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* restrict array, - rntm_t* restrict rntm - ); - -void* bli_sba_acquire - ( - rntm_t* restrict rntm, - siz_t req_size - ); -void bli_sba_release - ( - rntm_t* restrict rntm, - void* restrict block - ); - - -#endif - -// end bli_sba.h -// begin bli_memsys.h - - -#ifndef BLIS_MEMSYS_H -#define BLIS_MEMSYS_H - -// ----------------------------------------------------------------------------- - -void bli_memsys_init( void ); -void bli_memsys_finalize( void ); - - -#endif - -// end bli_memsys.h -// begin bli_mem.h - - - -#ifndef BLIS_MEM_H -#define BLIS_MEM_H - - -// mem_t object type (defined in bli_type_defs.h) - - - -// -// -- mem_t query -------------------------------------------------------------- -// - -static pblk_t* bli_mem_pblk( mem_t* mem ) -{ - return &(mem->pblk); -} - -static void* bli_mem_buffer( mem_t* mem ) -{ - 
return bli_pblk_buf( bli_mem_pblk( mem ) ); -} - -static packbuf_t bli_mem_buf_type( mem_t* mem ) -{ - return mem->buf_type; -} - -static pool_t* bli_mem_pool( mem_t* mem ) -{ - return mem->pool; -} - -static siz_t bli_mem_size( mem_t* mem ) -{ - return mem->size; -} - -static bool_t bli_mem_is_alloc( mem_t* mem ) -{ - return ( bool_t ) - ( bli_mem_buffer( mem ) != NULL ); -} - -static bool_t bli_mem_is_unalloc( mem_t* mem ) -{ - return ( bool_t ) - ( bli_mem_buffer( mem ) == NULL ); -} - - -// -// -- mem_t modification ------------------------------------------------------- -// - -static void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) -{ - mem->pblk = *pblk; -} - -static void bli_mem_set_buffer( void* buf, mem_t* mem ) -{ - bli_pblk_set_buf( buf, &(mem->pblk) ); -} - -static void bli_mem_set_buf_type( packbuf_t buf_type, mem_t* mem ) -{ - mem->buf_type = buf_type; -} - -static void bli_mem_set_pool( pool_t* pool, mem_t* mem ) -{ - mem->pool = pool; -} - -static void bli_mem_set_size( siz_t size, mem_t* mem ) -{ - mem->size = size; -} - -// -// -- mem_t initialization ----------------------------------------------------- -// - -// NOTE: This initializer macro must be updated whenever fields are added or -// removed from the mem_t type definition. An alternative to the initializer is -// calling bli_mem_clear() at runtime. - -#define BLIS_MEM_INITIALIZER \ - { \ - .pblk = BLIS_PBLK_INITIALIZER, \ - .buf_type = -1, \ - .pool = NULL, \ - .size = 0, \ - } \ - -static void bli_mem_clear( mem_t* mem ) -{ - bli_mem_set_buffer( NULL, mem ); -#ifdef __cplusplus - packbuf_t pb; - //C++ has more strong type checking. 
Using -1 will result in error - //Pass actual type instead - bli_mem_set_buf_type ( pb, mem ); -#else - bli_mem_set_buf_type( ( packbuf_t )-1, mem ); -#endif - bli_mem_set_pool( NULL, mem ); - bli_mem_set_size( 0, mem ); -} - - -#endif -// end bli_mem.h -// begin bli_part.h - - -// begin bli_part_check.h - - -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -// end bli_part_check.h - -// -- Matrix partitioning ------------------------------------------------------ - -BLIS_EXPORT_BLIS void bli_acquire_mpart - ( - dim_t i, - dim_t j, - dim_t m, - dim_t n, - obj_t* obj, - obj_t* sub_obj - ); - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ - ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ - ); - -GENPROT( acquire_mpart_t2b ) -GENPROT( acquire_mpart_b2t ) -GENPROT( acquire_mpart_l2r ) -GENPROT( acquire_mpart_r2l ) -GENPROT( acquire_mpart_tl2br ) -GENPROT( acquire_mpart_br2tl ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ - ( \ - dir_t direct, \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ - ); - -GENPROT( acquire_mpart_mdim ) -GENPROT( acquire_mpart_ndim ) -GENPROT( acquire_mpart_mndim ) - - -// -- Vector partitioning ------------------------------------------------------ - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ - ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ - ); - -GENPROT( acquire_vpart_f2b ) -GENPROT( acquire_vpart_b2f ) - -// -- Scalar acquisition 
------------------------------------------------------- - -BLIS_EXPORT_BLIS void bli_acquire_mij - ( - dim_t i, - dim_t j, - obj_t* obj, - obj_t* sub_obj - ); - -BLIS_EXPORT_BLIS void bli_acquire_vi - ( - dim_t i, - obj_t* obj, - obj_t* sub_obj - ); - -// end bli_part.h -// begin bli_prune.h - - -void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, - obj_t* s, mdim_t mdim_s ); -// end bli_prune.h -// begin bli_query.h - - -BLIS_EXPORT_BLIS bool_t bli_obj_equals( obj_t* a, obj_t* b ); - -BLIS_EXPORT_BLIS bool_t bli_obj_imag_equals( obj_t* a, obj_t* b ); - -BLIS_EXPORT_BLIS bool_t bli_obj_imag_is_zero( obj_t* a ); -// end bli_query.h -// begin bli_auxinfo.h - - -#ifndef BLIS_AUXINFO_MACRO_DEFS_H -#define BLIS_AUXINFO_MACRO_DEFS_H - - -// auxinfo_t field query - -static pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) -{ - return ai->schema_a; -} -static pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) -{ - return ai->schema_b; -} - -static void* bli_auxinfo_next_a( auxinfo_t* ai ) -{ - return ai->a_next; -} -static void* bli_auxinfo_next_b( auxinfo_t* ai ) -{ - return ai->b_next; -} - -static inc_t bli_auxinfo_is_a( auxinfo_t* ai ) -{ - return ai->is_a; -} -static inc_t bli_auxinfo_is_b( auxinfo_t* ai ) -{ - return ai->is_b; -} - -static inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) -{ - return ai->ps_a; -} -static inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) -{ - return ai->ps_b; -} - -#if 0 -static inc_t bli_auxinfo_dt_on_output( auxinfo_t* ai ) -{ - return ai->dt_on_output; -} -#endif - - -// auxinfo_t field modification - -static void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) -{ - ai->schema_a = schema; -} -static void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) -{ - ai->schema_b = schema; -} - -static void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) -{ - ai->a_next = p; -} -static void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) -{ - ai->b_next = p; -} -static void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) -{ - ai->a_next = 
ap; - ai->b_next = bp; -} - -static void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) -{ - ai->is_a = is; -} -static void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) -{ - ai->is_b = is; -} - -static void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) -{ - ai->ps_a = ps; -} -static void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) -{ - ai->ps_b = ps; -} - -#if 0 -static void bli_auxinfo_set_dt_on_output( num_t dt_on_output, auxinfo_t* ai ) -{ - ai->dt_on_output = dt_on_output; -} -#endif - -#endif - -// end bli_auxinfo.h -// begin bli_param_map.h - - - -// --- BLIS to BLAS/LAPACK mappings -------------------------------------------- - -BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); - - -// --- BLAS/LAPACK to BLIS mappings -------------------------------------------- - -// NOTE: These static functions were converted from regular functions in order -// to reduce function call overhead within the BLAS compatibility layer. - -static void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) -{ - if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; - else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; - else - { - // Instead of reporting an error to the framework, default to - // an arbitrary value. This is needed because this function is - // called by the BLAS compatibility layer AFTER it has already - // checked errors and called xerbla(). 
If the application wants - // to override the BLAS compatibility layer's xerbla--which - // responds to errors with abort()--we need to also NOT call - // abort() here, since either way it has already been dealt - // with. - //bli_check_error_code( BLIS_INVALID_SIDE ); - *blis_side = BLIS_LEFT; - } -} - -static void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) -{ - if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; - else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. - //bli_check_error_code( BLIS_INVALID_UPLO ); - *blis_uplo = BLIS_LOWER; - } -} - -static void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) -{ - if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; - else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; - else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. - //bli_check_error_code( BLIS_INVALID_TRANS ); - *blis_trans = BLIS_NO_TRANSPOSE; - } -} - -static void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) -{ - if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; - else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. 
- //bli_check_error_code( BLIS_INVALID_DIAG ); - *blis_diag = BLIS_NONUNIT_DIAG; - } -} - - -// --- BLIS char to BLIS mappings ---------------------------------------------- - -BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); -BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); -BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); -BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); -BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); -BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); - - -// --- BLIS to BLIS char mappings ---------------------------------------------- - -BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); -BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); - -// end bli_param_map.h -// begin bli_clock.h - - -BLIS_EXPORT_BLIS double bli_clock( void ); -BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); - -double bli_clock_helper( void ); - -// end bli_clock.h -// begin bli_check.h - - - -BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); - -err_t bli_check_valid_error_level( errlev_t level ); - -err_t bli_check_null_pointer( void* ptr ); - -err_t bli_check_valid_side( side_t side ); -err_t bli_check_valid_uplo( uplo_t uplo ); -err_t bli_check_valid_trans( trans_t trans ); -err_t bli_check_valid_diag( diag_t diag ); -err_t bli_check_nonunit_diag( obj_t* a ); 
- -err_t bli_check_valid_datatype( num_t dt ); -err_t bli_check_object_valid_datatype( obj_t* a ); -err_t bli_check_noninteger_datatype( num_t dt ); -err_t bli_check_noninteger_object( obj_t* a ); -err_t bli_check_nonconstant_datatype( num_t dt ); -err_t bli_check_nonconstant_object( obj_t* a ); -err_t bli_check_floating_datatype( num_t dt ); -err_t bli_check_floating_object( obj_t* a ); -err_t bli_check_real_datatype( num_t dt ); -err_t bli_check_real_object( obj_t* a ); -err_t bli_check_integer_datatype( num_t dt ); -err_t bli_check_integer_object( obj_t* a ); -err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); -err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); -err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); -err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); -err_t bli_check_real_valued_object( obj_t* a ); -err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); -err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); - -err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); -err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); -err_t bli_check_scalar_object( obj_t* a ); -err_t bli_check_vector_object( obj_t* a ); -err_t bli_check_matrix_object( obj_t* a ); -err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); -err_t bli_check_square_object( obj_t* a ); -err_t bli_check_object_length_equals( obj_t* a, dim_t m ); -err_t bli_check_object_width_equals( obj_t* a, dim_t n ); -err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); -err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); - -err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); - -err_t bli_check_general_object( obj_t* a ); -err_t bli_check_hermitian_object( obj_t* a ); -err_t bli_check_symmetric_object( obj_t* a ); -err_t bli_check_triangular_object( obj_t* a ); -err_t bli_check_object_struc( obj_t* a, struc_t struc ); - -err_t bli_check_upper_or_lower_object( obj_t* a 
); - -err_t bli_check_valid_3x1_subpart( subpart_t part ); -err_t bli_check_valid_1x3_subpart( subpart_t part ); -err_t bli_check_valid_3x3_subpart( subpart_t part ); - -err_t bli_check_valid_cntl( void* cntl ); - -err_t bli_check_packm_schema_on_unpack( obj_t* a ); -err_t bli_check_packv_schema_on_unpack( obj_t* a ); - -err_t bli_check_object_buffer( obj_t* a ); - -err_t bli_check_valid_malloc_buf( void* ptr ); - -err_t bli_check_valid_packbuf( packbuf_t buf_type ); -err_t bli_check_if_exhausted_pool( pool_t* pool ); -err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx ); -err_t bli_check_alignment_is_power_of_two( size_t align_size ); -err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); - -err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); - -err_t bli_check_valid_arch_id( arch_t id ); - -err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); -err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); -err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); - -// end bli_check.h -// begin bli_error.h - - - -BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); -BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); - -BLIS_EXPORT_BLIS bool_t bli_error_checking_is_enabled( void ); - -void bli_print_msg( char* str, char* file, guint_t line ); -void bli_abort( void ); - -char* bli_error_string_for_code( gint_t code ); - -// end bli_error.h -// begin bli_f2c.h -// f2c.h -- Standard Fortran to C header file -// barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
-// - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) - -#ifndef BLIS_F2C_H -#define BLIS_F2C_H - -typedef f77_int bla_integer; -typedef f77_char bla_character; -//typedef char *address; -//typedef short int shortint; -typedef float bla_real; -typedef double bla_double; -typedef scomplex bla_scomplex; -typedef dcomplex bla_dcomplex; -typedef f77_int bla_logical; -//typedef short int shortlogical; -//typedef char logical1; -//typedef char integer1; -#ifdef INTEGER_STAR_8 // Adjust for integer*8. -typedef long long longint; // system-dependent -typedef unsigned long long ulongint; // system-dependent -#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) -#define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) -#endif - -#ifndef TRUE_ -#define TRUE_ (1) -#endif - -#ifndef FALSE_ -#define FALSE_ (0) -#endif - -// Extern is for use with -E -#ifndef Extern -#define Extern extern -#endif - -// I/O stuff - -#ifdef f2c_i2 -// for -i2 -//typedef short flag; -//typedef short ftnlen; -typedef bla_integer ftnlen; -//typedef short ftnint; -#else -//typedef long int flag; -//typedef long int ftnlen; -typedef bla_integer ftnlen; -//typedef long int ftnint; -#endif - -#ifndef VOID -#define VOID void -#endif - -#ifndef f2c_abs - #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) -#endif -#ifndef f2c_dabs - #define f2c_dabs(x) (doublereal)f2c_abs(x) -#endif -#ifndef f2c_min - #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) -#endif -#ifndef f2c_max - #define f2c_max(a,b) ((a) >= (b) ? 
(a) : (b)) -#endif -#ifndef f2c_dmin - #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) -#endif -#ifndef f2c_dmax - #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) -#endif - -#ifndef bit_test - #define bit_test(a,b) ((a) >> (b) & 1) -#endif - -#ifndef bit_clear - #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) -#endif - -#ifndef bit_set - #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) -#endif - -// undef any lower-case symbols that your C compiler predefines, e.g.: - -#ifndef Skip_f2c_Undefs -#undef cray -#undef gcos -#undef mc68010 -#undef mc68020 -#undef mips -#undef pdp11 -#undef sgi -#undef sparc -#undef sun -#undef sun2 -#undef sun3 -#undef sun4 -#undef u370 -#undef u3b -#undef u3b2 -#undef u3b5 -#undef unix -#undef vax -#endif - -#endif -// end bli_f2c.h -// begin bli_machval.h - - -// begin bli_lsame.h - - -bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); -// end bli_lsame.h -// begin bli_slamch.h - - -bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); -// end bli_slamch.h -// begin bli_dlamch.h - - -bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); -// end bli_dlamch.h - -// -// Prototype object-based interface. -// -BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROTR -#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ - ( \ - machval_t mval, \ - void* v \ - ); - -INSERT_GENTPROTR_BASIC0( machval ) - -// end bli_machval.h -// begin bli_getopt.h - - -typedef struct getopt_s -{ - char* optarg; - int optind; - int opterr; - int optopt; -} getopt_t; - -BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); - -BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); - -// end bli_getopt.h -// begin bli_opid.h - - -static bool_t bli_opid_is_level3( opid_t opid ) -{ - return ( bool_t ) - ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); -} - -// end bli_opid.h -// begin bli_cntl.h - - - - - - -// -- Control tree prototypes -- - -BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - void* params, - cntl_t* sub_node - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_node - ( - rntm_t* rntm, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS void bli_cntl_clear_node - ( - cntl_t* cntl - ); - -// ----------------------------------------------------------------------------- - -BLIS_EXPORT_BLIS void bli_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy - ( - rntm_t* rntm, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS void bli_cntl_mark_family - ( - opid_t family, - cntl_t* cntl - ); - -// ----------------------------------------------------------------------------- - -dim_t bli_cntl_calc_num_threads_in - ( - rntm_t* rntm, - cntl_t* cntl - ); - -// ----------------------------------------------------------------------------- - -// cntl_t query (fields only) - -static opid_t bli_cntl_family( cntl_t* 
cntl ) -{ - return cntl->family; -} - -static bszid_t bli_cntl_bszid( cntl_t* cntl ) -{ - return cntl->bszid; -} - -static void_fp bli_cntl_var_func( cntl_t* cntl ) -{ - return cntl->var_func; -} - -static cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) -{ - return cntl->sub_prenode; -} - -static cntl_t* bli_cntl_sub_node( cntl_t* cntl ) -{ - return cntl->sub_node; -} - -static void* bli_cntl_params( cntl_t* cntl ) -{ - return cntl->params; -} - -static uint64_t bli_cntl_params_size( cntl_t* cntl ) -{ - // The first 64 bytes is always the size of the params structure. - return *( ( uint64_t* )(cntl->params) ); -} - -static mem_t* bli_cntl_pack_mem( cntl_t* cntl ) -{ - return &(cntl->pack_mem); -} - -// cntl_t query (complex) - -static bool_t bli_cntl_is_null( cntl_t* cntl ) -{ - return ( bool_t ) - ( cntl == NULL ); -} - -static bool_t bli_cntl_is_leaf( cntl_t* cntl ) -{ - return ( bool_t ) - ( bli_cntl_sub_node( cntl ) == NULL ); -} - -static bool_t bli_cntl_does_part( cntl_t* cntl ) -{ - return ( bool_t ) - ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); -} - -// cntl_t modification - -static void bli_cntl_set_family( opid_t family, cntl_t* cntl ) -{ - cntl->family = family; -} - -static void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) -{ - cntl->bszid = bszid; -} - -static void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) -{ - cntl->var_func = var_func; -} - -static void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) -{ - cntl->sub_prenode = sub_prenode; -} - -static void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) -{ - cntl->sub_node = sub_node; -} - -static void bli_cntl_set_params( void* params, cntl_t* cntl ) -{ - cntl->params = params; -} - -static void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) -{ - cntl->pack_mem = *pack_mem; -} - -// end bli_cntl.h -// begin bli_env.h - - -#ifndef BLIS_ENV_H -#define BLIS_ENV_H - -dim_t bli_env_get_var( const char* env, dim_t fallback ); -//void bli_env_set_var( const 
char* env, dim_t value ); - -#endif - -// end bli_env.h -// begin bli_pack.h - - -#ifndef BLIS_PACK_H -#define BLIS_PACK_H - -void bli_pack_init( void ); -void bli_pack_finalize( void ); - -BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void ); -BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void ); -BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool_t pack_a ); -BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool_t pack_b ); - -void bli_pack_init_rntm_from_env( rntm_t* rntm ); - -#endif - -// end bli_pack.h -// begin bli_info.h - - - -// -- General library information ---------------------------------------------- - -BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); -BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); - - -// -- General configuration-related -------------------------------------------- - -BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); -BLIS_EXPORT_BLIS gint_t 
bli_info_get_pool_addr_offset_size_c( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); - - -// -- Kernel implementation-related -------------------------------------------- - - -// -- Level-3 kernel definitions -- - -BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); - - -// -- BLIS implementation query (level-3) -------------------------------------- - -BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( 
num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); - -// end bli_info.h -// begin bli_arch.h - - -#ifndef BLIS_ARCH_H -#define BLIS_ARCH_H - -BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); - -void bli_arch_set_id_once( void ); -void bli_arch_set_id( void ); - -BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); - -void bli_arch_set_logging( bool_t dolog ); -bool_t bli_arch_get_logging( void ); -void bli_arch_log( char*, ... ); - -#endif - -// end bli_arch.h -// begin bli_cpuid.h - - -#if 0 - // Used only during standalone testing of ARM support. - #define FALSE 0 - #define TRUE 1 - typedef enum - { - BLIS_ARCH_CORTEXA57 = 10, - BLIS_ARCH_CORTEXA15 = 11, - BLIS_ARCH_CORTEXA9 = 12, - BLIS_ARCH_GENERIC = 13 - } arch_t; - typedef uint64_t bool_t; - #define bli_abort abort -#endif - -#ifndef BLIS_CPUID_H -#define BLIS_CPUID_H - -arch_t bli_cpuid_query_id( void ); - -// Intel -bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); -bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); -bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); -bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); -bool_t bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); - -// AMD -BLIS_EXPORT_BLIS bool_t bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); -BLIS_EXPORT_BLIS bool_t bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); -BLIS_EXPORT_BLIS bool_t bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); -BLIS_EXPORT_BLIS bool_t bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); -BLIS_EXPORT_BLIS bool_t bli_cpuid_is_piledriver( 
uint32_t family, uint32_t model, uint32_t features ); -BLIS_EXPORT_BLIS bool_t bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); - -// ARM -bool_t bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); -bool_t bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features ); -bool_t bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); -bool_t bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); -bool_t bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); - -uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); - -// ----------------------------------------------------------------------------- - -// -// This section of the file was based off of cpuid.hpp from TBLIS [1]. -// -// [1] https://github.com/devinamatthews/tblis -// - - - -static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want ) -{ - return ( have & want ) == want; -} - -// ----------------------------------------------------------------------------- - -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) - -// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 -// for more information why this move was made. 
-//#include "cpuid.h" - -void get_cpu_name( char *cpu_name ); -int vpu_count( void ); - - -enum -{ - VENDOR_INTEL = 0, - VENDOR_AMD, - VENDOR_UNKNOWN -}; -enum -{ - FEATURE_SSE3 = 0x0001, - FEATURE_SSSE3 = 0x0002, - FEATURE_SSE41 = 0x0004, - FEATURE_SSE42 = 0x0008, - FEATURE_AVX = 0x0010, - FEATURE_AVX2 = 0x0020, - FEATURE_FMA3 = 0x0040, - FEATURE_FMA4 = 0x0080, - FEATURE_AVX512F = 0x0100, - FEATURE_AVX512DQ = 0x0200, - FEATURE_AVX512PF = 0x0400, - FEATURE_AVX512ER = 0x0800, - FEATURE_AVX512CD = 0x1000, - FEATURE_AVX512BW = 0x2000, - FEATURE_AVX512VL = 0x4000 -}; - -#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) - -char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); - -enum -{ - VENDOR_ARM = 0, - VENDOR_UNKNOWN -}; -enum -{ - MODEL_ARMV7 = 0, - MODEL_ARMV8, - MODEL_UNKNOWN -}; -enum -{ - FEATURE_NEON = 0x1 -}; - -#endif - - - -#endif - -// end bli_cpuid.h -// begin bli_string.h - - -void bli_string_mkupper( char* s ); -// end bli_string.h -// begin bli_setgetij.h - - -BLIS_EXPORT_BLIS err_t bli_setijm - ( - double ar, - double ai, - dim_t i, - dim_t j, - obj_t* b - ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - double ar, \ - double ai, \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs \ - ); - -INSERT_GENTPROT_BASIC0( setijm ) - -// ----------------------------------------------------------------------------- - -BLIS_EXPORT_BLIS err_t bli_getijm - ( - dim_t i, - dim_t j, - obj_t* b, - double* ar, - double* ai - ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ - ); - -INSERT_GENTPROT_BASIC0( getijm ) - -// end bli_setgetij.h -// begin bli_setri.h - - -// -- setr --------------------------------------------------------------------- - -BLIS_EXPORT_BLIS void 
bli_setrm - ( - obj_t* alpha, - obj_t* b - ); - -BLIS_EXPORT_BLIS void bli_setrv - ( - obj_t* alpha, - obj_t* x - ); - -// -- seti --------------------------------------------------------------------- - -BLIS_EXPORT_BLIS void bli_setim - ( - obj_t* alpha, - obj_t* b - ); - -BLIS_EXPORT_BLIS void bli_setiv - ( - obj_t* alpha, - obj_t* x - ); - -// end bli_setri.h - -// begin bli_castm.h - - -// -// Prototype object-based interface. -// - -BLIS_EXPORT_BLIS void bli_castm - ( - obj_t* a, - obj_t* b - ); - -// -// Prototype BLAS-like interfaces with heterogeneous-typed operands. -// - -#undef GENTPROT2 -#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ - ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ - ); - -INSERT_GENTPROT2_BASIC0( castm ) -INSERT_GENTPROT2_MIXDP0( castm ) - -// -// Prototype object-based _check() function. -// - -void bli_castm_check - ( - obj_t* a, - obj_t* b - ); - -// end bli_castm.h -// begin bli_castnzm.h - - -// -// Prototype object-based interface. -// - -BLIS_EXPORT_BLIS void bli_castnzm - ( - obj_t* a, - obj_t* b - ); - -// -// Prototype BLAS-like interfaces with heterogeneous-typed operands. -// - -#undef GENTPROT2 -#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ - ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ - ); - -INSERT_GENTPROT2_BASIC0( castnzm ) -INSERT_GENTPROT2_MIXDP0( castnzm ) - -// -// Prototype object-based _check() function. -// - -void bli_castnzm_check - ( - obj_t* a, - obj_t* b - ); - -// end bli_castnzm.h -// begin bli_castv.h - - -// -// Prototype object-based interface. -// - -BLIS_EXPORT_BLIS void bli_castv - ( - obj_t* x, - obj_t* y - ); - -// -// Prototype BLAS-like interfaces with heterogeneous-typed operands. 
-// - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ - ( \ - conj_t conjx, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC0( castv ) -INSERT_GENTPROT2_MIXDP0( castv ) - -// -// Prototype object-based _check() function. -// - -void bli_castv_check - ( - obj_t* x, - obj_t* y - ); - -// end bli_castv.h -// begin bli_projm.h - - -BLIS_EXPORT_BLIS void bli_projm - ( - obj_t* a, - obj_t* b - ); - -void bli_projm_check - ( - obj_t* a, - obj_t* b - ); - -// end bli_projm.h -// begin bli_projv.h - - -BLIS_EXPORT_BLIS void bli_projv - ( - obj_t* x, - obj_t* y - ); - -void bli_projv_check - ( - obj_t* x, - obj_t* y - ); - -// end bli_projv.h - - -// -- Level-0 operations -- - -// begin bli_l0.h - - -// begin bli_l0_check.h - - - -// -// Prototype object-based check functions. -// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* chi, \ - obj_t* psi \ - ); - -GENTPROT( addsc ) -GENTPROT( copysc ) -GENTPROT( divsc ) -GENTPROT( mulsc ) -GENTPROT( sqrtsc ) -GENTPROT( subsc ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* chi \ - ); - -GENTPROT( invertsc ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* chi, \ - obj_t* absq \ - ); - -GENTPROT( absqsc ) -GENTPROT( normfsc ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ - ); - -GENTPROT( getsc ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - double zeta_r, \ - double zeta_i, \ - obj_t* chi \ - ); - -GENTPROT( setsc ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ - ); - -GENTPROT( unzipsc ) - - -#undef 
GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ - ); - -GENTPROT( zipsc ) - - -// ----------------------------------------------------------------------------- - -void bli_l0_xsc_check - ( - obj_t* chi - ); - -void bli_l0_xxsc_check - ( - obj_t* chi, - obj_t* psi - ); - -void bli_l0_xx2sc_check - ( - obj_t* chi, - obj_t* norm - ); -// end bli_l0_check.h - -// begin bli_l0_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi, \ - obj_t* absq \ - ); - -GENPROT( absqsc ) -GENPROT( normfsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi, \ - obj_t* psi \ - ); - -GENPROT( addsc ) -GENPROT( divsc ) -GENPROT( mulsc ) -GENPROT( sqrtsc ) -GENPROT( subsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi \ - ); - -GENPROT( invertsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ - ); - -GENPROT( getsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - double zeta_r, \ - double zeta_i, \ - obj_t* chi \ - ); - -GENPROT( setsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ - ); - -GENPROT( unzipsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ - ); - -GENPROT( zipsc ) - - - - - - - -// end bli_l0_oapi.h -// begin bli_l0_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ - ); - -INSERT_GENTPROT_BASIC0( addsc ) -INSERT_GENTPROT_BASIC0( divsc ) -INSERT_GENTPROT_BASIC0( mulsc ) -INSERT_GENTPROT_BASIC0( subsc ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - conj_t conjchi, \ - ctype* chi \ - ); - -INSERT_GENTPROT_BASIC0( invertsc ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - ctype* chi, \ - ctype_r* absq \ - ); - -INSERT_GENTPROTR_BASIC0( absqsc ) -INSERT_GENTPROTR_BASIC0( normfsc ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - ctype* chi, \ - ctype* psi \ - ); - -INSERT_GENTPROT_BASIC0( sqrtsc ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ - ); - -INSERT_GENTPROT_BASIC0( getsc ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - double zeta_r, \ - double zeta_i, \ - ctype* chi \ - ); - -INSERT_GENTPROT_BASIC0( setsc ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ - ); - -INSERT_GENTPROTR_BASIC0( unzipsc ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ - ); - -INSERT_GENTPROTR_BASIC0( zipsc ) - -// ----------------------------------------------------------------------------- - -BLIS_EXPORT_BLIS void bli_igetsc - ( - dim_t* chi, - double* zeta_r, - double* zeta_i - ); - 
-BLIS_EXPORT_BLIS void bli_isetsc - ( - double zeta_r, - double zeta_i, - dim_t* chi - ); - -// end bli_l0_tapi.h -// begin bli_l0_ft.h - - - -// -// -- Level-0 function types --------------------------------------------------- -// - -// addsc, divsc, subsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ - ); - -INSERT_GENTDEF( addsc ) -INSERT_GENTDEF( divsc ) -INSERT_GENTDEF( subsc ) - -// invertsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - conj_t conjchi, \ - ctype* chi \ - ); - -INSERT_GENTDEF( invertsc ) - -// mulsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ - ); - -INSERT_GENTDEF( mulsc ) - -// absqsc - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - ctype* chi, \ - ctype_r* absq \ - ); - -INSERT_GENTDEFR( absqsc ) - -// normfsc - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - ctype* chi, \ - ctype_r* norm \ - ); - -INSERT_GENTDEFR( normfsc ) - -// sqrtsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - ctype* chi, \ - ctype* psi \ - ); - -INSERT_GENTDEF( sqrtsc ) - -// getsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ - ); - -INSERT_GENTDEF( getsc ) - -// setsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - double zeta_r, \ - double zeta_i, \ - ctype* chi \ - ); - -INSERT_GENTDEF( setsc ) - -// unzipsc - -#undef 
GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ - ); - -INSERT_GENTDEFR( unzipsc ) - -// zipsc - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ - ); - -INSERT_GENTDEFR( zipsc ) - - -// end bli_l0_ft.h - -// Generate function pointer arrays for tapi functions. -// begin bli_l0_fpa.h - - -// -// Prototype function pointer query interface. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH(opname,_vft) \ -PASTEMAC(opname,_qfp)( num_t dt ); - -GENPROT( absqsc ) -GENPROT( normfsc ) -GENPROT( addsc ) -GENPROT( divsc ) -GENPROT( mulsc ) -GENPROT( subsc ) -GENPROT( invertsc ) -GENPROT( sqrtsc ) -GENPROT( unzipsc ) -GENPROT( zipsc ) - -GENPROT( getsc ) -GENPROT( setsc ) - -// end bli_l0_fpa.h - -// copysc -// begin bli_copysc.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi, \ - obj_t* psi \ - ); -GENFRONT( copysc ) - - -// -// Prototype BLAS-like interfaces with heterogeneous-typed operands. -// - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ - ( \ - conj_t conjchi, \ - void* chi, \ - void* psi \ - ); - -INSERT_GENTPROT2_BASIC0( copysc ) -INSERT_GENTPROT2_MIX_D0( copysc ) -INSERT_GENTPROT2_MIX_P0( copysc ) - -// end bli_copysc.h -// end bli_l0.h - - -// -- Level-1v operations -- - -// begin bli_l1v.h - - -// begin bli_l1v_check.h - - - -// -// Prototype object-based check functions. 
-// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* y \ - ); - -GENTPROT( addv ) -GENTPROT( copyv ) -GENTPROT( subv ) -GENTPROT( swapv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ); - -GENTPROT( amaxv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - ); - -GENTPROT( axpbyv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - ); - -GENTPROT( axpyv ) -GENTPROT( scal2v ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ - ); - -GENTPROT( dotv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ - ); - -GENTPROT( dotxv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x \ - ); - -GENTPROT( invertv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - ); - -GENTPROT( scalv ) -GENTPROT( setv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - ); - -GENTPROT( xpbyv ) - - - -// ----------------------------------------------------------------------------- - -void bli_l1v_xy_check - ( - obj_t* x, - obj_t* y - ); - -void bli_l1v_axy_check - ( - obj_t* alpha, - obj_t* x, - obj_t* y - ); - -void bli_l1v_xby_check - ( - obj_t* x, - obj_t* beta, - obj_t* y - ); - -void bli_l1v_axby_check - ( - obj_t* alpha, - obj_t* x, - obj_t* beta, - obj_t* y - ); - -void bli_l1v_dot_check - ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* beta, 
- obj_t* rho - ); - -void bli_l1v_x_check - ( - obj_t* x - ); - -void bli_l1v_ax_check - ( - obj_t* alpha, - obj_t* x - ); - -void bli_l1v_xi_check - ( - obj_t* x, - obj_t* index - ); - -// end bli_l1v_check.h - -// Define kernel function types. -//#include "bli_l1v_ft_ex.h" -// begin bli_l1v_ft_ker.h - - -#ifndef BLIS_L1V_FT_KER_H -#define BLIS_L1V_FT_KER_H - - -// -// -- Level-1v kernel function types ------------------------------------------- -// - -// addv, copyv, subv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( addv ) -INSERT_GENTDEF( copyv ) -INSERT_GENTDEF( subv ) - -// amaxv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - dim_t* restrict index, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( amaxv ) - -// axpbyv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( axpbyv ) - -// axpyv, scal2v - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( axpyv ) -INSERT_GENTDEF( scal2v ) - -// dotv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - 
ctype* restrict rho, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( dotv ) - -// dotxv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict beta, \ - ctype* restrict rho, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( dotxv ) - -// invertv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( invertv ) - -// scalv, setv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( scalv ) -INSERT_GENTDEF( setv ) - -// swapv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( swapv ) - -// xpybv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( xpbyv ) - - -#endif - -// end bli_l1v_ft_ker.h - -// Prototype object APIs (expert and non-expert). -// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). 
-#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_l1v_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( addv ) -GENTPROT( copyv ) -GENTPROT( subv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( amaxv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpbyv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpyv ) -GENTPROT( scal2v ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotxv ) - - -#undef GENTPROT 
-#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( invertv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( scalv ) -GENTPROT( setv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( swapv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( xpbyv ) - -// end bli_l1v_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_l1v_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( addv ) -GENTPROT( copyv ) -GENTPROT( subv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( amaxv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpbyv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpyv ) -GENTPROT( scal2v ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotxv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( invertv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( scalv ) -GENTPROT( setv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( swapv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* 
beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( xpbyv ) - -// end bli_l1v_oapi.h - -// Prototype typed APIs (expert and non-expert). -// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_l1v_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( addv ) -INSERT_GENTPROT_BASIC0( copyv ) -INSERT_GENTPROT_BASIC0( subv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( amaxv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( axpbyv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( axpyv ) -INSERT_GENTPROT_BASIC0( scal2v ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( dotv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( dotxv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - 
ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( invertv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( scalv ) -INSERT_GENTPROT_BASIC0( setv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( swapv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( xpbyv ) -// end bli_l1v_tapi.h -// begin bli_l1v_ft.h - - - -// -// -- Level-1v function types -------------------------------------------------- -// - -// addv, copyv, subv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( addv ) -INSERT_GENTDEF( copyv ) -INSERT_GENTDEF( subv ) - -// amaxv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( amaxv ) - -// axpbyv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpbyv ) - -// axpyv, scal2v - -#undef 
GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpyv ) -INSERT_GENTDEF( scal2v ) - -// dotv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotv ) - -// dotxv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotxv ) - -// invertv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( invertv ) - -// scalv, setv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scalv ) -INSERT_GENTDEF( setv ) - -// swapv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( swapv ) - -// xpybv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); 
- -INSERT_GENTDEF( xpbyv ) - - -// end bli_l1v_ft.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_l1v_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( addv ) -INSERT_GENTPROT_BASIC0( copyv ) -INSERT_GENTPROT_BASIC0( subv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( amaxv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( axpbyv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( axpyv ) -INSERT_GENTPROT_BASIC0( scal2v ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( dotv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( dotxv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - 
ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( invertv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( scalv ) -INSERT_GENTPROT_BASIC0( setv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( swapv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); \ - -INSERT_GENTPROT_BASIC0( xpbyv ) -// end bli_l1v_tapi.h -// begin bli_l1v_ft.h - - - -// -// -- Level-1v function types -------------------------------------------------- -// - -// addv, copyv, subv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( addv ) -INSERT_GENTDEF( copyv ) -INSERT_GENTDEF( subv ) - -// amaxv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( amaxv ) - -// axpbyv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpbyv ) - -// axpyv, scal2v - -#undef 
GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpyv ) -INSERT_GENTDEF( scal2v ) - -// dotv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotv ) - -// dotxv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotxv ) - -// invertv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( invertv ) - -// scalv, setv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scalv ) -INSERT_GENTDEF( setv ) - -// swapv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( swapv ) - -// xpybv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); 
- -INSERT_GENTDEF( xpbyv ) - - -// end bli_l1v_ft.h - -// Generate function pointer arrays for tapi functions (expert only). -// begin bli_l1v_fpa.h - - -// -// Prototype function pointer query interface. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); - -GENPROT( addv ) -GENPROT( copyv ) -GENPROT( subv ) -GENPROT( amaxv ) -GENPROT( axpbyv ) -GENPROT( axpyv ) -GENPROT( scal2v ) -GENPROT( dotv ) -GENPROT( dotxv ) -GENPROT( invertv ) -GENPROT( scalv ) -GENPROT( setv ) -GENPROT( swapv ) -GENPROT( xpbyv ) - -// end bli_l1v_fpa.h - -// Pack-related -// NOTE: packv and unpackv are temporarily disabled. -//#include "bli_packv.h" -//#include "bli_unpackv.h" - -// Other -// NOTE: scalv control tree code is temporarily disabled. -//#include "bli_scalv_cntl.h" -//#include "bli_scalv_int.h" - -// end bli_l1v.h - - -// -- Level-1d operations -- - -// begin bli_l1d.h - - -// begin bli_l1d_check.h - - - -// -// Prototype object-based check functions. 
-// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* y \ - ); - -GENTPROT( addd ) -GENTPROT( copyd ) -GENTPROT( subd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - ); - -GENTPROT( axpyd ) -GENTPROT( scal2d ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x \ - ); - -GENTPROT( invertd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - ); - -GENTPROT( scald ) -GENTPROT( setd ) -GENTPROT( setid ) -GENTPROT( shiftd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - ); - -GENTPROT( xpbyd ) - - -// ----------------------------------------------------------------------------- - -void bli_l1d_xy_check - ( - obj_t* x, - obj_t* y - ); - -void bli_l1d_axy_check - ( - obj_t* alpha, - obj_t* x, - obj_t* y - ); - -void bli_l1d_x_check - ( - obj_t* x - ); - -void bli_l1d_ax_check - ( - obj_t* alpha, - obj_t* x - ); - -// end bli_l1d_check.h - -// Prototype object APIs (expert and non-expert). -// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. 
-#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_l1d_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( addd ) -GENTPROT( copyd ) -GENTPROT( subd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpyd ) -GENTPROT( scal2d ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( invertd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( scald ) -GENTPROT( setd ) -GENTPROT( setid ) -GENTPROT( shiftd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( xpbyd ) - -// end bli_l1d_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. 
-#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_l1d_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( addd ) -GENTPROT( copyd ) -GENTPROT( subd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpyd ) -GENTPROT( scal2d ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( invertd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( scald ) -GENTPROT( setd ) -GENTPROT( setid ) -GENTPROT( shiftd ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( xpbyd ) - -// end bli_l1d_oapi.h - -// Prototype typed APIs (expert and non-expert). -// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. 
-#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_l1d_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( addd ) -INSERT_GENTPROT_BASIC0( copyd ) -INSERT_GENTPROT_BASIC0( subd ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpyd ) -INSERT_GENTPROT_BASIC0( scal2d ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( invertd ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( scald ) -INSERT_GENTPROT_BASIC0( setd ) - - -#undef 
GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( setid ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( shiftd ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( xpbyd ) - -// end bli_l1d_tapi.h -// begin bli_l1d_ft.h - - - -// -// -- Level-1d function types -------------------------------------------------- -// - -// addd, copyd, subd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( addd ) -INSERT_GENTDEF( copyd ) -INSERT_GENTDEF( subd ) - -// axpyd, scal2d - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpyd ) -INSERT_GENTDEF( scal2d ) - -// invertd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ 
-typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( invertd ) - -// scald, setd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scald ) -INSERT_GENTDEF( setd ) - -// setid - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( setid ) - -// shiftd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( shiftd ) - -// xpbyd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( xpbyd ) - -// end bli_l1d_ft.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. 
-#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_l1d_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( addd ) -INSERT_GENTPROT_BASIC0( copyd ) -INSERT_GENTPROT_BASIC0( subd ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpyd ) -INSERT_GENTPROT_BASIC0( scal2d ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( invertd ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - 
-INSERT_GENTPROT_BASIC0( scald ) -INSERT_GENTPROT_BASIC0( setd ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( setid ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( shiftd ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( xpbyd ) - -// end bli_l1d_tapi.h -// begin bli_l1d_ft.h - - - -// -// -- Level-1d function types -------------------------------------------------- -// - -// addd, copyd, subd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( addd ) -INSERT_GENTDEF( copyd ) -INSERT_GENTDEF( subd ) - -// axpyd, scal2d - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpyd ) -INSERT_GENTDEF( scal2d ) - -// 
invertd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( invertd ) - -// scald, setd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scald ) -INSERT_GENTDEF( setd ) - -// setid - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( setid ) - -// shiftd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( shiftd ) - -// xpbyd - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( xpbyd ) - -// end bli_l1d_ft.h - -// Generate function pointer arrays for tapi functions (expert only). -// begin bli_l1d_fpa.h - - -// -// Prototype function pointer query interface. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); - -GENPROT( addd ) -GENPROT( copyd ) -GENPROT( subd ) -GENPROT( axpyd ) -GENPROT( scal2d ) -GENPROT( invertd ) -GENPROT( scald ) -GENPROT( setd ) -GENPROT( setid ) -GENPROT( shiftd ) -GENPROT( xpbyd ) - -// end bli_l1d_fpa.h - -// end bli_l1d.h - - -// -- Level-1f operations -- - -// begin bli_l1f.h - - -// begin bli_l1f_check.h - - - -// -// Prototype object-based check functions. -// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ - ); - -GENTPROT( axpy2v ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ - ); - -GENTPROT( axpyf ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ - ); - -GENTPROT( dotaxpyv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ - ); - -GENTPROT( dotxaxpyf ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - ); - -GENTPROT( dotxf ) - -// end bli_l1f_check.h - -// Define kernel function types. 
-// begin bli_l1f_ft_ker.h - - -#ifndef BLIS_L1F_FT_KER_H -#define BLIS_L1F_FT_KER_H - - -// -// -- Level-1f kernel function types ------------------------------------------- -// - -// axpy2v - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* restrict alpha1, \ - ctype* restrict alpha2, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict z, inc_t incz, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( axpy2v ) - -// axpyf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( axpyf ) - -// dotaxpyv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* restrict alpha, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict rho, \ - ctype* restrict z, inc_t incz, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( dotaxpyv ) - -// dotxf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( dotxf ) - -// dotxaxpyf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t 
conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict w, inc_t incw, \ - ctype* restrict x, inc_t incx, \ - ctype* restrict beta, \ - ctype* restrict y, inc_t incy, \ - ctype* restrict z, inc_t incz, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( dotxaxpyf ) - - - -#endif - -// end bli_l1f_ft_ker.h - -// Prototype object APIs (expert and non-expert). -// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_l1f_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpy2v ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpyf ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotaxpyv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotxaxpyf ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotxf ) - -// end bli_l1f_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. 
-#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_l1f_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpy2v ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( axpyf ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotaxpyv ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotxaxpyf ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENTPROT( dotxf ) - -// end bli_l1f_oapi.h - -// Prototype typed APIs (expert and non-expert). -// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). 
-#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_l1f_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alphax, \ - ctype* alphay, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpy2v ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpyf ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( dotaxpyv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ 
- ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( dotxaxpyf ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( dotxf ) - -// end bli_l1f_tapi.h -// begin bli_l1f_ft.h - - - -// -// -- Level-1f function types -------------------------------------------------- -// - -// axpy2v - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha1, \ - ctype* alpha2, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpy2v ) - -// axpyf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpyf ) - -// dotaxpyv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotaxpyv ) - -// dotxf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - 
conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotxf ) - -// dotxaxpyf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotxaxpyf ) - - -// end bli_l1f_ft.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_l1f_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alphax, \ - ctype* alphay, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpy2v ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpyf ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( dotaxpyv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( dotxaxpyf ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( dotxf ) - -// end bli_l1f_tapi.h -// begin bli_l1f_ft.h - - - -// -// -- 
Level-1f function types -------------------------------------------------- -// - -// axpy2v - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha1, \ - ctype* alpha2, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpy2v ) - -// axpyf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpyf ) - -// dotaxpyv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotaxpyv ) - -// dotxf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( dotxf ) - -// dotxaxpyf - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ - BLIS_TAPI_EX_PARAMS \ - 
); - -INSERT_GENTDEF( dotxaxpyf ) - - -// end bli_l1f_ft.h - -// Generate function pointer arrays for tapi functions (expert only). -// begin bli_l1f_fpa.h - - -// -// Prototype function pointer query interface. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); - -GENPROT( axpy2v ) -GENPROT( axpyf ) -GENPROT( dotaxpyv ) -GENPROT( dotxaxpyf ) -GENPROT( dotxf ) - -// end bli_l1f_fpa.h - -// end bli_l1f.h - - -// -- Level-1m operations -- - -// begin bli_l1m.h - - -// begin bli_l1m_check.h - - - -// -// Prototype object-based check functions. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* y \ - ); - -GENPROT( addm ) -GENPROT( copym ) -GENPROT( subm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - ); - -GENPROT( axpym ) -GENPROT( scal2m ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - ); - -GENPROT( scalm ) -GENPROT( setm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - ); - -GENPROT( xpbym ) - - -// ----------------------------------------------------------------------------- - -void bli_l1m_xy_check - ( - obj_t* x, - obj_t* y - ); - -void bli_l1m_axy_check - ( - obj_t* alpha, - obj_t* x, - obj_t* y - ); - -void bli_l1m_ax_check - ( - obj_t* alpha, - obj_t* x - ); - -// end bli_l1m_check.h - -// Define kernel function types. -// begin bli_l1m_ft_ker.h - - -#ifndef BLIS_L1M_FT_KER_H -#define BLIS_L1M_FT_KER_H - - -// -// -- Level-1m kernel function types ------------------------------------------- -// - -// packm - -// NOTE: This is the function type for the structure-aware "kernel". 
- -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( packm ) - - -// NOTE: the following macros generate packm kernel function type definitions -// that are "ctyped" and void-typed, for each of the floating-point datatypes. -// However, we will only make use of the void-typed definitions because the -// functions such as bli_?packm_cxk() (currently) use arrays of function -// pointers to store and access the function pointers for various unrolling -// (register blocksize) values, and therefore they must all be of the same -// type (hence the use of void* for kappa, a, and p). 
- -// packm_ker - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( packm_cxk ) - -// unpackm_ker - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conjp, \ - dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict p, inc_t ldp, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( unpackm_cxk ) - -// packm_3mis_ker -// packm_4mi_ker - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conja, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t is_p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( packm_cxk_3mis ) -INSERT_GENTDEF( packm_cxk_4mi ) - -// packm_rih_ker -// packm_1er_ker - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( packm_cxk_rih ) -INSERT_GENTDEF( packm_cxk_1er ) - - - - - -#endif - -// end bli_l1m_ft_ker.h - -// Define object function types for variants. 
-// begin bli_l1m_oft_var.h - - -#ifndef BLIS_L1M_OFT_VAR_H -#define BLIS_L1M_OFT_VAR_H - - -// -// -- Level-3 variant function types ------------------------------------------- -// - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* a, \ - obj_t* p, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( packm ) - - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* p, \ - obj_t* a, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( unpackm ) - - - -#endif - -// end bli_l1m_oft_var.h - -// Prototype object APIs (expert and non-expert). -// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_l1m_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( addm ) -GENPROT( copym ) -GENPROT( subm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( axpym ) -GENPROT( scal2m ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( scalm ) -GENPROT( setm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( xpbym ) -GENPROT( xpbym_md ) - -// end bli_l1m_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_l1m_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( addm ) -GENPROT( copym ) -GENPROT( subm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( axpym ) -GENPROT( scal2m ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( scalm ) -GENPROT( setm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( xpbym ) -GENPROT( xpbym_md ) - -// end bli_l1m_oapi.h - -// Prototype typed APIs (expert and non-expert). -// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_l1m_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( addm ) -INSERT_GENTPROT_BASIC0( copym ) -INSERT_GENTPROT_BASIC0( subm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpym ) -INSERT_GENTPROT_BASIC0( scal2m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( scalm ) -INSERT_GENTPROT_BASIC0( setm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( xpbym ) - - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - 
-INSERT_GENTPROT2_BASIC0( xpbym_md ) -INSERT_GENTPROT2_MIXDP0( xpbym_md ) - -// end bli_l1m_tapi.h -// begin bli_l1m_ft.h - - - -// -// -- Level-1v function types -------------------------------------------------- -// - -// addm, subm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( addm ) -INSERT_GENTDEF( subm ) - -// copym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( copym ) - -// axpym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpym ) - -// scal2m - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scal2m ) - -// scalm, setm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - 
dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scalm ) -INSERT_GENTDEF( setm ) - -// xpbym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( xpbym ) -INSERT_GENTDEF( xpbym_md ) - -// end bli_l1m_ft.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_l1m_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( addm ) -INSERT_GENTPROT_BASIC0( copym ) -INSERT_GENTPROT_BASIC0( subm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( axpym ) -INSERT_GENTPROT_BASIC0( scal2m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( scalm ) -INSERT_GENTPROT_BASIC0( setm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( xpbym ) - - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - 
-INSERT_GENTPROT2_BASIC0( xpbym_md ) -INSERT_GENTPROT2_MIXDP0( xpbym_md ) - -// end bli_l1m_tapi.h -// begin bli_l1m_ft.h - - - -// -// -- Level-1v function types -------------------------------------------------- -// - -// addm, subm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( addm ) -INSERT_GENTDEF( subm ) - -// copym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( copym ) - -// axpym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( axpym ) - -// scal2m - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scal2m ) - -// scalm, setm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - 
dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( scalm ) -INSERT_GENTDEF( setm ) - -// xpbym - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( xpbym ) -INSERT_GENTDEF( xpbym_md ) - -// end bli_l1m_ft.h - -// Generate function pointer arrays for tapi functions (expert only). -// begin bli_l1m_fpa.h - - -// -// Prototype function pointer query interface. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); - -GENPROT( addm ) -GENPROT( copym ) -GENPROT( subm ) -GENPROT( axpym ) -GENPROT( scal2m ) -GENPROT( scalm ) -GENPROT( setm ) -GENPROT( xpbym ) - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ); - -GENPROT( xpbym_md ) - -// end bli_l1m_fpa.h - -// Prototype level-1m implementations. -// begin bli_l1m_unb_var1.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( addm ) -INSERT_GENTPROT_BASIC0( copym ) -INSERT_GENTPROT_BASIC0( subm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( axpym ) -INSERT_GENTPROT_BASIC0( scal2m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( scalm ) -INSERT_GENTPROT_BASIC0( setm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( xpbym ) - - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ -\ -void PASTEMAC3(chx,chy,opname,_unb_var1) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - 
-INSERT_GENTPROT2_BASIC0( xpbym_md ) -INSERT_GENTPROT2_MIXDP0( xpbym_md ) - -// end bli_l1m_unb_var1.h - -// Pack-related -// begin bli_packm.h - - -// begin bli_packm_cntl.h - - -struct packm_params_s -{ - uint64_t size; // size field must be present and come first. - packm_var_oft var_func; - bszid_t bmid_m; - bszid_t bmid_n; - bool_t does_invert_diag; - bool_t rev_iter_if_upper; - bool_t rev_iter_if_lower; - pack_t pack_schema; - packbuf_t pack_buf_type; -}; -typedef struct packm_params_s packm_params_t; - -static packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func; -} - -static bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; -} - -static bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; -} - -static bool_t bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; -} - -static bool_t bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; -} - -static bool_t bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; -} - -static pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; -} - -static packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; -} - -// ----------------------------------------------------------------------------- - -cntl_t* bli_packm_cntl_create_node - ( - rntm_t* rntm, - void_fp var_func, - void_fp packm_var_func, - bszid_t bmid_m, - bszid_t 
bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type, - cntl_t* sub_node - ); - -// end bli_packm_cntl.h -// begin bli_packm_check.h - - -void bli_packm_init_check - ( - obj_t* a, - obj_t* p, - cntx_t* cntx - ); - -void bli_packm_int_check - ( - obj_t* a, - obj_t* p, - cntx_t* cntx - ); - -// end bli_packm_check.h -// begin bli_packm_init.h - - -siz_t bli_packm_init - ( - obj_t* a, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx - ); - -// end bli_packm_init.h -// begin bli_packm_int.h - - -void bli_packm_int - ( - obj_t* a, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); -// end bli_packm_int.h - -// begin bli_packm_part.h - - -// -- Matrix partitioning ------------------------------------------------------ - -void bli_packm_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_packm_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); - -// end bli_packm_part.h - -// begin bli_packm_var.h - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* c, \ - obj_t* p, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* t \ - ); - -GENPROT( packm_unb_var1 ) -GENPROT( packm_blk_var1 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool_t invdiag, \ - bool_t revifup, \ - bool_t reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( packm_blk_var1 ) - -// end bli_packm_var.h - -// begin bli_packm_struc_cxk.h - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_struc_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* 
restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_herm_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_tri_cxk ) - -// end bli_packm_struc_cxk.h -// begin bli_packm_struc_cxk_4mi.h - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_4mi ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t 
is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_4mi ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_4mi ) - -// end bli_packm_struc_cxk_4mi.h -// begin bli_packm_struc_cxk_3mis.h - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_3mis ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - 
-INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_3mis ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_3mis ) - -// end bli_packm_struc_cxk_3mis.h -// begin bli_packm_struc_cxk_rih.h - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_rih ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_rih ) - - - -#undef GENTPROTCO 
-#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_rih ) - -// end bli_packm_struc_cxk_rih.h -// begin bli_packm_struc_cxk_1er.h - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) 
\ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) - -// end bli_packm_struc_cxk_1er.h - -// begin bli_packm_cxk.h - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* kappa, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_cxk ) - -// end bli_packm_cxk.h -// begin bli_packm_cxk_4mi.h - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* kappa, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* p, inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_cxk_4mi ) - -// end bli_packm_cxk_4mi.h -// begin bli_packm_cxk_3mis.h - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* kappa, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* p, inc_t is_p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_cxk_3mis ) - -// end bli_packm_cxk_3mis.h -// begin bli_packm_cxk_rih.h - - - -#undef GENTPROTCO -#define GENTPROTCO( 
ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* kappa, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_cxk_rih ) - -// end bli_packm_cxk_rih.h -// begin bli_packm_cxk_1er.h - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t panel_dim, \ - dim_t panel_dim_max, \ - dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* kappa, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* p, inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) - -// end bli_packm_cxk_1er.h - -// Mixed datatype support. -#ifdef BLIS_ENABLE_GEMM_MD -// begin bli_packm_md.h - - -// begin bli_packm_blk_var1_md.h - - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ); - - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT2_BASIC0( packm_blk_var1_md ) -INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md ) - -// end bli_packm_blk_var1_md.h -// begin bli_packm_struc_cxk_md.h - - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype_p* restrict kappa, \ - ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, 
inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) -INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md ) - - -#undef GENTPROT2 -#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \ -\ -void PASTEMAC2(cha,chp,opname) \ - ( \ - conj_t conja, \ - dim_t m, \ - dim_t n, \ - ctype_p* restrict kappa, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_p* restrict p, inc_t ldp \ - ); - -INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md ) -INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md ) - -INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md ) -INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md ) - -// end bli_packm_struc_cxk_md.h - -// end bli_packm_md.h -#endif - -// end bli_packm.h -// begin bli_unpackm.h - - -// begin bli_unpackm_cntl.h - - -struct unpackm_params_s -{ - uint64_t size; // size field must be present and come first. - unpackm_var_oft var_func; -}; -typedef struct unpackm_params_s unpackm_params_t; - -#define bli_cntl_unpackm_params_var_func( cntl ) \ -\ - ( ( (unpackm_params_t*)(cntl)->params )->var_func ) - -// ----------------------------------------------------------------------------- - -cntl_t* bli_unpackm_cntl_create_node - ( - rntm_t* rntm, - void_fp var_func, - void_fp unpackm_var_func, - cntl_t* sub_node - ); - -// end bli_unpackm_cntl.h -// begin bli_unpackm_check.h - - -void bli_unpackm_int_check - ( - obj_t* p, - obj_t* a, - cntx_t* cntx - ); - -// end bli_unpackm_check.h -// begin bli_unpackm_int.h - - -void bli_unpackm_int - ( - obj_t* p, - obj_t* a, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - -// end bli_unpackm_int.h - -// begin bli_unpackm_unb_var1.h - - -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, 
inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( unpackm_unb_var1 ) - -// end bli_unpackm_unb_var1.h - -// begin bli_unpackm_blk_var1.h - - -void bli_unpackm_blk_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_panel, \ - dim_t n_panel, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( unpackm_blk_var1 ) - -// end bli_unpackm_blk_var1.h - -// begin bli_unpackm_cxk.h - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conjp, \ - dim_t panel_dim, \ - dim_t panel_len, \ - ctype* kappa, \ - ctype* p, inc_t ldp, \ - ctype* a, inc_t inca, inc_t lda, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( unpackm_cxk ) - -// end bli_unpackm_cxk.h -// end bli_unpackm.h - -// end bli_l1m.h - - -// -- Level-2 operations -- - -// begin bli_l2.h - - -// begin bli_l2_check.h - - - -// -// Prototype object-based check functions. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - ); - -GENPROT( gemv ) -GENPROT( hemv ) -GENPROT( symv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ - ); - -GENPROT( ger ) -GENPROT( her2 ) -GENPROT( syr2 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ - ); - -GENPROT( her ) -GENPROT( syr ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ - ); - -GENPROT( trmv ) -GENPROT( trsv ) - - -// ----------------------------------------------------------------------------- - -void bli_xxmv_check - ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y - ); - -void bli_xxr_check - ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a - ); -// end bli_l2_check.h - -// Define function types. 
-// begin bli_l2_ft_unb.h - - -#ifndef BLIS_L2_FT_UNB_H -#define BLIS_L2_FT_UNB_H - - -// -// -- Level-2 function types --------------------------------------------------- -// - -// gemv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ - ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( gemv ) - -// ger - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( ger ) - -// hemv (and symv) - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - conj_t conjh, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( hemv ) - -// her (and syr) - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjh, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEFR( her ) - -// her2 (and syr2) - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - conj_t conjh, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* 
cntx \ - ); - -INSERT_GENTDEF( her2 ) - -// trmv (and trsv) - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - cntx_t* cntx \ - ); - -INSERT_GENTDEF( trmv ) -INSERT_GENTDEF( trsv ) - - -#endif -// end bli_l2_ft_unb.h - -// Prototype object APIs (expert and non-expert). -// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_l2_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( gemv ) -GENPROT( hemv ) -GENPROT( symv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( ger ) -GENPROT( her2 ) -GENPROT( syr2 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( her ) -GENPROT( syr ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( trmv ) -GENPROT( trsv ) - -// end bli_l2_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_l2_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( gemv ) -GENPROT( hemv ) -GENPROT( symv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( ger ) -GENPROT( her2 ) -GENPROT( syr2 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( her ) -GENPROT( syr ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( trmv ) -GENPROT( trsv ) - -// end bli_l2_oapi.h - -// Prototype typed APIs (expert and non-expert). -// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_l2_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( gemv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( ger ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( hemv ) -INSERT_GENTPROT_BASIC0( symv ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( her ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( syr ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, 
\ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( her2 ) -INSERT_GENTPROT_BASIC0( syr2 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( trmv ) -INSERT_GENTPROT_BASIC0( trsv ) -// end bli_l2_tapi.h -// begin bli_l2_ft.h - - - -// -// -- Level-2 function types --------------------------------------------------- -// - -// gemv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( gemv ) - -// ger - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( ger ) - -// hemv, symv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( hemv ) -INSERT_GENTDEF( symv ) - -// her - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t 
uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( her ) - -// syr - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( syr ) - -// her2, syr2 - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( her2 ) -INSERT_GENTDEF( syr2 ) - -// trmv, trsv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( trmv ) -INSERT_GENTDEF( trsv ) - -// end bli_l2_ft.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. 
-#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_l2_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( gemv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( ger ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( hemv ) -INSERT_GENTPROT_BASIC0( symv ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( her ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - 
conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( syr ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( her2 ) -INSERT_GENTPROT_BASIC0( syr2 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( trmv ) -INSERT_GENTPROT_BASIC0( trsv ) -// end bli_l2_tapi.h -// begin bli_l2_ft.h - - - -// -// -- Level-2 function types --------------------------------------------------- -// - -// gemv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( gemv ) - -// ger - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( ger ) - -// hemv, symv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t 
conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( hemv ) -INSERT_GENTDEF( symv ) - -// her - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( her ) - -// syr - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( syr ) - -// her2, syr2 - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( her2 ) -INSERT_GENTDEF( syr2 ) - -// trmv, trsv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( trmv ) -INSERT_GENTDEF( trsv ) - -// end bli_l2_ft.h - -// Generate function pointer arrays for tapi functions (expert only). -// begin bli_l2_fpa.h - - -// -// Prototype function pointer query interface. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); - -GENPROT( gemv ) -GENPROT( ger ) -GENPROT( hemv ) -GENPROT( symv ) -GENPROT( her ) -GENPROT( syr ) -GENPROT( her2 ) -GENPROT( syr2 ) -GENPROT( trmv ) -GENPROT( trsv ) - -// -// Prototype function pointer query interfaces for level-2 implementations. -// - -#undef GENPROT -#define GENPROT( opname, varname ) \ -\ -PASTECH2(opname,_unb,_vft) \ -PASTEMAC(varname,_qfp)( num_t dt ); - -GENPROT( gemv, gemv_unb_var1 ) -GENPROT( gemv, gemv_unb_var2 ) -GENPROT( gemv, gemv_unf_var1 ) -GENPROT( gemv, gemv_unf_var2 ) - -GENPROT( ger, ger_unb_var1 ) -GENPROT( ger, ger_unb_var2 ) - -GENPROT( hemv, hemv_unb_var1 ) -GENPROT( hemv, hemv_unb_var2 ) -GENPROT( hemv, hemv_unb_var3 ) -GENPROT( hemv, hemv_unb_var4 ) -GENPROT( hemv, hemv_unf_var1 ) -GENPROT( hemv, hemv_unf_var3 ) -GENPROT( hemv, hemv_unf_var1a ) -GENPROT( hemv, hemv_unf_var3a ) - -GENPROT( her, her_unb_var1 ) -GENPROT( her, her_unb_var2 ) - -GENPROT( her2, her2_unb_var1 ) -GENPROT( her2, her2_unb_var2 ) -GENPROT( her2, her2_unb_var3 ) -GENPROT( her2, her2_unb_var4 ) -GENPROT( her2, her2_unf_var1 ) -GENPROT( her2, her2_unf_var4 ) - -GENPROT( trmv, trmv_unb_var1 ) -GENPROT( trmv, trmv_unb_var2 ) -GENPROT( trmv, trmv_unf_var1 ) -GENPROT( trmv, trmv_unf_var2 ) - -GENPROT( trsv, trsv_unb_var1 ) -GENPROT( trsv, trsv_unb_var2 ) -GENPROT( trsv, trsv_unf_var1 ) -GENPROT( trsv, trsv_unf_var2 ) - -// end bli_l2_fpa.h - -// Operation-specific headers -// begin bli_gemv.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_gemv_cntl.h" -//#include "bli_gemv_front.h" -//#include "bli_gemv_int.h" - -// begin bli_gemv_var.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( gemv_blk_var1 ) -GENPROT( gemv_blk_var2 ) - -GENPROT( gemv_unb_var1 ) -GENPROT( gemv_unb_var2 ) - -GENPROT( gemv_unf_var1 ) -GENPROT( gemv_unf_var2 ) - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) -INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) - -INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) -INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) - -// end bli_gemv_var.h - -// end bli_gemv.h -// begin bli_ger.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_ger_cntl.h" -//#include "bli_ger_front.h" -//#include "bli_ger_int.h" - -// begin bli_ger_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( ger_blk_var1 ) -GENPROT( ger_blk_var2 ) - -GENPROT( ger_unb_var1 ) -GENPROT( ger_unb_var2 ) - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( ger_unb_var1 ) -INSERT_GENTPROT_BASIC0( ger_unb_var2 ) - -// end bli_ger_var.h -// end bli_ger.h -// begin bli_hemv.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_hemv_cntl.h" -//#include "bli_hemv_front.h" -//#include "bli_hemv_int.h" - -// begin bli_hemv_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - conj_t conjh, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( hemv_blk_var1 ) -GENPROT( hemv_blk_var2 ) -GENPROT( hemv_blk_var3 ) -GENPROT( hemv_blk_var4 ) - -GENPROT( hemv_unb_var1 ) -GENPROT( hemv_unb_var2 ) -GENPROT( hemv_unb_var3 ) -GENPROT( hemv_unb_var4 ) - -GENPROT( hemv_unf_var1 ) -GENPROT( hemv_unf_var3 ) -GENPROT( hemv_unf_var1a ) -GENPROT( hemv_unf_var3a ) - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - uplo_t uplo, \ - conj_t conja, \ - conj_t conjx, \ - conj_t conjh, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) -INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) -INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) -INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) - -INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) -INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) -INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) -INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) - -// end bli_hemv_var.h - -// end bli_hemv.h -// begin bli_her.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_her_cntl.h" -//#include "bli_her_front.h" -//#include "bli_her_int.h" - -// begin bli_her_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - conj_t conjh, \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* c, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( her_blk_var1 ) -GENPROT( her_blk_var2 ) - -GENPROT( her_unb_var1 ) -GENPROT( her_unb_var2 ) - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - uplo_t uplo, \ - conj_t conjx, \ - conj_t conjh, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTR_BASIC0( her_unb_var1 ) -INSERT_GENTPROTR_BASIC0( her_unb_var2 ) - -// end bli_her_var.h -// end bli_her.h -// begin bli_her2.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_her2_cntl.h" -//#include "bli_her2_front.h" -//#include "bli_her2_int.h" - -// begin bli_her2_var.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - conj_t conjh, \ - obj_t* alpha, \ - obj_t* alpha_conj, \ - obj_t* x, \ - obj_t* y, \ - obj_t* c, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( her2_blk_var1 ) -GENPROT( her2_blk_var2 ) -GENPROT( her2_blk_var3 ) -GENPROT( her2_blk_var4 ) - -GENPROT( her2_unb_var1 ) -GENPROT( her2_unb_var2 ) -GENPROT( her2_unb_var3 ) -GENPROT( her2_unb_var4 ) - -GENPROT( her2_unf_var1 ) -GENPROT( her2_unf_var4 ) - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - uplo_t uplo, \ - conj_t conjx, \ - conj_t conjy, \ - conj_t conjh, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( her2_unb_var1 ) -INSERT_GENTPROT_BASIC0( her2_unb_var2 ) -INSERT_GENTPROT_BASIC0( her2_unb_var3 ) -INSERT_GENTPROT_BASIC0( her2_unb_var4 ) - -INSERT_GENTPROT_BASIC0( her2_unf_var1 ) -INSERT_GENTPROT_BASIC0( her2_unf_var4 ) - -// end bli_her2_var.h -// end bli_her2.h -// begin bli_symv.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_symv_front.h" - -// end bli_symv.h -// begin bli_syr.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_syr_front.h" - -// end bli_syr.h -// begin bli_syr2.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_syr2_front.h" - -// end bli_syr2.h -// begin bli_trmv.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_trmv_cntl.h" -//#include "bli_trmv_front.h" -//#include "bli_trmv_int.h" - -// begin bli_trmv_var.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( trmv_l_blk_var1 ) -GENPROT( trmv_l_blk_var2 ) -GENPROT( trmv_u_blk_var1 ) -GENPROT( trmv_u_blk_var2 ) - -GENPROT( trmv_unb_var1 ) -GENPROT( trmv_unb_var2 ) - -GENPROT( trmv_unf_var1 ) -GENPROT( trmv_unf_var2 ) - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) -INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) - -INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) -INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) - -// end bli_trmv_var.h - -// end bli_trmv.h -// begin bli_trsv.h - - -// NOTE: level-2 control tree code is temporarily disabled. -//#include "bli_trsv_cntl.h" -//#include "bli_trsv_front.h" -//#include "bli_trsv_int.h" - -// begin bli_trsv_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - cntx_t* cntx, \ - cntl_t* cntl \ - ); - -GENPROT( trsv_l_blk_var1 ) -GENPROT( trsv_l_blk_var2 ) -GENPROT( trsv_u_blk_var1 ) -GENPROT( trsv_u_blk_var2 ) - -GENPROT( trsv_unb_var1 ) -GENPROT( trsv_unb_var2 ) - -GENPROT( trsv_unf_var1 ) -GENPROT( trsv_unf_var2 ) - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) -INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) - -INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) -INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) - -// end bli_trsv_var.h - -// end bli_trsv.h - -// end bli_l2.h - - -// -- Level-3 operations -- - -// begin bli_l3.h - - -// begin bli_l3_cntl.h - - - -// -// Prototype conditional control tree creation functions. -// - -void bli_l3_cntl_create_if - ( - opid_t family, - pack_t schema_a, - pack_t schema_b, - obj_t* a, - obj_t* b, - obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use - ); - -void bli_l3_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread - ); - -// end bli_l3_cntl.h -// begin bli_l3_check.h - - - -// -// Prototype object-based check functions. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ - ); - -GENPROT( gemm ) -GENPROT( her2k ) -GENPROT( syr2k ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ - ); - -GENPROT( hemm ) -GENPROT( symm ) -GENPROT( trmm ) -GENPROT( trsm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ - ); - -GENPROT( herk ) -GENPROT( syrk ) - - -// ----------------------------------------------------------------------------- - -void bli_gemm_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); - -void bli_hemm_basic_check - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); - -void bli_herk_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); - -void bli_her2k_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); - -void bli_l3_basic_check - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx - ); -// end bli_l3_check.h - -// Define function types. 
-// begin bli_l3_ft_ex.h - - -#ifndef BLIS_L3_FT_EX_H -#define BLIS_L3_FT_EX_H - - -// -// -- Level-3 expert function types -------------------------------------------- -// - -// gemm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEF( gemm ) - - -// hemm, symm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEF( hemm ) -INSERT_GENTDEF( symm ) - - -// herk - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEFR( herk ) - - -// her2k - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEFR( her2k ) - - -// syrk - 
-#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEFR( syrk ) - - -// syr2k - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEF( syr2k ) - - -// trmm3 - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEF( trmm3 ) - - -// trmm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTDEF( trmm ) -INSERT_GENTDEF( trsm ) - - -#endif - -// end bli_l3_ft_ex.h -// begin bli_l3_ft_ukr.h - - -#ifndef BLIS_L3_FT_UKR_H -#define BLIS_L3_FT_UKR_H - - -// -// -- Level-3 micro-kernel function types -------------------------------------- -// - -// gemm - -#undef GENTDEF 
-#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( gemm ) - - -// gemmtrsm_[lu] - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( gemmtrsm ) - - -// trsm_[lu] - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( trsm ) - - -#endif - -// end bli_l3_ft_ukr.h -// begin bli_l3_oft.h - - -#ifndef BLIS_L3_OFT_H -#define BLIS_L3_OFT_H - - -// -// -- Level-3 object function types -------------------------------------------- -// - -// gemm - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_oft)) \ -( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ -); - -GENTDEF( gemm ) -GENTDEF( gemmt ) -GENTDEF( her2k ) -GENTDEF( syr2k ) - - -// hemm, symm, trmm3 - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_oft)) \ -( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ -); - -GENTDEF( hemm ) -GENTDEF( symm ) -GENTDEF( trmm3 ) - - -// herk, syrk - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void 
(*PASTECH(opname,_oft)) \ -( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ -); - -GENTDEF( herk ) -GENTDEF( syrk ) - - -// trmm, trsm - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_oft)) \ -( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx, \ - rntm_t* rntm \ -); - -GENTDEF( trmm ) -GENTDEF( trsm ) - - - -#endif - -// end bli_l3_oft.h -// begin bli_l3_oft_var.h - - -#ifndef BLIS_L3_OFT_VAR_H -#define BLIS_L3_OFT_VAR_H - - -// -// -- Level-3 variant function types ------------------------------------------- -// - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( gemm ) - - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( trsm ) - - - -#endif - -// end bli_l3_oft_var.h - -// begin bli_l3_blocksize.h - - -dim_t bli_l3_determine_kc - ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx, - cntl_t* cntl - ); - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ - ); - -GENPROT( gemm_determine_kc ) -GENPROT( herk_determine_kc ) -GENPROT( trmm_determine_kc ) -GENPROT( trsm_determine_kc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ - ); - -GENPROT( gemm_determine_kc_f ) -GENPROT( gemm_determine_kc_b ) - -GENPROT( herk_determine_kc_f ) -GENPROT( herk_determine_kc_b ) - -GENPROT( 
trmm_determine_kc_f ) -GENPROT( trmm_determine_kc_b ) - -GENPROT( trsm_determine_kc_f ) -GENPROT( trsm_determine_kc_b ) - -// end bli_l3_blocksize.h -// begin bli_l3_direct.h - - -dir_t bli_l3_direct - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl - ); - -// ----------------------------------------------------------------------------- - -#undef GENPROT -#define GENPROT( opname ) \ -\ -dir_t PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ); - -GENPROT( gemm_direct ) -GENPROT( herk_direct ) -GENPROT( trmm_direct ) -GENPROT( trsm_direct ) - -// end bli_l3_direct.h -// begin bli_l3_prune.h - - - -#undef GENPROT -#define GENPROT( dim ) \ -\ -void PASTEMAC(l3_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntl_t* cntl \ - ); - -GENPROT( m ) -GENPROT( n ) -GENPROT( k ) - -// ----------------------------------------------------------------------------- - -#undef GENPROT -#define GENPROT( opname, dim ) \ -\ -void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ); - -GENPROT( gemm, m ) -GENPROT( gemm, n ) -GENPROT( gemm, k ) - -GENPROT( herk, m ) -GENPROT( herk, n ) -GENPROT( herk, k ) - -GENPROT( trmm, m ) -GENPROT( trmm, n ) -GENPROT( trmm, k ) - -GENPROT( trsm, m ) -GENPROT( trsm, n ) -GENPROT( trsm, k ) - -// end bli_l3_prune.h -// begin bli_l3_packm.h - - -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// end bli_l3_packm.h - -// Prototype object APIs (expert and non-expert). -// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. 
-#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_l3_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( gemm ) -GENPROT( gemmt ) -GENPROT( her2k ) -GENPROT( syr2k ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( hemm ) -GENPROT( symm ) -GENPROT( trmm3 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( herk ) -GENPROT( syrk ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( trmm ) -GENPROT( trsm ) - -// end bli_l3_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. 
-#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_l3_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( gemm ) -GENPROT( gemmt ) -GENPROT( her2k ) -GENPROT( syr2k ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( hemm ) -GENPROT( symm ) -GENPROT( trmm3 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( herk ) -GENPROT( syrk ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( trmm ) -GENPROT( trsm ) - -// end bli_l3_oapi.h - -// Prototype typed APIs (expert and non-expert). -// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). 
-#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_l3_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( gemm ) -INSERT_GENTPROT_BASIC0( gemmt ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( hemm ) -INSERT_GENTPROT_BASIC0( symm ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( herk ) - - -#undef 
GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( her2k ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( syrk ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( syr2k ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( trmm3 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b \ - 
BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( trmm ) -INSERT_GENTPROT_BASIC0( trsm ) - -// end bli_l3_tapi.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_l3_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( gemm ) -INSERT_GENTPROT_BASIC0( gemmt ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( hemm ) -INSERT_GENTPROT_BASIC0( symm ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( herk ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( her2k ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - 
ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( syrk ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( syr2k ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( trmm3 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( trmm ) -INSERT_GENTPROT_BASIC0( trsm ) - -// end bli_l3_tapi.h - -// Define function types for small/unpacked handlers/kernels. 
-// begin bli_l3_sup_oft.h - - -#ifndef BLIS_L3_SUP_OFT_H -#define BLIS_L3_SUP_OFT_H - - -// -// -- Level-3 small/unpacked object function types ----------------------------- -// - -// gemm - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef err_t (*PASTECH(opname,_oft)) \ -( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ -); - -GENTDEF( gemmsup ) -GENTDEF( gemmtsup ) -#endif - -// end bli_l3_sup_oft.h -// begin bli_l3_sup_ft_ker.h - - -#ifndef BLIS_L3_SUP_FT_KER_H -#define BLIS_L3_SUP_FT_KER_H - - -// -// -- Level-3 small/unpacked kernel function types ----------------------------- -// - -// gemmsup - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ); - -INSERT_GENTDEF( gemmsup ) - - -#endif - -// end bli_l3_sup_ft_ker.h - -// Define static edge case logic for use in small/unpacked kernels. -//#include "bli_l3_sup_edge.h" - -// Prototype object API to small/unpacked matrix dispatcher. -// begin bli_l3_sup.h - - -err_t bli_gemmsup - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -err_t bli_gemmtsup - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -// end bli_l3_sup.h - -// Prototype reference implementation of small/unpacked matrix handler. 
-// begin bli_l3_sup_ref.h - - -err_t bli_gemmsup_ref - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -err_t bli_gemmtsup_ref - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -// end bli_l3_sup_ref.h -// begin bli_l3_sup_int.h - - -err_t bli_gemmsup_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -err_t bli_gemmtsup_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); -// end bli_l3_sup_int.h -// begin bli_l3_sup_vars.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - trans_t trans, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -GENPROT( gemmsup_ref_var1 ) -GENPROT( gemmsup_ref_var2 ) - -GENPROT( gemmsup_ref_var1n ) -GENPROT( gemmsup_ref_var2m ) - - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) -INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool_t packa, \ - bool_t packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) -INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) - -// ----------------------------------------------------------------------------- - -static void bli_gemmsup_ref_var1n2m_opt_cases - ( - num_t dt, - trans_t* trans, - bool_t packa, - bool_t packb, - stor3_t* eff_id, - cntx_t* cntx - ) -{ - const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); - - // Handle row- and column-preferrential kernels separately. - if ( row_pref ) - { - if ( packa && packb ) - { - if ( *eff_id == BLIS_RRC ) - { - // Since C is already row-stored, we can use BLIS_RRR kernel instead. - *eff_id = BLIS_RRR; - } - else if ( *eff_id == BLIS_CRC ) - { - // BLIS_RRC when transposed below (both matrices still packed). - // This allows us to use the BLIS_RRR kernel instead. - *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. 
- } - else if ( *eff_id == BLIS_CRR ) - { - // Induce a transpose to make C row-stored. - // BLIS_RCC when transposed below (both matrices still packed). - // This allows us to use the BLIS_RRR kernel instead. - *trans = bli_trans_toggled( *trans ); - *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. - } - } - else if ( packb ) - { - if ( *eff_id == BLIS_RRC ) - { - // Since C is already row-stored, we can use BLIS_RRR kernel instead. - *eff_id = BLIS_RRR; - } - else if ( *eff_id == BLIS_CRC ) - { - // BLIS_RRC when transposed below (with packa instead of packb). - // No transformation is beneficial here. - } - else if ( *eff_id == BLIS_RCC ) - { - // C is already row-stored; cancel transposition and use BLIS_RCR - // kernel instead. - *trans = bli_trans_toggled( *trans ); - *eff_id = BLIS_RCR; - } - #if 0 - // This transformation performs poorly. Theory: packing A (formerly B) - // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow - // and kills the performance? - else if ( eff_id == BLIS_CRR ) - { - trans = bli_trans_toggled( trans ); - eff_id = BLIS_CRC; // BLIS_RRC when transposed below. - } - #endif - } - else if ( packa ) - { - if ( *eff_id == BLIS_CRR ) - { - // Induce a transpose to make C row-stored. - // BLIS_RCC when transposed below (both matrices still packed). - // This allows us to use the BLIS_RRR kernel instead. - *trans = bli_trans_toggled( *trans ); - *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
- } - } - } - else - { - //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); - printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); - bli_abort(); - } -} - -// end bli_l3_sup_vars.h -// begin bli_l3_sup_packm_a.h - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t will_pack, \ - stor3_t stor_id, \ - pack_t* restrict schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* restrict m_max, \ - dim_t* restrict k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - 
-INSERT_GENTPROT_BASIC0( packm_sup_a ) - -// end bli_l3_sup_packm_a.h -// begin bli_l3_sup_packm_b.h - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t will_pack, \ - stor3_t stor_id, \ - pack_t* restrict schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* restrict k_max, \ - dim_t* restrict n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool_t will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* restrict kappa, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_b ) - -// end bli_l3_sup_packm_b.h -// begin bli_l3_sup_packm_var.h - - -// -// Prototype BLAS-like interfaces to the variants. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ - ); - -INSERT_GENTPROT_BASIC0( packm_sup_var1 ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ - ); - -INSERT_GENTPROT_BASIC0( packm_sup_var2 ) - -// end bli_l3_sup_packm_var.h - -// Prototype microkernel wrapper APIs. -// begin bli_l3_ukr_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ - ); - -GENPROT( gemm_ukernel ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* alpha, \ - obj_t* a1x, \ - obj_t* a11, \ - obj_t* bx1, \ - obj_t* b11, \ - obj_t* c11, \ - cntx_t* cntx \ - ); - -GENPROT( gemmtrsm_ukernel ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx \ - ); - -GENPROT( trsm_ukernel ) - -// end bli_l3_ukr_oapi.h -// begin bli_l3_ukr_tapi.h - - - -// -// Generate prototypes for level-3 micro-kernel wrappers. 
-// - -#undef gemm_ukr_name -#define gemm_ukr_name gemm_ukernel - -#undef gemmtrsm_l_ukr_name -#define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel -#undef gemmtrsm_u_ukr_name -#define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel - -#undef trsm_l_ukr_name -#define trsm_l_ukr_name trsm_l_ukernel -#undef trsm_u_ukr_name -#define trsm_u_ukr_name trsm_u_ukernel - -// Include the level-3 micro-kernel API template. - -// begin bli_l3_ukr.h - - -// -// Define template prototypes for level-3 micro-kernels. -// - -// Note: Instead of defining function prototype macro templates and then -// instantiating those macros to define the individual function prototypes, -// we simply alias the official operations' prototypes as defined in -// bli_l3_ukr_prot.h. - -#undef GENTPROT -#define GENTPROT GEMM_UKR_PROT - -INSERT_GENTPROT_BASIC0( gemm_ukr_name ) - - -#undef GENTPROT -#define GENTPROT GEMMTRSM_UKR_PROT - -INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) -INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) - - -#undef GENTPROT -#define GENTPROT TRSM_UKR_PROT - -INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) -INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) - -// end bli_l3_ukr.h - -// end bli_l3_ukr_tapi.h - -// Generate function pointer arrays for tapi microkernel functions. -// begin bli_l3_ukr_fpa.h - - -// -// Prototype function pointer query interface. -// - -#undef GENPROT -#define GENPROT( tname, opname ) \ -\ -PASTECH2(tname,_ukr,_vft) \ -PASTEMAC(opname,_qfp)( num_t dt ); - -GENPROT( gemm, gemm_ukernel ) -GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) -GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) -GENPROT( trsm, trsm_l_ukernel ) -GENPROT( trsm, trsm_u_ukernel ) - -// end bli_l3_ukr_fpa.h - -// Operation-specific headers. 
-// begin bli_gemm.h - - -// begin bli_gemm_cntl.h - - -cntl_t* bli_gemm_cntl_create - ( - rntm_t* rntm, - opid_t family, - pack_t schema_a, - pack_t schema_b - ); - -// ----------------------------------------------------------------------------- - -cntl_t* bli_gemmbp_cntl_create - ( - rntm_t* rntm, - opid_t family, - pack_t schema_a, - pack_t schema_b - ); - -#if 0 -cntl_t* bli_gemmpb_cntl_create - ( - opid_t family, - ); -#endif - -// ----------------------------------------------------------------------------- - -void bli_gemm_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// ----------------------------------------------------------------------------- - -cntl_t* bli_gemm_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - cntl_t* sub_node - ); - -// end bli_gemm_cntl.h -// begin bli_gemm_front.h - - -void bli_gemm_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX -err_t bli_gemm_small - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl - ); -#endif - -// end bli_gemm_front.h -// begin bli_gemm_int.h - - -void bli_gemm_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// end bli_gemm_int.h - -// begin bli_gemm_var.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -GENPROT( gemm_blk_var1 ) -GENPROT( gemm_blk_var2 ) -GENPROT( gemm_blk_var3 ) -GENPROT( gemm_packa ) -GENPROT( gemm_packb ) - -GENPROT( gemm_ker_var1 ) - -GENPROT( gemm_ker_var2 ) - -// Headers for induced algorithms: -GENPROT( gemm4mb_ker_var2 ) // 4m1b - - -// -// Prototype BLAS-like interfaces with void pointer operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemm_ker_var2 ) - -// Headers for induced algorithms: -INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b - -// end bli_gemm_var.h - -// begin bli_gemm_ind_opt.h - - -static void bli_gemm_ind_recast_1m_params - ( - num_t* dt_exec, - pack_t schema_a, - obj_t* c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - inc_t* rs_c, inc_t* cs_c - ) -{ - obj_t beta; - - - bli_obj_scalar_detach( c, &beta ); - - - if ( bli_obj_imag_is_zero( &beta ) && - !bli_is_gen_stored( *rs_c, *cs_c ) ) - { - *dt_exec = bli_dt_proj_to_real( *dt_exec ); - - if ( bli_is_1e_packed( schema_a ) ) - { - *m *= 2; - *n *= 1; - *k *= 2; - *pd_a *= 2; *ps_a *= 2; - *pd_b *= 1; *ps_b *= 2; - *rs_c *= 1; *cs_c *= 2; - } - else - { - *m *= 1; - *n *= 2; - *k *= 2; - *pd_a *= 1; *ps_a *= 2; - *pd_b *= 2; *ps_b *= 2; - *rs_c *= 2; *cs_c *= 1; - } - } -} - -// end bli_gemm_ind_opt.h - -// Mixed datatype support. 
-#ifdef BLIS_ENABLE_GEMM_MD -// begin bli_gemm_md.h - - -// begin bli_gemm_md_c2r_ref.h - - -// -- Level-3 native micro-kernel prototype redefinitions ---------------------- - -#undef gemm_ukr_name -#define gemm_ukr_name gemm_md_c2r_ref - -// Include the native micro-kernel API template. -// begin bli_l3_ukr.h - - -// -// Define template prototypes for level-3 micro-kernels. -// - -// Note: Instead of defining function prototype macro templates and then -// instantiating those macros to define the individual function prototypes, -// we simply alias the official operations' prototypes as defined in -// bli_l3_ukr_prot.h. - -#undef GENTPROT -#define GENTPROT GEMM_UKR_PROT - -INSERT_GENTPROT_BASIC0( gemm_ukr_name ) - - -#undef GENTPROT -#define GENTPROT GEMMTRSM_UKR_PROT - -INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) -INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) - - -#undef GENTPROT -#define GENTPROT TRSM_UKR_PROT - -INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) -INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) - -// end bli_l3_ukr.h -// end bli_gemm_md_c2r_ref.h - -// Define a local struct type that makes returning two values easier. 
-typedef struct mddm_s -{ - dom_t comp; - dom_t exec; -} mddm_t; - -void bli_gemm_md - ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx - ); -mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); - -// ----------------------------------------------------------------------------- - -void bli_gemm_md_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -void bli_gemm_md_zgemm - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -// ----------------------------------------------------------------------------- - -static bool_t bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) -{ - bool_t r_val = FALSE; - - // NOTE: The last conditional subexpression is necessary if/when we - // allow the user to specify the computation domain. (The computation - // domain is currently ignored, but once it is honored as a user- - // settable value, it will affect the execution domain, which is what - // is checked below. 
Until then, the last expression is not actually - // necessary since crr is already unconditionally associated with an - // execution domain of BLIS_REAL.) - if ( bli_obj_is_complex( c ) && - bli_obj_is_real( a ) && - bli_obj_is_real( b ) && - bli_obj_exec_domain( c ) == BLIS_REAL ) - r_val = TRUE; - - return r_val; -} - -static bool_t bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) -{ - bool_t r_val = FALSE; - - // NOTE: The last conditional subexpression is necessary if/when we - // allow the user to specify the computation domain. (The computation - // domain is currently ignored, but once it is honored as a user- - // settable value, it will affect the execution domain, which is what - // is checked below. Until then, the last expression is not actually - // necessary since ccr is already unconditionally associated with an - // execution domain of BLIS_COMPLEX.) - if ( bli_obj_is_complex( c ) && - bli_obj_is_complex( a ) && - bli_obj_is_real( b ) && - bli_obj_exec_domain( c ) == BLIS_COMPLEX ) - r_val = TRUE; - - return r_val; -} - -static bool_t bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) -{ - bool_t r_val = FALSE; - - // NOTE: The last conditional subexpression is necessary if/when we - // allow the user to specify the computation domain. (The computation - // domain is currently ignored, but once it is honored as a user- - // settable value, it will affect the execution domain, which is what - // is checked below. Until then, the last expression is not actually - // necessary since crc is already unconditionally associated with an - // execution domain of BLIS_COMPLEX.) 
- if ( bli_obj_is_complex( c ) && - bli_obj_is_real( a ) && - bli_obj_is_complex( b ) && - bli_obj_exec_domain( c ) == BLIS_COMPLEX ) - r_val = TRUE; - - return r_val; -} - -// ----------------------------------------------------------------------------- - -static void bli_gemm_md_ker_var2_recast - ( - num_t* dt_comp, - num_t dt_a, - num_t dt_b, - num_t dt_c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - obj_t* c, - inc_t* rs_c, inc_t* cs_c - ) -{ - if ( bli_is_real( dt_c ) && - bli_is_complex( dt_a ) && - bli_is_complex( dt_b ) ) - { - // The rcc case is executed with a real macrokernel, so we need to - // double the k dimension (because both A and B are packed to the 1r - // schema), and also the panel strides of A and B since they were - // packed as complex matrices and we now need to convert them to - // units of real elements. - *k *= 2; - *ps_a *= 2; - *ps_b *= 2; - } - else if ( bli_is_complex( dt_c ) && - bli_is_real( dt_a ) && - bli_is_complex( dt_b ) ) - { -#if 1 - obj_t beta; - - bli_obj_scalar_detach( c, &beta ); - - if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && - bli_obj_imag_is_zero( &beta ) && - bli_is_row_stored( *rs_c, *cs_c ) && - bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) - { - // If beta is real, and C is not general-stored, and the computation - // precision is equal to the storage precision of C, we can use the - // real macrokernel (and real microkernel, which is already stored - // to the real virtual microkernel slots of the context) instead of - // the complex macrokernel and c2r virtual microkernel. - *dt_comp = bli_dt_proj_to_real( *dt_comp ); - *n *= 2; - *pd_b *= 2; *ps_b *= 2; - *rs_c *= 2; - } - else -#endif - { - // Generally speaking, the crc case is executed with a complex - // macrokernel, so we need to halve the panel stride of A (which - // is real) since the macrokernel will perform the pointer - // arithmetic in units of complex elements. 
- *ps_a /= 2; - } - } - else if ( bli_is_complex( dt_c ) && - bli_is_complex( dt_a ) && - bli_is_real( dt_b ) ) - { -#if 1 - obj_t beta; - - bli_obj_scalar_detach( c, &beta ); - - if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && - bli_obj_imag_is_zero( &beta ) && - bli_is_col_stored( *rs_c, *cs_c ) && - bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) - { - // If beta is real, and C is not general-stored, and the computation - // precision is equal to the storage precision of C, we can use the - // real macrokernel (and real microkernel, which is already stored - // to the real virtual microkernel slots of the context) instead of - // the complex macrokernel and c2r virtual microkernel. - *dt_comp = bli_dt_proj_to_real( *dt_comp ); - *m *= 2; - *pd_a *= 2; *ps_a *= 2; - *cs_c *= 2; - } - else -#endif - { - // Generally speaking, the ccr case is executed with a complex - // macrokernel, so we need to halve the panel stride of B (which - // is real) since the macrokernel will perform the pointer - // arithmetic in units of complex elements. - *ps_b /= 2; - } - } -#if 0 - else if ( bli_is_real( dt_c ) && - bli_is_real( dt_a ) && - bli_is_real( dt_b ) ) - { - // No action needed. -//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); - } - else if ( bli_is_complex( dt_c ) && - bli_is_real( dt_a ) && - bli_is_real( dt_b ) ) - { - // No action needed. - } - else if ( bli_is_real( dt_c ) && - bli_is_complex( dt_a ) && - bli_is_real( dt_b ) ) - { - // No action needed. - } - else if ( bli_is_real( dt_c ) && - bli_is_real( dt_a ) && - bli_is_complex( dt_b ) ) - { - // No action needed. - } -#endif -} - -// ----------------------------------------------------------------------------- - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -GENPROT( gemm_ker_var2_md ) - -// -// Prototype BLAS-like interfaces with void pointer operands. -// - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_e, chc, che, varname ) \ -\ -void PASTEMAC2(chc,che,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT2_BASIC0( gemm_ker_var2_md ) -INSERT_GENTPROT2_MIXDP0( gemm_ker_var2_md ) - -// end bli_gemm_md.h -#endif -// end bli_gemm.h -// begin bli_hemm.h - - -// begin bli_hemm_front.h - - -void bli_hemm_front - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_hemm_front.h - -// end bli_hemm.h -// begin bli_herk.h - - -// begin bli_herk_front.h - - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_herk_front.h - -// begin bli_herk_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -//GENPROT( herk_blk_var1 ) -//GENPROT( herk_blk_var2 ) -//GENPROT( herk_blk_var3 ) - -GENPROT( herk_x_ker_var2 ) - -GENPROT( herk_l_ker_var2 ) -GENPROT( herk_u_ker_var2 ) -//GENPROT( herk_packa ) -//GENPROT( herk_packb ) - - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( herk_l_ker_var2 ) -INSERT_GENTPROT_BASIC0( herk_u_ker_var2 ) - -// end bli_herk_var.h - -// end bli_herk.h -// begin bli_her2k.h - - -// begin bli_her2k_front.h - - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_her2k_front.h - -// end bli_her2k.h -// begin bli_symm.h - - -// begin bli_symm_front.h - - -void bli_symm_front - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_symm_front.h - -// end bli_symm.h -// begin bli_syrk.h - - -// begin bli_syrk_front.h - - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX -err_t bli_syrk_small - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl - ); -#endif - -// end bli_syrk_front.h - -// end bli_syrk.h -// begin bli_syr2k.h - - -// begin bli_syr2k_front.h - - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_syr2k_front.h - -// end bli_syr2k.h -// begin bli_trmm.h - - -// begin bli_trmm_front.h - - -void bli_trmm_front - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_trmm_front.h - -// begin 
bli_trmm_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -//GENPROT( trmm_blk_var1 ) -//GENPROT( trmm_blk_var2 ) -//GENPROT( trmm_blk_var3 ) - -GENPROT( trmm_xx_ker_var2 ) - -GENPROT( trmm_ll_ker_var2 ) -GENPROT( trmm_lu_ker_var2 ) -GENPROT( trmm_rl_ker_var2 ) -GENPROT( trmm_ru_ker_var2 ) - - -// -// Prototype BLAS-like interfaces with void pointer operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoff, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) - -// end bli_trmm_var.h - -// end bli_trmm.h -// begin bli_trmm3.h - - -// begin bli_trmm3_front.h - - -void bli_trmm3_front - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); -// end bli_trmm3_front.h - -// end bli_trmm3.h -// begin bli_trsm.h - - -// begin bli_trsm_cntl.h - - -cntl_t* bli_trsm_cntl_create - ( - rntm_t* rntm, - side_t side, - pack_t schema_a, - pack_t schema_b - ); - -cntl_t* bli_trsm_l_cntl_create - ( - rntm_t* rntm, - pack_t schema_a, - pack_t schema_b - ); - -cntl_t* bli_trsm_r_cntl_create - ( - rntm_t* rntm, - pack_t schema_a, - pack_t schema_b - ); - -void bli_trsm_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// 
----------------------------------------------------------------------------- - -cntl_t* bli_trsm_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - cntl_t* sub_node - ); - -// end bli_trsm_cntl.h -// begin bli_trsm_front.h - - -void bli_trsm_front - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX -err_t bli_trsm_small - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - cntl_t* cntl - ); -#endif - -// end bli_trsm_front.h -// begin bli_trsm_int.h - - -void bli_trsm_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// end bli_trsm_int.h - -// begin bli_trsm_var.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -GENPROT( trsm_blk_var1 ) -GENPROT( trsm_blk_var2 ) -GENPROT( trsm_blk_var3 ) -GENPROT( trsm_packa ) -GENPROT( trsm_packb ) - -GENPROT( trsm_xx_ker_var2 ) - -GENPROT( trsm_ll_ker_var2 ) -GENPROT( trsm_lu_ker_var2 ) -GENPROT( trsm_rl_ker_var2 ) -GENPROT( trsm_ru_ker_var2 ) - - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoff, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, \ - dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) - -// end bli_trsm_var.h - -// end bli_trsm.h -// begin bli_gemmt.h - - -// begin bli_gemmt_front.h - - -void bli_gemmt_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - - -// end bli_gemmt_front.h - -// begin bli_gemmt_var.h - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -GENPROT( gemmt_ker_var2 ) - - - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname, uplo, varname ) \ -\ -void PASTEMACT(ch,opname,uplo,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m_off, \ - dim_t n_off, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_GEMMT( gemmt, ker_var2 ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - trans_t trans, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -GENPROT( gemmtsup_ref_var1n ) -GENPROT( gemmtsup_ref_var2m ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname, uplo, varname ) \ -\ -void PASTEMACT(ch,opname,uplo,varname) \ - ( \ - bool_t packa, \ - bool_t packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ - ); - -INSERT_GENTPROT_GEMMT( gemmtsup, ref_var1n ) -INSERT_GENTPROT_GEMMT( gemmtsup, ref_var2m ) - - -// end bli_gemmt_var.h -// end bli_gemmt.h -// end bli_l3.h - - -// -- Utility operations -- - -// begin bli_util.h - - -// begin bli_util_check.h - - - -// -// Prototype object-based check functions. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* asum \ - ); - -GENPROT( asumv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x \ - ); - -GENPROT( mkherm ) -GENPROT( mksymm ) -GENPROT( mktrim ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* norm \ - ); - -GENPROT( norm1v ) -GENPROT( normfv ) -GENPROT( normiv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* norm \ - ); - -GENPROT( norm1m ) -GENPROT( normfm ) -GENPROT( normim ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ - ); - -GENPROT( fprintv ) -GENPROT( fprintm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x \ - ); - -GENPROT( randv ) -GENPROT( randnv ) -GENPROT( randm ) -GENPROT( randnm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ - ); - -GENPROT( sumsqv ) - - -// ----------------------------------------------------------------------------- - -void bli_utilv_xi_check - ( - obj_t* x, - obj_t* index - ); - -void bli_utilv_xa_check - ( - obj_t* x, - obj_t* asum - ); - -void bli_utilm_mkhst_check - ( - obj_t* a - ); - -void bli_utilv_norm_check - ( - obj_t* x, - obj_t* norm - ); - -void bli_utilm_norm_check - ( - obj_t* x, - obj_t* norm - ); - -void bli_utilm_fprint_check - ( - FILE* file, - char* s1, - obj_t* x, - char* format, - char* s2 - ); - -void bli_utilm_rand_check - ( - obj_t* x - ); - -void bli_utilv_sumsqv_check - ( - obj_t* x, - obj_t* scale, - obj_t* sumsq - ); - -// end bli_util_check.h - -// Prototype object APIs (expert and non-expert). 
-// begin bli_oapi_ex.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that contain context parameters. - -// Define the macro to add a suffix to the object API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_OAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_oapi_ex.h -// begin bli_util_oapi.h - - - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* asum \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( asumv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* a \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( mkherm ) -GENPROT( mksymm ) -GENPROT( mktrim ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* norm \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( norm1v ) -GENPROT( normfv ) -GENPROT( normiv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* norm \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( norm1m ) -GENPROT( normfm ) -GENPROT( normim ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( 
fprintv ) -GENPROT( fprintm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( printv ) -GENPROT( printm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( randv ) -GENPROT( randnv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( randm ) -GENPROT( randnm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( sumsqv ) - -// end bli_util_oapi.h - -// begin bli_oapi_ba.h - - -// This file defines macros used to allow the _oapi.c files to produce -// object APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_oapi_ba.h -// begin bli_util_oapi.h - - - -// -// Prototype object-based interfaces. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* asum \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( asumv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* a \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( mkherm ) -GENPROT( mksymm ) -GENPROT( mktrim ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* norm \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( norm1v ) -GENPROT( normfv ) -GENPROT( normiv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* norm \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( norm1m ) -GENPROT( normfm ) -GENPROT( normim ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( fprintv ) -GENPROT( fprintm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( printv ) -GENPROT( printm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( randv ) -GENPROT( randnv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( randm ) -GENPROT( randnm ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ - BLIS_OAPI_EX_PARAMS \ - ); - -GENPROT( sumsqv ) - -// end bli_util_oapi.h - -// Prototype typed APIs (expert and non-expert). 
-// begin bli_tapi_ex.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that contain context parameters. - -// Define the macro to add a suffix to the typed API function names -// (in function definitions). -#undef EX_SUF -#define EX_SUF BLIS_TAPI_EX_SUF - -// Define the macro to add expert arguments to function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm - -// Define the macro to omit the expert variable declaration block, since -// it is not needed when expert parameters are passed in through the API. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS ,cntx, rntm - -// end bli_tapi_ex.h -// begin bli_util_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( asumv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( mkherm ) -INSERT_GENTPROT_BASIC0( mksymm ) -INSERT_GENTPROT_BASIC0( mktrim ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( norm1v ) -INSERT_GENTPROTR_BASIC0( normfv ) -INSERT_GENTPROTR_BASIC0( normiv ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ 
- ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( norm1m ) -INSERT_GENTPROTR_BASIC0( normfm ) -INSERT_GENTPROTR_BASIC0( normim ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( printv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( printm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( randv ) -INSERT_GENTPROT_BASIC0( randnv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( randm ) -INSERT_GENTPROT_BASIC0( randnm ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( sumsqv ) - - -// end bli_util_tapi.h -// begin bli_util_ft.h - - - -// -// -- Utility function types --------------------------------------------------- -// - -// asumv - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, 
inc_t incx, \ - ctype_r* asum \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( asumv ) - -// mkherm, mksymm, mktrim - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( mkherm ) -INSERT_GENTDEF( mksymm ) -INSERT_GENTDEF( mktrim ) - -// norm1v, normfv, normiv - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( norm1v ) -INSERT_GENTDEFR( normfv ) -INSERT_GENTDEFR( normiv ) - -// norm1m, normfm, normim - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( norm1m ) -INSERT_GENTDEFR( normfm ) -INSERT_GENTDEFR( normim ) - -// fprintv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTDEF( fprintv ) - -// fprintm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTDEF( fprintm ) - -// randv, randnv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( randv ) 
-INSERT_GENTDEF( randnv ) - -// randm, randnm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( randm ) -INSERT_GENTDEF( randnm ) - -// sumsqv - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( sumsqv ) - -// end bli_util_ft.h - -// begin bli_tapi_ba.h - - -// This file defines macros used to allow the _tapi.c files to produce -// typed APIs that omit expert parameters. - -// Define the macro to remove the function name suffix (in function -// definitions). -#undef EX_SUF -#define EX_SUF - -// Define the macro to omit expert arguments from function signatures -// and prototypes. -#undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS - -// Define the macro to declare local expert variables that are initialized -// to NULL. The "( void )" statements are to prevent unused variable -// warnings by the compiler. -#undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; - -// Define the macro to pass the local expert variables to another function. -//#undef BLIS_TAPI_EX_VARS -//#define BLIS_TAPI_EX_VARS - -// end bli_tapi_ba.h -// begin bli_util_tapi.h - - - -// -// Prototype BLAS-like interfaces with typed operands. 
-// - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( asumv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( mkherm ) -INSERT_GENTPROT_BASIC0( mksymm ) -INSERT_GENTPROT_BASIC0( mktrim ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( norm1v ) -INSERT_GENTPROTR_BASIC0( normfv ) -INSERT_GENTPROTR_BASIC0( normiv ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( norm1m ) -INSERT_GENTPROTR_BASIC0( normfm ) -INSERT_GENTPROTR_BASIC0( normim ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( printv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( printm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ 
- ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( randv ) -INSERT_GENTPROT_BASIC0( randnv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROT_BASIC0( randm ) -INSERT_GENTPROT_BASIC0( randnm ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTPROTR_BASIC0( sumsqv ) - - -// end bli_util_tapi.h -// begin bli_util_ft.h - - - -// -// -- Utility function types --------------------------------------------------- -// - -// asumv - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( asumv ) - -// mkherm, mksymm, mktrim - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( mkherm ) -INSERT_GENTDEF( mksymm ) -INSERT_GENTDEF( mktrim ) - -// norm1v, normfv, normiv - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( norm1v ) -INSERT_GENTDEFR( normfv ) -INSERT_GENTDEFR( normiv ) - -// norm1m, normfm, normim - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t 
diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( norm1m ) -INSERT_GENTDEFR( normfm ) -INSERT_GENTDEFR( normim ) - -// fprintv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTDEF( fprintv ) - -// fprintm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTDEF( fprintm ) - -// randv, randnv - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( randv ) -INSERT_GENTDEF( randnv ) - -// randm, randnm - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEF( randm ) -INSERT_GENTDEF( randnm ) - -// sumsqv - -#undef GENTDEFR -#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ -\ -typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ - BLIS_TAPI_EX_PARAMS \ - ); - -INSERT_GENTDEFR( sumsqv ) - -// end bli_util_ft.h - -// Generate function pointer arrays for tapi functions (expert only). -// begin bli_util_fpa.h - - -// -// Prototype function pointer query interface. 
-// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ -PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); - -GENPROT( asumv ) -GENPROT( mkherm ) -GENPROT( mksymm ) -GENPROT( mktrim ) -GENPROT( norm1v ) -GENPROT( normfv ) -GENPROT( normiv ) -GENPROT( norm1m ) -GENPROT( normfm ) -GENPROT( normim ) -GENPROT( fprintv ) -GENPROT( fprintm ) -//GENPROT( printv ) -//GENPROT( printm ) -GENPROT( randv ) -GENPROT( randnv ) -GENPROT( randm ) -GENPROT( randnm ) -GENPROT( sumsqv ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -PASTECH(opname,_vft) \ -PASTEMAC(opname,_qfp)( num_t dt ); - -GENPROT( fprintv ) -GENPROT( fprintm ) -//GENPROT( printv ) -//GENPROT( printm ) -// end bli_util_fpa.h - -// Prototype level-1m implementations. -// begin bli_util_unb_var1.h - - - -// -// Prototype BLAS-like interfaces with typed operands. -// - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) -INSERT_GENTPROT_BASIC0( mksymm_unb_var1 ) -INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) -INSERT_GENTPROTR_BASIC0( normfv_unb_var1 ) -INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ 
- dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) -INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) -INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( fprintv ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ - ); - -INSERT_GENTPROT_BASIC0_I( fprintm ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( randv_unb_var1 ) -INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROT_BASIC0( randm_unb_var1 ) -INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) - - -#undef GENTPROTR -#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ); - -INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) - -// end bli_util_unb_var1.h - -//Routines to copy certain portion of a matrix to another -// begin bli_util_update.h - - -#undef GENTPROT -#define GENTPROT(ctype, ch, varname ) \ -\ -void PASTEMAC(ch, varname) \ - ( \ - dim_t m_off, \ - dim_t n_off, \ - dim_t m_cur, \ - dim_t n_cur, \ - ctype* ct, inc_t rs_ct, 
inc_t cs_ct, \ - ctype* beta_cast, \ - ctype* c, inc_t rs_c, inc_t cs_c \ - ); - -INSERT_GENTPROT_BASIC0( update_lower_triang ) -INSERT_GENTPROT_BASIC0( update_upper_triang ) - -// end bli_util_update.h -// end bli_util.h - - -// -- sandbox implementation -- - -// begin bli_sbox.h - - -#ifndef BLIS_SBOX_H -#define BLIS_SBOX_H - -// Each sandbox must have a bli_sandbox.h file present somewhere inside. -// If a sandbox was enabled at configure-time, we need to #include its -// header file here so that it will get pulled into blis.h when it is -// flattened into a monolithic header. -#ifdef BLIS_ENABLE_SANDBOX -#include "bli_sandbox.h" // skipped -#endif - -#endif - -// end bli_sbox.h - - -// -- BLAS compatibility layer -- - -// begin bli_blas.h - - -// If the CBLAS compatibility layer was enabled while the BLAS layer -// was not enabled, we must enable it here. -#ifdef BLIS_ENABLE_CBLAS -#ifndef BLIS_ENABLE_BLAS -#define BLIS_ENABLE_BLAS -#endif -#endif // BLIS_ENABLE_CBLAS - -// By default, if the BLAS compatibility layer is enabled, we define -// (include) all of the BLAS prototypes. However, if the user is -// #including "blis.h" and also #including another header that also -// declares the BLAS functions, then we provide an opportunity to -// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below). -#ifdef BLIS_ENABLE_BLAS -#define BLIS_ENABLE_BLAS_DEFS -#else -#undef BLIS_ENABLE_BLAS_DEFS -#endif - -// Skip prototyping all of the BLAS if the BLAS test drivers are being -// compiled. -#ifdef BLIS_VIA_BLASTEST -#undef BLIS_ENABLE_BLAS_DEFS -#endif - -// Skip prototyping all of the BLAS if the environment has defined the -// macro BLIS_DISABLE_BLAS_DEFS. -#ifdef BLIS_DISABLE_BLAS_DEFS -#undef BLIS_ENABLE_BLAS_DEFS -#endif - -// Begin including all BLAS prototypes. 
-#ifdef BLIS_ENABLE_BLAS_DEFS - - -// -- System headers needed by BLAS compatibility layer -- - -#include // skipped - - -// -- Constants -- - -#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1) - - -// -- Utility macros -- - -// begin bla_r_sign.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_r_sign(const bla_real *a, const bla_real *b); - -#endif - -// end bla_r_sign.h -// begin bla_d_sign.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_d_sign(const bla_double *a, const bla_double *b); - -#endif - -// end bla_d_sign.h - -// begin bla_r_cnjg.h - - -#ifdef BLIS_ENABLE_BLAS - -void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src); - -#endif - -// end bla_r_cnjg.h -// begin bla_d_cnjg.h - - -#ifdef BLIS_ENABLE_BLAS - -void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src); - -#endif - -// end bla_d_cnjg.h - -// begin bla_r_imag.h - - -#ifdef BLIS_ENABLE_BLAS - -bla_real bla_r_imag(const bla_scomplex *z); - -#endif - -// end bla_r_imag.h -// begin bla_d_imag.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_d_imag(const bla_dcomplex *z); - -#endif - -// end bla_d_imag.h - -// begin bla_c_div.h - - -#ifdef BLIS_ENABLE_BLAS - -void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp); - -#endif - -// end bla_c_div.h -// begin bla_z_div.h - - -#ifdef BLIS_ENABLE_BLAS - -void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp); - -#endif - -// end bla_z_div.h - -// begin bla_f__cabs.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_f__cabs(double real, double imag); - -#endif - -// end bla_f__cabs.h -// begin bla_r_abs.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_r_abs(const bla_real *x); - -#endif - -// end bla_r_abs.h -// begin bla_d_abs.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_d_abs(const bla_double *x); - -#endif - -// end bla_d_abs.h -// begin bla_c_abs.h - - -#ifdef BLIS_ENABLE_BLAS - -double bla_c_abs(const bla_scomplex *z); - -#endif - -// end bla_c_abs.h -// begin bla_z_abs.h - - -#ifdef BLIS_ENABLE_BLAS - -double 
bla_z_abs(const bla_dcomplex *z); - -#endif - -// end bla_z_abs.h - -// begin bla_lsame.h - - -#ifdef BLIS_ENABLE_BLAS - -#ifdef LAPACK_ILP64 -long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len); -#else -BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len); -#endif - -#endif -// end bla_lsame.h -// begin bla_xerbla.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len); - -#endif -// end bla_xerbla.h -// begin bla_xerbla_array.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info); - -#endif -// end bla_xerbla_array.h - - -// -- Level-0 BLAS prototypes -- - -// begin bla_cabs1.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z); -BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z); - -#endif -// end bla_cabs1.h - - -// -- Level-1 BLAS prototypes -- - -// begin bla_amax.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype_x, chx, blasname ) \ -\ -BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \ - ( \ - const f77_int* n, \ - const ftype_x* x, const f77_int* incx \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( amax ) -#endif - -// end bla_amax.h -// begin bla_asum.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTR2 -#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ - ( \ - const f77_int* n, \ - const ftype_x* x, const f77_int* incx \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTR2_BLAS( asum ) -#endif - -// end bla_asum.h -// begin bla_axpy.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* x, const f77_int* incx, \ - ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( axpy ) -#endif - -// end bla_axpy.h -// begin bla_copy.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_int* n, \ - const ftype* x, const f77_int* incx, \ - ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( copy ) -#endif - -// end bla_copy.h -// begin bla_dot.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTDOT -#define GENTPROTDOT( ftype, ch, chc, blasname ) \ -\ -BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ - ( \ - const f77_int* n, \ - const ftype* x, const f77_int* incx, \ - const ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS - -#ifdef AOCL_F2C -INSERT_GENTPROTDOT_BLAS_SDC( dot ) - - -BLIS_EXPORT_BLAS dcomplex zdotc_ -( - dcomplex *ret_val, - const f77_int* n, - const dcomplex* x, const f77_int* incx, - const dcomplex* y, const f77_int* incy -); -#else -INSERT_GENTPROTDOT_BLAS( dot ) -#endif - -// -- "Black sheep" dot product function prototypes -- - -BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) - ( - const f77_int* n, - const float* sb, - const float* x, const f77_int* incx, - const float* y, const f77_int* incy - ); - -BLIS_EXPORT_BLAS double PASTEF77(d,sdot) - ( - const f77_int* n, - const float* x, const f77_int* incx, - const float* y, const f77_int* incy - ); -#endif -// end bla_dot.h -// begin bla_nrm2.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROTR2 -#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ - ( \ - const f77_int* n, \ - const ftype_x* x, const f77_int* incx \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTR2_BLAS( nrm2 ) -#endif - -// end bla_nrm2.h -// begin bla_rot.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); -BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); -BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); -BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); - -#endif -// end bla_rot.h -// begin bla_rotg.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); -BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); -BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); -BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); - -#endif -// end bla_rotg.h -// begin bla_rotm.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); -BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const 
bla_double *dparam); - -#endif -// end bla_rotm.h -// begin bla_rotmg.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam); -BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam); - -#endif -// end bla_rotmg.h -// begin bla_scal.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTSCAL -#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \ - ( \ - const f77_int* n, \ - const ftype_a* alpha, \ - ftype_x* x, const f77_int* incx \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTSCAL_BLAS( scal ) -#endif - -// end bla_scal.h -// begin bla_swap.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_int* n, \ - ftype* x, const f77_int* incx, \ - ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( swap ) -#endif - -// end bla_swap.h - -// begin f77_amax_sub.h - - - -// -// Prototype CBLAS subroutine wrapper interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype_x, chx, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \ - ( \ - const f77_int* n, \ - const ftype_x* x, const f77_int* incx, \ - f77_int* rval \ - ); - -#ifdef BLIS_ENABLE_CBLAS -INSERT_GENTPROT_BLAS( amax ) -#endif -// end f77_amax_sub.h -// begin f77_asum_sub.h - - - -// -// Prototype CBLAS subroutine wrapper interfaces. 
-// -#undef GENTPROTR2 -#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ - ( \ - const f77_int* n, \ - const ftype_x* x, const f77_int* incx, \ - ftype_r* rval \ - ); - -#ifdef BLIS_ENABLE_CBLAS -INSERT_GENTPROTR2_BLAS( asum ) -#endif -// end f77_asum_sub.h -// begin f77_dot_sub.h - - - -// -// Prototype CBLAS subroutine wrapper interfaces. -// -#undef GENTPROTDOT -#define GENTPROTDOT( ftype, ch, chc, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ - ( \ - const f77_int* n, \ - const ftype* x, const f77_int* incx, \ - const ftype* y, const f77_int* incy, \ - ftype* rval \ - ); - -#ifdef BLIS_ENABLE_CBLAS -INSERT_GENTPROTDOT_BLAS( dot ) - - -// -- "Black sheep" dot product function prototypes -- - -BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) - ( - const f77_int* n, - const float* sb, - const float* x, const f77_int* incx, - const float* y, const f77_int* incy, - float* rval - ); - -BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) - ( - const f77_int* n, - const float* x, const f77_int* incx, - const float* y, const f77_int* incy, - double* rval - ); -#endif -// end f77_dot_sub.h -// begin f77_nrm2_sub.h - - - -// -// Prototype CBLAS subroutine wrapper interfaces. -// -#undef GENTPROTR2 -#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ - ( \ - const f77_int* n, \ - const ftype_x* x, const f77_int* incx, \ - ftype_r* rval \ - ); - -#ifdef BLIS_ENABLE_CBLAS -INSERT_GENTPROTR2_BLAS( nrm2 ) -#endif -// end f77_nrm2_sub.h - - -// -- Level-2 BLAS prototypes -- - -// dense - -// begin bla_gemv.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* transa, \ - const f77_int* m, \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* x, const f77_int* incx, \ - const ftype* beta, \ - ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( gemv ) -#endif - -// end bla_gemv.h -// begin bla_ger.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTDOT -#define GENTPROTDOT( ftype, chxy, chc, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ - ( \ - const f77_int* m, \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* x, const f77_int* incx, \ - const ftype* y, const f77_int* incy, \ - ftype* a, const f77_int* lda \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTDOT_BLAS( ger ) -#endif - -// end bla_ger.h -// begin bla_hemv.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTCO -#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_int* m, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* x, const f77_int* incx, \ - const ftype* beta, \ - ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTCO_BLAS( hemv ) -#endif - -// end bla_hemv.h -// begin bla_her.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTCO -#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_int* m, \ - const ftype_r* alpha, \ - const ftype* x, const f77_int* incx, \ - ftype* a, const f77_int* lda \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTCO_BLAS( her ) -#endif - -// end bla_her.h -// begin bla_her2.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROTCO -#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_int* m, \ - const ftype* alpha, \ - const ftype* x, const f77_int* incx, \ - const ftype* y, const f77_int* incy, \ - ftype* a, const f77_int* lda \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTCO_BLAS( her2 ) -#endif - -// end bla_her2.h -// begin bla_symv.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTRO -#define GENTPROTRO( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_int* m, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* x, const f77_int* incx, \ - const ftype* beta, \ - ftype* y, const f77_int* incy \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTRO_BLAS( symv ) -#endif - -// end bla_symv.h -// begin bla_syr.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTRO -#define GENTPROTRO( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_int* m, \ - const ftype* alpha, \ - const ftype* x, const f77_int* incx, \ - ftype* a, const f77_int* lda \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTRO_BLAS( syr ) -#endif - -// end bla_syr.h -// begin bla_syr2.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTRO -#define GENTPROTRO( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_int* m, \ - const ftype* alpha, \ - const ftype* x, const f77_int* incx, \ - const ftype* y, const f77_int* incy, \ - ftype* a, const f77_int* lda \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTRO_BLAS( syr2 ) -#endif - -// end bla_syr2.h -// begin bla_trmv.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_char* transa, \ - const f77_char* diaga, \ - const f77_int* m, \ - const ftype* a, const f77_int* lda, \ - ftype* x, const f77_int* incx \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( trmv ) -#endif - -// end bla_trmv.h -// begin bla_trsv.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploa, \ - const f77_char* transa, \ - const f77_char* diaga, \ - const f77_int* m, \ - const ftype* a, const f77_int* lda, \ - ftype* x, const f77_int* incx \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( trsv ) -#endif - -// end bla_trsv.h - -// begin bla_gemv_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \ -{ \ - f77_int info = 0; \ - f77_int nota, ta, conja; \ -\ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( !nota && !ta && !conja ) \ - info = 1; \ - else if ( *m < 0 ) \ - info = 2; \ - else if ( *n < 0 ) \ - info = 3; \ - else if ( *lda < bli_max( 1, *m ) ) \ - info = 6; \ - else if ( *incx == 0 ) \ - info = 8; \ - else if ( *incy == 0 ) \ - info = 11; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_gemv_check.h -// begin bla_ger_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \ -{ \ - f77_int info = 0; \ -\ - if ( *m < 0 ) \ - info = 1; \ - else if ( 
*n < 0 ) \ - info = 2; \ - else if ( *incx == 0 ) \ - info = 5; \ - else if ( *incy == 0 ) \ - info = 7; \ - else if ( *lda < bli_max( 1, *m ) ) \ - info = 9; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - \ - sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_ger_check.h -// begin bla_hemv_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \ -{ \ - f77_int info = 0; \ - f77_int lower, upper; \ -\ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( *m < 0 ) \ - info = 2; \ - else if ( *lda < bli_max( 1, *m ) ) \ - info = 5; \ - else if ( *incx == 0 ) \ - info = 7; \ - else if ( *incy == 0 ) \ - info = 10; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_hemv_check.h -// begin bla_her_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \ -{ \ - f77_int info = 0; \ - f77_int lower, upper; \ -\ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( *m < 0 ) \ - info = 2; \ - else if ( *incx == 0 ) \ - info = 5; \ - else if ( *lda < bli_max( 1, *m ) ) \ - info = 7; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, 
(ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_her_check.h -// begin bla_her2_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \ -{ \ - f77_int info = 0; \ - f77_int lower, upper; \ -\ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( *m < 0 ) \ - info = 2; \ - else if ( *incx == 0 ) \ - info = 5; \ - else if ( *incy == 0 ) \ - info = 7; \ - else if ( *lda < bli_max( 1, *m ) ) \ - info = 9; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_her2_check.h -// begin bla_symv_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_symv_check bla_hemv_check - -#endif -// end bla_symv_check.h -// begin bla_syr_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_syr_check bla_her_check - -#endif -// end bla_syr_check.h -// begin bla_syr2_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_syr2_check bla_her2_check - -#endif -// end bla_syr2_check.h -// begin bla_trmv_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \ -{ \ - f77_int info = 0; \ - f77_int lower, upper; \ - f77_int nota, ta, conja; \ - f77_int unita, nonua; \ -\ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ - unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ - nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ 
-\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( !nota && !ta && !conja ) \ - info = 2; \ - else if ( !unita && !nonua ) \ - info = 3; \ - else if ( *m < 0 ) \ - info = 4; \ - else if ( *lda < bli_max( 1, *m ) ) \ - info = 6; \ - else if ( *incx == 0 ) \ - info = 8; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_trmv_check.h -// begin bla_trsv_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_trsv_check bla_trmv_check - -#endif -// end bla_trsv_check.h - -// packed - -// begin bla_hpmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy); - -#endif -// end bla_hpmv.h -// begin bla_hpr.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap); -BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap); - -#endif -// end bla_hpr.h -// begin bla_hpr2.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer 
*incy, bla_scomplex *ap); -BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap); - -#endif -// end bla_hpr2.h -// begin bla_spmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy); - -#endif -// end bla_spmv.h -// begin bla_spr.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap); -BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap); - -#endif -// end bla_spr.h -// begin bla_spr2.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap); -BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap); - -#endif -// end bla_spr2.h -// begin bla_tpmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const 
bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx); - -#endif -// end bla_tpmv.h -// begin bla_tpsv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx); - -#endif -// end bla_tpsv.h - -// banded - -// begin bla_gbmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, 
bla_scomplex *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy); - -#endif -// end bla_gbmv.h -// begin bla_hbmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy); - -#endif -// end bla_hbmv.h -// begin bla_sbmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer 
*lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy); -BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy); - -#endif -// end bla_sbmv.h -// begin bla_tbmv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx); - -#endif -// end bla_tbmv.h -// begin bla_tbsv.h - - -#ifdef BLIS_ENABLE_BLAS - -BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const 
bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx); -BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx); - -#endif -// end bla_tbsv.h - - -// -- Level-3 BLAS prototypes -- - -// begin bla_gemm.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* transa, \ - const f77_char* transb, \ - const f77_int* m, \ - const f77_int* n, \ - const f77_int* k, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* b, const f77_int* ldb, \ - const ftype* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( gemm ) -#endif - -// end bla_gemm.h -// begin bla_hemm.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTCO -#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* side, \ - const f77_char* uploa, \ - const f77_int* m, \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* b, const f77_int* ldb, \ - const ftype* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTCO_BLAS( hemm ) -#endif - -// end bla_hemm.h -// begin bla_herk.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROTCO -#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploc, \ - const f77_char* transa, \ - const f77_int* m, \ - const f77_int* k, \ - const ftype_r* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype_r* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTCO_BLAS( herk ) -#endif - -// end bla_herk.h -// begin bla_her2k.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROTCO -#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploc, \ - const f77_char* transa, \ - const f77_int* m, \ - const f77_int* k, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* b, const f77_int* ldb, \ - const ftype_r* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROTCO_BLAS( her2k ) -#endif - -// end bla_her2k.h -// begin bla_symm.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* side, \ - const f77_char* uploa, \ - const f77_int* m, \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* b, const f77_int* ldb, \ - const ftype* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( symm ) -#endif - -// end bla_symm.h -// begin bla_syrk.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploc, \ - const f77_char* transa, \ - const f77_int* m, \ - const f77_int* k, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( syrk ) -#endif - -// end bla_syrk.h -// begin bla_syr2k.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploc, \ - const f77_char* transa, \ - const f77_int* m, \ - const f77_int* k, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* b, const f77_int* ldb, \ - const ftype* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( syr2k ) -#endif - -// end bla_syr2k.h -// begin bla_trmm.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* side, \ - const f77_char* uploa, \ - const f77_char* transa, \ - const f77_char* diaga, \ - const f77_int* m, \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - ftype* b, const f77_int* ldb \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( trmm ) -#endif - -// end bla_trmm.h -// begin bla_trsm.h - - - -// -// Prototype BLAS-to-BLIS interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* side, \ - const f77_char* uploa, \ - const f77_char* transa, \ - const f77_char* diaga, \ - const f77_int* m, \ - const f77_int* n, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - ftype* b, const f77_int* ldb \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( trsm ) -#endif - -// end bla_trsm.h -// begin bla_gemmt.h - - - -// -// Prototype BLAS-to-BLIS interfaces. -// -#undef GENTPROT -#define GENTPROT( ftype, ch, blasname ) \ -\ -BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ - ( \ - const f77_char* uploc, \ - const f77_char* transa, \ - const f77_char* transb, \ - const f77_int* n, \ - const f77_int* k, \ - const ftype* alpha, \ - const ftype* a, const f77_int* lda, \ - const ftype* b, const f77_int* ldb, \ - const ftype* beta, \ - ftype* c, const f77_int* ldc \ - ); - -#ifdef BLIS_ENABLE_BLAS -INSERT_GENTPROT_BLAS( gemmt ) -#endif -// end bla_gemmt.h - -// begin bla_gemm_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int nota, notb; \ - f77_int conja, conjb; \ - f77_int ta, tb; \ - f77_int nrowa, nrowb; \ -\ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ - conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ - tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( nota ) { nrowa = *m; } \ - else { nrowa = *k; } \ - if ( notb ) { nrowb = *k; } \ - else { nrowb = *n; } \ -\ - if ( !nota && !conja && !ta ) \ - info = 1; \ - else if ( !notb && !conjb && !tb ) \ - info = 2; \ - else if ( *m < 0 ) \ - info = 3; \ - else if ( *n < 0 ) \ - info = 4; \ - else if ( 
*k < 0 ) \ - info = 5; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 8; \ - else if ( *ldb < bli_max( 1, nrowb ) ) \ - info = 10; \ - else if ( *ldc < bli_max( 1, *m ) ) \ - info = 13; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_gemm_check.h -// begin bla_hemm_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int left, right; \ - f77_int lower, upper; \ - f77_int nrowa; \ -\ - left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ - right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( left ) { nrowa = *m; } \ - else { nrowa = *n; } \ -\ - if ( !left && !right ) \ - info = 1; \ - else if ( !lower && !upper ) \ - info = 2; \ - else if ( *m < 0 ) \ - info = 3; \ - else if ( *n < 0 ) \ - info = 4; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 7; \ - else if ( *ldb < bli_max( 1, *m ) ) \ - info = 9; \ - else if ( *ldc < bli_max( 1, *m ) ) \ - info = 12; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_hemm_check.h -// begin bla_herk_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int nota, conja; \ - f77_int lower, upper; \ - f77_int nrowa; \ -\ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( 
transa, "C", (ftnlen)1, (ftnlen)1 ); \ - lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( nota ) { nrowa = *m; } \ - else { nrowa = *k; } \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( !nota && !conja ) \ - info = 2; \ - else if ( *m < 0 ) \ - info = 3; \ - else if ( *k < 0 ) \ - info = 4; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 7; \ - else if ( *ldc < bli_max( 1, *m ) ) \ - info = 10; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_herk_check.h -// begin bla_her2k_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int nota, conja; \ - f77_int lower, upper; \ - f77_int nrowa; \ -\ - nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( nota ) { nrowa = *m; } \ - else { nrowa = *k; } \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( !nota && !conja ) \ - info = 2; \ - else if ( *m < 0 ) \ - info = 3; \ - else if ( *k < 0 ) \ - info = 4; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 7; \ - else if ( *ldb < bli_max( 1, nrowa ) ) \ - info = 9; \ - else if ( *ldc < bli_max( 1, *m ) ) \ - info = 12; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_her2k_check.h -// begin 
bla_symm_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_symm_check bla_hemm_check - -#endif -// end bla_symm_check.h -// begin bla_syrk_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int is_r; \ - f77_int nota, ta, cta; \ - f77_int lower, upper; \ - f77_int nrowa; \ -\ - static char* dt_cst = dt_str; \ -\ - is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ - cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ - lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( nota ) { nrowa = *m; } \ - else { nrowa = *k; } \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( !nota && !ta && (is_r ? !cta : 1) ) \ - info = 2; \ - else if ( *m < 0 ) \ - info = 3; \ - else if ( *k < 0 ) \ - info = 4; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 7; \ - else if ( *ldc < bli_max( 1, *m ) ) \ - info = 10; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_syrk_check.h -// begin bla_syr2k_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int is_r; \ - f77_int nota, ta, cta; \ - f77_int lower, upper; \ - f77_int nrowa; \ -\ - static char* dt_cst = dt_str; \ -\ - is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ - nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \ - cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ - lower = 
PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( nota ) { nrowa = *m; } \ - else { nrowa = *k; } \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( !nota && !ta && (is_r ? !cta : 1) ) \ - info = 2; \ - else if ( *m < 0 ) \ - info = 3; \ - else if ( *k < 0 ) \ - info = 4; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 7; \ - else if ( *ldb < bli_max( 1, nrowa ) ) \ - info = 9; \ - else if ( *ldc < bli_max( 1, *m ) ) \ - info = 12; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_syr2k_check.h -// begin bla_trmm_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ -{ \ - f77_int info = 0; \ - f77_int left, right; \ - f77_int lower, upper; \ - f77_int nota, ta, conja; \ - f77_int unita, nonua; \ - f77_int nrowa; \ -\ - left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ - right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ - lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ - unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ - nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( left ) { nrowa = *m; } \ - else { nrowa = *n; } \ -\ - if ( !left && !right ) \ - info = 1; \ - else if ( !lower && !upper ) \ - info = 2; \ - else if ( !nota && !ta && !conja ) \ - info = 3; \ - else if ( !unita && !nonua ) \ - info = 4; \ - else if ( *m < 0 ) \ - info 
= 5; \ - else if ( *n < 0 ) \ - info = 6; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 9; \ - else if ( *ldb < bli_max( 1, *m ) ) \ - info = 11; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_trmm_check.h -// begin bla_trsm_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_trsm_check bla_trmm_check - -#endif -// end bla_trsm_check.h -// begin bla_gemmt_check.h - - -#ifdef BLIS_ENABLE_BLAS - -#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, n, k, lda, ldb, ldc ) \ -{ \ - f77_int info = 0; \ - f77_int nota, notb; \ - f77_int conja, conjb; \ - f77_int ta, tb; \ - f77_int lower, upper; \ - f77_int nrowa, nrowb; \ -\ - nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ - notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ - conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ - conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ - ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ - tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ -\ - lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ - upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ -\ - if ( nota ) { nrowa = *n; } \ - else { nrowa = *k; } \ - if ( notb ) { nrowb = *k; } \ - else { nrowb = *n; } \ -\ - if ( !lower && !upper ) \ - info = 1; \ - else if ( !nota && !conja && !ta ) \ - info = 2; \ - else if ( !notb && !conjb && !tb ) \ - info = 3; \ - else if ( *n < 0 ) \ - info = 4; \ - else if ( *k < 0 ) \ - info = 5; \ - else if ( *lda < bli_max( 1, nrowa ) ) \ - info = 8; \ - else if ( *ldb < bli_max( 1, nrowb ) ) \ - info = 10; \ - else if ( *ldc < bli_max( 1, *n ) ) \ - info = 13; \ -\ - if ( info != 0 ) \ - { \ - char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; 
\ -\ - sprintf( func_str, "%s%-5s", dt_str, op_str ); \ -\ - bli_string_mkupper( func_str ); \ -\ - PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ -\ - return; \ - } \ -} - -#endif -// end bla_gemmt_check.h - -// -- Fortran-compatible APIs to BLIS functions -- - -// begin b77_thread.h - - - -// -// Prototype Fortran-compatible BLIS interfaces. -// - -BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) - ( - const f77_int* jc, - const f77_int* pc, - const f77_int* ic, - const f77_int* jr, - const f77_int* ir - ); - -BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) - ( - const f77_int* nt - ); - -// end b77_thread.h - - -#endif // BLIS_ENABLE_BLAS -// end bli_blas.h - - -// -- CBLAS compatibility layer -- - -// begin bli_cblas.h - - -#ifndef BLIS_CBLAS_H -#define BLIS_CBLAS_H - -#ifdef BLIS_ENABLE_CBLAS - - -// Undefine these macros so that no internal conversion is done by CBLAS. -// The function signatures have been modified to use the proper integer types -// directly. -#undef F77_INT -#undef F77_CHAR - -// Include the main CBLAS header so that including this header file -// (probably via blis.h) allows applications to access CBLAS -// prototypes and definitions. -// begin cblas.h -#ifndef CBLAS_H -#define CBLAS_H -#include // skipped - -// We need to #include "bli_type_defs.h" in order to pull in the -// definition of f77_int. But in order to #include that header, we -// also need to pull in the headers that precede it in blis.h. -// begin bli_system.h - - -#ifndef BLIS_SYSTEM_H -#define BLIS_SYSTEM_H - -// NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that -// various parts of POSIX are defined and made available. 
-#ifndef _POSIX_C_SOURCE -#define _POSIX_C_SOURCE 200809L -#endif - -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped - -// Determine the compiler (hopefully) and define conveniently named macros -// accordingly. -#if defined(__ICC) || defined(__INTEL_COMPILER) - #define BLIS_ICC -#elif defined(__clang__) - #define BLIS_CLANG -#elif defined(__GNUC__) - #define BLIS_GCC -#endif - -// Determine if we are on a 64-bit or 32-bit architecture. -#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ - defined(_ARCH_PPC64) - #define BLIS_ARCH_64 -#else - #define BLIS_ARCH_32 -#endif - -// Determine the target operating system. -#if defined(_WIN32) || defined(__CYGWIN__) - #define BLIS_OS_WINDOWS 1 -#elif defined(__gnu_hurd__) - #define BLIS_OS_GNU 1 -#elif defined(__APPLE__) || defined(__MACH__) - #define BLIS_OS_OSX 1 -#elif defined(__ANDROID__) - #define BLIS_OS_ANDROID 1 -#elif defined(__linux__) - #define BLIS_OS_LINUX 1 -#elif defined(__bgq__) - #define BLIS_OS_BGQ 1 -#elif defined(__bg__) - #define BLIS_OS_BGP 1 -#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__bsdi__) || defined(__DragonFly__) || \ - defined(__FreeBSD_kernel__) || defined(__HAIKU__) - #define BLIS_OS_BSD 1 -#elif defined(EMSCRIPTEN) - #define BLIS_OS_EMSCRIPTEN -#else - #error "Cannot determine operating system" -#endif - -// A few changes that may be necessary in Windows environments. -#if BLIS_OS_WINDOWS - - // Include Windows header file. - #define WIN32_LEAN_AND_MEAN - #define VC_EXTRALEAN -#include // skipped - - #if !defined(__clang__) && !defined(__GNUC__) - // Undefine attribute specifiers in Windows. - #define __attribute__(x) - - // Undefine restrict. - #define restrict - #endif - -#endif - -// time.h provides clock_gettime(). 
-#if BLIS_OS_WINDOWS -#include // skipped -#elif BLIS_OS_OSX -#include // skipped -#else - //#include - -#include // skipped -#endif - -// POSIX threads are unconditionally required, regardless of whether -// multithreading is enabled via pthreads or OpenMP (or disabled). -// If pthreads is not available (Windows), then fake it. -//#include "bli_pthread_wrap.h" - - -#endif -// end bli_system.h -// begin bli_config.h - - -#ifndef BLIS_CONFIG_H -#define BLIS_CONFIG_H - -// Enabled configuration "family" (config_name) -#define BLIS_FAMILY_ZEN2 - - -// Enabled sub-configurations (config_list) -#define BLIS_CONFIG_ZEN2 - - -// Enabled kernel sets (kernel_list) -#define BLIS_KERNELS_ZEN2 -#define BLIS_KERNELS_ZEN -#define BLIS_KERNELS_HASWELL - - -//This macro is enabled only for ZEN family configurations. -//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes. -#if 1 -#define AOCL_BLIS_ZEN -#endif - -#if 0 -#define BLIS_ENABLE_OPENMP -#endif - -#if 0 -#define BLIS_ENABLE_PTHREADS -#endif - -#if 1 -#define BLIS_ENABLE_JRIR_SLAB -#endif - -#if 0 -#define BLIS_ENABLE_JRIR_RR -#endif - -#if 1 -#define BLIS_ENABLE_PBA_POOLS -#else -#define BLIS_DISABLE_PBA_POOLS -#endif - -#if 1 -#define BLIS_ENABLE_SBA_POOLS -#else -#define BLIS_DISABLE_SBA_POOLS -#endif - -#if 0 -#define BLIS_ENABLE_MEM_TRACING -#else -#define BLIS_DISABLE_MEM_TRACING -#endif - -#if 0 == 64 -#define BLIS_INT_TYPE_SIZE 64 -#elif 0 == 32 -#define BLIS_INT_TYPE_SIZE 32 -#else -// determine automatically -#endif - -#if 32 == 64 -#define BLIS_BLAS_INT_TYPE_SIZE 64 -#elif 32 == 32 -#define BLIS_BLAS_INT_TYPE_SIZE 32 -#else -// determine automatically -#endif - -#ifndef BLIS_ENABLE_BLAS -#ifndef BLIS_DISABLE_BLAS -#if 1 -#define BLIS_ENABLE_BLAS -#else -#define BLIS_DISABLE_BLAS -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_CBLAS -#ifndef BLIS_DISABLE_CBLAS -#if 0 -#define BLIS_ENABLE_CBLAS -#else -#define BLIS_DISABLE_CBLAS -#endif -#endif -#endif - 
-#ifndef BLIS_ENABLE_MIXED_DT -#ifndef BLIS_DISABLE_MIXED_DT -#if 1 -#define BLIS_ENABLE_MIXED_DT -#else -#define BLIS_DISABLE_MIXED_DT -#endif -#endif -#endif - -#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#if 1 -#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM -#else -#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM -#endif -#endif -#endif - -#if 1 -#define BLIS_ENABLE_SUP_HANDLING -#else -#define BLIS_DISABLE_SUP_HANDLING -#endif - -#if 0 -#define BLIS_ENABLE_MEMKIND -#else -#define BLIS_DISABLE_MEMKIND -#endif - -#if 1 -#define BLIS_ENABLE_PRAGMA_OMP_SIMD -#else -#define BLIS_DISABLE_PRAGMA_OMP_SIMD -#endif - -#if 0 -#define BLIS_ENABLE_SANDBOX -#else -#define BLIS_DISABLE_SANDBOX -#endif - -#if 1 -#define BLIS_ENABLE_SHARED -#else -#define BLIS_DISABLE_SHARED -#endif - - -#endif -// end bli_config.h -// begin bli_config_macro_defs.h - - -#ifndef BLIS_CONFIG_MACRO_DEFS_H -#define BLIS_CONFIG_MACRO_DEFS_H - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#ifndef BLIS_INT_TYPE_SIZE - #ifdef BLIS_ARCH_64 - #define BLIS_INT_TYPE_SIZE 64 - #else - #define BLIS_INT_TYPE_SIZE 32 - #endif -#endif - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. 
-#ifdef BLIS_ENABLE_C99_COMPLEX - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - - -// -- MULTITHREADING ----------------------------------------------------------- - -// Enable multithreading via POSIX threads. -#ifdef BLIS_ENABLE_PTHREADS - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - -// Enable multithreading via OpenMP. -#ifdef BLIS_ENABLE_OPENMP - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - -// Perform a sanity check to make sure the user doesn't try to enable -// both OpenMP and pthreads. -#if defined ( BLIS_ENABLE_OPENMP ) && \ - defined ( BLIS_ENABLE_PTHREADS ) - #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." -#endif - -// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP -// or pthreads are enabled. This macro is useful in situations when -// we want to detect use of either OpenMP or pthreads (as opposed -// to neither being used). -#if defined ( BLIS_ENABLE_OPENMP ) || \ - defined ( BLIS_ENABLE_PTHREADS ) - #define BLIS_ENABLE_MULTITHREADING -#endif - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Enable mixed datatype support? -#ifdef BLIS_DISABLE_MIXED_DT - #undef BLIS_ENABLE_GEMM_MD -#else - // Default behavior is enabled. - #define BLIS_ENABLE_GEMM_MD -#endif - -// Enable memory-intensive optimizations for mixed datatype support? -#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM - #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM -#else - // Default behavior is enabled. - #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM -#endif - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Do NOT require the cross-blocksize constraints. That is, do not enforce -// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY -// needed when implementing trsm_r by allowing the right-hand matrix B to -// be triangular. 
-#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS - #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS -#endif - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#ifdef BLIS_DISABLE_STAY_AUTO_INITIALIZED - #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED -#else - // Default behavior is enabled. - #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED // In case user explicitly enabled. - #define BLIS_ENABLE_STAY_AUTO_INITIALIZED -#endif - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#ifdef BLIS_DISABLE_BLAS - #undef BLIS_ENABLE_BLAS -#else - // Default behavior is enabled. - #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. - #define BLIS_ENABLE_BLAS -#endif - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#ifndef BLIS_BLAS_INT_TYPE_SIZE - #define BLIS_BLAS_INT_TYPE_SIZE 32 -#endif - -// By default, the level-3 BLAS routines are implemented by directly calling -// the BLIS object API. Alternatively, they may first call the typed BLIS -// API, which will then call the object API. -//#define BLIS_BLAS3_CALLS_TAPI -#ifdef BLIS_BLAS3_CALLS_TAPI - #undef BLIS_BLAS3_CALLS_OAPI -#else - // Default behavior is to call object API directly. - #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. - #define BLIS_BLAS3_CALLS_OAPI -#endif - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? 
-// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). -#ifdef BLIS_ENABLE_CBLAS - // No additional definitions needed. -#else - // Default behavior is disabled. -#endif - - -// -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- - -// When building shared libraries, we can control which symbols are exported for -// linking by external applications. BLIS annotates all function prototypes that -// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing -// a similar role for BLAS compatibility routines). Which symbols are exported -// is controlled by the default symbol visibility, as specifed by the gcc option -// -fvisibility=[default|hidden]. The default for this option is 'default', or, -// "public", which, if allowed to stand, causes all symbols in BLIS to be -// linkable from the outside. But when compiling with -fvisibility=hidden, all -// symbols start out hidden (that is, restricted only for internal use by BLIS), -// with that setting overridden only for function prototypes or variable -// declarations that are annotated with BLIS_EXPORT_BLIS. 
- -#ifndef BLIS_EXPORT - #if !defined(BLIS_ENABLE_SHARED) - #define BLIS_EXPORT - #else - #if defined(_WIN32) || defined(__CYGWIN__) - #ifdef BLIS_IS_BUILDING_LIBRARY - #define BLIS_EXPORT __declspec(dllexport) - #else - #define BLIS_EXPORT __declspec(dllimport) - #endif - #elif defined(__GNUC__) && __GNUC__ >= 4 - #define BLIS_EXPORT __attribute__ ((visibility ("default"))) - #else - #define BLIS_EXPORT - #endif - #endif -#endif - -#define BLIS_EXPORT_BLIS BLIS_EXPORT -#define BLIS_EXPORT_BLAS BLIS_EXPORT - - -#endif - -// end bli_config_macro_defs.h -// begin bli_type_defs.h - - -#ifndef BLIS_TYPE_DEFS_H -#define BLIS_TYPE_DEFS_H - - -// -// -- BLIS basic types --------------------------------------------------------- -// - -#ifdef __cplusplus - // For C++, include stdint.h. -#include // skipped -#elif __STDC_VERSION__ >= 199901L - // For C99 (or later), include stdint.h. -#include // skipped -#else - // When stdint.h is not available, manually typedef the types we will use. - #ifdef _WIN32 - typedef __int32 int32_t; - typedef unsigned __int32 uint32_t; - typedef __int64 int64_t; - typedef unsigned __int64 uint64_t; - #else - #error "Attempting to compile on pre-C99 system without stdint.h." - #endif -#endif - -// -- General-purpose integers -- - -// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. -// NOTE: This cpp guard will only meaningfully change BLIS's behavior on -// systems where the BLIS integer size would have been automatically selected -// to be 32 bits, since explicit selection of 32 bits is prohibited at -// configure-time (and explicit or automatic selection of 64 bits is fine -// and would have had the same result). -#if BLIS_BLAS_INT_SIZE == 64 - #undef BLIS_INT_TYPE_SIZE - #define BLIS_INT_TYPE_SIZE 64 -#endif - -// Define integer types depending on what size integer was requested. 
-#if BLIS_INT_TYPE_SIZE == 32 -typedef int32_t gint_t; -typedef uint32_t guint_t; -#elif BLIS_INT_TYPE_SIZE == 64 -typedef int64_t gint_t; -typedef uint64_t guint_t; -#else -typedef signed long int gint_t; -typedef unsigned long int guint_t; -#endif - -// -- Boolean type -- - -typedef gint_t bool_t; - - -// -- Boolean values -- - -#ifndef TRUE - #define TRUE 1 -#endif - -#ifndef FALSE - #define FALSE 0 -#endif - - -// -- Special-purpose integers -- - -// This cpp guard provides a temporary hack to allow libflame -// interoperability with BLIS. -#ifndef _DEFINED_DIM_T -#define _DEFINED_DIM_T -typedef gint_t dim_t; // dimension type -#endif -typedef gint_t inc_t; // increment/stride type -typedef gint_t doff_t; // diagonal offset type -typedef guint_t siz_t; // byte size type -typedef uint32_t objbits_t; // object information bit field - -// -- Real types -- - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// There are some places where we need to use sizeof() inside of a C -// preprocessor #if conditional, and so here we define the various sizes -// for those purposes. -#define BLIS_SIZEOF_S 4 // sizeof(float) -#define BLIS_SIZEOF_D 8 // sizeof(double) -#define BLIS_SIZEOF_C 8 // sizeof(scomplex) -#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) - -// -- Complex types -- - -#ifdef BLIS_ENABLE_C99_COMPLEX - - #if __STDC_VERSION__ >= 199901L -#include // skipped - - // Typedef official complex types to BLIS complex type names. - typedef float complex scomplex; - typedef double complex dcomplex; - #else - #error "Configuration requested C99 complex types, but C99 does not appear to be supported." - #endif - -#else // ifndef BLIS_ENABLE_C99_COMPLEX - - // This cpp guard provides a temporary hack to allow libflame - // interoperability with BLIS. 
- #ifndef _DEFINED_SCOMPLEX - #define _DEFINED_SCOMPLEX - typedef struct - { - float real; - float imag; - } scomplex; - #endif - - // This cpp guard provides a temporary hack to allow libflame - // interoperability with BLIS. - #ifndef _DEFINED_DCOMPLEX - #define _DEFINED_DCOMPLEX - typedef struct - { - double real; - double imag; - } dcomplex; - #endif - -#endif // BLIS_ENABLE_C99_COMPLEX - -// -- Atom type -- - -// Note: atom types are used to hold "bufferless" scalar object values. Note -// that it needs to be as large as the largest possible scalar value we might -// want to hold. Thus, for now, it is a dcomplex. -typedef dcomplex atom_t; - -// -- Fortran-77 types -- - -// Note: These types are typically only used by BLAS compatibility layer, but -// we must define them even when the compatibility layer isn't being built -// because they also occur in bli_slamch() and bli_dlamch(). - -// Define f77_int depending on what size of integer was requested. -#if BLIS_BLAS_INT_TYPE_SIZE == 32 -typedef int32_t f77_int; -#elif BLIS_BLAS_INT_TYPE_SIZE == 64 -typedef int64_t f77_int; -#else -typedef long int f77_int; -#endif - -typedef char f77_char; -typedef float f77_float; -typedef double f77_double; -typedef scomplex f77_scomplex; -typedef dcomplex f77_dcomplex; - -// -- Void function pointer types -- - -// Note: This type should be used in any situation where the address of a -// *function* will be conveyed or stored prior to it being typecast back -// to the correct function type. It does not need to be used when conveying -// or storing the address of *data* (such as an array of float or double). 
- -//typedef void (*void_fp)( void ); -typedef void* void_fp; - - -// -// -- BLIS info bit field offsets ---------------------------------------------- -// - - - -// info -#define BLIS_DATATYPE_SHIFT 0 -#define BLIS_DOMAIN_SHIFT 0 -#define BLIS_PRECISION_SHIFT 1 -#define BLIS_CONJTRANS_SHIFT 3 -#define BLIS_TRANS_SHIFT 3 -#define BLIS_CONJ_SHIFT 4 -#define BLIS_UPLO_SHIFT 5 -#define BLIS_UPPER_SHIFT 5 -#define BLIS_DIAG_SHIFT 6 -#define BLIS_LOWER_SHIFT 7 -#define BLIS_UNIT_DIAG_SHIFT 8 -#define BLIS_INVERT_DIAG_SHIFT 9 -#define BLIS_TARGET_DT_SHIFT 10 -#define BLIS_TARGET_DOMAIN_SHIFT 10 -#define BLIS_TARGET_PREC_SHIFT 11 -#define BLIS_EXEC_DT_SHIFT 13 -#define BLIS_EXEC_DOMAIN_SHIFT 13 -#define BLIS_EXEC_PREC_SHIFT 14 -#define BLIS_PACK_SCHEMA_SHIFT 16 -#define BLIS_PACK_RC_SHIFT 16 -#define BLIS_PACK_PANEL_SHIFT 17 -#define BLIS_PACK_FORMAT_SHIFT 18 -#define BLIS_PACK_SHIFT 22 -#define BLIS_PACK_REV_IF_UPPER_SHIFT 23 -#define BLIS_PACK_REV_IF_LOWER_SHIFT 24 -#define BLIS_PACK_BUFFER_SHIFT 25 -#define BLIS_STRUC_SHIFT 27 -#define BLIS_COMP_DT_SHIFT 29 -#define BLIS_COMP_DOMAIN_SHIFT 29 -#define BLIS_COMP_PREC_SHIFT 30 - -// info2 -#define BLIS_SCALAR_DT_SHIFT 0 -#define BLIS_SCALAR_DOMAIN_SHIFT 0 -#define BLIS_SCALAR_PREC_SHIFT 1 - -// -// -- BLIS info bit field masks ------------------------------------------------ -// - -// info -#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) -#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) -#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) -#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) -#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) -#define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) -#define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) -#define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) -#define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) -#define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) -#define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) -#define BLIS_INVERT_DIAG_BIT ( 0x1 << 
BLIS_INVERT_DIAG_SHIFT ) -#define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) -#define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) -#define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) -#define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) -#define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) -#define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) -#define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) -#define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) -#define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) -#define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) -#define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) -#define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) -#define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) -#define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) -#define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) -#define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) - -// info2 -#define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) -#define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) -#define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) - - -// -// -- BLIS enumerated type value definitions ----------------------------------- -// - -#define BLIS_BITVAL_REAL 0x0 -#define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT -#define BLIS_BITVAL_SINGLE_PREC 0x0 -#define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT -#define BLIS_BITVAL_FLOAT_TYPE 0x0 -#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT -#define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT -#define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) -#define BLIS_BITVAL_INT_TYPE 0x04 -#define BLIS_BITVAL_CONST_TYPE 0x05 -#define BLIS_BITVAL_NO_TRANS 0x0 -#define BLIS_BITVAL_TRANS BLIS_TRANS_BIT -#define 
BLIS_BITVAL_NO_CONJ 0x0 -#define BLIS_BITVAL_CONJ BLIS_CONJ_BIT -#define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) -#define BLIS_BITVAL_ZEROS 0x0 -#define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) -#define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) -#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS -#define BLIS_BITVAL_NONUNIT_DIAG 0x0 -#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT -#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT -#define BLIS_BITVAL_NOT_PACKED 0x0 -#define BLIS_BITVAL_4MI ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_3MI ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_4MS ( 0x3 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_3MS ( 0x4 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) -#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) -#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) -#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | 
BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) -#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) -#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 -#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT -#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 -#define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT -#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 -#define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) -#define BLIS_BITVAL_GENERAL 0x0 -#define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) -#define BLIS_BITVAL_SYMMETRIC ( 0x2 << 
BLIS_STRUC_SHIFT ) -#define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) - - -// -// -- BLIS enumerated type definitions ----------------------------------------- -// - -// -- Operational parameter types -- - -typedef enum -{ - BLIS_NO_TRANSPOSE = 0x0, - BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, - BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, - BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS -} trans_t; - -typedef enum -{ - BLIS_NO_CONJUGATE = 0x0, - BLIS_CONJUGATE = BLIS_BITVAL_CONJ -} conj_t; - -typedef enum -{ - BLIS_ZEROS = BLIS_BITVAL_ZEROS, - BLIS_LOWER = BLIS_BITVAL_LOWER, - BLIS_UPPER = BLIS_BITVAL_UPPER, - BLIS_DENSE = BLIS_BITVAL_DENSE -} uplo_t; - -typedef enum -{ - BLIS_LEFT = 0x0, - BLIS_RIGHT -} side_t; - -typedef enum -{ - BLIS_NONUNIT_DIAG = 0x0, - BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG -} diag_t; - -typedef enum -{ - BLIS_NO_INVERT_DIAG = 0x0, - BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG -} invdiag_t; - -typedef enum -{ - BLIS_GENERAL = BLIS_BITVAL_GENERAL, - BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, - BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, - BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR -} struc_t; - - -// -- Data type -- - -typedef enum -{ - BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, - BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, - BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, - BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, - BLIS_INT = BLIS_BITVAL_INT_TYPE, - BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, - BLIS_DT_LO = BLIS_FLOAT, - BLIS_DT_HI = BLIS_DCOMPLEX -} num_t; - -typedef enum -{ - BLIS_REAL = BLIS_BITVAL_REAL, - BLIS_COMPLEX = BLIS_BITVAL_COMPLEX -} dom_t; - -typedef enum -{ - BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, - BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC -} prec_t; - - -// -- Pack schema type -- - -typedef enum -{ - BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, - BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, - BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, - BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, - BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, - 
BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, - BLIS_PACKED_ROW_PANELS_4MI = BLIS_BITVAL_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI = BLIS_BITVAL_PACKED_COL_PANELS_4MI, - BLIS_PACKED_ROW_PANELS_3MI = BLIS_BITVAL_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI = BLIS_BITVAL_PACKED_COL_PANELS_3MI, - BLIS_PACKED_ROW_PANELS_4MS = BLIS_BITVAL_PACKED_ROW_PANELS_4MS, - BLIS_PACKED_COL_PANELS_4MS = BLIS_BITVAL_PACKED_COL_PANELS_4MS, - BLIS_PACKED_ROW_PANELS_3MS = BLIS_BITVAL_PACKED_ROW_PANELS_3MS, - BLIS_PACKED_COL_PANELS_3MS = BLIS_BITVAL_PACKED_COL_PANELS_3MS, - BLIS_PACKED_ROW_PANELS_RO = BLIS_BITVAL_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO = BLIS_BITVAL_PACKED_COL_PANELS_RO, - BLIS_PACKED_ROW_PANELS_IO = BLIS_BITVAL_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, - BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI, - BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, - BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, - BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R -} pack_t; - -// We combine row and column packing into one "type", and we start -// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the -// schema pair for "4ms" (4m separated), because its bit value has -// been reserved, even though we don't use it. 
-#define BLIS_NUM_PACK_SCHEMA_TYPES 10 - - -// -- Pack order type -- - -typedef enum -{ - BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, - BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, - - BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, - BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER -} packord_t; - - -// -- Pack buffer type -- - -typedef enum -{ - BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, - BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, - BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, - BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE -} packbuf_t; - - -// -- Partitioning direction -- - -typedef enum -{ - BLIS_FWD, - BLIS_BWD -} dir_t; - - -// -- Subpartition type -- - -typedef enum -{ - BLIS_SUBPART0, - BLIS_SUBPART1, - BLIS_SUBPART2, - BLIS_SUBPART1AND0, - BLIS_SUBPART1AND2, - BLIS_SUBPART1A, - BLIS_SUBPART1B, - BLIS_SUBPART00, - BLIS_SUBPART10, - BLIS_SUBPART20, - BLIS_SUBPART01, - BLIS_SUBPART11, - BLIS_SUBPART21, - BLIS_SUBPART02, - BLIS_SUBPART12, - BLIS_SUBPART22 -} subpart_t; - - -// -- Matrix dimension type -- - -typedef enum -{ - BLIS_M = 0, - BLIS_N = 1 -} mdim_t; - - -// -- Machine parameter types -- - -typedef enum -{ - BLIS_MACH_EPS = 0, - BLIS_MACH_SFMIN, - BLIS_MACH_BASE, - BLIS_MACH_PREC, - BLIS_MACH_NDIGMANT, - BLIS_MACH_RND, - BLIS_MACH_EMIN, - BLIS_MACH_RMIN, - BLIS_MACH_EMAX, - BLIS_MACH_RMAX, - BLIS_MACH_EPS2 -} machval_t; - -#define BLIS_NUM_MACH_PARAMS 11 -#define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS -#define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 - - -// -- Induced method types -- - -typedef enum -{ - BLIS_3MH = 0, - BLIS_3M1, - BLIS_4MH, - BLIS_4M1B, - BLIS_4M1A, - BLIS_1M, - BLIS_NAT, - BLIS_IND_FIRST = 0, - BLIS_IND_LAST = BLIS_NAT -} ind_t; - -#define BLIS_NUM_IND_METHODS (BLIS_NAT+1) - -// These are used in bli_*_oapi.c to construct the ind_t values from -// the induced method substrings that go into function names. 
-#define bli_3mh BLIS_3MH -#define bli_3m1 BLIS_3M1 -#define bli_4mh BLIS_4MH -#define bli_4mb BLIS_4M1B -#define bli_4m1 BLIS_4M1A -#define bli_1m BLIS_1M -#define bli_nat BLIS_NAT - - -// -- Kernel ID types -- - -typedef enum -{ - BLIS_ADDV_KER = 0, - BLIS_AMAXV_KER, - BLIS_AXPBYV_KER, - BLIS_AXPYV_KER, - BLIS_COPYV_KER, - BLIS_DOTV_KER, - BLIS_DOTXV_KER, - BLIS_INVERTV_KER, - BLIS_SCALV_KER, - BLIS_SCAL2V_KER, - BLIS_SETV_KER, - BLIS_SUBV_KER, - BLIS_SWAPV_KER, - BLIS_XPBYV_KER -} l1vkr_t; - -#define BLIS_NUM_LEVEL1V_KERS 14 - - -typedef enum -{ - BLIS_AXPY2V_KER = 0, - BLIS_DOTAXPYV_KER, - BLIS_AXPYF_KER, - BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER -} l1fkr_t; - -#define BLIS_NUM_LEVEL1F_KERS 5 - - -typedef enum -{ - BLIS_PACKM_0XK_KER = 0, - BLIS_PACKM_1XK_KER = 1, - BLIS_PACKM_2XK_KER = 2, - BLIS_PACKM_3XK_KER = 3, - BLIS_PACKM_4XK_KER = 4, - BLIS_PACKM_5XK_KER = 5, - BLIS_PACKM_6XK_KER = 6, - BLIS_PACKM_7XK_KER = 7, - BLIS_PACKM_8XK_KER = 8, - BLIS_PACKM_9XK_KER = 9, - BLIS_PACKM_10XK_KER = 10, - BLIS_PACKM_11XK_KER = 11, - BLIS_PACKM_12XK_KER = 12, - BLIS_PACKM_13XK_KER = 13, - BLIS_PACKM_14XK_KER = 14, - BLIS_PACKM_15XK_KER = 15, - BLIS_PACKM_16XK_KER = 16, - BLIS_PACKM_17XK_KER = 17, - BLIS_PACKM_18XK_KER = 18, - BLIS_PACKM_19XK_KER = 19, - BLIS_PACKM_20XK_KER = 20, - BLIS_PACKM_21XK_KER = 21, - BLIS_PACKM_22XK_KER = 22, - BLIS_PACKM_23XK_KER = 23, - BLIS_PACKM_24XK_KER = 24, - BLIS_PACKM_25XK_KER = 25, - BLIS_PACKM_26XK_KER = 26, - BLIS_PACKM_27XK_KER = 27, - BLIS_PACKM_28XK_KER = 28, - BLIS_PACKM_29XK_KER = 29, - BLIS_PACKM_30XK_KER = 30, - BLIS_PACKM_31XK_KER = 31, - - BLIS_UNPACKM_0XK_KER = 0, - BLIS_UNPACKM_1XK_KER = 1, - BLIS_UNPACKM_2XK_KER = 2, - BLIS_UNPACKM_3XK_KER = 3, - BLIS_UNPACKM_4XK_KER = 4, - BLIS_UNPACKM_5XK_KER = 5, - BLIS_UNPACKM_6XK_KER = 6, - BLIS_UNPACKM_7XK_KER = 7, - BLIS_UNPACKM_8XK_KER = 8, - BLIS_UNPACKM_9XK_KER = 9, - BLIS_UNPACKM_10XK_KER = 10, - BLIS_UNPACKM_11XK_KER = 11, - BLIS_UNPACKM_12XK_KER = 12, - BLIS_UNPACKM_13XK_KER = 
13, - BLIS_UNPACKM_14XK_KER = 14, - BLIS_UNPACKM_15XK_KER = 15, - BLIS_UNPACKM_16XK_KER = 16, - BLIS_UNPACKM_17XK_KER = 17, - BLIS_UNPACKM_18XK_KER = 18, - BLIS_UNPACKM_19XK_KER = 19, - BLIS_UNPACKM_20XK_KER = 20, - BLIS_UNPACKM_21XK_KER = 21, - BLIS_UNPACKM_22XK_KER = 22, - BLIS_UNPACKM_23XK_KER = 23, - BLIS_UNPACKM_24XK_KER = 24, - BLIS_UNPACKM_25XK_KER = 25, - BLIS_UNPACKM_26XK_KER = 26, - BLIS_UNPACKM_27XK_KER = 27, - BLIS_UNPACKM_28XK_KER = 28, - BLIS_UNPACKM_29XK_KER = 29, - BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31 - -} l1mkr_t; - -#define BLIS_NUM_PACKM_KERS 32 -#define BLIS_NUM_UNPACKM_KERS 32 - - -typedef enum -{ - BLIS_GEMM_UKR = 0, - BLIS_GEMMTRSM_L_UKR, - BLIS_GEMMTRSM_U_UKR, - BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR -} l3ukr_t; - -#define BLIS_NUM_LEVEL3_UKRS 5 - - -typedef enum -{ - BLIS_REFERENCE_UKERNEL = 0, - BLIS_VIRTUAL_UKERNEL, - BLIS_OPTIMIZED_UKERNEL, - BLIS_NOTAPPLIC_UKERNEL -} kimpl_t; - -#define BLIS_NUM_UKR_IMPL_TYPES 4 - - -#if 0 -typedef enum -{ - // RV = row-stored, contiguous vector-loading - // RG = row-stored, non-contiguous gather-loading - // CV = column-stored, contiguous vector-loading - // CG = column-stored, non-contiguous gather-loading - - // RD = row-stored, dot-based - // CD = col-stored, dot-based - - // RC = row-stored, column-times-column - // CR = column-stored, row-times-row - - // GX = general-stored generic implementation - - BLIS_GEMMSUP_RV_UKR = 0, - BLIS_GEMMSUP_RG_UKR, - BLIS_GEMMSUP_CV_UKR, - BLIS_GEMMSUP_CG_UKR, - - BLIS_GEMMSUP_RD_UKR, - BLIS_GEMMSUP_CD_UKR, - - BLIS_GEMMSUP_RC_UKR, - BLIS_GEMMSUP_CR_UKR, - - BLIS_GEMMSUP_GX_UKR, -} l3sup_t; - -#define BLIS_NUM_LEVEL3_SUP_UKRS 9 -#endif - - -typedef enum -{ - // 3-operand storage combinations - BLIS_RRR = 0, - BLIS_RRC, // 1 - BLIS_RCR, // 2 - BLIS_RCC, // 3 - BLIS_CRR, // 4 - BLIS_CRC, // 5 - BLIS_CCR, // 6 - BLIS_CCC, // 7 - BLIS_XXX, // 8 - -#if 0 - BLIS_RRG, - BLIS_RCG, - BLIS_RGR, - BLIS_RGC, - BLIS_RGG, - BLIS_CRG, - BLIS_CCG, - BLIS_CGR, - 
BLIS_CGC, - BLIS_CGG, - BLIS_GRR, - BLIS_GRC, - BLIS_GRG, - BLIS_GCR, - BLIS_GCC, - BLIS_GCG, - BLIS_GGR, - BLIS_GGC, - BLIS_GGG, -#endif -} stor3_t; - -#define BLIS_NUM_3OP_RC_COMBOS 9 -//#define BLIS_NUM_3OP_RCG_COMBOS 27 - - -#if 0 -typedef enum -{ - BLIS_JC_IDX = 0, - BLIS_PC_IDX, - BLIS_IC_IDX, - BLIS_JR_IDX, - BLIS_IR_IDX, - BLIS_PR_IDX -} thridx_t; -#endif - -#define BLIS_NUM_LOOPS 6 - - -// -- Operation ID type -- - -typedef enum -{ -// -// NOTE: If/when additional type values are added to this enum, -// you must either: -// - keep the level-3 values (starting with _GEMM) beginning at -// index 0; or -// - if the value range is moved such that it does not begin at -// index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START -// value that can be subtracted from the opid_t value to map it -// to a zero-based range. -// This is needed because these level-3 opid_t values are used in -// bli_l3_ind.c to index into arrays. -// - BLIS_GEMM = 0, - BLIS_HEMM, - BLIS_HERK, - BLIS_HER2K, - BLIS_SYMM, - BLIS_SYRK, - BLIS_SYR2K, - BLIS_TRMM3, - BLIS_TRMM, - BLIS_TRSM, - BLIS_GEMMT, - BLIS_NOID -} opid_t; - -#define BLIS_NUM_LEVEL3_OPS 11 - - -// -- Blocksize ID type -- - -typedef enum -{ - // NOTE: the level-3 blocksizes MUST be indexed starting at zero. - // At one point, we made this assumption in bli_cntx_set_blkszs() - // and friends. - - BLIS_KR = 0, - BLIS_MR, - BLIS_NR, - BLIS_MC, - BLIS_KC, - BLIS_NC, - - BLIS_M2, // level-2 blocksize in m dimension - BLIS_N2, // level-2 blocksize in n dimension - - BLIS_AF, // level-1f axpyf fusing factor - BLIS_DF, // level-1f dotxf fusing factor - BLIS_XF, // level-1f dotxaxpyf fusing factor - - BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
-} bszid_t; - -#define BLIS_NUM_BLKSZS 11 - - -// -- Threshold ID type -- - -typedef enum -{ - BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension - BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension - BLIS_KT // level-3 small/unpacked matrix threshold in k dimension - -} threshid_t; - -#define BLIS_NUM_THRESH 3 - - -// -- Architecture ID type -- - -// NOTE: This typedef enum must be kept up-to-date with the arch_t -// string array in bli_arch.c. Whenever values are added/inserted -// OR if values are rearranged, be sure to update the string array -// in bli_arch.c. - -typedef enum -{ - // Intel - BLIS_ARCH_SKX = 0, - BLIS_ARCH_KNL, - BLIS_ARCH_KNC, - BLIS_ARCH_HASWELL, - BLIS_ARCH_SANDYBRIDGE, - BLIS_ARCH_PENRYN, - - // AMD - BLIS_ARCH_ZEN2, - BLIS_ARCH_ZEN, - BLIS_ARCH_EXCAVATOR, - BLIS_ARCH_STEAMROLLER, - BLIS_ARCH_PILEDRIVER, - BLIS_ARCH_BULLDOZER, - - // ARM - BLIS_ARCH_THUNDERX2, - BLIS_ARCH_CORTEXA57, - BLIS_ARCH_CORTEXA53, - BLIS_ARCH_CORTEXA15, - BLIS_ARCH_CORTEXA9, - - // IBM/Power - BLIS_ARCH_POWER9, - BLIS_ARCH_POWER7, - BLIS_ARCH_BGQ, - - // Generic architecture/configuration - BLIS_ARCH_GENERIC - -} arch_t; - -// NOTE: This value must be updated to reflect the number of enum values -// listed above for arch_t! -#define BLIS_NUM_ARCHS (BLIS_ARCH_GENERIC+1) - - -// -// -- BLIS misc. structure types ----------------------------------------------- -// - -// These headers must be included here (or earlier) because definitions they -// provide are needed in the pool_t and related structs. -// begin bli_pthread.h - - -#ifndef BLIS_PTHREAD_H -#define BLIS_PTHREAD_H - -#if defined(_MSC_VER) - -// This branch defines a pthread-like API, bli_pthread_*(), and implements it -// in terms of Windows API calls. 
- -// -- pthread_mutex_*() -- - -typedef SRWLOCK bli_pthread_mutex_t; -typedef void bli_pthread_mutexattr_t; - -#define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT - -BLIS_EXPORT_BLIS int bli_pthread_mutex_init - ( - bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_lock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock - ( - bli_pthread_mutex_t* mutex - ); - -// -- pthread_once_*() -- - -typedef INIT_ONCE bli_pthread_once_t; - -#define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT - -BLIS_EXPORT_BLIS void bli_pthread_once - ( - bli_pthread_once_t* once, - void (*init)(void) - ); - -// -- pthread_cond_*() -- - -typedef CONDITION_VARIABLE bli_pthread_cond_t; -typedef void bli_pthread_condattr_t; - -#define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT - -BLIS_EXPORT_BLIS int bli_pthread_cond_init - ( - bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_destroy - ( - bli_pthread_cond_t* cond - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_wait - ( - bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast - ( - bli_pthread_cond_t* cond - ); - -// -- pthread_create(), pthread_join() -- - -typedef struct -{ - HANDLE handle; - void* retval; -} bli_pthread_t; - -typedef void bli_pthread_attr_t; - -BLIS_EXPORT_BLIS int bli_pthread_create - ( - bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg - ); - -BLIS_EXPORT_BLIS int bli_pthread_join - ( - bli_pthread_t thread, - void** retval - ); - -// -- pthread_barrier_*() -- - -typedef void bli_pthread_barrierattr_t; - -typedef struct -{ - bli_pthread_mutex_t mutex; - bli_pthread_cond_t cond; 
- int count; - int tripCount; -} bli_pthread_barrier_t; - -BLIS_EXPORT_BLIS int bli_pthread_barrier_init - ( - bli_pthread_barrier_t* barrier, - const bli_pthread_barrierattr_t* attr, - unsigned int count - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy - ( - bli_pthread_barrier_t* barrier - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_wait - ( - bli_pthread_barrier_t* barrier - ); - -#else // !defined(_MSC_VER) - -#include // skipped - -// This branch defines a pthreads-like API, bli_pthreads_*(), and implements it -// in terms of the corresponding pthreads_*() types, macros, and function calls. - -// -- pthread types -- - -typedef pthread_t bli_pthread_t; -typedef pthread_attr_t bli_pthread_attr_t; -typedef pthread_mutex_t bli_pthread_mutex_t; -typedef pthread_mutexattr_t bli_pthread_mutexattr_t; -typedef pthread_cond_t bli_pthread_cond_t; -typedef pthread_condattr_t bli_pthread_condattr_t; -typedef pthread_once_t bli_pthread_once_t; - -#if defined(__APPLE__) - -// For OS X, we must define the barrier types ourselves since Apple does -// not implement barriers in their variant of pthreads. - -typedef void bli_pthread_barrierattr_t; - -typedef struct -{ - bli_pthread_mutex_t mutex; - bli_pthread_cond_t cond; - int count; - int tripCount; -} bli_pthread_barrier_t; - -#else - -// For other non-Windows OSes (primarily Linux), we can define the barrier -// types in terms of existing pthreads barrier types since we expect they -// will be provided by the pthreads implementation. 
- -typedef pthread_barrier_t bli_pthread_barrier_t; -typedef pthread_barrierattr_t bli_pthread_barrierattr_t; - -#endif - -// -- pthreads macros -- - -#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -#define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER -#define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT - -// -- pthread_create(), pthread_join() -- - -BLIS_EXPORT_BLIS int bli_pthread_create - ( - bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg - ); - -BLIS_EXPORT_BLIS int bli_pthread_join - ( - bli_pthread_t thread, - void** retval - ); - -// -- pthread_mutex_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_mutex_init - ( - bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_lock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock - ( - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock - ( - bli_pthread_mutex_t* mutex - ); - -// -- pthread_cond_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_cond_init - ( - bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_destroy - ( - bli_pthread_cond_t* cond - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_wait - ( - bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex - ); - -BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast - ( - bli_pthread_cond_t* cond - ); - -// -- pthread_once_*() -- - -BLIS_EXPORT_BLIS void bli_pthread_once - ( - bli_pthread_once_t* once, - void (*init)(void) - ); - -// -- pthread_barrier_*() -- - -BLIS_EXPORT_BLIS int bli_pthread_barrier_init - ( - bli_pthread_barrier_t* barrier, - const bli_pthread_barrierattr_t* attr, - unsigned int count - ); - -BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy - ( - bli_pthread_barrier_t* barrier - ); - -BLIS_EXPORT_BLIS int 
bli_pthread_barrier_wait - ( - bli_pthread_barrier_t* barrier - ); - -#endif // _MSC_VER - -#endif // BLIS_PTHREAD_H -// end bli_pthread.h -// begin bli_malloc.h - - -// Typedef function pointer types for malloc() and free() substitutes. -typedef void* (*malloc_ft) ( size_t size ); -typedef void (*free_ft) ( void* p ); - -// ----------------------------------------------------------------------------- - -#if 0 -BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); -BLIS_EXPORT_BLIS void bli_free_pool( void* p ); -#endif - -void* bli_malloc_intl( size_t size ); -void* bli_calloc_intl( size_t size ); -void bli_free_intl( void* p ); - -BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size ); -BLIS_EXPORT_BLIS void bli_free_user( void* p ); - -// ----------------------------------------------------------------------------- - -void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size ); -void bli_ffree_align( free_ft f, void* p ); - -void* bli_fmalloc_noalign( malloc_ft f, size_t size ); -void bli_ffree_noalign( free_ft f, void* p ); - -void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); -void bli_fmalloc_post_check( void* p ); - -// end bli_malloc.h - -// -- Pool block type -- - -typedef struct -{ - void* buf; - siz_t block_size; - -} pblk_t; - - -// -- Pool type -- - -typedef struct -{ - void* block_ptrs; - dim_t block_ptrs_len; - - dim_t top_index; - dim_t num_blocks; - - siz_t block_size; - siz_t align_size; - siz_t offset_size; - - malloc_ft malloc_fp; - free_ft free_fp; - -} pool_t; - - -// -- Array type -- - -typedef struct -{ - void* buf; - - siz_t num_elem; - siz_t elem_size; - -} array_t; - - -// -- Locked pool-of-arrays-of-pools type -- - -typedef struct -{ - bli_pthread_mutex_t mutex; - pool_t pool; - - siz_t def_array_len; - -} apool_t; - - -// -- packing block allocator: Locked set of pools type -- - -typedef struct membrk_s -{ - pool_t pools[3]; - bli_pthread_mutex_t mutex; - - // These fields are used for 
general-purpose allocation. - siz_t align_size; - malloc_ft malloc_fp; - free_ft free_fp; - -} membrk_t; - - -// -- Memory object type -- - -typedef struct mem_s -{ - pblk_t pblk; - packbuf_t buf_type; - pool_t* pool; - siz_t size; -} mem_t; - - -// -- Control tree node type -- - -struct cntl_s -{ - // Basic fields (usually required). - opid_t family; - bszid_t bszid; - void_fp var_func; - struct cntl_s* sub_prenode; - struct cntl_s* sub_node; - - // Optional fields (needed only by some operations such as packm). - // NOTE: first field of params must be a uint64_t containing the size - // of the struct. - void* params; - - // Internal fields that track "cached" data. - mem_t pack_mem; -}; -typedef struct cntl_s cntl_t; - - -// -- Blocksize object type -- - -typedef struct blksz_s -{ - // Primary blocksize values. - dim_t v[BLIS_NUM_FP_TYPES]; - - // Blocksize extensions. - dim_t e[BLIS_NUM_FP_TYPES]; - -} blksz_t; - - -// -- Function pointer object type -- - -typedef struct func_s -{ - // Kernel function address. - void_fp ptr[BLIS_NUM_FP_TYPES]; - -} func_t; - - -// -- Multi-boolean object type -- - -typedef struct mbool_s -{ - bool_t v[BLIS_NUM_FP_TYPES]; - -} mbool_t; - - -// -- Auxiliary kernel info type -- - -// Note: This struct is used by macro-kernels to package together extra -// parameter values that may be of use to the micro-kernel without -// cluttering up the micro-kernel interface itself. - -typedef struct -{ - // The pack schemas of A and B. - pack_t schema_a; - pack_t schema_b; - - // Pointers to the micro-panels of A and B which will be used by the - // next call to the micro-kernel. - void* a_next; - void* b_next; - - // The imaginary strides of A and B. - inc_t is_a; - inc_t is_b; - - // The panel strides of A and B. - // NOTE: These are only used in situations where iteration over the - // micropanels takes place in part within the kernel code (e.g. sup - // millikernels). - inc_t ps_a; - inc_t ps_b; - - // The type to convert to on output. 
- //num_t dt_on_output; - -} auxinfo_t; - - -// -- Global scalar constant data struct -- - -// Note: This struct is used only when statically initializing the -// global scalar constants in bli_const.c. -typedef struct constdata_s -{ - float s; - double d; - scomplex c; - dcomplex z; - gint_t i; - -} constdata_t; - - -// -// -- BLIS object type definitions --------------------------------------------- -// - -typedef struct obj_s -{ - // Basic fields - struct obj_s* root; - - dim_t off[2]; - dim_t dim[2]; - doff_t diag_off; - - objbits_t info; - objbits_t info2; - siz_t elem_size; - - void* buffer; - inc_t rs; - inc_t cs; - inc_t is; - - // Bufferless scalar storage - atom_t scalar; - - // Pack-related fields - dim_t m_padded; // m dimension of matrix, including any padding - dim_t n_padded; // n dimension of matrix, including any padding - inc_t ps; // panel stride (distance to next panel) - inc_t pd; // panel dimension (the "width" of a panel: - // usually MR or NR) - dim_t m_panel; // m dimension of a "full" panel - dim_t n_panel; // n dimension of a "full" panel -} obj_t; - -// Pre-initializors. 
Things that must be set afterwards: -// - root object pointer -// - info bitfields: dt, target_dt, exec_dt, comp_dt -// - info2 bitfields: scalar_dt -// - elem_size -// - dims, strides -// - buffer -// - internal scalar buffer (must always set imaginary component) - -#define BLIS_OBJECT_INITIALIZER \ -{ \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 0, 0 }, \ - .diag_off = 0, \ -\ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), \ -\ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ -\ - .scalar = { 0.0, 0.0 }, \ -\ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0 \ -} - -#define BLIS_OBJECT_INITIALIZER_1X1 \ -{ \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ -\ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), \ -\ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ -\ - .scalar = { 0.0, 0.0 }, \ -\ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0 \ -} - -// Define these macros here since they must be updated if contents of -// obj_t changes. 
- -static void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) -{ - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - b->dim[0] = a->dim[0]; - b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; -} - -static void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) -{ - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - // Avoid copying m and n since they will be overwritten. - //b->dim[0] = a->dim[0]; - //b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - // Avoid copying pack_mem entry. - // FGVZ: You should probably make sure this is right. - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; -} - -// Initializors for global scalar constants. -// NOTE: These must remain cpp macros since they are initializor -// expressions, not functions. 
- -#define bli_obj_init_const( buffer0 ) \ -{ \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ -\ - .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ - BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( constdata_t ), \ -\ - .buffer = buffer0, \ - .rs = 1, \ - .cs = 1, \ - .is = 1 \ -} - -#define bli_obj_init_constdata( val ) \ -{ \ - .s = ( float )val, \ - .d = ( double )val, \ - .c = { .real = ( float )val, .imag = 0.0f }, \ - .z = { .real = ( double )val, .imag = 0.0 }, \ - .i = ( gint_t )val, \ -} - - -// -- Context type -- - -typedef struct cntx_s -{ - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - bszid_t bmults[ BLIS_NUM_BLKSZS ]; - - blksz_t trsm_blkszs[ BLIS_NUM_BLKSZS ]; - - func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; - - blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; - void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; - blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; - func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; - mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; - - func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; - func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - - func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; - func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; - - ind_t method; - pack_t schema_a_block; - pack_t schema_b_panel; - pack_t schema_c_panel; - -} cntx_t; - - -// -- Runtime type -- - -// NOTE: The order of these fields must be kept consistent with the definition -// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. - -typedef struct rntm_s -{ - // "External" fields: these may be queried by the end-user. - bool_t auto_factor; - - dim_t num_threads; - dim_t thrloop[ BLIS_NUM_LOOPS ]; - bool_t pack_a; // enable/disable packing of left-hand matrix A. - bool_t pack_b; // enable/disable packing of right-hand matrix B. - bool_t l3_sup; // enable/disable small matrix handling in level-3 ops. 
- - // "Internal" fields: these should not be exposed to the end-user. - - // The small block pool, which is attached in the l3 thread decorator. - pool_t* sba_pool; - - // The packing block allocator, which is attached in the l3 thread decorator. - membrk_t* membrk; - -} rntm_t; - - -// -- Error types -- - -typedef enum -{ - BLIS_NO_ERROR_CHECKING = 0, - BLIS_FULL_ERROR_CHECKING -} errlev_t; - -typedef enum -{ - // Generic error codes - BLIS_SUCCESS = ( -1), - BLIS_FAILURE = ( -2), - - BLIS_ERROR_CODE_MIN = ( -9), - - // General errors - BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), - BLIS_UNDEFINED_ERROR_CODE = ( -11), - BLIS_NULL_POINTER = ( -12), - BLIS_NOT_YET_IMPLEMENTED = ( -13), - - // Parameter-specific errors - BLIS_INVALID_SIDE = ( -20), - BLIS_INVALID_UPLO = ( -21), - BLIS_INVALID_TRANS = ( -22), - BLIS_INVALID_CONJ = ( -23), - BLIS_INVALID_DIAG = ( -24), - BLIS_INVALID_MACHVAL = ( -25), - BLIS_EXPECTED_NONUNIT_DIAG = ( -26), - - // Datatype-specific errors - BLIS_INVALID_DATATYPE = ( -30), - BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), - BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), - BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), - BLIS_EXPECTED_REAL_DATATYPE = ( -34), - BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), - BLIS_INCONSISTENT_DATATYPES = ( -36), - BLIS_EXPECTED_REAL_PROJ_OF = ( -37), - BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), - BLIS_INCONSISTENT_PRECISIONS = ( -39), - - // Dimension-specific errors - BLIS_NONCONFORMAL_DIMENSIONS = ( -40), - BLIS_EXPECTED_SCALAR_OBJECT = ( -41), - BLIS_EXPECTED_VECTOR_OBJECT = ( -42), - BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), - BLIS_EXPECTED_SQUARE_OBJECT = ( -44), - BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), - BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), - BLIS_UNEXPECTED_VECTOR_DIM = ( -47), - BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), - BLIS_NEGATIVE_DIMENSION = ( -49), - - // Stride-specific errors - BLIS_INVALID_ROW_STRIDE = ( -50), - BLIS_INVALID_COL_STRIDE = ( -51), - BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), - - // 
Structure-specific errors - BLIS_EXPECTED_GENERAL_OBJECT = ( -60), - BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), - BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), - BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), - - // Storage-specific errors - BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), - - // Partitioning-specific errors - BLIS_INVALID_3x1_SUBPART = ( -80), - BLIS_INVALID_1x3_SUBPART = ( -81), - BLIS_INVALID_3x3_SUBPART = ( -82), - - // Control tree-specific errors - BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), - - // Packing-specific errors - BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), - - // Buffer-specific errors - BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), - - // Memory errors - BLIS_MALLOC_RETURNED_NULL = (-120), - - // Internal memory pool errors - BLIS_INVALID_PACKBUF = (-130), - BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), - BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), - BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), - BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), - - // Object-related errors - BLIS_EXPECTED_OBJECT_ALIAS = (-140), - - // Architecture-related errors - BLIS_INVALID_ARCH_ID = (-150), - - // Blocksize-related errors - BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), - BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), - BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), - BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), - BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), - BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), - - BLIS_ERROR_CODE_MAX = (-170) -} err_t; - -#endif -// end bli_type_defs.h - - -enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; -enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; -enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; -enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; -enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; - -#ifdef __cplusplus -extern "C" { -#endif - - -BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, - f77_int incX, const float *Y, f77_int incY); -BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float 
*X, f77_int incX, const float *Y, - f77_int incY); -BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, - const float *Y, f77_int incY); -BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, - const double *Y, f77_int incY); - - -BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, - const void *Y, f77_int incY, void *dotu); -BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void *X, f77_int incX, - const void *Y, f77_int incY, void *dotc); - -BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, - const void *Y, f77_int incY, void *dotu); -BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, - const void *Y, f77_int incY, void *dotc); - - - -BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); -BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); - -BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); -BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); - -BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); -BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); - -BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); -BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); - - - -BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); -BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); -BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); -BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); - - - - -void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, - float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, - float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, 
float alpha, const float *X, - f77_int incX, float *Y, f77_int incY); - -void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, - double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, - double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, - f77_int incX, double *Y, f77_int incY); - -void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, - void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, const void *X, f77_int incX, - void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, - f77_int incX, void *Y, f77_int incY); - -void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, - void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, - void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, - f77_int incX, void *Y, f77_int incY); - - - -void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); -void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); -void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, - float *Y, f77_int incY, const float c, const float s); -void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, - float *Y, f77_int incY, const float *P); - -void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); -void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); -void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, - double *Y, f77_int incY, const double c, const double s); -void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, - double *Y, f77_int incY, const double *P); - - - -void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); -void BLIS_EXPORT_BLAS 
cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); - - - - -void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - float alpha, const float *A, f77_int lda, - const float *X, f77_int incX, float beta, - float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - f77_int KL, f77_int KU, float alpha, - const float *A, f77_int lda, const float *X, - f77_int incX, float beta, float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const float *A, f77_int lda, - float *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const float *A, f77_int lda, - float *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const float *Ap, float *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const float *A, f77_int lda, float *X, - f77_int incX); -void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const float *A, f77_int lda, - float *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
- enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const float *Ap, float *X, f77_int incX); - -void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - double alpha, const double *A, f77_int lda, - const double *X, f77_int incX, double beta, - double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - f77_int KL, f77_int KU, double alpha, - const double *A, f77_int lda, const double *X, - f77_int incX, double beta, double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const double *A, f77_int lda, - double *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const double *A, f77_int lda, - double *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const double *Ap, double *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const double *A, f77_int lda, double *X, - f77_int incX); -void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const double *A, f77_int lda, - double *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const double *Ap, double *X, f77_int incX); - -void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - const void 
*X, f77_int incX, const void *beta, - void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - f77_int KL, f77_int KU, const void *alpha, - const void *A, f77_int lda, const void *X, - f77_int incX, const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *A, f77_int lda, - void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const void *A, f77_int lda, - void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *Ap, void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *A, f77_int lda, void *X, - f77_int incX); -void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const void *A, f77_int lda, - void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *Ap, void *X, f77_int incX); - -void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - const void *X, f77_int incX, const void *beta, - void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, - enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, - f77_int KL, f77_int KU, const void *alpha, - const void *A, f77_int lda, const void *X, - f77_int incX, const void *beta, 
void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *A, f77_int lda, - void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const void *A, f77_int lda, - void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *Ap, void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *A, f77_int lda, void *X, - f77_int incX); -void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, f77_int K, const void *A, f77_int lda, - void *X, f77_int incX); -void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - f77_int N, const void *Ap, void *X, f77_int incX); - - - -void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const float *A, - f77_int lda, const float *X, f77_int incX, - float beta, float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, f77_int K, float alpha, const float *A, - f77_int lda, const float *X, f77_int incX, - float beta, float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const float *Ap, - const float *X, f77_int incX, - float beta, float *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, - float alpha, const float *X, f77_int incX, - const float *Y, 
f77_int incY, float *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const float *X, - f77_int incX, float *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const float *X, - f77_int incX, float *Ap); -void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const float *X, - f77_int incX, const float *Y, f77_int incY, float *A, - f77_int lda); -void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const float *X, - f77_int incX, const float *Y, f77_int incY, float *A); - -void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const double *A, - f77_int lda, const double *X, f77_int incX, - double beta, double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, f77_int K, double alpha, const double *A, - f77_int lda, const double *X, f77_int incX, - double beta, double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const double *Ap, - const double *X, f77_int incX, - double beta, double *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, - double alpha, const double *X, f77_int incX, - const double *Y, f77_int incY, double *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const double *X, - f77_int incX, double *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const double *X, - f77_int incX, double *Ap); -void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const double *X, - f77_int 
incX, const double *Y, f77_int incY, double *A, - f77_int lda); -void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const double *X, - f77_int incX, const double *Y, f77_int incY, double *A); - - - -void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, const void *alpha, const void *A, - f77_int lda, const void *X, f77_int incX, - const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, f77_int K, const void *alpha, const void *A, - f77_int lda, const void *X, f77_int incX, - const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, const void *alpha, const void *Ap, - const void *X, f77_int incX, - const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const void *X, f77_int incX, - void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, float alpha, const void *X, - f77_int incX, void *A); -void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *Ap); - -void BLIS_EXPORT_BLAS cblas_zhemv(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, const void *alpha, const void *A, - f77_int lda, const void *X, f77_int incX, - const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, f77_int K, const void *alpha, const void *A, - f77_int lda, const void *X, f77_int incX, - const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, const void *alpha, const void *Ap, - const void *X, f77_int incX, - const void *beta, void *Y, f77_int incY); -void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const void *X, f77_int incX, - void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - f77_int N, double alpha, const void *X, - f77_int incX, void *A); -void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *A, f77_int lda); -void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, - const void *alpha, const void *X, f77_int incX, - const void *Y, f77_int incY, void *Ap); - - - - -void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, - f77_int K, float alpha, const float *A, - f77_int lda, const float *B, f77_int ldb, - float beta, float *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, 
enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, f77_int M, f77_int N, - float alpha, const float *A, f77_int lda, - const float *B, f77_int ldb, float beta, - float *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - float alpha, const float *A, f77_int lda, - float beta, float *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - float alpha, const float *A, f77_int lda, - const float *B, f77_int ldb, float beta, - float *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - float alpha, const float *A, f77_int lda, - float *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - float alpha, const float *A, f77_int lda, - float *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, float alpha, const float *A, - f77_int lda, const float *B, f77_int ldb, - float beta, float *C, f77_int ldc); - -void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, - f77_int K, double alpha, const double *A, - f77_int lda, const double *B, f77_int ldb, - double beta, double *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, f77_int M, f77_int N, - double alpha, const double *A, f77_int lda, - const double *B, f77_int ldb, double beta, - double *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, 
enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - double alpha, const double *A, f77_int lda, - double beta, double *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - double alpha, const double *A, f77_int lda, - const double *B, f77_int ldb, double beta, - double *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - double alpha, const double *A, f77_int lda, - double *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - double alpha, const double *A, f77_int lda, - double *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, double alpha, const double *A, - f77_int lda, const double *B, f77_int ldb, - double beta, double *C, f77_int ldc); - -void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, - f77_int K, const void *alpha, const void *A, - f77_int lda, const void *B, f77_int ldb, - const void *beta, void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, const void *beta, - void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - const void *alpha, const void *A, f77_int lda, - const void *beta, void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_csyr2k(enum 
CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, const void *beta, - void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - void *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - void *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, const void *alpha, const void *A, - f77_int lda, const void *B, f77_int ldb, - const void *beta, void *C, f77_int ldc); - -void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, - f77_int K, const void *alpha, const void *A, - f77_int lda, const void *B, f77_int ldb, - const void *beta, void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, const void *beta, - void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - const void *alpha, const void *A, f77_int lda, - const void *beta, void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, const void *beta, - void 
*C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - void *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - void *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, const void *alpha, const void *A, - f77_int lda, const void *B, f77_int ldb, - const void *beta, void *C, f77_int ldc); - - - -void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, const void *beta, - void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - float alpha, const void *A, f77_int lda, - float beta, void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, float beta, - void *C, f77_int ldc); - -void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, - enum CBLAS_UPLO Uplo, f77_int M, f77_int N, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, const void *beta, - void *C, f77_int ldc); -void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - double alpha, const void *A, f77_int lda, - double beta, void *C, f77_int ldc); 
-void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, - const void *alpha, const void *A, f77_int lda, - const void *B, f77_int ldb, double beta, - void *C, f77_int ldc); - -void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); - -#ifdef __cplusplus -} -#endif -#endif -// end cblas.h - - -#endif // BLIS_ENABLE_CBLAS - -#endif -// end bli_cblas.h - -// -- Windows definitions - -// begin bli_winsys.h - - -//int bli_setenv( const char *name, const char *value, int overwrite ); -BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); - -// end bli_winsys.h - -// begin aocldtl.h - - -#ifndef _AOCLDTL_H_ -#define _AOCLDTL_H_ - -// begin aocldtlcf.h - - -#ifndef _AOCLDTLCF_H_ -#define _AOCLDTLCF_H_ - - -#define AOCL_DTL_TRACE_ENABLE 0 - - -#define AOCL_DTL_DUMP_ENABLE 0 - - -#define AOCL_DTL_LOG_ENABLE 0 - - - -#define AOCL_DTL_TRACE_LEVEL AOCL_DTL_LEVEL_TRACE_5 - - -#define AOCL_DTL_LEVEL_ALL (14) -#define AOCL_DTL_LEVEL_TRACE_8 (13) -#define AOCL_DTL_LEVEL_TRACE_7 (12) -#define AOCL_DTL_LEVEL_TRACE_6 (11) -#define AOCL_DTL_LEVEL_TRACE_5 (10) -#define AOCL_DTL_LEVEL_TRACE_4 (9) -#define AOCL_DTL_LEVEL_TRACE_3 (8) -#define AOCL_DTL_LEVEL_TRACE_2 (7) -#define AOCL_DTL_LEVEL_TRACE_1 (6) -#define AOCL_DTL_LEVEL_VERBOSE (5) -#define AOCL_DTL_LEVEL_INFO (4) -#define AOCL_DTL_LEVEL_MINOR (3) -#define AOCL_DTL_LEVEL_MAJOR (2) -#define AOCL_DTL_LEVEL_CRITICAL (1) - - -#define AOCL_DTL_TRACE_FILE "aocldtl_trace.txt" -#define AOCL_DTL_AUTO_TRACE_FILE "aocldtl_auto_trace.rawfile" -#define AOCL_DTL_LOG_FILE "aocldtl_log.txt" - - -#define AOCL_CHAR_DATA_TYPE (1) - - -#define AOCL_UINT16_DATA_TYPE (2) - - -#define AOCL_STRING_DATA_TYPE (3) - - -#define AOCL_UINT32_DATA_TYPE (4) - - -#define AOCL_LOG_HEX_VALUE ('x') - - -#define AOCL_LOG_DECIMAL_VALUE ('d') - - - -#endif - - -// end aocldtlcf.h -// begin aocltpdef.h - - -#ifndef AOCL_TYPEDEF_H_ -#define AOCL_TYPEDEF_H_ - 
-#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#ifndef _WIN32 -#include // skipped -#else -typedef int pid_t; -#endif - -typedef double Double; -typedef float Float; -typedef void Void; -typedef unsigned char uint8; -typedef unsigned short int uint16; -typedef unsigned int uint32; -typedef unsigned long uint64; -typedef uint8 *STRING; -typedef unsigned char Bool; -typedef char int8; -typedef signed long int int32; -typedef short int int16; - -typedef Void *AOCL_HANDLE; -typedef pid_t AOCL_TID; - -#endif - - -// end aocltpdef.h -// begin aoclflist.h - - -#ifndef _AOCL_FLIST_H_ -#define _AOCL_FLIST_H_ - -// begin aocltpdef.h - - -#ifndef AOCL_TYPEDEF_H_ -#define AOCL_TYPEDEF_H_ - -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#include // skipped -#ifndef _WIN32 -#include // skipped -#else -typedef int pid_t; -#endif - -typedef double Double; -typedef float Float; -typedef void Void; -typedef unsigned char uint8; -typedef unsigned short int uint16; -typedef unsigned int uint32; -typedef unsigned long uint64; -typedef uint8 *STRING; -typedef unsigned char Bool; -typedef char int8; -typedef signed long int int32; -typedef short int int16; - -typedef Void *AOCL_HANDLE; -typedef pid_t AOCL_TID; - -#endif - - -// end aocltpdef.h -// begin aoclfal.h - - -#ifndef _AOCL_FAL_H_ -#define _AOCL_FAL_H_ - - -#define AOCL_FAL_SUCCESS 0 -#define AOCL_FAL_CLOSE_ERROR -1 -#define AOCL_FAL_READ_ERROR -2 -#define AOCL_FAL_WRITE_ERROR -3 -#define AOCL_FAL_EOF_ERROR -6 -#define AOCL_FAL_FERROR -7 - - -#define AOCL_FAL_FILE FILE - - -int32 AOCL_FAL_Close( - AOCL_FAL_FILE *fpFilePointer); - -int32 AOCL_FAL_Error( - AOCL_FAL_FILE *fpFilePointer); - -AOCL_FAL_FILE *AOCL_FAL_Open( - const int8 *pchFileName, - const int8 *pchMode); - -int32 AOCL_FAL_Read( - void *pvBuffer, - int32 i32Size, - int32 i32Count, - AOCL_FAL_FILE *fpFilePointer); - -int32 AOCL_FAL_Write( - const void *pvBuffer, - int32 
i32Size, - int32 iCount, - AOCL_FAL_FILE *fpFilePointer); - -#endif - - -// end aoclfal.h - -typedef struct AOCL_FLIST_Node_t -{ - AOCL_TID tid; - AOCL_FAL_FILE *fp; - struct AOCL_FLIST_Node_t *pNext; -} AOCL_FLIST_Node; - -Bool AOCL_FLIST_IsEmpty( - AOCL_FLIST_Node *plist); - -AOCL_FAL_FILE *AOCL_FLIST_GetFile( - AOCL_FLIST_Node *plist, - AOCL_TID tid); - -AOCL_FAL_FILE *AOCL_FLIST_AddFile( - const int8 *pchFilePrefix, - AOCL_FLIST_Node **plist, - AOCL_TID tid); - -void AOCL_FLIST_CloseFile( - AOCL_FLIST_Node *plist, - AOCL_TID tid); - -void AOCL_FLIST_CloseAll( - AOCL_FLIST_Node *plist); - -#endif - - -// end aoclflist.h - -#define TRACE_TYPE_FENTRY (1) -#define TRACE_TYPE_FEXIT (2) -#define TRACE_TYPE_LOG (3) -#define TRACE_TYPE_RAW (4) - - -#define AOCL_DEBUGPRINT printf - - -#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_DUMP_ENABLE || AOCL_DTL_LOG_ENABLE) -#define AOCL_DTL_INITIALIZE_ENABLE -#endif - -#if AOCL_DTL_TRACE_ENABLE - -#define AOCL_DTL_TRACE_ENTRY(LogLevel) \ - DTL_Trace(LogLevel, \ - TRACE_TYPE_FENTRY, \ - __FILE__, \ - __FUNCTION__, \ - __LINE__, \ - NULL); -#else - -#define AOCL_DTL_TRACE_ENTRY(LogLevel) -#endif - -#if AOCL_DTL_TRACE_ENABLE - -#define AOCL_DTL_TRACE_EXIT(LogLevel) \ - DTL_Trace(LogLevel, \ - TRACE_TYPE_FEXIT, \ - __FILE__, \ - __FUNCTION__, \ - __LINE__, \ - NULL); - -#define AOCL_DTL_TRACE_EXIT_ERR(LogLevel, Message) \ - DTL_Trace(LogLevel, \ - TRACE_TYPE_FEXIT, \ - __FILE__, \ - __FUNCTION__, \ - __LINE__, \ - Message); -#else - -#define AOCL_DTL_TRACE_EXIT(LogLevel) -#define AOCL_DTL_TRACE_EXIT_ERR(LogLevel, Message) -#endif - -#if AOCL_DTL_DUMP_ENABLE - -#define AOCL_DTL_DUMP(LogLevel, Buffer, BufferSize, DataType, String, OutputType) \ - \ - DTL_DumpData(LogLevel, \ - Buffer, \ - BufferSize, \ - DataType, \ - String, \ - OutputType); -#else - -#define AOCL_DTL_DUMP(Buffer, BufferSize, DataType, String, OutputType) - -#endif - -#if AOCL_DTL_LOG_ENABLE - -#define AOCL_DTL_LOG(LogLevel, Message) \ - DTL_Trace(LogLevel, \ - 
TRACE_TYPE_LOG, \ - __FILE__, \ - __FUNCTION__, \ - __LINE__, \ - Message); -#else - -#define AOCL_DTL_LOG(LogLevel, Message) -#endif - - -#ifdef AOCL_DTL_INITIALIZE_ENABLE -#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL) \ - DTL_Initialize(CURRENT_LOG_LEVEL); -#else - -#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL) -#endif - - -#ifdef AOCL_DTL_INITIALIZE_ENABLE -#define AOCL_DTL_UNINITIALIZE() \ - DTL_Uninitialize(); -#else - -#define AOCL_DTL_UNINITIALIZE() -#endif - -#ifdef AOCL_DTL_INITIALIZE_ENABLE - -void DTL_Initialize( - uint32 ui32CurrentLogLevel); -void DTL_Uninitialize(void); -#endif - -#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_LOG_ENABLE) - -void DTL_Trace( - uint8 ui8LogLevel, - uint8 ui8LogType, - const int8 *pi8FileName, - const int8 *pi8FunctionName, - uint32 ui32LineNumber, - const int8 *pi8Message); - -#endif - -#if AOCL_DTL_DUMP_ENABLE - -void DTL_DumpData( - uint8 ui8LogLevel, - void *pvBuffer, - uint32 ui32BufferSize, - uint8 ui8DataType, - int8 *pi8Message, - int8 i8OutputType); -#endif - -#endif - - -// end aocldtl.h -// begin aocldtl_blis.h - - - -#ifndef __AOCLDTL_BLIS_H -#define __AOCLDTL_BLIS_H - -// skipped #include "blis.h" - -#if AOCL_DTL_LOG_ENABLE -void AOCL_DTL_log_gemm_sizes(int8 loglevel, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - const char* filename, - const char* functionn_name, - int line); - -#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, alpha, a, b, beta, c) \ - AOCL_DTL_log_gemm_sizes(loglevel, alpha, a, b, beta, c, __FILE__, __FUNCTION__, __LINE__); -#else -#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, alpha, a, b, beta, c) -#endif - -#endif - -// end aocldtl_blis.h - -// End extern "C" construct block. -#ifdef __cplusplus -} -#endif - -#endif - From cc9206df667b7c710b57b190b8ad351176de53b8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 16 Jul 2021 15:48:37 -0500 Subject: [PATCH 042/226] Added Graviton2 Neoverse N1 performance results. 
Details: - Added single-threaded and multithreaded performance results to docs/Performance.md. These results were gathered on a Graviton2 Neoverse N1 server. Special thanks to Nicholai Tukanov for collecting these results via the Arm-HPC/AWS hackathon. - Corrected what was supposed to be a temporary tweak to the legend labels in test/3/octave/plot_l3_perf.m. --- docs/Performance.md | 55 ++++++++++++++++++ .../large/l3_perf_nn1_jc2ic8jr4_nt64.pdf | Bin 0 -> 20024 bytes .../large/l3_perf_nn1_jc2ic8jr4_nt64.png | Bin 0 -> 219714 bytes docs/graphs/large/l3_perf_nn1_nt1.pdf | Bin 0 -> 17910 bytes docs/graphs/large/l3_perf_nn1_nt1.png | Bin 0 -> 153768 bytes test/3/octave/plot_l3_perf.m | 3 +- test/3/octave/runthese.m | 4 ++ 7 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 docs/graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.pdf create mode 100644 docs/graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.png create mode 100644 docs/graphs/large/l3_perf_nn1_nt1.pdf create mode 100644 docs/graphs/large/l3_perf_nn1_nt1.png diff --git a/docs/Performance.md b/docs/Performance.md index be287716d2..051be7aea9 100644 --- a/docs/Performance.md +++ b/docs/Performance.md @@ -24,6 +24,9 @@ * **[A64fx](Performance.md#a64fx)** * **[Experiment details](Performance.md#a64fx-experiment-details)** * **[Results](Performance.md#a64fx-results)** + * **[Neoverse N1](Performance.md#neoverse-n1)** + * **[Experiment details](Performance.md#neoverse-n1-experiment-details)** + * **[Results](Performance.md#neoverse-n1-results)** * **[Feedback](Performance.md#feedback)** # Introduction @@ -601,6 +604,58 @@ The `runthese.m` file will contain example invocations of the function.
--- +## Neoverse N1 + +### Neoverse N1 experiment details + +* Location: AWS cloud +* Processor model: Graviton2 Neoverse N1 +* Core topology: one socket, 64 cores per socket, 64 cores total +* SMT status: none +* Max clock rate: 2.5GHz (single-core and multicore) +* Max vector register length: 128 bits (NEON) +* Max FMA vector IPC: 2 +* Peak performance: + * single-core: 20.0 GFLOPS (double-precision), 40.0 GFLOPS (single-precision) + * multicore: 20.0 GFLOPS/core (double-precision), 40.0 GFLOPS/core (single-precision) +* Operating system: unknown +* Page size: unknown +* Compiler: gcc 10.3.0 +* Results gathered: 15 July 2021 +* Implementations tested: + * BLIS fab5c86d (0.8.1-67) + * configured with `./configure -t openmp thunderx2` (single- and multithreaded) + * sub-configuration exercised: `thunderx2` + * Single-threaded (1 core) execution requested via no change in environment variables + * Multithreaded (64 core) execution requested via `export BLIS_NUM_THREADS=64` + * OpenBLAS 0.3.17 + * configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) + * configured `Makefile.rule` with `BINARY=64 NO_CBLAS=1 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=64` (multithreaded, 64 cores) + * Single-threaded (1 core) execution requested via `export OPENBLAS_NUM_THREADS=1` + * Multithreaded (64 core) execution requested via `export OPENBLAS_NUM_THREADS=64` +* Affinity: + * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-63"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset. +* Frequency throttling (via `cpupower`): + * No changes made. 
+* Comments: + * N/A + +### Neoverse N1 results + +#### pdf + +* [Neoverse N1 single-threaded](graphs/large/l3_perf_nn1_nt1.pdf) +* [Neoverse N1 multithreaded (64 cores)](graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.pdf) + +#### png (inline) + +* **Neoverse N1 single-threaded** +![single-threaded](graphs/large/l3_perf_nn1_nt1.png) +* **Neoverse N1 multithreaded (64 cores)** +![multithreaded (64 cores)](graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.png) + +--- + # Feedback Please let us know what you think of these performance results! Similarly, if you have any questions or concerns, or are interested in reproducing these performance experiments on your own hardware, we invite you to [open an issue](https://github.com/flame/blis/issues) and start a conversation with BLIS developers. diff --git a/docs/graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.pdf b/docs/graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.pdf new file mode 100644 index 0000000000000000000000000000000000000000..517aee9ed1f520442a28e2d6e6bcf14b60c429c8 GIT binary patch literal 20024 zcmZsCbyOV9({2(71Ofz2g2Te%?oMzgxH~MmIEzb=;O-jS-Ccsi;_kuST>{*FfAW3z z+;jivnW?Vsn%T2cQ(aFz4O9>nr)Os1LsG1!%`kcx;T?5r##E~W%K3qhQ0m2901oj?kPW*|p?MtNHs z5DC4bsF<3Rh#0?}iK&Z&g%hmDMAp#83<4V?3vvd5`B`8+V%A0=6Cp5I+}6g)k>AMy z^3SmEc1{-77Ve<`%KtW`E+wiWsqkG&UPYOonU$LZHt^pHNiiu2NtOR6S zX@IgCV_%1S5kDJz{0u)h)yyiCo2V5ytKL8k7x;V~i9_obzT)x6!`p^kAu<27$4Gp4 zY#Tr}ct7al#cgx2)AhRG^YXICc<}!9%J%ho$Mr$iazKWr7WyhzPQdtj8>Rnp9=1c4 zzkGbKb6mTk<^J#>a|&4#9wJuHaH?C}ITPfYSyXq+7_eG`zCHlH>+JDeLbrS#mOJlb zrSThU5lTx<@;uXA6X#XMp@IXQ5}lCpxTxF}@sRO7;P2R^=KS-!nt(lCvs1>XSFyF* zPH(A4QoUnA@7F76ChU2}U!2>vw{{8zJvZ)R8l!ueyd_c^l`f&*1$`b`D<8C9-OFB| zb$J|`XK(BELbYF?GG8vCJ}(E*KYR}=xp~@BK+lr-!ps7K#ubM8l{s0jH6^r$Hu9yV z`W3aZdhm^wZLYYsnnQKHCB1g`oDByxaU&+!O{%Wms<~X_XbC+O?59^b8CS0?jzBM)LRK+Ol zSnRx?vvG^p>U>GEy=;$iyRP)Pp(kuqd;0r0f29Aq%=wVj9{zZv3a@5shIeq3S?=@l z{HinmcbW5b&nLcyTzJJc?)yDPbCNx+h-HKmue`uz=WCp2o=^Pi^Mf|^qmT5x&R+$} z7Rd*VgB_m*xqNkksRnV&{pPyNI8N^|?Gq)3NawV|wsF?X#&il3YOld>QfH0Sl~%)1 
zt?VBlL?;}V=*@%!*1`Z~C-tEC^vll_@Lb>BmK|bdn&lp}AM*luWgmP>N}L8egVlD< zY~EioLWXSIXDSyj6P(ZT+1_|s1AyyNx0d)+AcrjwqaQbCUioivd|=w zI7*L5)J_@@l2AmWJz!zkR+d4azoQxV=Yh;}@3_^C4KYY+#qoVw6O9R+sLva^l5GtI zD*r0940*n-UEk}H-5KWz;*(cqjbn5gcJ21~u9YJ93*jthIBJrS(!Zq6EMBsw|7xf; zp>UGtYeZchRA!O~fc3Y8PVUxuzcrd7TRorVd)%n}B&J&hN8R=e;)terMb0?cSv>Ia zAWHcp{gJO(vXu8GLs~uSj)vuHgK}kRn>y|#-y_5JPL9N|>eAorv(ih@_7a|iho|u~ z4rC7BTW!d+Kq^zQ?gHg0+q&zQO3GigX6QKYO^!dk*PVjfxq5pj-BICec*aI1wm>(n2votfM&9UUiP<%N-Sz<}B;B2wHp5+RU)`V?k*m zUxRmXf&_}>QxqOX7j+e0lfL&AtXv?C;WInFm6LvX&eqIE9kC@J7j*q~D`!X~^~!CM zJk`A2{PLT+`y{!yc(b`ui%*Gx1q*Qcb&C8~p*~wm+M;GEUtUzv7r9XA2v5b3Nb6Mj z;EMZ9?%yR|{#^^N^(kY^6Zt3$Y#OTr=Eh#N<{MY1hS&Uo0Ov2)T=qepF~&rmJKB+U z%fn)k1Kg+EMHQ=JkqTr%X=cYYUH7Ep z^lKsZMa}jX+t(bB#R)f}e*aOx{Z_?T$M6xK-WFVKKf~8}A9nBJshmFh4=x%&(!+C> zyXx*gw1?3)qJoB1+rY`#1bUgLvonIuEFj9?NY1UKYv=`Z%Z&CbwOL0~*0=!HV~BV0 zPJ^?_W`hx|Iny`dSkY_KvS>C4o78~IwV-N6^=Z)(6gITt6s;R>|EP7-jO(`Luj}7R z&=N=Jhh9HlN_(%|@9?Mkc!KVbeY}=FRF?Rx!JnOYa#0i^)`bvRZxdqweR_R1=TgY) zYzl$TC*vxw@Aiom67G3>L!FmPxW7utcOJ;l$aW#MG-Q{>Mj!L5l|(#Apj{nyrQMMm z0LOxsgPO>&^@L>GIUPo;OQ6YIMD#-ocgpeTE9vX^Rsy7k-3Y?FVxQrA2aYg=>sVG7 zH1wbTUfg{?j07JjZCRs`Gm%dP`6F0B=6fHQYhLI@B-Aw-qw588x=M0|3&7BAV9{Xd|s;;flh z?#3FAl#6-hOjp?nWuoep-O_(}e?2FnCrRv_phdLF zuMWAQ<&SXx?1l%~aNKE^w^&vFli>Nj-7=JM?nWY1=owYHtwfshEkME+2n905_T zW7GOD%=}tyd2(ZsHitJpXLOBqcS+jws!Sp`!D1-{ET)rW3! 
zt9$9UYZHF)_$3hjVo8Ng&VBBiOP+r)lO-wsPBHoeJX@f?-^eA4c!(d}Wr#_jbdc=E#?QTMm!8eoom-Rr$PxfXeki}N_ZH+kwcD7i~<*NJPZId*%(a%IF_pk zi%I{1?i+k$k@M&P4zkIDWun~(T>M7XAPho8&)g{YcDOUw)8;i{hmmo_0xHI8GQg6CqhKQY?8-+R{@j{B9gH)TSI5Xho} za*q>2CTg3k+{0x~i`E`iX#fYWJG&+}6?f&0Lj8zt>wUkHwdTH_VV)sumiOsm#(Kvu zPX0N5!%Lp9Xm)t<<2LHN|J9xDqlfluvXcW(pc#XhbIXISH)DUT|Jo@Rdh=xYy2^UA zd}>_Y*TJwv*$k0>)o@*e)1r?gcEJUS2JUJ82fm%PZ8eFwWRncL7|pQY_PI*!x_4XSE@?t<5Nz=SwbGoCRFGJjIQ+s=@Vbh<_fAyCVxw9GJkcbrr@J7>YrDUFAMBxX z9TYUTHnLsFm;-+^Y610S5OA}R7i_xSrtlY>$)kLq4sh5xDCP1$cSfDgSV{c7ds!(z z{$TI&daV1!NjNk*yho#9(YSMdvEu1}eDd^#E~wx139Xcz>p*W;KGf&(EYC+E@~`T%_p_~_Ph5x6 zUoBnqiPzf>jc-Ki>h-|S6Dn^XEOPftZn;|`Wv2QfvP+jJ1Ir4AKA3fDuvEj390$}M zwA_vy?*8vQbT{p#SD8FN@$mr``Ibz z_T?t`GIasSi4^J2Fxasn^r9 zgAz?kmx}FV(?Cll&NX#~nqlRZl-e@U3g(rTR;Ma5zf2tLIBXjFRZr5nx5MD-)#`DJ zB7%#DG5<++RLObSRJ$&NeC~4v9Qe@lk%h zH#);&yyY8}P={j4z-t9j8)4DD9A>*vxDckAyop+!s*22I&mbS=&=2Yw&>Q70m#5k3 z$Ih^^Fd4$Kqx)v;7gG^gBr>F_Q86-MDNx*EWE)q?D7%tV>&NXW0FC*pSw)khHeB~K zGi>%7o+&f`5T#*lsKRpbSAMm))h%85QoDL##*~K|d;<}v48w(E-+tJqfpy;=(335} zzj;%f^!3`6rNEhg_|8!6&lK63cbG%3R#o7YOyoVcS*Qcuhi?+`WQhecUiuDnm)T~K z?qzb*c>W|CVGSfw7k?-ylp4szz6Bjbg~p?8Zxj5WRf3SvRLU{K|7`vdX}jeqde{BppaLflurWdZbyDU1TMve z=Z?W~78R1e9!%xf_KP>jbLbm*&mH6I1iLRPyaQ@Zx+E(tFxUYgL!Cq-op?^xKozXu6s45Ou*3Iwb#P8V@a9TRZ>vlQ`jad{6Rt#|GanF3drm8g4 z+orN*vNv3U8ULO%Mk4v(#li6NE2)&akIuk(>=AC;`$`NzfDCG9_v>@@%Rr8QeBSDs z5zjbHh=G3s{U_7ccF)K7C*MXKI#{A=0)kGxJhl3xY8+CZX`tLE?5__Y%1Cmz%3ldS zh<-?^gp4SiKaNfTITU2FBQ;xQ>}8hsTx6J8vhsBY==*p|FX*EF(CKczv$8AwN8QZU zV{X=5nY}2EPaJ_{k~|&aOUyyxs=f7HZ(eM=ay)HSZ0|e%YG41rU^F;M3BV(v%!yhKFXkQNrB&C9`JclK}b&><~FeQULuX`uu zq-hGw%ywhz64M7eel;%;7)tY(>aA}uw>EL!`yUFjt+jpUi$s~BeTy4|VftDE+zpabiEu+IyL*Tm2#fj$6`kIxd`3{iWB`O*L?-sgo8L6#WgObkb?K-oA zS?Lz}YvYK_c_;ntMz@jpfyt3j)Caej;p08X>;wtQ-G;|+g$ieSH&N*pewiyPc`;dM zwRbh6>>96KIxTeaIgFQ%s=JC}a@t>;p6wNif#zdI}~CkFd} zuW{&?yEZ>QvEK9fsJQl5GY7ps`ck; zM!oLF<3bU&OP)`dWHrzzg4v>>ZP1W?xB7O3Y-DZ5sCU7T;ul^AWQX{{Eat2n0wD6& 
zim`qHWULG7B1PfAUeIUtEqEWM2E5ttr~E~VJ>j+R?DE&6dJ?lm9|=8OO2jR0D9yBV zA`bbcOF7x4+p@U%wIi^ag{~7+Bd|)zfx$6jPDn=aOePLDLw#K1(=qhsiX#4ac~s)4 z>ny-A#X!hyfoO99F6}#Nqqo3)g-7Efuw3BfLWW)$RG~Y)M!H1c?l%RVj<*>)FWyuP zoc%s9O_Jy;8O8qfT)|4e(TczI)c>>-WB7@oRn*=0P>&k=SD4pmylHrt3_dhMKY0N;Fo`gHG1t@YCasW z=1|{UNj>BZu4;qi5pT?yk3GFgIg;)(}s-X)7qp7$m z-0^-hL$DM--FB&Oy znSy9{j7@K@GRoukPMreM%Lyc}h(xK8)Q9)VoLC7f;)!P$<_Id{OL_FOdJ=L&3i&=5 zU{;vWl5j~iIgq?Mh&L%=af=&dAnsPPDoQhC4v8f&POJ}4YSrwqHpMb3oSm%3%jYCE zlBJT!h;|4ibSd3vCeaAB7JceeLCT24P+82Mwfg~4b>6;Bz*Y$nNI01Wh1>*V-;sPx zi58B|XAsme2@(6X@iyXbNN|?8rrJ2Glji4+bP|~^7*wtK?#D*JPiRd-fA1hr#uWp6 z!vIBKBC`=@qFLG&OQK)weVqbnziB+G0lp?w$Y^k!q%vUeuI2m*h(5=49`XKv}$u`azl(^b+wP3|)()DiG33uD%I1e?|ohSQBtuu9#GR zO*MSzJBbPGKK>kHrs!d0)*-67(|F4oirI>~@AMlJ*cIg|uK3xacS$ypsv5b!HI1s& zY=%LRMpMI&$~S6XdIl5J7kd?)Q`YeJ8Qyj6S0F3Ka^nRLvutx^edTifFWo!tW4Aw` zq~qvbXv1E;w6Nsn2gj;6V?{d9pR9e8)+a*)+~ z2B_Kxoy2lgt3~Ac#B!V_I&D%r%?7n{5bcZ>s>OQ-8(IX0(zg{_Hnejsw(-B!?8?vZ z9jk6a&igU;iCNp+zW>*IEUenj{_Q=P>scU;c(m8}-^ZA&o~TWZT}=St~Y8c*l57Bb$v|K2^`%ZNC) zE&pz+z&+x#zz>lyy6Pn?l@JsxIRdSc-p%#J92)l1G@%4cdMBR73h9u~sa6vzZe6g? zcEUG#s90i_d?dC>O1|&NXHfBr7gEj0^tqWj@U~5f*t-~0Q}r02ld$kUY&EzPQ{nko zF@Z&CQcAC+Y%4fcwrhn9v)r3~xkLstaxYx6?wakh6}=pX#S^uSVphi;Mk7@;rmbtm zP+)uASe8^W^ZQf^=1fJ)36uVJreUe@jNM)+I&t~#ciXAGeSc9aq|WJ7=BM&{2eFf_ zFzb-?_oqyl_d!;e%|h(MO4_TqAWaO09V!`klw)Mw#D)7~uwtUdDi33gXa~O7UW+J? 
zSy)T_ABnquNJzABTG!l8om6fO$?#-c?iis#lJ9UFv)|~FX`p8eo8D2oT!ll}%k@$2I=_FLao;{- zD)^w2=lzd&{Co#h(>buy_L%Lp{s~L0kl!bkH%)Cc+?IqaBiE~mXl4IcZYgtegtT){n9?K+mZFTzyb)QYgJT^a?byNi zEqkGa*-nSZnm|VW$HHXy&3y@6d0fh70Sa06gU;p<)0ctf5k>j578eG`E&&XMEf&Pv z8)l_Aoq|McqvUNC(1PIc1IkFeA)TYz*l*bzoSAWFrnuUjm?R>}+qsR2MOo@0Jz*~8 zXBCICEY_X5pMUx;WN}eGPA{a7t3;*1Q$CFN<{?L15=;<$)4w6($}Zjl-m!B^Dx+kk zUqYmngKQy(1T!;kn&-p81MG;G-e|%z+w8M7?%&YB zXyjqrl3+Bj&k~@fSTE#Jv>zM_!vlx2ab-SZ+3Fexd{nv!3qWUa$Y;9H{!tVJ=Z%stki!l` z?vh#!l!o4rG3(xE6ekQ}$K6D(u)n8jIb7^gCx9`Qrl6iV(|QFP3#`^CMU{$CEy}0z ztWogu&>V$tJ#bWO|A5iLsF2rhjX|y~?_#xyt@i3zCe?i7*k3308=Gr_4C@yW_e8ds zt2ym>3QVf;R%{wGQjyqLq0h^P=RA`)o&OU`I^1EQ1kCvd+Q3^Y?U!~|=JbDu5|lBG zPF$9@plVnsDc;z>;D6SUMjQCYIS>-d(NXCByF8=)s?i|KRP`T#9&ZX6VMPrEK@Cpl zpK2iF50!uUdgU7DuE7x&sekR9py|l@FNXC+4-f3WN7Hkze~|tPTX|O9P_Oe?9dlMP z_hkAtC(7q$yR3zqNnHA(RgbwkkIbq7GKO7484bwFSby=9jCbj{kZXny>bQU%dqm{!s!!&+u>4oaCBknK}=iXnWl}r z8topnHTy*Z&px$YJU0uPdkMY?B^vtm4Ep48^z}>vV0S#`hCtF~gWZH5Xd)XK77LS} z&8Av^|Jn>&+-~dnHAT|`S2+j7&dX}sG7rp`1tZQ%s8HNy@iPwhincwf$fPW;tty$; zTJ!2AC(lt-opkqZD$@%k6QX}kF~%yO;GfNsv43B>F;*Z0S#qNaI8c%v7E9XfP^H9 zy$liw#rzwcL@(!_W!*$VH(`@J-9(M?;xD@0>|5=+Nr;KvS~I}l`Y<|s3RGUCB%p9i z7|oSf1=p4$GOBMCG`N+4pKR%b-w~{Gy5}3ktWEVpo*=g6T82ba@9u_NG z{PdQXLMmDyS6m9fL@vA?+pFe8ZxfugVM3%Be{hX0`vpkU&G{{hirGPjEu?&9S1j&B z&(rFmzj1tw%bS+!_79czNaPHVzp1)gi|m;_RXzM_QtM1*boZQkm;^o&CI02Am|#uK zsvV%uJ2Zc)+vOv)vv1W~v;Y%NBmj|Mf?t0UkBtQ)q78K*dR^U7Q@+Lk{mDcZecv0u zL}$WA>?2pKC0}37mVi%S*VcbQ16JX`m7HuD<+wB+MfcYwZ=-N|+UOq&&yM9cE611r zWE^KUbn^0MZq+dR-u;T+s5N&E`n!u8JQ0bu)?|vAP3ZVwf)(-MWRPt%(1QkJiJ440 z!~hfNL>J-K2pAgR43_UcVMReYVNGK^l{C*3xHyq43I_)nr-}-Ttksid!=<5RApD@g zqyZkCiVCwg`YHI4X19bJHZO__Bce62h$cuD$-l_?SEjhU>713pMj~7|5tR6rhBS~k zQ!cyz%Y+hZct+<%jzYya>2r$0jS_b-KGv|x8%%t$dFrDL0_tpq=6;GqLS#ZaCc?M$ zo}Z6t=0Q0MY8e8VzQa2RHS23=MjD$mU%7)bJ$}&rQss}FgrQ;@YAx_)YeyF z{|6Z-CpJ$FZ@T=?!wgh>AWq3fR6DOyb}FgQy|2L|T^wua*w4dvfr?o}dQd;Ep`VtI zY=WtN*>KhwNZ0*=X#7(C>6{x1sL}`XUkOU=zcAaJyhSGs|LYu31s|dkj)!rgw~n$y 
zC3&pF#^5e$oS_(X&l9MYMikQ|G%$J-OfP>G>>dwz@~s#!^l)n-c(NPr-C2M*-=jDj zFMKZ&7j8B70QnP9NkouEQVm4j%;4@&0SjF|#T!dt&V&av^rEIf`4}X0MVS^AB6Yv# zhvQQ?9i!r*TO;9vi-&g>3WMkvBo0zTA>kAQkrRgt!40CQea^W|+|m)hgAIl-&Ltzm z8(eo^yr>66e*LK~GZ|OXouQB@lNcQ|4;$!Q3m;beluoM{xjwQW)k1J1+16z2=bYDW z!`*G$=wjgzfF1sxlN+F3BqpiNy4qh*w=>pZ8w*twX!*HZF)%}L`Q`X{PP&Vt-Gp@>GtA6c!^gGgzxjxo2P}wS zr};0HHT>DzQ`m&grFElyZf>61hd zxt-_?lU1&c5&O>1sV&BrTYGN*c~#SBRiQWq)7AZ;oWWJj6{bb-!!DjEQJz@|k5w@j zb9UMBB!kHkQv0$5#R`h59Dv25iBM{(79R1RY7i|h2{D>Fqs$9e7Kj)Cu^shW_SD@| zvQT8MMyfG$>&rjDPzfrxq-0E>iIv3>zpf6XQ&w;;G7uzZ5;e6DNce8KBm-j~6q6 z?4tNlGJr$Y>?#N!V=1zy{UadTsFz}q5UD&_FfC@LVgW%_mt8`5CxXS)4p3Kem9t-f zk&U{w8D6-qmhppvOq!|=fYjNiT&8%Ga}3^S8nFK;@VJP|tm`JuJhk9+Vh)Ln zqq^Svlz}DS5O(wBADx)|YaBr^}QLGu13?F3^>yGJq+8U$8C`UlS zV5llnzeB-b2_lNPm|7k#%R4qPwe6=`EM_savKXd+q>&#A)C^L5q;R05yj_kPs-Z(o z&w`=QiOdpqIW^f#rTl`SC9oAVUuuM8B-lc|Yk5-zsEO~X8@dTT!pmge_H{*%b` z(LvRMHUE?5ugpbiuBe4;uFO^JG#LHA+Ee_U1|0swr~Wo;FNzv_*{6XB@Y*ygpNC6S z4vaYgIOFx(qraS-D^##1(2P(t2>ST?9Jk?ToGs?qB(pC3G4V(uubwtiljIPrV{YA{ zo#&XX+6WHUDY|Cx?k=~_W??dg%ZvCp`@@&5*ghM_zi5mN?~9g@GZ7uh7p<80a&`$i zMUQQW-)kwp(&Sp?EUz1w_jOcrZ!Y1OrQ^{^Gu3UdEn)vws9s6-S~n;P{9x8wKf5@e z{@zRkUmla(!U9~RWfOy1ZdRkvvp9iZb{gE)0FRn5+P63nc<>wM=%LtAM3I#?$#<&y zBa0Kx_mhVGftFOF46_Rg5x|I%_9MC}b>h zL#$;Ya$`T%1-8l)d0bkU;Gr)ao}54yJoO8d4mt5^cemA?Qn z)cCT^Ovo=;Yp}{b5P?k-$-c{d!XQU5>Zz9((k1jf4wNG=p25oxLGiNLGMIwLnA(&_ zR&{x|FkJRf$M%{pYjS)fd#hwx{CSuj^zNwOms|=?$}2yO@-fOX=I?mTYK%DR@^2tD zxh83$6jdyeTo&4~Q+a!`_XwZ(scMb#L-1HZbDKihw6#WVEKxUm%C5Ga4WTl7I>L# zV$TC5J?bQ4-K0x=G!N|xXlfaveoJqfIhj3>5Z(xuiGNq-Ub!m`3~^5` z5pK(mGu$gj~Lwb3{E*6Ng;m{+pv#R3Fw{vc#_`y^>#|W# z7xnDls<@MJ`KEGe{!#;%<4)VXqGKNW!B%JTB-As5XwupZZ@F^%L4iKRwMfSVY3Gc* z5i-#ME>XBgGdF3gE{%G#v1!(>0nU)pc@P zcwCDG@mM!50--CZAp;;J@z7R=COZ3Fy-+h<_4G_3=2kkzAj-a0I@T#-2i?Pt7PA=;E(fft7y?3DTZM!KaNVdE%RD=Z>wca%U_Hddj&3=S zNvdCPmXJxJqQ+RB!#t&1Z|U_E;1sdbWpMF1P})0moB;|<{#>5JiK&$^oI9n|OJiUz z*Wdbuj0d?Hv{vsMvz!6WNrHRer-9D?H2nLRPG9e`>~w7wmLbLXM%zU)qQkxIy2&TO 
zGtOdjBp#^hAYPP8{gU3qj<$NJQf`H31TH}d%7L`F>Ax_qKmUt)6`lNpc|HFN^I}-^ z?wFqc{ZECZj1Cqt{_vXGSw4*>Z}89DzG%vmO1PXI>?vge|8WPlD)3KpgGDVE9IODf zJ~8jI?t`_He-PmRM|;$vXrn!;*={?sJZENvP}tzJlC{CV+EwdvNA);gNPpQucK`G9 zgTayV_U*(QCInO;WN_>Hu{Q2UP692qWpyO5zX*F&aqN3lE){VSXc~m8Zr7ckru8uL zDh91Q)e8M!B9!l4O;&)2)n4;z8d?G~17I0q!uxh%aw7Ea8wv?r{1OJb15;zztNYvS z@6YON7J(&=Q?NNdz9Bd^8^;DW;wx^+fiiq)1PYGef?il$_NK$KDYtl_w6AsjzKMB8 zM{{UQHjl0x^6H=h#j2=5t9N;&J|!yH1O8XPz0*AiNuLwRmotwJn- ze4v_%#Z~e<5fR#{p_^82yb|?<4`3@34ip4cjyj`}GX#GcA}xq5>1U?bH?+)r%ia45 z4|Eu)3AQX1bq$`M$APSDDyPSLzi{z}sRq6&h>6tCo(qtR;O=}4EjKADfZ`TQv#7Lv zL4<^(%u4PDn_XFU#}l@HW2*xk{o#Tf4UAuOJR2neAhl(#8qNpP9PYZID)H+pp>zva z8^QS0DkQRI=-V5?vzR8r`)(NIYR#shp`rqw5l>Vq=`Hh?*uY2%6ZiNq7Rp?nB&yot zQS6YJLb9nPoaG<4xHuuB2@Act@qL`k+Bxu@N`Q*;UwrwSIlY|3LUDbR2T2qk^9a!@ zE4$92VsUn}uuxx0RHKkX5pzDhxp6ZU3)Y1#fw%;*26Wq;$%@1RbPkLLJqB@87?5+K^Rss^>Adj-xZ^jy$=T>S%&W{;wXboyo zRUi<|J04!x5JFBGzYwkofd|w@F&{9$@p;n#V;CVTQ)nnFN+`tU7W4Kp)sdRWhOlfM z{*xv3471F?Qv3I?)E@2Msr_CBvyIG`#@nfY#{BUBRC(}glVQ(tjnCuSsmaS@8|s?W zs|7xcV8d{Ueqkq{V#3*ms!?YC>t>YN0da)%X%bnWNz_RD@VYNt9+lx4 zgiku0DN9Q(y5p~MMI4SkI~#2S>>y8=URLbJB)?`8$Ic@}hsP?+gtrMD8O1bB z1k`4tpZWD#V4FqyjeO(O!X_6_!)#P2;+k$1=QrBzN%cRdZffngE|O@-FP9h`n1%6e zRz5E0c3lUrlMWDKvoRNE!2T?628R)@`VD}D31{_{1Gn^%)*e_Sa`sMM*P#q0-;V&t zCfIC6J#Vz2T)b(~{wV<2sQ&|>C1n-tOrcD#M(sm48vZo zTs#bVP4#TcndAI0Jcw_uIQwC?9d`)^6acwe!lD0DFTuW<)rvjdM5!~CgBK%`xTHoV z&`mYJZjceEjovWQCFeebP+5B_%c9Q^WF@HculjskkX2=C8D_eg9{p4`OXv7_cEF0H zXmt;khg8)+#d(Ish7Q&$`iu)|;{8U#igxcgo?dsaF4%?-C|WgI{Yv7seT#8V!wzur zKWW9s<=8z}l7|1L>GgP+i9X{WG(G;i;iE$lLAx}iLx649fOh8bd}C>feM{HJ+=J4s zMn9Bv>b@e#>Q(x`)y4m<&D;K7;+5~}K6>zr@JkH$rqJ}-tz^t=7{5C5`FqQYs}Pkk zrf|=QVArX6k=ePi%~?Fu=!qC=tSo@*$&l5gW0K`6{SkW{A#_V075fb2yLs8>)9|t= zbm7cH#qfHZ<@z+5#U;NW*SB!{dZ#U9&rk6eMWICW8wc5ef+gPKzD$tPG?!emd_F}P zxt-cgmPlzDD+F5r;LhGF*o~eJ69sjdsrTKad{USB}4bNB~I`X zQ|$%bu9EdA}RN8{Y6BF3i_&Q4KvNb@o9^lPSvO9 zC(gL!q0PmcjqnUsWs^QuQ+Kx9tku+?f|?f2TEz#lWm~`nv^tRs-#rWGT=uHDG7uG 
z5m7+d=A2O`@>R~IFsDJ?zQokly2Cg_jWM|lSl)Dn({f1%vTGvA3~}O>{QE2!3zpo2 zN?Nf%!)7`xaaQqE*z{CglahFkWpNY==?9!K-!!Tn&P70=g%ZzaLB18X6!dL};a;|s zs>kjFwCTK=dbBw<>bJ!MO$AcT3(f#exlnN^z#w?UoB6UeIGDuGR>&iLjmH*7rjA?&Cy_g*d*Z~SV;_D-jI6GE(7F5;TxUyHV) z%aAZ1GCAp0?zLKRbKLU%*FpPfjkGZNHVhJ22J&KRQM%e{>tR~M(CB*U4i}YrgA2~w zlDqVSe>|3#_r7;rASok*5XKrMWNa?~3TyvijfN({>Nkw^|E}MZ2i9!@8*#SsXWyv* zv!h~_bJ&2^_4+P?d*M-3rGD)Cg%oDg>FxO1a_%~e!`=TRzwTz`S$DHm6dLm8YCe|+ zY<7cAmwm23_gURmrA<^d^P^e1$-K!i&iG%oSVBU6zRUqQhbQ$^MrrxT&nEZ)i}x2gF*l5ETqHU7b}#e6v}@{Sa6j3AGgMF^(C6 zvKYB7HkIiZryxc`O@EZ)k#I4j-AH2Oob-ZNG54){K@g?n_j#bc_YI3yg`mEVToGp+ zT58YlA}rDqM;FkVMI-sij>P#%3qg>vmKZQ4ed*ZE&%{}mo~_Ub*nqd0WegUJC~6xd z0x~*@QCBtQnn+vp>H`*<_STO#3-&ji61|}O?RD-0ap+-&R5u=d8heDmR1Kyp%GfLq zeOgT9c--_qh9GPbS{z&E++KM8Al8P97V<-Rb_nowLLSaiL=oP^ZW#fv!*$6 zGX|P&9nL>NJ8RSOVia^~wYDg`Rb}2IAL&XVGg}TCYR00UHO2!HqyN|+2SG8h5FToN zi*{2~9;P#B(imcLCQB|1NVbOqWzt=J5L6HGqB~^O^WoX@E_LHC(NK2m6hr;Zh0;Tm zdJ-_A2*1RzJuEs828AgdE|7~gi7Q^} zkp;744548`R4EKJ^WUpdthxxiSGCmP7|R-FBS3agr7$VLqAR4iDI_K^lqiR}*$p&@Xe)?Eb4 zLZ}Rp%ReqOq7;+C`y$fMc6RXgS9^m*JB)Z8e&^M7)qih()Xt1*O0_&4Y??<*onznf z;wE~<-QwNVE=6Y7hV+q^1|QK4D=|PN9_^9>9JolPn`zhJVu~cR20y54c1*C1bNBP% zYyZVzbkStEf18K*w9r~W`Wd#=8e8kh#wNLKD!}QoLHv>;@8+xg_gMAZQlhs{dAyG8 z;AqZDxHf|A(Jth{b^JkE$VjPEw7RGBM_z*MSs?CB7)-U`1f9_HYdh?}`>;R>R_fnc zM%^CeqBPCyq%)MLhigLJp4FVk6IyE(~I~W z`QdUnshXL49w7U8@14(5Y2+lEm2UHtOSE)0* zFV77vb=*$EO1|1SP(9-(Af#{9jZd*g1=H3x9Cds_YA*fKE3k2%&I-6aj|@k50d0Cp zofa8Rfjd%;V<}9GmyMIzHoS@?TJu7V6;Y>?*c657o7fZ`xbgW(?<@?XJ|)n@3dxokK{BSNX03i#LsQ!|3p zU*b%K6$Xs^_(o6mky-8PlH_r6B!htZ>%T#+=v5f|JvybL)0*!H8_AokkzafjGH+dt=0?M z+Sglyw_frO9{nVa3w~et*ujjbof&L3x3Mp1VsQ2U;|8pCnNpxY4_33~GM}024z9{W z|B%*Q@V}|Z9?dlR^oC%HZbOD75`sQ)3JuwcPy5X;P5>n%?4W)VVjSYarGjryXA_Z4 zEKfi=fz5}p7Q7WuUXHdq9?dnUQBWJuTkTq&5nqoru-Q+T_-NW|FK|rv9x;;L!LTEp zc_4M0IL7$mvDTW*_bGhY@nr}yN*{R65u!h>mKlNBY{<=TwX(u5G zs`7W`%gE-V&TQkay-$L7+Q^0%)?xr$_wRuJrY)J+e!oUSv6vBfkai)mT2iIpq=B&1Xs z2*sqZB-oc2a=v-M2=s1Z!BLyy<54aUV0O7-0$EaMgiXn!r-`>R$gw*%kjS|SB(+H? 
zv(8G#o8qnJ*v^|C{zBSclI40is3H7e`q2RnZn}UW_PUAqM@3@-UN80cWP1FG-eNa% z{?kd)@B7?U6!LD}pW};DoEj1k5>o6mD64stO@-@>Bx;pY@ILMq$!6>h+jG8gOYTvX zFx*uCC6t7Rbw?#z?BL?s%CN=EKM_FfufWN=D{thH?46XGeTzna!ec`{cR^7H_odbe$#&t~nXuNW*m3@=fBwF= zeyUy7b;yJ%;o6HxAcw8Y%K>j|-2KtjtqbK-;mO(9*#Bwe>Z6)E&$#^_nRXjf*3v5P z0oI#1lKT;EzDzTwNzV!6ag}>VtzH&=HlD*LI`UP83;=GQkoSpN3RO8%PvUASh<4r?X@kO&coZ>3{ zRB}R39P{T7Gs_Mv6@L4QbJzXz3;G{R=*bI>%uK1n{((5)Wme*C%L-TRlftIpiNIm8_uXvQAt*r6<0^U#t@__3RVvo?>Ozi~#E zQ|>(-O1%2&S8JZ!cf=>_I=mqzF6a2}v{OFWtn-gvD+@?I&I?i zRDM~D!ShTfDt+tu8kxhA^tL7QiOY`(*31HsVQsi!7m(J}{rIq$}qAmEbf9J#(4X(N`Jh$uPjqi8Jb8GYZu4(>&^vuaNG(0jG z|3BD6alJXMa`ZJXcm?JD=KF|K6U>13%8Szi@s? z`9!XL;GDHS&f!m=pWlV(7X$vx#+UncKeY+b53YEoC2#l7)-IZt+TFCIjiRscAEdQ@ zm%8UuW5a^cs$ZtGzH4heI=nURP22L3{;JLO=|c}|wjN!{_I5u!x~=xbt^Zl3Tw{27 zt)4lQ*Y)droNx8(PG{WS@b0OD@kyT^EF*4Rzixl0pn3o5KMh~Hb+zQ#=dXY7SJy_j zk0#x2<4SHe-5lBXdB)j;-N#k;eYv2kHT%?O$McO36&W7zzJz_gwr&51+c(CwZmd4~ zua#|w4xjnO*S9w{67ihh6~p_0m4p!@A%jTYq*kq?WHDm;1b|3!ZGgiiAQarrxjal3 zA4a7FY)2qVNI)l9E`x1>^NUCgWQPoQa6vvKuv`$}M+ya=E#kyd9Kcx+Ud)9>V3)vA zkS@Xu@FTbdUO_yFB-j#%3`j9ABz;jB6A%SAP{fZQ!WVm7K~yS}ymZKxBhU~6jpQUz za$GbfX<NI%WA2XU2?hfMF{+5tVV} z9={12={<-ckIRIW(00Neuy7Tg+&Y0Psw*gF>nd5j3(GLej2R51np1|EfYKd-fC(Zl z$eNNZj{KZti6LwV`vN`)1`7MU{-7aj!XhFDSW9FaLlG9S(u6tfg-GOiZ>2{>luEB6 zm83cixdq0@)$l?kB2^j;R0=M)!2wB+Dug!^Rw0T31CH0!)TnCIDqbkZNxfc=6BJHS zO5jiiYyBbY!!S^jH*R?6|GjOF+6szJD7u5&TI}6 z5OdoIJO(@O^0;fQfW>2`2#Qn^S|zC|CTRmj8(>YqCt<{=aYC{{)B=z_0O4~NqLVl$ ziSj}*q|j{3rwWT8W9cd?4aqGEBqc-%&Jf6=LN+c`$9Vi))Q5GY(4GTMbg%5n571(i9!(IE)}LV1v%Q z14prxdHIYv9W%oV@+tkbOm12BKROIV0ClVaB%)bWn9Z>n7#29U45lq1lS)1MAPr5c zY2-#x8mZN*VV+43@{iH<>FF@TrqZ-JEsV@*G;KOe^C>ifpfoVzrqD>OmV~i2m8Jv6 z=`^ig{|#9kp`T2z z>!~cQj-;(Lt%F0*=~|tYv=BO-Mona8oA1oAT>5Pg6d6H`1eDZkv;;uJWj~!G`!Dm# BI%@y` literal 0 HcmV?d00001 diff --git a/docs/graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.png b/docs/graphs/large/l3_perf_nn1_jc2ic8jr4_nt64.png new file mode 100644 index 
0000000000000000000000000000000000000000..c77159dd5aa703e7adfeff6a902632acd22710d7 GIT binary patch literal 219714 zcmY(r2RN5+8$Ya_21Oz%86hNlg-Z6OLQ+DKy=6Cqh-6bn2+2+o8CfB+MG6^(h-7Pc zKiBX1zsqy{jvn)S-`9Oz=lNOZH&9DMapyM1Z4?v~JC&8>bSNk`WK&SAU*5VIU%7F{ zV>kZUXnsol6a_^===MLxoACJ|QzadB3JMPn3W}R|C@5C(rJLgv6fXP}6yGmXP)Np7 zP|(}QmugGl8=Fm373C<_$p0RfXGGvDl=e#Jo$#;5fB(IGcPtJ6XN$A4y8M=}TXyYX zI@&gp{1BHR>2X%yS;zSLA$v!=s}|N~hn(H*%?`QQ*i%p_1?K3gJ|8)>;REfB*i``V7ThiGKV@G)cPeiltanBKt#q z(c-#iduRJ>i!adDFPj9ieH6(^5ugiIW;ypyK<>0=KYe0kpGE0aXu zo@w#Y`+)&&V&5`1n-<%Ch*!B^J>&1R;kV!PR&E`aDiP&&<#VF^!VJe7>m}I4>Uh;t??t-ohEvr{MQ^oy4{f}&ubo!t zHcw}#WA%aap$-G}Zl4EJZ4W%-*{PVLnU?y_)zE`CG_EIqXw5z^ibmA%?&Gb$E`=Km zHK`x2&-N@oY*-x>DUr#`WR|({PVtnmc4zD}X*xmHx4p@iZ@Ryq*vL@YcJBqV@@$3h zbRexyyo8eP*|y8a-uX7t`h-hs=}(7DCqMYi7;Cn8^n-!Q*5eLuIF=SGxE~%*?4@E* ze@S0v{z`ap>S6QZtHlVr=nZ!I{HG6b8;EMS>z&C{;Vzroac4foRH5OXO{=Z4L>D82 z&C3h#`UbrVi`R0k>)zJL`x!I8_Akt|zML*i7u|Mb;I-zl#)8O*S1&J7Y0`2|o7V5< z_7q$lww+O$i1 zk6h%XbCO$=)~7gcFJIy=k2%SeeCO<;a=X5lGoe;{Ww#w|Q4Sii*6vyjq!OuUx%qVrdzaoV@%@2cKQJa@o>S#nn|*K|!Im zt}ZMr>{el+kgl$-oVf^_KMF#5D z*1RfdG-p|-CvEfPr0|VI)A2v2Ixt*GtYJ2~yAuYbbuy5Z)ZeYB--PNo9 zw{ESYP)Q5V&gN}?|6ZFr>Y6a)3xlKD+S=EIA4(;~B_`ev3){`X!Lcf{iRQnLyx`IN z==Hc8aqm@O{M^}G!+key+(@{UtD9BgJV8@z+n-|FUwZc0SxP_ITUl8=jwMV{Ds@Fx zJ+Bg#Rh9TI&=mzR8w|1erw+M ztu?82VX~*-#fv9rGBkn@9N){s!?S+<`dDdC(RJ(AG3?*pnS1g0+mQW=9v+h0>G?)$ zs82k>^-^ryy!qE$>j@#RWqaMcOCcEtUg@% z)*RRM?A$$@PcL^fF{!z^i7SR4to!gGTJyx;$m7?i?Z3QP*U{NI(Vaz)OQ>sTP{P#> zi=0et|JizCE&p=WdfUN@ker;O7hk)YHOC6XK78o7ICUOZu6gCz?Mi=}(a&CRiKZv0)K$72sZDE+V|fO=$T=#-%$6ZRtS`R8{YJlOR% zc;D$Xl_)LlC@HVy2T3>n?bFoM3=a?2;@YDYFCM1;NVNTlGA)Njd;<;;wtB-?$6TXI zjV#?99D*%}goPvVBL!Y7j@C9d3*LR~?-*+Qcc#kU+skYC>VZ{HuE5o121Ze9rPf60 zebOGnU*AQUuKoLaCm>)B_d5LJ$9qL)Z#k@%=i*edZjJ#114BMeBQ{B6bMuu?FKu32 zYrox{rAJFcBU|b7ck9-zUt)}Ws=B%&uMc|1U=#7%4M^F3epMBIST$D6?qFxSy6N|> z%pLnhg9}XSpS#apU7Q(g{j)OXKKEX1c4?_+X(HThCvI*BvrK%VwCD3HjWpPRHC>r! 
zEV`f3+4Q{_%F3%qaTs~~XTDSObD4XDUFF~C)rHBq4;SnlR_86A>E&q@S$1n>b^n>I zXMU2AVS4o{Hldh)fypNC--~Ad{;ZU8JyS>>TAm#VW|9auuJP9{vOMRVv_5d{5cX~M z#n;j8sfwm{c6l>HN=kdSTNXXUHmAiF(KGT+n;H7U`M^?tp*f6AaqG5iCO`U$tOqL+ za6Jx_AC4ZBa(naRQ}OvSHvw{^SRP)37kA&hIoZ+C(Pmi`FYa(#+H;ZL<@;IJAANN} zbUaGR%Ek-dyPuxrMui#gEnvg1j9@E{elBqmKYqN>qLbr8_Icmeua9SFB&IeEjHkGI!4qB zm8fFn+kWf-MeN%&wRM!`yY}vVi$|U|qZ8P}7@|-!(U~FPwVYoO@GI$5Q2zJ0c!M(R zg!ZS}Yy)0%>Z?mLp{UK3YyXZ@;LGwH5kr%cswl2ockI}BdG6c$vpEKI$BrG#y;Q#K zz=y8NHaLjmqyuJC6Sm@WLn30(o&EJbZTqc6p6z^a8_}t98 zzcflAgt5KARMDWwa?{DB>HZZQ$h&v%9ugF!K~=|B)*b8D_LVCo5IB}CT3oHcXu%JiNqwI)j;fl_Lnvnyu7{FElC{-N#agJ$UfObEdrh z&8-a*u2T#G0s^EI;+%~P4_9X-`KajYGh8UPJ$(4E->nT>ehqleC=U+N$OkE;vK%^e zbMVGm0`5(xz(n3-Y5M&qyUO=fzFSC(!?p8Xco~e19ifP28yXtYzi?qvx7$oGP9y(xveqs- zx;pHEmF1C8mlKafZK@j@wx;O>78Dd{^l}8mdw&fG2~kfR+xF>_?65oV2q|0W^1o2u zX!q}L$~UgjFL69_p&JjUmmLVyLp%0-P5zsbII0qs`pfa zq|48rKiigAl$E4t+1S_=LJz)2Q)&D?Q>m$>v=!&vY~*bSu!%o4b4#hq(9rPZkzGaAz(6AD9Bpk{s7Kh%ZviV9 zMJyjACbpmi#fm$ywzszn+kTD(U_%eC`TKimpMc3mv^w*SC(0W)ZluU}7*W6r2bPpb z82PLmM(^0cB>n)fg#TLGAtolKSYZnqK+9Dby0rC2s-G!3c4>Pqe6Pbbpm7WXYY@x? 
z3^4WMM;6O^F zJ_i``)xUqg69>r9(D12V-obC*z5zX~-?C-P$tU%B7hmsna&qdId3H9l`VJ+nJswGQ z^=)(!vY&=0CY14X#c^uUZPq_V{ZfnNU$=k%ezXVD)`1jDTVK1)lBWj9s5+RTFf76< zAP_cEAL@i2K_Cu}*0qi&l;_W%@5(+;3mCOy=T3`(vXYj}>k>qJKG!E4r& zH618Fo^bqnIxuUA;5tSz+veT^({*7Ql~`3Qh`+DzhQ!1~veYPA-bq0^xAqu!voVJF z11<4Z=G{Je8@O^t-Z@SW+uM-5(l0lG1K z?HUpsyn_Hul#$+74yhH?>awx{con@|Lt|{gse$rGmH$@n;bFBuH&X?kEAjp<+L@tg zPL5-5{;i`IUXJSyuQPaQtt>0M-mxSei$3|=o}E2$JI-25(uurBOTdN0&2d66tUuCt zdU~R#J$tltW9_fl7AmH;v})gm3Q&crzCHuJnZLZCH)P9$di=q$E>V$TXAcU}NRXWE zF}g8JYMbL%6*RO*lgEEs{Q3KL?!^u~z@~_G-@bp}cGvySX!VnT0L5HKsyH?Od-t|e zfS&pBeN$2@<9+$^C4r5**%bIa=C1+Xod$l{PWQs9J;l#fh7rsm`Scbn@msM$uNsu} zYXfEOxifRz=bnvvRDxF7CfC1vr}ECxYyAXw58LL=n}w}D1mk3O7TXzNZ@qG#m?Fxg`(#eB^XB{DWJzAXlykRsphi0{+!Bln24o12irRGd5w0a@T0tjGg$^Y+ z$1|^}NMyt29gp#aRhf%4d6XI&8rg;=;kc_g5T`HW;|=I^?;fA>*RPL(o=~(+>{N>r^oxyUh*U|t(yY2i z0`2WFkbmi>hQDN2dIUkaUTcFVYrs^ zZ78!Q`ij+u?Dmh0nsZMCUGM*J0;fc)tIa-NsHCLy1+_aJ2Z=)GZSVF&{f+Z#5gh8h zFKzb!UYg-#VY!ui@pVV3i&cDlJSoHFO9QFesS4nO9VznLN!wRRBhLrGAo`e53;4k+ zuN6Vadd@%kqClNSzJGs*s_yKfiC&D~`o6;v8xEx)?ZQI5h?Q>c_Ay^-nJ*z?18vfa z^Yd2#!C#e>O#F^ijyy$RZc-96^n%#f*vmFHIjZ+&=jLiYeR_n$i#mX2+t?kjj8`GJ zB~6t+p@q8Zaiw%}CxQyq9V)5$)|$P&|J z(V4z8gi(Z5M1&4SE;~Ehd9DIqBM2EU5*!kO7ilj~*UggfT;z3ibp_Ot>G3)4Iu$$V z05afow1T2+;lqVh8Lz;e$R|(Ou?QLgR0m>@U*`kV=Dk!F3))6Ep0A=5DUYI5hrPrg z>CAJyUHQUZV`F2(SN6BoZP+{>uZl7OHjrtx5at^d zMTgB^T948K#6IR>h!yG|$>`QSz{KQiL3yZSPol=f2M^qKH#ax`dUEqCRF;RFW;-rp zyJCO%TXsJKwC2fqt{N!;T_!(2zhSB9c5YzZJ^Hz&!5f|bQzDXQPCt0iW_;~h1CHM* zEv;QZ#B!IWcl`W@Mn|6r8Hh0MM1drdlDT<6PEOAJ z8ZQM34EfD1%w9Ls(r~40Vbr)BIRyo@vCK#HxQ760=9F$r2YdTBkW>t;-PU2n#@kc3 z;FcZuk{K!g9{R`~ifg%otw)G%8A=asf|QHBM-4XK%*#6lral5HY=ZI$l*FdFWj!@D zH5P_1m6R%A&B+QjH8nNTXrcDFd3cbXe7qQSrR2Mry1II%QDp@?|6&ltPFCEtC%F4^*0Af%9O&Y@wg8-7lTk+?Y>cz?_)B5|f3q3}p(S7g9 z%d?8wqE+GLezhgO5wxHV=f-)?%MPj3H>msE;5y0i=oN)X?0 zdAQ@#56jH%Lh=FHt$tzA`P8UV1`7+AR){|bf#j)Si6A({+tyai!P8L>+>YF5lBh=O z`&eMQB|SYIxa!}(-*zwU2Jb7RCIj&VGYH6XK4oRmjs{iDzEu8D$h;-r`lDd~9C+;r 
z_GiygC7=0r#TPc87EhRFLF{vw8IZoXcuphzCYS~5fKCx;L6f3m3yonv<)P35wsh7XXhufe%nfS)K5)W6WuB^ z^9cCyrp0ylgMwJ4q?pjlxOKB*GnmV2=aP%l!c}AV3fyK*Qc_c0S`Tw@2s#YQ0Z;el z8*e}-XkQp!%`>@tne3DUlFpBP{;q}>8u+6|U&faD{Q7z*UWH=)h7AP%p$(eiKd1t? zW=HBHqNCGihE$Zw_;H?qw~3+vr4sxi=R)ywfL?+KQ6iP0U4XzrTmA0&iiYM`GjQQt z(K*s3aNs}UI?6qluCC5cK&3e|SaVbW4a=hK@j939U5#HO-+gtIl9ryGoehic7OV!- zAZ>yu3S&e=#6`_iuE_x2*MoTm(sKUew29hA$K8gVj+=C}D2KiX&F{c7Xi|?0x3PBg z;XC+*TOup(WZ>~^TrIT10+YHR!wC@zdOpK9&CS6T-oJ4O6hSP&ff~Bs$;+njKJK>s zSms`+ktP~|=6|N#!= z-$2j8cIl#~X;x{Q1(|n>Q&)I|e^>lD@I<{l7oxV!_A9hfcVA*RIDi zEUHRnyC7q*LU0Y0Syft!$L8hdf3SHc<9K4$xvaf$adD6=zk4>&(6ojac3n*ThkLA|zH()Lzuwio#O-#g17NLftkUCGWfk%n4nPoIqq|D`>f706&mDlNJ=ussVU%MRn#I?vbr6yT#Hj5QzzF} z*s-@uROmtCPz%^OIq6@u;&4+Pl)4MDhcz%8_SDYmPSMS3YW6ykWlFVkXWiSk{wN|* zz2=s}jCcZa*RFA4e}08#QF5tuA}}#x!2H^^Wwi^3j~ofaMj4)(GQ7I+HqI4eNP3%d zeTTyN{e?Q2?7X~c8F!3yaL@miN2s%Nax4Zbq&=ZqEO^|{2{)ZNZ(v|zY#hcT^3Q^m zl~vs7+m77aTr&mU-0b)CCo_}`_gXvb`lt_yMEF|Umne_^nDL8~;E#V+C-XozMMGJP z7&bM^`@R~7h!LA{!_;DI>SG}+gg_vlNl&iU_i3sz_KuEVC#MHbyK~?e0ZiHslqr`0 zJv1AiLuoWDq2c7@#0JnE|Lh%J{ZbVN1Ahc1**7vWQn3A+BpaJ=p?PZudcNS9!my~Q zI_!|2b8|ejLA6j*ySmQe4?V{yvIwq zheIbM#>?$Z+s8i^j;Vg+#^MQE_dWmx9{(LFBL;cyNm^Rekx$&{rB9wbkt+YHDun)e zRQtd^;AwI`;G}&-+W@b^^%C(NZE50D@iFj&5^K6B6+eRYehSE6V7SR^om=y&l=Z?8E>X7kyk;ux#=nK0c~lM!einL10eL3k!)c z1ofOT3|?G21&)LRSlZoya)fq2yST_9 zC`gPy5NbRl7H;k!$P1I6Cd@Y? zBAhSesH&mrQENCbHOe6)#or@*DD&Y&oc7r}Ytm&+*tWZ)Gq=V_JJ35}LZCXEn3(`0A3Qktm+jupKgp*ewDq9vY z+S%EmzBIJ7yv5>S1z`9%ExhqQz;(~5j922BymFvqHp*DKd23>6J!<|{xM6CEQu`n& z;)>2!cuC+pX}~<_C64H!VDsrSbL=d$hG#Q%{NTBbAB6N(h$6Ci2gA+E$`h!n%?Zb2 zKomzNCK_>Q1MlDGJ9ET?X3Yji!bwD|65>vMD6{!gZ-KtPzWwh-8!BcQwVvFISZiZQ z|FBEOb}zs&&MBK~IbM?Tmq$1IGL#YYE8GH{DEsNp62xHuL_{&$2I}rx5x~bXs|KdV zAZ~xiAV+uU^WEKS`Gtk1I3vWsgv{czI>8BV@2P$PD@p=vO5Nw@*yj~+`Ufv}r<#F&L8Bt}Y2P<2l4ZZu! 
z{j&B(%vixWVqf_DTQMgcA9rPeQyTZoH)45)HKOrXWI3!rY!_#z&+u`9MTC6*xN)C- z`UWU&{8RDF4bf80!hnT;*aGfy5(oEP@X1?uYCGX$6Cjiz zTn0}Fw0Z=9M#}ByP3V)vxACSvBRiZ1LLn2wcd?^8=ygfSECgfw?_EcT-`w#e@WmlE z?Q{a>-PZmtIY}>lG*2QljTnPB*KK%u@%76qSxS-TaV%m~r&>BX!h!xS_4t{>vM_rd zsXn}LDsM1XLOa87gP?FN%3&6=fKxWdOGIqi zzSkZeh?i&RQMj!5@EXiRTX7t{n_Fq&^E*k>#WHXyJ(7F}A8lL0WpCE1MnpyV>129c zZ50YpIF0@WbcBsQ`dZ!DSr|Z=uy{~kxcN{(s{uEFiif{`O`EBSa(fSRk#t7@g?5w> z;1=hUBjB{q%<%x$wQqjY&~O(u6MeT4ic%&xA^xd<`<5d{zYdT;FgW-L3_=_&U)U;z zFwXl$fyJi=E0Z7+Xydp@jRXY;Ujexw?hU9CYwIMbhG^gS=#LR6V)C~zSpxF0hX=`y zuw%gXNgs=XVIogDO>4ty5S+!h2jP9!WaDU+ZTdoJ}HG;B4u@ z$AP4dPS482a}OmLo*_PZKYbcOEriE`L)MjZA%;k)5LrsE&uHK^v4eo~3T^u(iI)Iv zkeCcOQ=CVRP~uWgia$XgWCu=$g!PS4Q$=MPXr>9CG3oA*bJ2;MYzl9z`GD^bOUrG3 zJgh>q1RIEa$Q7_kWaq8J69Hibt({{ira))GC#dfNqM~m*I)JG%49oYNl8YyO3^0da zyMciL5O??ydr2q)!Xk`Udu0hh)KFY7;)h zd-N#o>x0==+)(+`T($Ua&&N>psbNs{Gxl*-C@APG zt$a#+=h~k+YHZAwk?h;o+A1KNm?v(4EsgX0b7`ry)8}N$RH658*AKZD??pt=5)~K{ z>P>rH{GKK;#lBSeu6)nRbEf)%S1v&t1p1TXi@}T6l)2A2USx(g75nHB+|b+i?y25N znwg%~KrX>a+8nVX+#M0AAuy0m4jX8;j^1e0UL$IiR^p{n+|{m&{9?>;VfRm+?Fx z*V4gVfi$XuQib9!`r*R|QcI8m;&=ab#qi=qY+E2Yk~AQ45|Acw;6N)OiwHW7DxrBs zXRRxRVbtEf8!0Y;oAAj(05df$tp_c}>MANJz_hT+##3@BAP4pK_U<}(Py<-jr4?Nk zwRd;L@9EcGq3?02z%@E&&J;X<&i#`KzTHpUe?H_>sr!C%A35Lh}E`EG$fL=qP= zGBPEXuDiG#=H=aiK!9%6VR7+T@N06rh;unp%|p2z?|&_ZwcPX=CMw7PT8S+BErKuB z17&eg#f>jtJ_#`yROAWYL!SR~ax5&flBgqQ@u=jB)BR0wMl%6RTqZx5A-zL@0&qus zLPDI3w@2=H7^o_7Tp@=a!t39;vk|R>Yz62q?#Zo7^YhK;6p_aWjfy(QSHDh<))MRk zNXJnk8=#6X4x`z96sUbTHdT8BS;68tUf6#H0YuE1TWOkLOcUvL~&|D-w_xy!r_ckmMe4@m= z8Awh5GZ$${eMwbXwd zAw7vgeeZ8 zW=lAI!?itDje~>$VIF>HHH-ZKsbF7Z zr!rVk)AxrD`V0l&$^yyzM!I1-rQTtCM3X zUMs(lMn!}aNPZZ82~omB4FvS^WL-u7Og0$k>tD9AQbp1+iu>F!3C;EOJw#(NVUO5d z>LdMrb-oIAG}ef~G0@7{cSV6T#{y)N-*j~e&#WyNp++%?Snfd^8tcD4Xa?!(&&-WA zQ<&!@76-L`e0A2~!UZ8%tO;&CS(okXl6b0_>FE4&a()Q?-b!CL#=*{x2AHlI2kS2t zVIO#XRJ%la;kd0N3V{)?U+T{{J`F6QOV*St7>hbQGBUfc;GI0b3M>kyH~0Q+*n>7P zDDNnIfi`69*+?Q+@B=z{Unf#bs0>b$w>Qtp5Ii 
z(x@igD4sL@{Rc9Z-I&&L$`^k9l1=4sH4%`6d#iwr2cU=w zu8Q3asls{sa~!;Ybw6=th0q@>mWQ`v=aT#Zo+3iGG*(uVtg16FVbIcrz(+jVwU3t@ z4%*LOzgSV)Q6scqRuO3cO$X`*;2x3e0Gckj4WcQs{U<+Uo1CDc4CHKVjEogD+lFJd zNhHi;&s7J@9hkCReGbsTDaQvGAB_(i%x%n8Ib=GW(=`C!5Ff6!-fsSe}^0lP`n%2T61D1#m4f( zxVt!c3Xx-AK}CD}d{Vrnw&acGE6B?Wpd;TuD6ImZeX-Oz8K*AWpeP6eeg`ZT*4@JH z7w_J?H&X4lolw)MUV*+B78~wHeM1_8mWinu=Mp{O^5<6$Bt8sqDQMBbj@??~FtQmM zLXUTQ;}t=f)p53Rz8!$K#;CAzavK59!FK3+b>H^&G49eRPD?wCz6&XUZN@)@vFf>* z{SU^jyih~F3_LAf{rsEg>HOEH4GCW^VmYm~G}TzA62L#3{XDa!0LJ*CuGhbcx5X|Tz~v1(G&0D=r~-@d}9<$Y5DhW z`U`&x01L7GOY2E*8hBbqM7t8#AB?DDsEq5XFJ}T{VHa#{%iK`86<`a3?vc>df;(~! zsXCeZFRtz0$te67FHRJiG#zCHh0Vyq$Y!1$MqB|&iC<-6FpyD5<{CMZw4msnWcW>D z$)HVYk3`i612|_H!*OzO9GveZLmP*X?}UyAw1D^)vAR$^X1n!_hJO4AfuMxT1l<}d z6+=i8CoW92K{K`ev2GS{Y85~z`n5JX3|!?vh1AA#KG=XHONX^yl{rd7V|F`ckwZvO za2fU@B^A|G#DxvZ+`<50fJ9E=*5i;cKX~xqF|#}6_fcU{_-c^oQU=ys9=>-h|K&@# zD!Gtf133weQ_o^#F1XPEIwowNIAMWab)eVPkWX z?Ow7#9)RFDKtZJQ_LK=GhTqOKAVGC}WdxP$Y_`6D9xi3{4(lrv^An;Vw*M)g? zcvu0Rfs_I{!j95iDnnKnxQM7x49~hB#8Do5kYbJfjn!cf69_cqih3ut*-<+j1e*Bk zsH4PaLmENtp>QBx7Mb0q(Z-0672d}YrCO29#y!_SZGf?qQ1cL4KDLR=y(As~XgTc% zshe*gPyZp$C^RsT5;z=ied)-3PW1=q= zJ4?vz=Vd;lie|vV!vPCMz?=k@5^Rfv8ew~QdYJckheM=>0L8C_tw=0D5A8(uE?o0( z{|m~8qDQ-oHxXZ-QzQP+kt3Ox$9}$9cSX!rt3+WZ$-*!vu~5F~l%+{P9bB*F+Z!&6G#`+U>XkK~K5 zrH*w&sleUfh&b=o-F0M&D4^Kz_VZkK{Hq_ZwEhi)ZGaR^a zv_tvVvFzH<5Tvht`*!%t7cKVHjVnN5?{Mj_TqgIT1J*S*1_C1Mps%s6s3F1$X#-MH zS5!3V)C52jHYdJqYEq@S7|yi-!uPXN^F%u`1SqrTzPJJnfY27gW5?6h^I+1AQjsST z_69KL6i5M*=kzEM2;Gc>TYA?d?^5~Gvzat-dfpBURA~8jwzd8I8sBa6>CI;*N#~p9 z@}^>TsNaV;<;B=G+`j$h?PjI}ER`N@k$YZwL_O@zn#0TqD6&p|7)#7)OYWj)=kOd3 zAbDT$Z*{WlQ(axvBfFr=)YQ5WgQ7Pz8?tkZM-;|;sCkbTFdvVOW8BGM-twj;f#1O- z_u|LCJ~)sOCVIF29!rIP{K|n1I(qX%5uTW~D1^S?rSnu!_gt%$_4D(AfoX@iZ|b05a`^hw^M&br zW*zPAVW_&s=H~SXhLGVEG|JX)f69g^6C=Q8s0?9ab6~)Lz$7g8znPc-CpRtRO^l2j(z@WceG<;hR@Mh9Rs0-LvcHb2v)&YyZ3f2OS(8 z3DiNYItTSk8BN)6knc|$-P9^Vq1T|&eLa&4Rrxj(zmeCDKR+k_4S!w zC|CSeA9$^;tyNqv$HgH_&2RsuaI8dO1CO2#i%8V+vYV8Y)Yk3WJzYnyk7;OSe)&$-AW|_g 
zu>|9Y*x0++?qGNgBiS+`Kq+KDBvxXY53m->CA3{x1a?sB|Id^M#+#~$|G?#aqf(co z_>r9UgF?Kz_~YKANBaQ|Z&(C?Av$8Ri^SgmPDty96i&h^fL+>{g)%Y9cExrO6x3OQ zXSN#GQx^fN1J=e#H#fHe`G<$uf3ZO<2dqlb$wba_7)+5GwWO%$y}Wetd`o zv|i6tG=Z3s=7F@wx-SsgNWFvxoq$*g8JZ#(5sVoy1ejv4#l*1c@oXfhynZ!uHYc4( zg6Igf&eNPTql@ZYd7e(zHI%Sjr8F7`JkItAi|0}VJh7>-NO`S@yO6)0(D+%Fa* zT)gr}V}!KfBjMefy;^p@Y1*Eyc5B%!R(iieF!Y+)M9u>+@LZ=JFnwTUMdT~(()!`dnbKeTdmgm+>|54?OW&|E9_uj=gZo9Fu z>>sOtf6PO_b{!Ge;yKS{Pitgkl<-rZ3I`#yGu*1jf8KfMXWvIk&fg!7_LkDAsVBS7 zg`cPgio>}?rn3PU%f&nY>wke;vOJJZ{7If4g(w1;62k09%m#o1ldwPV*>;%XY8*(q z_#$dNv8sxo(0?fjBAp{R>S#u}=MFXefJQnti&iJ0{Ui7?jBHHV-^EW}aH6nn$OsC83d=uhsFRNS?Y)!i{cBVi^zbwm zdUuvA`oEIgZbzzcwsc_n*euMy1=M!l-n9~v{?r%$;q9$T!w!`f{r zf;z(ruVA3ulZ-(9FC9i=4q)@0k56sD)ZZg9J0#OcnpntnN*`A1E8uyeK9I}`@*ITclBp%Tvf0guc(utJbXykXcFkhj;l2!t zZU1(R4W%8>MaHEm`Y3Tp;n4y~@5V$uXcf}pqbQ8r(S30p3K-Em@o$+8Bm+@uHF~oS z%OW{Bxt}=qj(=BOFvC+5o;PCkNp~Iwd~&|5MH&tF*_$`40^2O>eg_BF&S_>$TNYKb zIJ@|*E*MYgTjrhjJyqhkl1IbAS(9{Qo2d_`V5ZxBE!Qqllq|Xr-M$p% z=Fj4OUIX@lR2`rllY*~-Pn{rk&WSa@Fc|vzA#=m^nd&$p6ScUw{40UW2WTUHORJIMnA1rZO$&3&%~o->d+1R^qiteZu~=gxvVg?=wp<48sQ0Q0+zEF>}A zvDX!Wv;jEY7gN7f<7k?T18gfX)Ic>${4EZ%gv1MCjv0N}cSPbyG6>7TLtPktt5C;= zeT7sD%2O4_%kUct3jP=iX2Fr*j@lNbl4eC)2w54?BYCJ(=P}0x5-wTa(bPl+k}$sT zi`NZ_q?D8tej%Y6{0XpeED)NX33eKEbCSRZ!%9_E?AikKTzn6s;B_poGUf z7p|?Qs_GWTBxf;z)sCD#c6TtA63Q^-4~ZUgl72{4q$aa`&}L&rZ7__}^%%e4wLBEi z`|%@7L?bL4aLuDJH6GZ=dbT1f9aL6;%ve=37?YTneez^2AZIb?QTXZ$gWh6Ih(JPKvwq?nfqKJ$)I{4SsZcYsItYHs1i~mZV7TCEWtE?Q zP|2zt^?+=N+iov!N+#j0HKSL`bk0~jjOS%)-Y<5iSFb0xWN>7N#b{3ss0iixAcj#7w`j6r{u3=n~-0(s-&9?YS?oBCW50o@QU zgE>+bAt739z>79k$Pi7m7IGa<76a-;l4}UaP9a%~_;J`3kSi{uu(}JXo{W^oAb2p` zC@uK*2|phHXQOWgHW5Nx2!0#ZQXVxK5Ce#?=vv*cOY7i0!SKO+GCN)lqwyITW7~hE zqMKl-Qbf;_i87F>B~iKw*L%Ly`3Nni2JybZ>33s#zUqe1TOid}H`ZieDUSe>J;6=< z0W~N45GIBopezC@7}4C0V0tYEn$Qp}|5*j-Aw&oMe4@vV#4;epjSrN+ZpjO4`dVUH zY-4SWE+koX#!t2ihjM)NMaaT!0#ad#sH6?UrdVEGmAU@U+rWU4sA`7}Q6P6u!bCW3 z&NG_o>R(}wW99)Z?G$PPCQ6TeJli)0^X(3pIi$Kks8zn3cMhY}K+z)d8RkmVQ7M0E 
zdLOl2{6X9DB!+(D#ywN{DAf8h{TCQ|2X9bIdv4nz$jKTXx$)MmFEhtjizJE)G+!hX z2npQ!+B8%{?YG<``)xjJ(lg?MLUnFQoZI?K3YB8K!h?6{=bses`zh}I zt0jTv!~!TCl2{-IN+{Ft!GtZlwqi&N)e3qenGYKuzlM<^;@Ua|(namXH{bwgfFeN1 zN^ZZ#z<-(IOu8B{#5WA)7*%>lQ#0+_tso>Mz`RY!Bs~1F3SGP~WA)7EQ&G_h{ZF|B zjLvZ{4{Z_1IKWg}F+Jd?67{BbV>F*%tBf!Ax!MX7Q{!HK2L6*>S9`K4*A~xit`!-c zg#bps)bFqigrRzMiKcNA$pBk&8=4U*X6TVGwh)1sLvlY5fs0fwZ!! zDj7lmT!DqQ14gCgVj!C89T8&U+9SM+67eXkHc$4u3>}?;;e4G!yi*w=FHXZD%hME#sI`F0CzC zDUmEmVgjipElf-xA4<7>J4lbro0^)`vw>U~+QL|rte2M*dg8I}o=;s}d!kgLvZOVX zcVoO25(a5wWPtE{7bOC8I3LKYAkIrhyy0a0oSjWZvDdFS`AH;D?{toCA>JAQtUwjk zB2muJ?!SV-n}b>tcLVJeKfM#%2SL*h-CDpwzfr%8ZEVz`9^uT*Pxn6p#&U!-jF|sS z%rAAYYj>Du+xs~$@>8MQv5ZjJlpj8`t}Tg6TSjGs^WnQ-vh68rYhpyD`Zb{ zaB^pB-BrrT%i~Y~RMhk+=$a*WO7ex`>dqV+US7HR4-HDFX&1Q0|7`zqx0oBHyTeMLM_1~DY0zBb`uY|Pr>^W&j@EGEXwEXpNz9Na<2ttG&B zKQf9(`I%8tf5`sMGRt!gJ71LsJ>FjY!soH5b^&btG&mQ1}~_zWK>1;0WbHPSpX zPP~F~Q+^wbgKooqfXjN)#Do>H6htbN9PWu7^!P|$Iu;+FAFf1#5<+(;5DwPH&*n`ndjCi6 zclgmKBI!K-hK#v*NIqDgLF;>nAO#q?^ThrP&7_t>b&^<-$cBJK5>?>LI+*T&n->S0 z9W&KWurxS@0dr%mLg25Macu}ZAogkTpiyXa4CAR=hLXy)fT6`aK1wkRT1>&me20K`jEHg zrD4DOITEcnJT;gOL1`i5ji6%z*r13yoB?mxS=`VkMnrjEpXBy44>jIkmn+Ur)1I? z6>jgO`g38hX9=?Vkimj0oCb#pqK}D>$M% zhQ_zXD;VH31@v5%X3YJ0)JKnJQdF?kTLahaK$oozpQ z>Bizma}o|8e>Kzpx*o6#hwXlgE=fHQH35(QFc;TdL=%biN{%_~Ju(dmZ1n^o$yJ#j zbt*VNq^qDBlkr`I5zCxUV*3&JNSsG(cj8ro8y$<+&C-M__rLMs@w7;v_sz|qLlpV)U<}A z?_612Zfnvf0cerFIKK zs}--gtHKhyb>h@`rznK2W5qTt6}CFw$W1dFc-S!}pXFEXDPQVv>IYMoWvq~*VN6ZH zy@ES_fs-8{_R`SS-Nq2ZCBdz@E*~GC7q~EwKzZ$i_(#^@p~NVIbGddT z;2eB^ya5rs9wiVvA}WF~!9XMfS~yq&e~bAfF(n9mtPso)HtB(B_f6n7yt570w8K*Q znP9D;iX)nRY(U~X{~CGkC3FX8>365R%pbcY*g5sk!c;Db_7`HxfE6gI#PFZaSPfEFNbAFZP<8$cqsw{nLkRa;A0!Q3ypst! 
z6UE~*KVZm7AzBL~^9%AFuxZ!>inhY^y9xslqI4VHr3y<&-iC=x2B0vKH{XJ)CDpT( z0>K1dU7Um@>a z1pnm5}a1v1z^peS2R~sly~P{EaPH*E+LY%xVWOo zo+PFcH~4NmMV{3pYR=5|8rRF;Kg43iwZ->_FrEsal6QS5ym@o&**uM3Q9QAG&MNAw zc_DNu+5P}RH>#`#a=avged6zYga*YXIms|Y*>Jml(?S4;phF|yzUi`xNC6;ZpzGyX zKPcPNlS_GS!lHAV>khhYL)RWY+FqY)-day(^`noK)nhJQD|n#n{EK*p>-DhLFDx#` z3fl%N`s{Aw^o#m+I%n^81v>80OI?kje}Bv0_|hulDJ2@E(sh%5kN10)!}8)ShxQu9 z#~)O5zwqK_x;pI0THD^6ofY2KlJ+=rUj-W`!5lnVlK4mQEI=HYjMGD}$D>UpGpj$! z%gP$wz3a%d*eeIUACIFXQm#o5p)B${fJ{2jh+rgWf(-?RgqQ)v6PQMX84Sn**vTV; zJ;yMK7cu*A{{22oE_Ndr#0VFtxwzO8KaTeD-YFg<9dF=8;|;ZRlGGZHts^Xmyw6yZ zE_pv6q_(Ru<%vs#nVPA(PDE5R9q5e7?YW4d4DzjI_IVoKDOz01-yYeKC<+7wydMgY zS@G)k)Dn6yCqZyMx!%mLKY1~)iXoJ@y}iqRXJ0ts9oR@t^T2_NDI>O+^aoIaJA%r6 z8Y3K_W?qXk>wge#-;s8&MHhZmH3Tm_L;6>m7_-9u#aVCGyEa+*29_x?P)TSKlMYCy zeBaCAgu!%kB#28JRv})&l|eHRKyg57=AGBkGGr`?wtDcS;B{lUJ z8QTYj`XI7Z40ntu2GNUw?7s?1p?G0b3ogWSjPD@<7mBx;04;}T2_eP#*UBMXS}?K1 z%w4ga%ZukU-_AWoUZn8qPHAX%;k%Z$cUF{N)0=y;!F?|MZpt@?<4e4~G4WFc^ z|3abQzhJcOMUmBZbNiaj%j2nkCUrs&(6)@gwOzS#`Ox=Mtw{mJ12isO+)?_$l+^UC zEV@ww!tAGV-u`tDN&UT4C-vjw8S!^>W55xtDh3SG)A?30O6yumj4gy&FP7vRaF!VI zj*gbqBcZRUVeq$7ql|;+p0K$&lkaVQ_qTtJsDSKYG12u&vJ+`-RgeG#NV^07FP^>w zoa=u5zePrbj54#bDUoEA5E>*aGomC#Bs&etsAP|nkcfp7i~GKJRhguXRf*EtWXi*`-~q(%Vkse+wlH?l0}1#zA3#h@iU*fe3pY z`~zyc!pE{sgWrIQ1`gGG@WBEUB%MA0wIgW80gHG^aLx-6X&h`)IN*kYgjKYF)W+_% zx^+46z+KK%NoD2I$}fU04w1kHNKOVg1Xu;z)b;U8@zb;L1)(hl8BFRU>btVwiP8V; zKsA6<-pC=20?Yp0;_#y-j!5C4m6leek{<)s`(ka4JJ9cNeRBJXiAe?b~Dn zBc{n4Tm}(ofs+Q4hR}^K*$-_Qt?^WFOW{l(0QXz$@W`k?W%>ag9_j%`CZ=)}KqN~L zHxfWQ7C>yM&nh2pU^YRU9t#u~phptq(>O(5eGZ;Keex7wN~D(f@w$~ZoGOFl(n&|)e(;lL|cLsfZ4$6I2 zNwaDQaK4pzx!vqizWAW4s|zI0ZEMGn2#9O|?GXXe@9=B6E~%qhOY>jLn~xsdMnCeS zCaF8NYmS>PUU|11{IOI%pd4hC!h|v7FRRym9z$W#8%3 z)@x03=7T>3lox8=nDg_ayi(gFy;V9kfL+DIdul3#CqbF6qXX)??``koxAyw5<#9*s zg_~@NJjLm;qdSF0mLPW_h3F$#K>pT7%c z60E>?`V62gZlNk2J^t(q8U!L40cHf9(UT%wbK}QZz>6U&z20x}CVL}TRzN|I$ekcn z`uA3L=~56>E!%)V#q2q|-+g8$Aq&7W5-=C1Bqxxxo&_i%?ptu2m)(pSho~}fkJxp- 
z8585_1KW$;KuiM=w>2Zp4hw&`tSnd?QF5j4mzTdn)M?X2=<^{Yf*H*U9zTTf!Bg-9 zSRjc7!9s`VJ@a|Gz9>WzSA6so)k6RtqMTzCOahCZra|D$a1Sg+oqF;U0MVRNAi2@(2+=CF#45qVT z3d4NgL9|1_4pW}zPvbx#Ed!*XtmfK5s_V;EMdO=%{`m#yUKfs3mH+zT(6aZRSWOrdLn$uYv*|2V8atY_w;j4;pMe!A*V*^ zwp89~o>MzNIq84kPk3XdnPkgN%iqBYvMiG3Oe>LW;-8f=Wn~0Xtkqiu-IId@1N%`t zlLrZ?9H$dr4#^w?Xshd&9j7i5qu;}Vf*#~h(5@@0z)^5RKVe%K=TJf z%7XrZABQ1KD6pCjZdkB|(hWgle}9DQ*tMpGmTt}b=KiEAs6_+!Ir_xb4pHP({r=ez zj_w^UI_U9);D(6KL`uvldxuCP@PW?oU_bUTYF~8|ybrk>^pcR*MPK`wP%kb{4U~mY z2sm|%@5Q43`tstG5DAfdn4K9G74-*?Y+xs3muQ;D4m?320t;CzV&?z|Tj9Wm@^cQE zJT2HVUnL*yQb92y@4Kpq-xUd4DfzvlPHXm8?WYdn$Li@(YSOkrMk}0|hiBEq?f}6< z&IPd>fW+p*-+{vOVCy>bckZt~Kf12PguoHB473*1U+mPX8;RQ&rcutscEl7Ik zfVdLjQ&F%klI7A}xCcJ|@9@*{!2wBNNRmhls01ee8?mt+doOmv@V5B<_SRp|DzrzV z|&~d-wEgGv3j_M`?q~569|@`sX9j0;mj%+mG809cZ_c z`PuU}nk2U_mHm&zz1pz_I-C)#rec|enHeV>+S;zcygtq|-w&Z$h)i{;gNpzgcn-|f zXWU4n5gVR3l@QJ8I+hf-S86gRE-sGrj9}_uZ-9Ij#R4FE1pc}p{{jW|lL;jpr^GA< zOoh~4IcUhPBPkYz>KzRL+IIAx-W{!pH`Fi zT^w3WdRgE|yLqpW?dSIgpKTFk=1c!=)6*Er&&O4mQ+(qAm->%4)>WT29HHp{A<*f< zY;^P*d;ZE`usoNwYtZp zYwv!v-cEfUbn$2iaOM7?p(-qaN<4Xa+%llu%A1Z{FPIXSu1mnP?0Z|vAXJ)1HRq>N?8eszfEjlBVu zkoF=_qtgYKxrkj%Nr?+@h150w6Td+bAB#Hj3igk;xA)bLS)~L&1Y(ek28zOwnO(HS zQ_PS#^TCtI_JK({ml5S_k_`?0D+@ejPr~?yp9a><T$%x9U(72JsB2JJ?& zk^sNl!RJ}8TX5N&sD{B10(2KbErflKoUY?&5f-6UvJB=}#MUNO6bL@h&*9fpkda2$ zBYW1yrjo>ugDF7c9*>~3*)Vr6ue~VcL7Sz#{x3Fwg^>%DH2pLfD~Vw48q+G zV%r*+8Ln-TZ3z1d3TKPb&}wu?Bm@x-vE2ZPJ39|VSKonI5CE|d%+M|_b0gV%#m zJX2H6kavxnyXYd_+Qx64ozmE#M8A$O!jB&{3U)%6e&JgVFX~R%8uxy^xFi5k9kMTe z!F9z_YIW{;n!Ij}IuVc_lnImAhhoA+O4CZsi?> zW;0M(gvG@LK^KQ?&hyoH-=9&90@y})*@yP_KY31uu+D+S@|CP32S4=RI2Znym}LtFzJ2>oOzi3SZCD;j zl?ccV1#d&jPcb}&)-*ZF7YGM`;g2Q)}Bw8*tLLwEna|Dly0w%8w~aw+J8Jn>yOgz-3)L>oj<=RLU;$(Kyu5r33#Y4xvg4U1CMSnkJ){? 
zL$%a`j(cAlxyYQgKJ#T)^T2cF?exG_DcE--eCqR}B`fOxe)!^XkPXLnMGa{_^84=L z5P81DBI++ZZOEiIc)_?%{3zrjua0RvgUk`2sK3UYqmMH6^kDr$trhAR=Zhwa5baQ< zLg(^CR4O-Mg@^A*PF9+`;9Xg`v%tYYOk0iLe1M1{A|@sK3zzG2h(>LteK691n40%F z_TGcj0wQ^EviEV}v#fp$pb8!Ms~DOvqU9?-@q3haoGsX-Rcjy;I3Bc*cfy5qP;C-9 zHom;)AJx#<*tC?aQ$miS|HA$J0(w)L2A|3wP<}gHOeF$41nNTHS8^U+w2Z0KWo+pm zq_j_N&$F^}8t`*67SxUhF*G;Yi+#pz<>m;mpiW^5#5Qm434snFx_W0F&ga?RnymfNP%p0e?eT}UXZbCW1B~Ysj89vb~<5b1*UxjAqD1wNuxViaW z>j~4~A9g)<&q;S}`II6Li=auCFtgKal5WLhpXD20k=qya7Wtxs2ZMuJ`nGp>KXrw| zD?W<1yu6&?muOPHEa)N=I09d28!M{@q~Y^Q%+%B+5b}@|Z-`^0;4>f~wm!sBAZL*h zT?1%@SoiQb1|bEMkX<^kCTP+5tLDPla7!It^afBvx*pG~qk)#3B;dm2gH*nGHl6z~ z^wgKXbt+GZERKfS**3q7buwI-W~(i-HCweva&TV!Wm_99)dbQp_{EE{;D*`yWBoZD z-vWA*K0OGO=VsVB9w{NAel&IOd9@z<9ie}H5&P92fmGzOJ=HdzY^F<6#1sYe)g6Ns zR=s>g@=RXo3eK8@Aw&$7el6H{*g23H#Lt^3hr-(!Qy#D$t12osg5AW?3@{g8br#?J zKatq-vcO4crP&tx41lgU8*QPTLu}|6a|;et@_Y&rU8=3eV|MD_Ec-V#pKTc_97!m06jP<~^@V2iJ#T~APpGIQ z8b68=-gE=Dh5ax3e1xQ_t5@d7+eB=% z#V$xP6D`ZG!}=0TuomL^cXVum!rvY7XjmZ0t8;%u#VOEcKtO1Dd{Rv8s4zvHO;Y<7 zzhslV^uKSd)n`qce3%7S9|5ksf(}I4=X5hd_D9<$5V8>dp_XQKvWqgH=*isjgkNIf zXrLL*MQ6dFt;ik0mF13{UosMv&l$6e(kT6qTR~JmWKk#%>BK~LKUqTlSJ?_A~44|L_((xHU z!Af@4kt4KczxA$O;d0Rq`m711wW?j?z-UI;W4hVd-RJ4|?++-R=Rcx$PW?6|U)cZF}?B9-R$-MgC!&yF((7#FO{mXbZXO|S(*fGoTq+T3*Z)Ax$4 zRee@gGoN_))N{8y*&6sYBp`qaJ9o``Ie@Am`H0iX0kn5hypQC9nZkous$`8*Xw2SH zZr&Pn?@ni)e??p@^<8@E!LIk~*yUJ#g#_N0cscz}6F%i3y?3Cdsj-%+%XcNOT6|q7 zy^6zjhdu;(@NIY$N8>h4Ga~!Rs>E5{8=>#+A%q-afI)-ko2!vpXVMfvvV@k z)O^~fGvgSxK#801$~EI#>6Npu*0~N+ULD=Zc`A?n&lJlv*Y5AHklitps zfG`>7+E9bFI`jU<`X%lYV*{%`uK-*$2Ft%>5e=A_zhGvbsKevK6Ir>J*u*fVfS1KZ zdw#V}KnV7a-RJ7(&W9;^Q-$xM=1Db^zoO&$Z_=7;m{$HJ0y~Hw2w*6YeE}K-Kzln$I-P6-HD zw>9LGtb^-BxntSbzZ@jC#-c-rg zqvLhRmw~-|)eVoG8y>8xN!8L}lg;RPJ2K@S_E_T2;Ha_qI<4Byuptk0fLN_+TO3z2w`0aif`Jp z@4EPBqloS7!w=#PC>+i(%<+`1kKxS~grlwRpvP$&X+{yB(9VN_d?YOdJY6ryt^spz zk&9_<)*t2@?20<)8*a=vQqnWCQ|L&(udKqftR&{#?S68W zMX?W&f>c#Sfp8vD_Q6F$>S=WA@c58V!^d;=B!(BC|;*H?ju$h6qrLoW|czyf$BehF9^y}AK%N~%? 
zSL`k2%Ccx%NYvE)fgmUsSoy|T^KWsoiw@6(%Qe8kx@6UuJ~P>s)O$j8K$aIl9MGZd zLAoL+_B7-P5RU@v--K7Oyeccr6eaOxp`SLn^2@1Z3B*=&fj%H9StwvR4p$gG(23wW z5zinpwPQ#5>?Y%4W{vwy5^*x@R=U1_Vq+HU^B5ZL1$(^eQ*(@KXs&?m13ouV453u5G(eZFyZphIieU0IGU{#OF+;b#HBLjsBV;68x>}c6K&H z*G5LCt}@GKD;tbWlp9O;$S^C1q@)z8{OlCma7`X|#+>Z4R=i1tA>1r=R_|td0Q$rc z3m-~QXYdgRTR+Z2(FgvI#C|BR{yhtS?mg`k(WpagvWT!ux$vO2|BGA3Qr@`08&MkMrVS>)?%bG5e0&OffS`z4?Jj z5G2HiBzfTUm&L|Bf;t?GGMItwd(Uk_`a8k@iR>PWv&5vqjCvON5OF{F|E#0h2;KDd z?edo1JJm#=AUOV$SQ`^lN&tkMfP86KRf7W=UH;WcxujkJCNS*lUZ^P&7$h1cvz40K zhnQ$Eati=WQ(FrJ{I}7cGFz7P1#Yv>dy5QB%)~M z4L$_Rr*?6+;~HQUr;iNfEv-&B@{J!{84WeQ;N-F`Z_E5dgiKB7-!z#h; zcFmoMpFfmhMx2wYijN^+-vuz;rthRw930e^d>H3cMLqyzskX^A^BDu%92gKlH08wx z_7I&Qxp0r|P9MN|kub5{;xasPr2if}M{p^0j!FFv>j@6PfGj$*zuY$rOJd z_nwX8)@{22l+&sr^L>Z!RZG@dH79*y-xV!|3#jRuv4r@UsqO)edyE(F-8Pg5z)i^U z2)h7IcZ7Z-j4iybKls6eU4&!O`RWm+ih>O;8xi;0f89e1GTVYaPMGEg=tpq+V=RTA zmG8LzRNF!7Ks{C1q>-*e;h5vAVpb-(nq_dbb))O(&XxlQ>lfb0e9K235T;}Z5ZN%| z?IMm@*0$>NuG@R&Mzt}*=gxsxyo=cF-uz?ghqQwi5k~{+b6@wK9?29eniil%b-?4A@JAdZ&`TEA&PyIBL32nUvC2-z=j^x1TO}2EaOeE+b#*EBR z1%Migs5b0gGRz7bEa3(LG#F-?kx+X(UCN+33?I_~WkK8fy?)25+&^tbpPvP4QXsGR zQ?`OU^^t2``(8-gP%)7h--%N-V`rY|hN0|lw|ZVL<5 z<6l2gLVN6kyiaa7TVY9QX|*vU2ZyPu6t`Z+7g33cs(#PSkF{d*)Eh{LH@HH$+bP4W=JanDOT{B$-_Q zev<4nQW643#j!a9StPtDB>WyfwxXfoK1tj74f}R2V9cqC{3}D0JcSs%rQlE;WC>MW?hB|56N+;&a+GzIP8ca9v-zFyik||)R z5I84U?+ClS0dXVA_yRf!%m6qM865)k1_-1U$ac+WPH}ou5S=CPB-(vu5!#_skHUFR zW<))l@O75VXe&<>p3InM61=0D_#GZ>5?v7y!3;4Nu`;0PJ%Tuip;{J#ox|}2Bh*!l zw9prhgdm6b{K2atS4MC4dW;4?pJsczzgUEJgf`qTq!odYUp2XzOeMxwhPymg2yv-r z0C`{`>5&Yp`Emc$)blmsx0uVJ{X~YNEyl1eDf>4K|Gi?RaN2|>b+^QEn+y*R>PK}= zIyMD0Lsio+%g-8o=T=gas^vU(yvk)~*2Gtv**^#iAk`lLY6uX<+A}$@evzDqArqJg z0_O-MvB{X)l~U|GhPo>T8CZ~|;~8yWfSd#5)MDS-Do`#ZmFDJh$qd~?5#01#OZ(%} zn**-xp{EyA*qZP&EJ?`X@1r&-1aOeO2H=ONqFPL~_3tTTyN2=$?GYs@_@PX#ovV6` zbOxx`2)*3f+e?zOfFR}~6eD-Fp!eAYmtABEcb5fW{9bM?T?136V)1P;!4SC}`$0}&Kt&WfLE^3j zbdEJBmj4PV1kBfO?~(BX*Ou2d0FW<~vy7MMg+jJ^>_{ 
zi$!&==?O;XZ9wU1`|C#yOuKsUH<5Tw{Qhqxhjj?3tAx-O23@ZYvOVexR;nfXOeUf( zz}|8&uX7cX1z_O1k97&s#tPSm&05zNxU9=Yu)c`V4*`4lQ5t4uU{{I$5_%CSW1DQ5 zJ=T5<2f}J9ea?N}$tYiYXqQ;WOTQ;d{^|B>s~(d@@%lE0E57pW4~>tnhw=z{;8fVN zRnQ5;R+$4)+q%b@Ovo4p4uEmXR`852abLR$^o&rOfQG-#Iy#;N@(D@=Ly^p1TzljC zEH&8SR?hB3kDRL_0s=n_W25TSX=X0&-fS7c7L>CAVLT)r04;|GmJkFgDF_uHtO6iC zwgV#AY{Z=pkp63pHh$tGKmd>q_!1%>3}?*V^YjoLk+`9!KzXq|%ZwvRKvp2dNZecf z&5^A&jFIlVfJN27dGUg&?4&3#jdFgcN++yL{_yble}+zfAbiCFD%XNMPm z-TVKtobTzW`#vh1Wu79vX1Ft(kwl3BE{7-u=O9Mjd_Yh(z9gAU{pwQJK4`IFZT>>dN=@dR5$?ShWRb#MjIb(jl^`F8;T@m|}viTZ4nlA@syH{I3wxiw2-Q{?kw zj9+Uw9N%wMd7oVtG^yR?itTJShRHR#ZqpW!D-a(%;40!E#Cn32{!jfvhQ27+{Tpah zq+?ec1V5}>cVnHF+9CZd{~OBqKb#hmO^y;%0^+q0O95<^{7IloNDbO&3jLQM)G*S8qf)-M>;JJZC`hxI!JSKbyD1HIzVchZOi5Lh_ zf>aZqLTX*bV@^lSI5t1osYUz5mH~QY;LV^=7CS|8nvfyIKt6GYVcIeAWLKTbP~3{D z8uidgXi7vQ)RunE{hHyek6FKYt8!^y{gahK*)tdT1Sy`MSkX!IIB#nUgLmqq3d~Z5 z7WAPkLwA%nSpwC10l9P+epyz{wXGK(@{UiPV*K9P(g75`fV{{7ql$%j@nm;|5 zjoPh2ZZmvG4~{+BrFUbxkF5-GJ=q?fxd(7ob9Tl zBcea?2WjOjuKzvMI;|BU0*MzKvuLxEC9KXnpf$n`ND+JN`NZAmQ`zjCobee#=@3XF z*a$9~)XzTWY;8%@`8b7%P)celN|qdiXS*lg-Kwtgw!WT8=?GKy1I$n2o=t0|+X0@kEgPcO(=;uZ6cLdbJFA9oz`4b+61yJR`}RH&}He z?rA?uNU)1vnYN2Wz+t(Xtx&O z3iW&NkKR?)wJL;TV3lL=>*t=|@Pt2rmIVNDQwa}VBI1d0O7w$aCIjgfsuFlQbh5O6 z9L9GgsY3kxgZ}i&($cw3uQ>cST<-A{*W^EW4g?kn*g~mWGNY<-l8icrvg(w;Nh;XiQGM6B_!k(ftm_ zEnAb{L9JZkjk!WU&=U}$75rIb)TnAz1QXohsFor7Wb|Cduj*i)7qv?7g_A^u6 zHyukEGxsAcExV`~f&8jB>T9T1B?N_CzjX!gRHr^bhwp%f6K0;@&eif4et?HnPtn_B z_xY)6fmFRAOpjP!d6gH7c$B>ZOfuUWe6XLfTXo2w*{O%ROm=xmv`E%dZ7#4q$|$Sk z^nG@pgvr6GAU=*%298(aL@^F|m4(O2VgKmRAI=_GGsOsS#9`M|vVo*vVkX145-&N} zsFA%Pj~g}f^Mc|<_3qwYZeQOuHp4`4Ul8V&e5d19X4W>#_K>KU2*BBSJWhreSVN`J zBj|NVfr38`Vv7u&KsU8pqEbZ0=T61w`1n%!@$c}qB7W*(efV>RN^ytpah;d#u!7uG5nX0YXlyX)5w4&>H_F-s=V}2 zLZA5dP9AAR?-J&DlCW%WyS}bek;5qrW{~{UIGGFOI1CaxzP{OQs*V6)Xw}RAm6i!W z+Cdcd2vHrED)yXts<|-PYyMkP$z7L9bm-x!x}NNW>mL^sx9^}HN!&!loA~Kw#aCZc zZk5B_BQnDS?+uw~=gCxP5Y99{I#&5Xc(x#oCA+gEDM@Gs+#OdtY|yL0B9p83>7mr= 
z(=!WwGh?Z$`(Io%vTGM#+Zm($nlESv%z%&|9d&m8B{rF=)})I<0H)tvi2p*Je-e5N zGs_v=UpO=&g1LjE8Xq5XN39vF;Si*JYG$?aMp4mkJso!9)i^mpq@qvfC zJ7UCd=)E9w5OBhvC7^Z8{2c~u{66CnmFb*B3|%3yQ(@F5r!h=45~UDU-AnODV(tmGxeFhAjAHxU~A(OU{HyiHWJkjPhN>Btx~1gd9_S}wlNfaXsN(>? zZ?I*(7!URq8LR=#p0BU(N_5(O7#DDylpZ6QTO4=NhK&V6D^a|PSo_;42FBD0IT*&hQmx)^v!nKKUjUZ%T zRiVg2!(`ge!Op>U`OQ&HMRlB(I9P{~VvqtMRgpz_-4Gcq260^TU|*>}VlL5IDepaC z(7Fr+mRyHYQrYhgQ24G%yrD*C{A&G`(TB!ES*AwMDD7niI_*02T7L}Lu(qYGpBtu5 z`Qkf$8!KGhgM*_+)0Yl)n>Cz`1Z{vZa@eEf&}~Xmg#i>q;S03-JEEMcS{lBeCNpLr zGq|&rCrjl@Y*#^*>+FxBu&mp94X{lSSqxrAV~YI4!$1H#aK4~EgVYxjLJy%gnLVHW zZYPPb1l|5ET0^ksfUBD6u`v^2CW~sTNQ^*&3KV~Se9INM?QrA5arOfZTv5Iy+?SGf zW#NcNdqt9m6)7umKadF*ILXfwM?P#0IGr&V+1zfjuMRFi!jd9cPNSbg*;`n3!N2t@ z!nsL_fj)%H@ImIP@YjoEa0(hnYnnciz1X(_V9xT2mc#W5R0kGWY#fMvgPgVQ3T2QYuto9PU!D&+qLvjW7@Cy zZ_NmjOi!$6Sj&K}g}!pSIXe@T>YIySAD~B)MgTaLDVdeCc7D3F;!%3<>uUqeK8vy;U7GC!ei~PuQ*|*|S7xE}jQ)Vq0Bc%ix{T^sO$arXu zeGX4aKt-JF#8P1B<--;h+@PT3RtHvQHeMauP~vGYK0zxR4Oj)T^|JB!socCEu-pwq z42AX=LYd9b6sG@{4**&NT(swJ;l-~OB_}1RzJ8yW@M`Fr1%FLisC7!Wskr6R$=h>!CqrfUT$K`hA z$_FeA;0&ibMRAGtzes&jyvm%cfYG7E;x$(*mfijM=U3YYTzj*5FYJE*{w9hNb0x<_ zYm{10FBFY?rrXY^At=xAQY6$&y={H7k3!?(YH@g6+VB4!p#rfwynEDk&<;2a9%C7!w50$hd*J-S+z$n16gI zn}hDnvu80Qy*)!k{;KoC$X^-dsQ*IlX2GlVjg3={*oR8!`+52u|-~KWh<@9Jg$b zgQvWp83I};Xh;{252C51mH6fjbBZ2x z#R>?iJcE(Ycxrvu_=isT?s>qYasnRT`4O+ttHUy_Nwa+2>)ot(KEG#?Eu^Bd_Az$J z%)CM69pB*`UF7x09?y+rWnm#weKfnaD_nbH>yJ|3+riKNO*Q3}I=@Aj0CJ2!-~=WS zglOyCvbKb6^7`{$t}ljv+tPxZ(=e@acbZZah8CJ3{5o4uo=uR+**I~lpF>Bgz1 z3|S)>#fxC(ze^KLN12$}9=(3rDf;@^PW`0va#dtuOFcgpYh6-dCv&$n_}G0_x~E?! 
zL?k3I6k?>YRQELH&y0E(=1ryw0IzWJ7ke?%AAeMSKg^iMtyRLO1x6H-nS}9yh(Wmn zVT-kEbPF~v_T3&B5K!$Nc5rx^*_0iR_#P&9cFzj+pITZs={V|LPE#bZ%R?Ruz~|lS z)}jyVvdb_PuA{i{maYi;p4pih5(n9?(D?Ya;86#2#Y1V#n6Ca~XsBnA{-XCg`Fj=& z=`jd8LZHhLys~LC<`I8dNIWICr;2KYZkiaUE7xbJd*Fw(+tQA7N=r4w|l#B zskc17#>cTTkIhk1Y=KpN?1R9HP-!q=r`IZ^)z~y%D|XFTT$}qeB$H4cQL0kJZ!aF^gtA-08X#Gv-q3*;mqZTV^5Z+!?SI?AAg=eaqI5?l$JOb}S?};+ehd%y{`DI9zAi+OkMgV^`=Kxayjk?9 z6g2+jo~b1z@-OQuD%z`<{b;_7UFB4{zI>-nyXSMq>tRaI4%hE9R&&y_wsrJ>eqOM+ zZL!6U#^4y<4k}e*1OG+O&bWS|=X_~~=(u;J>(PpB2=f6xo{M@bC@3gvu@K$-DP3Ks zHubHw+$vX{d#9Mf^u)K<+vvqluO!`oJ8yLbZO@-45gtXn zvo~)J&-?dut97N(yiK9c0LCKqZe|1SP+J&G2vLC{I*35RFtJ_mv?2jW8o7Yd)&1n4Au+=T`BM#pe?r%i}DQ?~Ro)TTpb!xYpiTYvlG)innb%=R&4L*8y-lhML zUDml2;rHlj-JWb*wnLElS)(K*4Le8=;$Iff!Y&Jqjjh4Ku|ws$Vy)=4XlV&B?)_n& zN}P&Rr&2qMV{SQh(a_L1xw(}sHCAiRQ6(Ac=%gJ2VuHB)4^2%?o~Yv1?(C;grE@~! zk%^(qdIWY3FH>08hU+$HhE>A&}WsnHb;S-z3Q~7;Q#)ehi3#zdRX_6g*hLa?+%M8s1_aTW7m{;&G0@)50Y?_4%;} zzE0(MdObSQrhZsG?a;O9-s7nIoJytrPG_{JxR=-|PYFfcvuN z4@K5B%*|hZ^D<$l{viZ>KtQj0`Mm4xA2u=46i+;9aHFhEPSF=E`7k*Q=bL7sAlSmK z81uoXhR?{@xEix*JRrNT3-RpIMe-X)g%vKHFfwwVRi8a9seh6E{%|Lk;gtfdG^=B~ zwxNTw!QiHL1PaFNqf0j)P&rg+9wuO}qWu%1*E2)fE`y2!YT|7|`bRaAL-Cdvxw)^; zU+`H=X^t^cbGmY~RX;|S_Rf}@+)DNTCfa3>=m!;bwteI(YD4&Ad5FqtH3r+JAG_Oj zW;dLMSLb>DCGojbm0IMoA4;RdJDUCLsGV&3hMWp$AtcCndUtF+tt zO6_wqqe!I03~8wZ2N4C12i&r2`LwEZS4KbclH}lu`K0%Hdi7Q_A&ML7qGCCwZs~3$ z*Ex>zM;tlw&YPTFth)El^@&EhVk1XQ#nw3BW-=;!3&dh^>WlB;+D(aZMX1qmMhvli zv3Sdhu%ARJ)%z}3Q*d6|mRzZ0)zo2gi}Yj1O_&Rt(-V*3*a?-K6I?EQI}W8C0`UZ# zekAX!#oN^z$9HVr?7nC_GoR<$?5};bsoP3ISXkKC4G39?W@9|Hfxy+%43~`nQ?zY+ zl4rL*StGFMsOtUjhzLTQto1zo5}RyGxs|o!fck`aiz{ES=PAC(}@`-e7+PtG-YWDW&t`);Af^%G{?+==?=`e_@LrJ_~*&%;FaL@Upm%6{Tt8equ zHqzI}e&sGTy5l3SfZH!$GH$1SIzp?yVDQ)+7TPwm!gVPrpNsS4H`-s@_j~Hngf)$7 z3Um$6)6xQFW@av`Xr}JX{&liUdB1!|adkPD^HHvL;5X7xw`(6g>Nq}I5PyAzeoyQ% zBCMaPl|Mf5_3&h8QiW(M6^;dr(qVA7_315cQ%u#mYV6N03^Ef@Bg>C$#8r5lou67A zHf1}`AZmbMQ>0p`pOcAmVG=)W{Uo{PWvs`OUIbW9ywDv4Z;b}B5}e*k$Ne{p!tG_^ 
zwu#bloSR6Zar+?QYCJhPi2x%(_|=;1-)y?~;mp&*!tH3r<|s?Jlaj5z;|bv}QMa<< zh32b$BK)z!W}l^R#df!xj$o?o2hS5<%Ke8AX*OH2*fXGX!IKP8`m-J4y#l%AB{8$u z@WTSF+w7mB)rIf>$h27UX1!DW|E3!lZ8A+hT+9w!=3EFcU5bEsj@Z}H4(B@5RacX# z?%TIenkLkts@w(4NHntc-3L3F^T#v!+-^`LCgIwA8U0#BLbhq}-*gxc_LycELL`2c zQAbPf?O*oTLnYF>A2yvrO{Vqq^gFeNKliQeddTd@{t;US{1Fm}FO^hu>?pqGZ)9=v z{1e8~T6m2b5?sKj@QaG3>m4Y^ zk%yiuzD|Rei>v?Dr>8$3WbEwb1-Ljg?9H)-5(VEU;E^Pr&rZhhuPZ(_JkP`OC(h7^ zdo0?zn6vpQjDnm&{^cWwLs;l-m0`|Ny|XRNrjHQ%om^f{~;QpxJ$TOcV=s-Wl0ZUz{f$VWMG zqIBrm>SS$Q-C2al5+ws#wz-eb1m>PTd;8Yo^Dgl%ii!>^?bL3e$J?zU1p>)vRNqx^ zGH-%2+fF|lH8il&hL!*x89RopO20psxgBx5ZIOK>qiwazs?EM3kXF>`*rCGETemip zE{%)95odq;XYrdi5s-W>D#jO7kKajHno zFmlMp013%1`NIw$@eL2^2V{0N7!2S9`1kv9zVQnPJZrH4iOT`LP49(_BNdJG8uqS$ z788UJXTd3WV`|1)yWAO6lyGb@B{v5g+x7a50jJoNhq~F#428aB^8fyY>%@hRQmYC| z#~yR&$o0p#gkIk@pSnMIvqw#<8XM_BbMc2L67F}(_%!Y$B0K;M4pr8_K3AhWlNh-s z&O=|4{4@KcKWI!XmHfs$lYX zvMhz^qN$8G+;_$H#BB*Gw}U57G7|X;0KoIWuMGL8QK1atBpU)SaSnVaiG@T}687kk zY2sH5rXr);hEF}XFakkJY`nKCJP-+wx6yacV8Zz;C@cUC z8Rwi?kE15?&l0M4EQPquc0ob)r}ZcjnefYG$_UXboIKfV&?O2(%OH~Btp+Opb1Q=Gab4n;T?|26i4krM zXx(6Hcm|$^wiNABiQlD zz_@_yVpKPYmqpL_9g<2G&jy6~;ZfGpx!?Z4d4PZT8kL*o<;#-Tc1OYlei*gxMYjSX zZwEFD92w=9`U`yr9Mjs!PfIDej&T>5J~q5m>M5%~!_G^!78xH8?NdFTIOYSPIwgZR zOglvhkqaO<(&SEM1hd}84LSdOf8W}v{~SaX3jfvn%I^RX{X07nQrygJWa8nH7W1)>%+Uk$>4H;Fl72Rs7nAp z__^X;wX0)Si=96aL=6eSR=_HU{-_^0f@wK7aVM_>*#*P{Z?cHyu?;AWRinmBLBV^a z7EU%qL`0CW!Wbh)BtxL!ld)}^73cwvp-7{dMIGmR;O7{bnQf>!=H})q`ufVs zZ*N3Bwf5XcZhJlN2kYy1o$kJ|YrcgT(O{6U-RXxMrDJPtr!OHW1&2LuS#7Tsh_aAz zep6hfk8{tdeN9>nqFQ*?K$++_W`|XaA$$dFP96Jupl!H;^$Rnt1(hCm57lAt@NEO( z+@1PSu&vJ_yb=nuI_H0+&BxHllqnQT{eT-kUADo)h}kyQmQLYVcBye4C--Ig+tc4p ziZ=;$cXDD3${kD)#}}={S+m>C9aT4ED$8YaBdd;ZW+GO#@AoVws0bQz*K^t6a<1#C zxQNc|93Co;(O&70acxw4UhPM$+l%O+_RnKcXac zHmd#K;OksRh8E;C5o7oXuvuBf&-6V#m&y`Kv?-=bgjhdk_5V)*)emSIb2K{It6qtI zjhy?7MY((X7G@R}^92v&&A`1P4IjeK$!y;X*p<=-AMphbYJj~$mS3QIWnaR##&~bm<9X`ynAMX_nuj1gqJdt|eY8&g) zFG}w_I$}3-yRVBqc5cVPm+1mp#a3Kfw{9&P`*BuXT^)zgC&lsAf#7^)rG9!$N9IIi 
z$Q#!^muu(83z1QZn?!Ev<%S<4r)Iudn$0YXP`}D@x<5QNR)avP{nwWZGlTW45DUJA zbN@_~LdLpiKK=_mxq{MOUUn^&?WuhFN!Z4FY^onRXHa*in~wJ*ImV#yuD22d6To`5 zP;NwiIsO6fbUll)lq{6hxZYg`DeL>+KX{;9FvX=1B8`2AF#f0AElLGR5Of*#yHegp z$w6=25rT!$8?377UL)k(%*_ zDQZ)aNL-eM=OBRi>d=n`||ZqnAw1-J0OUi5?Z-8PQQs zzl@R6wN$|~_}ARr?C;+)cg2o|$saoM?~hwbg=jRHZzawq>La%5^;JLh%PUL}I2y(c zE)2)NX9;(Sg>T&_OgbLKiV;AlI`D!yJ#+|q9v?-pp=Ue4OWxgMSJ_a=jZ@Z3!(b`0icq2sgOL;_lG*lR)^y0vr6x~ zu9X3Ae+_$jdAaoYEEtUM!C-Qr=~1@yV(($ZC-*Ca)Tua93S8?EU_b`@z2|f+MH&?G z{2<~R{p{OJr?ZLga2*rn+m+?NS1>1*h@NYjnhKtr^$A9f@ETW#j?Ue8id1xY^Dk%? zcA~n(u!lPEMPv{+?&_BoC2t_f0jS_m334a1Iw$twe`_iD*u;poF_tGdTWY%Pvr6=UM`4t$1pL;rcEQ6Spj zT8VmCLJkBZIygH!lX3KyLvn@c4dnGm^o%A zpLpmt`+}*Yz7qH;0>r2XN9GxjAW__?$nYX914@G4RazG2_eXeeT5u}(K7mjdcZ!40 zQn}Lmy1EQA85^+p$dMR)-K3oHCcie1b8g|&kucFAcHUMq@Dy!hSH0)kjBvKhqMB|& zH;*m5J`EcUkwYr5$d4eUip0sI*t6Wr^$(>_<`h#D`L3{SZbVawv0owJll4b8L6Ep+ z-lCCmYeFFFMHXx}YA}9c3BMZ}5slPFG#_zM5v)yOBNc#9Xh@-`gsHF#4PPa&FT1=3 z=tTp!3AJQfs^DM8(1M||ny|~K(A1-4*}K+1y){JHe&Q6{6r=6LKFb=o`XRw1!yR!R zf=G&Nug0(OBlz@Ob}1^!;q~l?3U7aWz*PsSL)|4F_zKW*5w5*8+cE*mHmF3`;iX4< zOLBs@E3eyo?&-GrokEKM9_LZsNSVNgEBD9N=hL*(PVmyhZ|MMnl z**=LCb{IDxKY5SmdsJ8VvjGo8Cfk9%;`{6e+F6px&<4rsDV*in@*jZKVZONI>fy|N za!%>9&OA``W3@y7~bAo)(S(iFd1Bki{tbYq3AkjcJ+!FzvDU2kV*x+ z!v&WF@RK{ub~X?M<6t3kodZv1D-a_Q3&;JnGH^POvaKEO)Ym=39aB_my>QE^0Mll_ zAO%5Ndicp zzvE)gZ?E>_0pjHCY)=){zk)T?(AWqdp{4N3*`t3U!x7-)yH#u?u*dn@wU5Y0LvPe) zcsEaaXlQ7Kqp{dCPyXJ|wVH$J9H=5Kd{k$g#2x zy_Bed;)p3v=W+RLImMjyuUopOmVM9c9~)4|t(E0*>L7!?U-A)NvEXoSZZf+LWsPaH zX4=5z7ZoN^v-!`SRoZv`cT}uNYB6afH|PEil8B6o`akC0!=LNE{r}h2-g_4fNu{k) zR0=5xWh4z1MIxc;l+LCGQPPsVrEFT7Mnj>Z9fi`6_W0fp=k@*FZoj|a_q*Lb*XO#< z&UnAyulMtLjK||~JdUHU|LM{qVNLG`sSOskr5B(0^X=KOoAO59p6LAe`o!A%^Gvs5 z%Rbbjr7PtNc`PZ(4^=OI(}yo3?2f zcr(ZI>6UGWt@>d=bZyDSxe*nWMbrP52d@5GJ753u9Iez7fs33QtL|#TUaVCWdvlhl zcWAcVdX0(a&>OUjzUHd*GMts~C<1$!r%zVohfnx)U zZ#&%a$n;RaJ>D|tO4Q}7m;M`+Wz>NbBcWA1>=}Lxc>ng-NhMvxNa7LHmZ2^;w^V~U zaR~|FVIFMe9^{8!yxiqy)cD}LH)z^Z!FJ7KJK=aQ+aK`bcd&h2QWAQvVjL!#lzRU5 
zynl>{#K@eirEz>82#lx}9)G{#qqG(;8L||%U(YWzP5g7)dblIG9QTQsTS+InxGCgq zn+jI;Yf{Yo*5|*6>D*td^(~&s%NGU%dT9R4n>fd!jot3wKNo*&Qixv!zz{g(Ae<&R_M#$MpQ~FBWWyDk3ea3lAut-DH& z^>EpsD{pn+$xWQC*}R6OlVgrHN>idg5P6r}0obz;e5lb`{1kGu8gIaoynwDQG}$~41!<0fR6 ztO3haU|cnf{=J|1>{QznmkHN+(DMBtCp!pl!h|E+B)|li4~bG~7e~<0LFMtLw!Foq z+Ny2g;jw#MO3+l?LU_Rf#iNBD9tcAzjo5?d3oNsn$4B=!i0x7F#o}6}q${#0yqlY` zY>ol}AJ7bV?sUezHhs9XD(r*n`t{4N>}>ms z!mBK?%c9%h46Jv4LB1$Udwrt6hQ_0I+C92Uv^h^>{oBTDEAsEp~D{ z2BL0{-I2Ls*~M$#cj8aR?om?on;mX?@7tTXm71R*+50K_Rly(_ZA`LTi}3{KZ3n+9 zR$p1sPd~_Gi2tV&CbR%zl3XIa>|P}gocQSBL-hMI5NClIGywc!2~0%JI@3#;4P~nF zkpIzv8~<)y7M`wwcmbFJJnC@9#&_lA*XZBG2a|*>72-rta>)wddwlk*R2R3Yh!kp$ zrsKM`gy;kfR<~i#yPGK6Q6b1pqQLmX#WAw9L{YWMjSV&Io5-|#h&GZejUVXyu+E0_ zqYK62k>dECEWup&#@cLACsAR6k$j-UbmaOI&+Dy(6Q}v?P}56LM&~AzM|UiJoL|+V zxwe6?@6_stXS+<*UcIoLR)nSgVeNIJ6yT}DNb3Ej8AZ7N`BUPvU{=Kx1g!fwc0fVz6TE$2(y&$7=&`f_rwW@iU*-#VX`pYSkH{e-_)3gs6V5bfPwQ55PEflF zFa;tI4Q!Ufl*oryjX+$M99*raN=n#B^Z|n%%a> zup0ss;R?BWp$gBcvl;^daYm|58N8w!6Qdxw9M$Xp7Ou#F#CT^ev9kL7XMeRiSmgX zHm6z628T{VQ>ozG3wK=T<)!EOf43%XbFfPFJ$3LZqO9^)=jHe*tk`ME$} z;t`t6orJP80D=+%**=8N|C&7gPLG3u8M&L6A3p4Ao|Ndaw&C-A->Yu4ZFV_(?ruZf zpaFBEgXRy5n78Bexl!a8+k%i?b=PdwQ%%L@4Yt=Or_!2rW#jNdvqJ5N@ph|sB%L~S zvGieqUH(n~xDj7J?K6tZt6A=%;pqI+H6uQL{MD3MU*0c#nrNeY&b%i1uh%b?4$S~m z^7uOb9uNH48>l@Pkf^*>3$&&&8tWj>=@sPAK!V9eL&B{8Z?Hp($|a+^)r%$9o0=lhK7;Gn?B#2;W0kIcXs$v2ny<6VY4Yd z5$ai^&jy`2fIq6gpl~Zrxc>{U_wizb)}_C1C+(KVz2b$M+L*DMt~}bTZO~uW@ngTa zs60R1ongNavfMH*?!zlz>lHIN+XnGRUJg_o?Rm3dy?S@zdRjZApi;dHz-@7OR>X3i z5#ng7_5;T!HD_(#+lQ_f+EiEL+xh+7P>Z8~7wU%M;TG>ci2cmG&F{>AJ;-z)IcMTW zufjLRa1(^FU;ugl1J6&N?uQ6fbaILy31)y$EZ1eUNnVY) zR&(tg?@bthDc9QmuB7kS&L=kH6isjEl@oez@sSCK-)8Tyov`LatbTu@uv&_0O6RU! 
zZPQGfYkT)Sj1jp1WM3a2wDrlVy9OzajlT9||Lq$$E&|{^EYH7_nHIZe-e6D)_ZPbc ze#0I6TSfTEox66i)mNqh)NKot@CkGKU0b~N_uqvjsj1^=OJ&a;%d5r0P}m5X9%nXp z*C6d)OWx(V$WxQ92vi_FG7%*S_`29zG-=htnrkEN4oPB!cl- zI1`fE88fYToH3Tzne?msg!QUy5Rgg(Y@=<@p83DN*NC_Y{6*%KsjlG5RG;7Y+E3R< zD$Gmg&nMWnw>+q8*2Zpc>bNDXdS8p5v45k+uSF66sE=Hn)4|!Rg{WOfr}lLLC@rnr z=IfvLz`0>nE)KU8FDKo9(03o&ts2V%-`w8ZzgcuOvIB-Re6PK>UAjo^!^r+6MzN{H zj~zSqjZ_-b} zVUP6#R?Yqs;$>iD$o#m5dCxP&mbbzrpOO4aP_wqABKeDZGkYflZ*@2sSY zYyBfTm4H5_G2KcQd$TeK;pmNC8*UTA@Pu~T|_I%AYiR#l&5 z)r3h31yK!Og882no({k_58LMl_0$=?@-8r{usU1QNXSnp;| z&gI!Yj=#4ab<>Jid(Qmx^ywh}&xiV-ZEJUQd48X9UEaQxAtjzfUH1`}dpLhc7dJO+ z^#HM-oHnVmgN668>}^b z!nJSF=;^kh=tf5L*rg)g$}s0=m?hMya88qnGi^zHBS~QOy(KVq_P>)cb;I3WQy?yY}|L}u$UMlZ? zxj)G2?g6#8Hnx-9iBijN-&YLpB6^oc=XPO-u(NEnSBdFMjTpXlg^2RJdIv8~iTDS( zVq|gh(A^Ku=CAsVmhBL8%2%Qb@%4O|KFHG$ORVRAbz99juAwa70*c@fGul0PhzHMH zW}Q}+B*#iMw$#KS0#y;aqzq&#yYhYzwdx>;zGr5^X|mScCf2;X(Vj*@yYE)j_L#Kc z5Yj#>N*#(%v~C#{ui5f-C&b7dBNX5%0k$exB5#3!We)3>9Pd+Uy$lv(2V2o;B zV%)ht#>(g?sOfWi7TmdWCvXj`Y8z{5y;-LrgZLD_t2b+S8t$`-m*IBoZ0}{dJ9iti z+O5XkZ|aU`e}3NW-a^)vbLgKKSy;9C6P@F-0MW*(WQWIklyw}!expVm;xeLnTtDyj z?MEs5JHLDD*rx9??T*Ii;G{+5Qv#RDd<=2*BbY|%{TdgIo;eMgbh(puHg?YRHgPJL zYSUhJPBQlfVx_fvCDN+z%evc*4pY{DkmfoJVM9>toEdH6@WwbV^S016J9sn%D7?R-DXK)tHRPEy=S+IJJ*#(OLuF*(gp0m&w1ah& z1}ISy=RpguMxqv#rsC546aA;u=+b$KB$I}cB4l~epV!`9T&tB*w`^Y4Z?;v0r>sDQ zHWqYQ-fqy2UQ5t=pu>84zsB2qXs;%%Us%MSkPnvwxgx)jKKuN)7tPPJFjcmplC)-S z+43j9r&9lI_b2}iAqT-imi#!>Fz6WAzbI%J^I{mRc>w!i1-BOjIjzBsvS~ABoZb3& z3pJUme%mpgj$;X7EC^~fLYiOJ5m7s?iHy}gySzh0q3olk(t57k@3L33D5IW5Je#>0 z>+K0IJD92lCS3RKksn3SW4@ECSI?;pzBFp(0mUYrqC)6x@8Q6Kn12_4ljtZ@3Dp4qH|f;q2K!gXASt@X9O)uw7h^f z!pBda`i&X$CH?j3;ws{goLdlIj@tatDmQ8|zScUBc6ypEV?UsfNB>M-YhkiNJ%HHR z(IMP8vQ@Nc^mMhu7MGV&v9sF=4ysCX7_W_RSr(Mm93C?GW>L7)R1{p(rcXChb1S?= z%|-e zejL3$IAi2SFh7|P&OX1{?826;*g0kCyZ7wT)zbriD=y;GA{8URmTq!e-_dc%JkM(X z65D8SH<4dZP}7D9qD_NR!aErPX2)#@L$qIbDA51p$vLK`MJ=1$h-&7olMPS+9TWgr z9B0WV^Izu9cZb5i>d$%(06ryJoH^7XP$Oq?cCa{^<5E^$ZUkb*Q(7&*@95EHM<&D- 
zYW*jH5lM7MjoC>nb*2Xo|nX_f*wxz*ai~D}n9%=7riFLJzDH+&0T< zzjhGQD?TSrZhhQp2j(B>Pt3O$HT$Xt6b1S)z_%%#v#sr5a2RgD!kY0sRMWoxo`!xD zPe$hEcHc8jol-zSP^IIGnTn|7t4=q+YyhCeiuQX^u|`Y} zZL3Z<)!u%y%!!IQhON#0`>%C$JVhiOv&(CUSwv?7MXtZw*PozKLZI6Wa`6i;>by)SfK#=ML#`X3&lfSZU7AhCcOrceRb z{cHz-7xDY$RG2?f;pE&}($ejc&N3qt-C^IiBAj}TfxGh9v7HIH*YgUyiuBs}b_BBoJocvAp@W7FwINapX{zf(>Ann)p2B_^snq(UurqES zmr&B<&cB#}wNgrg)3N>T3$I7P{*lfDiNOcfGr?`ek9l~Q(vy1R(y_TJO!Y98-E2-l zN3W%zAhsc+yub<7L9{L)K&NJGm9A6o& zXfoaJZ9+58K8pYSpSIu8Y?c50Z~nw)+76R84YmIJzX+fI_dmTw$Tz8^XnRC8ZRxnR zFB;Of0u>^(Sp0CHZaZ`ljIKm2he_GDS`|8bInI)2yRe0jEw$~#%dI-VN|2IDj3dnHQHl5%!yUpW}7s9|2lALP$ z63K`sA`ho7Q}K&3%00;>2;$h-#QWX6b!fV;Q2aq0;IVwOkMLRJVB=ZGe2vQa3S+7(N5EOwyzf%M#^i%p1&d21@u$NKuDTp|z zO2aB`e*VV2HJtnEkq)1mc2S@!h=Cx(LE^nD2BL&LY8G6Avw3@@acxk3B&ZBU z{=;(t=|u_r0h(+n3;7Ugf_#9{cZ0-J(iN?&UveuZKbOG!2xLOJdEKd=1}8zLv@yg9 zzALtr?vVOO*&96rcw5GAAaR^7w5x#3Za-kwMQC;y6Di{89zo&&9!_leVSC2kVYI{q zeBDH~itgY)M8X|U^r8A6MWl;#-MlgzX44H|fz$yUo?v~cKIhq+_fG5)#E$&bpiSq_ z*Ehb;n!yYMZwh{oJGanQ{}iK{iH^^;!53E~6}@9Eh*{Q#(R!inj?bRKr4)4lD|t^R zBpe`6BiC%pStO>Q!W)i;Q-Pg+vQl|Ck5>xlu!>- zUO^0ovocjd0t@{_7gUZnmz}1|niZfvec9#MFb2uhfK)P7iggh0302x9 z7Ch~RVkWSw^Xz0RpV<@M?Kq|^M%a`jRCp;+f{K$j^!`-`f9VDv`{f*GpM@hE; zqA`T1Fq4-)#00}Rv{;d#Vn&R*D(U*1Bc)Y9mH6Ty63UFNEnYnXg z{cG{fwgYuHHa6x5l*(Scn#p}S012XuzCo8@T!0)p6l!3$dY2YP+S;D?u~4bND%zLV z`;2H^m!i#&NYs}fk6my^yOK2)^40+dDuBiPv7At&ihT6rWjlwd7}t&B*xTYoVuW=5 zP(Z-oEM%$$42gv>K|5Ge)2foG^K;0$bh-Bkt$bP0rNY;%$H!%d?V?aT^k$OYjMlC_ z^BGD<4?WHy^0pEapmo2>`SzrN6lA*8u?4)oT2{@i2}Y11p@+625i^1^3PH7Ko=jD( zOYh->YxzcbEl}G7=qviYdz>+!*Lsw?Huyf4EER+_AJP<9_OO;+hEK`4b<1BqIj-<} z`zLm+aJ9i$S zR!{;~+x(*#ti{oZ?uppoD@8^`%*@Hp_v^Lw*IHXHsF8z1+&a_SsGQzXf^0NH`Tz2lWUataF)8~@)fj?N$M{fQ5r-03C187L=jsb^$$inlR%;fo5 zZTW6fK`rl3pEIO%Ktu$UNyYZgRtNZiDsqN+{_5Sl3p^V&58*NDag}`dibD_r>CTx{ zdq{2xM_3;jb*4P?!9#}V(=H#Px>U|TIe8JOk50}<-jgi~Yx(+Uid9`I3Z_5G+L#hM zS*0$vRzMj0;p;a{jspKFpuknbP~1WT>#jp5PY#;|abJLTVKDhqp9;3_m@=j^b3@j7 
zZF|q0Hobcfd%X4UZzI+ah5&SN1LUE}1B4hGri)h2$jK>j)Rtdv6~X&8eKS+nyh{CDs^WbP~cW`b>%(b9uD*v<;?RDC%8oAASDo zztO~J1FA5g9^mgT!`1KTwP7cv0<$X!1{tgHg4d-z*CTIMl%xz}*rXw?S_4tab0mX| zRVontBlmt=lG1wk{yc!_1J9mWqvh{9+O7k#c4|YD)VI8S!Em^63+ykTJmR1Cb|2e{ zvX3cYQDh=!O!iWx^|{NFrWvM;55dNXqua!T+FeWnuygYC^=%Ifetns`%vsf)7^>f? zL53f0^xyvs5CNk*F0m3N0lfxyb0tgf_;H;`V;NpTMsth&ThF;C24Iu9Lf4Q|gLznJ zcXW1glI>M|N?J29Fa<1F(e!zgqOOnpIV3{}HI3;{s#d2mLr=ir9y9ctUY{Zt4y z3cqXl`T0(7NnG6{J%{7~MNm9xfnL%KoKJ3HB_o?*pt!enl~%MbY0BqaYL)^3BAWl6?OA2*OZeU$m6EKJqpS zIdPd_&XtHgo5%GLsKGP5W9J28HUEmA=J%kkMyX-C=7VB*AyT_CZr z4HE2wm17_YttbB)j$~D&iAdi)!0+V2GD|^;xa7uxzO=-$SppAMnbfJfm6T^{R+%@r zECTVOftFp_@RS()G6KP_q>WrV&8T%Q<6~7No zYl^|T1w*Lmk4r_u0kcRZNy=CSwFu^hJd@}T*A5(Lqd)OG6&{8?3BTH8q^02jwwd~5 z&BLuWQ18#IYmU83{LTp1+;88%o0Z#77(bqAnRWRZmJ~4z zi4ZtX^CGl0(LV~nJBf;n2S*sDFIy;FvM~WM`=Sm|KxNHh(RZpdS^;x^GYVdQ7TTw>1UwN&|; zzt7t{l<$P&^@fo|gg0uOpNDw~TCL=lZ|0ojlbkDfd)}N$Hhv`+m#O=8kC0WH(EfCbwC6W+ za#X%%WfJtr#hmCDm6djBjj4R2S9us_A+#9j{%M(rXdasKLU&hQxp{LJ$~1m5&b;@i zZU);QHS9>SvUM&8K4NZgO!Dj3uUUL)8kfg0?$Eij?C?P8s2pN8z?Wgz=F#e`KYQBF z|NXmn-WM)xO^SPYI@`P+bUFvO!HgL*FoYmhz4V)JwHV>~s5MV|c>5fR*;*mE3pCiWl+YCVU6LaYocHEu|Dz5et4G{ zt748LmjZ~IX+HJqjq)aU;6=-FEXQ=sidpJ=^5mMjSb-eNfBZdCq)=3JoHg*X&)>Xp zEBt}ciWA4pw0Dl$0PGI!PWa-t17y%K-8ZN{1^f!wnB$Kx3X^E`h1cxWEI6BHv4 zO4UX!uzjQQ#z(*nft#5=;kl)~pTT$s&pi20s#nP!?23c~zy<=562)919$zOme=&RZ zs5a>B*tx-X4@GCwk#aT}Xh>V-k&%%R8uwTYs1pl8HjbUmQ@&vIFey~^5=)QIKf70>7K+3$m={&=NA{aeVgt>WyX`2d1pgo$HWh>2R}TuK(nq z(@*s;jRDQ1#A*W6LWrZe%wH9sr9)}>rXX}rsOGPfRveI}!@H>59}uyV(Joo}W3}h{nwQUOJ@3Et~Y(3oa>qhtf>H#sAUAJtHVtSIlg^|Gu@LTY38jk5y z`Ee6|pYC_zWk>fo#?v6RM2imzfyAbss*C0Qxz8`4GL1Do%j5a;t7Z;$NWesQr%>gT z=+!9CsW<(paYgixP`qBv`;BIsJICy5wW<-~434ckI`@w?HA|Uipq z&+biNiBLPX;2wE@q_TDcZ;k6i`;E&-%$G&(;gPGA&h}e$QE*Bu{lUUS2EkOVg2)%Z z277}eQ3)`%?SD4{WQH-u?W|9NvQXEv#mMB6^%Mq8hn&TDYneK~R6+G&!|jzDAYrGX7`wYt^=I>D%`hQ|xKl8O zLxd^tCj>3^%lAf5qfPu;>MX)6WKD}lbsH*|zbq%_UWKYl>f2vA=Ns4r43Aemi@oRe 
z#*qz6prbbZe3v4V(Tv66lIp1MuamlAGiEcAQ!2B!RwWQffldbb*^yarnoa7qn<>#Y-5>0J*hg6K_{-vPZ3A?wDA}(IP-UQc1)+yHw?=h9`e!7-QK(EuorR5>-2fvPU<0F)x$;*gqEI+|^a05n7F z-=k+w2EV5HDCzg~SE`8my=O>#y>8xN=JCt>IE*!(f+ht_Jseg6)Z@BO{qwWHZ2)8> zAtfIlxAwC7dm*rq1LbbY0yK_JQxfH6rW2q9&K%X3T33LnuJsIN&6*Z#YLWlEUDSSn zSrIc+9^Ix4%`MDAp38>Dp+O^vr4JrFSjBS~KaB?UdUp0U3Y6&I(7#uh=ZngAjAq_H z2fGObK7A;3nF?T63+l>-Po6xXj6Oid=z90QT@vi!ttbZb?sAC~$on_)^OeVq>%vz` zN3g`C96Ax`WTDLrr|ne}phCq<9}Gmqgk_ah^!;fR1C6)U8{XOjNi;6sULar;|K$XB z|Lce`3e>~nK_RJ&aBx|ZbO8l>%H}3H^ae9}Z$2dbL1ijy2+3Tq%pF;f$=DFZg8#mC z#67S9e8{?RX^~GaHrc@rL=?WwrC7!DnbHgxIX1^~{iOC$C$W1M#0Ok1mwG;?F{EFsHn+73gbZQ9`#u{SNh&<1C6qGROJHfo=d=hcaJz8Ws@Q+3BobvDU zLkt`>>wKF9sk0^Hme;*Uodo@ zA7e&fIIFp1N|VOB8TBo2mu>@V#99c3%*o8&V@y@LsZs*Eps2Xwk8%3E(N4B{n;BQE3{*`sPVrs?$w1}Rn}TVUI~!rk*2u`aTO1o zU31qVx8Q0*QR)$7;fk)Hcww#lU_e^tsPDmX>G%7mq|Tk!^}WM)3+zHD-g>#eYJ0JC ziUr2$;=T)GbPRzI8d$S(Ri0KLiL|)j%z_JBCiy7c#Zi=E9JFTztt$4NtxZwI?nM|Q8bK*Z$ie-1VqANC&xG&iwu^p(dV4|R zztWZ=W|$>y+sBWx(FA9V-ZCaAix@zD2h2X^R?^cH&sx6VnB<<5Hc*9j2GwaW1@~LQ zPQ?Mw{?WO`08#v_d(rcU(a{4#yaHtjp>yFV%jO#7>|8^g@OyXBO7jQ1XsjvUeo^fb zyyJ|D3OiJ*VPb-EMW{6Kw|KZQaFne!y$C_W8pe-#acqm4$ZzAHr0&)@9w#xL-9gzP zficM-R9-v{Cq_YWFXXGSVco~)s$x;6@!6&m>_E5RW-;>1!FeBEE}Z57d-UNWMUXy~C#DcXV|i zJ_S?-vUzb6NS*%&Um`kR>}`Qg7?z`c;-+3xe=#0tSyV=pnuAnTUvcisk>iLc94JBp zz0IW9on)jMuwz;CpkH6?B^W(f+H}6dy|9+dhMrsIxiH0{r>ude{-IKpsW0kQQY!U} z3@x8Q7eFB651vuAq$pLljiPS8!Wqe9IY?akh1F{JDUa|3*&eDXz{K8eqnj}y4_X_l zN&m!|5~S3o;V61^MuIHb)ss#SEB9w$(zdTFIx8Ip>NJj)S*z%1PAcfIt=XdfxBd(eM{^^ zn2NLr8sz!+2WJ6z!vB&XXPr{yGB^rE#tgN?a!N)53|;JD^__l!x@D>EaCIyw^#?qF zC>2YT+|j3~3ZzE!r=8r4_N@lFJhR@W4#@I`w3PO)0 z&N?#uAI|h0^#yneUPLDhYte#l3&K{uS&J|m+HP+0DE${_fh%ert%^76w^*L3Al*Dl z0-HQa`j}l>0e0w?PGBEV5%m)l>lK275QE&8l>5>%e?szRUO9F8qlSL6ZVlGz1L4_h z%Z_{Z?uj;xfi|3(h&nR%qa64Vpzb?wqMq}iDL^x#)rIcS4!`%2XOsZMnLGNK$kl+N zXz&~Y-y}R~MDPIDd0A1x1_H;lE5lg!G=K#aNN${3x^T1rRR9=Zhw>6XUp`K|J;l

j`DZQmCBR4e~KqfI@~tOqgIfb0e&2 z=V4Zv<)f!Wn4$Rr63GT}0PptuX%Ia#@Y!r=a6ShJORJV3i9SmOA|XWnVc}N}u@iAlZ&Eoi>Kj*W^=3N}Svbzv zDL)7vh$g|h?g!_Yo9hv)>Rx9d6UAgEqy1(8_2=jC6WZQ3YpUuimS5WlYD)qpL72bu zRoaDs@V{ADI*=vm0UYEh1ANI`4oxCj8GRu>^=IJe{9>MT|ToU6^=Run9 zAXg(`m_`7312o4=i$4L`lC0Wb3f_LQC5V!pTS2`=r^?ByN2x0tAc&{ZuyezaNaC^H z8UIh_1W)su#kd?lXbd=S=H|YXT7W+!k3l98Oh)V*XuJjz$`Ab)NbhtI{oGC!)$Wl-HLTQT2lSAjam^0aSmo%M|+xkMlAhSr8IZ_cV)oC z1-Qm@sYBtSWM&0vL03T5hg*?w$nr1bM#9jF9r!nfV2Cmb0OUKUV~?U*QYW2C@{yr9 zvABZJmCiGiI#eb@N%J4sBr(G6gEb(RrZiM&`ee327MX#&oMz#bEEc0$QpeA^ajPCT zh_2trnj^Y-CvVC1vKs*940dpm0wLINkhpbi{q=E&&0iD;*AO^)YX|i~K`HGFUvCo% zL{j=K^cFVmZ>dH^5P|kjOquC@#O#1h>On8cgj&OkoV&Wg#hW+JxSU}~>>Kr@uD<@#n?A~!fB*bUR=YK6YU$#nAWYjRWl=9{ zQfp6V0~J~g0%ASHane5uUIaB6$XyujTs})$C0sP-z?MPLZBog3eH~2hx zL=$wEljs@R^0mDH!FT}|Ddo*;z6BdW-0u6wU53qya2u#u9kw zrdOo75Dp53;r*gD#|;hsYvv^Q%OVt6K7p)Fn$unaVOe3@Zne4A1VBg*AUXdkTod=` zEG40oa%e@k5zDDt#*G`NK*5(I3KVjL)VHWwgr1XK$V~c((v4eNk4WwU(Cfv}taSw{t1R6Gw%7Zig5h|qyGLHES*=8?KLV;PNkv!tET?0?_ z88&P`C5{g{g%W5c0ECF_@U6LMJv@Q>Sq$2>gA|xj37}cNgN#4dH7Mrn4?dDq83JDf z8>`s!eUCl>$APYkcnM*oT2i}V5)_J9xc=vs1Xc{L$`><}(2Kkhsu3s&5e~3FQ55oQ zD$ao;M{??zhk41CSTrYKtrVDqOb)!8o7=OOFJ++sWd&FhUFh_6dmU#Wmp$O?yA2d- z+D0-r#LOJLdF+z9A)dLp4E&5SOkxd|f8A*@(8Qs?f(--q-wlWWNJNaw-3s4`e(DoFJDeaH$I(hHxGhXI3}9L_HEnN^I)0@bM<=9h*cB^H5?_xHiAOnQ8#ed zu+|=nwtR1d6_4B(d<0-)N#F@gdI>(OK)4j^cky)UK4ve#{RbLbiXW#IM5#z+G7@0F^zmZ~5<&X8CKea$wMV0#y9J)>##DR6{Px?Rg@CXT=uSEmt3hL)-CNc)g~IiV7E@h&=u$%>D8Okl|rFPu7^v zLv}lKKU+W;MX5&-zas6KqPERAx=AcWKUWs!7Od?tW_1_puO@l5e+aJHP#)k?4E{S* z9Dv0<0zd#zndbHpUC>~6W9r(bgf79InP&3Q3(jfhpo^D#%G3PF-IbwY z_ErHL*MCNbBXb_$-!e^4h?bi`$JdvfhkRupVWtgoSrjvnJ4-5$8itUN9`w5MG_F-2NeIu4U~wNbC`6U%=#(Z_YLIQz48avbt?_ zTa*$Cvcz$Hb*bc30dQDl(ipjlfGJetBeB`m{QJw97U>!W2ieHQP}YV4d&nR+Ww}Vn z=#d1i;NSoheYo6wm;@d&Y|D%K!?K{tCic~c-Fd(|P&9I+p-`^6zdg@DJJiVR%g=A$ zo|Tm~@_ zc^QvZ{*92D?byP27N;hnSrLy-5%0RZep+0dV%cE8q)Er|Y=EceEzb~9P3+V9L5&}w?gQcoW2qH; zdXAHn&hiiyM2@(0|0Eelkne!|1qn~#(h4QdZM;53?}u7qIZ~u^c7?k^XSlYPc*0Cz 
z#F!`T7P^`ck=-yr=GcH3nlCi=xGgu54az2W3gnp7R&FEa!k%B5CqRD@SSq|$-f zjwWx!id#Jp*4Mj8ktwsVv;{52^AxmNeg}Vgc5!xN$-~-Hr^@12nwVuAV8K(wZ5-CZ zmoXtQaY4v$YyMPa>mG{Hxcs;kudIbINgdKp0$TnL%ii&H;8?N=S?* zcxFP-I)Zx1V;3nJ#mr7{AHd0FJQPBl8*4@6kaDLX6f!Z^qkN;)B-AsyCWDNU z-)UT4Lhg&ryO^AE4FtiVg%e9`CXgG#J{E7zq7|1Y$VCw73r7~pf3@T--MzX^Bjp1Q z0;&pw2CZnIl=WF)=l>d1^C$y_4azUMpups8h{^)IUsP|?ZYcUy(q*$eFcI>Ms8sJr_BKjM zcVrMC03_xG4)yENeMyLBJAw<5D48kfB)+6X0RL$Q^vS2GK|IFijDS(6 znT9>R45++-m2oCMIU55~A6rjFw9PcPS@{D*AXNGew$DI0*_5tJ32%Mtheh(ZOuieO zf)vcLVA`QC1xkSltL-3Zf1h$FXp8949mzMxX`i-eR ziYQuOR!Rs=1jjL-l~g;p>16|3m_wGAZQ@#zy4o|2jH!ty9>BqOnL(eExa4XxqQX$` zpNSjdF^A&eO5F~+DuB<}O_Nae8bvN6If=dk!L~)YR#Rgg3i>~SoiJP=)?x3$`>FWl=V`N(o$AKoM(ve`0`+@0ZcS@ljW!V}{B8_1rW z0kdSSn`!?yHZ>NPN}JMn3Qk_8UHP|U%CP9CHa`af>P&4t=?ZPbRq=kRO3{w!3kaka zTHw?9R9=2eFT=aDLszCYcMa4FrO%B1u_ZX;8?CMXdg7pru~A^lye37ftjA_`lel*$ zYi8NaICpFH%uu6`oUR)2)*#X`a3x`!Z(35C2!EKo8p&*+sFt~9U65E2RrsVZI#)pI zWX90_`S#9$*3XHBDSNLm?8mCL#ovR5g8avA1fnK|K8+%W9Hju;>1#w0Q}D{;?OJTnce;0 zp;(Ja-GoAD8UCPsM)Q4#P*{FFj`mo;UEnyNt@DgV?B!*`6IbyD+j z(VD}C2IDm}Sf%)SRW%FjlM9W;+F+xVN7MN)TECte>)Hgr)lL#C04!FV3O6I_Vfe+r zWD6-|%a4kIsGDa%Td&^S%v0e=c1H$!Zc;6{y-O+bqaRGEaO|Z=p_c^QT~z(urfpL?=_b6axOmAo_U@el zL(Ho;Zz$aNm@B=aB)!d74Y=VQJDQMjOj~}UU?!VhAsoDpL#%zX*v zhwEqNVxv}C5f147^D(FiP`BA#XuYS;{bm2Ny%cImjzH1Xwhi}?*$RLSrCo&N@&@bS zVfrhVKIZ&UtjOYI{ZVxIrwF?Kwp~!DN-kt^Ev>VU((|`(dDNR}hi3YC@h32eS7}X{ zHY4D{w0exM3}Z6Yf7&?a)L^1z(yQJ4X4=zfdGz~*FtBtGk1I;~nL@A-&FMUs<0Fca zsXz!Vi}Q335ITezio`3>yfWj&6jIDg)O)QN2|^4~AUiDAzq6}YjYf{enT*Egb)}9*EP7{BMxS#^ zx}-L7vw&`vwZbSep}>umOrkCWo=|B^0S7=umK7f|+dHS~(fmO`j#>R+3Qat%N{}cz zokwopiY7PsUIi^-`x^b|HV6@@Q&fZg;ZX_R1M0M7c$b~{H;@1!GDJR!tzfLrImP0` z0Rg*^Y~8*+=T}k5Og;dK8sGEwNRS{6XG2t}+KbPaRPpQ#q(-&|$oe|V;%c3qiV@ta z%okN=Fml5&M1F=|@9O*Y9hr{I0bCI0FKqOr*gzIY+sY+oq!DEX$XC0zZP(68Hj$zl zA1?GKw41rEr6}kQAC?J9(r)puYNlghgpg)XR?y`toxbXjy)X+0{gq-rD%6M2UvNu| z$TU4Zdqss;{;~1Cl(5{A?qbA6_rBESVCrEq8BJluldK8ysJZgsLr>dxzETNcxm0Fv 
zhzZgP3Zw|`Q1K{f2$DHeI|q&*H{i*c!3F{9eXf37Qgk)W2PY6m7nhDM=bM3HiDMUE z@ky|#msM4{K&v9EKug6bQzzAz)E9O1?^Kq4S4 zu>|KUqF~>~Sr_O^bh&gk0t^7w+yRi(FSx=0lrN$)m~rQ{2}kEwx35j9RH19jIW$=Q zA_W{y@K>oq(7LOEx=9Z!p9Dub`6QV7)1i;`YnvX15`+h9QGL{(SEnC&JUlRd)_|#@ zM%=r-2p`|XJW;DqWAHum{)MeuZFx67R`G}FH<*qYKv0-1qZ;d&<%xu@DWGdp3cfpt zh1L2*abK$awPW`TR_v&bGQCdKj`f+6?@dm=vMxc$N1(zA`X+zKwE=K7Gj)RJB0fDS z(*}c4e-vHqB;%4kM~?<0P)28)=gq^i0XXzoG!qG5_-4Ps62 z@xXJBy&@Qbz374CAWNoBEAp|7W`P}2`1#BBwvX~oR92Old2Ew8=Yg_OD1?AFOQmHvNJxGlBTgRm%@jhMEq{P)iFk^$!;nWR z+DQiwAD#i>&1Fkatyhe!Eca2qydfxj{uV_6VKbEPgEr|JkF9#HtBaEsrGXjy*??|5 z#;a}OpXB{Pyi*F{mYEzgKAhJs2x&^D%_<+kKso1>^wZX}-vXjN(}LE+8C{Ld(Aerg zoAlq<9nO&G>_EPxlM&@EV(JvHx!-B_DMpNq?~mTUVpKN(Q{e9)qev!)6`Xqx8$4Kx zEA#&S)?y!Tk!28gbE_(9H}H-HazY!!RW(=MzN(Ib-*^B1{Z+l@#TV0X?j;n_9C*!Y z+KCWHrVyjHVylsSTeLoDmKLFWo&RZX*q*c}G!{Z*3gU)`neH{C_eY{U(bEHtVA8<; zXIL{6orjyOTeseyp)q+~+QizY3JM+yy3@3bIT_HyIqKKPk-de&pl7KsVjFzlR4>$^ z0>&no6%m{Bmv-vhnI1@HxsGJ0E)Zp&zCPoCv4)=_GR%WtIi=aCyE2pxH@AeuL2cAX z5R4a@Hr^dkpRpB0HEPS~|5|cFt~m7#?R+FkjYiGY^{d_Jv19x9ZvYk;7#JA4$BDTv zvD4>qyT3X(1FZGJs`!Op8pPjS`5?n}*rH7z8{y3hw6%u@u~ll4h809Caff)#EQHCP zYx_Vsp)kQNi~~bPklT5(8IQh4D>5IA^)E_A;!BYFhqES|U=q>_g|p1e;QJ!Re^2RCL)JOI5DyxDv7b>x=$3q6CPWV;@HgCORv z??%V3juwCy^-KZ-1Y%8#oNi5Ytbl_57e_{}{w{EXFCmJz0j4T;4_rf8S2w@OI2G;S zyHa41HLg=kyBAxfyiH#{svDc{>PE2nkg(4RrKUaiQVQkwDUwUW}?92)Al=y*@#^&9|E#H>N@t-nTanJ2Pi`1AXxf75S{ zb1=S6C>KA^>PmOxX~1!!bK=q!E^nh7U5FT9F&}Hbz0nQE|JkOpwRnOXUzHp~C}%3h zd@66z>^9!^JZeAo?HSd5JS^41sXu7EKgFGxF`cb~48`cu*wlT>lP7P3u}7P{IK)sy z0;EjFpL7Vi0#kgSl{q!rZMyocN_+A?m8z3WF(gHEgNZ)~CWQ^4SBlGA2Xi0GiWiNAmgD;mAgB8Rj67k2to!=XxcS#v?j((7#e*|17fwE00_dxaDI_)0 z8TnOa)2tv0Oxz+h6V?paxL(aOJu(iwN76*ip+j44-1yUOWVcOc z0VkqNg1Z1X8^o;G)Sd$_yc&F>HD#Kl6T3FNFLMAicJeZdrITzz2M*TaAG>F*K8KEW z@{G%fs7xk2HGh4WirQRkB+<2{nXLGH!QKzGC8QDhw4~TQR*{ed(S?sqJfPDbrYnwF zJAY$u(v;jsar=TEvT!l22Oxj*#to)w>J8tahd0tWbMK;3)$Q7I?X{JoodfI_GiWA+ z3%K0XsI$%PFeORq#5%QcUjGMZ8p6o5$!hCERhoB?o$cHenm|iLT 
zjsA*`d9HeV&KrMz{~TI^tkYKa&0OC@+r3Q(7M{sVc7*)h^7pr^^tPZ>InyoIy}2V( zK6GC#j`Y{#yjZ<_q%wB*-iX~%OLfnP;!eDYw+U`OaTSxLY}IlJ<1KLG6O=&P}DD^z)7?`m~66sH3%xT&-%j5VK%#_G-%KufNo5vRN9ksYo1!2Giex@9zA*( zc{m{A>;3mL}mS3^>KNm#dM8Xu)ljbwpR@(z= zzebppRlSrFO2(AwPbh+S-HI}dMAL52^~7qIQOpr8QBY&iqnxM$pj z$ix*|9|G7ON0U0|jpxmn%V{8<5Tc8C@xl|207_gQTQ597N;H>qdq^>5(96px-UR=Y z!QFM{+f{VK@3Blyo8K01J_8RjvXNL<$^Ddpf#NvX(80b%1aRo3(NlG!$Vk@g`M($b zs}h@ilpc`|i5@XoGkl{qYoCC|Z3wpDnj;CRoF@=>5CvHP24+E%CdGE0#H3%JJ_?u? z!1l8ZL-woF{}re8Kj4DnzJy;d6}(dn6JXN&fC*h@QaBPm(kZQW z?@oapkh*H2MqZxM|0wK9lR$J+Mm?ee76;fP6W!JM zFnB*BbX8OU73iwDX&l=5Wz;Plf6QX^TA(6vm*E|`hHT+h^7LTabbYGd?oe3(Mn^%8 zagZLZn%J1Y;$Wjwr`db*X+d2fOM!2e?6gS1N~%*BH(I%5{d{&Nb?{*x#7o-^lm$(Gi*Wr(y0nrsD9 zqz5JQ9gNcfSWyFGRnY@Q0rGPKmkplnAb8mK7S0dE^cDH z*%p42! zu{w+_?Cs?}V%!nJ9U!YV3ErVOpia&nzQ{(Kr!8gKYmtA_g z_4@gchL#J1HoErs`+djqR;^I^@6_>eTu8I$H)MVymx$&;p+?~}LwY_+lAY-Katkd; zbTEMWMg@N#OnCnxA}6f%(~OF&13HSjivQfH?{z0|UC7R6Dr2T~xI6!iF%=cLaRLDBvIF`x+7I}E55!RKk|0o zuK@?Sx#uo*905`_5g?s!>m%Zpu0QP$bO|7FlcLbt-$_v!J&+IHW1`yuFRzAdcO~(l z0_aJ0;9()zPCTD@opx#K`PZ*rIlj;O3iDI&>;luMQ<#h4(bFs4#Y|!8h5UB986kJ; zMH@w4$;;b|Ae}VTve%FyL;7=-F#ziN^V?oe^H`~YL?B93Bt`M&GI4Z|Ek3j9WlP0Y z|7vvVLYW|54N@n^-u_P09)R0EEWm4kWyF@3pPj)qXz8j|xm4^TwZPBtGB3yzM~#?G zCNN8~>j+kncWiunKE)V^(i{A~Gs(&2MDwMz9}pltLyg2SNT!J0APL{V|G?>x7w$ZG zzdS=H)Tk{(9RyVo3PRXU{CsCBHoIp`B4%2cnK52wfCPczkw&M64{bSY#*B#TU*i#p z{0b}dar^o1+v6=0js*Jt&Tie1p&#B5s_m%q2eNnw$;})6@8Jf4VYd(cyJJTUt;5Lm zZx?`{e?v+vM-x6@reQ^YN1L%^@#0HNp@@()6-emHzM4;|%k1=~Apm`v(OGA&M`tUA z?%lgg?$xZhR%OF;t9YU|Ha5&O69huybIv3sT_wX^Ycw7N$~^X3K|woY2qb6eS-t~~ zQwoSzz^6~LCXZtv_>Q=402pukP3;HnEi0VK5=$YSYKwXf8ZzW1B~>U`1)KxM0iha0 z?3CUerUDc5en}Q)W?jR=!sr{=i{ZUIHqJ66jcqj;`5_kBmQi%cGEjg-2%I z8(;Tl^oq}^;ln5+WJ@?2ki`xTvq3+EixQ^o{K*ltVT3nQ&K#MkpzctZ>tJP;Xd2n! 
zr_j|)ueFdlY_eMI?4tnzUCF`2wrtrVWW+?OB=z7&+6eN*?v1v3HX#-?WeJb>htu1l zY4n~*S?)E78L@Wlg2qjzPP)IfXOA9lG;EqZoi_br!G*Gn&MEdU4O+L>{QBV;J^*F^ zH0d?mEnkKRkBnHFS{0Wew%)ilF_hWhMaEy;V(x-`{||A*zzjjH2ibar)2o@kw}`8U z=tOzarU#Edxp4A8ZPnV1pXbi{^?SPGk2OU{0@pP5YqtLF@Gm7pZTo=89|;IpI5HQ1 z8Ck(9Nf6%yf-f}-&lE9O0e&q=rnP~9S|pwX6nI6Xdx9tGnn3<=(>F)aAl&Ia$hI5s0sag^>4`>h@Zi2Q7Z9C^pfQkfmA8_Oji=}~J!d>6 z+UU~3{&&J}`>8MTZtUd5J?sxt;qj)Y_mE-Xm#;P&sP^ewHycOu)Fc{dlNLk*ytaDA75;y^-teahv*8v}c?#Def2&tB;W2BH&j1rV}8m{3Umw>;UGB zG>2?O9f1vaPJ`AHr=X~n2~29oq-M^8{q%k>c_spSx}wkfsp%-qFC!Qc;9fd0(8J=1 z)x3^$qMH>q-7^>~<*J^0@25T`bW?XCQtvx?@<|A8 z(zB4IoM!qm^v*jG2q37%2dmEKT9)x8H;cHe^|N>aIV|7-K0oKRy3__DUy~F z$_!cABYPyHBD-WO3Pq7s_THO}BFSDE4HQXbB*b^#&-;DH@f_d#9?y&a|M$Dcb)DBa zf~KXVq~7Cb02u^9jd8gWQ~^od4=`!Q9I|9AQYYHfGi19KH8r(34iw_Jb92a<33ww# z4IJ+9A>X5FH>uVXAU?G-6|>thK{N)AhJF`lgil=DNa5{23W`QQ4YkS{huxX7;z#8z zxK~Ej0%{y;)~Rf_u%gP%Ro38I8aUxh$9X?KKF5jyu4jGM9FE-etxJG@rQw5I$~f&IKDr%J|@fGUHMHddHD0%c*PZ!)Ow${rEscmG1j@V+ijlV%UHW@Gd(elIQz)v zinh+@55+<@L$1^AYrm;)czc6(d>>exp^=gIwYW3(YyTE`BeeFNZU*si3l-HT(3U^p zqYyrEyXOR-bf=YMyfQ5_GclR>(ox#}LSkiI;)PdZEr+e>LGk~9@yZ7rE>Fds12@ZY z-mF^C&XhE7Hfgdy;i|^(CacYW84wU8M3(q~XM+~-Rb%5lq6GnhYGP(4qolMQYv=n= zc@$3CP?Q2kp=KcZL-6^f&>RU>&Z&l2Ef*CPefTwFk%%N^5DgGBf8Qh51TqNXN&$=; zy~1S1DPd7j@`yg8x;F<|Y>9sg_zf}61Ui3eq0fZ${(va;^V2?|cEhM&XTLv@=%UL4 zk+b$D5&e)zMpB*0*5FqIw=EeL zSr6eq>3&fxp?m2qmdD)M0fQyP{lH|L>4FnnnL`sM(H@!%kLEUvf4 z!vcC-hw5U`uDsQ7+{hibD*(K$(4F7O(1NiP_BTb_G|o#rk5-2o#dwH#!EGW2Uic=m z@M%K51N!%Rd3kw$0fDKJ0t2;8>=z`R3bq_>3c2%d`QzvNA7gKihMwFOH#0#=&FIgc zk~|hsRam|1^SLSh`8s{qt*#53G8r|p=${<&l|25(b7g(;5XHacS9zugq*LzC{xZ5w zGlViVB*v~R00b6c3=S(dK0;OZ+HZeeeFt4Juh=%=2l}gGk5hPdO&FRt1?$+IyB_jA z4_z}@>)TjFQd%RH;R|hOVv;EI?{cad=mskK`gZniXp=M9M$drXnO*Ul3cyk#il&PB z9v!^n(vLH@UBp)w+21}s_|DnOTHsSk_a{-m?cSy0{MwtUsvd-IXbKHb?aR`u9ohdb z&hLY+O0x3bD$v5{nO+46ecI0`1GY0RdUT9OQj?^vx~0GM&WuRmF8OQ5g~kr71QSeU z^EB75*ZSs!QvV_D=w0AlK~w?nmuvQ1uP>+jYU+FY?W3c!Za@iNH!fhdh?OwM*}jgEkr9inFmT^Q*@OK36D#~DnH`gKSYs72+3SN97F1M>3@ 
zbQy4Uh=v>*L$R+2I40C{Tslkt^ee9cnIb6<_$4tSop5A-ids=sORI0xOB5m{9Bj(j zV|1*nI@xzaAP7OUb~Ht>@N0m_hYmFzu(zD9Y7QY00wCoB!THC076AB(rupu>XdY4L zeg+^)Y!NCjVWC($fwz4KQS9h96%-Z!B26e;oH3sZkj4c}KfxJlby&eQrl+f00v26A z075*{)ZF$Db#)0c7eJ{bBTB-OCsXV@VxI`c1#E+YDUOWKkm_b8KH7-PYQlCT7BP`J zb{{b~hVl$LtGpBWcSOUV(d>6yCseh?aoj({LXCc9<>;x^k}X%y9q+4NVe^q?V%lk= z-&*1F=_T(P@0ZD6^9-TxV`pq_?wT0LUfHd7LOL~7xoOeIySBEX*owh(?+K~6nzO{K z4h_V&-ehX_jEuuDPCvJ~%VT4^)j{@+t21lLZ=YmwWbF^wZdv|2vpA=qkQ3zwDz`E- zv%s2#(uu571~2cF9RF3Ere9654K9MNY>eXwa~)o<^5x6nw{HR#W3F+bZ#uV;GBBoC z^5DDL-wsCEPp%A^u2OFImrmQdt6n&G{|OxZJqi@X8!U*cmzSj9kk zO3KR(W|xxHc!|9L=60l_B--$gA62r~8Q{VNX;@Kgs2&YfYm|XUKezLb=3Fx<7rBZ4 z1G?0}Kq?TBGCbXY29Sk^6DPmt>~YW(@Ne!M=)@vN_i-KN8!1 zbe-qg@(6{wQ?ygIq#83CI50SMCT8(lK#K$;mjNRfcbP(v&mKodAF+6U0Ce`t*c~}`jNE$>fMX?m` zCpD6_ljnar24b*6}W?VkG ztuY@*&pj4NU2>fLE!F5$za=HL?cI>x(?s>uH(}Wh&kkRAOf+{92SFZa-N%@R>^5m- z(V3kAZ_T0vLEP3Xl2~9~w?Hf(pogpoKc}eFIKH-e-*3cS)6L+<4CC^2?mDTjd>&|cN4 zSrm{iPz1Y>&b*V>b>Vv|(2kt3M@p>3;_yFT{I7l<0XTmc^0ig51bC#<>KN&&5S@;b&v&SL+L+2WLAr5gp%|`fkjM_f>=H-L)JSzSxxBH255Ri^ zPQAit_;LIHhZ}%^3%Ct@)=lsg0S*n>aM-GTnriBC#IzG(F7^!>7l{_#S3@eyU2q|O z<8Om#hhc5`xtcOCy|7)fb{vLVyDWJ$a6ZF-b2bZe-rEY94RP3`vbeEAKHw0MQ5x5n0deG>{gTI7$9Zr)s{k~N(qaU*#B?8MR0 zE}mxdcT%jh!1S&K9VoHt*D#9TqTkvoZKyoYyz=j3VQC%l^ zMc-3>eU(rA=)JU}uX51GogEIJHRzVoGpp{{k7Hu88*Kq}D@#XsO&+2>#;HzpWida3 zQ@Xn)X8+DgFH~{8i)HxN{mM^l(4Vm(vp8YpuQ{ken0#Eww?9}?8qlyana*{N_Lx=S zy|JIBrkY2RNX{wEUgm4pR5YGOQhfXNbLggWDev^mj_hi|ycR}AQK8|lP6=hTjg3FG z_U@^_q#piI`dWI}yU%6l^*^F<;t&vM9bG{z(^7a9h&S-=)1KkZHcP0ecnXsJFSJhY z;gSr{qZPj8no||Sk#${N1+ukPoY3}+0wuk5T)e%dcD={q`g_}U24(Ov{26wjo<-~z zq*UEU5)SzMNfTg?0Qv+nJPEKQq#J2eQ@ej zbhW+LW54$rFHGJEfJz`3o4&&e3JO`JDeu)#EoxP|I=|xUj@}JdVYs`iOnt^}Mhrv& zgk!izS|c#kgzf8zA066Nh&T>{l>+j?H{Yk2DG;y*T^S_$gi3tGbJY#w{5i+Z*7T4C7tnwe9P{EQDS4&&_cih^FABciIsm*p~W{m)vhO{2)h5E}8B0>B@ICm1jf-c7d2_tQuhhplB-^%iix}(FhVIIxSH1kv!8f znu~J;L=B*WqgbPL&CP*$_#|GWYq$Y)3WBKPrXHDl=KS?Qvv)Scn&9FBW_VVamXTE~@W87^nX4DiZK|N$Ub8UG7If8Uy{xRV^Np$Q 
zK(Edh0lh0r>0k2|Wp}PW-yQw8^QH8aS-xz4!_QW(3-l5ZZp)cQ>)iK(N63yd7=AU; zfI9|8bJ2J3^YfG3UvzlryAI<3NLzN!??F4!UcLP;MEXX%!8JY} zMNL)j80R807j>tv`sG9ff0nkuhS9YC_9TvjjPrHWqWAm$e%`@+LT%;QV`de$A*zi% zKpu#}I59Q?7chAwtJ`WmF&p$pDX_*-Y_*>}Jsk$w8v{oE|BeU2#XRTsA3KwnZVL*? z+|40qyeUUzg3L+LDu)kOZ1|7C(8I~~gt@-U=9v)IY&AcN!lakU7=aj=w56)1egUNi zVEs|I#h)747Y@T8pZFzVbppb;y=@<|zu^7ifRGhz4*$YJQ9$D_zy!)%EwM_(W+$>! zG_5FT9rc*D2Jg-A`6>|_5Rf|mD=+T|sWhPkC-OeBPEi050BLCgD;hf0Hq=odr)?v> z0_s#^g$p||Xfc5I>;TwLDle3OL^+P0IN0HBgpZkM#GvL9FQ&GfVETX;F)lk$a2{qz zr0vAQMNW7SkS$|d+f1fmIKMpc=o*In?mn<6{6bV8WQgiM*nray)s9Sh_c87g)RLdy z9E$q_Qv)L~(h${B-7_I0C3Se*AWY`y(W4-fwLn}?)SwtqLD|Nvwg|&7;?{uzgIw?k zm;~q!wxtVv%>Lqb`HQL|{cjm5KEMb@U5hYb87nKRhZ}pi^sZp5L8-Na;HVhAd_so% zN9=YI69aybCSQChBmNFS1M$!?Tq4;)@{#<6sp?!M$j2wYf1QXJs38t;|F251**1;}|wyRY8Kx3oKri6&yy^a&3Q`t0eoux+=;soUaW`A$n z&nL*ZobB_jCFpTOka&eP6E8behpr;rg(0SSD&e^g#~o2W;>`VV0J*jE??stM-$e$?FG;3qL8u8qVnN)ZHhcU#wlE%9FNr!$#lNSM=O>r>5q(l`h^susZ%iQIpZ%%J`3+ ziuN`(i>$=G&q37&2W}S&OWOW;6B|qzVaWl>@h;HhVY5O)UjS6}w9RFNfLlN?lRLjZ z^<0#e-V7cK=t>U2E?`bDl+rQMgCsNdT_7M-mx`AakKCJxgZuKQDQfx1!%nR%ZeU}h z9oQ=F&SL8%g42F>sznI2wNdSs4ziaX@myx1?{bH&0%ounfD0dh`wgH4GJeBfgDlVt z99v+5JxeU0d}0c6+~JFVZmSqbx!wS0<4}d#V=xfwGXCU7d5IbmfDSSoefaS5=g;_Q z0=}RxgM3fX#YG%vI>k%`z!rFz?*TeN(!INc*1F!xEi(<_DL*l^@7}#wtj@dnK5-X;^AIYvM^RBENDm#Q z0%jk?wDRVM>wafmSsEMHK{Ij-hdnGo&0%gx)SNg-j^K7halOwyM;knO#l#IKF$soD zolFZ!H456ZF?3SIf`Fn8;W1(t`4^z0huG(zv19UhT{qkvPP?I&CI1i>7EbbLg3%HI zKOx}!7)rnPi(g1+0`Tg#eW#d{Rb?^30pJql?WJjAQE)R&BOShbtbm{rF)Mcsiv)e; z|Alb4u3Ya#PY@0f|l^Uff@z^){tKjQ8#I=WkELE+9!%;d5S z#HFO}Zu!c4txr4i^?_%nc$U6szna8=c#C#^o|L_I;i34f-DXt2w4NKn)=zi0JikUu z>)p(HP519mujW;$Ki_vQuTJa_J6FbX^A8`}NrdFqh@4yww2h4g*@4aart@JyN^FK% zpG0`?(Y%s4{`mSly!91|2sV=YY6^8<;k)ydJ9CPPUI0YJ#9VD^f~+q909R{$ zHZKi-2SP^z)9?J22U|XIe;USPa8KEBdXAPH>o9oeSqs5trekF6`SSg;Bk@iocL@qi zVKFfhJpcwRhycYsC#;$Q;6OSfaw%DZj}0ItC~{~IebBn##q|vg5ET;mgfO|{Ld-$q za!zV05BP*jR8J!@uOXa#GHC_tR}#}EELJj~j^X3GO8e%18g(F~h5!ux;3NS61)9m* 
znYp39zS5$$8!+o75D4ib(E$+|Kin$;DODD!Agl%QV~e~pr^Jmt5RDRW1f@A{!q8QD z{&=3*|yFT}`#BQNCAeF~yip!_RXM@W4NweTB7unhFa;vVbvP~s- z2G~6C-MsSmV`4wrtRrl8LIi?p)OkUc z`N6}DfBz=fd>Ap&bSv1P}j=j^XI;j!!u{x3GXdDNY|bPMhyNnf)VCk)#bxZ{Bbhd0lXm1~~~e%{H0x0tS@bu=Rnf)Ke^|SZHr{ zzvUGb7B0;i8MIE&2wF8~isB2pU0SM{HRdd!U@4MgvwQE4@=Fr$UK~TyLqL3(nsA66 z^o9sG_k;`G;444hCFhgaiDxJ(gCH^_&g3`%m&%rTLr7DD1BCEzU?wwSCjTU67jOr_ z2P{9O_Z8q3xfy18zei|64JVHWbE1*GuK%pxoOh`Z)9SP`_cp^m4 zg~6mOMqOxt@x~xywNdV|wX`JWyii>rT1UHj%P=iu1|%edTq4k5kjeA+?-^@$PymI+ z#twg(OqDT+#|MCUj2SK&IDvnarhNPMErMWPfcp=>mBzMO*$Ql^ z$v1D9@mx9RD!;zS_UInJ%LW;l(ltjMU%cMaJ<);^SJPDw_RtH{<-Z?m@HK~u+0ZWZ z4H`V*x!pa|Em~}-z6&fTw3~Y=MyTQcg=Q<0{kb+3lGoryjwy1sXATwH@uA#}!!DHX zp6-lm=*cwpju(6+7bXl=Yn}j)?4#F3Qg4bRy&o2fT@e=6x}xsrV&puvD^{shymv*F2=yTd|%_6tY96-!I9v_Kp$6xTsPO# z#dOp}9|4Y;;7UggHQQ%${}=v27E_vmt(|(Pj%yk1s_VabzT~e>_hBSwKjMB*ZZ068 zd!8zEKa&9eyzV2w@yA_mjf&kmY`1S;-{0j#|NCMKsZ-RZO2!{%KCikTkq|AK2JXHnnmy>Rnyw4e zj^U2qNitEv8HpeNjmI|Zxj_3aUgC7Pqt!dG94ux zEHqwLGa}nrYXGen^ER(=tH8U+)uV>Uk?RG$9eQNCC$dGN^N($a3}d)gZ#j+vSV(PhiI zxf!-F?zG^GkhC{xXA8U1`Z`7YaK;WsMj8WWA#U0mD+{a{x-O>WPo8`&*DF%E>OOKS zapBj&%q<{R5XKxt>5bqxlKeqp1%sW-Z&0;THoU3JN)&j)r=mf-^tB)=>IUP+aL$B` zg@1pf?~mWzD%!TCD%zKtW@FKulWqWMpIzoeIzdc%L(&7R0TDyPiB`MN>`0Dicv0 zwAT=h#@D|4=6o*mUZ2+Wp`rqMYPo+)0cnxV9Sfs|i>tSPtu-&F$Z^UjK0A3RjcvS* zsdw?K#jDuvdvvr1Y2>dqx`|4MPTXQlbD#J&_JPx zvHSk2*TmBD4XRRrHWI{e5T_RLDn)ytf72kD6JL;67Qw1(9IM{8BUfbLvJ;{?flw0@O^|K>#n_{q@>u4_-+1qaM?YJ1PY(HpQ zyc1ScFnA1x#|auW;*#>b{=2KPIDf^tjWvbZDuP$~KfC~{&{*GmnR!JD? zgQh{KK!kCjb>&JJ^0xLLv_FmK0#1OQqgfPM>Ei~cPSw{*>&Gs7cJF@SeBJqy+4bB- zH=R$jm${XJ8D^SaI^lAg)ogxfNLt)M9p19-O#|sszc%*doBN*CQLG%!+?K3rX&)N% z|y~r2Xhmb?D`Y@ga_-u4P_87#PcS z_Ko+*q74IVNO*&Uu5R&JO`8vlmd7zMmo+pR;lw6|;woD7cz$7_DQw2N&7PorDSz|M z?J|CCdgN(+_&5=3H+22AfYHz@64$2p$LtOMOHQ#Fu6P0z<~Ljmhyyz0v#&r)BDO6a zk#;&S!8#%iGN_|TP$mCD+DZ-Sxl1wb; zUK*L0yaKrVcVrEQh9@bF3%Rj0kxH7WL}`k-9JWG}VGL}3NK1i6lN`-LLfI6imN7w? 
z#e<8OUis0Pj)nKG5ATJMz5%N1i4=1`^)-(il@=eL$~Guh$US21R{85Uo%iog4Wbgm z+m2G|(0O`#V%lbwlZl#yWWN%J8eH^1@Dqh-%3{GwGq$rE&N(>x?wzDwSvb1sg$wfX z(mZx!8vp&L0$0qzoLdGHmP7dQYw7Av$+e^6ps;c0KWhNxY8 z6JSFP4ZR$j?b@~e;o}52>-xj0fm6K>78CPTa*ErvtrNlZ#O(C4mey-zsh-Cf8f>eY z{t8)!IBJWe*JKoD%5hVvG#o?*z>w7-%I?45UT>kH0m;V?^2)@}5o;x&_rx^_Qb=No z>00}yhr_=|7km(G8AY_nR^WCW+paZyAWz^Gyf`dtt;C!{0MPWi)p4CHFzwLEl7Mbok z8X9%|Z<@<))L8M~+u>*6vGYmEjW3%p((g`nbw5p}XUH;7Iq68hPmu9gWAM(2Tb7oc zXZ*tH;^`~QLKgK?)au&pQXOLMZd&pDEA7uc2i{D8YLo?0(nU!vRTxsP_i+7_0Eh zF<0LLDlTxhQVrpaC7guQiZ%<$F58>5SGEGmLljrPhQzxx*~0B$jbb!SOe%;QAv8V! zk^Y;1}GEk!z2@8A;Hxp%Lkp`qdQrNBVC4N|Y}1^n*%GjgqL%Y!FRw$!Qd ze(YME*BoNkj}FW~st{}QSTEvY+E3IkpW$kdo?X?9p&tyuiPjJ*Bqt}Qp+DaDj@%Rk zO)^7|ueoZypsv^bGjxt+EzMb{{?u>HeU20Xwv^4n8z%4UkfI&bW|BOYa*c_x*{hHt z7oa-&il31|hT*KRVQB(0Ko-PUG}#Uz$+^?S)D&ae_Q^rUEa^n+0Nkv9an+PyhFJ<3 z!+h%-Kw^}j(XLELtP#yxrvBYgD~MXSg@l@tRZpdI;pU;Uic|7g z1WN^fp`^Ea`>YvlRt$ACZbUWof zi=p+_XV11o{FP@>fJ79HARpSOBUW)qQpO7Sw2Rei%M$wT^WEC=CWCyK%7WQd-OA@9 zcEV5qtsD9Ts7ckQ?o7dy1~duIDw=mDE$`kTN2OcuSfX0d{ne*C*$0}gUu2Pj2}En? 
zEtk?QzZDdv4__%|HZkKe`+DpUq>#J$*u$8;(FCGNsG?MfS3Yc1w+XRffl4o}m#hk8 z*zZm^H%rwFs5Rba3QEd6{ng%q&Lo>d=4FTMez*xk-q3jL0-G=uwlUFAk-_DCU^Y|eowcG$J7J+ zatdtv!xd!HInb@bMpDD`fQZ@_U-#wAum5beZr|e-#N=C|qIm2gM{fW!V2>i5Rd0*j`=uJ}%tXRWMJPu6|!HU9^ZnOa#p)Koa!$b*M0hUYN_u zaVUNSx`O$5{d1N%nM9m9nU1G)Hd;800tX?e@lD9DRlWRCX@zQQPNrYoc|T2%9H@}y)sLvIY_KHAgM z2YwVDxMtiO%&`52B))K>j5{PS^AP z{DtN$m0gCIUxHqO>H)jOgQVWqqaXpbim)aC^7f8TCWAf<*27Wyfs2=zJ{_gn47!>7 zl7B8Uoh&P-xcJ}Z74>I70#5(~`*2(6gR^Ffse`WGL5@vl)^s0j5_4FIkm{gEErRsq zMD)?!`=(IM6U)DMiaXspt$gX0LtL}%Z!*={p1$4luOf+K@drV&_+|}*0;Oov+x?3+ zT?uG6BuY=rl`%QAm`~)jV|F@m++)v~RjmT!(@QB@C89?ko*og9!Z;Muuw)cn#PO$Z z{DT&5(u5zQ=i-A$wux{X3UG5DfDaN*=t;^n2~0nT!7#}MP*FJt=v;f|`TPl3Y!XH^ z4isZO(eg);Ts~??kH%AckTIi{;}8u4T^56;GZF9NW!Up#BQ_>R$O)g%O=cEX-INKk z*o|gZ&goE4AAqJ<>s}WV6}64-Q#FcbHkyrF`%uDOdT$4l&t*-=Pj@ZSfBj*S7#{xF zz;v(e`gLyxzO0pD)#uvkL4zZ#cA4&_RlRb@y0y3=!2B5!_;WF>Xs^z;6Z=L@7etp?Z zCMMB-2?1e@W8=6$hW~rpRfI)2j@9n_<|hZw*Jt$kZ0O599~|Z2xT`m&q|EKhlVynmWSym^WdX}0Hs5D7vxNbo~VS#JX$j|4Oo5_6!e(S!1-zO`}iE|C~og- zlYPgx;yQz0JCIjc4zv9~;80<-tii*#OEOx(U8v0MN`bf25B@aD7nl5M&fc05xIRRA zQNiYSoaOtMeL!L1k#-Ju1mXgiuqvce)rj2=_?j4oTtmcRVnPDkh`NP-J8Vx$dPZEk z@r}vovblLPbMo({s4+A1R~^ZX9w#}(xc^sqkQPQ%!GHXNxXSqCcYa1atLD1KChGWP zp(=}hIp3bpQt_u}XyV+t_ApUX=em!c{l#u6OlIx4yAETM74g{m)k%i!O>@ecN=UhQ zRr*?G;lczHL+!~K|Ar^{4!>pF9USMsU*h8vuA7{*6fZ8>eH&I^&;vw1Kj9=Y z>dAiJ?n9Y6|J|-*W9QNtBxtsNrKY-Dbk-s7;#Zp8Y)&j^{?_Ltq-^H3Wv%!Q!Y;mW zQ(N182+Ija2wozP0sxnL52ZJe2iC2QBCc(BI!M7q)hu}cJA7^dpBFMKv;nh zSbP}Yegv-H^kPzatc&NmQE9BZF_^9`QFo%Eaz_~7(A-mc$qwf~%$Xtc)tG%KCx?E$ zq1h>0-&iTFu<-Tc`g;G2z{<)kTB0nufwr~+MYml8)yJCGP0Mf{qWn>M;jMjDSbl5o zPC#oG78dhz!m>k8#o3qSnS`<`qm>eO{%H}^ld@kPk4bQEU+Y|1sGis7AW(PtFI%Z|FS;OuQdGU8ulxLhq2`?{c4|MTfk)T zlKnzP%7br1%W~>o0dJfwJuRJLbrG!sBK5OpdqJET!#6&J@!~0kFJ!xh;;V*VEei>xjXM+Sn&)mGOyb0kxu+N)Z#zoSiRr5Bji% z!fFBbSzlcQa{1mcV8pEKvRaX{nwmKy#wUM#&AP{bbxx?7UY$ujqVZBJJ++qD5J$^U zHajLe#Jv%;esEFEY#g%acql)5j~p#gd9 zXF`}pBUYb0X=Se+U9v@QhOYAVOy-(ZtW<{rnO_3W{s;%BS;;h|LvpSF4FVo!3+Y;m 
z=zx3JA^*ww#tqf!%f>`pN8B>OudY>HINuZZ?4-21Z*_GFRSHW^nwnXw3{+pp(QkV_x@o4E41%KZ48{j1ZE`nc0@n+SLm1*G}yLH zM0fh(MQvu3tr#1;1Kr?13cj@Bgqx(#mNV9>#hEnL)(2CL+{n;c40FFj$BtX30T9;{ zr%dU{1%(^Oz7HPSRPA(vTc~=$T2fNV<@4_`SpBds?=q@j2{;TmMSXh+7 zG%`vsAhsJs(wdW3{c}frSwrCr9*CT)fx%M)M{{#OJ*|#TywFf;WooK5?y*$xI^|u= zp|pUfPe;2oYhP`wtCMPY@4Zp8aj)F~M_R~abB+EXmOVK?hAU&BcZ&qGzF;j&km$y+ ztO78ln+0l%JkB-%6<7gB7k+iW0X6Es+Gu%XEiHYI&XoX_{XkldU1qbS2YybnQ2?;y z-}?1wQmHIGoqzD*2U>z+gZN22zagwhmpCY~A37GXU&oF~=(4RR4K7+_(NL>0qs4zW z+s)II{e$0sz8`d;NH8O_VVmLQ%VF-2nzyB1PIxQ9A+;&vb5k!rN4|Uc;+Y`VToKpPyTAjWU3ois-_|!MHZT7>gci4-lh) zKQuuX-3mCHWGP62-vD-Y7H|XEKzNgY(yu^2NOV&K=A`3;`8dGa)SEwocofK8;_~8SqkIi zBM2^7vL|8xn69T&JFFj^`rzWg0g_XIaqVbxu-t8_bcQR5%~kCNo(y9_>Tt3P@($mtHZpv}Asy^n|VX%PzWLcMin_ynUSGh;1|Mu|kCb2x*(m`TTBT zW8pkWPt1%~4U;Qw%?vAF{`Od^g90?ocvSTu38;H2zua|tN8T5`j!+o!5G6A-_QIK2 z9e*A_o(%Pp>X3Qe(qaOZG~m6LH^!!lf|U+!cs)NQ-S2Uu$o~Dcsw;Cd4fVP%KWe=1 zu|Fzv-gSaTR7GvmwF;w>%-c7aPxUD)q@MZUu^hpl`K1hXZ7m^>?Y;S-oomiv(pTwF zeYqO%t7#RLgzptcw!&B0(Q^S2ef*wMQuC8{m<()Yrv+)Kw7_%~27USTX}#)c@Eg&) z?w<34c?ih8RzDc5Hluo2_dfRpH9p*%t(OB_%Nr>INjK>^XU3o3*tyFt-p}xt8ahwLO_^=jVQuYpS`O zKK(Jbsa`8Of$g}1)bZcyD!kG~JSSPHsJ88wSD=rOQNQlslPq7`-`{sZpp=4`0XH;U zZ`ykk)#W&b6B4Vm=O)BP{Aa7^E?wH0gE!}aNQu6}mY?%fz8CYpIl=q9`*yLOu2pQq zzPzdUfMu(RvQ1k(f>tQl=l!NKjO4@Ny1FqRGQD zN9IkxhfM!hC$^vbf{c|ZC=jB$I`Yb>{`l|wG;)X1H~N}lgok$KS6j>$F+%o#qow69 zSN3Bl#JwO~M%87w*d?AbMCNHPZ^YM@*pr%%UT8LS&o@gd>OMU_GU8r5{?FGhXtZ#z zV429*hed^jW6%*y0H6ATlI8+Te)7#P{iv8#!c>|Zu0%J5;tfFbw(2xqHBD&xlmMjz zLJ-}u2lF44IvBjmLpjrlQq)Z1;HB78Z$KNSnaI+Rj+B*)4glbUgZrCW|4L}eS(KO{ z$>WF05R=rIA-NkJHn3c~^68@JxaVrkhUy(QWE6u|{*#*YxqK)ep+S1yZAHA-@wmDR zH)uZw0C@!7AZ6x0I@(m%8A{Lcr23)~0TEIjP7B+7B=KUf_lVvN&f;J->EE0X2Vnxx zH%grwL)8WXjS_4A;$hO#_ALU3O(?pOUMUA?3*;$2r!OGowe|=uybH`!1h+GFH0j`9IiWno-w(G&BqY0R3$1)SP*;nK5L^T=61cn87H3&0Rep5*n@H(6kjsR1;#?;rhvd7LNiz1 zziI~`57?=R{&7xGGamlXk(?>{Lbu}v4kR?gKf&}TrUvdc(SSsdQ;aOHn!36TFh7SF zj+m@F=VyZG5Ag*tVrz{Vwm_ zgoZzUBHeVo1GQ<#uP-tQG^An9Bk>{s0KRK%i7>fej 
zajgF|iN}##au=+uJEAU50)1Q$kl{atHz=V?C!->qfAGJ;AqHNP1!Q2h1}8qeHigB+kGA@q7oopNEH@>3mMHkknbN0%~h zBepyw=KXoT(&!3S3{C!T#>xZDFTU zc*GZa4QTfZnPtdGpX<%i+-JRcx1CA*sH;DMy1;2BIaYr`O1?$v@u|#WWxY%*b!vn? zVY-GF?q_xX*(nz~u&9Wo0@zyE#^$KX5P+Q0zkLzki3pg-JD&Zt&kbzfe)(JY)CJ52 zVRr#em&Vk?GYAvIHUy`O%;UK(#gZWnngQFQ>NS$UVqEV(nfyRI^YPJ}ZKv}#sdjLn zEJ06Dhh6tvO5uNIzo7Dj;xfUogHZm!aR!F&u4mI}A@Ut1G$zPB`c)eGf-=Tx>db)4 z5f^tJ*H=hYGyWBE(?F})SXbwR@D-3E-e*)b7b2Mx6Y;$1%^BL$3=G3w>*80JU$Y&^C%o%6!~YMDWBZ=+9^FAkt}|!$fq&VXc23v&We#nu z=RS=LT1Ss=E8*n$Ya`Kd5;l(?@7GmjyIZ53y6^q`i*xJKp{)gskBMtAysyHFhh2l< zGWP`L$6-O5>6zUmOo)(5(G0s4Hi3@>=b+x~VN6szLCAp3`Fnm!ohw|&5MCV}N4sYm zrn(c$e5LD&ivk)zQ_P*a?KXilT{AeSV<0#KgG;CoE2|}X*??gG{P}ZWa~rNa{S^*H zGK?qW(()f=EoZ4Ay(Uh`nAgq?LOg;me_-qc&#&<>U*OTG^(Uq9yUpu1zjx0<7WT<$ zi?PcDlXyjfy?5^m3Jk2Ptv$E@?Xlu))%}=@z@1-Iv!g&mjaT^O@7Ftj-HtmWxqhDF z9n0>ACx4XO0dKvdlHyk?g0gUjC019Jmz5E>E1N1g6LBmQkOSxE)2^*-I+GQ?*g9!8Kt9m8}9ML(TWq~|0>D>eaBO^&~c=^)2^^*OKNO$+? zO}GJ3bijrNN=uWNL^ud+!X(^%p%bSymT+Engy*=w|{tm#5;Y-x~3$nup5I|445&|GHBIuk=(h+h;}s~Ve{;I*am zN2@T9ius6CWMlDqVtj;iD7W=6QpxFe?b=nD4W}g^u!^5mVO{`8D52kH0G!Qs+(Q~C z&*ZSIt*{SdI-Tf0sqYIi1b*JLb?Wo{3oe5O)baQ~Z375q5EOMQ$lZV%NDd(DC&#ZF z4;i~QjxS!n_4Ay{8xw234V|HTrK=|~k*q%UNYjAf^aoSZ#xv1eh}f+`=RBnspu>_urdy$`v@UZgtr}t3peVUcDAC)zfpWEL$_vzwF(0q+cO( z?ndhejhF2fU_gq^D z+qP~smeI65(jI=y5B)Z*;oxhpG4>Wu7p+Eon}0+^go=g++{`ZE$BT*@2M3q&<9L9r z>!)U_gT~$aG|^WeIHzj@bc z>{Zi%ZRy`X+d+N1C^GoSYaGmY~ zR}=g?v#$E<>Pq?E*X1};^Wj5<{-1>@Ma3#Plyq2}7`sCrZKDCJa_v?eoJ2K1!Y~1s zSWRgs1q9M{N^4}@ZJrEx^U(LP>g6B8;^H~i6KN@IJ|o#dtfjKDHoNDW6ZQ)T%x7`8 zV{Kp`VgfTi$m_|XJ;1=sd=a)3kh*wryHG=!dEmhFE5ilQYQrl+<+S1H+ZuD(MPUqr zvV*4JW`fj=cw#Dl)e-pkv)sPOf{4F+wQn`g&e7~Wvn}6j_wrB9={-2fArhd`mVzmO ze`@MC(Pw|h58&=c03MM}&+f|0myT?D?XDGGqzLLJ#MNB?M!p;2o&#&Hbv#`+2>FYs z)MFH{RJ}=2lbkQ8&|MiW&8L0hvTSQ%+kaTL68iLY=!ze$EkAjYoE#PzY4-e*{b`b` z2J~v2F#b@^WwP(m({|;Iab3(=T>DTuIy2(t`D$4I5pB!P7+02xE_EP2*^!oHLt?v>}~An7_9)8N0XU{^4i*L?H z{28v4mUt-DQAa$vaVis0ET}VW=pAq~8pFb`HE#`b0_bTiOH|@%nX1x@|fz5H37=A#;49|BQ9e 
zqk!cN%~KuY+8{g!Q1Wijk>CGytmWre6z$G~<8n*OEi;-Y4e>F+0eFhP*a@{5p}2$L z%8Yiu0d(c>(GC-LIa5=(Ft5D012FpX#s)RdRK8Z7nL|bg_|* zA?E{xCwDwT!CN8K67fFzpZ_B2!>WGz8T7SjQEgA6B_q5?>0}*2FS>)|b`Q6Q;DX_m zlF}azaYi`40e^$;Wce%N7iL7nApY>5zIDl}bL~T}d|KKp%b5=gc4D7mav=Ou9ic}x zuoZg+vu~(zMt{R{vA6?{^*-jn}#22gek~7{G)HXjD$Gie1yk?T!K_r zJ{f9ox}?xoU^L#l*82L*P7`rKU0RCUgg6y}}p!23Y-NoU3teV}MPXQ`{ur7#Ux? zb|3DB5U(cT!qGZrw0RI&_Wh6M(!$}9G4!OrOg)3D*-PDFFVf7dclssb2`nj&!DA?^ z{AvwvsU3839m^*Spl!d5r&GapqUdHet$S{s&qVCe1syg^KQ*dly-Q@JV%DE5!rH9{mM_8JenME|H9y%mUeeI{Ag(=ljI$0CK>H|T#&{0USQD5WLypR^V_2Q@)D(lgaw~t=(Tk~ z!DCOkgu?_2oxm0%293*v8k(3Z&yKgQKp8{^9Qc1zw_}3;>&;4vV^Jp^mC2%)8`zwi zn+q*4YwTxn@w$EsnI|p*NrkAn>zDWRP=q&JDd{L}}^~tq9!nh9RdZBfIT2ykqlxYRztTM>>lj()gV(t z^7H5c!Dc$ylLrI@GRvrL{Q2?e0$#Wq^d(w4I-PIY!^8suoM^V)ojP^n99yYAnAUXk z^j!s|Ee#Dh#=eh(P8i>P&2KwMVc==|b%#65GR~^A32(rif~5MQ>q0%>>Pu~{c)2*e z9`DT12=a`OQ*pVh!Nq9B$o~LV_)jS|X8_gVf}e_ctjmj@zXsT{gx~9iHXodfr0z`_ zV8(V+?qgyI*rgE}Bs;JkTkRfiA4^CHvq2^ce^|jTYzC5?{>|9vAg}3X@-CMS5qYSZdpW9NM8>=UorycvU5W0mnpotnB0 zZ0ulbyl%N;^y;qtbo$|9Proa%0rd!MTM7!F* z(K`+qz(v73H{%X{5V4u{SWSYo@s>iVj!@iO$Sn$JxU8cClGlJJO6;VHiajVvm#VqX z?DJSWcl7~ieBP7O#X(^r5hkwRr+rO?A}(=e`lDvXFyC4+v|G-l^!4+B0Z9DdfNo5$ zIQF57Q@-9fcrIXI4-U#qSOp0S$3v3;Zo;xq8+D0};u^32K8=mS(Le&$-Ei&!w4R7l z|1j)G`B)PYq~aEwV`I3&baxOcCfLwLJUn&lOW#>#(!KeQ2nfLC7kI8FIx+jaTKZl@+ft;5YY^QePBFju5f6dDAZec4#de2H?tMM~?<@O7 z`Ca!12V2e-(sUkpA`aa@+|K%OZ-6aSct+@dZvO-mc6VEcatb}}HhF46Js*|CtEEXO z&Nb5CynQS3Hb>?H^?I648O7}Ii`t@CT60tVyI_V?zadGgLx~JwA&ewqwFOv0*A8Bz z-nknS`PQ~eDbJK*w|RRXIUG^28(&k;w(({N?8rvsqv{v)y3`JqX)-){Y{ew<`k}M4 zvuWMkt%Tr<{8oYoSTdMvfn&(T)C3)-BI+<5{h{<&c(7nRx^3bz99q{_8;>cnSu$c7 zphmqnd;eOc`hvOqfB>5OQ|~LwmFu=s9Vrs0lMTv+V$-hbui4wFve=X2wbVjeOh;b$ zACcvdd4aeV%$SL5L)U83e~b#X2d&YZ<8QO|1#V{#3ezZ65&PKa+w=s%Ri!L z?w-iz;;en4C^_BxW~)_8mE3?ph>{SrmvB4)C|ldp!w8;WWuQIG1W5b~-dE$9GwZ`6kdyyfZ&s*Ss;iA}~&>($x)R~9@rI-VD?hbsulbLC@g40)K( z!#N#@y8VP&Ab)>2yhL>A?*|7MZJ^*k0pHoiKAAcPOS${<(?`4U%{9-ZSJ5&AYWgN?(R6x=d9&YYYFyg=;le2mKmVQI(ix+wH 
zS{*rF>MuLd+RC7vdnSRirSXj5MjTF33#mEEx?g602it~Abz~qfVXI26|CYYIhQ9HO z90D@%1_zi^((Lju{01g^=13AvKeO-fp$q<#ANYt3hkE) za$Ca-=vca5MZ>}x_23&+ZnG}Q7>43+0x!Coot^zUgORa$cSiV)3$n5ST93qUYlK>b z7gz&~+YLycZs&S$Z$iSf+zBW`vr>c(cEenXKF<%B#Bhg+@F5sP47I3n+3V1~lBXbC z39*e#ig^Frs}Q3Y?DD+){OrFUK-%mUq_^LTzn_dV32;WhZ=WASrfq&ndSpU`5h%(f zW&Q2v{a6M$0+Jg10|JZ;t$34OwtxvE2u}wtKq`6(Im0I)Kch% z2lwC)f~iK_v(uF*(NO&Nbpgr3dGp>_jju0kCD*TSJa&oA@`lIqv=~mVZ{6upeWO#> zHV-*6gDC{s8yXvl2Q|^R{VRf7FSKOXca3KvSrElho)F6$R&XAb|F7_Osaw?1y}vMS8IpDcq?IN4$2L(C=RYDz`3&4c+#x$L=JRt~)1SIvtE&$S!ot~7GeFd_>GE4I^nkEU+t7d}E9M{rA13}QWN<}0giBmsRScH!9E_pNs*Tf`szA9Ls7 z)^q><{cJKyLeXV4D67cMCJiI0Bv}-+ou?&G-sf!lHXj^FjYbbWe%-tX7@9M9+Te4fGR*#~zO{UlQA&ry(Un3J&BpGU4FreADH+5ks{h#t$jM!tX+$-mSe}LiiViuF^@JxJb~uDO6rADdid~JA@zq0_8qJ4lViei zV4S^P-8f8k1JH?4JR?w`!=QGeFnpQD)}y z>qTvw?o#n-c_lyp0v6e9m5a>}HAoX*QQ(F>17|DDYtW!%L*Ok_;0fk%M;#gwYe^Jr zI0CB-8hsE2*XI|>qs{HCCqZw+7(meygz#j_Ei6RUv_RFrM;)ZysuvqfoJbpC&_!yJ zd-tUG#?%#_gvxoqn)+4*6Uu11&P{$z4ISeG#*|p_kmP9IzaJbDB6h^Q`?43hT*Ix) z4+DecY6f-fE?hCp_m>`uYgjXj-`{NN)Vp~#dWQ~uyggOlsj-i1zikbh&4$isU!$)* zKwo}2pT{pp{~o+9wA~+LOO?#Pz-HGAR5ZKJJ+BwHxuJ(LUUT3c=?m{&gjQC-jPwZM z53|N|w>RCCmSkkwduAoE$Pr^Wh9_mLo-IH<>{1zy*=aW`45hWS8q`54nP%HHamQ)D zV8M1@75@haVb>RF-zB3Jf>*NEvKcgme)^y_?5p52pF&s&?}D4Atggde{B@76=6pP- z7xt<#hoJT=kqRO*x9uQ3Mf4xq{eo6b0-zOZEX)d^;o9HV7*Rj*U*8n*>oII6{PEw( z2bmx}x&JMYlq@qAfc0nAa6i8$`wceFJmK_qs~AwhHmKciRuAbBck3yJksawDV~!jV zf{C$kr{e2YtyzSNoQg%bvdLv?2UwR=k2dA{U3e6UufgU%j;1wI#fpBw{OTBY6*fE& zyMfZ~)aWr#T#36+-0i6#Dxv|wcZ(gS%L~tas><^>YTat{o9~~B$6JR6-5Ezi&eu8w zQ8=M&5=Rg;h5_RR2UXs}bK>N(_xVBUMs)y7T5vos!%!LQ?mg=|6{2nW!f8p#CtdM6@6^Hb1gjE{* zhN_B3D~bIz^Cw8&XY=iP#Lc~PX4D}lv-b2%mm2lx@g#iy!3`TmI4yp@=S2HPTpE5B z=JxjcQzpda9i1G#2WMACUWJC0&5Xm={k-+M3gdreWo0k-33dj9|IrxMY~0RG2RgQ` z`SR3h^+3N3KTUh24jBFCRLly>Kv~pZV{~mS* z4*&D3uWQ=rJ3-9boNQ-)U9a_~2+Ek3L1`=e?)7q+L7ffO%(BOY{jM9GXAXDK#~Q|H z#YA385Nw*=lP;urd*_DU=w}U*f=ImC%}sT?=?31r-M%qcAlM?lB@+?`U1qo&*NRom z(wWg%C&qv%PQZa@B_zQv^cynxPY~;V1|;0Jz53>RbMJ(N1t)iR?%xO|)L=;&SDW!J 
z{XUxTt~g{ZOv?5J;N;$HZQrw}cU^r4mC-Z-3Gs?SsXb)c=k)2IUAty4oYKrTW67Zl zAA)rzloPTwhI#H`y5{!F1om>lQ^og8tY=}TI6eJ*;egFYy2@IV(g_1b-gd2LdM1a> z5A)~GrwiPddcbDNW~41FxiailJDHK%AwwJ;94wQDrw?Q1)wEmv&C0!Z4NX+hjIv2! zl&tfB0aMQvV(!XNvSGUk(O=aW!osFS-=LrO`dXkl198){&see%qpw;)47tPYS2VvH zee=fCx6{@Xbm;067}rb>+zuY%pB3E1<%4I`uClRjR`d1M2J1L0_Q~GK7^oX)-`n-G zHG_D2D-veA;J$RJ=c)4tN7}O~kTo^!Yb>GbS5;Y+ri?fKwI1wr+ECev_<`+J>w4L=Hc z!2CDTNwMQ1|N86AxXxrRrb%N*C@g>F@OE^_gD3w1(!?GHSF%Riy)O|D@q2ynef*MX z>b^(2Id4@DQ!$-FO7+ApNdhs3umobJxhk0o1nV8JFm=Mh9&gJU{2BZn?LKvjtDOcg z#+ZcLCuo|*oNcty&@EJ;)22;h*f0PS48voEW^t-F2mgMKn6X51$7d(0U-`r(IJ976$8$hRb=`Gn>YswbsABb#leBh_13Ve&!2M|cXM|i zWHr-aId-G6tq~sx9b6DD<>YKR_viP%mYT-;aA3wCrp8(v&%YItPp_z^&NC+h;t^oJ zTCU=DLR*YBv1(`du{L?mkTag@@r}(`s>ia<>JxK(Dh4ysdtID!Q3qYcGHa>3`!m`& z2z_@%b%vhXCQn}ST9xshPHhZk%$U($t3WT1xPaf}FUn`Ji8A{6GP>cbUm_C_Q$y@Z zPh#F@WMtIr*7!H0iz0_kJG%zQ_kYAr2QY<^&S|F&yST?-dAxJyd_=6rS0(Sdy6BjP zi;AgcI9`TqpA=_M&?&@Qr}QzoH6GH9kr!x7-$%RlE!woETNQ|&Hqf_=YC*)W`B zbAR?@TK+|tgl%EL+@qODnqE}(_)bVLw@wV-h*Nuf1CWl&>c0YgT&Ti~xUYT#&SFi9#RmQJ` z5ult6WyK!GTKm$26XILYy}ab`ZH~XG6|@_^b3e1>1}w8a{I1)O=~gO6Uv{m(Db{%) zsN9f}Jm+06Eg3Ro%!ZobO^qRl2r;o?z!rFAaV;|PC~b(O~ z+hYE=Wz@>aElW$ijvPsu5M;8qV^W*yZ5|J}OYf zbnW>YEqC{BYaP%yVl!?p^QNbUJs2o0QTOhn6K=O_TowZ#xyTEc;+1B z3`$3|DVq^Ww>uDRlOTJzpnc|TZ`t}Egb(;VjR@S|V^K3&P0fD&S`QdE&csaRVnc3d z&f^2mW}GRJoh#G31R8dXjg8WK@p$YW?iaNljeTDF_%1HP`Y-hQzPdJEJt>=1g5r6wYxQK+L-;radb>hNeVRCyYtwLr;HJls3`aBu&h@sFo8`& zb#)8vHnP8v!HB-;5M?HtE03c{*%LN%jSth`+c6eLj4?~>huA#CemLse_ov_V0(|Vg z#>77dc#W>eI5495v15OqMH6}@(Oo4^>R9YrUE7&mb5Su{vhL2hY#Dp;>eZ{IcTBVj zw}q=)soxCQx6jXL$cNtw3SJ7PV@6uBLp3FdGrP0I@9HS>wNF!LD4 zj#Mv;WHHU#DmQ1(p6!U&DOl(8FQM|95Llx-*fM3<;m#dpWrbw}Q(__}TUxToF7I^I zs0YdwYV}RmR+NwJvQy*G&C-;-^f6srW_IbaWz*|(uipa;L`Q$I)_T`u+&J5e;@LNG zCJ1LAb4t(P)RFB%ccUKR_0Y!&0Ty`fTgRh;RI~fnt(#M-AHFUqbqJMi9C9zuFV0J+ z>|#>>U0_wEY1P-flLf_)Ni&YS?Xz9@Yrt%OA0L_28jxS-j z()K?%!>kwG^82yX<5%Q%x*WGJNI^0XzFlF%Y(k( z8AXoou{JEVW2)Wv*WCxMns`nZ4_?IV>S;49N>;jr)mcDwAEBXf-1AM(xJW&hFfyJ) 
z#6G|0Tbt+gAB%q9N=K(6-fG#cZ85bs&YjDip=)5!GS*Ue{&BN1CB4@Bhb#R_Z}fWk zU@Q%Lk1I22H-X~3w|&NluCXdX!TxT(V@kS6ew=FOdt&z|OBK_7U5EZt9XEw>3ud+E zFI=D%^x}Eoxbub0#Vk&IuBmEvxG{KYgu-EY{+%|;p*v-&C-Ii%F2kUe>c>%372mtj z(#XhZ@c~d~tl+!)H(c}8FkM&0^y-q|M$hl3%VUj3+V=NY`zn5_2h5M&^AeRH?=Y{m z?KD99mupk|_UUsxEiLq-Ly+;!UbnYnM|l*=>vxYHhH)xv%!Oh{(l40>eb4w^YBL)YN2=kVUpQ0%~q`& zWM+CDZA?9J5exHD8?dx-uJPZ0grQfEt}c3KR!HiMvWT{9jo1|J-+XsvQTzLY9A~(jsMTOnwkl+tXck_msn<@TE@cgG$tIhY%olWE=ob$$J;A&+rzzhJU zS9jqOl7dp_l+8W=8kG8Z^0{_@Ka*KvN2kdxH3mEV1!AKOb$xc-JVNX3vf}xxXHE<}HcY(yc+c`1f;5emJyQFLl+s@O!XiYfy zut6f--=4vP9r~=#64ZI`KNPowoa1kAHQBW3-%!8Bd+Be~xZhsRS!fP+tVz+wmpr}l zovVC>7jtiPIet>JE0&9gT69>r&`-;4KWFUp(5fA_4+Blg)TuuS{@2>}tlPf5<37#6 z)Mh?DPHR`|Qx-`4PHha5f+CzEM-49Q??dbYxr3cdw{gLyd&i$?YTGv`XvwOj2I=ZB zM^rfk$_vLvH~JU$*N!$`nlZW&g2)|zUlrMFjVyXm=JPiH{=o2GAM5dA zCiRliO-xIXyA%EGlF~;ee`73k#Js~PN~62PwMGREBf+iZhv2wh|8WTQJ=934+}eKO z+e^DlT&0)B`-C!{X1Mu~%Ntvzq~Or5-sy@L z4w_>JYbB|InNX?7G7ZXuFK_P!H|nt33#+QTZ>n|LHIL}7ryu(Ke2|S9JIKVaiZM&* z9UxcRVH zYG1a7OU+JIU%h6X$H+Pt9-f}_=u~ow(%3F>fe;+OnUZ79L7%y)dgpP?;Obq@&O8vd z*X-l=mU~vdmD^+z&lwgQD@;RsORae|thaO?l0AWs*Sw218r-ht+@IYDAZ*OJ`FJ4Z zu!~CCmM2eUvwO2SoyVK%pQ7g3Kk&k?&ZA?N#m87L!^c)u-H>l`QH_J9XMHVOK(4s` zrkVm(paa?>wHgbpcUx>)ACDZkv@Vvy=%~hz7@>f*MsL9vey#fcG41$@CFhsrD()(L z7-UlbAo~fEJ}O?;(#i5c2u74Cad+Q5%*r}$o8HVjDe3R`-)bEFlsy|_%Pb}lT&n1n zs8KW63w-wLoZb7(#|$p895NG^Y=ePj#zea zag$9&P#$mAg<>wXoi=?eSc$| zZfeWG04WyD%yzgLZ)OR5%+aG)O6R4zT)cSkC0J1ZKfAx6xe^v@&ExvVPc3d$nSnfK z&>(exW57{wssvf=pMhv0|)v~A3q|yY^s19OaP9>v$P=&0H*IG+|J))_4Q@CHkSwF zI3Qi!EfQrsu#aD8Xln&_?p?fjGq||8c+J`0Y|Um>*!$vh*=g7fjbFJkJ*CiDnFfmP zzuo;EZzd+3wwuSPg=`pp0c?d|Dn{|~)%^%Y)R7z$ndUlMGj2yAgz`&_G$P81zegY_ zdh+z?4V{o7-HZGn&`_z*Ug-V>u8p-hF~cXld-o0+H~FRy>;@3|GRD95M?_3N8(iV2ZoL zrHdC8Ufwq@>j-R((Nr!vW78U!+Q{Cw!efDlpciBdm8#;{eM?@{;Z`BS##xideXBik z9ZKJ_ErmQI5RqH1EA}~mPky7p$OCJdBe)=z7~Rk@(_S(7AAI$GewaIb1hZaPA#l*9 zM{KrlMQcJ)$(7pG>ZMOSk}zfL?Yz8(oV0Z>;(PAzYnRl9W7>b|&_vghJ9kdncY;OZ 
zEx7rP`zH5(eL4V<8C{Xj_U!`0yLCx`HfFQIqA8VrHVL`6Z>JTO+GC`_4E9;?w*C`l zRA&6U*AhsnB{VlPCOg~~oejWd@o?e(-d>LyE$rT(#p0~kz`E2T`0LI1D2%A#tJy;T z>QC}**XA~FlJj88t{ViWs)v}xUKoV^K4{X4fc_krXR_@!{gsO8*`rx2r?T@x{My`G z9AF6=b(7)Wzj5A?Tn?-P60*bX^cc(D?T`#(ABb77`u*jmLt?F~k26UVLCjwF$`(!o zAr74EQ5Dm5f|`(K*W}*fPE&JLFwoc*JX1EeQ67A`|1ZlC+Sgk9oj@A!^VRE|xCi6BzLcAhUz zEm>bm%;6pXa*O#Dm6a!f*x-f5Qb)E*9NxDmDAplprOe;~QJ;r-1f2&*1vKqU%QPSz zDCcAfEhVdAWD&E_lUJ{rZ+K*`&MF$1-N7LW&pTcZdE&N2{O<=gui^EVtAQ?eT{5OcZ{Y^D+EE&BoZ zQ(!(AW`Sc~C76^o=Pkk>MYus=c5Z1b5K+n0KsA4xw#Uu1RD8yC z2{e)aS2rnfoArPGS^i&;88Ga>fA#l&+W4@G`hS1CM9-jKdYyN5{Eau;C1JfZ$JNz0 z?p5fiv}Fg$6~>1wFx{EgsQ@7Z_9Ju^kPry9;=Znaer+PCZHU?2lfHxb_e4Bct)I0s z-Q1r)K3g`A&jYRmpuWA3sfC~4qBGrg+s|4)IO4ZPvQbAqrCEo?1wSl2THINFtY4$h z^y#*?w^on8dm{7jhq}?f&4}wZyW1$(<`N-G03D^!qx6BcInEi9c>V z<@(bF59?{U?JTG|RB|`t=ia*^RZs1{dA&Fo6;(IZa$4cJ_`>1wI}2C*`$O&-C3lam zDO~XS$C9KsE>q%+!jiMU*}q$Rd)`UQxI}|&XVZPj1NzK%>%uwzE8H8pCWN9{X5$5T zrTh_z@B7MYzVyx6U|E@%Q)mOiG`50 z`cjCXLnxfLfss`iPemn7Bdq}2a|yP25IZSl9Bbu$Hx5|`oL?0fQ96$hayx!|2%R24 zMZnZG0thmTl{GIil+7kNeGF)28XeHd>FrLl5)*0dC|6`D3>N|%CnLvCm|ptV`UH;@ zWh6-0PG#blC)n@5JYww+bo~V!*}va{8Lv(u{lRY6Vl(N-{#nq9 zatneQI6xD|!iX$`@J>{(i4HOh;SH8eCdSXJWkA$0Ei7Nb;@u{+_ox<8SPdt%J#G_qV5;ap#%KGZ1?uc@LtE{mYp}(qQ`;>wMXYdj0k7 z1fLx{WHpJct?hAS6eO^bJVAsi&!J&2P&dd%U^1qRqT<23lQYR(`AhtA1D_J;%Q=*t zv0f9o$W?z4!r(&cFRnPVFVsb@kJM4dj`klH3dT@9*P%mHl9lGlD!q za0z$Ev0+{5Wf*W-Bn1jFA68M$ny^m>j(JyF|LiZbGqOmO{Acv*<0V>&kGuewkzK@m zrMhx+Qc_QW+))b8D^zpDNd?$J|xd!2ZMqJLyorUP4o6I<-9_{k3Jix`6VQx5GM`c(}&9e=HsaQVD} zmoO5x&GsB5uPQ?rT-!FrWX_&E$6K7u0JZE4rzp+k1>c6p8lHc~Nj6n1oL$NcFSmIc zr9T5F+7HjHI)Wara9Kb{qt&})6!^u9JWg4irH^_u$_xGbR{}{rE_;CB38*y@ba!OL zej_`JrTJR_^Ho=JC;iYUtmk=UWYG{PtRL4cE_&=iU;^;SgyiDKM>PoNh=TfnR?2_} z_x(G)ux!}`)fPsG<=(X&86E8|ri+juj2Yf!3^>)!E!cC@VTKD!Pt0w~d~i>S=R=4H z(6GpbF;0D%_4QH(!H9QdAs**A`nKy2h$c}9qe&W$Q-kCUD&-qM;G{`}1t5A`7&{S< zJ0d%2;rOs8p{f5clP+V`vM>7jkPl8JF{aQOLFf_~*51Jh`txnWpUYGj@}`qRmxfJPld&ikv+r0l2EuQ5D!aPm_@PYfM5eYx=k;8fvVZUERJ^aaENH)#YrXBkqc4s 
zf+*wl5qEztFE5>YY#F{ztVVK`e&H+LQep(h5Kr`~^^}kI_-yK-=R^{~fKRukOv1Ej z94uZquQ2FkIgflEEF{B8pP9#wveHA1_5^aVH$%Oh^!1BNJBG(>q!3m?j!bl!#D7`` zkaX^=l2}AC+l9F#E^yH%j2$qj{^ZcM4&_A3a(PHF>C0c0cr!hwszA0D^g@Q+&;g4l z6hBeH|JBb~<;oArYmdU+F#jtR<(;JIQ{8Il424BOupkf-gVY^XKdFQL!etBy?wwsb zfb%K)$jOP~uY6+eRluHTW@Lnl5&n`B*jBmeRJK~pWfC7LYr7iFV8dNpb9-c194=$N zxtQs(KuhS)2RH|_a;@p4Ns~IPeQO}f-t=_&^-|q~Ify=+6P>|+5wGs*cM--1g@>C` zs<7TljG`29Shm@q>CtJ%46%l-sNuo(VZN*5FZRXl^1Dk`I9&+bI9b41Jw zV$n!hJ6GOIWE6eD;RGp_m6ksb8nKCCgVND!8s=p!@Y|U~@cOyHj+5d6wNuL4V}iwe z?j9p`#v6uV`7hO!oHSysyq&V&8uZ5T_ypnrho}|K23a0eU+xdbtRE_NBc+A-_^|K& zX3ac_>DV$rMT=(v44N$tlptkYC?<%E#}T@(YrQGv9k}HW=A-0Bd}(?U8zeD7>!hV6 zJ19w~9z>&R^`GCruTEe5_&PBjHvuPJL1Y7l`78MuQ5-lq#a#Fb5h&>&mS#`C1#J|7 zDdqQ*Qve+WaF>)z)RD6xDkm`^fUHQtbp?(|e$7(0hvw#bca2=ViT6VgaK{Z!{-X}p z+skVb!x9P#1>19l*RM}}+!bvodK2IyaWEyjO@Nz>*xAx zAT-PatF^NTI-F2hPYNy-AE5+xtBGkILNEfJtPyy1^YW8Vl(;3gTD~N#O2*+Oy@~ta z=F76hihGb1SFNgDUmVgLWRv{Y;u_c5~Cwq>vvunoya>Ia9LK4Frn|`nD?J^xZ zn!a-jRBIhJgKK5qO2Lx^Fn4p+9Qs3GuAsep%bX4z)TP@!$*yyaT zf1`#CEn!b6ErFB4s;~-FuD1_YNNlX|{2+FT!}nH@!DK>^-BzpJwzVpqxK-g4$ndsh^63q}IFq5)o{v*I%@Z0vKZ!0QdhRbip?Rqq{Gvh%#qjsea zfKkAc5oP{+G}-qso8tCo$uU6jDrR0~f8>b$N>D&QFSF^LV0`C*5>NQ{^{4yyYvAC6 z*mY#ArS+Lc+~@)K-qTmGezAz)Ig8r?;K2<&J3?x;i4!N<%e;PJFa(k;s#2<(w-*Zm-5N%_i_5R_aA5lXaUFqA+*H#vnVRyG?w)~xB{QWE3p z&=)|WVgI!r9tdp(W9H?By>Wef6>JWc7azS#+|vz8GAMul{+ay#<~umzgJ>TPub*?4 zW6qu`JjqN!;dv{dS*(V1M3;ddhI7!QVpX8fX)J-`&gh#g+_=3fw=4eM{>aFQ5ZBN$ zlnd+bogH=PAieLJeX#uK?2CJ+4jwp=mup-9bpcm{p3I}^-$YOF5q}nF@GBIy-5 z?h;YA$Am#J$Zo+WCUHb$yReMm68Pz4^VghxZ}<4>>4kTk4V}}DRgs6PBGk@-||7ph^9dscx=ece=$| zrNtk0U1$By^5jaFq$uObYi~VT>#gSV&xx^LvofE3yZqz1vX6S7fgkQ(O(!(=H()Nv{sP41&>q2nQA8$_t)>*YTmjY@9b zWLVpPeeb+$#Iz33P+Z9iz>uQCNoyW?R99Eihrn32{f-Jd&}vCs2z_Y={z>?pC*FZgX5uGrD8{}dA5`^Ao)@$Ro6KDKt{sBp^@kHb zclmLpxcFn{*kzI5TI}j}kGU=}AOZ(IYMe{_~Q=k!;OA4fNSUiV}Y52(xoP!#&nxc?)IrgZ^wW>2sjEZ$7l6U!{yKvB0zB#(C_k!xa~E><0*YM#*33+gjUT)&ss(BJ?i2$$7VrwlW_N0lzh#_i<$>(%Ve}6V 
z&TZ&OWv>9HwUwS6{p+4RdsNMIR{ulwOP}+czWhO!M=RxK{XkkqO`5cmHUokGCB8`9 zj5GZ^sYQJG@`WFifr&nI3o6$3!f=rn0iQ*Vp)u)>TZDd2xm`QG?0Q;eZHX^)ukNOI zXu%s1eB^~fola#2S}y+w$bR z2|TOJ4S%w2o%&I6bJTYD`{R&S2`uhlhEdNmP!o?l4F=MwD!?t_6;u!&BRu!3sa)e0 zqj6`HGFcMHEoWs%s!Mn@MmW6L*8*Jj^3|)QuS&F2=6pgnpW}Ohar8>U2jhgi%szO3 zK(+IX6)d8Sn?o#i-r9a9d3ET}p_31-`=?z$*PQ>9@@#5!z;sPve56shm_bl+Ac(+) zFLB@tK%yxu3rD0#X)kuMPcGD>yXQ6K{on3*Z06a)UB&o#(hMRvMX~4Dky>WMfcRyU zmAH&cfeBlA7!E7fxch23qd7}IJR17%#20dlC)3%=7674^N$&6QPMT=hlH8m=3t<}=pS^ygv0Apc&k!K=sej?vj2Xf`eO8p^)5DEa$!-a*#2xyNG&J$b_KHw(zvxx@N9PB&Z2<2KdIq;)Fv>=Sz zUI&6WxKLc=v!KjWWuzk8fK2e!f8%+eC*T_R+>K*1h6sZJ@Rs3T*`iPQ_(VH9Ub}*@ z-bDaC^47~F3uOa9rh=4J`m8`=r09@hh7Q(P}ymFIc&05FXBo*M*god#tr$48tXi{Rx1Cl$QE%~4a+qZ3N zM5BJl?S8uVi7}?CN||lWK-#DDJ0C15Al^DbMNRFBd_eKRI`RRJAJ0n6yGw7&@yV4B z;IYdGfS5&vvSj!XE8sTLXiBnW#vc0>kJzlNBaC|$K0DN}e}CAm?YA>(Yij|*Td&)ImloNIXUG3smaOIu6^LediN<<{E+tz4=sp71 zPUbCuqcKMmn&TTJWiGG_E}ZQ+CIXiM0_W82C{PL;Xu0Y8Xg-Mm-!XTKIsVPkgvJuD zOF|VRty_%_tayFPhxrjgfD$qM5=@7MUZ@~-6y3g6hxGegcysW`k+8)>L$5w3ooshM z2L$z3U(d-P7A?i~G5dRYQ-h$VgrE7$PD&J{UIUHD(FQO3DfI1iWCLUj2wq(pg%$d7|7L2-D zKY256UAJHb2kSLRq9$*Zc_=U)0_Ijy4KSqIkd-~CZ9;m_ibOb17FXRnVDWX3VYr>P zHMs*^Z*6D)&Q384CqRl@9EBy+fbRPPZ_)x^DNdjMXXp#=i9<%gO{=38b)cC<$w2`o zsvTw$33d!d1lae&wO0Bx$hyn$MT}9z&~`h7&LDIEOqa6e2(}4xQZEup)(y{M>Kl@R z890a9`&b<1mcLY{Hq0w4+Y4nwGS=L>Y1o=C+d0`wt6y6aO64V@*16);hd4|yoK2a? 
z7xi`Ay==5uM=}Yub*q2=nen8O41^LuJTuuGe>ZuSELY&op0i*<_v=r2zxpqFU;?Gw zyq)1e9!I$+L&XHny+HhqvuBS0wLNMXThQKZVMO;O!(Tn0WM<|7()_ec0GYZ0F0Hw#~XvLG2vw#=ayGH4CQ#{iZ{FW4F95q7RBnA zy*dIsCnquD?A*V9`uQKYS2cj-_8PAf%|y3V^2{0gm3T8W7=$3Fpr9Q`mgk_X)@)ws zp1Jf{Lq+dV%`V+~wVcuFG}qfIp$gn>5{KZZ*rsW?FY4S6MHs5QLF;}R zQnSeGLn=LcgV@}h4$dLw=1oZ;@80>_B((0HfP!S{u!?rwBq`{UA-z~%$36Gb-?sQc z#sxAgx^9)ncshIFHY*>CqbOyk;i3m>MCqm1NcS28mv})|6zuM4PBO_^%&m!@|U^k7{2;2gD#oDvl6StP*Vviab}(2!ApG-C`laYj8VdcSjm8(K7J%P-%X` znUUeRP630!E2NtPo}>_qCZX(v_%ySH|^EG*?WZ?)dMYF+OAZgqabW z|NAGzM9bTn7v#TxqQ-R5P2|k{?;mXy(>bBL{;xmX0!{v(kF~VbW(elLf6TOkF6yD* z|L>p4T>@2Ym}L0xA0HK;nB)K7T#5ss#L;i9@`oYx1aBXoJ3weyhGv5>t#4>JMkkmm zGM5H|S)UC+Avx6P>@c=CJYJnP6y{S_@nE^Mg#QGBg_l7F1Yl=LK-1C}s`IlkAd|d} z=qH8kufr_;$U_#&RME5$LJra`G26fn;_>kIM2`W$_m`NWq<(VhGi~YPK0d3qpc*)V z?*Peqr)6A^_fJhsRGaX&2@|Z?S`QCLiWq^YMqBjK5x*pxHG z4-EiA+G%$Fk&!c(Dpj5xarow|<+($*e+#Es5@wAPIiB}cssmJ5AnEn!wE*4OBeB4_ z^mu3L!!9&nKyP%aJ7_eZ`gCwXY}mY=d>i^*WHxk{GDl~T?dd@lY|^ypGg?+mI-XQi zXz86s@fuCS^~8(>IOZu~lR6J6?0Jf+(W6o7*vl#%!ZmDd3-+6qXS=C)Q25d53^4V*-yey#+lZ>ntH!-45*>u=1_#`^V>sa1!yWDbaf5E)|Asv%qcntMQBU zEn`f_B-xx~+L#n5TT=htgFa1FH3^(R6H8tecV?l{C>Col2^m8{LVe{+s7S$-4KE1D z^A&<)lcCkxx39P9ct_fCHWMACj*xx7A08iI_^ud959b2zKz2zX*?Y#iM^4UlN)^es zq_9e^&~Wi14eaXjv8%>27H z;XrA0;LY;Eqz8adgb#D@@35;R_5xXnbPy>I+{dy^4JLfeV%he->+dLeJIXYEIuK$9 zAQOQo`MPs(zMK!5Kk9#gkldxfDZ#sUZ{^WK=UsVjd3a0CzRs)Oiwu*FAUu3pcG0tE z&nRN$7dttH%zxBXM+c@fXMUGekA@Elb6>XXbHsqVSLZ?=70&lLgvWrXOhYVl zX$qh~1v=^!b`>tPnF96KNS6B``=s!ETc=*vq`)H|K9cb4?JcBoB3V)Z59q=ayWTI`qJHS3r> zZ$}W5r7pNDI?aEXhk9(ydBNcf={bEjl}pOfXU62at=KioHZS^ydBE7r9*cbEn0@9b zB9Gd@C7lROV#iGq?L)&rZKKuXhl)2E02#0pHj=~=A}1+9@jtyMm#QC%h4X9mGC1Op`#$Xiv7gX0X5{nM=4fs=7VB=7h?j}MxjvQ9&;RJpIIy#Ux z5s7rVR*S(9N%Im;Q6Hyhs29+eFPVWPWA|G zR*&iF;|(j!O|o>wdlKbK#`@a9B>M`nj|B`Dqi*OeTn@Kj?#{82F>>+=>3auCM$%4K zT2vZ22kc$Q=OP}4-hYP7MnM!EJjW7Jmtdw0Xp#;w2-qw6?C4}oHi6{x0e&?$1VE() z23>;l!9kvur7iGCPo8$NA$1DEIK<;utERnwO!7wW09}jipLUMv(pm1IS6Ljb>k-=sC{{3Ibeg_^l6k45PU>Ae>kBW+vSa*Q#(STpb 
zcFA(GF?pcvH@Dz!&m}OT9OfBb<+B4Ud&%`5_&?v84w+_qg+%MYrW#F*lONYK%1L~7Ljj5EL|D&bj+zf?5 zBtXz>|Jj}w0X!;~1OfVr-KyddEc4x^rEIe=KpN70tA!b-hc#JOd9CiHyS2X^Z(Z*R zjjiCoWEXHuQ!*dk-~>4Zu#h4eIMj=z-zA93gg^aNTdO-gy$23T1BVRpt7LM!4zaJ{ zL8L$wu=*@D<+qa_Wsv~LJyBV^uIZef#w6qkdkyDsOy9DW%k_3VD+dZ=*eQj&OJEo- zZwOiBFyMoL5wI|zYjs9i{6WPd5t#)IT8QMh?wcp(nsXI?HJ`P#$kW>YN*#P2bwIsO z@wRE>#yVJ6W1vGv-lDOw1~(I|j9&7=nN>UKw?V`feA+`nfK<-s>08h>kLxZ}P>N=g z*6xXXbQk9N!MM-Hrdhf$%@{z%!udN?FMwzlNiWbYC=hc5);$b$g7b6Cg2^rdSnyLf zNLQcsz?&ma@NrS+X&1KA`N~rgxSa7+s|f4zq?WB(jb=#8I^L>fKpZ*_`a!01=) zwL#x1PA_zJzQP`a3m`VK7U<~oQ@!Zyuecj6Ui`u6aZ%xohM#YJ-nC2hj>A};;K@W( zyYz+mL_B|*QDVAPl3gf`i6eY13LJFreeQIjSq$Mb?OZX1ugp_xLa{mds(i8cv=`A&>PP6iHNx|Uj)Q?oGg!?_-o zi{i&j-j!fAO!wB6=bw$LX6gjD9{7PCmx2`3vBJQJeJ~xVtUi4>GG!o#0F`bFZth$n zIT?7gS=Fx9&q!{xMUiP%hb$_>?@yYt>q)KNrVOoAbJa$6N&OEv#3v{3%+6lFG539^ z52qK-o_!t$fvM0JBuYv6LWi$-Z3Vts*TlWXdRNRT2PQbSP=U$*6cIGaqn4b*lnp0` z2)T_@-iOmI?E%diNQ83FT~hQBn4h048s7)P$2`i20w_Zr}BKnZV%?pl<@yTUR05Z+LoJI8GjHUI66f7XBdGLXQ^z8X_85I)i zGyc=V07*1mv;aTG*TL%D3*jvcyc&=#tIK>;ANBf_$7IyFUSaOM?*;|WfiA7$t$gTU zRlj}I(t4!Ei_iP%SKXg?O7aqPZ12{a*4KRfdpdYroSpjIlUV*Dapu|68NHLNx_H={ zEAV(Z4$l|-{0%3vRjK23GPLu<)BPBr5ZH+-(C7BYie(-fAFnH`ySKgTkBY~W>;fXw zyW~Ir?D;rsucmpH9+VGWS0zA!Y%rO~q58oc54`#u^=|IGhN9ZJ)36ccULGeRksrvYI zbevMcn7F*FSN-hLTn)J20+Eu$r=>c#3%jwhmDE8QBji(VMiq{IN+pV zKem9`ilH7oFm0`wwl8c?K+j{r!$AgN0)QNH-Jo-LNK4>_sG<=%u3kJ;7b*m%1+M=h zH$cMuyRx)sgkzRPDVCf56&Qgi`TQ76s}EYS=y7)Q#(rgIG};K2LY@Kmb|6c0iZS0^ zO@FbS{)kqQFEPj28Nrz*=7QWu`j;()5$rg`K?ka`o<6)r{M6(Cegv3`zuz7xwxRH= zK1ri#sg`B#=R>eLZK9}nkv(Nzzz;;0+@l-mU79s(rg-gC(CYp~R-^!RU|N8``CP94#AA!Lj zf0v?F=NO4k4&z81Vt$FBjSJ^X*GsDBJvHtXHs#Q$1}VKi1wXA)Ngjt(jZ+$w`1ub3 ziZ;-e1)8ii)Z+aicOpMm=jF_~(+|W|tzElvth10D<@W2u7}LTtT#z*3Nj5jBt;mAh zYp5ZP-d2c{fPjGVK_^e1Bvv{c8j6Nz-aMUtuD6ukXKT-l5G0qzrm_Po?bwf$32E0S z{B1;-bz~j{7LU}Orw!>=NCWttT1cjNaSjAhV3>g6)IL0kah#^`Z8WBs`@7R~NO4DX z?EE-;Crc5q)Sy^Y5Pdy>86m?uYaC0+WIP`N^Vz zmGA79*3c5LM>MpXGY!&iQo*pk;eTep;8j<0b}wRE?SE6fm-%FFpM!N 
z102JNJ4A`lUb#(!0fIOCF_6-I*}OUONEc$8W`A8LT*pH)Nb#ewi$%7iyv2}6hR!2 z<2;r-OvOP+q+=d-b9U(d{o?_KcuW*)CO3SOO;f__?S#D{JXhsxv}A{W`DrHy$@3Xt&!hDD$xFbI2#o>-a~?)+{b@hQnO zhf+|QaRinseM{;Rc9kDyx}r8sbs{OmXy{PwxoKNkwrf`)s2R<$!bn?~*$tv~p-Y~R zZ8BG=&$y0f&(n_@QTvK-?3pv?c3*t;Qq9F|-0)_xS1 zKJQ*W>e{S%bJjEEUbxVCl50d{WCx00hne5l0CW62-dre7O{ev&Kla#hTsnARWX~;S zi4 za)&xK@tIl<&VLO)_wA%cn=Z-{6XKUQFAWDwbl_L6m>ATbIV#GKfjMc*kZKoBiPfk- z=Mj|>LdMyAWRhJP#=5~yt$i$oAg2}qnP*Npu<|7Fyrrrt%Pnt1kO}U%XYXFK6TV|o zCRZpX*d)y5zDE`R)*=cytQf1Tci-ly#YW*0zO4(|wOU(fP~LUvjy^9ux29T;s24T1 z?E3Wt+tu6IJgA%(;el4E4PcM1{Wpq7oac}p83ghZ~_wPFrQh|hg*Osb$&>j1FpA8c> zBmlr9Tqg<$44*wI7Xn5dQl9Z1zzS}z1MADKIb!MsNHUy0#GDvjmGQ>B54byNCM5o& z7ymVIoLTPBJFaQcf$?g<$A0cNfBilz{QEGdpR)f)Fhi8&flga@5^#MzHWYOM$OASH zBYglMn~mKQ$}%|Y{;c|Cv;GDFm9+4BzDIc8(Bxhdk3_TW3H-Blz6ZD|nX`h&A`?821ZPcwKQ1u-W~!lj>;Yg{c4p5Ag)=grM;j3jvcTR zp2rfGJrOrPz}$QLtL3LF zuYU>&Q3cupBFMRTG3v_uIdgD;sU#s8(o1J-`&0?wUR+9i7 zGHIVWbB4nW1>xNxDpYqK|ag%Mi zesk^+P)+?Od={r#fAz=N*e2`*&|{kYFCWu^J5ON%iW2EDo(zIDCuF8xKVMZZ{*lBS zJ4*k0EHo0ddq}&DD)kpEnA~eWx$g;=4!VcKo<&N^PvU)(=FP|U3WITF6(z}matrO> zfv#UosXp++aEPla8Nuc7kJMwKiVfOt$pjJdLw6VFaZlFN?b>A#^KCWbFl10!1Ix&D zw>HgcK%AT(Weq|n2ZFQo4;M=_%!L&h9VQmbDd{(A-uo3EYuD0w&y>4KJf~5;gn58G z79b&2k3(UEWEpyeHc=3oiRM!#9vO#d2m52#uCRN)9=}2WNlKiyZ)!bT^Q36>Ti@Rr z+K>zdY<)R5H oZ;4>BIvQyiz-~fgfkOJ;=XKmj!2%M(&3X3WLlGBCB}8ad zL2Acf7w9Q`Lt@tJ)%lPyG%=tKkgn^kbj>t)GW_h@y5WI)J8z)98+_v9}wY6sY9`XJ>{Limo zhIA&|kWZb+T*A;o;JRa4LCgtBz7h!+$cwlI6mVcsIw{f7%}sWZ)p0I~X+q&c`MSrq zb@vmDn{pnsr;Uv&?RLj^c;uaD&mY8Ame#jM zjVgzL98X8t=1)4^sQfy+G+%nfKnep6NVi$pkeAk@%G0P)t~?7rW*MsptuT)`7L^E2 z;bj@Tq35UflM@vez)8F$I7Yewlt&SaLGKf8JufpcG4kgfp4iq!FHpG4M1W~KYTdw_j7NBT#3}9c8k@^pEau7I?m!y-PU?m-QLeJ zG0AGp3we|zvQ%mGsPd#P`FbiJ2J_y9L}_T7)sKyCwBx^O2L2&u?Kqz=$IY_Z4UWK~ z!ke$tCp9;mv&t9!>&Xp&)&uBlo_NF&%&9^DNLi+fh_LPS$x`-nLBJvZl{_f=sH)hH zlH_(^!|%>Ldb|yXDAfUMe^8c1!L0gd;u!Q(`B_2Hp>)bgI_H`VuJT+H2 z>CTg(^wdQU_fq^&wI2mMh+OiR2$RO6Z|-fdy-H z5Ew{Cspz%8e2JOM9Mj)FQK?-_myO-=*LlKcQ*|)(FWyDMhB1#KBpX~q73wyFMGpqT 
zN{dvuQ45U`w5uH88*x**RxGik1bB!*H^;Fl07TJFLT8*=_pcfArz=qq1-d6rn#23 z-^ecNBIQS2n0xv1k&MMx`|{A>woHCh{*PFuXoRJ$A4SYnl9{i4uSyQgWmsh8kWv5~ z>D&;e$@DFaz^GnfHxsAz6$%wq70ro|o5Cc4I}p;UBBJ5!xGReuW&OCAuKTx0XpT6F zj(w8KUhSFf`7NM8dl(@Hc9GFtVhXobW(|dHbIGGkGt#a+eA)R*{Ec{4Gy~NG79N)6T6Q%3CMYC#SNqWl8myFWDUJZ4&Ix@XmuKh%#SyYsK{zJj@W#A0L=E5!m{@207Z`$mg zqq#*QfCXn3wdbkDkAdCV0lXMp<|Ot5@wooXW8HMIa^cn6v17*`JjIFeMGt3zY_OH% z_pho+U=tLEG4yfe^Eo1a>$H9Q&Q+(F|0mXjIK@PnFZLp1B6EIQ*=C3@eg}MI(Uan2 z{fq$uoa+l(Q9FY(@o8&D}^x(t>JOpR6;ePzW+y%E-{Mf}1qqM!wHQ_)UeM4_|#fn1*MTYYFQ#kL`ZS4eJN#QPAH9D!yp&wHi z^?f-n1i#hTXKm#F2+DlhK_O{T5D3Ld;z!_Ed##qXdJD}w*a1UHpBdWy3|;+^WK;|f zvp*u@vF-IZ2y*aykn(z)E*b!0Qs3uFE)WeVz=USEZrhO|^r5wKy>;{g6VSFP4r7OD ze=`Lc=fm0-|Jy|?jVWCtBgq`FRfW#tjj z`>oW2y^jwl4MpHQMh9_IdsfOQkn6Sio{U0~BTfPIR2Kj_kSlElJ@CnOVuX^d^io;K4FHOo z@ujY=sv^?&#|0T^*ok&efpX~r_t29qLCS#D7iI|hpu$jqoKE(ldw(B_gp|(71=SfE zZB+O>NR*C?7x#YAT8E`#WN0G7INTk%fJ2~5l&hAIRIo=ysBI1mk_|Q>Bi@YmFn@($ zmg_1-h(RhnWEQH&F`t8a9my`(A-5@VIQmSmdoVG{4n&*vd*9-gfJw>H6qMX z&iDA;K_!{2`iwmRWMBX>4aSV8VIK~oiaK7bEl#i$N=6H%wS4)}8gcoJqe1u6X#tL& zUk9WaH|?YjwLa^@hThyfosoxh7A*1=UoqeqS;UKfFU7GDW;Lj3SPE@|f<}NFBrD<= zfJ2cc5_BRmAt#nx;mO1}7rg!UHmtVIak>RYKF{FJzuxN>^VmGg9nS8SYxRQIH)qXF6c=>PU{n!V z>umZra2OF9@rF(0w=s(2Nbr#^i+i^I_mBAy9DYvrNWN|*Ox=k-nI`Ef;Zf``&i($` z9at}ycZqHWUh|=+5^3@h&Oba$G0}(rF@`vCxe?WXz&sZfJXwSh#`!3>BE}v1&Lo{W zLX`uapG`Lo^5g@=M~1;^faoid$JsC_6O{xD!TEqZ*tpU~FOd=sh*_D*4d6Lq7cZyi zEWo>LJAb9zW^zFm?hsWrauZ3q@Q$`tidr+9leu|%GQpRE*q>5Y(j8zPK=V~FvpxEA zMs=7QuMf^7c0&rZnaq6k7;mGDx+aQJhB3;T@TLqRwp{n~%N*+LTkxfD-y@EWFmit} z9_a=-QK(3-*}Gh*S_lo7a&yP_h`a>>A+0F6LPSb({%G{_5FS#`7N@&Shp$&)vAyzI zF=0J{9%6lM^;E1?%o(~ThqL~8NKsZI)jBB(p_?D`9tR;>K|%f7517XSVFal8g zAvB@=l1VtY+3l#>pTJ>~H1WO}p(LFXBA}$%`$5EjT+~ zGT+a*x`Gx(Tj}N2VxW!CUfd$ZEJtK5H>PU`a(pSoV;igcR005uMwU%(`XH1#;p3`OP*)B^x@yzDA9vuIg{O&YjF1~~|V;^)$F-ItaW?or!bPfJ~R zFNmU5BwnKZG+Z9q0L3C%`|I}MVQitocoNg2XZv0jf+XlM=KNI)_5-$e8^2n=T^M1oh8RSh(=*w&j{25&_C1)GwQ%`j7^32>Bd0#=Jtjvlk=^(rblA 
zW8ilJL5x$ud9sQRb=5<#$k}Dp(GZvKM&IiE@2BOr56jU+^Gf(#Oooph!MeKuY6T$v z1448rC~hZ`N#M&oo-=o#moo=I1dx_=hu9V5Yy^u;rcBp9JMtKJX+gGRT^ARUN`SKw zZ!*~YAo+n?{~KY7u(Y0xH7u;Ck52Co_Ycd*)5^r5Mm&DdF_Iz?>Jgp*Y1^a&p!lg%_?QV!%e(Lue$w#hk)MNXauSsu)QQA!K}$o_B`^ z2EIa-kZWrVF%%kXWW*5I9a`akd5ic%-?nzv&cA;@*^)K!M^v7Mdf=_tg zvSLB^!kGNFPi~)=w>PBy6o^v&oD>*s7=Yj5s}UFkYpGuRJrtZ%zyL@HA`S*}W)r0w zRs;zhf;STddb!8O?PU&zr20di5H`$G>j6WMGQ^ICTG0Qi?igKqRBJd`ahyPa8xs%- z&{flfy^ANWf%24$2X1Yd{YBJwZYxWY%$M*i$cjYkfW4S;+9p0K%E(t$do8eYn5kJ; zn_F7?p^qe&K_YX5PC48xKpuM3wCVlfzQRtg-)t6UFM*c@pz#8XC=U9Cq|U=NF^>Kr z|84l+-^hOU2i609v(}4BHAz8I%VM;=ttqC!rVw($ouOlRKztNRlJd)}fDHg(DB|yuphv8@lPG;59$)9a4SD9fAb2?) zSOhA%Tp_9iJy5&7K?XEMo%lW|oiWjEC#D)W##e^(OQGW+S`mDBlE~qVq;-;3j&JLH zeTK-jnv2LJAe{TS%Vb~woFu&~YGz5fa6r#Q%_=8XajTMKl#t#4m9PJZF)n2m`#+w& z*-bVGzy}+5pOYO{4-Jh*_2mo4VQ@phVFetrn??$54=`f8S)k!gY@WmkWDXXjY+THD zb~|cN{2ro(0bol6UU&h---0jT_xkjYM#8>8#6`4<;OzYu4+Gb}gW?Ur(wk7zC*sdY zfCE9<34MqyOl;ngRuCSpDM#52vTOAY=1=^A!zo zuydOyHWSgu0}UXiUApOe7cDG6V}a*AgM#3oKJy`1uyOj4kprg{=I0UOXcBRLIui$| zDkKrmrrg8N23}ef{(GROr;Om*{U<<85sm2jpw^Wc(tW(v3&tH&py^lnjtqptu?~YX zLRj7&xqGVq`LQ#&vk7~EP7%C~JI-Rf!u?eiiO-4PAMyqQ6yfEQn-5qEw!=y+Y=0=4 z<6gZYA|rrw<`5Bg;kLJl`80 zr=^vseiPnp9QH)p*7ljhpP0;vIvonRWUvdMCvgRlQBCTpuiXPxbbh2bGbXa56NHTu zU*#CdIKtm2+MEp=ypQC(&&OONscXM;0^qR8m<#kM`{;$}4J35Jlnc>?9~5%9e5F-X zwgS||!C8sgbsm~KlC}w(1SbSEdiV3bp@M5IaAJjlh!Wd@qk7&p4m9{Nw-reU%eqkF z5F-_S0-Kbx!qux+*-CGr>;vs&hE|VcG@$nKsV+k;eFNhWaq!}X8HZe&BwZ7K9$09z+BXY3;9nqZjB;<~W&Yr8ePu?M>!|ne!9)F_IiP zwXq*MA$-?xyls}tVwgNh^huxrJ4Qz8aAZz0dkSs>KKT-orF#up#fV@aSxg}UQ|c}I z)h098%iXoOm0xRK6 zk;5*YN=W;k#{UsE6-Y21Jaz1Dz^4gEx~{ui_iXz0>nTtMa+}-$BL|QcPn#5uq=yHF zL&%7~YgsnnagfUp^UcVB2AqM&04YT<`*&k7iNTVPKN78p-PjjUHe}VnI^BoUUZh#- zF33#$Q)Y;1lVhUGPRQdbLEA_$NmR&G?)Rde1&5!tHp|D2OeALr-2m;5C=cvU619j2 ze_?fmFIg)!ENQL{4G5|YSjoz6bn`JcqY>yD8uEv%mRt@b+z`XkYlB0pEP%i?AY%b% zJCXMX*7TAvXTShtb_XCYqR=DNKa|qESE8sC_!UPBMMWPoSN~qvR1*CM4PXI|^t+h>&j}J0I>olJ?%4mvcgn zvr_Ul7$_rsyq+rabKy0;1RcXek8Qdtn}}Z#LUu0DFX9!$pd}!k 
zmZYtVjQolfN}T5?jxj^g99IA_TA@EjaTq>V!<0tN7-gB=8rL3|#@aXln#%eIFW5D{bPU5s{2 z)=6iBt^v;4cj$dlpjRMI>X zVuY*5WZfUc9x)b zoojYWitE8Y1q_WWkz^20z;7xL9agZPPh`USCZInlXNe?i_$6RKS<;>{sU3A zgvp~=U=C=9$xs_yZ)@F4cH%C=H3gARp8a0bZQDzmY}@bG$8)ST1nvRMTrF9hhbZ82 zZ_0|MA*n-m?_v0DaeW*@AP3c=`mWu(!94UcYJ46yl0g&(YHDm-u(KCn=i@m(QdLnQ zH{9ZamK^@3DbHEQ|DvSAUdhkm2!gPu9}JERSYF%~gs~-E6Hw}%CmnAW=Am{h#|>?A z{P_Wh{ZMg4;^-oR)|*aHGXayZz?oX^vdC@hPAD6Y>9`q)w;cNzK|AiC-k{fcRMz1H9JuQI|;vtB#Yqlh+a2>+x!Ba0H8UdPr=JZ7n3%ZiCp*#C@VOu%AWl}tn!*<%doPuFnm;@J0nIa+^MKI{g%z4dqgd9 z2?Hpgg8`8J*_+)}H$nChHZUAMbyEl347B1+n8A)Bre%VX9wakCFmY7s$56mRq6OJ} z6tHBH@rV}y1dMd&P)w_Wmj?tJT1h{db`!lX5NT-S*LqMC5XzE(KS7;p3*MBV%Y>$@ zp8wO@3UL$(YsLl2q`N5t$Ahp*;d7uUe!nprYYu2Osv#&uq3~W01>>aLzCB=KCk3c* zm2$wk?dIl&73Kd&Rht3Jl0bvlj)+b^1MDlw=wl9lD$%lGw}K-F#N7c44Ir_1pDjX5 zkUUGee%uKJGzVP~h=wHmb9yk3PKX^Br>w&YsKuW7^WfnLOhJAdjBJE4C)xTy&2@x- ziKE&9{#)Wz8Vq5=I&tju?$J z!(+stj*1N}7c>pNAi?3GBp5SNLIqG)L#$Lp64o3c(y;+p#2x&o3?>vhDdSSs?%*0T&$^I*K1?is~E~m6XZWA4dYf?cil$ ziuy;Hv@^i9aNs;Z+oB2<0{m&@(VJYTWmyZZ1NS3!as*Yq;dDsRQcxyQ`1Rg98uaHirR+n z067@t+A6q<187KVaI8;`oB+{@%aja(CG-&~{(p0gBJ%*59E?2jgki>yAiflozg}R* zNS?H(=USgkCG@v=6PNJFNn2HZayZuus&i-vP#*aLp6Nd%1=^sqqvJIG3Wl)@=d@>{ z3McY^R3AN8MDHR12}E%jL!Y7NFU)L!r0C{3B=vR^nN4%^zr+A5=*)V--jONF7yJm! 
zjctAb{Sk?1#Q^Hgyr$C4n>W*sOgoJ<2_QxdVU4Kw6`<`xUoig<9HDGH5U%nA+=jET^T`0;Oobmm?jRA&}xOEh0 zz`^7Sms)0@z$HdtRlHmoJswoy=vEqmD-nnoXF(+NTlgYzXiqVJGYVH_-<4u@Mx3FS z06>#F7?++Js6h2@=JjT2I%utNBiz7h!2)|`!wys#%m+}IPP`7hFHV!}f$+Mz>rE-l zgqR@b9g0cvqfwip|}DgT846O&*Z!yeQGur;Ewlup*r z1*!~Olgw>_fqOgDmgwzK1dwP&@b?&mRs_Tdi%tw$z)k=rSR;s6q=t662igXL zB0?nMdu40h8>lMvN=VCqdg|G{#twg$_XhP2ab0aTE&4i1tM1oq$UcZj&3Ry(O(4gD`DrnGJ|~$DP01 z*uacr<~@719nl-mERm>Ygx4TURShaYDtaadd;9TnRueL^6vK5$IfXu-fODV;E+0q6%oyI12Zn4*0G2nJLMD~lnK`9nVd<#6~8 zl``Dxgp?%LGGNv?obg?jP^Xd5I0$69Y#Y(T&gd5`D?Q~{7RS34{FcSRV7q|&q}*~C zP}y_LMnQd}AEE_;pp2d@vQkGzqWEkkuV4e9qU}RFa=|J%j^wS8L`rxMvEvZ2r9Gr?GQ!*^4gO3+mt>ISKeKBzuuOVk~UTpYw6{Oo6iwQI`I z^~__A3Ejgf;)TVRAY&u2C|J+$2vwSCtw&0AyU*_v2M-1jZ82yP%3k@nIqZLIPv3wc z{|ER`|Hbwo@dRh^WY2ENbLY>0x%C)HP2e?MVkdd=08uf-p;1x)ym;asCH52K!*Eeu zi0p>m1t1@>#}mHJ)3f{#v*jCYA^p)9Pn^O?!Vx&&ej6nn)MjQNd4QE4GWn4JHxKG# z!BMlm*sPo&CpKa~X|IdpMJ0zGja1R#^U-j-T9^OmuM))ZMNjYF-{(+3-X+w#q*Q<> zfv6+G^wS4}(OU&Sd7@@vC2JiUX-FbhA@Cyr92y3ZkV-W`F`$qS^yef2alME_0(BS$ zSuInJEjVQQVLJjml=1!!j~40vaRgwdSd$^PF9e6{u^Yj0hy104;w%Et^Rgj%L;CUv zAXe-X4@#^lBH&4`A5!-+(cu06JYK|k>)&sO4Br%48w8fa9w3R#8NQCv`Ijem!=F~Z z&zKdHoxuO1O7`NeEm{Z07dHta?fG+-o+Iuj!+wI;QAiD}o>;Mn{6QKuQHM8q@o9O00cns|bn$;7L4} ztlK`?ZN=Zjq*ULXjf)t;W0@h)g~M_Q8YJ?G(Pke&!40S{CU4c>B9wFvR~`>wi37ir zN6`xv%F>D0*X}QN9D@j;iE^o)IES%&z5$yb6K0`DZhWPmjKtScJOV?gFpf!rN{M;Vpc zXh#tNBCAf#)iU%OdpXy2zpq_5l7R@@i@&_Kvs@n@AiQ^S3Ikxljq0Oa4K)dvJ5AFo zD@V>*Gw0dKUVg&QM)KtFBhccnrS5!#5(#Qa3_&NEkU;Dl{HKq!a-l->qb^UrXSM1fz1N0R?|i&i=MDZhWYiktJ%m%*aho4ag@dherI8U zt~S&pNdM%THTTvi%1CPpYE3#w`7`2MT@+=#8A% z=zEEF>c)-J0D5r#lItAh+Zt(KNfc7iu1{?I0eZt1tJ{;u!#KtQ9U+qV!VrApA8QUh zC)idD0#^XEln2rgb^IFi8aE(UI1tlw-4+D|+Cbe4lk0et`0(j*5!oUz;^uGgVq|&* zBqeCM`wwWZ2h4%r!8Ivq5(e=QNk#>oa8}tTQ|2qyO*x>G7=cu%tVnnaV7#@Qx94DZ zljTay7e?pPff|p@yc&D{6$ZGeCLI&zq2;ieUYsD|)icuj!^wys)UL2{_O z!!Oy6kXH;KAwgA*kWHhRFAI~%oA<&MPe!HP9{%wJjO+z#>*#c&PtVB7N7QyjZmWzY z_b71|0D37XY)KLX$}5zt?m!7J_E!$550S0`6Cq~q7cV%V6_&wIy>ux_cPPF%+ 
z(o;1HO?i~IzVv|dj#o>S>lc=T%kq)0OFd0&Zj@fhUsV+c7`-!tbcd+?8l{u-fPDW~P1$yCW7Krfn+J_W_BBKR5vmTV+S zAav;@r2};yq4fxzOGshKoAWr~&~h^(ju%iY-gyG_4hf0<3sk3c2da}+>2m3ALF63{B#MlU+hV9!?|B0fbbT8OHr@r0 zD{;M=l(@I(+9r_m+K7-DjbmieTcaJbUPzRiORhbdz7htMHVOnXs_GVld7b zq$5J0QhzNX`s1=g@XI8&O(-@{MSwiy;SG`R`g58$_++lFshZv#sw!YjV5VQGhKX$S zedW-0d^v93gc0`w>73AwcfjhZ|K=sGF?_@VnOxdH{#{ClKFI{hM?`A4P*mgSPAi5e z8&?Qu>gPYf@jxJR(9;J2x$in+{ji{*V24pNIpJnrE9D>e4C05fn>5!zvdnsAs3i>CW5AOPq7XzZ~0{R**{Dg6g8~y1;STQkqCs$SECQ^rZP$iNyMBFrhg_oj+ zO#~-au(2u-9v*qb22r-Ml(^66nd3S$aBz{mNy09GL&J(M@@8U;76c52FTRCo4tmu& z(1Nm73vg|^!&;3(f)8Q|@^#;*XPgFFjiQ~{T2Tx#&ECMTKMG)n$nh{KmGZQkk#Y4g zyd3~`2;YbYrRe^!?1VJl0@2Ze7eckdO8%YSCRjfJ(3Itbpa{MIBOy^UgGy4|l5FSE z^6xQ%Q=zFTfn!OCr2|bq*!gepwjuu&lchCpnzSFPEX0E%ISE*viDGMg?H)mvN=7zf z4*(VWF9bM*7H<{3F~QQv_<})2;fXJVp@&3?Bxe2ha1_v`@M1!X@a4YFPH9{daN?l1 zXW_~5axY7r_{G-T`eW8T1Q9g*4)5Nt2xgjeH(;e<5g~@JC)Ebwm&A7+y}Kzzo2Ke4 z6>YsS4oi?I z3d~nmmJy^wX5rH2^};wxdDR#?r%%TyT5pwp4W~AVa{}CmP0Ea~#xZU72SO!uW<31- zN$=A|i8Gi?WkUBvBH}Ly6^KlD`S?Hh8gJHK*u|#k+X*s>s0mPT6MQZCvBeehVk-N& zx9{aA&XdEGsKOw-O#)nVbE`b1D}cn-<8cwXzvo<#eL9g+5XBaBo(>Nskg{;FxyU>} zXP_943c>|q3NJ==_+a%Q|APQE$b1Fbr=Q@`XiThYpf%23Gk#Mh6coHpME}zUcFDGv zji?X;gQ$33yfRnY&df~k6QGI$qp=G4bb_H`mh2}%QQ`3IV*7f+6U;F%NfK@d8Ih7P zzptGas#6m70{|v5I+byZ_|3{NV+Nyfy4d{63|^{o7K1Y)#R3|&|DQFK4EZjud`y*i zTb9>_XkO^u)U&?_owwkQcvf_@wxw<}CL)g=j@puD?CD95Nd2zDATB-uf#lmxTG{$P zvd_+q>=x%A{YuNtOE)!fJ#2RCX3wY-u`WYWM0o<8BMy|O(xe^Fk}U43KXeC~b< zYBoQpIidOp4bg&`mXwXCBOtBxbK(bQ`33m;Ji5BXJIg0KLR~w7z>!Xy#L_}wkJyY7 zc!>bMPa@59qG)xQY&v%g^JRSXnu-X)(VK)wdP)egJ4$$GIba=%b@m6X}Fa z_FwyGOS3Rr&pyevo4(W=^Kxwdu@%JQ?j_%3YT#AOg~_ZgQ8in|^DNEld@V78k7VTI zZQ4dko%2P^hr+Cd>Mg6s?lOQRh`n?M9uax~osFtoHK-M{n8AN5`4FVQb0C z^5auy&ptR|M{QQbk)J~YLXkNo^|Ix}4Plj!4D=HWm)~wq=v48&}Yhmuw<5TX* z1J8;?c_QEfhi2N}X~%q?la827zK{3TSh>y8H=S%pnO7=kZw*$>D8Aa>*VEWfyOouR zVrN&JI@q^9T(l_9Y1__?205RC^$8+2qW_9JOV?G1H*6Agt-7lC-9QfrHgyniZQg zH(%G*RxipH5+%Z)y!Qn9@Ecv!&yVx)Ut3*d|M9b+H7`rw?5p#WaLxGp*MFP-yH=b^ 
z_txyei#^&&9Q;1F`VoR!@`G_Z4b227#!B^M(-P{xjq-1wo*HsC$f5QB5FgKxbSls0 z{CQ(5C&TAAmh*}%AG3}3hCd!Vlh&~sT~OBe?VHjig&WTK(M@c z?#A#A%~amncEeId3e~2)3%aUbdZ({&a_3$=+t9)qotKbs#7=a-kvs1>wtAy`iou`i z$0pj-(#99ZO7cRLicd&7<%s=roI$SP8Vl6+h7E`KI0;LNQBp=+@vch3;i8C}D%s^W z@~eAf_u)^Ye<;6XwVPAfb>g>it+TcDD0=gnx-=lVtvmZyqlQBNj}Kb>DSXS;G$2)p zssm!D;*SP~a*2)^u_q2`=*Hw)yG#_yo>LDJD}26(({J>d4RxvG=!yOg=^e)yfByLx zM49x%%m-TZyAYe`B1B|SMy~LAanaGQm#ODhMWi02k8EiI{!2jmiE>u3hMs`15wG+a zD`_5kj}UaID@4K;{|S?BKJ(X)%h~SkVIkCS3{JWF!mQ2ZvEkCijf>4GJoZ|fHlcp| z)WFc3=6R`(nrG}{{^?`KO6(S99+X~62-21b2s{>X&9&Yr`p}c!vq(duvr)x! zH8+PPL{(m1`q=HeUTH?1^{3dq2m6Aa3)_GBmFp(#+|xr_?V~MZAAaczlkf=kK!4Qn z`{6C!=Fd;?@t5Vz{;k-|Z?ZD6VtXv>>df5OYzU;eLCpUmqC{x(!ou9ux)r;+Ouuq< zdTuFkyHy}?Fj8!_TN;OmQ z+`5uhtFK)dYmD+T#d6!3?hJn7i=nM=9{%ZmaCKT@aEQM8&i;|#^lV*$4AL3Pt1Gc| z=H{(;$0|DRgm};1Lq}x`q74{;7B=L3kE52f*wUR5L7APBS6ZFtRK=L22DY7aHH@e~ z=}23zBh983bv~lMCcJcR;_>63uOr9*+#R9rW%fWi#pB#(hgP-(BI*`pT6@6_g{bpl zL44e_L-1$U&r(L)MQdw*{K$<-OK74y*jSTq$0o_}e509J$qB^wQqTW=X2X^~qLu0n z`<%v0(@6_IpR!wE3?mA8ZsYVQ%b?tRq}p?kHO@R;z$fD^CbgG>BEZ_R8GNd66Z+RKm45 zlujv3j4?6ZN>5we@#mv^<(0V`oHBypt<~=@*3hv<)P&Q>-d>pQk1pFA`+0qInpg?r2}J2+rukgy zquf4Wi z#%2GnCY<%r@3)=)m7%{&s{a_#9(t{-#&E-RUUtW;*jNsQ)und~R7>1>Y%f|kkL#<7 zKa+5}rtI+T-P7QGhTX$I>$jSSYOG)Q#~dqU#Bt=meP zPOa{7oN@f;#K|Y0RIZx4U-G9&Y4u=+8Sm_vXc4!BSm47Gc7EN~;Yv{G!aS|%J0vqH zCw7=6PC!GQI`T@(e(5ypmbA;2v5gxum;9pzLY3_0oMu^ia!uTrxq55sXd(xznWfh| zuglM5LZSf1)b`;PKpTK;(R;`fgN&8ryrboUyo=!4#>IJS(2Box{w-SVy_Kf;-3u!f z#cjsJ%L~$Zw(gT!xufQ0`l-x?ZXNeHbX$`5*)y*$j{S6y%nQB!r`4lG?8=pT7Q8Da zso7~J;lI~^ORd!Y(pR-eH`?ye&F8>u25Pml=;lrLIIGM2w0sK-k7xBVW$XnIeBHgM z8MvXRc4u+?_o5=z&38TrR(f8FW{aoh=($of6NrTy>v+fPi=?l%5WnT_BXnN0hYl5Y zPtNz3?13L~Yz05G7%lj@Yo>5Jx5azE zxL5R$vQ)gG#GMbM5$({a7gOvOR@ZWm+3Jk6k4+4>nmpNHI9qg#6?(zzLSG?NKenb7~GM^vDI?c z5~ml`EqVOs2rC36D7=;_P^a^kMDe{p?_Qbmn`A2`F^r9f)P_qx@R)T~RdqF2xSR8q zH%wG?c$Mp0BhA|%Bll5+Ty?yw_m~n4 zE%{Ls=5klCLuj(vhD>vsB9$;?4UBvCY&uO$KnVPYn&f{96Kb#qXD(SMnQY`Zc5EkD 
z+24(A!AE?RX!gm`()BfRbJMmx^J8WgeQj+`@wuM%Hbt90HUrqyj~er9!acVNFSp*k z>wfyR`pLGir-D6qlA7E*3eDDCYhM{Q=Vx-S%;Kn7NVOdZx>2xh16}Z<;=(NJ>tWGq zzBD6)9aFyBOWLKRD5EC&PZ_wDGCN<4Z5&u--*VB~dgb5wpdhMump0OO1*4?1Ie6>k z3yQc+o85_ifgvLs=GUCK6H*fM->aGf&ax$(CuW$cWUaPG8Jt~T(xO?FaCu1x} z<~tfQ{@m*HC)*zG+(*l*Lg#qtlKZ!X*ZsmeZ)1+C(f(Y^`P{8Zuy|1W0Y2Bm-s2WH>ql8##@Lez zclRykT=}j*Yox5qG&Le)oa6Tvooh zyUZ(S7m?Vp!^hc$CjQ)o3vCmHkK`%IOe{A-2Q!XTwHfPX6|EYp-o;jBS@EGYYtuKN zZ@Xw>VsS6h5-xJI7xhd6w{dDTto1Sp&SZ0@G zkPKGo?&7L#e0Ib$E7|gE>;sNlHovBam$USXKV2~6lsxn3k-K$Z^nS@;PEN`$4P3mj zC#BdPu`yTN8F|WZy|0tg+3wRl*4?d*jR$-l3g2#dq@%7o)|yv}0$}IS8!vRrxL{}| zg6kqz7trh;umi&eNdQgwyux>QN^o*^j_7=Qm&5YZ5(m1HsW*FuE_T6?Cq^DOl54?E68god}P%P)+4o;7KxmSW`r*=u0_9~ghs+`^K zVP|e`*7orO9Zemb!Cph&8n;b+e4TqvI!XQeS6N1DUR762)0vm`mLbjQ)0*T6?fs0w8`UC0C zZ&W4EkA3dds~`H7NTL1|oaY=H>wYj&$Wl9nn|9gNwZ!s}B^{N$wbNVot0~IS!aG{L zH#z)!?lOK_?D&3zf4XMbZyD5cYHK}aA5b{$gL#!1+SZN5QM9OtU=ZRH!J7i!n6SfQ z5qGJMZpCmv>!xm%m!&5ixn*qbzb&vfGIOzM8oX&?p!~2SyXmUuv0nurO>1`^Qx7!D zFYu%UuU}7T6QH%#OS^XI0#}0HwLgL~UCPfR>C?O_ z3#)#a(2X$U%F4+R0F6WutBJwi0!gmVVmLKeuQpfDj#jZxJw26qVadrQ9)f=3O=U8i ztwT!=oZ%o_bjADYwo9r$+8X~Imb9^*$oVIGbsE)35^@g zbSobh&3$BHarYEEq%rQd-+0Nk|5}?N?SRpH)wdR>jE>edYe&p_dAYkEzy9>bu%H(8 zgUU+wA|sx!6@3#Cl9C>(b5Hh7J#H+H?J&IVn_B`f`0T@C@vQ>`>CH}RueMVM4_`Ce zQuj18*Zo78lPvH_h_^Z)w3hhuY>qlVP8G0oYWS^8w!XqEWZLyUM)D=aDvPf%+SZD_ z=iAEwhV#jd8>@$&AEU_B2X8y3a>(-0i%qpLtJjWQ{~DL?Idxjqu&!>Z8iNffb3MUo za*Q?)K^b7W;c4Pg5IMt5I|b;mzdj1#w{m-`E!-uUw>>F3tB@9}zTeF-m*WUm>1PEo z$K%H~9{lF79zaD!-CaA>)gyCts!vzSQ8UQ^Dyw6-$`A)OB@+_`CIFs~v(BOk5WeP` zJU49;l;&uQ(E@JnoC64@kgWtF1uU@RC z<KQ@7G&@BOs40xtXJ*=m3YN^| z(?%+%92#eVq^1I(Ef{>3S=Ba4x^n*$Y?sALo`bb-+nqPU9pfEkv~t1mV5I5NaASXO zMc8k}&O1-4e5`&1f@iBPSWz>sDw(e-iy`g^I^sz zn;vHfDWajDWbw~)HBu0D2o00W<9wf2fBVu)hCYKdF|P5|cvZ>6!}4-;YKGkvCr9i4 z)u;)oJM(N>t7+(cim|O0C@F zEU@GZ-Y8)R2YG;2o5;ERAn&H);``u|-y+oj70}sBmv~|B0q;!@p#&twP+XJYPp~j2 zf4U)z!EuQ7m{~R33;D9cq!`!(Xt03CLsp9a=QG{Tg=F+~NOUfsqB$PLaCb6`i(#(_ zEgOn4DkqBsu}DKh>cs5p2F3a-Gc7dz=ki64-wqT!5Jm7#ENR@_N 
zq_I(GdQZ#MY{aM*8Y{e2;;D^%RjRdhFJsD+^O0P&6M$aV?rWL*mYZ%GVEtpfH(+0J zG3BU5f#bZ;kWz@CZOhH5^WipZ7&w}_x_U7UdVjfoCK`NZK==*N*K`#~<74g}zpW5= zUwWnRcT>v7)##TP(_Y{;8_^Z0Wn6}a6dn;lHp|nF9~cfh_4);lZFHv`w@u(!?eWTc z7SSW(lk|$O=N`}&nVp|4++SrG=CusjM1}I~&^a}CT9vz9JXCNa^SfLdR=V2!^J{C> zDWiL|lQp4G3Q@Wkt@=BcGc|woC{%xs!FEWPls6dZx~>}WD?mim?JnJ4PjCD6&&QJ1 zbW|%x*-i@DFIu&rcgtehG>k*s&?(;NBX#;fW4UgO%zoAd3%9vg({YD-`@|+BAWr?X zj6nuh9XK+K-y@0KkepLRDpw#G!{0J*So!{RN8dxc*l^MskDNU5Q?nbSTd-i&6%(ISC?Q3oRcQaI- ze}T6sFTXB~mr)dV+Ipbxmq7Ty38M<}KZY)!hWU{AtxwoJf?flo)4aiQ6RpE-9};YU z@PTwzSyR(BZp)!=PS))CHQ}DspU_Kvda_74BeoEm9iu)lB(oOHE@Rjh79qtngzVKTlvz^x!} zC;Yp1Y9dHZ@&p!FMX7gy1_dwv{_e;w2ns+KKlt?NQ|{n0nSKk=qVtK+j%@v6D~qcO zM|W`?(-YIwP7{8;I5W6$U}xU8I|~c$lv&BdFP}~)QA;`>{hluR`JAVK0uVXlmxnB= zRT_T>I-rlr8{OLv_JvFvL~;$r6Uw0rzVbUq1ZO?CLN#p%2vWd4kq+l2zAMJg#@b0F;JxU!t7a zwsIWJ5*G1%w64^v!2X(wS4xp(ex}fQ2)$riB>9Gg&?{khk%0D43bKm`dkgZsBJbWa zY;@u_Bj$JT{UoB7Xii8YgHeFEsHb4vap3B=+nd|ZfJ;} zWozVc2EB3oHp*J51qIr6)aop~lfCUFD%@(ni2D?Bqms`R zF~dVVYAOG3K_PAkI2sRvJOxQb{CbEJAWBaV(qAE}B$**Fd*}h>9_;tjDUc>|OpJpq z6b#aT({au(KqPwVEYnj%yH-GDEX|QP-nmQ7y!h}^e;8(-qgj#8M{5N zsGwA9Xns#+z8>`X4Af-gA!FnV8KsS7{SNdsqCn`0J76bfokJ2#@>LPIjyc?#m-TwJ zka>?8fbLNH6-Bjbg(}guHw{qdtt{NxA)z!^TP~}rS+syMRu2g2x}l3#yEK!zriVp* z%4CXtw(k|`xxIUr)W=^RsmJ{gYsEqUAdrLCndEXhBRt!S<5{O$p6;KWw3hF(^_UIgT5|Hg9Q$I-*v&_*YN} zw`TA&DI@~>sad6PEt1Fp zh|6JreuQV*-QQmceFs_pF2*^5RYIAN0YmY`Kg(--nEt}lrC0@Z>%ZM4o|M?wBg}UV zV%-_E^KCD#H1A0F%g$zq7M}Qpl!0B~$T9cn5fr;H25>-~4awm*M0Ju`5Kv{1w}ImX z24#qs8Hgz+AUs)b!V7t5T_BACH!RofQBABCa=O;ae_LdSlp*a86hut-F3yD1Wy94|+f0y63#BMbJM?#+z@85=@U+P*$KPQhlteiL zgW_k7BTrQgJE|IL&{wbjCfg$&$o~^2#&Lw@`^B0;6bmZ^Kq7nr$lv{|6A^j=i5V0s z$*TRzXV0GD4KDcLEg^I^q8&|B9IEkYDub-QH`Ovim|_DOz9o7V9b;jkj0Sy|&;1Th zDw!xhx;-=!`H^H=MAKDByLkT%tS2B1yb%pg4vKPNIm6oBh*6+$N~#OzX?u9lUE$Jr zncos?khm9b0Ya?DW7|(skO5NW+zdy4cSi?DwID=9(95ytYdf$&JxhX<;WAjM^IT^o zUUG^duF|@VDaEU=T5&t?61&sv-o08A2=Dy(^P1z>X39neTI!$%%9uOz+XqX-x~kjM zn3#q6fAHUxW!QE_Q1Hcc+gVx(&V<>v#pcD83>)oc)o-T3VllUldjI+8G}Zd&r;}yh 
z7s++odBsQW54Wcy#$~QGuwF6g#Q)srRy#He1Xn@Cc(_$M07(T<2zfoin6pzP31qgt zSHt=(gfS4bZ9X%xn@5li*~35b_1bToJJ-awZTFG$r#2PZm>~4L3j zZKxoxB-IKr(ZM_nnxL@H6&^{PtvY*cU&7XMHGA{3_V9?LG0uz}uiw^(_%(3z9e&VOmh6wI-Cbz_>Y}0NY-3PT(|O!W<~#MrtICnFH|b*5_`iv z)+?&0yqcWc^zH~5glmsEY@6?dc6D9AMIaBp4Y~O|_^B|UmWuB~`9yf>HLOF1at$u8VCQMz?Tuhr`Pd-3HU(?1^uga+c$ERZ9Z=5W18 z4f5JjgoIyL$P9tM8Nn8th88bbSbR@Og_0`h=~Krhmu7Ca*GX2FT8|%vjyZnn#}0e@ zzb7tU^-5%A`pfR^HCW%WWy`_`6^*pP&cT$TThBr-E;&~|%B@Y=aPi{B&4oE8X~vcT zgM(Kw)N%kff?3`@WG^5WKm{qiQL`FqYSIXsK?7cnEEb%dQ%?h`hU#2WQc}2(hlN5+ zNLv#_GHmnm&8~FR)vX$}3>7X;GmlK#cGD!hQvSO2O@8FiNxBwpMMfOAh%-AZe9L;f zPe4E~B#ai2_7i@4{ORybof`!QjrH}bQ3NB#CcjJJJ|GvbIocxfpVx59tvti34iCDY> z+z3D3pU;uw7ZfDkcVb*XBXD5mxK)Zt>O-_pxb-*hJrR)DINb2QxEIpQ#&3RP~RLm?aL4kp%!{z$slV?R` zc!PH)7C<@9&*V3xEq7H@OKYNvFNOrujpp^UjeZzP6XL!6p><~0Qv`598*DPO0qe~V zI-Szu5`6;V(0omD7Q{BD$*-x%nh?dt#vlK9V0weDkuT8N@NuU2dcRoY{ zWafKWvA-%IXS470Wc}NkH>EYk2%DMtohaiW+z!U?H2SZ?sZ6}lkZ}i$dD`7-)M;B@ znlA0Jat%M@J-8|my85O2@S(=pXm9VK@PutUf7$rp26=~)^Vy0W_A3diCBz+yyZ*F~ zqb2>v&*-IV>A+irg8$Fuh>00|5=1&xN7=P0e)JOmcl-GA<+UazE`0_Kc7+koh7QdZ zE~dHKTbOm$U3p8F4N6rE`IN^4)UREID3rM2QP!X&EQ3lkJ<@x9nvo8&EXjkNwNuxj zp`n4e%J`;lt<|^7(`TGgQ>OO@?g}VuZFl{+vd8V+&@k?mv5?~Lnz|WX)in{>>Zpo{ zkv~W-ZLgDt)g+EEqTj?K`p>SJbKOd_cnfY!NTK86xNx*WV~o~M{CL1!)W{>dKd_`E zdC<;e0W=~(uf}f6cVHpAdRtx`QguLic$|o??CI+Zf>}>@pNcUL56@p6th(W}A|H$B`j_TrZW{H=ZiczxB`vfN_AD$Pq=30+E-Uv#-A`TGcP<0z`?m<;n~*(XCjGUB;U%=@M@D5p?a#S z@Qv0Y0TD$8V#lxCHtH?9x~nW~^WM%27irS-sTyAm)lUUc2lMW)*D$u>$WuOLF&AMG)q=Efl+;8%kNf2NFKtat8BNWQueUJt;iysAw|RW%p8ozr zoph1?_^m|NfQZQ+*vQ2;Js}es9;}?O$RJ6t1IEF}7xYt7rho1od%wUA}w~$X9~kzXu6P324BOzT^$thr|cV!la}m>sxZw)g9R_ z)4d;VSt_Ws@cC@>IIe_XV%TD^UkaLY@HwA*vpM?U3-EPDT z5AYR3irUAchTGn#eE6E$a&C#qvyWIqu_F(*7W%vA;y0~b;cS*S?RlS={xa5;e9yGz1 zhMij<_-om5X(ib3B0w6Qz5b(ped|A+6)h-3hkg|E6?6aG-MvrlVUmu!yO4mO;K78L zqL(RxE{}WSWXJ#gUidv5kW9NVcMpoVLd;DU;^l=j8v1HtFLF; z()(b_X>Iv&hik*;Z}lq79;-+04(pccPpoQbusgts{1f;WOqJ%ylH9~Dm2chO;YiX2 
z@S5+`oO{tUc!Jx?_S$f3vWdERpK`Ek7-b66!|nSbE$ut*Z@79rRqt>Wc^2(%z`y3b zapW6A!}sjNis0W@Q$}`D*VKbtBS!dQ{>n4#IhkVfd}Vc^>YnSb;D)ZI)Z!FX{)fCP ztV~P{?CdHxZyrEGFuc=7x6JI>*<*%Sq);jUcQ@ztNAsTee0;0WjhkOZ*`9|ah>i#I z{yC_Z@A1;?;zil?q1NPy$;qpe<-Jo=Q$sng&z*CRc-q4;R$v*YqI5YK4+b$mvl1qa zP~v0CuF@BK!wep)V8>cc+EiN{L^O{*gyiQK&1Ge2aE!N{tFDedxq4XPF}pVuV>%`hinS#H zT4Rc*g1DbVMb!bg`H7&OtJ%BIJEAARP*phbk=UvTi-NUdU$qd*tCyLXlM6%P1Gg6> zBA-XiSPiUQT6NTPZ+MPa21Et9{26xrR1}#;6YF{6?(h!i6AnE5lAIgIrJJv7lDDmD z#^Vy|Jzo^6AM8FqKc2|GVs+_d{erNmxvis8KRy1YRvc5^x7^-VulR?6^28R@U%eH#Id+8@Ck= zxeJ6YOt*{`iH@ewi72V5eVuXQlx|-a6GFXD&S&E)#ow;y^ETj8&Er>48ZRyTDdW&_ip-@wscX>l_J-d zTTI?d4>p-LT^1)PLRe=c+Y}E^3;};kNo&`w7sI-S7)67GS%GZ$y*ub~yc8Whefsma zrenq~^|iO^2iVyqlhX6Wy?uYWt8T>*5uH)br4^@T>eZ)Ds8A5}_8y~-^~4R*i%lcB zIQ(w&Z)rteDf*?XD=(#(q?2TWxbevc!-O*{TxJJ?KhRR>nPFkXxpo#0- >`e3W-^Sf~u&FKznb{YSwpd-&^Noo;av zc(t0D-L#daP?+xlhd66fQxg)$n{2L|ojrT`@3-^r4Vw^!5sBmoQp_QZ_F7i8&waUg z6OB86j?YcVFW;8x3{y>S)q&kE+kh|Nr>X=`J2;hmAq2X{iV zZPomDEK6!8q_9|!Rz%s@I9lzX`01kN6y=vDvsM+GK3BVYvoYR!>Y>fa%ex--VXw^_ zw3Daz1{W5_xr>UQl^D8fl2Ni{;r!?4dzM!vb}btY&)I8ui_mV?eit@D2GjP2O5;+R znJJ;T1SJ}M{8TA_`%^n*@J_Z)YSWrK`ePaAul%raM++o$L>KP4_O=<(qVKccd_Ja9 z==UiylGl3v{dBd^!uQbvr_zNPF$*6B%#o)9I)|B&2`e_5$&e1Xtk;sh{1WR8M}bbd zRe9x5e-aDpkkEL4{r@rdrg1&D-TQCmP-Jc}giH}diDamVA~Yw3j44tGMH!kMMX<<%dlamcJDSFvQN>b0uX_Ha;@d?_eN)rxo=NQJsffVq2#z= zzpg4raoRyaM9ltOKS67o)HT;fm1}0r?6@pO)%D#iBeRR06X|Y{ey*bHL8|@OA5w7Fdei+XR zx6DaGF(5lP;bB;qV^oiA4dvLAziQ`Uv0})O9q&iDcn+|>00KiMm>jhlcdWs?76*9W z@TvF^X*s&D@JOJ;2flp=*$7@f03=>G%nlniOi0FrEl$GO=+n3s$IqCdE^qYdOCy~b zxu2gGw_miVblRA-d2CCj!_Y@scy{?^y`uHKtz!^f(rfy<>EG=J`@&_zr&*qC9n{pX zW%|w)_j|Yg&Mq@)E$gyqMnaCl2+3kA+K{x+7h7~*zAI)(KZ(O9q-M6BnVJ93vsLvd@GuBP5NCh&QWN z!Nhr$v#7TAR{oH@Oqa!v43rI84jJs;Eh*h;+B8$Y@sh5nQSe%eN1)Sk=-3%0kGj~4 z7ymRk?8wZ%q%4^vk`ac_g}aPeU3_le>y;k+)D|sjHDlYGf9{-k{J0#MEZDcAp;2RT?d0?Q{OT^4rZM$xZ(W5n47n=dPemXSVb*jlu?%Wfz91t#C|y z`1Q}rU7hxo?CUpRfH3xsW@z2+liJ&uK`PmG@6T_|rYHK3?<@Uysg2i`mVf^Frw{Jg 
z;IK&>=ggUN1KB@s(PA%jmD#+{GS(U|h&j9OP+(=ih!FtmK7=g+SLYipe=ZCCCO zFRAa_x9_P^`#;6e5y#>`hp+G4^-sqb;Whr=uY;FoT7d4CO`DGo)mtzqT2VcLx&{og z&31lvTHj+TUUkurf&$!{&8N8NN_FX9uj!Yb%rk<;dG_?_Tjlw4ot%=t9A1%b{c`CT z$)s5Z23=V>=^^8p6x%tmr2Z%#*-(w?5WKDXu=C}de;s4`an|20E;ed=6}hRY-wqx7 zIQYkbmVKY5w0m}?@}+HPVQyjfhU}#aKFByN@JhdW@8QGz+}w6`g_Crnr*}W;=}=$N zF~$IBpPpC1)0%IWu59v4cQhIlp{Rb9Z9{I-B>i~$PK3)jH})!~^~Qu1p!!4Iijy-Z zDA(&I#+BM<%J!SHXy;B3T@K)vQ#?Ltc&sv)8UN_K!UdcadB(CAE5ag_m8a+?9yxik z?epRYc{R0RG2tc3Jo?fYy)Lc$_hn9z)wT*ylCfs`&3q^U^gf-xoc4Theo9#PB%;$@ z>-j<9LUQuyL*s3Xz3NgjI)byd^l7UxQ}!qup<~~P5d0_VvsvtAlarm6 z%2*30>6Y>nIWOG{%c~`IL(1&cj~+exZrad)GJFbC?YirZMYKKU;QpJRd5lYZP9TsugyPHF)FT?w*{g1)VBT~4MJY!)GU*+ zp29_M-h6q+k(2tdIjPFNfnAfB`ZMS3BfCwTE}qOEC7C2*K+^}mpB~?jYxRxkbk4F+ zW#mZTux0KCc*dsrwxhTYCeYmfVK8cxsUN#F{@gjqhi%DM>OXI)Pw5zQf6m;wnB{o^ z!T$bpJzIB3|FwVr8fg?(k0}-jijyq@j`p%n2bC*b)wrU_#@1-i!IJjniMh|HYB;SA zUOTizuGPrKO(!p z!$TE=>yd%wZQ%(rfav;z?Y5ha0ow3b0_swtr?hKrX}vCaMKNs!tF5Km@0yh>n?8hf z)g5bMYbz|E@X6^iWeO8n@aGec^!@6!n}-hD_jT^zy_zooa~BLcqFANc>A-;<1Nzxb z-j$i4*UNgpyM|STgNC`eIilyhEg-={l4KBE-|Ux|nD~?#V#GN8hd_>}V`BwTf%%lP zy%cUaq+k0tcyen?Zb^)0%cOoCz36j-69+__CRT_RE(kYssYj$)Z}j>%DlE+J(eAp{ zJqJp*MO{&m)GPrNy#Mf_rhD?~)B8vdQy3ox##`^<@#7*vboIpJ**BJ2&02goq36=f zIUtgka0?`V4~T|;f-?ULe)5}fsJXlS8Obi1;TA}@y>Q{78If;m;52?P>89zUWB`8#ZNf=hWNoUdsw9N+g6xi%*&av9>_07z z%RHLRMP=|R$K2R>FDg;}kzViYTdQUq`DB}!3v?uiKsY!8rrLQdY4E2IImknN#W>|< z%LaqOV4)cM@6@Zw1qJ&jjIT|;Im>iNhI4m1HG-guEeq4QH|@wibH*F3QXd5c$8(8j zsiV(EyPr4s({SnB=+0GJf)aWzx7TrM>DLB8AS_137cg+X{9815x|n$kN^$jb_AdxI zueqCbIL}F4tV?N!;Y3zeYtDA7?%kXHdo7>wa*>SF9@elJ`2lu|!Mha3T@yDN> z{+*+Z1rn60+f1UKP0`~JBzbN)mw=noNX-Ej7xo0r=q+tg?W+Zk6j}Bfd_#O6gv-NQ z_{~LYBMzXp-3uA#3xi-LZg`^VtFC%rL0r1VRrx^Arz3STy9&c{jMpf>wj^!dNx>p4 zm|1=c!H?jDtit8G%B?Ocu+YwQvBX%5SK;CNzh`}qXc;bJeK>O2b@K!>#wyT+U+lP) z?g?iB+DmyKT^&+X@)t!cI+MP?OgMkO<@Go zlQ0O!WPumXU|L}>%8t1WsYpQNIt$ zjH|ne+#R|SHqqqT!?d+;=(vW;Po0qEZOddogQkwvouN_u`0?XiShrt6wBOaxpNntB 
zJ&JOVM9GCW>)V#A=l7~~Zf;S4sk!;xu>&P^^Jp;=`rR%lAMuq?7Oind1+n#(^0)sUo9yJm*a-64KI5C>pf;-{-y7PQw}MRUqm&GEV@t!8bK4deJMb@xN|9OaF9sHtzansHmr?3E+#b(p7!8Ep%*4>}!o0j9_$CQa@?b zEf^U!&C(J!V$7nGGep^r$C1KD|Dd3Jjt$k7H*v-R_*;Hu;_@v8KxH=XV=4aPgpN7LITCm1tGi{G+gRQG%tHIdVZ*X%HU*qpzf;y$#fSajv?O!mGGiUuu8gTf}b3{P`_ zJ0P4~C={{8Szqrgbv`kXp`ZC5+x!}=t*llsh1bR9i^j{a(9k;=4~bD!_2Y za$7$86>F@eJ-I>aMc-@+Ou?ln$`J!8d#A{)xQbnm^=z`lVjmEjQ^Pkz{;C z@#5_p$2-(3Yef0c=AZ}W_}CnuKSJzjeAe0?=S--3-}UCt)?bWGQ&so%d91=v57tfz zP^kW=v&*C(^Ps*$?!(BkzNsZX!r$dibVMOd+1~Q<4qG)f=>jA0Z4D}ZW}S2I_nNMK zydN9k&Ll5yNH036VAGb>+oGfetd2C z)2e%`FMl}pMOfTmIbvP6(;%24@b<%se&^5McKh9b5Ib!(rczfYzeGa=GdrAN~CW8v-3gnk+3ew$+zR;A5Xh_rsV7Wn#~xa zUHm6zd#Y_hGHF!nrmU834*GVF<-hiDZdA0%lz>8o0oU*#?UnUl-JWxo{!i;K6QCkm!Gaqh~+Psjshh|1^n_jr;bsXTYCCtL+(<3WE^8yXRXndgh{H zzy7y&e%7eh>+KycNC5PD?X7lCA)x@g+gE=z`3%jN*wS}nH&;}oY)eY$X(Yqr8V=Ki zgjg>6I)(_WRO!JkA;H1>{9=a>oY})te(gv}%_W$5y1BXe#uNZkK;}%=aqV{_t2(`M z^q4WZpfBZqBPI7_C-)vSXydzGMfdLEC7$}iwV&6$d6GTi&s+$gE9l#5;l0#4OW^{o zm;L)CnTJ^j{7>O1tVydt-+gxjM9oCxG71LHvR^$QvB#D+avlbsJV^_GC(BV_ea$mc zxP6aBYnMg;IjN}Lfj%x2)o9?$kgzZ@`3s;;454p+7R1et40&I-=fw2xqZWp@lyq!| zJMI2n!vzD2KGIOO8*;2ZGBVD^(LWvU7Z&E-xz&EFmduh7K5I`AKL2N~8K>IQbjJa_ z1}tq>tm2~R7^5!k6AXGp7glBb#KG-G2_P3RWK-8-mgCC z#hVQaqqXlts|n^%PZ(Wp zkZB-%tU;gHaq^0xpIFu&oiE>>Epz-{$3CH6CcC zy-v%ua|Rn=%f$>CDK~IKeO^B&9nop`^_4%iDfs?s!AziW^$lWzqCszZry{+}?ROO1 zK&$yv_dX-{yQcX>KENOu$K_bNL;S9=^(1q+Ox8?4Z%1>5rNZMeA&2)AlpcY^)A^$5 zILW|X`29N0+YI+s^wO=aZXWTH0Wb9FZ{WQreR8eq=PboAQ(3t&1y@TH|0^U@A*Krm zhXfb9_KQPTg|ihfJ7rjGC(vdpGn_|D%pX|cU zp`l?@=(+K>w^b+RDuE~b#L77!dXlDQUG&Zd4B-P{(V#+dOscRRKp}*~(1)?c&#(Lc zOx$N6e2Race9)UbAexCuViJpfZ0^gKIk}+&DO`1R_jz{I?ch0TUx^iEz?zT!YK>rX zpM=Ew;E`H>K1C)WZ+;4ws!La{ zRJLUf9O%?IceG@4U0q$}xpB|ffNk-Cn{G;W)X6;ZcC%a7G4<;VyVVo8$lM^^9aGd( zMo^PbE&^?6yf zDg6fwh^=sd1FE()6ks zrP?%TqE=M)F3rt4Yjx}oLHU2YW=*Eki*d&`^tL|vb@RK*prU#QUW3t~Kt=Ve%M)AE z;@Xzi$Dca&>b&NSj@8z*9K;bYK1@Mn;D+_)B6^zQsNk7;&so%6ju@x;`f z#~^$k*WxnWD=J=9+sS?k4-PKNH12v!C2^Uw1>>CYw4O?z$05Dt;6+Y%=&azE)V_WO 
zJ(M*yE!({I`*-NxUFJ=Vk41b+%Ho$}Ol_ndJZRZGUNUI``NW(D55i|Q?xFbgjvryL z*w%;!+s~gr0~NN7kt_~P9J?U0^R{hJ4=C|2USD=e(Z(Q4{sQN7fV_qi7e9BBY zFr#vu+c%hxBC=cGpW<|kW!ri`=Ymwj^@plY0c`v!mpslcFzDLUpi+~%ht@F-&NN!k-T|0K>kd-U%_3Wxo7L91($3ALyg=6pjX&HdT@uuR+7UeXpZ%?69QELvoE{(NIgo45MDlJjTcax*qI zzVea~d~D3Mj%z8$_&6-~e{lD%F9YMm7#Xrh_@8iPg|P(@r*`Cq(ZE`?TMrNDd$Ycx zv26b7)5Fu0!M0njCtkgpvc8XY^U(y(*O6yS?HdCF=gio4;oYvj)+e()V&?XIt+_$W zzl}enFB7rM?5O9_*sD_^=FuGbQZ|3Wf8Xz{f{&aSnVz1Wo)DBfZ_Sz!R8^@}%o@-c z_kC!Z7UwJox6Ur@+qc8D2I^!EB?oe`NJtqxrKCJYQuk5nl`cDXqB?q47U!Q3%Sx+^ z5A^(_3?xLp5osH#V?WjUdD2|UEiC5xbC5EA)$6kM`>oU^%;ZiWD)OB{>)}C3GMg66=WS^l< zrX4MdvbeTIqx;Lr8Bn6mnYlNZ+T(dI zP%%ya%a^7J6Et1>?kj0}5aqQn3Z-K%2kV7g<+K^sx8+{{EI$}ax)OYC#e!Oql$>i# zO!AK#QM>M8gjQc2`^_XA(D*0tX}>C?V3!bQL8dsV#T%|dA7oV{@~5(@-aU?PHIh2 z(6RRk{`XVf1zJ0PLnVzqKbRK7;Ifl5jNra5_#5B~4BtKKQqr?a17i-DRuF&4&_37N z)+=3c=8!9IS4(!EoFM6FbT;)HCTX-VvEnr26H-#WN)`jvN@~uw zvl}`lBjd@BUy(eIzw3#V;tpBw-}SU*TjxoW;K5=!8yudatZtp?n6kF6cICGpZ(hA$ zU%wbw^>5(qcWlzyJ@d+krkSYKJ)CsLW9Oxg`$`5J|kt zdXQpsH)+mE=kF#~ICP!9v;WIU>lP5*s$}J*J(p&zI2p34eqP6F-%I=b^+t-LDHe=nx!A$E?(OT=R?`XcRfQ;tW+s)&uo{IokB~uX>Cw1xaEt?a+eJsvj zz8rk|!nh09UB#Pl&ZBXMWMoynDj!Z{WSH0?_)v;{Vh8)^JD77!?A+PnqC0klQ@zBWM7~?hZlDvxfhb4jK(*@nA?i|^_tR9J-&4HT8U`yF&Yv&xPr3vm zN9o_+H!#pB>cys-h8~-8q*u<^U$TJr1G(raPMXxmfE&U~y6)}7pKfke{=GWtPGY0x zBB{!k8Brk7WckLkEOZ@Y-zup^8 zJ3c1Ve%p4dnaoMT8@5lkZr!YKt^r${3~21SGgY_U$`>Ie0W1kMxNws^^5Bh6~p* z5+wH2sXYj=RKYY4eh42uPD{(0a2A0VRr_UbHT{GYHMN*v;Vk$3xhCo**~lUiA5>KT zU9iP=D=`Efv`NgA5rrwbno-d^_7SLE;8sU!Y3tahF&5@9<5I;?ZS-s6af@k|J!kw- z95hGI%)Xi|MzHV>9Xce;46%Y2L+>A7T)qg?&1hHIf(Dai4^)#Lm$LyaU7^lVxr=G@ zVk#PSgLZSdslbD$HDtcI_L)Ba0M4r={~#^nL={%c^oqm`wi!&qXvk^v1cP#ZPIj4d z66-C2jn)IFA6V<^*9>I(KW#!9dq8zUnnXH!TuWWYzBf)2kSP2>gcOlDbnTi;;!de$ zN^us55f60U)g<_0cqLgC9_pi2R7R+&U0V37F*P+WT{Nc;(g4RYvaz{&1Px+}ZbWuu ztqEIf*YKNYj>^l4a+duF;f(uh{F`2g6^RKW@$USiU#UztAlaVXe?!cD^~B(#ID9iY zjF2wDWDUV?IB-DC)!qpIAX_?t$F&07r&5fOVaE4}odeo*r?9Z_@ayLl70hD#Zee+M 
ztg`ZxOqbyW{ngV_VOZs~{fPt-H}xx9X%IY{Zqdf^krAcY9y{p{0(i9>5u-Vz~i=wpB=SX!C-s&WVT=p_Cq712}(V)9Vl^1+F z0yWTOh?z_6DXgF8%(-L!5{?A*2JMB?jEzf+iX$tA+KN5Ebczkec_RonB-FRFI*n5> z3sF^MN;%TBNUeeBkiEF8p=y3MQbN2qmW=K4X5Uaynpas38|N8#%B4 z{sFUqI%$m%3@%-UJ20g(?(cD>NG~71&Er}FV>4+ZS6|YNnUz?vEcKf%<7)Bj1Wa(= zw(W7-jEJ9|u7W$z{kv9YW#5@_&bX6P7cLZmOh}7URF-Bx24B2QN zT5jO!IfEd8fqA6XQ7zuuFip)c7OttE$tWl;w4O*S7Ys5?%KWp`lDTcy*V@MMKB)_% z*ymUFJC1%g?F)grI)urUs&w-5e(zx%aGG&94(*xQB zj;)yy(+q+otQXu+QCo{{S;+u5UoPRpHH*&X48nQg{y~LCk^k5?DkB~rlTB4Jpcl1z z;*zLInh^(&AAhPbZM;Y|8MQ8Tabs8OXFnUiW-c`;(x}Ti_6nGbR(JpLHumKu8yl=w z%57S&R(T@-L&Tit<;nflp<+R_? o*M4eE+=#n2rgs(oIZ%2*{sPOL2SLyDQd-*V z*wVxFsIFZ3ZvOI7Z)L}Qi}NtBr;N|DJx4~yxqETyR^X@$bEJm!=_mo<&RXU>L=+73M$^7nfK9d~QDQ(OqRxYJDxrq#-bui1!^@?0iNoiWbvVM2j)vHgxwX9xZ8h@2uS2wFp#%kNonO zuDY?HR~Y+0ybykaiYZ^ri$p33Ns z{S8`>BJQ%urn=Vmg?@*z{5~Ji?+0dOI9Lmo3aSDs5e8X77Z1>#qYR+gO7 zq0uJVdGhtjY?Y?Ia#fxB95K-Fiitdc%1Mlj-cV;#|ck0taMJ^PdRg@Bgu|hou0kOea@Xf&xC`x zJF6e{@xE5}eybo+JbLf!;?nCI{H}!f9$W%@_+s(dwtmd z{$EA)`KMuv{qMj1-~aR#eCWHaX_wM6NJA+kOYnSPS_&FM2H6A=NeG_>F~QAEZiJ$c za3cc7Q=jR*G$J(%CUOSe=ai#KcdMYF8?;e&Rmh!XCMLHKM+rtf)6^vZkryspOOsn9 zRm=eb7&k5gdaLHWi#GcE_v?2|*n>+inEDjP;EkZy~aFBq@k z8Ln=qdfFsG%m?7W&~F5B9k>FN&S-F|^Ok;n`=;#t?sRC#%tIvZ{O9~2 zMCm|UTAJfAzq{pK<>x1uT}CNO+8+42pr~jyPoAjoimAgwQ&}WN!2;S+v@$QOvaybOA_H zKz78{0a4Ut^XB%ivcA7qg=y!}u&{5y(c&;=sG2`iBEnwLiK!uw z^g|*e3$Y4-1&a{+N%``y5($O_UJBhIu_m5nk;4JRQ?%BR99H zLdQX{;54Re;{Cn~wOlroz{XiAE|)Jth3mA7`bA$4HrpmAVK>ipNZiM zR<1m4=Wz`T{>`UP8|yF5x8K91FC&L0tqo$U5qOy$=F7+oj&7uHHRxd8uylz?DVQIU z5yJUAuR8ye##Dq?E}a{-kiU z@4CKIh~*`S*&KZb3=m36;-8Q&OR!p3qe>L|0x}1l%N&|!_V^@7pjdlDBk@#9%3B3ke$+uZr?Ot@d0xp*|35hzwFA|ZC4xKr3Q%yxa6w!RPdPbW(v z1$91J0tHY#QG)TN;%FWW>*LfNJ)$5~pfZA~D%u!)9Eze**=MENg7NiGG zSqy@ILdhUFjR*1`#Qg|)XdDYu41_EL%~PzYsnO8d7PW8`V>#!^tJ@zghjo>3;ez-~ z`gp;EJ%8AVA2xAwyLvP#O2z2J6yp|hf)V)XgK^HMIeL^4PQwz+l;2}KQN}3}$C*TN zZ6F5TiGn}w?AiICg@3R7IqV6OZQC*ua>mEBgJTrO&auU{I@D~*k};u2kIp0BTvCU_ 
zo^biHiPr8vH`KAS#*Sm7xF^(<%75?4z3H5|r>*Zz)5#|^%hrIhvWGe;9+<0EEPR->~TZ*?!Dvok(4EKE~t8$-TYnU`SV?3|+RR5c~< z8v(xz&%-$856olk{Wa(6ed^AyX02y8a64t_>*J_d4dE4ttFd5uUZA{&A))pt>nu+5 z#ST4pgc@fkgXxLRb|<{W4OqU*Sb5ACwdTqZ>NnYRaSRBVhp;VJT|V$AGuMoww9F%B zcw7^=v$j}%J=XXp^fk`Fk|sZy$Ly;k2ukDVIx-D$31a9+e?LAm)5`r*dgU1c(tPgJ z#N8d=n5^%H=yU_CqlVOi0L~s%3(dN=S1*mw$KPL1AVwn`& zzkdY!(~R{{d+_O){P#ww*+)Egt+X|P)PJVJAr7|Kdw%HHu?fq^n_F7Wr$rwcz2kLP zJ-2Wi#+nEd^Jtg^CihXdU~M!#Sruepdu{6xPNGo;!GEg$FeAhORD93TkdWcLyZHbZ zN1E;p51NO5(!|pb-4Qri6EMOZGkLOw{M5~7nt1Vs;;ruZ za{BM0B812PIq%&W=6l(ehto$*haV(9?2MUm907kmt2pf6hUa6&=LOAZIhTgcFd;3? ztf06!lxQiAwWnyTQ&)090T?jAmJrSsf<@@>f56`#wv0<%)OH5N1yZmu z3}+Sj)t>C{Lcd1KQILJ4rQ@*JfL|oLM+~mZAGBT8S5lCiAmX|3QuXtrN!f(tlkFWC z4UiK9`~!5g($UfX?AcMz-nEiKx3x-qYKUW7jH&Vtta;zcwcDDP_%h*dp~pd4iRK|{{6OqxEw90Gre^y7U+ z;s7Fd`=%N@L|P<^I2GK_ba_L!{0#;^l)tpPi{aJ<@f=7zg{ohigq)Q6cT@~|i;Li- zsb(Ecezpzr5u|Q`?!kmW6@q$>$kjzkt*64|00eY`c{O#j64f2@y2ZBNkk zL*m<5qSupWGlwF5L91(5>BA>Zh#`h@+B;N9v6nAj&d1FH+c-6$C%U(rFU39Ng6^bT zxiX3=hP*Fv;*-~qX?6#|5;9FO*ci4RX?{X-axGfqi8dXaa$2`>B>9!!>sP}wVZhBP z$QC3T@*6o!WjQ@WGnUUL2cT-IAwQr&PTjy*J8g1JQnM`-6<9MFGQbcWJ~G11#EF&b z*7@;Ysm3gOHynbl(gS|~_6_9% zC5 z!9WQ}>gPF~Nje2}pC+{I^pj8y=!Oej9=I-Gua9WmLZ&b_GrLW-!IcV`B46+d^B2rg zp62YHc`B!~fbDMHoW(_Pq@6)T06KWPe7U99fzFP@ndA2bYMQ*SaYfOAbI0>=WniG! 
zU0l){qe(4=U_gS()n&_;NdX2|u3X7ycAovOnk~W4GB)C@?6~!g70J1Kvjfxcbb#ps zGHPmT>6sm(v2I*M3I-iY{65)))VkD+jCddeeYMJ6gZIS51fMvu;cqTTAu{84qn@4| z>aW?f7i(&KNCINC%Zr}!D|cVMY&*Z^344tFVrbZnutkL@9~#C@(E+B!-)L6FUkXnm zu&hUTPQ`8n!$CqTuRM!gKt>~f=i`SDj^W*3S69P>?Z9>6rx6kn;m~&r)dM^AY)VSW z-|Sk15;XU|)&4-sJml)Sx-Rbuy+C^H8iMG&gX%GPPyiCv*49ZAPf)iT8}|ZQWsG!Q zvQ&&kBBOzl%?E>FQOr1QjXD)WOGgmQ z&rQB3C(xV#2J59*|cQ(}1f){7Ef7-i3J$9O5++|}DSw_-`!S9*y);N)4H*!)7 ztSIyguY}NnZNjJ#BRWy)=3QMrGNi&`xv1=GBd#;mm{CRx76c^ZBzNv3{a7Df*cpN$ zh3$WU$4YX61rh28h$68TiAR*WiqF`|0Cnk57n+PV<27w&Q+ffa0vk%S_}0{XP6zjb z?~XL_RX9EvVmKWzKc&}Yi(%4$nih#k{kFiV3|6W*__V0#<3Y7br5TwwdXG-iCP4X4 z)uBTF-SlXYSUv-m?Pv!tZ0s?_b%lw?stXIT7J?ZhJK_&UHt|rmjr*;_L<~isV`gOY za{L9M73ST^|L=2;+Ci&gL>56meq6BLIX#@u2FLN$Dyt}H+_`k&4HTTi)iHwP0)IGtZv}8ABP?Cb&FQoa0VCl#***z=A?WfNh zL$xb!i&kH`7O`LpbL%E?yW)ZvcIfDNHgHT&f_-2?CeB{VNOB5EQ}n4gs!tz{jAUN# zDzBBkj%}R@zq92w4DgMn_f9>(egxX_1b~S7Kx&oU z0o690J3*~07rejx`{&0;s}nU0=X9<(MomCQB(AqhkAbJeJmTo-3YM0Yi3@WwUrbE` zCiwvO48!Lf#$oNpjRv0vf@dt~H%!#T#}~(M+G?}$S9-Ea;DIHoDwq$Tw-NOg33-65 z+P?cCqZHO`XuJ5qHdVoNRAw|gS=4|HH@+@=iC;KR?&yMLk5h`1NLoo&tl>jUiPla5g52YK}91 z%}h~5qcL+Wk}MQV_ADO^431Qfii^cXz=&-5&|98{z*oT1uQ&mTDYkjqXXV+8Y-!n0 zKZKvwH8J{seG__c!g6k5$cH3RzF;dKY&TJ@5y-(HE)EHY?-J^O4XRalv zl)%DG5||;+SD2WY5n#>D`(c>lJjy7}M9#AV5_J81vq=xHB!s=`<y| z)*qAffD^^TJz7m2O5yarB&Gw4kz7ULj?Lo5E5c)wMXx;N)aA=%s{LMlB{%rWQ~e6{ zj$kLo8RMKe@!D{Qb?Yn>wQaJ?jaSuols-ZVwtBS^2{2u;4keYl36UZcgOsBh%N<=y zPMjBPN*4o^v_d&gqjC^pg1o5o2uBgR?=`*9rF{jmL+|4(r>}4%(I37j*@B3qFRJiE;MED~L7*>%_Sqd*$d2d1Y~0a+QH(I!+XLgh z4@C3uUlvwY^C+Z-?$xKZmv(%&*KkBM$~-x|1|E>0vP;D!rfqr|4dz9=#9;pX4o!ON=+$lP`;DC#kM)WM(Y$7_Xi;1Fs@ z9$xkQxxl@_-e~m6ZS2{jhg0L|_AFB-WEBHRO0BuE_G-|&Xt&O|aUhgg1k)}?kMuQ! 
zS{UXhlXXU5p*Qj1R7NZSs6${B-k>C?{PNwDRyMrwwz*#!)X(VDzIDVdJ~58Eea9=q|tQOU-N@i$D0|< zJ&F{SXh3SXC%Y8S5trET2|1n7S+d1!#_BAeKdd2TvM%&U0bmyv`U5k@Ze*3nGVkzK ze!du|@TQ?*Av##D(|6ey#vKc9qCY?DhR|#8Aw*h4E%tDGaenWDt}2jsNKKUcSkdl| zeQehZy|F5{lhkxgN*tJ1a`Jts*7Nu>j>1C&r{0?MP}>V!7UaE#+o`^o^XE0ihCsZD z6cd5LB#>0yrG+T`dIZ0MheMur(A8?GqCqh2i;DnJuIh3V*GO})sP#;jHDOmH293I}xioJ6 zfY_J!C_@-`d9-?aKL%wh8EklxvKKb=35+?h)G7dv=y4PF$rH3r$Y_OwCxk<2Xhyv3 z%~wxAj}w<6;6v2O0&s@aqMtK1W(|4e!0GNsVNem|R+%07-q^T;ohAH7ue@;gr+$&? z+gHqpOG!-~uB6lviZ#V8+NJKyf0hvTVwAeCKYa>BD1-QozZlHpCzrtJ;1#o_H#&P& z+`4u8@y{m8{yPMbln3?TLqFJdq6Nh}5!37E1+TVJ*z&F<=J%KUA#{QzNs|Svi7-px z!nBxamt5w7v3*h4)Z!qk&G_P#fd&1E83(bpZQqhUcfy42^bo25V5HDGUjak=`u3F{D(Pzn&x=LI7L9d31LWha2zDbuEI8Ess1Q!g-ev_Zd=_B04u^2iOiWGKk34a1Z9|JzJYCqupxd7{I&;Q4*M%I+dq*A)GjBd=>bHrN5ERstc=Ve1A{qgVWmRROJs1*n1c9PZ!+qtQUE*PIvbxoT zS_-ctmMYoQ3i|UXf5b9)@#2N($@ms+Gm7C=z{GO8{%Nw_O7Yx8cYtb0x^M^%$Gs>l zcp2{qF&OIDVS6n07cm{%P$&bjY!?J?mo_||$mZ~7IdMgO_4bKPCUsoVT-iKn;tcmD zd#XC@J>O!wiJ5^0-3oCm@M!V*|N9YIT3^~1Nj+R2((z_}uDng~iQ=;y_w7JIc z&yP1fIh?t&_!w>8WfYw(k>Lq;%IKct6jWxI%_3sh1NyqYObOuT-;q%h!hi{g#xX-b zkl6F;$e?RjdkmLueCBwOn%NIV1;%UaWYFaMlyd@x;c{+1e%zlj9zFx(8`Nw%G-UsB z5+#6I{>4tb*h#DVwtk*kMLYxxDuZX4H?R`O5;E>hx)=K(qs4ycXatey2rpDX5PS_o z(`oL}1G~ooEzXvUH@PXC+H9!;ay()RL|d2V)X-GADKH?hY{H8*Ne`d;9g-x zJ`9pDLxD@N>^kCqsuL@G`0(K!d~}Ty1Gay79LN$Y!7D+}ZHmk5jsVwoWoHcj!$(aa z^oo79`RzQsb?Iq)`}ny0>vU>@zl6jd34_@>W*?Uj$;3oqS=AL16c|dBuHqy_$0e#0 zxCFOwBXR>>ml{4A<~=m0Nj4qWs8IAHjN{)P(;ulZ`AW94tw236F$Lc`dv;w?baTWA z8Tz4)BYY?WMbc}L2u_x@KJ+z+x}fCXyJ}W6UP$G``)rP@j)YnY?#Y{}Joj_IZsB6n z!K}!OPhx^$znCQ4e)#aBg`9Oc*1mVJ?4 zkiy7!9?f(X`x6XP+bYc|Cir`8TCj5HrZoQ+RQ>$`mU zOKZI-4S8uqZWOCz9FCzPQwGDKxCU}EEn2>@#-YGV$$)aXT)AgpQlBbG8pZ(uQq-vs z(9$iY6JWtIXlQjO`11QKFtD{`cWD{xFcJf@)UYtiq^~96TDgJfpo}=pg=Zff_9?63ce-thmzY50HQJN*CY-)iG~ys|ip zqXMQG!Af~sKygVUu|{Rt@U8)!SWlyPL)86Z3EyAMt6}}%Kzj>Miru?+Q(@9bVC{(U z^t!t3IsQb&SG3Q++zuVI?zR>i|Yn z9a7xg+`bSo#MEtit9L}J5^Ez;p4Fh|piqsJQ}{CEads@Zyc(phD}7UEM4ppY-Iby- 
zw2ehg6o_-zopLt?2QIB>3uq4lpnd?=*T(@6m5f)P?t5z2`dP5R-CEj#eV zqUp_jg@&fK_Vg1;qc<3)Vw9@6i!WiKe1ss2?Q z9MknmCaaplOAaia^7gNnf+REGsTM%bgbY2tN8K?+-;R+qmh*mqtNfrhDVV3E{Gh`W z-hz_&Q2Uz{PH>~}K6=GDv9|h_&uo2t>UrT#N4Q-B$f|h4xrbCV?(k{;hA&1Wn=MLsG)Iv`t0|>@j z{bDD%*A~YQOr0`CG*_r9NmK`cqX~)@kiiyMgT&3$F4O-K#a4(B)@9H_slK4^ix9myk;#V5HxC@npLcWpbgYh+#`j>lzFu5AH1=w& z5XCx0wMCbM-j(3Mid>jx9jul-I@){Cx=uV^R6ZoKgtF@nNBx?n6L4USuRes0giRn> z-bA_&RC_^iF2O|U3veq70h1_7R-#jc?*^MlENMHc(&1aSm>mBuj+fn6*uEd?%x z*hL^~JSyQ#0I!2?4A=QA4Fh0+S{H0>TZp8=$?Lf&@ZTSmzlhk{FnaTDFM zNh-*PjvPNOmX=^T@ZbQop^y}Vh~dd94r0hNVtt@_JU}4hX^G|}m{nWVpl)xtW zXf=>(Ugi_>d3_~KYScJYE%~hT;4Bhk~b&YniS!2fAwYSEoX(%Ym*g{GT$cBr!QW; zE2-h%Ape_RuFo@ zlj8q8@$TP1C#;T+lW#tynut4?YxLg(XK@)yETT3Ku$m8jLw&azWI{xRvnM^}dCh{$ zN<raVOT;G;jptHsENE4eOu{06KDOhU+JA_u z!fI0Zk7Cqt3Ktd1g;HWb|8Z8c2$u8}I9}SNaC3Cn7p_>Lh;CHW2T%xJy?eJGqXOdl zZyGH$UH$0b2@0k-8AuHaMIL;x-&kH^9?(R1xU9&;ND&<>>^jD%!BhGP>MuR~Cgt!H zdXA8#;$aDM=foOO4U+w8w>CWjSKdpk76%2EYVIWTB#u)|_w0k@igL1U+0H{z>uFhU)ps%KMTVru!N` z)}D;XKtaJl>SRGQlFi^@qY}H_s2cNh2O8aXU3RWC2gUyW{N9fS(r5h^Bth|>f&zmUGV@vG;4n(mDfeXICg8!JWLy57R z;uR%(0uCLjWd{laHD3?Pg1>r4GH_9~3gbrcayVrL3@lD|VBS61dAm4E(fp=R0@9S7 zO13^9lv0!t+_^T(jxX6$#cRbR&9Q4p}6wsVN90 z4~0?4BU)}n79vg@`Z8jMfnXKVVwttO&w9#OfFZSY4+g^GE5^F`9heL}p;b^^r~v#s zk6qa#@jjNJ-{`ze34{~~GK9>AYz*ZTY+RgzHeu|B~z;6;eB zG6W&m=sz5MYIB93f%->bw+k&&#E zg8E!dhD1>#^=k-a4X``VvmK=g>jA8cF;S23tKF&WFnx1YJmIGPi^Eieq)U|Kb-zu_CYZpGBKdDP9`b_h{}%gjHKG?OHIq zO%!0XKBXTrWK3Q@MdLtz=bnQDu3nu=)D(dI1IDjy&*#5JYodH-X?bt{4M+*bzyFpi zC2wgU2^WZ%#bSN%h=1@hpw8#i?-Y)j-bt1m+~6#67iP&Ud`;GxMkgrJyMfCT0vAdu$iQEqWM4VT7*)^M|3c4eGAqn=uM9`=_Rj;{&-EOPS?$b2= zbXa~(b#-py#q5JH9sW9#yR)BhKw8PGvK;r>5`F6-G7^CwW1Hw*{gYFG75BmBX|Il^ zsP#BHf$O^nqk6qRdp=-reCzr1C=T@yXdRV}zaa`YQihvVwJqH{gk}gkLjZ1Ku7_~2 zAeYWBAL3Qg4c^)}Fc&pkpW+urxj(;q=HBk1X=c!j>aaPh1N9VRwJKE#!`k}OxDs+} zEdO<4ONHyY7z%o9+>%?I$VMLaDu?V-A`aV(Uk5g9`6&uZ5HSRBZSKBKJWyjg0D3*A z--bsht>?#Rg8NutVkb1Q$gz`Hf3V{t!I&Fyt|a#8jxM3r8Ts- 
zxwupVzo>~KP?a)LR2L*{-{7CHhF69!|Ac|@^Y746KV4vw4fV(y!VKhx9UqdeSf#Ak z8~$jh`citaq9h@mrgoS|W=M`Hp%C^v1}IMFl(Lj^V!NueN>hyo-0SJQHh;r&FY z7oZEd=0je*WFNk3j&#EYe0 zIe7PP7a?@P#EZm+yg2t=Loi@J$9{YdOBn}@iAiBpL=F3R=_a|2{{Zv5zHEaIm;$j{ z5aIY%6>kbV;B3Leroe}< zLT)f_X+}}G-{WET&qQiRauS54J67Uhd4MAl%pGjDv}7qoMiE+1K{5Y=Zx4AsV~$<~ z=e1#l6=x#GW|7JZFrM|3|NObF)TQ-z$$HQb$sn7duJ)ozk&h_p_14BZIj1Oh!7ya2E%|Md3Bi)(dDkH7I)`|P9ld^0JJ zf@e>SH8*NM|Nfq?5nGhpv{R;jBm){nSBR1m3duB6-Q4? zjjLT83Np@xyn9~W*%5T8d;f8-h+abG%QuN6hZ0&~I`$3ZKY6&A-w{&)d-SN~sK@!|QA)N5bBn|ngw9g0L|cb$0g=VTW`j@vn2}4W zIEBPWJ|G{SX%9TCUbxi_B1aca!{nfdK>dZ}m+DY7FpTf=Tbza>#1J3Fro-Zg6Gt?0 zESI?nPNM0e={~skagNo|`XO~oajw2)eIKXFsmgS}@k@jP*2t+Bg2x~v88Yd-Y@k=uT{&Bi4VDNCQ zT{@Y`WZ4T(Ov&X^4O!}^PVW2-A^l6a=_0`#1v^}lX5WiCZViTC;w68qx*Eb!fp=;5 zTdsqk;{tO5)9lG%Hq-5Vfq;Wbi86}3+m8B*RaHVD5$X`(ZD3^;o{%@dNk=m)QVI49 z1RdGIbexDiK$w_NzDCz2J9zz?HFM}L(kGqNT&E=ITKo3xZF<*;P}^8kV1KNe z$!8{hZ*GobbDj?9u3|{`f`8$u>Pw_ySYw15&Vij)8WF%DA_#r%W&28ci@u0x*$-a6 z%o5{*-lT}W)kmnP;FRwFmRYhAxPc6DwQSbtko!-2xQX16kZu?IL`Oa&lCSHV4+#R;_Vw*Q z2?-Y8{C-QIr+)tYkky@h?I9u{A`OS>0LKa4Is^5iFo4$VRPX%tdc(ECcP>s&<1vxb z*g6~A4Vi%hH-s;GUtfO^Ye$U_!m1c4PW9WjO1<}X>x>SAx}+5HCrOsVjUgecyFzBD zC9&B>UPfoG@vg!qn^YahlNYZU#g>Zlm&*SgQxal^mQ+!ozzsF7(-;%!#a1LBTHoG0 zjD{;m2GEuCQ;kg-pa2L$Zn51mISl2HlohS2r*3GVzs59LI}$a`)=j+bExZ^patK!1 z4kS#;SEC!!>y@bv#j{}5!oqZU6cjfUKImv=cF0OK*1exFhstO%2a~MTC~s&Q7vO#i1wCv(lDT z!Cn;oPl0w5vt~OxH=2)~1cY39ql1uvQx}WuVN<<}x;`qWx`!v%QEtTdhUX+Us0;NU^^Uj>aR~6z^--G>s_EssTQz|CSN#6*MPzT4SqBh6cvNiU!n}NaO!KjL3J4XaADaR2tZzs4H2nu%tH6C2ceDzaTEEaufMg6u z{BOt>n@z}>Nf3+%Z40@Kmlv?2N^OJky9pWeP&OE=MeYMi0}m1DClF02kxM~YjS3cl z0(kHG9UeHU^3+vUuGaS;nO7)&v#~4JBs+2S{)9aHf59z&d`x{PB#nSl=QZ3DIdAZI zkgHb)hQ4t0_=R=xE_94rX!-fGfIM zaP6e=Q_qlTqfQl$3)i;3Us*4+;B|aPI7D^<3<@zPOFrHAGhPFXAJro|pUQdT0>dIi zs_sT9-8C%jD@g8&PG0a5$pr+6kqb{)Iexm2SN~=pQFfbz$LD1Z4i1T9-%LC%<)d4A zT5F2+%A)wih%U()1jd(7sTJs6Mbb$=r78O5+S|#mh@()yLK5^gp$}WDvbLMy{6kOs z!u@^1BHMK1OJ~lm^ckr@KXg{54%{X$J`fPF!L`l>qs0D|dn*ayydR 
z*lPvAc=uQ_cie1Lk4HXR@d?5O$kB~jM>-ihpfVY?;OM3I^vM$>Dj+XHiE*Q)J~6(B zxs3en;^IJHRhw@bP?I`B=KuQ#9s6VdU;n9jB)+PY8Nx8@e;WNIACGHR93oF8ErdE3 z(Gnh!1Pvj(rG6~!>f_pO6A|ys8XpFwg5I^pjfH0)9bv{WRJH%?C1`PW*Nvh|PjRdq zj|0OyEHA)&<9itVci;B8@)awR&YfEUIp;TJH)ZuC>N2qQn`Dv4@n!_q$}3t>b-Pn0 z*(60k422_lCm*^2LpZ&;3c4=2FlBJ*WE~t<>PlWpM5LjssI6Tk5^0T^x;lnEuBG3g z58TZX5l9&e`Wx^(8n5J^f9PZgj~_4}A*dGJVUCUX8RJ55dZ6_y=s)obj^iyg{q|2Y zap~Sm7kVjN0GQ#J2?dEzW3Lc&h3#r??p_jFVJwR7b1w?3grL&5gq$61t>5JUrv%DH zycX&@gwzPIy9;toNXY+T>bt|a-uwSaQKHPOhFMnGl8~Lfva?5t2vHJIva+(NjLhs2 zN|_Nw3!zjpQX!R)6!m+&&i(!4cdqL`_kErF#OL#VzsB?VSkEIB%X?uvz-outXa}SP z2&U2k{`eo{@hJ^NHQ}P52_)zNnjX+;063apVES(*-YwyLYYWp3DKs;ntUlGev<KsYcweq;?=6KS5}oJY5Q24@evJ!F5C=h{&2#f@0QDRehkJ(1^%rOpYbW zdK`t4PK7 z<0~JoUi_?i^oayFA*0L<47LEgE>cnepbSK($?=axi7DE8q`k%}4dBTA4?6|Yj3J+= zBq8ntO-7*cSUZu8=u_}`0TU3B638@SLqv-}CReC#?i-^ngQx!KEZ&+UY7Bh_i9jKGVucXgtOU)($5lo%hy_W9iK0ZXjgplx>F=*T*I-ybu1N^k zQ!OjIiFK0nz;OBB{aTL`?51W|mL7H%!TYg zflX*b3Y7RkIGhO@4A_SFH2_x;VIaxa#{@&bz?@&V+s6SN1NA~2M8L!dE(+kg9mP)% zeh~KC{jaMEkS2EM9rT~+BGw|bSr%)5@w@S40ckiKKYkeWGu;j`v>9X&Dmsx`oS$qI(0_L@3;-n1T-?57HZ)A7;meFRut0(*+?ymvgv9@h8D=l%6spzx#j`w5P{zROt#`dN@SP->V2Z+MBDJc z%#R=-E##G>NVtkPk9k0+5Gr+`#C!;S1r(8Geec<&-57|Q{3bJTZX=-@CRm_<`%7Np zg0Qg#qk-oJTmfejN??cwcA;Jb(Hond&PIWa<$%NvT*tP0lZ@8z^|KHS3K{fCkOUym zcx_}5C6J|gIL*mKjw6w;C#3JT<9YQ!mP?p^bno!Gz#|q7gdL2vW`((ZJ<}Vqfl;jC zT8cP$SmyCn+yf*a7W)B_KhMNnh7h$3l?L`W$fd%w-$oUFf6k|mZwKZM2PMj(nRgBt z27<@wgN#4Y|Dibt=>k_SidalbXC|6itXb@Y+ep;_OXYT}Z5ROO6mjL!E$oE2=ii_A z|M0yDIh}LicQkro;%7%bO{^1s6vTxjt_PJ8%(GatkysKCtagD1>B0MhdT9%(j-gZ4 zj7#f1QEv^&%QKuD9EyR`#GeS^CIkmN_ z0jDXjxd;+I2BSPaUk7qMNQ$&#;GzRu=n&ya!ZLvo5il){q4ITtdjLQhR76CiQ#aIq(W>kcI_h zu5g5tA!gVZWQRiZunp3D-1j&D(WXiPM+Rp&$lu!sDl`VQDalK;Uy{LbgtQV40H-Kp zfo0!N3MLv2D3FMn7UrF9EPud8B-Y=tV^=R<|v(J zfO{p%GU%{AV&f3aI|!1^IQ{`$ku(4ZegO3mCppyU2-nL4et|^D7<^oUo#E%}A)s95 zTV?N*FZ#+yK&06JbLihd>y94`tMErW3uowc>Mw|(mVv8{Ofdqq_6sj{1!p-)Yerp* zbRh!cVh60?n6_*<3GnI3-LF}HArc{O9dNohlYz*xH#9aj5_An^tixh981!n?Bi5}d 
zu<8&V5Qig>Ao8k7+A(4vP%9F(D8T-<;o)@j^`tuD*LFe;+Jxj1!m2OxylL1uA?`r@U zN>X~z73!fTC%h!Cb_g=a@M6$AC?Vqd!rP!l-Nd}x`;F%vaAdJCk>ZhoZm^M|hCU3Z zRn^_6AV3I>42{%xfcuhAl0f)~w1IKZ365YV=(uF*;BOXbfj666sl+mdlOYZz17!2$ zXoH`z53MwgdZJY!r2-cGI+Afj5E~-b!D7Yd@_O|t?$=IhX6+<0ZVDP9qO>CK8aX%o z+7AW?mr8YC;L{Q302WTD>dm`elhv9~{}b#9o^*&fG0XKRYAWKd2Jb?qHw~SK5Rv#4 zuq;U;D~>>--R*c60y#&=wdY{OK9M9VVPRST*${@s13CxMA^AHFG6Zx)U7(0&w;%j` z(2QJy`R{SlPNBDgOFI@%k>l%j%ys*31~UdWULvFpj`xlQ!X2YVMI$zKW#P3g=IQwO zG)_;EsxSfoF5U}qLScFi``1!xYH9?gpCwXDq@khvg9Oj^_E&F3q`MIM2(ke*6w#_s5RyCPy@4Mqx_Q5_MkQh0_>F7jE30sKa`p5miRnKC^TI7Ij?3NXU|P zH+tCF`BJA7qmhfRk5IG(%=4nRY+4#R3ay2d+l$=mKkxkOMFuj^FSu)s!!Q8&62c^s zGBY0oGMQ0qYU}DMdK|$IO%kpWLy|v5RBz}mh>#lRGzmbt|Mk!RP^vrQU-x1%q-Q3` zHGHgNpaBqMHsE$2R1{IKKnbxP_=wl-x$roI`C((f@CGGp17#OViw-cL93Tuz2RnlD zfg%!&p0u!P9ghbFZa@20b?OElHOLCWom_>}3{2cZ6q|G%Zk4i!Wk7zRAxuW73l2C# ze2S`H_plm>K<@usDTg7MBZUP@5t-62auJIU(dKnJltLy3O`z(Ca#|nBh7Lm%* zfje0f$g;fuOu?HXBf;;hE2kB0!+WPl4B1jq*-IJ-$L^kAt~4+tB@-l8_#FNhL=+~4Hcf^{vIvRuGH zg+&Q^E^9fqcghJE_r1WJdmnLdd;9ucf;Fc(m<=Fa7q;7X$V#SLHt$P6I(^_Hz8a3P zj^W|uLY*~Dbb=E0-r@EbUpCw-|8(YF$7wG>FM-_hR-=g<^h8v-vqbx-m~_!;p;G&aWsLE zyY&y21Oppe@$qVKx(C~zMQdk(uvU)3oL)Zc;Jv6t@k zmF1<9{7jn=hO&4v{hEV*=U!)b1$+Y1Q+r(H8jS7_Y+|Uc!OTQ zJJ_HPak~1KsE*pP+#R;X z4l>dPfW}Vr-ePC;UTGZdst|f+&{jl zI}rvldyFhu;$%ST1{w+ypKSf(d|+T+ZY~7?PBLrC?);n#EEm61^1K8&l$aD1b8M4Jfp{^eh7n2 z+vGe}MLGzD2EKvs|u@G$@t7@##Hkmva1B-KUb zP<)bn_HwfDvE9Yd#bGt%YR>*eneYI(4N)q3b}Sy>PQ*Y0_o4W1SDM>q~0FvLwHja%w?9#EP3{!?8HFw2osTbzz6!jXZ696Cg>>{U(2q}&QMn?k~)iyKeD7m$TICA?l&_1 z1a+l%qmAuuJ3M)!Zbimle!pZ9%65nR1mpjID@p6d9d#6&+#zd{lXab#Jk6q&A zDHDd?7ot?>igSNNX3|Ek0_;FYWn^KYYvx8LL>=G;k-uMKxwu5mOexpS8qyIrrK!ppd>?qHsIDw*5<8Dv`vj~?l z%v$S#>o;?LMcV*8cgF79>}=WjeUMDRe5?Y^A2C-WS$hMKv*C8Ix2C~-(v7g*G~aRo zH4Gjfl)bSRXO`Y>s`TE9>Q5@ci1;L#r|8W-7vAwmP%A4Vmh$aeij9jZboiMN1Wkv1 zEe41LX8Q(V8NE_Zvc-V=Wbu&VKI`iOPQG^;&Cm$4?LRfziL+xP^69HZW3YP0@!d24 zIFfjlI`_ezTo0)Zar6)uv$C?jTBpLK2fqYKQYIh^k=O#DD6rNnY!LZ^ZWNsv8L}`2 
zP7svsM(9*R0VZvNQ${n|8l6Dx1rq(xx3^N+?KTVI*|MjSJZWF=>S2?3}LL6r{8ZD+?D8lnu-PCJUs} zHmsvW)?M!X{Q|ppPvATzM4S?aQDDuPrQ2HAmU>oWN@ug?ey19CaK>U4uJm+t>TsyQ zW=@FBqu&~#7s12C|KFXZmGT9)x?KQz1Mf zdXPdDI|aGz&qT^{-AP_Qgm2+!Du9^IDq^%e_0EJy&>1jJ60qRj^^>+ojEq9b$&YN( zs$jBTKx^Qn!w3k;=aRjITt^K~ZhyWpF^z|6h zSLTstVPuqhy~Y!QOt?h}Fn%x54@hS?x-38f8`E4FnV2p?%fXB04=2I!te@>QK=5cX zn9-Krti0bKI~VK7{yji}81Tr=T$ps+#f3cRP>g8^%gB0%n-di#4X)odHhab4Gd4MsQ4kPJAmxrQha{>12Q@jE z46_wVmcJ3ijz(FM%N~9{dVCgAx1zOvk*Bd4FnLYiF-DvFXGru)C*ZfDPS+N;!kc%~WsS1xW59av8Xhp)T6{ zpue99mq7<^{V|wUt{SIiXUC&t#tBbeH=YoYQhkRE!4WtHYYWsMY{b!sTDUBWx2(^N zT#IDmwQKCVF5^*D%;GC|gZf0dEI;TbJZeT8f_@q8zt2elggzcpoKQ#h zruiYZ+0o{(0ccsLTxp1^t{u&u zBlb03nbmO6)sFEoq;_{eB2|r9lc0#$u|Y}4y_B*SwHzumXXvR{ z$CV#bq=ypza_f1mC`vghv^5}b@7qm1ef+o`D6!Anhzz+OL0%zRl^PoUd9-IT86!W~ z_-2XK2fS4j>N$Y_ZAkVcp*+N{hbDsD=}>Tze6jWGD9JA*0vePmx98n=i?k7Ph#1c;0A4n)j<4#VP{SByhoY2FN9Zm>2vd=(Mg23p;4#6ay zL?|txhMJX$Sg(~e4y}mF*P!?f%ZlLQ>=4WyQda0?ssS?vuSm5}|!;OtO zIuB76dV~7et`JcDz%3iG4)_P)u#G(YV`Mb2^6(|wu&KiF4LDyq)K-r9HWJ_%uayRG z$T(j!^;{g*7Fi1r`C?N}o1BJ>PWePHYyIo3qP*hnwDD<|P zAbv=rkF5%)z&Hx`4k%U3ZziI05s4IPp+fv|}ZK4|K31(gOlH_L-WHOoO8Zu#2>-mD0vQ^X5N8vV02lUy;bAtc#es@l*e&E7 zAY&$obq&DzbRnn5@CZqy|J=uiLyQIx7;y=WPfu?G%!AcInkIy$LUgg2)PHpz!&?B( zkpU*zc~Ynv6#~9fAeEEgW@wkfP^&qhK)`HfWt^MP2S#G+IR9G=BnLe#KNSWR5RzQR z-xUJ98$E8IiNKdwaxX766^7}U65zTBYIh=~1Nblr5{{~<7&WI*jyr4R0Oa;-eI zbY-VTItN_`TDDaHm;@03A2d(AID#~1(3MG?8dZR(q~yWu2H*f7*C0BOK$h^^`)9Wz zLYogLKj@*YTOz5B9617ZVF#*u5?uuFkW|VD^}?UtM3y!FxK7~%oH*5|KN%p)faG)D zqYi~kXG3I%3Ql-&q;y8cvlTEUa`A1W?rmhc7{XG_UvII5s4A z>1QbZ(6dlr0boNT*iHT0c8aL>#fbw2eQs^FqTqRK9gkb!Y{I?5w{PDT$d4Z4lI&fL zKrAMaDUR|t?uA?TXT1+jtbsnJ+eDi~7Q z4n7c4+!wuF%p2DSAld!D7#A#h;z03v2Q##pE~yq2+!EU3Oi&OcWw4rrVR>F`f~{U0dD9&YDimH ziOQVB=n?GOLm0*Y$o!~|9(^MplY{{fd^nTwfMkAJD4&>svV8*mF`2OJSI7!DNw2Q#vCQ;i#=rug(n$~4h_!32v#b(T(b$Ag~z?S+Lzz<*s9GvX{ zXrXf>^Nh$sYD(`I|2!kr+wt-x3xazoagfjV|6m8ggm0|O&ESAO+w!hA4@x0m*|N6B 
zUf{%u$5mr$`OSJKhzoLX>*?vWjeCEGOorp<H|?FAIETYPlci0TGuA6=k({N7i{UOTcAO7uY6UP`8O6<3%Yp=|+Ut`01q zH<+A~BIf0S%Z(^;ogjp{dtpr;r6&!h_ksQ;DS>g7RmlG8!2SMT!2+R-p+(K@+aBT^ ziq-%`LDPpqHSdgj=bi_qhv z@Zy7N2x5+b)4z$=9ECo;?^E+P4pR$cW2=GYMVYmfovg)=mQ@8dpt7bzpaP%_0+iqW z;K4ea78ozrMpi0T|M2TP=4f?NQlKiHq&~}Ja%~%M2`xQ6b&&Xk2ta4VgFT9trw!%N z>=UX0*bz{jl(uUK3F>c!!V#q+F}_SVRYrnjtEN%O7$*0vHp6n8xN zN+e*86nUs#twr7mAW>uY2A89B8lbBGDIU!1PA$AAsi;_wC9+#fs^B z>|1*cl42b|Z{jim2iNRo=YR0D$3Zfd+v3mplFg`13rvJA{GQ$e?2O=V`0QU*xe=h*MnD-U4gej|RtN{DL2C{DhRNOo zJ9g}-fryW^?Pz^z$YZ}hkT(04Ks7HFziG3?>nPXPi-mU;Ve_8r!$AD1KEx&xc>Lz=lRExy4(l0 zJMYG~I>!Y^-n_wd`6FT#B*h3QVK;i;Hu2cX$6*bpk9s2JmqZ#lV$GS|V>c0uaByP7r#6FJNN2mUJ zc5@W&T(9>n^eCwz^WuCLhBgW?9LZXG08uZAGzEZZUS;sAJ&cBsJ+eIk@%P9;0sIws zuLo+HMD(s$s*I=xz%l_un9>}bWsYzpEg!jUh*AS}pW~ORsxEYi+}zx`I#qkc1ae%h zHm<)EsO>lhr56EA!c{5JX+b?<@y78;4vY}#K6Rw6FbP80j|o;6#W7>h5W)sVHR4F z$c4%E4@9I)E}OJMPs9pja8U81w;ZW4p(+T+Rf}-~a2ar=Y_ZNkHqba8)m>29goj7D zOzKo%{{#rym*kEq9TwtDAAqL2egp{iE#>nBD zFSmiuF^j*#jwrqewCv;1J)qa>1XYTzkJJHx=;Abwd{Ucw_x3HYMMe$|E0M1F=H}om zS3cvO2b1$X()NsDzM?FYiuLLGZ#Q83SH9%;&j@D^z%6WdO`gJ6B1c`LT zrLA|pu!Z;KBL_27N5J>6cS-&bU_kh>m{9a*$a-G$QZiJ->)C@Ik(56AsRh#7q!R%& z_+rG2wvdH4aU*I5LO!SMv%P{kWVZ0xPj_$P$Q=~yHssqc9 z8#yTk#4laUzKB|+aD-S7A-{b9{qf?l$d(=`MrZztC2FDf>~r9HY()17g#o!%Q8L`R zus~}(XD^S)D1ws#>nBxmX!qnrY;GV4q>X8xocI3o)6;pEeWO;P30&8lFCS*8p3oy4 zCg}Cg{$?yXYO1r3h%RRRx{H0Oe*h?AG+A}~V1zLzjcD%#o`4GQXPh)~E!kY_V0aFE zPtv2;?5;83SrcFqr?p$#G;96{b(wWVgsHFwn%f>}QTL&8;@al?lKTt0$Xo7)=7A@f zswK2&lBM0%ZZ;`nRv$Ezec-d4W7dUASNvFB4q5uD;V#3lYipCZSh5p4LX|NdryF1G z9rwpp+4ZP&xf8Yg5)N#S_flsnoQF(*Jw(3%Fdn|jGn&bC=PJkAMTANlQk4Pi`<pCU8Bu^ApS;6$oSpjs$D^kL`@Lxkw*gQ_l&>&bLkX3`x34Bijgr z-;%mNJ*V35mBM~&`75P}Mbi0Qh1Njips~g2Q*S3fCc6!FBR-UW!e&}pH~z{^sG=dg zAj8klSA-4mw_bYbwq=$OpU~itJ`BJGNwtAHz-Q$L3G#8o`|VO{@jV9)9r#*o>)4?C z*avNHNhY^9+XDW0=3a~s$(1G}6!Cjzn+DG0N4j1tHiW<`_lnvbp}6a)K^@U|&>xG< zROVJD3sFSfgvonJ*vN9D=uzs8;cb~xZ3~}5ON7KdIbXCWo_qIqP1$_gi$9sVBtaK`r=idv>;kel(YT` zQj(Hxi;bJSm%rL>m-pL?y 
z*uwLR$DW2ZpBD+)+d;v!$-$BG#qOY*X53_w9pb-@Fz1erwxwfX{6`wtasBVhb;lsIpGe>+S9ZuOd2g^F5)QAINEnO~s z3*VqUOnbyp?aHxd87eH_`)>^@8#5YD)c2e|BkTQ393E(w9qw^q*SM1tpI3^G6q_V^ zgtvxR9S{s9 z!b(ljpO-h(xm)X_^1r_y)`Wyy6w}Y1%9c-Udfr*!%b}-FX)&DCExK7SwJmKgtWU}( zXqMibnr%(*=9Bc0{%~67`I(bLv5$U_N46dOMgd>SpEq7C=-6m*?FTQ-r=@`1M?b({4auQ*&Is2NG3QryQ&-@m*j zI2UKtv0J&Jg3P-@%1;HIFfIv2ZT{+ZB>Qcdo7GnZY3rwz1^4alNsifUY!tZ2t$D`4 zAo^^J^vX(GgQxOI(Xnxrln~jk zvDc?S^*lD_K%{kD+ca|C+Whzv^!GruGmWuH9DVv`y$qu>=PxK(6~k4$otMIUe4TOW zzds2t;g8ax(@(upuoxN67`{4V=Jdq;!-iv4wHG`+wGSwJ?W+!uIFK&4eeCUlwx@CG z1$z3?Dv)uB`w5C)2#Thx@Sk`wQcsK8rX8I%X52;-z7c#TKpOD|OR>Yd`L-}+#@^WT z^ctm)55GGH1BKr=1H0qLs6}nGZYT?zMYq4YNd=uKKbP_>-OAe8+I_nAY?f)O8`hf| z8f>9qm!;FvrNV$n_L3l_10m%Vz4A`KR)1tk6&1Ce|CV_E(dGpyp()hjM2CiCm)Q8F z07{r+6oP-7E?e?5wRbXPBFq@5&6YhVlR8-J2stc6}=?Hx9-o+oj^2KUG=uX{wAVWold#3HCf{TCt>~v7P zaHAkIRf&U%>5)C=W~tCcVmyn#5ry|3S>we+|F*jg(#i!~Eik20ue^Ocy8Rz*5^8ez zAR>l&z=)YFA3IL-w(-cJ94kw$Jj3Ct|*7RDv7Re{+uR6lSeHCuRI&k3KK$i(FA2IFxCU44L^WDE!_Hpr`#&YY++dy1f<(4(;AQ`1Un_@Nn zQO%+%DdqLM``Run+?rD(^_?efn!UYiGx5v&%~uY% zRjkL8XyzGrtk|G!|NE<^&+ebSc%+!(Q0ui_r};kMfFXtWJDJRY#f_G&Enm*mUn}?V zOipDJ3%Y%bQ4OhDBR;vN>Vfh;@wW!m$Jgfe*N)cd%hA$C@l>BKzVP&}rdlkF7<=N= zH*D4tu53`7!EA>n|I<`ZM0ZsyE4zP%QsVQQbcL@2kvI7E7|z&J)!k6iA3{sd@B6dw zu*>_A#X0R{t@VMUO-Xu}Lqn6ItlR`2zZhG%SaPIYL@;!qP}l!YmaGY~_!Dp@JU~4S zuSpwF=h;2CckhWTk1W&`Is5F^;D_baF)H7u1EXS*^n6yb_s>x7I5-rqS7Dv_dtfza zf$%$*9pjJ*g+2e2kJjZg>~w&3G08gymMNm|C%h(fn>O9=&kQDN@gw?7`t6${ zGiv*f<12qoO$Zt4@FsTdALZ_vXb8yr18BHNYf?pC#>)KsvLl z`5U92-k6{kb(3H!%ky8dv!}R(3nljNVOy-Uy(nTKY@p#a5W#-K&Q8FX^7QH2kP1iR zOx~r5r^UK#KQ4HR{Hgb_*A7c{wIWpmu#AY-Ovgv^r6=RcO!)(0?Kkc*6K0yj#guS@>*Eoao)n0*r-`-}fnqAgZb9vm$_s=|yy0i9!e_9%U1w@*^zDPMq zuT!2K;-|IeJbl_^>&&$F(9N_^3X|SX-wa)rzH`q$dYSUb(aGe;yA-8EkVCvbl&(9} znzPI5jhNlRCr_dzO8o`Dul)AN>fp&SlUIE<$#^{BXIHIDbY6nK1v( zG5y{z+#c1Zb*P-Sus~CZHXE%h;`k2+o87hXO?Z(l6{I9#cgK@2{%3lewu=>FpP~n# zTFCqNYGj>mx@t4g2TegjFB8PNAxL zvm05_O1$aGOO<#<7!6=od-hvWV8f5Az0s;a 
z#v|2VTQ*-4tZZm-+4Vli*)l)viC(Q`-DabsB7F^eqs_Avb&rkjwKn~6?TKE}mBFfA z=T5q6&_!QkU{+H)-WiqN+Hmf4?$P}}6g#^Bwxs@(x3F0IxP#&Vwa4s?%q8Z#P}OP>>s*r&6p^+X)W3fEhpQ8R z0*6}v#&>VEt^{iA^k-&H%3kGF>0m~HD|><=XSlJ z;MpOUV2=Eu8_)Mx3D?J!2%T^;=lUM-;oePii!b9L8km9@q0W2G{~ncxbZ4TiRpd{; z!;k8RI;gfCDN+_rg_IhtvrRgyrSs(nwe6SB%smR$()E#OG!RKG_|ow%%ZB-~>V%EH zE>**+J)aIu6~1jcmmFVR5?gGjee9^a@MGHZnSYO+uxij8d}rG#(o}NdbqjT|A#-z! z^I_k(J9YelV^?_uUOQ6Mq#9(cYx#ZU^M#TU{;0FxxlMh1)FU_@ymhJpg%bdq5Xb;@ zDJW_zV_}YVfNsX}{+YdGR{Ps~{$=e2W#~I$D?leTbH zV7K^(2iW(j@}8WH4CexwHY^YA&HJ#=*8MKe&jCYz&&#gS`&X}@dUS@FIoCoAawn|< zO%XO_g&o&rQ}w(1H7kTCCljKL8ClJegsqH}yMo!1yyQ;poUg93E-bb@GJ~NU7ok&i zg7!H=Cd0w1ze!lBrJQ+OfxfIS_P5{P5ewp14nr~ zuD*P8egEUh6c*os3B@M$?_|b1SEh-?eAB;m#;;=6Z^G z!NVGfey=vO^@QYvZxC9tyj|oK`>-fVnIa?eV!r96qg<|g^(oR^eV~7U=(b$N(D@ca z_$8Msw+2V30vo-4EDHJ9HCYann=r0z=Kl2ZQ%*AV8P7LRBC7Hx5*!~;-3OTwmbVX7 z*Pgqu=PlSYs`m5f+84`k=DFm^sl-0LVk8_b<&M-SxkEXhO}{lC_B)Vz<@0fmAtUGd zE7hW9*LC`H`_e33TnGQQAZ_e0q=bE$c695M&Z$-=pK8&reZWH7zrJ>HDakR49^-u) z4Jlo0?eA-@+id3eguVZDR)P(SvE+_RY0l@$p z1IwN5q9PVq*fRD(;)hCE7Pc_foS+|@`nj~$q5*rs@f;2&=$&`fhw#n({ykk0Hg%Zz zTq1KNVGrBmPW8MOTXq?2+nTQH-uLmDz#(scmM23M(cR)AG3-erw@S9{Okl6#WjxVj zVyfN*g2^?1FPm7JphaPy=6VW;MK=?X-+#UdJhiQ~6NvF`3oY%Pn@~-0FSQ&Nfy_z; z`rzNqcMa?!bTo##xlS=WM?giRgomQzNL|xgo7&2`FOOOkc7otz)85XdGO5l!k3)p%U|0U z`URY0`nb{P9>#})WyJZUv7LdN+iN@O$SsV;#SJAc)|}J+jN5JIxBq^km&czzOBo(@$|>gxV_{iIYk$a=@z^%)e{%}A&}qBfe8bZy`{`z+ zjz3!22{eyyamK7aM)`W-)ax&d5xm*V-QGT>o_#xfxFS-E>u)Wd>q(7yGz55*OI zXwuf={;imZyjn^1^|6Ttw8Q0y>hVaRSL zqY_ci5>5$vpLW|fuLq+@`HkkE-7ahUjB&osNZQ%Y%1tb4k?XkjC&eV?>{|I(YNj4t zS@E*z@7iPD`)3o_TVWUfCx801@4Q{h4N;S=3AI&^p9%E&cWQ-O&m80qRX>rR_vrmlMQYB$3m5pya$9cN%5kwmbPdjvfSPo@ z1tqb!1WhKyozzzrY7U;^=h)y^Zl1+GJ;Ito>`D+B)?_k4E$b=ZEJw)4wL}e*2@Ta9%#f*L~7CUP!Q0e)_2m+8s0UiK#h2#9caYq2I1k?kM&2DBz zbu(}RynWyg4)qh?46qwjUJkWcTQhC7_T((J#t>u5-?d!HMp|8Vt1HJJ8s~bw(=q^F zA>H~{CmzC}T6%jO5!*P4sn4te&|6^%$}}wJa(SUmqGdBux=?F*I)imm$6#<_QW_7L9 zCcpbl`!X_BO0y*XTsaN+VbOD==8O0p1xlWup8dE^ziEAeI-gKUrvodkc9Gun6PnYf 
z^D>ra%p_?`?q S$4Q`D7t``r~UUrWMh5a#buEKYu>DroeG5GgDAchnH*b-lf~Y zOw8KP*L&`w4*eWZIJ;ar($qKFGSesAqnoHxNZwqFctr9&NmF5_&yA-e%m17h&|=;v zCN}Z1N;o{+<(+^4WAJQ@*rR(P-xMeKzrD|CitipcoAPL?lUrYh*Y$`PXF0_trKXjl zzmxr^zh&LfQhTBqrE=E#Ki-CF!DQ!UENC9QCPk4e6r3EMd*SxoL3iHSnsdIX`KNXC z@q`Yc)?1C0$L!1pW8A$A+>eRm`Fofgq`aDpSof&zh%NWMn82B#d`~Sg6b4*f9A&&Wg1bP9XO}|!-`(CWL7192-DJU#WRe}N4092hE|4q8Eq0rjaGxtV4 z(QPPJuun%|`m$VrxX*_dvjPvJa0tRgO(L(CmTz*>OYCH3e%vT=lD+X*!uCx;{}iKZ z(~ehP%cNZRG+Vsl?(w}hzpj1x`JsY_XHVng4C?v!B+JhyDDf`e(s_0mZiieo8kRr5 zA4ng(zse?a%Aht{M`R&|S_LG=`gJN{k6ye~dt>SK!zJ%vYW(lpq7rT!)&+k$rb0P= zn3?VHc0QD`iXQV$Kdk5cHsdsTduG<0$vBu%SJ#BAqVF@!fE^>#x6ZC5lEue&p4}@Rp(?fo#<+#rzL5 z`}q;ga~XJuqfTJJ-M_2P@4=_gGrL1D#hM=uHz0AJ|K)WQ^7{H}?cJ(e8MBVkSiVjO z^8Ux`GWI7F{V|Ds2H_O~$P+-|ynG*7I>_HKwt6nl$b)r$pRHDe`8g|FVb+q_V^(W_ z5b~0;*nJDGWv$?Mhj%6J++e+ZyOv@29>XYAdu`iWkOD&ssF`0PS7<6&(B)1g0E-#) z!?I1CB`x9-JRbKKgH`x>*W~QJxMyF};azd^iSE$k=H9}wr#MNH`SfYY(em1&%z^0`oK|BNVb z7pJqbv9l}q8NaRf2&WvP0#HOI_G;6v?8!+f@EF_>AK%HPl^U$GgCK?{c(y%h6AJ7?v8nk0OfCRVgunYQ1RKu~YoLR={?HX3 zmt#ycy7$kB<(z~(BeDP1CgvWc<4&q3;rGu7EEOKTlSHjP_a=Ged!1iF7yBapwTujs z$3gNw31x@^9Voj4{FqS9lG3Z??{5#Nk;af-TkiKW_5U+jj+g)oCW1702cYOu2f9HQ z6Q8zlU?`%_^rNr#=HsVNyO*1|BWvO;8Dj#c@9Q#CVRQnUA-zEeHWe9;CcRErQSn_h zBTZ3}i=B6{`XPFb8nqKCL?#Wq&wHO)Bm{r4FuJK#nCGK0cf;g}tiWEVITEj44aLMw zvx+lIAN7--Std8$O|e*4VtNJF{^IPgVbyj-niLg!5>9^>OYH6}_0zE{|6E|I z+hUXEI=i3uofaAQh4Y%aNt_Y-TOy%>sS}RhNCbNZ_`jWN`CsD<`>E~RB2NIv{Je`_ zVKTFXq{zTSNlD>-36fHL9#Vo)FD*T!3x3{9J2=(S!(J|M2Ydu{!^#lHSf**fG3CGc z74(}3d{6=JK4z@FECh7~MBap~!ebqhvb=NZeLSWJv_F5o6_y8eNV;o~)zRUR^fw1Y zTqy@tV`IU=x7ChjRXYlegn0x6Hb9C-lp_Ar9*C7E6Dt=N{pwD>X2O~K0LglYZt8Il zF+h1+^7Cj z4S-9Js6YI>pMD?bpE;5)G@qggkL}&SC1IpGM_`S8<|rq&FbI(#L63wb4}Mh|F4>cz z`ko0x>oQ-oeOeedPrv;%I9NOXuo^&W^+KKDfvHc)yIs@L1SXID3X+`5mA!RI$pY)+ ze}rztb@LHr;>N+CE0AwXbVG2WA?blgy$F!}e}W@tBwY9lKL;fX0efWWv~)yUEq~$a zBgQ0+YS+zHcbd;9w@!9PI~Uf{?T9F8I^#E?o@7=Xa%lRA?ozq+Lx-?bM|OmUjzN1V zU!@B&8vZ*H8d7&&n<@_kG%->^5~6yWZH5MUMc$xDi3kGHSWU9sB| 
z71q~n+EieGl|Az9r|!8aLQX;P?Ts80g%HQTo~Ym99*@V0%?C943yL_B$46w$gFn}Q z`uNej@{u^*2`8Ubx#;qK8G{lY2=0haywa|jw#2mb%`^CWF#QrvAtKaA8BCJYz9U9JmpU7ef)dB#7!aV6rFS5$0+gbBB5NEaotJ*;`O-?N)KWRB1%Pk zsqdJaJq8IsL=SEiy-BdeBNlliM(**H%bCw}ZwiDkgp_r^dMVw}qy2OxbxiLIWB&ds zei)-Y&>Tp&9%3Hl3Uf@TP@bOPUv6nIi;yt z!bp8Tn{bDCsKm!k1Q%qK4PL%liKpp26BK0-1wc`3|FIFzp#v=@WOrd8p|O}5kO~Bi ziTf@GYzI;Mv~;p#TpbS_g`3y&Xf!V%U0Xj;hl70`T5MeL_KDM$w}J*B|x?MXp7>S|6f?w!?xfx}Wa{ zJ}(Jjh2k1N^aVmwphcMg7vR6!5MjN<@ln`xZg^1OP>!4#Ae$-sdnx6Qf0$esU>aLI zAvw;yKmEp@!V`THQ}G5DLl?j02AWqmUu`cn67>G{-4R1#{!8tK(2%4@O+tW03_|@% z>(;F!wqn1%#jka7s3OkGujdY40M0T-R#vk)Du_Ns5B*;Ju*s^ELg)ip6t5P?9+Akb ztR1`cuU&?EG)3DO(~ly`%E~JDy}>|7*l!2zCjax0w9h`2W{u#Xx|N@=t@!iO27!(d zp%hNUsYfFug`^S$3b4VNuzR|L)^nlQ6YHDUZV&dZ33_V|u)CqQKIsYrGt+X=U zEIN>=5~DEG{EbgeBl;Se(U%AK!&Ql|&5X~sP!c0P#-LRU4l@%L$aLwBXLLnyQ|uQm z`MP5}4%5){o))ZE*PuNA^X22S4ulpUw%|>3`9_EuqHBk6rRV9_{uyEG$bzsiDkSPL z+C~mQqwt^B%m)tMfZr^Z^;cfHyl=9qU;e(*d9(4X7!qW)y8}X9N(u`HU)LP5i&vrc zfYT73`gyn#W}74-|5qc=60#^~N6<`nZ7M5wr}a(|&IEW+YD*O`c7$I-(m3_?w%&k1 zNy>+ePal)I*Qu`6?v10Uw|Eyh}U24;VGc;=rN z1FY(Btg0OeJmbttb|`WnZ1OTnP1Su`0ul=pCFx4+YioEGZk;GK&!2!c6i;n`1ACz9 zVGBpAZ4e~~{9Y_+&tc`_`tyVCQqQlovnIETjjq-`94V+9Wn~`V{wlS5_ior3kzF9; z?q<1*g^g_zKSpfu^z3yuf7Xl2-%V^aK(S-}Yn2r_o`q5b`71BuliMF$iq-nPWa04h zOKEd==lWOmF$Hwbv-N8kTqJ*-{mk&#JfB?;g*#3d`cxgW{5AHV-3u#0&C-YA@_~eR zaQgAj+RgFE9KvPXWBUO$qj<2yQ}8^3GM*zZGoP?bLZ0>lI(?|+ht65tA4$3+33CiW zcI~?s1k-yVCb#>!{oL;qI*a`UYO;mKWVe6%1Pw;Y@982-Djkt!RtzTHf^46#YXw^T zOvK2ADQ!ZpGqMdny4~^8HyMLNT<8=FWnnv_MnhLipL{z>VOmrM3mtC#{%S@S3RG|? 
z&@AhBOKWJfoqeD9Q9_jmZXkY)x0CB<(0o&0*I*~ZsMfx-_rq=NBOVm*H!;?=`ZMj2t>+9?7_=-?EKxCSJ zCh!iD1ugsVEiNJ?vPJUSIOUf&$G=ovG|`3XBF$f(i^Idi1NQn^Kh}EdIM|!hXN2Jq ziimqnRg#WP;W4=5K-R%ztHg?<>2jj>Zr)ZN85viD-?tj}$AAyltVtaAmV+kya!JXF zuzRv;G&#@VLEuZO7jUP)jMW*fj-r+pbA@T&cTD+W0GUNaiXHL>V5vkv;HbBwtE9s_ z`MfuX8kKZ+r}FSfQVLn#r@hz9VfbbjNqoUMLi{UryTWWeanzyD#+M-?aJZ(>>%gq( zb^T{mX-!q-CB1g6dv~BPn}oMzD+QP8 z+8e?jidb-YvjLI4>Aes;SydI6c}&f{7D|za@b4x}i-BdUp3 z;ERv6l4!qtG+(ePs}F@$>{pPIi=D)SHYrj$B<&lF95 zOhP`gx|3b4$gpT7r53&70pdK*rj`!uv-<)BI2s5NA1Z$gHDH(XcBtQ4tru(`C_kTJ zdb(01+=Th}&$$@5^d~?`LqG-5>K-^3Fp(|6ea`S`e#P~i=!1XNM5_yCKKLl+< zDm&t)L*qf5fUrQ!epo;Vd|#2UYhY$(!6BN2!+9kXK4~w__PB4|>d-;$wBIQ9iVAfZ z?6&X**-Zr|^|;djW9Y*eqGr=wXa-1_Aw+WR=ra&V7zIb>-F}wKJq*$6C_zKB&hYfQ ze8^Kez6Bu_wjp;?EVjexxcKe8z2R$&21jkjFnha z5(Qi{cYV5)f_g-t?&E_jx=MF>%3F?ozt?t{FB-hw(Z*>~!DWuB+ve1&zNhz84{OG? zYia71&u{R=Kf0$AQqYuyEHfT)aiedXo}Qj@@NYT8)Rb{3LVJBARd`+C3}zLDyMLdU$3utC)TKk<8j> zSUA3LkwM_}G}yM9Takzs|vMCv`Yp8*i_<$DlVu7@Kx$ zHsV2?cgbR8<(pSkl|*U_ZMglY_ldtj&LOPyv`C&u2F*<)%Y#yi_KxnlsHzct0J}!d ztOa63yw4Ux-X7adU1qbdo4DZ7PAT&yw!MG93q3i^>m3LUAXaR6v$@dGiU&rLMs$t& z>9csjg}RM7{pu)ShO6lP&h80e(P7qr>>eINb|j29wpNRBZ)4VoknPWxW&{R+>DRm6 zx+Ns&a?O+JF7*eds;c;1(-d#1Xu*QP?O2h#Xr?pA3||(?X^Wlhad8_E5!i5xh0m#a zY8%Y_#36!Hs0)(A@rj8%DIE+Af!Qexa&hA5>_V6B&T^tc$kp2Aw4AhUY<~gjJ2&%t z$Hex=J)zB?z0t4Ucjon1XiOTHvFr|9g|hF{mLu2b(3jv|E%0$QX8a!hLBE26!t|=P zx{B$ioVNVKZMCj05emIYvuhhkwWELVF$ILO_uig3G_w&D>wVvm!v(8yGd_Tkcz2AFZ``XuaCc&ml zOPS23GWTG2G5c5+85a{1^9=RsJfyy4FFy>@@)Sj_uO-EY_ODcV`>_AfF&cJrx9;7W z`uN7nm4CWkyx%LjhaI)EsNS`*JhNi2@#VPny%HP8N15-AiP?JW$nAYw)C^N6ynMWJ z)MU3E4Ppii$U5D0eTnw!_LXWg)(>X(4OxfOpx=6h%68oYFHBI2OoPd7#Y{SesjU|) z;K{zaccLde;svZ7qo!sxO{nA;x>+7(Vytmw_7!*4#LkWgfDK6*Y_zr6sYAw77*LhL z93}&z{mzE8-DMej$l5VRgY>ogvhELH1GbrJHPuMAE;vE{PE*Q=CEc|jtRR`m0ww@g+`05g#mgc>#Pqv zK8wKF?>W)qbpz>zSi$4CR?#C#bK&1gGVAe;?hEAMCJ-8*q`!J`v%x*b*B?EB+lMBb%! 
zCm)1)IIk&VK*qK2*Tit$iw2Ax+NRFfZkLEP@tt|!Px`fUa4C*=&@u!mbDNUY?f$S% zo!SPuR1fn$@YiJz%-h}^&YxftUUG?5McS73qq<|>{reTqjO`eM@Lm3;f8yfeL-o#^ zE$A}8y4ky@5$xIS{VLAxc75#K-%?d=Ls}N#87ZUS1*8OPk2A|Tg6+cspMA34HoL@G zzHQAH8^;V{rF3nz?vv$JWu^Do6LjsPNu-dDyjR1+JMFuAo-uQ#=Q zdx!>}k#}^TLW~|gzg|y-@?dAaLwv&dgvzVZQGXg_WtBQ7zB2IYE+c!)w~AU%&sR;hYo7dD75=%uiS=-*FG@dSP`=;ZVGT|*&$CqU3;}j3B!ulHVNuy zsa?KyZ9$s|tEB!qgM$=@rVgk-$!&y#Za>GKm8m1UU0|Ygi}$7AY}8@jWoFvD_BB{) zWavWR3e8eK6|s2CwzUn4j(0S1?q$D&uK&rqUZL1jCUm#kwr1405yfvGru}#~Jf@|s zcgdh--!+aMKkx5ns2lZ{d9nYL9c!KR8eB9W5+IVfHDFY;hz;N4be7tw4~uu`gcy?5 z-;LTQMIUUy;%|Y{8chA6TmIT$8J7^NYrlC+vxY;y{|K}2YB9Hmt#|f=(@O5HuIn!D zwMtU9H|>DgU!#5HL;Lhq$?{WrzR)w$Ccacv?P4D8d!&{#FP>S1+)zQANUJ1uR8CPO zds=bUtmyEj^vm%MK0DJ47zL;X?8}W%TV<&|&er>=^@!o9gpgws=}oeXqiwCXnK%Qe zq-556v2E+s)0^5>XE^FcX)tEUfY8=-`^Jp}mQRS$XKjF882owGB=2g|U$mvV_gAmT zXz=fuGct{bMDXZiZ#3*CRMk#-prKl#;FC$9a1Yg_z;n8m;B>(c1gM!68ARgWtY2{(Eo9r+4p_h(9y)B4=N7h|$t- zvL;*^ifpWD=ZX5^IXNNQYa6L18mqKRk(qr~Y*e2`+er|};yXp{$jDGsMr|meJlEhB z4yL1c99O*gZ4%MIx>dV%7fk26jkY-K7}ILYmP?JRUaTbvVyv|N)Gn%ti?0>dTl}^@ zDm!P-wpjT^pCIP%m!uojA}pqF>o#q=Fh&SgQm`Y(kx3-C*|Wu0d*Seka}O)J5Cl;2 zM4^Cx^Dajo92-kO`kCt3IkMTV!^S4J_9*Qe+(1D=uD*AzmABpeGkG_5=RL_+>=eRm z0!2^3j`byt46jbx3)LN9y$avY;iy3b+!s+Z92r7qGZ~$l7f7&2y7N3G>e**Z)1&k;F|`qLBz*^kd>J$EJ_1x>@C%)m zb~M^O=KLL<&o?G$H9`+U^bX*d$KS^+E*&vJW@9HW9m1nU6SB*rLhn0@5hFZ1YOPgP z1pWSLCMwk=>2IykY2#&EbMMHejPnJ&&$|) zz=8!zLp$_0zF_CAizwq$l3{DqD*W3s#tj7YgdW9b?HAScCQS+*8VXU7c56KFbHU(2 zc3ltVx32$?Ss3(+-sw(URP*6a+fPTCK$za@Z|%W*bU$m@)uZgI0_#TIAmvu+>wZAn;BQHS~w3At66}Za=899pSmz*{}BE0f$fDsN0JY zN4Ev;ay)gDJyK&~VA6fJp=)1G9nEl&eoJp2X{9u@%Wsk3h3^p1F4DE{scs?ZM{6q^ z+}Ndb(l>&^(|AU93=-6v>XyX~YnQ^Xq1r{UL4D;KreZ|la;)-JUVwpNq7r)ZO@M@C ztktEsM;{X9g9}!fuD3}T7pw4oZE<*)}%L&vO<0a5iqfZk|qjO5P(AmR0hhlJQ0ZTXwt-e?$A+<);iwh zXdDLPfDfeER(5Bi|D*cPs#XnnEPQACZ3m5YLB_?cl_n*m)eCer(F_VtOl_;7m_9u; ztR8cbB0;&jMlagO<6VbI=yzaBiDyDR1p`qexhPc*H{Y=X`S6Qz#%W~Q?C>h4kcXnu z(GqP`>L1Zt2=*S(uS=_SGG!_%OGv)BX@-}kJv!g~Y$x>>WkdgK2@8RbbSSFN#*K78 
zrJ2XHRTxTbj5HP}PwoePamMkVU?#G_^ZqB**~!8p_Uz#94Kyvv*{g=i)CeMoMfWId z3cUt7SZOc~XE3z}Bc|Ub=i)L_$~}CI9RnuZj{N*f>%WXe#ft-uHXPzqfDPFh2chU5T+w z-J-6jrh2`!A?Occ{}+*gO1?8u#Jecu&sx<%+DI^=M%3xey?iAs<3ca4f=V%_M?%-R z)6ofal)=Z7*dp5gFdTT{di6(!qsQjXMVboOqVLB_z$&e7aYH-hMfaK@77_z-q@`B? zE+L;4iOxgzg|EAJ?TWLBY-1i~h9ULIn=5{I*kzeB`>}E2_zO?xKY8{{8Iy3Iz#gf+ z!6DOWR)}s9xdORuo*3kcz~18SzEV0M43KSdY*KRZg#wkd%m$U2{lB=6-+%OS9O8%46GzPaA)6!VUspuC4s=ICb}29 znEBX+3o{;l-ut0OLJ-gmM?$-)8kW)kh3@h@gTP86o$9(-uY88$-HRV;I2Ma@ky20*)9gv)|xtE42 zs)Czim9;+)^B$_Z-A=k6JY@6Bd#HzR+(@}Two_=zg1$SO9M?KD*+Dlm%1Q4scY&%; z&uOXK=p%r+XBue-nVB`3KR}pH!Yd??#~I`kBb%& zi$<$WQN4UsJ}>xFBQm4X2K%U9_%4Qule5*)iC)#9iK1;=&eDt60^2(GDl2enYSljf zEc^$iyp$ox3*?BDT=sbshJ>OD=$`apg;|GY{tr&C0!QkCG6PCkRLeq|`1$Ks^4Y7X zZaeR2(6e`B>DTePP3!X_Q8y8xbp$O!UHgt8*6IviCO)SV?|t0GX!v#~JAaj?`V%)= ziSvo5=FuXe#*-F0Qq54HqVl&qIr--9mg=|7`p2{y{p3+&>XTg!b?n|$r?pjlX=emoWaOUqAZq80sGwl-i6Z*MyAw~24f6F>9DFQV zdsH!H1y8nbL(LZ{+eXCAd_Z0q#3kGjxG$m$BI_dT>ypX3Yi?X=__x-`>;2cV z)eO-~MFz(WA$}2dvVoO zj?=Y%czL$_k*2GG6yTtDC?h`QH6yq)0tlms3px5mZ`7!~KZn&Z>ds$v=srt$(ungC8%|N23|DKXJb3@{{?1B{ zgB~57*jv~#q94Oc6&OFPJ{gUabiM%&j66SSP+u{4@aJz69d-MSxPLDY%jmL47(=hx z7SYgP+3CIy!>zn^$(h)@lO{i&Jhxk`lN&dzIjnTAp@!Y^r#u0N*&B2oJu;67M$0lP z%~7j^%d}a+eP;Cn;JdKp%l7zEMVOECiHr=OaU@)qO#2i~X@x6Uz>8li0sQzCqOW}S zhtjA-Mr?at?8cOuJWX(jQA@Dh z@-|66Vl2? 
zaW6(kAF>TLZ=u;J)Yvs6Gqca`+jj9z^}_Vc|2}&jnVB^G(Tnq5$adnN?7QnOd$7?i zeE!o=mP z96p*mOv}FxQvfr|+DEzVnT&?>& zz7aE^;f5zI5F=@F?bnaA*Z1yJcsS?=+B!LDAS5ss$SB%;c;B5DFI`$=>UyAbm(Qsg z8jb1;In&%YO)KH-jWI#*{tdNFxVvtXmA5MA@UEG&@@vZb#7(pCj2s?c+U|^*nFF&) zU-AzbPb5*t)~yLhBwrvs)-SXg*;C8GXXLinA09ooAGou&iDGWHwfAaf9k2XcR&n`C z`>7hLiDS>8eLgc_hiT$E><9&pqWM+#m0L9L0T*Bz7-s24+m79aHwk5gyKj@Cd&{GH zG`eWu04QH^D5`Evs)4{)`K~%!$WYl$FDP$ye*X+7s15Z5i)w}s;6DHN`cjNmZhZt3< z^fXnL)wmuucBg&5eo{dFj)`r&994Q3W*7`GvFSfzL}>6LAh|QH&3al`?7DMRJ+X1Z z`p9~BZ>Yruj*ITG{y+VsIz*64jHP`*dv1$SNeyD6^JBJeUpor<$Jr)0RMq~vo5_FyS6cM% z?{ePGJL7c4{e&}X8w{PNi0%R45){3?!P{Mi|2-q9U{5iZty?}~@oL&Sp;U{X&z!AU z5gUUlPu;h*UmvM7WJqYUr-zKkTX$&**I*(NNikeIg+%+QYP@=$2n|0ub9li$thO0B zyU$kfd)%z(SV;7t^^r}|n5EXU_0Evf#3$H%^HvYR;LL|n#$k6zVr4HgeZUHp^Vd20 z8mB&(qOJ}uSK-Jak671uuJv&55>g)HapM}EwlCao7w;vzp2V;HO_c=W@qp84(tzY3 zw3E1Qs0l@#RlKogX@drvM|D#~NeG9e_3pr}Ks$Bn^LU8hFig0)5^FD)mXqqf+h^;E zj83s8k#aFRs7p_J66B`#lQ7ebX*&Llb zsa4T>kdg?!-d4($BiEJGC;bqbX&pxtpj2=_^ze%)iajH30eb;ng(-~YGBQCFziA{m zdLyFi+jZZ(zj;w3jYK7N#Oq1ux^lqd%7>9pGX!F$cb`V`-xo&o-t^|Htdd#)=rS%N zidL(f^}|_%LX1U4)amsvSRB|*?@w1!)l>5~goZu)pCRdKN8op}( z3QBPY-PQ-nCc>&%Eo@XS?K0@qJ9L`#a#y0h7vQI!IP4=zx3!g-P3jJqy%r5YO~ln? 
zV);ZSca}>Gu80`sZDp(*_8HMT%-+N;2J}+It`p$0GK;a}ze|RrPC_l|9(0V0@?AbY z>*4TgPBm6dEFjL}U+@&6ahZe>@;qckIoV+MPP{*Ad`rH&7W%B-bIN=AYZqach-nN$ z2nV3VJ57!^Zg(=<4svK<_Im1GGREXjD~5n@vjWx?$ElT*ro;FfR=w2N&~O-^@eY>^ z&&Ww{z#oQnYt?S5T9^=o@JdU+>{Uu(DePk;KN~7epFS=5t9{?dp-E0%PzdzNtgJZT z)Q-EPMLe1ofXT!YyNPUz8(i7C#UH2%La`g z?1OK#pSralK%oFF;bO!p zyy8Ha9?^BA_sst1VM}7tpW)=2QE*BI#AGVBsP`P?Bg)6rgKdAw zBR&0cQI*ZKwdAIh_3#&&kHDgUxz{?2sksAp}D zj%io5@Qi2@^HezT+uw}JyKzHJe)jrk$cz|JMbcaLjQrq8w#U{+gDs50`ts6V+fzjc zn#WTGX@%Y4ui?9A))c^VQ{Qm-&FZbkfy5GR$oJjKzbN z4L(0UwQaaN+osZMX1SL!JsGhwj_PB#{Yb6(+b2&#Y0sivc(8cj(RSg0o>8-N6*F%t zyo`--AEiId9BH}Nkfzt&S~sds^Z@Rq@`YfGPb!?n*5@LqK}7?t4Slj!AVz$He&Oni zPle{5I%#rcsK{mEK4p;gO2ckDor~b4CuB%`CkwLmxx@StDxs`&o5K!AtpfvE_w3p8 zKdrI}x)Yz-2$IK<=cJ{e0FCqA<2c9o$gW&wO`y6225vm zR?lLiQm6VAWtBt|pe+dG6;uJl$DLKz84*oPYy6A@Lo}`z!m+=CzGFIMK-cVm#`24i zdE~*&^DCz57!QH*)uUVs8H1q*50LCIZLNpXK)+e7OKtI>Y)lwjd+OAZgkIK%9n{3> zKrA(A3W|cC-vWMv(2iNkWV!sFkc66pqvM^nbCLWL$B9u#z8I)sm--R8G9=-70y-yC z!aSU`4(8i&W}z5uCI7PvJL)c}^8fy*YSV-?@&EbL+LBK3D!u>v&bsf=>UI`E|L5wS zO^Pe9zNI>2{6Yx@SE%*@x6}2T@Q5jg2xoc8flK?;b$RY>q z=bTmM0S@g{6msqmJu~Sh;?R3)#*SIQVZzfP=7FU#gJQWG`j50nphWODmb=Me7%U(+ zJk2C`;h3Zubs@tR+?|yT)tOm}fZKNiEV&TO4u|P&UoE8b;}2iHa7@zpuZvY;afmqj z{P}aX8q(4yH{3|P{krY#(WHgk0b6(Pjew{D)9Le!>}JJ>(eS{0uF&-(;Tz1#Iq|c@ z_n$%cEaOb+VFIIwu=O@GP!moRg@f&Es2gzU_&r>IuF21g5Jlp9AVBUSIp<~W>1Hro z{4LK0*(WEBb!bmd0_|(dUv{aJx)M-<*j^lS-#zBgwkb$+UUU7f6X%$8{Xsq5sODTA zHRZ!)&r9AEZT{?_y*KUC&k{g8%dU!nYYl=Ql(54XdLZfu{~6j*vzNJ0n8qXRHOJrZ_u`AL%=}-mv0k6 zL&@TwKBY;sOr1Vx!<~gvoXhmlhv%cHPU+QF;oP%|HgM{-m2?XH?qxK9C0Z1<7t-z} zR&7fioo2jI`Nhnr6+hJF?;?l=U0L~JO@JE}K+!y&94k}&4Tk*r*&j$z0?Xydwh81a z1;3MY%br6+6|*1+wO6IO)L`??503d0T zi8CHL^h4ki^Q8OEIJW`NOwxVmCeCSYtKRNu6M0!Q@@24!-V_6DPw=e3(k(H4rJkLM zjQESj3p-b6KoroD_fj{CNf`adXP}ELE@8KlItOakMn*<1KxI&@iclImrE~J`9qw=$ zqLo27AyasH=z3x))nN^W{^z zwR(z1>t)sfU5M3WU!LF+e11KJ%yE+zb7rlEWsOmN7?U@QN)tS)pf?N*{v@^rcuiok87{4F39YT<-b1Py@$!iB>grgXqwH& zjcS^yxp_=33&JAcmBSaxj#*Q7%)$V>g7iE?+^S-?ZoR-AZ_|eH8M|dmeujSGKC?yP 
z+Jk}&LZCOwk1v0dda42{?xyY6W@dI-Nug%3;%89#l`9c_`u3g8?@M&-mBYB)VnE_r z>dXp+=`*_8Zg9K3$Lykyu~rt2(%Kw>Ikd)1DbOFH9BJWKVSttG4lV})SH1fA%f$>& z)si{iLl5)|jaavcrtJ$5MLVHin1O!q>>r=sT$!F7unyDtIXnAIEx?0w7k885m^5=q zZAJdr-9~G1a}E@1dz(lSL10G1GCiXuwp+C$Z;e zN2s;CK*=bd#D?j3|NP38U$|OeRn;80y^CpqrdW4z$JX!%bIWPuU2Y)dHwUlyxeMC{ zy&N2NCnY&){ir_R6h_sO#TV=DgzCNSeaW3*fqshL?}u7)-S4NRI&<5$ZTC=1&wwrB z!3#)sOk!Jo&44;T^UsfehJ4||YiX5M(CJOLOWfKz@^UTf@Md2WBQb*Z=r=9<(m8D5 zYl(eR!rgsrexne+Rz%EmV7)!eDk^eVj0kQmA9dr!=G%T_$3E39G zpS(;54&2qWdGk4BH8)paUzcJP?&OFmy8P$EW9BKU(JVg+5X(;?fmR9nvYmYw--bhdG<$DIK2!>ZKY< zPS)U5sLJE%El2k8zCFYXTLuQVvaL z**VK$xD;mS%kt%LI)QemTgSo6u0A@qJa(^vyK)Haph=02>CnpRmOkMm4e*CX(VTm- zb7@++bIT$o-AvOURNjVcR$?3JkZyX)B-I|af8YEN#b-$aTYA}cd771gussLRwqsC~)nEivbX^~=(+X^Li zj0Tkz(qI%vB(6ndWeF#YK&p^@NBvqf{^&63K}c9>e)o(-FIRj|#2}F$1j6C9V6pDW zok*q(Zq$sAknW;Fh}z1@wTb!*dOh!mS2c+_ z_443q;0U)Chwyx?0f->0nI7Ou0Y_w3KpZ02kT&Ep1{lC#Vb_9EUO*T&6`nRtDcs=2 zHX@^u_Ze+R-qF|2M7T}j76o#qMQ=>rJgs6}{OXep@|Zkv0Th8q{*qmzCN6GN>L%rl zeXgUR0lYOK_v&HWg#XD~4iP>8?4Dg_i+V5=xOvyEzvP|aaV1Y<=uiclJVHFLsoz08 zm+OW>ZfRS}GHIAG8dPsP0%9qnHF~pX@ z6|q~#q?5Mh!|aRU#W3b1hPzpFX3dgWOa%6|1dzi`N$^RUI1%LVL=qr*Ub^`INU8!m zL*AWYVRHy`moOBPZ>e(|Qa>4I2N1K{B`-Z|g7H?8=)HhiR26iC2P{mNP!3oN{ zeS6&3Wt=&IO?ns`Ydm`NDBD;c&rf7YI1biyyR0=AhR(MkBc0FM_?hynZcuA#ha}UBh_u#Oi zbT66fIc3sezX*~WG9G4G6e6lX2X^A=@Z8*7ci=%_GHBfNOqDQNE46$KBnyMPOTT`{ zZ5Q@elZg)T0hfMLNMAHLSPmk3pjfCwdy#b|H*` zN^XEpnyDi+%#Fmm1mFn8RTI~Q;#bGqg)4am>^(PvS_I51w}}W014;}*@*&w zq@CSrNGo4;943UBz^);EyTY0L`Pf~BSwAwNxY4QefV?rVk&+%vfmE#K8vXYd#6Uup?^rP0%7<&mXGO^bJ6%em5=Osq?d#ek^`JFj-9gPhQS7A zeM%sq1-*i|b`dCV^7PAfBY({;e_UXNO+NY@EWdg4O}4oON%<}^;GI76S}Ou**sLki z;a~xZhzMXfG*f_B!NS{pY?wG)^A-R(<3rA!tbPFFNx!gM_xq&` zeIXIxi7s7y(6#cA>0P#P^RLRP&SYnIAy!eZ6MN~V74JRG!NRQv8}0OeGSk<$2`)?k zot}SOqkUvT>_+T>=9rK16cG$z-Crhgml~dnWQx#fo+Vu5lnW+)jHp0D;U{^Yl*qZ& z)22-uPtIR(->Z>+Va?eOOquE5sY7|Fawf05jHt(}S4VoD082gAc^!o*wfGhwDu7nv zgOL$YgdpC&E;HlM^&p2){B_ELoy04-b%mqVz zCNIxlf7HhF`WVB(ERb&;S?@T?56}b=ANbGa@#jX`mk)p4$UjhMh6GRyYV}A44qf-2 
z-fu!d?`^x?ce#vue^7H1i$h9K$tZe^*%iw1JNf(fn3Sh6uFczp=Tixadd)J@$l!wc z6CX{zx2RvpzLL{RKRj=|vEz`9(#T=x>lFjWK$q8-&$Yih+WU=3E3_>iJqfiRH6+c&=1Iz9{Sos!Eca+dUCIMt9ar#Htd zxAxHoquuR4Y8_VbZOTZK{1~M%o{Hi@dY0@w`_o9BO@|}Lcw6@O$#mIS38o#e{@9m5 ztti{CgZAIPYtb1f;oGtmQQ2D_fo%%%`sGUjMJ6JIDhk0>fM^gF$<7SNc!55OTRhAUW7zvak*;kh>^>t?6bxL!W zlW&H#Ut7KKOmJHV0^p)WAGHDE5dHEqA1>32OvdyxkmeE`Og>Ag?zQJI;6 zshg_eOPhmn3%pA)H9)v@)D1!=3b?)VR5+$f^sEkzPEI*C)F^n6 z8+!fSq#-KSe_YGLA1(KRx__C%Wi#^vtY8mP4|C7f3QjT*%7D(n3Z%L2yx8;O-+y1v z{;G6vZJ>w8fkz){)n$LB90m`ZcK=I_aGyDWr6~#@pS-TV!rI2BGiOWRzI}<-w(ut3 zT2`9d(5@oYBhZ4dJABoaWA>VBW$bpCZ72aS5Lg$af%Vsp8}dtoAC~278>N-FWJ~$6 zhQx)?ln7E=)w}aqRA6nwNJS!ndLp-)Gf7;lx*pyD!-U>#i`K2zUcEXJ)RJamCz^RY z6%c0{vgL{8zP{h>3L%}Q8e2Pe?wnqAf>s=(EM-?4#cOV&vJppbnzkLq z(VL{;wOL1U>CMs62d6gYU2Qx*gh&R^0g9YLbz&1;SnBbZ)3pjowM>EM4}yqNg+ z(xCrLFU8p%+(+oo1bJvy;vP$HLdKjS#U2~wNM#5A)i85GXgbRVzo z6!=wecNrc5DqesiAs9&nb&?;f%$+(ZCN=Qw!-Z=6X;YwH!ik|g{j#0Kn@~Z6U9=I};V`gemW{%&5^U`A%(0_K-}$U92tGwy!t|#Mb%nnvoNtgaH?Pg4#M=;WX00;?UTLapN*slLggh|QFMvW%`x}kz=Yyt49&)> z@6IYNpGJojWhF61CIUpE4yb^?6fOjsEQfuc4D%jDpM+&gw^xhG-^R}`#ecbP>$T!% z&#@`q=2U%v?VY!b6iNk0wusuDUiNQ>zvUk6RC%-%(jp3Y!SpaLI0*qq} zN2DC%v{o!?&8`x3-C-V|XBNjD&Wye_{;-6rcVWm--fy3fq2RS;x%FBlyj zH+N5*O{W&2c)iB#&*bgEP`?CFE}o85MH1v>-dItxrW)K&yd}k?6How&phrcs#z}LQ zM|p)HiJzgtU#Y(~ouA|Px1GFAVleZCK~O~n0!RZ_)kbzw(KRJ%GaLv> z-vOV5PW5SDEd6P50v6dZFfqAYKlK_!wWoQ1O3sgn( zb+iAIiI+|kjqThIsh;!+o%fSNHP|xp(pYaZBmpY3<~B)mymstY znzKGvAlWjLS1mhs+(ZL_iaqOWcD5Fy4B?ZKZ3wQ~gZm`M$18E-I3`8!9qewj8;Xv& zu7e0cd?ICtwNDE$*CwTlAg+p4DhVI@BeRd6Id-fWDBfzgpdgqnH8sE8i0!d?$HnZv z*2!fHcZ^PSBtL)kIe3p{NE?h2he5?<^3CsAzIn_YFge@zZt2ca)a)6!KpBP!(*1pR zo}X`Y+a|f}0N7k{Wz*d0n`kvX!^@1!b2#uUwBL}ME~#x8_&mMjA00W-04x_zOTFwN zH5YMK0dA7$c+cy5%cX(4Zq6<(EuF7(w@Zsg6?^Z!sj1lv+3LfGg|~uqd-oQP%zHsz zkIz}Qzi6A#k|R#^BuN-7jNV+;JoQ%7d+vM}{Whg{?+V?hd-v{S4D88S7K`-TG~wmX zM>kh9$`)yy285G2cb8cHR+A$K8Lv&7HVr?%`=`a29_mn&t7rH2NZ)1oBJS+j4(RHI z-SeAWmEY0~n}bP{E|isRx35j@+2eV2^>E(izgtYgZ_e&7&YHj~r*m@T)D!{V5c}X< 
z9ndK`gXw{5H*|U*m9YG`uiKHZN=HBTrr?8ge;c3?08$Yf6jZz?uO*kE*Sj|- zaYcv69_ca_PW{QTz@=J44!Y;-@8y-pxf4bmET}HoC+T*r$<#TS4>=ilu(r<5yX+%= zC$G^;S<}L*QD^?+(oL&Xr~mz-q|5lQ?SlrqDei7OaGXo;^$SWWr)(M)_i^GVUz@LW ze@I@c?XgEI<@uS`ty@bAnzuK7(G1nj_Rmmay?Xt+6-83rpRE##-Q?@+`v1O;X!))I z^K?Q!HSC;vGk$mk1;OQgRh2RFiKmVp)x)E-fu(fy=1m*KHvfaYHd1w&9~_%HDUxsP%F##! zftEpJWLjz%otPN?FGr6aO=GZ`G`53@b1aXoF(t%W z5Or>uKp=oA1AwdO&_HdUPX$il;`iWmOY-!7(+*#od4Zc*K>bhLlS7aAEvb}`kB`CP zTIYzBV~Isz+|0(dr=J!V_F_g?@CYHSFgV+S)0_P6$&f$)Hdn4!54On0v@`?w0`)kg zuK-qPJ02$7Qpj-<@(8I4>73OE(1`r-_;<{x{)U=r6*RL+ zDZ(kLm|GGG8N>rK5>2u~E z^-8n7tavk;mchosU*ASmR=Py*+?jCjV0}^`E6$?L*kecuLK^qdFYOVt$SPs0J}$|j z_AV@pJTZn#f^S0Dy>zJ={FfX5P7liuy?)*9*6_=S7)WIY;s2vK!???rTat9H zfn6mqG5dXP#-o)bMiVDZ@}Lgn#-#s4@k(b)dQKv=hD;ueKwGwgdBCfpS94}``w}`L z=NY|yX-Cq)oHc8fF0qEWRI~OeoMg1S7qzvdN~6;d`)I6hhaoh?#Cw|zgL)8(NzsED zX8-OyLlX~P0-%M=bBm!kvC!kfn#Rz5=QA}%KJGAmdb8bQ%3sZS^XAR_C;9bmRaAZa z7j$Wl<#!k`yxoRS3sl10fzx1T$;o?5#Wyf8^P&9!U|Jq~b?$CRwqC{_e1a@TZjF3o ztoEkkEw^1;1K= zFqm^@kEe3r2#KEvU^)vO1y;>)qG0=6fBV@`4B9dzaR3Zd8b6<%&xKQz-b>mH5eq`t zaL5h!cb8<%oyxLWu=p(NyehKaxhD@k+30FK+cs|SFu)=EcC1p;qWc47UK329*8k0l zFSS$1oVR{lRfSkG9HdUh#wUy4FhobX#=^QlKtY8#rAPaFUHVJ~(1}%>YSG3bu;$XE zhV_({AtDa6m@ef8p4RjIzl(q{O)|>03x{r)XV02@I=V3YMT0^kh*uhMh(h{K#2hYzYU0XB)fNMa@L-8ka0iZt19ayO|9c*Z2d%g-m z!yW#^p7ZxXZb*1u)CPTe|K9u45)GQ~I-BNv9ej>*ggkP`2|}7b?XBlO{S;|)@0-dK zQ1d@lB zbM~3{8N0c2`@Fope-mqTH#!0csMca}Zv`0v~ z=FJ!5%+;{N|h4C)y2|_h=1|6>8Lq z@2`7)`uqOQkGX3qL<)~kyRcy2&`wXzo>$&v`@7?e8CR-5?lYzdasx){621YQ%*pJzA~vGq+1wW96Q(>-S3S?!r2Q zgE6qM_NHn7*qcQ@nFArfK&0G)F)}S+-R;+3{;s^*`#I!5LB0m^AeOD}Jvn_z$0cMI z40_bw>XNoIgripwbreLPAXK1$#MD{morSSKd>0M$Ny~CmyI5ga`fz;ilky?DA$Km< zn$(9XJ9~By>2reGN##c^%<}e<%k2Ujpc(%3fm@Y#`q#4*>fk7~rdp?KPC4sKLlH!t zI_l8az3Zi8MI>HaaY*gp=a4NGA=F=a{;t2z-?RinUfZ0pN)jl6>A?g1maK2zd0g=B ziVz*E_#%JHarXyZqr*%a@~@~Ukt7iJ02b4&*W1_3Us_xdGX2Z%=|O|`J={Y+-HD8u zn(sf26uHA5WA{|lENm@zhrGoFiUYGknYH6yb{?xD7Km3IJvjSz7Y#%UvD!5Rcmtw0 zz#|8=4j)^QHMhO(CLQ|;*RS>mv2s4;udOpE$ZbLo4OPQs-{y-fe!pqsNFPo8A6zx- 
zUICe9|MD-pZ)#|T{*uOekorc{Xg{HTYDu{-HHTT26wFdSjpEegDBT=lT5F#(IHx9f=oQr(V61 zCce-cOX0Hr@q@O-TD!;eXjEUB8LV}4KfV4O3|5b*yGmYyjz1|XBwq@ITQ8S5-(5NX z*NLwqJPb5J%N~1KZkv9bIk4e??l*t@yX@RUb0%)gLOR*^z%{WmDX~o75s+Br6t#a@ zADEQ$B-XgRs7OJtj=seg6N(dZzr_*e+paNcbo<^_vWsz13yy6Gesjcu#!14WeyLq; zT;ELJyJycntt!_$Pupv02@{BGLY505XF4gb0?dhj<_RB6+@dn2bXT`{Qv6br_a@}A z&_7WHJ-cb$PoKIew^_SmuHAt-zywGr;9&c`{i9rRDP_U-=;%ww2AF6{ zl@B)G>c?On3CGQ?pPy|!ZU?ViK4^0K;0Z|P`rZ0FtMBcQeajjJMJLnfLk)gM?k(;f zEP?VY>*x#ZL6N!fCjS-EjXED}S3Jz=|5I1{q_z6rSG>0z$+xSkw)rNT11i1$SF>(z zv+n;W#1H;aUblA5vnx$%nky8F?g2K=0W;kLW|?~~oW=hXMg~TPeGCTlF*J2DGB7t9 zXx`tnw}FAVfx(3V-S+>#eqiprnclOP{{Q}g6Fz&o@&jGu6XwsHZ9dZ5an>TMd4c{j zz5VA{%?nZ(_BS&PGBh;m-`k+KK~RXDGvBTLe}DI+Sqm3=&+}I}Sljk7>^mTLpvB*O zSy#RsGy5pK4!s%m4pB_1Tc3kNk{c Nl+9S{V^(gf|1aY;QP%(f literal 0 HcmV?d00001 diff --git a/docs/graphs/large/l3_perf_nn1_nt1.pdf b/docs/graphs/large/l3_perf_nn1_nt1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6c5ff9f06387b7a566b807fe44251252312ab28c GIT binary patch literal 17910 zcmb`vby!qi*Dy?qAkv};$bfVWEkic~Lr4ytLr612w@RyYr$~r&cStwVB_Z7?Ar0?A z{oT*~+}HDc&-=$a*9Ci@RePPicCEelI&>;hGHjge&v59vcEU4p=;-9^T=6HfCdV8f!hdk0T5|h6S$eUjg5@G z9TFjobaK8WQ+7aF*;;wP|9So+NJ~yi{ke*=oRYelFef*J4SlPgZMM0uCydVycy~#@) zAt5jnZfAkC1n~h7RWLw+ggb%2GB&`=Nx@C+&EO&;I0&Q@+}IY!EoIzWv6v0w^zHg8 zw0_6J=zSQgk2+gJ35sqBNG#57|BjB_V=~Q?md9iqJBIsEa~bn7NbZHt5&1INN6nL- z513OVq1${i$>Q`tI1pRQ6|2@3Wul;O3hTUX-FX zKZo{vOYN&aUaNrDM6R#9n$Hi?0nTdWC*$A3zMUzBZyj%bjBKeiY#`WXU2ce#FE#oO z+YEs1JSdg1uAb(P(`7GhWig4=a!FRLo>lq>>%9AlqNi?UTR(S|@rTtD=~AK6`JQ^}g_%N6PcpHTpN|RM%s>7+zrfAEbW=U8 zMK9NjO%?`iFMmuL_PK1^riflDULH65#9|5jn)Y$K(!o4^H`85oK}f*jRxSC>ytb!V zOYwR)J$v^??)vO(yQ%o5dQ`3Cmnd>2+Cs(Py53OZPCb*pIqBwABQ&p&Z^dzb=Iea* zJD=ncsI0JEWr|7OZVly2vSk;g-L`s?Q6C?>C0*yQGoIXGEqMd*$Y6c8Af8Opmrk;x zJ-eFjW?kprtU+b(6sZ~Gnu_nm?#9e)WY1bY!}MUPA2UCzaAfH#D;miH7o}G|*Eq+a zX`3s19vf%eBJI(R21|Y-8Q3S3$YaHhRlYAvy`BkeW~V5|YI{p$V_FE7bU0c0rj4ADeag 
zN;$62QhC9aDE)O9zxlOBy|=vWW5Y(d^f{#_k8M>dyP+8++mAMbGjNQldc$w50pO@CA(8XVKL#7$=pZg3MM0E9*)BPLu2v&?3tq04~kuD~QEW>01cfB95%hort_{nZZpWeud zk-A}+6Nl^jO@f1CSOn+&dYYz%>FCiF-pF;48;^MM8;|Toac*icKwHtEItsKjANk%q zZ&sxl$(Id2DhYYstRS&s?1&v{?yfjewm*_Dn7iHBOI#cqMSM!|q2{nh^@ls_YpnyC zr{@CK8WJN}%|IJTGyet(MNZyn2V-!#;HM^};PI;`!E5 z*qfD~rMvZuhXwH|2h*lWxxes!Nfcx~4(?D#FCDp8I>lmwIUWp?;-%4alV&~WWykpJ z4}`pU8%^%$t29y_=?JDXraDiOoDIXp;ZydVwyeq_lBwoV5{`eAboh z<)qOhe8pftxN(SAWnW*VN#^9LqQHkr)$83};RLVhzGTsimU$_;_EauLMML3i#?#A9 zy*5vBmhPDNi_(!mbL~N&jrJbq(!x@2QQF$+c=!~Hi|Fd^3)bL9E3>HS_i1WL2ME4Weklh0>Wn(q3^sgrRXQsVqBI_Sez_8`~bPF|hj@hxZ z#yUry+J{JboLY(G_cO)#9(!HJb4|=F20?^3GyT z{Y-N+aZ}>?qz|OOOktB3B|IcCxu;RKZZ5JdfbDGG_pAm5h3Ut35`-I+zY=$W8R1D4 z=#P=L0@D(!8c;pLFk^Y2g0ZqECMQN3WQkU$L!a(v^S=6x*)z?^2;{j^M9M@ec2t@^ z0d0`52imFw0WD*y8v$ePeJ$)>7k&ryn5|K82J|jHV|~zjMeY@0%By2Xfd46 zQDUN0raRDMj54w18Qx;c7Y;b0W)E+_G1G&9@+3RmvE*b${YPf$#cStYb1_q7y#3B3 zM%Jd*!ez(ohm9uPa&%6>(S~MZchb^si^=gq6m891J<2bn1{>Opc$K3jMuRz0CvBf- z{#ZE`a~T6wByn-!!i?!uDDLbL)1x3<9`9V1Hvy{dQxP>^_n)t>6DWDfTSb)hCjXx(4D~FKms9>wnqE z$@D&+7%vCE9$WzjA{CtPcna6X<%u?;F}G7W4@Y>RhKdyu`I zUsXENx#)$D(d65l~d0|5GpoVYC3T3QtvaTubL1#qR+Y^y>orbaL(4ly{&e%?Oo3!}6wr^$;)sRV# zqJ`FWo6pFgps_N%-V^ibdgWRVE(Yw@J+-0%r|LXwE$>GA*@I69&uv?st3NAxi|~D) zVbx9a6!!Yrsy!1kHu6w>=D;#}i2Zvtzn#n_t8}_RqL3ExY1!A9$vVYqn&Ft0>?vga zpaVFa8hkkr^`cK>Z7Q7id#KgVJh4p~>O#8pn4{?lcCSHCCCB;B4q2<61IIK7zEy`v zb21Q9Dqq1J|dP5HF zpu%}N`t4zg>XwzJc|HH0MvriO=$$yCXs1p0juk&T4(t^S`dRaByb!t6*@TDCmxhTC zp|4tL={a*F5^U;)8CtVkVXawypY=JXpKwfjsVG20hiXhl>KQqvDFs7%j1!qYj7&0N z$F@B}|KKE&;Utvd1S5skAgiQmQdXbDr!^6>=bAD{LfE1iYy`S!Y-p5B5Q?7&xbUg0 zi4u)bx$v8RO67oQ34zRj6umqh*9+#z5Y-SU1L948n>}}=yI(#QHGd~8OteDy`TUr` zapW|r7gufb$?uuvkW<28tBu_&phV>{K6x4GIi*#@TQoIUoT7YPH}sfUf)F2HQwawg z8vMraXVq+1TvTpeFN4^ZVqJPR(?sa92Oomld!baN{G+l3M-r^RLSd1cvaG#FPyb9P zPu~2-4f$Y!+f9nC?N^?6%NJk8fAosMrx5s*vU)*>vG0Fr8a;=8d2Ged@ap5vWpsd%*%SX*=M} z=98|r9)?5_UegO(Sl3ZdZ+l^LTz{JN!x3qC#Xf5O^8Iqf_^V4SU>x6xOvXU}W9e}v@DPUwZofoR*_sO1^P4xy2 
zwy-H2j_-SWtat!~B6Aktrt?Np!T9WptrMT~&QDkE?_&n&ZI$QxjET$Aex}&YoIm^2 znnyfgeO3PeJXT-mTm90OH6*hbNNdL-pNoAJlq!6-2T=}x)IsTHxGTJs(_WG0rYuF@s+2p=-xF_A1Sw|B8ysq1(nXI5S9ln8iFeWMQhqv*vNBV*059jC>s9aBqt7-?!X@8BR`<=s zFHPHPZ_8DQDr#r$27if-+1EZ;71YSeF-SJHH4X{yC}la`awf|)%yTUsXs_hNv1Fc_ zH@}A}r|2;x^J&a+-dx0Aes#$d^FC~1`Wp&tV!2}1&yVhtSgVu00i#hZ{(dNSUmNwq z5}R2UR60xBS7i@iD7_n@sJ$CSsPZLq7<5sW7<5FZ7<6~Z{fRtth+^i4iDKs3Xc-$H z(lG`(3!!D(%hIZxf24zXG%1y&+`0d1)#jw+X#76fT5kt-T+o7e>%k8eI#&GIO_`D- zXOdUhx$_u(6)164{#@ejM)ekc2V&LuDkiVZT8U;Gh~mBzmk7OR_JhOF;@0=j#af8S z*@)v90@G2}HfIZo5~haYT8QdAr7mXLl=EQ zw}apqB|~o^qSc!xo2^8evZhtTJ^I&YZOcet(?)i?3dlT&$S=;IScdO3>) z3}HTsx%b}Wvr%ry7%MI*&&BdjY#8z=#$MM}TZsm-d#4Lbn%$Y*bpMt23a6Y$@w4(= zyL;WxhkTq|B|PUafvwiwvHNXnAMb?{$yO`v(^EfM{~T7NKbs(Mq7jQAFAZ_mSVQuG zO#5bVozl{VfHcYI-1@)~s(; zz1qH6f0J*^LfA(bH@fY#_e5*7pi<}K1%H2;x&CUb|NVuIEmgX0f&P&~cIN?RaG$aB za4MYCwIHvnPPw`5`vodbGk18cw*E_UgPZ9j&Q60x(~EHr*YJ;|efF})Gf&#Kxsv*% z@p@9U^03np)hTW#nkuB$acz5!@#d9``HJz`p89yL&AUl94sJ9_zCfW(%Pyvfj`*Z!N-DE7HAy1i|Kd!1mh2st4z|%4vXq0=SyDjq zg@@DZ-Qa@@KaUna#>PB~HIK~d5dW1xeFhUe?_qtHqA$Ly7J($`uCOiQ2 zA(a+xRiPfiCzfn1YKL8yMv#gHv=`GqBRr%fkZ_tXnD?U;C`X0Jzt97Whhc(;d3S7K zH)&7u4SxmLHkLN1B#YKPZFWZVi^N)&)8=u(H!L;Zc*b>&5gK%{Lz_|C8YPc`WxacJ ztU5W5gXm1Z#Fe&6lr|OCXpaUs?ldcYOR2`JwG(NX%nk3Otq1lPVl?nb3EP}P9vBdY zHPB2<$IpkF)UFb-G|-T`OUsT(e|=5YQ{wP@=Be9{Az`W8+?KS+fvp_Bh(u`3G>x-R z*YurTV7W=Ko#kHh1-MCmzqFQYDR=S2xZEzEKjpY9wEr%=RHOj;r0znZu3uMvo$tm= zetp^HE*OHW^gcTyj{ z*x-kd9K}jD|5b{vnV-jMcUg0rIF*wXxcpaBoCjdnBxrrL<}cYAIsLQQe4jFn&>ZsZ z?!H9pgV2>S#@%Q2UmdHbye`v2>ti}GtH$uphMf;Ykw1hf&5a6A za0E~@yGFz1u$%z_F{9eJ?Oi-QBc?d30jv-@X_t^}*o2`EP6(ZTA6=ZfWISx75^4sC zX9;7~GE#6+G?R1GHMq z9ff4lC-;Z;OHp+fv2KbaoAiffftcMXuf_{GxMH90q((xrwTE6uK%76d7L-#HyT)|m z@?dbuIu_jFYU_$1!}IUgwuRm`%TWLwmgvfx;T)C`f+ENoIU_c$c`*7%?V0@Y4g!5p z&2>D$2qF`Wxsu|+uXvXhoUe>r$Pot zi`|RLkkTiLxtpAEq$>}lu`?lZM2})54vuL;#@(YC<_PJoL4vrv&?2o+X&jkQoMy(< zmpEI|!W~EbWn-}UGy93h5jpr@CCysVsMemG7-+LMro~>Q{DKrm#jTK)zy%(VE|GoZ 
zR4XV^VXc0_3TuK`_+OtX4jw$YLd=trG^RL;cxvV}goi*jJmw}f~Th}XAI9Aby z@*+EuxGV8J4?xY(GL$oT6lOO#nziOq^&nvnO;OEIi7v!6mh(89WHWAB!aMO2ZL6<; zE~BYmh;*yZ?4F9DInyUFx8j>~u5!vxV5ax^;M}4Kq?%^ZIae{>aW+iJn&K0fZm{U$ zyP9_*L})C&M^eu_5u0l)KJiyA!1VGHcyp$x37Ug;1t-GU0CR@~UaPX+!^vyT`sH** zr_psBuTC{-s9Ilt&4uvT0B`CPHD{?7HAAU3z~%0GzKKjiNAYS5Zp!5awmONZlD>&v zXy*!{+vk0?zlsywW2%426AB-MRcHM7hlBT7aUG9p$6m4+CkPDzQ%+hi8g@u7?R547kT5wpH7 z!N#EDOO?2M%!l6Wub6cnqPR|vp{OU60dXalMNcApmO)mD%JY#6bpXXpf>;a3^3``* zo~KG;++Ju&gpdrfY_y~o%o$`Fc$PBD7(Bw9%4RaL=!tiuyQQ4b5{>VgEy?>s+ZD}Z zzKNNW;F!teiP76#hHxdBGw45y)JFG8K>6kguRZe6+(VpiG|204O$V_dl1fy{@C&)e z`B};C@g?00T1_U#vdr=&c69UdQ@Jw8`J67qu6A9f)s7=iQ0hj8Ug{)=@w?H{1;ZNN z`on5zU%^%$P%<&1n&`)IenmydwVResdgo@rgsglg6PYF~NClxRSKbGKCvQ=Pl6DFA zfqkFlt4BW?xJ5e8V~px1avQ!Qk~TS4Rw1rYmV-`d6Oo|dlsdD)&sLk(^g(>?YkCW$ z+By>P7D}n(Ec8=6?m1?6>azpczosOO;6^`B7E>eI^cjXK7^v(tqI&17l#Py{Y;bp2 z%+!;eLwiVa-XXtQ^v-%z$Da21cZ6 z=1OzI-xR5Bd!eD|70cpHYFxs7GtG()W5pVU5vW~7MFBsjvJY(w65@VA8cornB#BTi zGVX!$5NoTT=kLEzaVFrrac;wY-j_%g;Uyp}=)edO;fgy&2Z1eXxFP*qt+Mn9S!VCH>uga1tC? 
zAe|9e09su4K5<+I5w^nWqjk)zs{n*u!ZSC5j zo}5EwtDbL9s``yvC2ECfzv`L7EMLdYTE2e3x{$&kCUEsD=Tmj|sJj*cIwJ1m$cjyz z!&ABdghCj0SF9bUI9S?%kOj!mpb`8zbvXJ6LJ97`0G%7UQImSkoydhsL$bHRJdnA=8MKj@MbktVk;Ih(d(y|Jj8%rHt=#E6{Soe zr#^{}Gq16jkp8td7JZP?8ENNAME~hGaKae-FgO7hiAf*+@(?qmM@-9z2;&(CSq1-} zzga@`=n)~GLX>Q-*H>u0e)4`q;m^d#yhX&wbZb%MEgpx+Z9dZc-a@MxMHFoWJe5SY z(@t~|MQ4zP$Q5+Z(zg+1U|7kin7A5da=9?xVeMI}d3zO!3XOOtWo~1GYuWjv*U#L> z8OY#Ow3r**TkPjYvC#1we2b3kWUmynFreJPcq)gC3jatfqC7Tsq-m)0wx4Fv@jKx| zsu;O&3%E#bUXJyffCHcE7e46s5O*hK5v(x&xZ7i_rpA(1=W%<7;FHbIn^k(8Uetu) z;V*l#*hPY4R!|}nB>uQ(;#paDg4;1#X|nNYO7ZJdh|t54AMhuBj&wcM_u3P9D!p{{ zYb1bagy9qV(f9&`vm5z=^6U=YQt{Bm0_9HnujYFGbYku8mnWY5Z5GZG9}9~Ib88SA z2le*ye2(j1p(Zuo5FvseUY#}}x548N>Nd#+$J(z1^276lyu~qyhVs(scUga?Y%8R>P)EBd zT~S;bXG51Dld^Xd-7En#6_|>>(12)YdYAf&gl4F=X=jLLD32WP^Cpw}8WZ>o2RpDw zsKe>pw!UAjthtSgxLatIE*z`eEKSYO@8O5_ospI%!EbNU4qmd?L}3eqNmI+l*;*#> z*pC`ub>npJSGZDhYmmwz@q=nURG*qQg*wVSAMXk%TPGQnUHBFke|Wo>rC319;w?xYM$t+xv;!01v zc6T^vKSmoL9^v_vcCb)W*@YscoG}7s-nf=*4X52==sd>^QONWVUpgP#7<@hkUOKI$ z2-z+P>3Kfp@DSwU9wX$0T@dh)HBx)*AW9izVWbrvQ2gXDXozG$g_J#R=z`upntJlGxt9D^tm# z#$~Sbd0c=r^D^wMTx5$0M*!wI=QU8tegi`5tuxw!=)k{j7PaOmzC{1OPIe!ZB1eoZ zQh8Y8f3IJ+o7Bt;c5;gGQz5{@L6&zCtxb3TqD!cBS+ zW1F%;_F^~k$JE;->3DpZlaV~K3@W}j+7^@EPt#mjwJqW)RM!1|17N@IxM=6XD7qq? 
zi&WR^-uDT_4}GZ{G(dJDVx9DJVG)frMdkT#OG}Fo`g3?CynqZW&tcu@uuXwY`J!!- zzPDvOEgNl<(chW-xix9OidGk7@(4|%-tFPaw$eoNW0{zRl+r*}) zotMvz8K#oTxb1Lym1;4Si$CmSNwDws%LG+%x7Ac|&)yjsgp;xE_R|MeAhhWf1exAj z2~M`@IoV9Qm?zg)A;@+<=}o$f1aNF@=nq!s$uGS)5iA|6yHEL=`TL84=pR!&qax(r zOY`blCnzW)6vqk6=|GcrF3=B$ak}~5SK`y5PVBn0W3)S` zmJ<`?vA>t9i+CDIBS_?E6=XcBABfeH&lK1ev1P(x_WDc2<1fYu5AWbkn{f2&qT$ZY zly>s>pQ8rlyFXmYa9Y3H~E~ zqPz&|$i3dksR_;Y*AX?Wrg~O5>Jf=-VtSEh*j@>C|5_b$Z3C<0D4mwLX*D1bV^_T0 zWB4Xh+ImoEg=02hKN54&U2s?hB45KeoL=Ud{=LEg0V|B>e^wa5u^504<_FUB0G1TM z;^W^?C#hVfjvihMiQE$wmVm4Uu%i1A$B%;DC*n8f6|(dA7(<>brUlA!g~#7Ei^9~z ziLPfECy@|cFz6BdvcXeY}G}tCNNUx0vOyxP{nh< zzB1W%B|x&1QKk#C$u)b6jU1=2NILUYZ9*Qh!K|r@*7c-XD<brGhXw`vl>BpARX21>`aH z2Wm6bC$x)mwd2F`A%|xY1Bu7?zR51SZWRPDpK7*?We=|6%fLt7&!ferk?RAFM9j&9 zaB$LHb$V?S=H#5_d^<7tXvkUYI$E6KF;O`^dwgGKjSX{}Hge9fw(RDMz;?eeYJa~n zYCp6OwVxH!tVmUGyMWsYgNr*Klj}7MB_Xi{Eg|tG@CT&YaFpmJeh#MhsL}WDKyhqZ z`nMqLo9vYFNb0WDG72w7uRup5Rd31!l1_iQf%O*JF!bop%Pm>_c+jdx9C@$s`k~{f z{YQ>Q^h_umc@e1Pn~#;PFrdsUE&cR&IGCWc>cNLn=8DmgMOEEzQAMCAdyQp!9>Odq zkVIz#I<~}}wzxCst^cN%^pYa(Ehf35Z_uhPF;;iMAIB}?zM**z;J8KKFaPVft)i7W zJpdfHd!B;e4NYzgXer-z z!K$yiC*=JBeQPj_C&KlY;=zZPCQ8vjW}er2on|G~JGwbxadb7+E{UnsfNv^@Eh82 zzqQcv@*e_|X%tCv;-~{zd4A^XFHw{7)mQhaY1uG&@?s>Ti&np8sdj%d$zBink@V|H zZ5<8b&M60a%}MX?4dC@RlkTwxsQ<~&wCHdJUYi#S;;`g(0E`j}Z4k|1JkSHA{7 zRR>H0J|7Wc@Yd&}<^+b_3=G0BM)7|}#%t5(8p^r1 zSHCHuxHgRx+%?D21t(gJmj8IK(Fsr69V33bi=iLwUZrOEowd<`ScCntLRFO;(2_|AvHX$c!YyeJvBuEX|t9a)3l) zt+uA$!xKngUA^+uhdb7Xwc60NLcXI*U#9Te^lQ|CdLlT9P)EDfBdZe@at6W2s zq64%b_QM@%21{GPx45+2!a0aGW%5hZF23s2g#1`lXrvV27C}e-Gi|tfcU8f$58#hm7I(@`B{F zb?*|4dgq=;7jjcO9w=Lw$;|1Hjq^o z(3;Mxyd!OlnU>WlsKoavn{#Ku*A1rJWOqsuW6-9c@~Adz6C6g|#J)z+tnUY)m;OMb z=9@N|>c6Czg5E1}FJ-l%e}KXlZHAKE;66|?tq%8vw$jIQ*%+|iOJPwG#!jtR9T-m3 zL)mD6B~9iU43^WsHjchO$Ba0yP%zUmcF%0id3Z4bH={ufzn5BbngR6?VmX9d@#;Ti zjk&IWrc%NZO~e2S_7 zj?WnN`9kq~<&@|5WhacPKVDdsvR~K;D6IC{MSJu0)Mun{tmfjnkNy%9KbuCPyF0UO zsQU2r1ZGt6tXdG!Gm0kDTd>!vtXina*OX$^Z}K6-t$USAQ@(O19h9d=VGgSM@Lff` 
z=6%B=8W70qn_G7f&6aa%<3UtIRh-*9lw*}Ctf1u+nthh5t#?+-TC@#_9krz^npR)U zy`0y}Ujl^ji1JOoNw00d_?`>9lj_1&+%b1(GwDqq@@7xXE9nK#hDZlj#vQG>+)I(m zWagp^lBMal&%{rd;s%Jc0Gz6jp-!Fh+3rizcyA|yXm9q13h8>k^1@7;gy&1B-kzi~ z?^J5%{wO^xj6?S(viv5wuxvTD&spC+J^X3bbPe;JB0hHh+s>l4XTJF2ce}q<;=M90 zaKxaf|7v+teu`iiukn;k|64&v0hU12|cV3%;dM@Xqpp}pFJrr+C8;Fc34)_EPJ?|bc^6iVEEJp|y zzWc(zgyGp`Qu4Gbv*>F|L91O=^y|`3q~uoq(yPiU`o*glsayPy!rC?%{5dBa(B9f` zrnxMlPNCpO=o!Ukd*s@U328zb=X*j-9hU@b$?*txrL0B{O#9Xe>NqQ45-?*m`z8sY~uL zsv{^;wdFDTnH*yx3i~L>Br$HFeqQ~~%zHI2MSC6Bp1I4g4>`{YFW!f5*34JzwYhqT z+cR%=Wg)yfzL&5_I_S6>i==6QzvR{McsX?N7znt`*8IsO*THOoz zl&_TUqtaJKqm`<8l`*l}Zu=ABg*OzC8z(kh+DI?-pl^+2EFhOC*FALG^BL`6L+M_O zU8{30QKwJRUd)zBo!Z$29`)xrJki%qc%Es zmGbwbw&=z6qkB{>Pt=`3sp3UiM+O>3?LS8Ra?iaz7>Poy$qQCu#86;ICdPy)qXmprUgll~jMQBi zJJbZS{8ZsyXfHGs^$|%ArV5lQh~^9>wAVuWCY2YJfDY%Q%Jhz-_y<9hSA>Lj+ZQN( ze!8{{-Qex4E>L;{2g`nL!W-=dxd&@EQmHoYbN5mw_bhJl9?6^72C z=hyu9-sNhge={S-W%ifv@os&t~bMdFLFLZ<;Ck ziak5!U8@#aKp&wS7H6dw2#z%f?f^E!gO%E8GKv3eh9k2;0>K?p3<5^KH^W`!L#6Xh zy@BKK^bASIoL{bTLG#=gLe|x?EzKgk?@nH~9L}FU+HOQXUstS`I@1uyN>B4Jg1o#m z_4>GVzv~|5Ifbb|k$$J832U=P$on07-r`ETYhD^lgVhIXxg+97%r;p|v<}3LIFxyD zh2lp#&b}VcLh?flqS3^Tl6X)vCuuR+JqFqdGAFYb**#(l>Siv5q>44Dtc{0R&T2L^ zSeMNYkfRL~S4UFVNt4jY5AF~8u=NY947fRXiZz1PdR`vYknj|fOz{?zbn+Gxb&CGH zH;~U0FjOOXsB>&YBBBe^`?WpYx-??jm1u3i6dGpGv2?~NRV?QW@`#-|O(FrZk{KS^ z{6!vgR9cNiQfp3C9Qu74#U_c99p6*s@TpNA873=MVoLLW%Y@4T(RsMx$iKFFMgl8#l?m2%-PWZ}V-9pTKc)Uh> z(!W2m0zQ7f8_JAk7CK~nA?Vki$zt&<087%`i2@X@hnq+;8A;Yo<7eeAs*=dVWMqZc zVrGDz02M%o(o><+FPEXpg>U)wH=~E2*86kb;IW=BPzIm5wo#8`^W#3gL*aqpMHd`F4{{a#E4@^;nd5Mwj)AHpi@4Woa+2y=4~!05 zE`CP4{51QrU^Lol^&y|Q=Pm<Ei#U%!PapCqde?AAqi!{>RcmrCk&#(AsB zFp_5L9*Xi|0;=dT#(Ry^Rj({X>Ei4}&Fayrg4={ZR8jp`0DH!*=fE z)$4bPj=Ij8o$no7I@r!?mNdO`(#!TjXK8y>zD0#I+2!wr>nqTye3M`NK4PNvEj0W4 z4^y>oTWK^;@2Gq$@_rQipnA-ps0L{6-&8$~^+xo+WH$}ypd}pIoX*_wW6G#a5jTjl zsUYT<#BHSX+HYa4^B%5?L@(c1Yk%#K$in@;k5;sW?mp`KzD!;GXPt2Nb(DDZUdKe_ z-I^HO1tuzo+6fZYD}>$}4qirL3iel8sv!Cf$6NoB+lnYdP6SW!z|oQ(tn(F$CC1(J 
zAfrd*&OnBM7(E$oqMVOkKhn{tEDV)f9|gKB>)*SGCq~zS!rarPA}1mSKj&xe!TNNh zAV^#OMlwnsCCUKJtz8-G;|-El)lrs4+poQWk*LFsXb6vTh|Q3nh+p;5DF<2YUq6!p zpE5lEO+YY~O`=gYU5G3NrxLHrJ$pL@xvky3zG`~T58l2A$r(XB%JBNownipQ@AHZs z2m1qw0yBo-jvb7D!XX5K+ZalJZ)5n70PuuVshfMhZ)1o6w=w*i_WOX_7!>q)lsD{h z{hl&@7Q4TKPbPC4fBN8_2F#xlY&X)bjd52`UU#1iCpf;Beat;rP0@_{&e8Zfw&N1= z=1dwtdG_r2QRZX9S?;xih0Nlk*v^_uErUmGs$$)~+!l^!_8M0wD{8~}F9*(0)|fYw z)47+n7Cw{TQ6%5rTF^m0PK-TM{d|@~rgfv~J@im>O(D+;LN(T`@gOsHwOJywZnZ1o zM$>k;@54rGMPPog{8~S4aT=IFKS#0rOq5quv z<^feo(ayobXYS4kQudqJmVstn0P~qi$=IstG{w&Gtaz;E%)9RLSt7|wv$%Q2b)mol z1NMZv-`Hn4ZW-X zc{Tf337?@{(v^)S83&K0?`ukl3tcS-(in_Vw>t6da`5s*M z_=UhJ?wDsx`)MoLn%>C~Dn*Gbt1J8V~dBGFT+Z_5X1MS&YA}u z=iM1S=1K8D7Ey<%pLS53e*JtdWgs*XuvGAT<@T~@aLgEHWWwU#qsoW!OG}aLN1Xx& z=WbbHfGe5xJjKVbjzjt!qcTKtxLM)lA>T@R)ih^`qWV@M-Vk)fBuynZ&B5dKri#+B+wGDtD*85k{>C$9hVk zMcK=&oRf^}I(R(2pS!e06*KfSBz8?Ki5bf~FnTMui~}`$F?vlx!k;9FY4blXm%4`r z|Cu<+Jsa?5vBoH#(U5gui`${QK_O<^ga~eU!|CnBjxU~}$7?~(s`4H!N2y5QgPlZd zwi4-K%kXNB5@|TzW>H>rG`o0O|BD$NWN%-R5G0XcrNxr&yqE}WjFh5w)>wP5zdDsI|Rw}}F=UVXZjgID#NP}jzuJ(+nvMcAzu(_9A+R^Jd zxN`|62-R+}XN3ttw8p*kMzjZ((W&kcP!wvKg0}Dqdc;!BH$V4^32k9y;Rdfh8mvNX zcj`u*G}()>&Ughh&%GE#E87r*5@CRFpg0v)z*<#}JwTKc)LIvS;BML+; zjsq0ujubs6w^R-*c5uHlD=3OB@Oi0NX%#_F^Zm%$5lkrqFTDJFmMPR+OymSTq>?Kg zWdw;0g9O?~7f)Uj_B;vf4jYm%a}t-Njpvk#3VM2AvZ)!bV!a{FWcF$@9@K)_{D~O3Fi|UY2I=1R5#~9MCaa# z=;grq^34x@65kl#%x}J9H^C(v)bdcJ&_R0(GkEw z4GTP-1>ZbW=fG%%+nN21)&+965x%!^y|*bRFBcy_&R-zUUmy_vtM2Xq2j7Ml(cZXjI_0R21Qm=lOC*2e+RPH;OUh#LT_;(%4*2zzHIQ#b7uJrE$Uf|VHp zq>FQ_r7Dg-2$0|mNaXxCR?dI1{^gW3MjG4LTl^#9ud0f0q_G)*0YUzw=v(dn#y_h* z!+Ca#{V%<4AO9jHj1lnL*8x9=+ql4yR;I>(D(n`916Ff3LEcv1t@51z#eZAaU;f{< z`;Qj?#Q;-;{u6i2$;HnlBJ$T}LZWUC#-`SAB*+AAVPz-GuwUQI0J1U@X3*kQ9OA%zp=PBS3J3sW6SCDpZE%mcUKG z&DOyds0@&st&JT*z)hIpH;Vv}zXgLCK)*?l*1`-hMHLXz-rmOA3b@nB4q@kHm1tU;XYd<=j{Co^*aRlw+f1OiXO43gvkw%FS-?WC7-cKpTVSFE%>_`){4tP3>*LRxmIpI|ulmf~0`gak2t} z1R%XKb8&>YTg+ zT)YB2+-w}Y0vsGd;Qycmj0Lbt0s;3x0RI04>%UR{8>_vOI$%N(Styr^8elOVb}m*B 
zqa+jP)*x(vG8`aDc91OG*~8e%4*o}n--7<~34w2G`QNJH|FU9?9TXM+|EtE<_O`9v z(g{e}n>yP91B0BDFoUzRm6-s)2@fCKgx8#n6As~EN5s*tpE$rY7ca2tT*E z>Hn>b%uN5Khl8_|&F|J|W(tPez;BC00A}I@6frXuFt>NIHAVvc!NI`>=-#(Y3=A|D z1`#0uiL^r6z(szyb#WWypNx&wZL1To0lw@Y%;3gm1~)f$wm~w8SlDnmAPj+l(hTkd zd_uw8%EH+RsM_0#w6PNTlld(K{9l5BoawEl?VbL}R)ahKQ;3MNDbm>GKLz|IxE1g( zw*SFWzd=CX1dIm$SI-y00sTMv{l63RU&9?xYr+h!w}Tmw`|lGN*ME(s|8!u+|GED% z0dvi5Z2~5|?KzB60=M0c;~6I(F!}NR?r{tp91H*%&@=xR3iVsdGyab@{ihgvC&vHQ zzw{U-_5MG47{CI2GsF_$_+wQj5s|-hpqmri90!9`kx<&kC&I57i13l1LNf7 z=LDvd+aJj8AJ{W)ZeV)+?=aqHe89~8A22>HUqD=e83d<-(Z})d>p{^{2wp~ zCs6tS0psTf=GOm!L7x3v8AvB%;433ez^6q(U^U>ICcwLaz^eB4zyy9f&4R#kcINh= zTdM+V2{|c{t`HZ5L!6gKN|IkjLQ;l{N1TsCT$&FeE+q|-dM3#&&L{G}Q|RBW4G>6U WC*N)>@XDMlcWPcV_;nH{c8Y50I@A)zRN zK$J&gUl^mo*K{T_s)`7N7b61U{|13LgQxsg5C~UJ1Y*qyfe=bUAV?iP)~g7^KcK&q ze

vzWFb+F)tpTx#b|E?F=84{`=G3#gPl2V7SOCKE+tXARwY*>zmI?fn^ALscX5Y z8oSduIN6(9*_hF}csiKTdDuE25HevU8WM5YoH*akiO*5*jS&im8l6|a@cy@KU!zEv zbhluv@C`i@Ymu(1Jj2E1<%5gw7IE5vcH=%rmhlCXtln{!FUS6o6zTkU)i7zNNYzU! z?-Qr*%uJ!Sn?zJy!FIQ&Nr=zpgd7 z?$;vUBKWH#6=yE1%N^T*o#FaZ+DP}VVJgGk6kk!P-o0n3->ILCmhW(Gw+c_rjFp7) z=EY_2G|m2Pd2K8d_)cBlK}pp`H31g~w^eB}SGVm@k$$9zB9K}b4OKyN=lKkNivyh? ziLA@fn|o_m20iD`3fp9?-8Tjh^*b#uP?OQ6Pad}2dOfVd60-Ct?iPaeT#%)D;qypUDUmz3r`&-WBpGnkCLC zH?zp`#+^@p`JGQb-53|Q-?2qsAzvLCJ&8#f?~MwZ2G66OE}JS%q>%DiF_euiAq+C< zHykKbJq~h$!&B5H1)SnBoX@_L57A+wWYO1i-m6tj|8S=%%S(9KUD88u(e+2UYyK7w zRikKH6#Il@a=EaEL0iamTcexYUkBqr#$Vp@)=$~_6R0Y-E{_o+FvDb8ow&-RZQFnNG!EGg^DxW#zFJ zg$AMzeGjMdp4)epCxi>sOyTtsJZS2OdcLy3b#cCly6X8{D38|(ndHtubR=p^Dk&BQ zeou+~R~k>=EzZr%Z}-1jlqE7>Tj*Up}^_2gUD14?PN9J>VrfpyT~fT_th% z+tQAUvJ*x&yg(X-!Y9u_~4I_m6A~R7~hz57r(Q7j@Wtd z`0m|}#@PFrI1=v{X=2=rq(s-&wj?qVH?jM^?AOf9nXSnN9Q1oaLgRn_;2{>AJZ){+GJ2bRPPx;Rwug&I z;d9?}x2I2^N=dy54Gq;}|58+BQZY7Kst# zQ}+!^AvkzCIW8_V#{4i?X0dL`>|v-@`8}D=QG? zzCL1my0=(bT1p>l*%@~qpGH`@Xn1JV)Yi7qb$uj1KmY3T97SATfAZpVzxiy9B~p6- ze7hN5Y8!qLT*z6^hlGR@-5R?Vf26h$+(-Z0cW#COvS7*G#Y97N3=DhaJD{Z0pu0oTY@eh1`e9jKnM=LCPmC}UL#QbR5 z%9^jvSYXqByuJM-6b}~@bEQAKBZsLz(LP0EW~On3k(c|yYxJjZ}VVv@WJ!+ zh_`S5R$9gDv5&60z*)|I7Jaq;rMc<)>Kr1&b>r{ukFYxsj5TFtwcf|28bxd@EUDaP zU50*_RaS$ai!{HuZBJb;bjEuhZQRC0g)4fvBjc{QUVdIXM~OaI`U2s^3swTZ`~dP9A&zQ0-${ z8U{WM4r0DNa{ut~aDRV&ysAi@84dC3)%ruV&wnQ-@G1Fl+R9|(8DXhhbaZr_oH05` zR^i;-+f}k5GFd%c3ECQ7zW{nr>-?_W)Hf?@>$SBtui5Wt=s_wvI$qaT zmsPJuv4SN{Oy-ZL{W9K}L#)IlCXRkFZ2tI>3b8WXeBGb@%#52HQCnLp_4Fx{e`+cv z0*6-g->+Z4{`~pV*?FstlP`61b(NfisVSN^&L_0!IkM(|K0+YR z_4W0Ym6g%J>0KXovn(M)hZ-**!Mqh%5li|WQXKQQQ;JRLIGxl|E?j?Jo{IH0+zvQsU z>FFtRg7tW%)sVZZ5Or$Lh;n_xSG*4G ztLy(0!ypLx3Nm+YP7Y*Kcc>4rrxS2nA-P4-!g6=#Pfku0Q~C7koXlWzkOMN^x2KkS zGxFG3TU%R)q{(okFR`qkdO;?=oC(5@q7e~t-}+Z=pm{fu!*B|6Sf$k<951*LHIDP` zFN!ogy}kE6gz@q5Sy+nTm>28S4Rmxo;Nck>7}$foSsy7`AIR;_k&TBCuC(l@hk6Nz zK3~0%_ti*kL&IZ1!4jP+W?^BU_6TzLP05el?Ua@KAQwHQ?{AN!yu7^Zh^Dn)Nt2%X 
zS8Y2nH#fKRJ6TOr6Mnn&ojdUm91RT(&H_}1hK8aU9E4#YJzfaaioVXMfTjAH#NJ`0U9ZZ%sanrn$*~urp7eJyTqq6O)oc0XKKHEff~6T%g``Jmn367{V_4lC!F+symT`R={BfQuza=j~-_StB`s8 zU`^@-kBq;+;WBCe@$=^)#5HU=a)8lLz6!0+j)bgiC+vWyutCto24p7(hn+;j*L7Zp zYfZlA=;-JaUL3L43sCai+}vQB__(>TFfnE1j${ydhB`C^BWr%hogoKb#-<1^)!#&YNmKtJ$zUP7}3#@3oCG<`S+JF+KbL_`GKOb9{Dz?oeS$S06|Ik~tX z6yWZ@tQcctWc0f{)Rd9IJp9-XiM+d;1;7koyAbxrFJ<7M6&70WFZEDFhO`=OOgCr9 z(cMezhK#KgmlGS?Z{K`LLPMhz{4G5_9S;vLI5-%>x?jWSbZb@gjI zs9XVCof82_DCifk8M(Q+#>U0~n0>V8GFMhC*SJ)mr~&+BN|3wv-$P$dFaKE-A0MB3 zyEIl6HD9Wt$SwPIWrBYK&!0bs!(O14hmMN+J|Q9D!-rD6y14E|cD-5$NI3v($wG#Q zhaKzm@IvR8m-!(?03Ovso&x;M%j*i;kiX^mtW2vI@`vqolP_c<9tt)F22((=pFe+w z(g1fn6rZ-;@!#~cAFQjox*B2h>J=ps(T|tq$jHd~YyyuT-vDsf^`Qx<1s>ksD2Sle zn`#){;q2_33i-dhoD0#ndT?LribTP_iQm!d#1{a+aRG{Z(mmJ{uN4izk2~zZT zWe5KLO)AXA3}oWqcrGjJx!ri2e+LEeZ*o%X@<8o!M59FKX`>N$^1WwKIR@UIo+=zG zE@wA;J!-?}z45m^WXgM?r>>#F?|9OkFzk&|9(J%~VZzUm5x1Tc-rh7NR0$#y5~zVL zBgNWh=l21Pti8U4Aj4_@SLb}F3;kB#wAo$T_tM#?^U-2(HHqx{%n5RXgM-k<37E9w z@VMbx#YoYQHT(HBzrF+@$r|5z1D0=69vP~dDfbxUU_e07oIZ%}RQf1T4tEpch|Eh& zhYB|SS>B+njSZthlCocxbhdV-rHX>W?{~y3@Pn`dz$vV3Y)Jmyi_*_-9;1eaOhjw6 zOl4ZEetv#yyg{v*geFM-S5ED-4hJlp$qI!}F|1g$SVzLyzA7-r3yB=e&ClNi_qT7~ zU>(^3c9@8=v5m9d&ks4$l^}`3O1GgpLhSStA&_9JSyigN`aJGBWoJ zBk%KP0A`KYiHRR@0~ej@WEci8JW(ZvMs6NhsK|iY`6gTbm{$Pi{ssXAIEf`ejxio` z?$J^I({?(UI968H%CBEPF}mN!geFNcS5@T^e$A@dqL~0yR71o0##lu^mHn5e+5Nx^ z)YjFhSOhEOv9Pe*lO2H4Ku7QmTho>%dU<)d#${FY5(PC!VzU4lQtZv~0BE}WI~fHb zndJakL){|D-`^jq_O6F8tgEF(+*;HY!TIPBV2>6+&-NvT8wh>p~xq{s?lV z#lltO02YC>-uECtG6sq!0OPAK%~x|yby&f30E{m8d&O=A?i?RyJa%WV&lYDQ3jsdk z1d<^f%{x^KQ;psL6YsfJh?sS~hc9aCuw+U!mKPRQT!b!zo+0uJ3ZS&@@9c2$^YgnP zO)%8yH31Y05HE9>m4tNb0`*haX+cI`KP@!0BZB-9w5mAz?feX}GEPoTaQ}g{xvG;e zHU<*uaBt6xt2{!1vAeU=n3D|Xmmx&Z(U(QWY#@hL26E*U6mSkTSV@in)IhiMvit

R>V^D{ZOLU78i+WX`jcL!iJcdEJ^-Hrua@Aed(MROsuR_ zoJRhMiHWKO448AkUtFFY!fUfKGef^U^38KUzw^!Q1ptHqfdL_8O}+a$2ic|Au-Sl> zBtyXQ6QDylKin?lu^O{*b2f$w1_24@*Fa-*`}S=(Ik&@qLK96R?7>;+KVD8^weUdp z>$h(gEdeNJ%gP9c>nmTHXnwnY0*@Y5K>dZ<)zRKgaYlmR5{_el=105v$pirE#>PfS z*RRaY`2h7(M1FYx-gM<9eaM5v?&Hmgylvj_`4eDDypOl2DJa$!7I+-zen7Kn&27A( zFCAy9tv#0YBy>$v999MR57MYQ+QP~T87Zm7N*p~pd6i`|Lq{GXuYPK=Hw7WzYl zdPX1x86!*nUFCa{TY;3VP!=Hw0Pobk#q=$t$%gUD+9(?w!anVadF~!1Bpg@d!LtR0K6ls zPkI|LE6@aejJx}B01Cz$si^1PkMZ%8LjPJlJK)t+(B43^ZZq}mGzedGXMcZ?!{=Zn zHY0;}_(jXq)D+NyFT^*s_K)AvW6r^i2Ji>vtp4?7{cBHsed_qmoO<;~WH{K^;qTtP zLvKz`O48EMSXx>6+x6IHRzN9lP}fh)D2~CA%5OeMHZDhoL6wOJf~zRQy;1x9hYwp2 z^T2bcf7O)(<}tb>`>W>OsZW4--%Pj+%6hQ3XMOY#-;qiTT3%(Qjft9Vz`6(B z1z2x|YS568GJ3xh7yIn418D;dU{y_x{oS4>%FZH-N8_3(~8V{(HU?Zm%?Sb)4dhyzmWXnm9eY73-Dvm%wmbJ9f37ccHVcz}V9-evk# zg9g>G*5NKAV|!n&B^9s57g4iAN`f#cz~VhLs1{~saKh|Y2XZ4d-r%M)%jI%&ag|&2 zrk8v>QYjjSO|W%o5)W!~6ux`6bzc2u(53}^`rtJ1bxV-(tCMIX@w;vxqt4&F<9|XM9m9C z@e8aa%asd516t$3IvoR9CN%UJ3w|>~oH1TDci`H|5H9ozNDp&-I-q9~Qd8HLma3te zfBnjh_zN_GmKIrj=g_FQPRWNJbAB=(wJ{%Q=$DW17E{00)Qmny9!)ttI~!O%xQ@#% z0j3gm)%Gaxroq)IIfCnRbaaH9z{|@EH-J{q$@C5jU4S?N1%;5$$xe+Y=QJY382I?e zY94Yb2?A(_0aE{657P>%jnw<*t5GH zKy`xXQCRq{ez>%xq@<__NKI%t`z?9&*lXZ*K7FDG9Nrgv`FvbT3k3m;LCGVwsN7sL z506^a0?=>(rXiH|^-I*5L2XD%O_fihQ8&-1POte|@{zgZNWj~}BY${pnVBw9dVYRB zJ}r%nB%Fl&tny>xx z#op1eT%!oEJ`ixwi`idLRq+yFqT0S}a^Xw0jZv#V4VnSUZhhHZ1nvS5HPEwkzAUfR z+X71E4k00IrLMAa%=`D1knC%k+ho>fXJ^Yjc3-YlX%vb0o*iJ&`t(6;0P2F8w7hr| z;`bRDVTILKd9P){*C3BtS~4A;0XYX zO5%o7;Xl1$H@2IvKQq(eg*wl)1VE)b+?ssM!NDOY=n2R_gq`&zC+sJ!@4@ZUUg2BL z%YS&G=O{Bm0tPfHYO2~+_x0PWARR>^7(Icht)X@<-;-W6pzA6aV1(7T_TkTF)J823f%7+^?)rsqoTzF#-@2 z8{9?8{xWZdhdWV`73dIgM5g8g>dfd-q(TCg+N{`NQcUr~6qK4MOb9Zb6c&k{F&p;K zwu4hp!551*{-g5xD-+*|(P$c09iY1|r|` zk5vY4ue+-Y6&We9`={f#oSZjn3~KuNY=mJ@oh^ANBnUFqU%J%SRaHUN=L*UyEnRv< zW}T(nVG2?asDg0DpncBnhhB(*lyqM>VK&K$fWE zyJBF3LRe1LIX^4W07(#clJ)MPVlwyQ`8O zIOHij)m?Lv8Y;bUPj0)Rm2oYnwe5ob~tbbL9>|VNj8JlblNVrLb%1%2&CFcw~gPzdy6T 
zCGBxs-t*mofwgoIu2-)}gXtL`r)Kw?j*m0CZ+87CYkBb8Mqa*(N2fT0c5C&adXY%F zDDQ{Py{Dfjo0_QbFuqpI_TQtVP@E`GgPcp8dYLZu9WbH5>z{2X{QUTtgOAP33v2(m zP^Rjcn2>~<>8o&Ms^xt+f7NHngu^t=`0GYLt0^R?agSr=|@)!#|2$A`gtP68gvzs&c)H5&gD zpu;>_B(3$hjY!&GPQ|7A#l#-clbV$y_0yd6PllnITH7~=fa`@Y|LzwZKP=A=0^VT> zc*bRGqz;V?Ed4>#)gA9i{hsl9ZdL$?8jIF9N3~!fm0#>n*Ie}ifjFpM*Xp1QqM+k& z9@OpNWwJ+{HM7bh^O_ony-%g{X9$o+5u&!Nyw;a5aLqp~2*zozxPL~hgPz(`Iz8TD z%6)lpp&Xjrz6$-d->)Ibwe6zoSI7ug4v9GbdGR>$Jd~i_<{Cur)xYbZJn?Lib2(2z z^%qEy=y-TW#z@@cdiAIjxR}8N)VPd{F=@ER+bH2^Y_uUGv`31(gf)i&Ofq*GO=`X` zF2=;8FcTka<0F(+km~A`)lolxej}bKBR@ag5K2rMp8WfLUmx(0r~m5aACNpA8#6mU zFKTRT+t~C?kehMJotlCw=T=%u76P@5M$o;~=K3|T2j=$Mi)(VoM7yWe!Uirbm6NrO zM@L5heOOOjT=~&Zz#K-66#@#DwMS`l0vW8Xo-C$r6R_6^T32sm(2F1pJEngHv=hYy*V z0aFl$|6KghvjgU-$j(dRZLC0IVmd;!CZfG?4M_a*#%)F6q6`vVj z(3e|qwlgs9Ejs;c|7>~Y(rY)lA}i}KkItFgSHt15g@fR$75fN~H z9MJ2habHTxUk{I$cV_igbu|I`ts;$9v5EpDgm5B%TN?^OoEZ%RqpTbWkt9rxj*EQf zPFW3x5VfAZ@hCfD2I)lE(6f|cXEy+kT09#G9p&F-Ku^f-yzzy{<+3nMbe%`}Nw3)M zNLo2wKng{EJ|Ye8V_NHXEkIxTY3CY1FI+I3pv1YpC%Lf}~N-kDo}R zFD$pcBPJkeXM0#K?M(2E*Gcf#qLCe*4d04OOadAOu7kkLCN3+9uor4_a;B=pXz9#4 zHsP&}F%&Gc+B(R!i1=IORC|3AIA(SC0I6=iV2%*^5isFCOg zvK@s*5=Cfns4lV+I?|N*fdxzw(?lv7=#qEI$X64g0O=JJ-wr#0uKmS}1MrVPNiO3= zv>?=d&zbApdwY5s{kHUUbSN4tD=XBQ`M6cFf_V}OK~K6eX-^^}FK1)ROG=h#)EhiA ziusS^2I__XJRP)x50;f>AQNXH)8)`W(Nt#!HRKUDH()-{cDwsl#f#8HX)%-y8<`tC z$~73|_KuoVbh`<{yrAeJoaN<5qeZ`hV)e~;VrlKg=h)jW>*zrkK?>-nr*yI6j$Ejy zh-eJ8Jmm17x&7$KIH(9Ji~=G|wUpG=8Y z5g1HTf9MVMx?_oi_&fgnm6^b-L;BN!w@9%JD=wM+2nm021P;0-kFpv{aIko`#PL?9 zM%(J3IKj*cW>%JwDXN*7!y)tg#GoJ)CwpXm@uiHH z_f0TbvDWT}?~4}z`>8tCeSEsQy68mH-HzFVTJz)N66Eg7|XTtrWc^T?=RNMz7d#^CcqzX8w;&!Z0bfJ%-_9of3_~ zRTm&ZDJdyMGkO=D1gVsOe|rC+YtboK5(y!cR>~-vo-z{KyF4$zP8i0_!UDX@@bP3{ zXhYS2wL~tY&OS>zUgEAU|Bj$vGMwCjb1!S1(DwG|gfGU@KzdR(z}wp5w51@UM2W`0 z!<$*dLPF~Ov~XYkstPQCe;bYszr$PLEVaB`oaU6LLI-5{Zt5opC5a!G)}+=VHOrt zeKt_h?Wnj#e74Eg*Vm^jp$p(Uh^iy4tpS~;U&l5ec%rBukB^VzLElhcF9aoCP!M8Cd4d7+`^d7%OXH$SqbdI-{e!G@1QzE58^7exn0R=*S>qy6-ZrjA>f(&|#8sA`Twr 
zHeV15P!V9JSB(n^2E@lpXUBFgzL7cjC-!UB!9$ovf&ikRL_>|kfl9Mz7)UC0qpn#8 z2*~-lIrv*%PA*6FA#aLCmNN9e?(XggPH1%RS&sa&Myys%xe2p*Bc06`=?*6Mq(ytfx?z6GRGW0$C^5xB&6%I$R z>fE&1Jw0q39C?H;%8_3{Ot-Gm$?gZM35e!not5f7%xU+7EJR>lH!Br$GsT5{m&Dv>slP+T>IsSao$X%NUO=0?WB- zYg;gA{Tt3JLZ7y`L*cj{56r%&kfGH0UWS1_Rwi!{$OLl>3r$VURG^cRlOHB@MMU5P zO9rSkG9nZd6(b`efOnG2f`(ZrEvjR_VAz^3l`Xy#XovqXMIZ=vnu2+Y?(NLX4PgUH z=-SB7J`6mGE9gNrf?{vKmD*XkZA1p8UL0h?e*>yN#rX9Vhdu`y;bCZ$qMDcxV9idC z*@89+t{0Ff2#AS!L7;*A41_FydH{u${n{EG3-;XnEsihGs;gn2Hbh?jzzoO);UOoT zBQ@oFAll3V>X*}Vi5S0D$NXZ{5)w`vJ^g^rH=pQheJKY9v!2tg3}Rp+3s5C-quJQH z;)K+GRb5+otC-ZXxSy-KdYSPW)LjzUIIxBSh3{1Nt+-f$JrR6Eg%bRaA2qH|x9&8 zRuWBhbyAzA`$x7ew0WTC~JVy$+WJw)~P{fVcnj$|{0#CYnrX}zbfwu6nJgIjXV5i^NY926DJrg5srriB5k7S%x3;gN)dojXL z|D>UWp>}l}Z|WfYYXwfu?PEIW7}2F8k*g@!4Nzk-O}UdElX?i#VYWCHdiPe8C?+Fs zH{*Z!(3>Lh*zJv9?>W=@Ulm==V;wY`dlC*3NX0s6x;3KijdYlwa&k<;;SI{gM%p5bQ&50io&D)3BQAtpl8v1Hvd*uR^Fx)()CEQhP)a@v>AOzCBQUVl*{b* zZ}s4#;1)X59yLHDD;$ACFHOj`YjE%p1H(c$M>E)JM)-R`KL=|}TJjmq2bqZ{amw1a z5)%3f?l=e#+(DrhLZn`9JRFqQnOk)7^z;;rnz!L=sICT0r~=Y33I-l%$qn)G@v*U> zAbzU1n-@^Z?(ZYU;o^)$H|5SRLG$*W*_U7p3Mb7f#!*l(nv*gr2a0sI?`~$l(UHx{ z*qs%Z8HbiuqysgyXVQ|EOnsKU>S21VF9-bl@_Y2TIZ@t2nDAT}x?AG!t zmSc*`!qn8~e6t4Z7gT~y4DbA7ulucM+j0Na-4Q**6c%=2INjfF7V|kF5MdEd(N?+b zbIF^O+6Mg)0$x8(=z9LoWP^{W7$>K4iH0A%5ZD~Sf&hkhrngoZl{x~Rgi*Ai=^_x~ z5fNAXnVa{ag$2{9GdM0yOiUQ#3zV5;;%+o4a7)0!G}Oyb;n02%_)UcH77-CNpT@>5 zFI)el#k@zg;bYZ7ZE6xH$Q%w#qZW89CSN(4jq&D~d24LS0M8gI~M|t%Af*}?b-Cd-kZ464vDjnaQnO5*0 z0DGO9^~3a+pQawp110}NQXCqb>gqD*2T4go8M^X$gJu>MpqHM5R0>B^C{2lh&{5zc zBxA4|s|5|m92Jz}F|!yiD_L4H(_={xzc`&`KdT(w@@fLqiSh3r1rOKB z0&jGgD!=P1@Swa23HgLc=9wOf!D^_~w=W!C11Lt!q5+PnLCeaxIKpIwrP zQD`hINso_zz5uBT*6-?CC7E?2pu!IJgEIzUGpVx0Sz%LDgH*(r4~4gaCGho!Pek8IibL zW@9v@i-O;vu8b#8SYOVx27%$$e!MbFZabRpi(*NL_wme`E6`*sL$@|Ijn_5G$~^4Y z;urwGfV{N4y?yk&P{3<{RuZ&l* zM0Y&m=f4K0u#MrXuhOVi29AyntAv(3oUY7}Gv)3()b-`zpPUGHt8zOpHIL*CPMD$M z*wSH^mWGt*8XsC;4$Xp58DsXw)!Ya7bMx|ab#)u+>%rv@eKc4k9=mV-{wVlJNQgpF_!KW| 
z(0Giyth#hs6$ODR0kb9W&w5Qhb&xq4kEg;TB8oFAa3rQ^((s&|GRp{DgjS|$1*kIn z>9`5?^fUBFHa3j9h@K=4aO(IHe0+~8nI($c&S?Wj6i)nxlt{Q#%<{hmXerc#4`7%C z2xqp1j4*V91zskiOlAKnUWF7?+Sf9!9rUu?Y;3kSX7{Qp5Evn$gYy*pGcZ^KEfUBx zwhjz;2Jmj9{^$rQVe{}1f6{XS((nl=e{ih9yGTMz94sVEXP>Hs5ggpI3}nsQpJ3sf z*}wtpw%$*9+jwQKQ817WQ0NP0;=^h%|A9rcx>|#YNaMweI-d73$HX_+kb$q?pvoZ| z_sw}3)aQHB5#lCAu_4;pfXO(?R0|HkoCdQeoGt+|)t^i8*bPJWmq9IAnKtypP$H0$ zmTm__H3*{=#KiZBu&j7ebV|?>JS8m+_wS*+z9N73Zl+<|;DVs7?8ag%=(GT?F+6sOi6c{hgSU!qi3)B2gL3496GDAz{s~RuJbf5xdQH?oa=vAcjy#%>} z=yXxvck==g1P~vP>g(%ua#dewXsFg9Bi*=wz&7sX#Z49gzyy*+lijmy4O&!u+S1;L z&%@SWF%f!w;lWdj>RB+=Sa3}XhG{^ve>iBfBk6EiSl{F@B#BU7UE*Zb)_Q(s9W7xW)F(7CMc>5*dIxNtykdhD??1O67UZ`-V>(r-tskIqhsFtG%EX+pNJ zSQ*ClIzZ@7Sd4Y$Q#F=^s58Sz2>1rni-y5_DkvxjLQ5NG90RBrn1Pv+x`ie2-HSTf z+JKt0b#+0Afwl%xIjDc=H6K5I)Ek{*>@KSrZT<$fhVpVpDP6nvXF3ED;1(tiZ0KODeDzvCbBf1Xp`60!Obf+xctWiw&xVH?hm*WJmA#%-G}xR=NT)y5cQSF z+HHhP-0!X~A>bCQtqbewPQmX1hHzYcjhsec0oZ}i0Y2#-%EjgEd~}A{{YREUU~dEH zs9KP7B_I3^lEKSzfIJ^Sp)lo!Uj09lv4ySejgSvlz>LHP+Q4VrPmBkzv2$%FB^-1p zjL1+?mA9ckbt*Zp@!Df>S(QXsrRfrVp;IKL z#Sg`AOW{4-j~yKuc@5QDF4qahPn?&21Jssya(l|^^b`t6g}F#gEzIMAB{mssEb8h+ z^-FYYm~DtQ|8BGbm?NA`wQuRco!WYs*KY|6g$FMME&?hl!-EH;#KevQR4^0(YCS|^ zGzD+h&!554*(zcf<6=0zCln*^f{X(Fi&>L|lf5(z4D2i{My;doz4RbMUs`IJ2MrK* z^gRi#C)j~^4o8)@KgM>Nf?qZbtkQ6l6~P#$FPcLv^E^}~yJbMN*`Gl+S)uyn&v#HK z!N5;h?;~>_JP%?(M|4mXug~5sE$!C1pZ2pE_hkv&IMb9gVO%jTZV&7#P{|nMp~1HT z6Y0yBl5qdlXlfZ$W_E{97IbHfV!ye^-KApZ|Jn{p0O22@xqN7{f<{&~YgqnMY({hgQB4 z6f84Fi6BD)t4y#TZd(~lHNfl<82rG2`=1>FRuC8zWYLpiW@0j!o*q^uL%^EY4do6z zBOqGBh!ePd^EY<6lUYSYuOR!uND*{NT>SigFS<*^ZsCxB@=Huk4hzYt6e9~+Tv{@y zb9!2$fq{zZvN@hmUq=-F6MTA5#y0?gX=x=ns(q&G;56b!ngJ((k~*?O4mFr!0M$be zEKI{8aPvrNnStyH=wZhKGbs7+^#`9U(}BQ_)o!|3Q+{N^NhF;8f~yLd#5S6!=Mb{Jn-%Exn@7l`2N)_b9jvu zx?->x2nh*!!MqIkYze> zlT!(yvSjHlHq~Vw19iz441d z=R2Zxb#!jJ@E&tom=A$D8%Sj}ue`kIVlnsjjn7;z1Ht_deoC0V!t1#lkzQO__yLn! 
zfLIItM@sd*L5ttn*?|lV6H%;&O<>Ib3N;%>!Qec>jKeFRa4a;NF=UhW7BNSJ1i_8* z-qDeq-;N5-PHAne08Sg9&EJ`=EwKEV_oM*T&db5^1;7dLNY{X$fYgGyuABK-nC+Q% zvsWk}k|C_N$6zU1n&ga{2d5Rd5~5+mc5guzO#9;E;&2O~4OeF3;o`E&*7u&)+Q0T6 zvvC8+k^fns5XDtR<#vcMWI-6T2nYzkL2rYo;o*VrE)SE~0jCL=(TY>fZNzc@0Jp}> z{H}Ky#UI8hVYFL@0n!A3Ls%0`r@~ExPhb|xh5_?WHwrr`DzdBrP1Jn8xwXZLixS?7Aq@JrhvY+)t49jS^~N?7@LEkB!Lnu zWaI@`a&mH9a_%2D8YYYc1WVpMbP<9x$;0!WVihuqk-dG!$B!3BfitxNm^i2y7)e~& zp8Un?%n+^i(@pxSsxLw4SX&zhHxo<*tHvvT%WZt+nh^kn+^^RV*Gli-hAUll~7k# zhf$iUA1{o692l|*1^funnO9O`3)65gk^+vnOAj+~s562=DKw;F-Bb!lPYh6$mk|K8eq_rZf_ z)7KM|lltJoS9y+a2#z#;b#*WdpM1GX?IY^4`ibKL<{uz5qxU$QsKMR^j+!z4_07#j zaC-Zq-B|Fz^|b5p4LryJ;^LE2Wn(t@_|Xuh{I$W*@B>P@K5VKe4}h7B445;3EUSA|$~ z^UJ(|0o>AZlV@4Ve}&PQnFBW`TK{T^@k>@PF)?|Z>=?tiFvv=Ic{d&VW7oAG%lmbD zc8|k-?-R}o;HK4 zFic8P5l7JzXo67M*C!hI5BBim7;Im}`kVzg z-&L1-xm@d=bzt3DNnk!au;3vo8j`q#L;$?cf$OE@6n7vU17VnIfz6Q`fTHp-n<|}4 z@a997Ery|0Fn$fS2>2=kWObw(9a*o{&XCN4(f#YS2ib$I^Bn|K3XOoaz=#dN?5g32 zr%zdmK31p1?x;tVI7W;0W{^CAMHATNf)T?)|VUa4z#K~8^i#o zni?Ge+{DSgj&1z_<0Kh|@0`VBH3}k}(Aw+Q*y%&3;^0sg9*&LRDMzN4{aRWIgAx3? 
z1U`#c0)nl18GNHe9T26>fGYu&z(CxyNDA;M9DW@OnZT&hVT+Z?$LLlZRrO?tm4Lg@ z6N)?-#vvVkYrTufLD|7ZfC*C+hj-qogF$<7F-w~XH@MXZ_7onKdJde>zW_A}?{hE@ zAjR+c@;IF3+lD$|9Q<3ixaJ(_9#g)TLwFpls6Zn=K894@9 z5(ye435UT26d52tPwI$a&Zyh#du@HaYHQy$iJ3(kIP~3NR)9?G^~Jh@cr5=2K;;kX zvrtc}p@2Z0g?E|YfF+lfZvAwn_*MwG6nyK~X5<_!K?Q1u8FK0)R-foyD)EFGXNX1=kSxdc%K$eVH7*v>F;8 zz{g} z|37&l@?dYs(&+j%U^0JD457$8*(M+O4QII2ps`T3AW4BSqnDP~nuxc$4ImJ6!rG{D zxfX1Z;-sY#-i34F&CDhd(ZSj<_M~W)EAZ%0=G4(lt*wQe7JmKytpI#8fS$VsEB||fXvFAf)R08CRg%r&?uo$0R4^d*C}}oF7-S6S3o@g#>%3r zy=>*Qfni0+_@C3$M~8=DpBvIfzJn%gg_`2sMo_BGQ4RA z5F~uMVv%kovYYL2bAgFcIkLUQ| z2Cyn1F@xMd`ePVR%nv|ijlAuOaYE3(eXiR!#>aO&6&ovs&d!0XNh){0$$0xqPpMi+ za$j6pJ?}pjHfOCiwjZLs`}R$yAQNQ03JN}T#l003W7x)-R+FF&B`=6y|5E4YMAHb; zT$cG<&a9vygr_kX853N;Z67^X(mzyr$`yF@kg-WcJ6}gPFc2|&#jHWzTeieb+v-Iy zMKBw|x0Pd;apkrFi8ixtaq zZ~VOATHC|L%aw_G9JUr3QF(cJLU|3&U?4-#n~IG9M+j|EJuD84#j__(Da?G?Ss(I9 zTwGlIx9%>!2=J(tgbo7!9HLDhUi?T1)gusnq!=c;-yvg;(yn=@sR@To!Q10qN$C} zi6_OVO+GddomNW=46F7YF++w9?b5ZNB?@iT;^o_kOwQUzEry>;Q4sTCCTx{a6 z!e=ZW+Bt8#;)adMW9PB1?kCn6J==+)!@H3xef##^zBn6OnQfp68?s`{8+1HREVS&i z#mC=5LPXnPj(Xp@w$F~=ym^y252;HtI`!jJ{lrSjgr?fsFWLR>yICZ^YIBjF)W^0Gmp(~LG8}r3x{7} z>X!3=M&kKVT%1(!v(LcLZ`rl36#CUKUPx^}5q1c4 z+p@N$JJDz2#C7ieZI9D-*p(LqgdOj>@^;RLZxa6Bc^m1rS#RFH{R-@y@T2daYqtaJ zssC%Q>TQ@iZyp9r>)sr`vTCP|fB2dEt{D_Z!w(u9J#|(}@B0qb60*<0UAuBj<%2J8 z)##^nf zNbH~g{rLffUj9=yHNGMtU;BJw&~L3JW4Fn=b(&4CyMHa#=(l&q ztHvj764nV7w*S?c^zKn7(*8E|wEf~bRbthpr!?1WKq*Lay8BCX?>=Dirs2oT(RDq+ zdhqA9uF@27u}+nVI!>MusuL8-XueTTkN5lY3thW*N1B?pi!8lZ~$t}fQD z8n#zOJ>*1@M z;SYBu0Ew|}^9`Sg(iV+m_3qO?<>BkAFCz+zi~o{`fqyzKvW&nlxWqE5ly+)H5?tREaLFs9J6kd0VfL7e{xGX(qFdo=grt3(Dlv z)(8laoSZx)94*n4r%$uU*U)^AUL0QRc#-b8cJhb^vUAic`DOP{>)YfWka}{V%S+%` zt*Rg;@8sskJ5}soT^DY9u)KHQ$;xw%OO1K=cDBq-9*iA|Sx&D@)}xG~+p8Z|=@sPT z{zguV)Q@%*46Zx)M65`Y15_|)q}&%&@h`Zy1=B?)fELm zAyyKB{`PgJ$Bn!Gu%8I;JI`#U46Cbm0FTKZKk~Tg7rkJPq zoG}`8JBVI%-^3n5km5P3B(>pr0r;8QEgH4+852vu(yLT6a4Q4k% zB5SPZ)M`0s`;2-oB$!4nlfxf>eZ1Vg_I9b>FDfVM{Jp)0T5MVA9{jiyNv~z{W#|)Q 
zzW#R7l8=v%C(3Y}Zb3I(GHm=+EM{vxfCs!=T~Gh$t}^kSiAhNk(nZF0w5*!SD3w#< zwE7twoskNdVE0IQ$h2v{aiO3tG+MTEX~6AkMtg6qkSer0WD!|lkEUaKwEo;ta({WT zY`&WwQ-Rv?W~8$%vRJf8F!RCJuYdn&ZWo7>{A@Mj#7eZIRW_9&E*a0jLy5-GhYlg! zx_$HJ_a8rci-pI;{G^zIrzR#Y-V-y&iY+tMBjdyWw8R+3U-kXn!TLC5>0)pH3foE= zSDurX;V(#41K8W6;LYi%rYX+qWJw+ks?@eErGRMr+l+8@J_pT;)iXe}RP1(U(2=Mo( z#ntvqxn;(X^BKE)Dl2r%xa?3ENTPJ|?ANpzwV&1E$^pm5l=ht?v&-ys%JR>+fUyn! z0Pz!OIf$r@tMmQxjT1WU0EVGfpMXRY78k`1U+I4myZfVCFz8bI>;htOKYA!4E%#T~ z$;s#+Q`{}Y=yS=%Q=lY#5)G~w7oBD&JP$Wf6*G|cSs!-DzxF!@5oAz;m(ak0zJGrG zz)ljk2R3z@ZI-2q48%a?|2(#W*O(;ioE@#NGF(H$FsuJ_wJ5U;K9%GT!urpSjz1JO z07qn$qE9S1!Gwp4J%8SvizUYlUYCYwP(`9jrc#q@2mE}`%BDMf<@|YTtm4qSciNG% z!xcLngr8gpFEQ`ynYC+PHK)hN?|FjT8uf3}gVQ~HZdRLV4_po$cl7XKdLPfq%Rk=I zi{{1MDJzqfmVP*<1Stq@EWto0P!4}JOf~1l>W)RKHf-2xY8+T=r0kz|b?>Vw*r0YB zl=CL;P-#70vd2cYJfXC_QCLWitH5y_j#A&=CB_LAj+JH3`n(^vL1Z64{ywd)((Es95yWfD*Y``7FMmn&IEpiljSX4oDb`i` zorA+7I~gRw7s~Xj*ETm%Z0Bc*s2}rwhNMJo+rAxh(^MQF0r%@08tC6Vefe^_y80>* z0RfEAc6pGR3F~^YcGdAcm%DJFW3pO$Xf#X|-mRkI$CvqjG_kSb0R4Ws?MpDLHZ$`Z zUT5@q`^d|GSBN=N86U5yP5vu%eKKGA?mv0-QNm|MWPr8MBXLD-2S?hFr6kD7$4WtQ z;3E*Me-i#E}??1a=xpP;o!42YT+ibl3i2wJX4b@j6HU& zOP4NjsXIW=Ti(A14yVct51(&gVUf^JMnn(bWJpX)lyO4$o;}Z)cSpQ|$whQ&Clw4a zG2JY0;I>J8k)sOcKDWJ0`*?i}>_6~01`Z$zxY1I6p4dOHx#HA!fiuu2$Pu6&PLPD8 zB)vG@kp8q~)zt99(X>8)!uL!(E~(^smgyif*yFxyXees<(nC@bPI`uVOxf2@pLB7R z4EW38-FN>jig3CNsDY1Z2F{E)Wu-x2cMBS|h5qJ)zau|?{>%>Vt3KyD?nm7SSx)n4 zhF~*9%VpiCW+eHzCstL#TG4(NQt(<=6Ct-`SJQ~HYfiYXV2U$8J;SDjt$~ytwmnr= zyL2|o$0qGcPEJk%s-0G_PwE4kMDy&EL)bh$zf;oNiHXx><2aXxbw>a&|5 z$7>89{_4vQtFAX+yVD^QOi#r#Re*O_N?B?2Ym!-GjRjg&m3@_%Z<#BCZNkPGJG23G zM;3&p(*p&rN{$&svO|Oxs6gL6{Z&{?4_znVH1N1vR#wJ zX?W1giEOIb_yDQgB&BJT?_=E3 z*G_3Ud;>5F_>2rSso?gjB!>{rX`D3{z=1k8E_zX0gr%wHqRS(!nLdhm-(v5F?K{S+j}R7D9(El!W%r0FWYZW4Qg*K;m4b!*!C-;d85YeZ>shyfSg?^a-T zx|NIsYqurYn_Nf<+WIRHGiYpP*=ov2ApgSO2_J@3opTn$EoxG~j;6>3Ugdv^;~e+q zUG(tslG}d^#lx)?4<9|+H$!j*UA#uD=rdHXj)d{yUgCx4v7xHPl2d2_v4A}@;>5%VSTpR@%;B>8=pO)QZh3mqi7?&q=~hL>h0B)@rtG+E 
zvlbGo5dMHaw(B3bJRUY97^mX}Ygi_myu2PwpM6S0MXK=doHN#=u=Lp8e*S$!uDcyE zo1o=Q#cFlV`qaW7={jnvrT~f^Pm{?o$e^y_3dgqT$b5$T7K{x{yF?;dc|eW4n_Qn- z=bq=iK2l?IK?}~zNq+gFkDT0BNHI-2J<O$%P*|;>}7R#lpkz@%rmMx_2Le z@|hnSgJwoiMC?`uw1I)v@uQ=IOTpwP2}>AjNqr9bIu8i7M* z{p8@m;)_|Z78d8BZh-H<>rV%Y8mvMvOwC6FOZx>j0-3S)gPj`V`iKrjkIB3_eBP-E zya$dad~5NqZDiT-$&}m>o(GOyag!wtv? z_O`#PY#_Dgr+^9cFbcJ|x2LfglU-fsxHojq%>8XWFz4qH(C#R^dsiyA(xl7L=@Np$ zJJ@yHQtb6%Qv-Yb#j=4~uC%aF@MsSogee#rY6Ms4vI` z|90fV>TNi~rhk~VyLJo3>75}DLU`c0TNZw9Yr|h_Da9K({iTZ+@g2P1nA};Zr?8-_ z#;51f?f3aC$n)o@Z^T0$+pqfi33MNfr~Kqh2sgp4QYv?z$rl_b$=ba&e~xLddsln!3bv0t~Ax!DohNmbaavzA5e83?1Hh@gAs-}~V)ih=VsG5>lRP}ig!i|FR zpD73>-8G+f7DLLpIlZyoa?ZZ`^$zZu<>l!Gp)dgGgN}C>5l#@S0?HBcP;u%0dWn2hO~OIp-sSTzZxj2q;36!W=Q*XT-07Y zrm}6pK2=lA=;!u-Ce$sv-zE;7{{CJ6kE0ZVzh*m_{j-UEIXAsuC2vnV`OcJP={V8d z!}I?hvrNV<<~4Lf>0F)S*%R!%QO5zsEPhc0cRog2+vt^_m8Ip$%6E2$6nhx7nM5#q zpwY_UXfatx*^V9B5W>&{9$ztZ{iul(AKkiDzp!uC(7;DqWY(--|F>jV+@~!$nJ>l{ z;FAo;a%`1{!7~}9A~xin!m*nS!mh-_)2}d?FM+Yw{#%Ar`H^9&Xr7)tFP#2ZF3QZa zwXTHSc*5>j+LGFDxF}{}7;0i_8eap~0^`5FfA5JN^dT#S_bj!2Ti8SX) zcPz*L*4{4oW5!RGQOs5`S^Cd+!(vQd_Z2IK?7!u+?!EGZXWD5~s5Q_sjZighdP%UeOqN=_r>yR@*Fz z2U>cA&dg1H*(WwG?(fAJY9`G)Bl1<&9!I3km+&tK=!SgxvCSyDvr7x=IVcKd;ep!S z_jEaaykj$HA=;ziJ?1v=@tHDMqIF$zN{V5)^n)bA*_mN+#xFGv-SFs_b(0^5rTDFG_Goyb6z5F)LS&2Sx!} zUEZf}-!CsFCj}@oM*z(6Gfs?~Mubh&KWz!sM=@L0M6n`k{P*t`V_-kd_kVolq^ZFt zaB=BnxTc1!vaq(^YiXo1mnzBP&CFBDHCyNXrIWQ1gd$K==7FlSy^_DmM;YKbk^$j@ z2;#;DTCF(NL1RCZnVHE@5^dAS4zWnp5p>sUa@`2`I74B${99~;;BQWOy`w{HsKkQ~ z2BYlMw--JIYhL~8$>HjzraPM-JN(%uGBGi6M_vgow&_Tix{KxP^Cqe`M7RjG*{xr` zcqonemtKPuOYT02-4~bVoHT(uHbjg<0VON~{5`UqR~ei##h@r5-VW#VSutXz3+K-lp7YP14^#^ZV1Wzq8b95GV^XDErb?VkO1kbb75Z7W z62Kj_)Zf1^keLJr&)v2yCw?>H{!_hmLuz$v;*Gi)^%rw#q4Y$XF5G%=b90nKB4CE8 z1ADcfbTLx+*wY|FRn)kCFNmBFLVy2q|5vF#EFW@!o16^MlT@$)^_n$-Gf^zYxF+?MCZTDS1fqO4^m z2}IKf)d79_tY}nb8ji~@D~*|IsLh48lJ#F z=2+dGT3N0+$c^7qcb(BZfu3pXg4#znrJp461*Fk}XGBY4ru*e8u 
zSRwm7+0}?|86$9-ejZ&Z&s~BR;cM!+etqciw_{$L^uo!4>B2bjuKf&d}Da;pf1K#^OT$Q{c3&afHYzqT-Mw1Y$DSVI2Iis&) zecpa&e*WK)R~Qmu@ulV(EmEp%a%;vC7z`3)M|FUmfVV^D?{|M>>n?m@tI9qu)0eqP zIl}_t0%nh%uMtIQY_El>F}YYbV8N8R@`{DAldjhyr`!DgNpDYybxVQp!R#^c<@OOt zCnhEm331z8y!bJHDL$-90Jgx~2KoIeSaG3k+Hk0VUW|Of$xwigsl|OTg@DR#)AidG-s5mOyjq(L*4m0HOnSRaDLWc$IetL4?dJH7RLtcDTC4 z8!i>d_C?7ju!n*NjDLHnTrXy_rz>P?J1-ls8yFN9@PY2ZbO3t?2Ur6# zWHkV#JS;qo2Kao7F#tWopmu75&-YX?${-eH0=e(*GT9i;MUuAJ$U%A^V+o->f5Ju zrX5)cZGQQM-jv4p$s2u?d~gxaDtUl=^Oysi=m4ADyT9I$MZyZ$3>26C?9NNS!rx@H zMDUR+D%bD-+?8`&Za|^Dvhr^LSb+fZ=LP|qm5hsx$o_)0o}M13SYXZXRVS=8Xtp`5 zS)&9Cc));n&>@IJnl_h{(3&M%39PWISHEYa)3ArVKfe^;R9Y6ueThAGrKX0wim<><``X2+Im(b$tU}FPFpxAzj2>a~Oqp#IjBlYcW!*GGgPEGN!n~s

O!d1s1L40mnwg6iJB#qnhfUce z>5TJ*U}ag;@01fB)GpL%thV<4XOL4|yS5 zGw|zD_DEx%B85Zygy_7aGiOi*jA!M#jvV01?xa-3q`8YoH>6wFuG;}d0fz8F3HHUw zvWc{T;Jy5^bMJrx6;nX3R?rcI5gqVhog1fh5rk$SvC!ju&sn-mf|F9VYJs%xKc4BM zsw(ZPs~j9&1Mt_RFQ2~Yor|=rO!^Mlj(hcZiilzu>>WUw(54aI#smi7Xa z1os`LrZym3rJUJi#4CtQT3aXTPClfjdidzklOvu=7c*lC;DWP&Ms7tKOfVQ0K>E$(XQ+(K?y`}Zo zXEec?gy;mqnr=O){mItVjCN2eYVQ{Yg?U;(nFlFV={ra_C>I(&tg&|264kwZ+wuK- zPHDf*itS(h?P&;bNH;aPBJtpaCP)Z5&Bs6EaiA})&zoG>xUos6%{Aj0wOous&S|#D z?Ae}ZtrOkucwy^5YtZ((H*YpDOao!HDdcZPMk(ojN=?u6Sv}}1+_Op3r!P;aprE2{ z41CCaD`s1mq%v%QE(N3Kz+Zcbn7f2&XH`-_w0GJ|-3P~UQpD??h5F>#Gs_XXmd@+C z>#_#V@E)o=*9^>DL3!8(S)I(KHAaovR^9EhiQ6ai0W2F3sRqh?&`ooMuBN6>FRa`X zzxnR~z+C*jb3Q*N4HHy;?cK$9aOVaA;;bdui#ng11q^KB8hH`wkw1V2kV{r-$mSd=x8D%A{gRHzjk=IiQviAXgD~nXFx$Cf_V}hdNiQuG)0B;GrP}2mvpzY4 zhlCH;)HK(O&hL1-?iy_*0#bW!GRo7bjJ5&BA>@)(QLW}&3Vu_}HV=X57^On_!n43d z*$rW&m02r2u>?JDiJ2-g)?7~CCK3#oXWO9m?lv6j@<0`do>6pfmwqVVn#;TQmzURF z^W;4GeVyFS5oV`pXU#qQLN3EzX6mZD^4n{NJFzk0=8~>@!fWyLoY0n+c@U8uC{yfgj^N&hwjMl_m|4)-3gqZhK4<7 zO7z>3Y@c;x!kvJt+QP}sj>*rTp1s7d4U+(#+|`cR#=#0Wzo0v9c>n&u+bc8M7sPMS}uC0m_RH**E8;gWTdK@tEFWpr9And zs%eHzE({)YM;30-&v=+s>vs|nkCBm)?(muw2Mh?R{mvC|x47)Khx|R6lbbt#f%F6X zhz2lOM@ZFS$_iiq3MkIUx$_dOaR3Q<-u^6e^Mz8EFEBX&Hl?PYvP||Zp$pvc?c;hk zHyorg5w>-7pF%q+D{E3{5onZyI~yJATdst&@-^E*(C>s6^^lT^9lyHgoeDG+{wit9 zndIZM;;C`G%J+OA=_5ehrFsx7R|O238>hGIFX9+sL(pDgG!FIbmYmHSr&5G6J#sV+ zC0SXENv>fNyLDZf))_x!cioTX!|bVz#9X=oGc!7pBbbsjr@O3}PA{b*9P7#Jh((By zNs!TcHTBZ{9SR1Fv9qrkuKMr|jZeR278i>W28^`V6?M~#ya+n)B+d{6I_gnO*4LA# zHurc>ebAV&SGbY2LFS1GDf6ksy!$fMQkw?X5@=R0XMA{IGEhJE>PX-xb0UkSJvo6$BK~flP0W| z6rkF<;fvHESb2(IXo3j_gPrg+OSYQ$edYyg!3zsG zm{o=-!?FixJEhkbnm|O#nyih(7=OXc7m5FK}-W@fhWQN$^#o*2i z6kC4Lu>`GRvezOYTOd2yCnPQ->0Jt@A|uT2k2 zA3c&2*p{+{)Y#ZqZl?S08{PY==NzuwQ{I4}xCsol{gS1&(5_-D!kQ33SK!ek&Av3u zKtnsSro!eC2+Sxg9pqOv76$!$k+(@|n3q!#cKp8!Cb?uV7$9~>)@PCaa9Lqc8Wj;) z02G2>jR~Q`#IgDF!y_UD`)FzcLCRCG^<(blH#fVB2V5Y)F{ms3%o*d;mZNR2eiEW1 zp~F^lIvsM{GB}ft|C~kdimgemw)|M@vUm%GSSf(!yfoplM3u;kXi1^P1aW8bBt1mY 
z_l^$!;}J>a{XMX<#SESuKuQ5o3f#-vC`s-6^a|Gu`ZbvMb5i+yNPuCph(detswGRF z(vTtqF$(t3s3jM{(y9G^@pv%i;TLZ7^5WR~&7JXVLIIV#?6B1oN=h&;eETqJLC=ly zUrP$E)pOQxCs@QZ64R_N+_-gX<|w)Rvmn$<$iI${T^Y#88FE4hJ(dJ*nECJCWw9F3 z`VUi8Oci}df?7y05q zKnbJ2V|i0Dr`BTV<-DQtN{_Aue17fDOXHn0RAlv*u5juq)(PJz?|Igv+&6R%7C#Ui zZ2kyjcl2D&mLL5wq>ntDOl6L*vT3i3Wxy_R^%=K>ZIA16svzrpnB)O{fs- z+V!<=Pf_{WePpdDvgr)y=K>PE=LGMiJoz%(QSon(mT8$_yT`P!E(8!n#Mmeh>qcv8 zK6GV@QMP52uB@(Ht%MAFDuIB6e zs^MFH=G1{RX5@UIrV#vyN}Y<1#?VVBCzZp*50yC5n5FJc$TlVTko|#A3Pe@tz*q$V zwj>w_chp7BOG=3lgl@>C;mUk5wH>aJ8+<>vdLx%X9 zUAqSK0lG>j)S~-k`pv$)Q6dWj0Ny4e&4lc7ke(oEp{?h!aU-K*Y?)k0#kYKpSp2LA zN6N2l+op{DKp{hBq1OG)GAFIi&Ro}n)+8!?w}!X@Ld2Qf~p@I2F6NX-5j5>i1jW$(=S^IERT zJM{&=0nw6qkvZGoMZ=OZeReD(O6c&ShW-O{h*rHj z6%5^VImz8z*pP6G!uL|!b*UVx({| z2Ar{$>D(Ig&@NNQ&^$)$2JdiS=<3EU|029Z>!A}WSD#9V`BSUrzFz>thxnIsbt<}| zti3pZB)!plz||L@HhNJ(}r z7=7>NO|dflf4f5BLCT2_GXMSi;wcSjSHx;{j|2QgMb*E0CG&4komB2mlacuQGxnE! zkHWV0_3MHpo!Nu@s}e1S!)a$y0}%&p`~vl(1~qdr03bVAy&Aa=qZv<1N_Y=IV9fv< zo+!z?z`c=7Qr?}#z>Ng;@rGd?d^o9P^s!^m zUNGG`44fX{WB28QnU9qr$b4k_0Fi}i;u}peluxJ1ckEYbuNEM?>T7UPUh*d}1ej_z z^XLDoMgVqcM=`pF4&7h+x%ud|1~LM6NVt5qt%=*g=;+M|odNgU`3UG;Out(pOef-( z8bbhrzd+;u1s@B@WYM)MFE>}^JE0phZq`22K-hJRLgt%+@tX&^1oMdd_U`=}+R=&& zKHomHyn5x1IK!yFq>HD7|K9!k6(OhL7ND;pk68nSoR`;?_)>lq83Bk3X&Ev{Haqy@ zEUMfSOjjgPWcyyUK8q-UlbL&eEIisf&zpNf(=2WHLZBAm^aC=3MQ-!0hb9rK5{77) zHf1lW$sIFc!b66y)zsA7@9MH^#t>7Otsx=4m(n9u8AW8CxYeyIMjj!1_nz>1&#?8S z*48(q4iOv%aU3>FTN z4v4R_D5lJ7TAMiah*T!dW+E+vCI7Gz12Xg z-&qUgyVx*?7NsJ0cP}G>CoLj(3Yy$wX&MR-AVL^ZM*UBL1j4ZVWfbMQ&UGy= z>ZU_~5ukz*LnzI=2Namho99U4_rww3@|6(Y z&Xw8K$!)iIk1?{6oa_u2V#wCW=?jH1Eb6!W|1DauTeXA|6NC^{HQ->Jal!=aEuJ;vAiG_`XLw$8Q^xdj6SH91~6&=#UNoFvtAcp zD`C(FV+PaVVm?_ju|4PKB;1k;3k&b>o+)MekG%D60RZ_BDwrs9vJ1#ni2a-A)DhNM*{J44bGRWODpW=l` zzSY2;qg6|43ZpFXCnfu0a{NEUr+Su60=|_0RX#MT;wF1C_MJxNzu?8t9}Z0-!Lf)4 z$TFWOQ52HM30f5d-0A)PX zx+uV2B(wh(Hp;5p4Q{JrYR`B0}EV06Ut%{rYzr&nzV~i~Q=zst5QR#pE@wU#(Z7UncUABbM z2ydmm+b0M4Lp<`Rs9W}0Vx4#Ty7t94k%kQv;(?(>OjA$xIA~T+X@l621r}GnJ}9VO 
zOA0fFYkq%ynJP@X*tf}Est^c{E+Kc!)6R>3$(D(E9V3#VXqJ{HoLPiWW_?idc2*`QFMu4R zE7q6qzM&+{VEHI8I#!QZcn>gxd>ayRM?Eu3@WEV{wc*NE{w08Gc|&tF)wnR&7?Te6 z&ELN25+`x`f(1>Rhj{CkwES!3I^%Ii-t$*`ta_ZFLPWz|NQ3!1UQ#bwD~3pR)<@_tp|+ob7L(tj?; zET&mS&IzaE>6$A%HrlEC_k(@fExOlmFE!I*{+0EBET}yYopc!OtKEW+A9oEd!ZA7q zCu`=#!4p7*jjOp|lHxBHouNuaKo&`tD(GELUpp+MmTRL-rOatBrSZ~Mf7Z{;YVl%F zci0S)V*aa?x>Tnm{v5C3uVT&^Mo@AAj0l>e{+t|x|81Fi+&qz4!mZ2;7M+0HPnyAE z`RAH|qWTWp%bc8TPK_gJ_*IMDy=*3by z`*viLN?`tkz@LMB^07RIrD}}!gFg#c*$K7}0Wv+WW1qswKlmgFO|Oq|TKJrLpkkDvL}xlDoJGZijqr-chqc3bss$#V^BR8#!pnJwfH~^Kbh?x@um6*0^9s2x>5y6xwm_G-$*o(rigZa2 zNQs}U`A?sJ6dXcdC9JewP$N*AExacnZv&?Ro#c-mY&s4dgwS`cQ?D05S2GE-?SbVoLcKO{N* z^vIGN_t!UecIlS1A#nF@p=wfBS4WpXZ6a7k+?nKBmBe*GE&BkXk@tH(`2l4LYTKGz*-3vpjfMz)Q$lt zt3Pi0*QX!?7>bh$2lg6tJT7ke(xum&D^4C6znb>K+tGE9{$Xu?hW+H6@eC@dP_Fqja`F+j9^)@*_ED!p zHer18j@qm7e><-YuuKMC9kQ@M7-n8&c+5PLn<5wi81J2Na!mfO4V1{^j?6(B3b>EU zdxnWBJVU{gh4KP&gCxMj$&`1Q6t6YwM&%se(X&k{sSDj~_h#Zrt>Iei32i*Z0r-?IUMx zU~+~xF;sdGA4b#_KFl@?ze6fQI7!r|0F=BI28}ITwrp)$*!-mb8G2<68l@-N1;bV%+%G69ZunVc+A&GH_P>c$`X=I0X zoNTuZS2;RL(LkhWM$91Y^2q4ujxvqlh{VzZbYWrYLxS4Ay?eKGw2ki_ zu?&2Z?hGnAD-B^b-P?P|NQRf#+D>)2v5SOg;^Ija3sS^MV1Sf5=4EzL0Rcf9#)bxG zW(s}hov-LnRxS9*dL?5-ib`L91arnXgaMwxxVU8UH)Wl5wj96o6~4%|f}zr{;h3Pz z#e5gc&ga+nu?z3<4zP7}*NvqP?y>s@2Vay~)ZxRrc-jpcHjFMe`Yvu4NC^EbnPjM< z&b_Lt%ls0a4m^-k>#l?uJIF7xun>vb6=%?ZC(Zk-;tL=%cHQVSTAd>CX0 zNN+s{j8mOBk*N|duWiPJEZ*<$ABF@2v#O}P0ue1+YTh)i>h$dX;sL#BYs<d8^_#ZQIMf_I{C@OCuv_}-|HiUbU_Pg=&R|7h>N*K57)0Qo9+wAD z{o$}s-thS2I=T~w!o$a+9zxkVY*-?r4OqMk1!RBU2MOjW+iL2lsGIuK79~W0zBxeoVfE8Obt5! 
zyHZ3SnI}##o01}!^PH1ony{s%E?C=QjBDExdGzR=6&H*T$U8VX3WpQ!$gpl3viE~#oD9|LD26h_WwTvrJ@NM0(G>G`0ro8a6H#lGG`B9>xRvG z7ZJLC1vom_u3fVju7nXO;qdO=ZBOT28_gs8lnmtA9UgjzH?OO|5bEY89@VKof!(YkUThSDNv>Ss<|H{tQ}29tz6w zcC@3jb0Z9jbGeE;STJ<+d{f%<)J)XbhZPWjdKw0`bO-7*$6H(flpXzg0%>V?g&${?j5 zvoqFttQYI*bVpsxq_`&Q+RH(!ztpAnXav;sQwg zH-P3d!5dkz;Ne5&b~6M z>2QR3yDOYAOvc-fVqn8G)mok=i4*hU9nE?L8^goOxCvd#Nd3}enu?PrPxOvWzd*PL zal}pW)Q3ClWr!MQ?uhhpi>kLk(-8i2fnogphY#DS9?CmhbR5EnU|5XqcJeY#wz^{v z)?f^Ff}VD2;yFcFf5Vn(o%_(8z3v-@*4&a1EZ00jlXAi8w!GJ>YJ^xb>+0I!CQ9L>Px=g+Uz=$2S* zIWk9GZh!M_LCU=-HAdC&VBZ&Xf&=d^?JSxYQ)aJ4eu2|obNeCN4dYB_Csa6R*p!;V zRlywEGWqDlfwMhRa12DiBJc^xH%5CzOnOVcU*h;kG0`slv1^8SK*5@cOCNpt@e7)5 z=d!rgK{5iu)#Uq=Nn-!NsmKzmZ-S+tRN-Zt(XczWri3eN;!9W=IOEJyswF}Q`4H*L zU4ZjB$%(x{!TmE%N945wiPF6-NF!GvXH(iE$lz(^8 zS!)7tDcVuH#mnm}Sre;*(>&_fv9;O?&9^~J=Bh{1!?qai80*;Ws!uHfs5+mz)9>1S zMCYd(<4!hjkSnUJ*Y3qqM|}LA&6V#JS~PTCKdC~Y2w>lk2r%Y3q9w09luL>5=&V0p z_TXABlpe{@F98Fj#G^|QSML5ht;*5o9FhFUoOa4Qt$~DBzC?^vA*FpzDMbxZqC0w- zONs8xA-%I|`Xu6u7*fmR)HM_HvvudIU41vLIzP%VK4+hKx^4*4UO%S)vadO&2dA5v z%>HunrEg@MhL{1aX_PkL*Za*A;mgpu`T6eo_SB*vYskfpkKS@zk$vKEU~uL4(6nc% z!*@vdgELT|OveCCusQ+X_**mgE6Z+e?89CoLTU*mLVZ9MGi>hBwzD5{9_{P9&|k5X z;Nh&z#3rx62}kB|XfdAW{iu2C5^#fr0ssE}Roin;r_D~|H$G4G!>`S&F0n%dQk&o> zPmwn&RIymsLn+ZR*(bNgS(^x^r!3sIcLQU_rNz5=f23JkeC*=af_1lO>+C5^{@Fs2 zBw&!tbx+!w1WpY`8eOXP{4bQ-KnH4+*87)|iIFs0fDH57A1!t4*7je9@}X3BeZz@uBlZbIF)gFRV?qsN}k!Y z=EUkqGY#8jJsvm=5Al!u}!ZrU_?oYTnxl6S7~Ady8o$WvBOM=wfM z%lFUT1LthKyc8tq)xU)m^RL>;qkF?KcP~b#eB9gjm)=|kOu0!F4@nc_tjc#R<2AIP zhRaTT=)5i|yV9D3hAUx^FXJP_tu&-1Bp|J!y(2lRn+V=@iTU0eHn`%-?BoQIH?m|+ z>l=;Hqg$v2p#;QfIT%*k$?_o-^L+Nc)%yA%;XR3fy95(yJWjoeE?l})T3$ZvF;R=G z%i7p$hF#}?0&>ptN=*a@cU>i31*G?E(^!BDN8i=>y3$x6GKm`7J8(*!ZEUq*67KML z$)YqP8CvCSX_{EseEc*$M+8LR|73RrOrW|$y0ETb9)hNoC@m6LocxT#D6h9@G?>p(RYX>qmC4aUXhHvd&i3$mp@)x+bW?V ztk>AGRbQwoN^}zv63}E9M^G#6zqMlazDcbc=oHOSUw{fNOCSYLssdG}!yv+~0Mrem zhvV&)>uzizo2T(ZI-a-X^PB8p4)*pcM{fSu6iX4~XcjnQ$OzQUl9HgfWfm4!1kjfv 
z#Ia%GkvSOiuF3MT?&<04tGC!?7L5@tD5i7c}N9IsEp-{sho;vOcH5iZ*-jEM5 znG+~c==(?{SR~~9)L*O+Y^z*uppU~=&cDJL>r>pMEB}YaMJ+e(9$Of{-$dd+5J;VF z0%_&gv7ZHDD9)p%rh>sFE`}t?XqNGT66TD^?T3bPFK&9=!Gl$ob^O;#N0z*Qm6%-U?W$T)o6_u27C^tCdSw6Ts2woxliot^!P9Hnf+fp+cXMzdChBXB1 z)HvVRa==&^{Y~Rk=*MCi8=iQFlXAd6JXpwXqoZ-t7d#nB6kolvr^i4~hJ+sJIXS)1 zO(#(YatEVD{7iEW%@sLF-QK>B^q!=}T)VavgIzLAUNaRp_5CA^8TH~`V(1iEvWYuB zGRF%fg=Ys*xL~|5r$nfEBKY;?W5=pF@BjY#GOyLDCh@lFk zV_U(~xB_ggkIZ?*cmG9;3>@3P#G2B4y2o4ZqtJ-bFRLGS91xQUk=63Bu zC_x{>GnbXzT~v!tPBFw?&1h`ro2dKh-dk|kxIJ?VV^%kO8%~7gb0Bp;!06h1ep5e{ z$4-I_>!SUPBnATMtlL@CRzSTMQ_Hy6+}zwwDy%Wwv&P4i$p_xLd2<_RR^!g99?9p< z2_69Kf_ZYYMK~N_Xlh{I?1V?F-=A$4FPs7ttvETeZu|*A4(9rWMaKYOP}Gwb94UKC z5u91$liEG8g@yqGOQKG%4MYc^%WRD-+dOzetjY)vfq~)p?K$-&7CTTWqzVmw*56XPf(vF35#*jNy%h#9_#X-cFqF zFVKzPMmATyQgMCGSU90Q_k1B^T!2Da=xW4SPXa`Q7(m4#I1n=Kt;ho%I@7kz8{(k8Z=5f%f+>Q|qMQ+_u-srg{m&8TH zPTpG0_HWC^71<+*fr*Ir)=HxtodTi#jKy#vFOH%Wdd@bhH7E*r0#D;C&O3Zl%-)vZ zr0Yh_R2T4(4vF$KpM&O!3x*$L8}(Aq-h1Fnl-$9!1#mbP&7Xh4zM%8PcJd}{H$Tz< zsOwuzQ)45Xab`CqTX8jS2~eQ^g|AUB#KcOxUoyQ#o$QgD5R40kJpxr!Dm>3!;Cr*< zz(#0r>^-$wAvvOOCDn*2J3he;I7e2*>;Qrd%~7Qy2*C|yF^{b-`NW%j+Aj=#qYu-g zNA2e=hoZy-2J7fV3j+)7XLmEc^fWdy5=fU4IL2>98hYVmD}tX*rm}U%EZzGL8qvbf@VMJ~9JUpYciDljZ8EPLey zJ(43S@~fFc!t?X}sy8Wb*rbqaw|sqUo&L$W-*Xfe`v1qDf#+KP|2tJ{d#3y!{;_x@ zTns?E^)1809yJiyIn&4d_M#nDdt2J=BOEJ}{ec<=H`6HURsXM~>>%OQ&*rP72@ZnJ+K)myyKJv=&42;% zE;DA1I?BS9WAJWVDjFa4JmHBv((E*ITLy+M-ki%;NVfXt`G0TCM4J96++}uQc&QO3 zQjw8BOCqwDUo|l%*(zm@IG$ zR?Pq@Au+-L4U`(N;)gZ9GOn%!fyV&-EOxh#Y7eyOACf#wb9*$3Oft%hn$LRFp^N}S zo6CX|$(}vm-an086D`P5y=IK=oFb+(haSMv%n$GiVNk=>woKO6(vlGp!b6(NIGdMX z|AE~)^kVa)+9k9cRr_ow#y(D8PQ?T8EKrAV^Pz)B6!V^1LuN5t_6|fH&@h1YMbP1? 
z7lca2$?4dnjm*{M7!ED4pL1BNlL(}3bd;HpXZp0eUK|&E*&Rqvi8i&nyF0lt0xg(- zg6M`g3Zd=-4-aezo|vYF#!>9&VD1f&x=rU=bb{j@aO(M`)uAPhM|C|`@Cjt-Oampk zgFuNkkgQmcH%*MeB#3yg0LMg8@h9H~VifPz4Xl}#207SzGEMv%9_fh}m+De6L52d+ zVol&TZ)0$x7l_iX5NIn=24`m$2Bn&$$aU`J-57=2?PpcmI zijCWI&ro!{3c;v1fTb;l`{WK39WY*rK~;1q0+%7nYjr;`$kE2mW(sG^vSk=!qNB_r zz%of`ezFxB<`oyv7RQvq%&nVPHvqr^B~Cy<7lsM2t1n@2jgujv>CGG3p=|9opIl6Q z-0sI7&?u1RlE{^T&s#|$VK(TifB~wDNVEvG{QrBHwa|6JCZV*x(Om5NyY^Z)SA3?Gy3oP` zCj`N54{LT5%}l}cWKksQtEfz00PYc|_xsn6FFd$wrzK|Cq_gk^7Z{2Hj)Y+!&(&}E zLFK^EcxDMTzj`Hj91@p7U2U^AsErx3o~;R`c>~8shFi2v8VpKjYwITn%aHqWK;*Bu zB?$n-SxozpHXR*6#*q7cD(FU~j5b5($Q+bihjDQ^7NXHTEsB;Fj)at_cUOI(jqEsX zfC$ZLEuCsvNhicAc=UZBkO;JE+FQmmXKv?W0Zjf75AdE*QdUOctxZcBa%Swbt>`vr zT%ZhsR1H=G&ze3MXtUt&p7z_kyXOHnef*(p(n5lR)zZ7dJ8;yz$+4*=HMFu7JS#nY zOhAEX$Abr!0R@7!A}0#!byJ3@MRV-PO9|&Z{(7V1<~-WM=hrK3*tAKt2d#UR=>uGT zU?~nkK zL96`GL{;{!^9pSRx~lLN%0LRTE?#`_?AqR6!yL1f{FF++rf-1t>@-8XKG zLFY{pAbLe_RA6CnfRo2BShg&e_yronb2VA3qh_@?OX&GAsrF(w?@lu%Tn%pw=JM1bN6PU@ho zKt^{BUwtS}9-f0s#SWvJDj3~l?@JoTzR}eC0S%f*{=y5onLnSx;}82+n!Ks z95P1kzXb%K-rs550B<>z8;q7gXeO9)0dMXhI8!=`ROqdPLX*0tKCf;GO5LNc@|^-3 z)#TvlQnIV{Ez4&7iKHt{>xJPzgGY^812Dtm>?~3UX7U_$QCxY$hOD|n#4aAI+LB=q z;@MS%IQ&ryJcL!(vquj%l0B+FhMQ(>N4J3#o}LnahfNd-1633jqQxsN{6C()JD%$P zfB)E9l6h<*8IkPlB)5bjNs^V!j3le9D9K7G$x4!t%p$6jgrq1VDkG#w_Db=+-hFa)1Z;Tib>rx{2W08pqRN=oE2+Q4je(P1UA zl9j(lZ_r_6XYZa^K#zi_g_{CSo!Pleqs2TARQLN0(%#_{Q zUBZiM?)kvLz}vUKu-05oOrj&3{iY6OgearGawuy9&^v@W|U*+1tNG*tJ$xemSgaoKKus0y{B< zDaSYi*Dbe*2qvJvaUh|g^y)1i#n6@npJ~XFeG!sTZx4y*JD}3>{}~ui$2y{LCUg`P zI;kf4kToEbD;$kF)=NExq52%%{rg8G@y=v7Z+2@o5DlE11}mhyzmi4MzTK!z zdWc6XFUf^L$sRqlmiKnwJqSiF8gSeM2TU;b;nu7X5cX^NozX&k9=>X;| z`_h$0v4C95Yz`C#fEnWQq!h!`fB@xKM}XiXTR;g04|nT1oKEt*WX!Of%Ch<+!ju6T zqJH!=f9Fww@)|qkE(8`(&O-g4*MSNO6GzNL`a>p4aKEepYsAnS75Ofs1SL{U5bHNPGZTzMiGWf8cNq*}okEF+XhmxJ=nmXF@ISoT=87pGcqumo ze<9ThiHEB$)zYagM0gUElBq4GW@b{SwCSiR7^<1~r&RR*ga{kQ)7!oaRPJ2JDZ_~h zLE|1UzqMO8~1R?Ia|k5^)sVD=5V0c*ort5=lq5e`mHRAr!- 
zrz9uq<#oLO@B!F*)_NMwDCjziZL4HZ_+qo8KY(M<1?;_&df8GQ?DoqILt(QmTZrj0 zBlLSQUfrl72@N_-Yf#d_SrAVebx?CiNk+yef^OyE(L7_=oK^?g=ZzbDvotGVYxs1G zo__PMfhXz46@th+!h!*FD?Pdf2$3omx9i!HE;PUqdUca5`nyfhpP|zt?hT{|0XZve zIwd)xi>We5mFGgvqLCTzr|;?PL=ek1ZSAuJ!SNS!^MpJIbgHU>;4cGoOSF-b`ea9n zFe~_%`(|fpxl5f`yRXMxSHJfgQVrZ+suOMOa4qq5N2lP2;to}gfgfz$0aCn6S zJ^(J|Q$uvn;8Yd_RwM2)EQQ`)4J0{I95`qp*~bFM6~h#T%NmVh$=T)IuZzCR zz^x5eY%;i~jyAaU@v(WJ$0Z?g8|5pm$)%Z@|4dAvt@o}!uB*?1v@zoKO!bfY*Wg^9 z0J-nx=jGkNfnj9^Kn1+G^D!|9Ao$eN!`n220tAI3P(={`fQ(=97B1z7V&QEdUbuzG zRr((bzyB9}emK_x9y;?=0|Wj2{a&hLV`G@ALLWz{$50E)|Dgv6h|bQ}*NtW8D(C-P z^HCc434)s%ie1=D4T%x4*3d2r{>@S0(Zj$o{{U<(TEqBB=vr~k=!xp}K36c=%n8-s zR&8zYq@T^a5`fn~a&?;gQj7xtjp6{nAPwrZ%6{r*FI9~5nPbGcnB`CGjCBN9Vk%W} zQo)<$pM!>mU}Anl%S6eKBT)+>=td?8%z}ppMv$l>Q66II1~?wzJ5Dw}Zf?Ye1wzr) zT2-hbzAA{2#7FP+^T4=aj;?Rg=q({QQ{uO7zn!;i}*xE?}*^0QZ=6V z-B}vl>~|#JPb}b)g?xskSK%Bm0=`(le<+U$GyxMokb-~?LrZjkIN%yCRq+6R>h4a1 znQOgUvh047u6UGuMDMF{?l^P}xNFSb(pXqp&S6J_FbRz%&S^qUh4Z1CeNi%H*UQey zz21jgTq`EzLr@xOT(&KGvODZQLrWrW3x_afTi)OkND(?X6szzy!8XG~C34?b8(up3 zVR-{4`ZBUEJrVwZtYI^mE0@E;{-3GhC= zn7EiR{j{L!eAVJoPK?1HDuTv=97byD&74dmAP)#izOq7te+oZgoEVP+A2t`gpWk2s zHZOV!vqU3klCY1t1qI&>`GS;-*=Dm`C}@LRTmfiO3I@zL455{zFMQ7leomLiR<27xB|qpCLqm_MeCJBKRO(i!3vkS!x6{=fP%^T?=%K@nE6&jT z*%ucj1o1)GcJ| z_CF@+7aH_;q(0(MPed{oI-ila8lq^&%4t#N#OM%v-=DSs#Tm+iS&w1BOy9r5<#%=d zQz%C^8T3q>NT@Y+=8 zDpOqKaDVJhyzR&tcOJc?lwv3o7!`KgmQc|7^gWlOOEwf=L5++Fni3NCP`4WK=>0DN z2rkIMKaT0oDQ2{R5}98*-WDs$m||=gxPG?AynSK;H(5NOhZC1h8n~1yur{F3f@!5F zXWUa{nJT%uy(|P6Ujf}V%61-|2)L49VzBSrquvobe&fwVdHTcrUP%ccC=pe!8^@*p zz6z2&qljh;H~=nEc$Xb5bHXz-tk38=;wP4og4U>e_E`n~4ivg8mf=;P*8Btqc*JT} zVG_#l(%X5`|ENbW0)jjqX3;;uda6HqFzfVv*c6#V}O^? 
zsfkfN`q8R9w~sg(n;AkB{xSj2#Wix4y9{KmauN~;jzKhV{6PJd&d`H!)W9MW0A5j~ z;kD1GwCC89z>?7=zE_GrQ8?qC0x4VJuNY?3K$%L#wc-rR0TBS+264g}1>VTApLx-e z6>^0g=_gq`DQzD&pjQAI@#ID$qtj!<7aSN0I=GIY8VGB=653| z@XUa1f@%iJMFi=#jv}qPJ8@q32LXNi(9@G}qNZu|>%=-D$$*fx(1VF_>y{}!q)JFo z09@PR4S@GYK*)s9n~R;ZMJk)i`Jd9kJRcw&sE0qn7(ux$Z$6Ii#y7T5U-UZV(M1QM z**FG3c))yk@+gd2P5EGRNx;WI++f5F6;J-%BacW(A_rSeN$Gtb<*62~AAfQw&)Zk8 z?h-ZGF$>kRLffEU!LA_;e$C9PJ0qxaP)ULhhr67V(=&lG;mfQ~ZWiVxm6!IKR#XJNJ31A zTvve1wLa`r6!2H#2_n-Jk0--wc4bfrmz;rhMvhsypba?$uj)sFqemAMhteF*+UZ-gn*u^RsvPy`oG9guhjbsbC* z!F>EO)(NBt6!s;QK7_jo`1Gg&QExtyXI;CbwFeSIya-HIP%U^L)Jj0{X zT&QY@ke8C$;0-HN&D7uFFn0e4=`XZXD2hQ_h8_kvxSjZZ)NUV6Wk`5m=y0n;?xvsL z3|4_0w*t;D%z22Z12)o^$A0*rgz!s#AT3T<_~c|z%Ax!pvJ@aP=$gWuBzt=U(cYs- zfQLzaY3UTsf+Ob+=+2t#m%fSt;P}GtT3O7ca4!H_sRXS8UP&-zWE0V;J6^3J*7gd9 zgFlKcWn^R|B_(yZ!ARp-hZ|9XfW>ax*uX#1<&HuQm1C9-imStN-8ddvsGHxt^TCWH z&2M*dbs_F{^8jt}6H_?XAnh8$I1b{MPjG0nBGvcbTq{Z-=IZf9OXj#0q5Vm>FFm1@ zUNTxJGCMxK@&^K^o>e3!-9xVmO5u>NZDXF`$hFzU#dCoAC;nREUWHM9)Uf*X*4-)& zgvk=7H}{szQDk8{)G@Ju;KYL?oT8;5|_S*GyBGz=-PJbc+aals(BvcI<{#Wj{$^iB$D5Tl$Uw+`tQ(3J6cdFxBlmb;tC z&}j+nW`2{QWu*P~$s9V!wM+S}S1tdxG?0z+FXi<@%J0j4XVX{w!Q%`x0uml z2xLv~4(nj%`cTVHRe2;*ENbTGt~45g2T!Y59gKXW=IFWCLfS#m;!?brJlhRZKJ{d$ z6AVehfZ8G)k*s2G@1Uf=f1}kw{nIJ|lgtkuaQS*t)6bcxp zeefV+xVX=jIT)>Tw3hC=cI3q2%3DJt+B*_;pZZ_n{Vyuwwzx&1s3M2DV2%Z;X?SNJ$s^Hkp$+zZNheir_ zXBnG&?ON+9kJU1l^K5=(V|FL}o%W^U0<`WPrOuKzao_B2QtD+GQXnq~_Y@OcR z{Cm%au(SG7(eg(`)p+gf2It;Iwn zRn^}gI5Y$Y@oyOzHQlu-eRSHNS&dBd;Mj@2-bf1R!(Cz^U~=$Z7Pm!4+tmnCnGNf) z@V4QZ<*=tMZSwG?L?FQS9jOMnd@mZV+1taQ?A)AdTbVOKf_L6EJTvcKgR~ZyDy}a7 zo)wosxuVZ8&cZw5uTkW@$JX?+=9Z?#d3OZfFD;z@{pS?&OznrepwbjV z6d}U~Q8E3lr)M8n+4`+k&;4G=v9exX9bn!tg&5MHb)Bp%`+y1St|k%ikADC8^J`C{ zZ|~2r4N3K3Pum-hBg(1y^%boki@AY>C z+Sj^>@TY(O5vOqU{(hOb#mVE&uf8vYJPu&0_NJ+*(cHfOeNDxtn6LiB^p+*6`wO|s z9v>|*Q*CTqJo9UiKKf4WO=Hum)(0uEY&l&{0k!i(yFBzF9<_{IPyaFg-FR+BY{`mK z(5v!M!No>u!4pG|<#Y!gx+WCX+B$i%h=e^qb87CfU-YR(xyQ9HKKbbC7>2KHmEWd{ 
z?12;6{K0j7p-VRoAGQzB9OxIeJG#|%$D`X(mt-iVw*N6TyKrz)r@iGik;FmjOyjR{ z=WY~*WM;-8vO!ffIeSBa84lquPjGgSNTG|*zdXUW`_xhW`ek9m=-C~CogSxm_s3Kp zU18zi2(8akrm`|oP71Q8qwzgBNMN_$l+>^?My>{3F_v6QU z9x`5^=t+t$jI09mt~)+IWpmn2iYQf3Ot;EQdC+CMs$T6VwW+g{M7q&7_=Aay*?2R~ zbXux&=R~=p%ozCja}H3&P`mv0`j2VkTa-1++qdocEI6TmD@$`dchECoo?gNC?+k=r znc<#>kromO|9@IVeJ8R=2gexvHGcN9d8h;hJ#Wu_KKgSrJRvyr80(^yyk-K(8rfd3>pH#DV^DqDH4`Z8fj;k(+5}h5bW<(iBuwp>mn$J9 z#?T2tk|z0}aK;%TVKp3NG5^HV6}@sCj37*bes#^HtG5>-Jy;rn5ee=bN@l_-L&_57 zSsDVHk-{`LXIaz*JxyzCyg@FMIOPNLfO~PQ0H4Gp>_P*m180i=cByWSB9-yEDfzod5DLT5BFJeL>^)_xy0rvvR*H%^4yVKKw=g#Txy?!7w z^XTN{3LPCv{*M7au7Cl%Z{Vq!QC>#$uFm${2R_dd=7Nr%e?b)(=%xC}MJY~3X8H=} zO-c>}dcueM%%IFUUeVh?<)%!|ez%3efzWy(78pu*s850gz?i^FTPy z2wi%)F7e_$JEaj+*5v`4N{k)c6;Y^oGtuvYO{m;;&nnxK0hKTkO&kV+Y%^#6p@VTU zvitJ$eV4WgY9TscA*3F4o{ijH5giZ>K$p`K^_^ZiB+d1Qh5lrqcKHXUiVo3H>&JEQy(iP}KDqS`TygbE8Mg7N z3%0w7O3-+DQ@#iZ3Pl=3wq76V+?vyo6ME+RC*GG%Hf7Yp+SI(fHdUam%pU((PPdtp zPmR5~gI%mnK1@g+QGekz7b3PH0%s=5>J-CQ;CT=oB&;! zA}svSebe)2Nrt2xbCK`w=|~Z#&8~{VJy%x#G__cr{jHa+v;Hi@OSNwGdQ zo{!N#%>O2H@8jB=bktuf-bI*Q*tBIZJKW7w%KJ4}@h%2anV4IG6$fwqXlY5QOyV1*uW=u5hW*39I6-d0yb3;cRrh8JK!`=xggdW&qhrnCJN?s^V`_Z zc8Yusozf0UL(Lo2gE_C7+)$zmnU5IvSe{f%Zi#3)FA`2G!#(1E?|xJtO2YHc^%T?f za!Cvq*27=g9iWD({<2=tc#|c#RhWdINZ8;kZ<0UzvCLWic%MrZneppPW=sBFi^EKs zLYpOF&3NbTi|!IpE`x!mH&1@Kqr_iq=Pn~#RQc$FT0e>Rdf$A&hZG4T`dx2^_3Z@P ze_Sb+I}16F;@`i)BXfEr==u&Hc2_3TQBwsAym3*&fs0`RCIOEgm7*!?a6{Wjg645E zXIw%;0;tRw*(MI7{t;zlU|;~5AhqQllvFT!EVIEeIi5%b`|J4-?dz>wZpt`Z4RW#G z;F$tXZcQN)#D9njLsqSUSW*)Ws8#quFurJU0lJD&a%9!Bq;YpQIobqzyQ7o=k$YCg<_uNj`H=6OOv_;w(t|$_D`rr-dHREB zq`B0?Yy!TS!leW|!I+%|{F;RPXICRNyFh<-m{@J`yk?(oW^*=j}3VJR!f=sR+ z_>&J)S5Oz>zXog^CJTcoDq(wvsXsqIQaHbkkH0PrZ9jGfD1P?O-Am$7%aHWscpkku z98TVwPhG;}IdGR=qZ%JVHyb8UJNzhSlYNTX@}^}!1L`E4<9i^l1TGf!f!HHbuC$$X-Be2 z&q0d2D=s8oF~;!la0nRI&DdYK zC}s1BC7R@y+sMs!+|IuS^fcwzCNLnTLJ@1~QI!^pkQbv|3(TjT15>n44-M4(Y932o zUa$THSGFz>gzw19V+|6Dh7hdD>#Ze!CB@cTU8wUmqFp(lUlh}4E6V6l_Pm#$q_;Wh 
z{egQAVwZz(4d-)0D&m)*s=@!*?qe2Dv=~_S1dunOkL8qx1GqniyUs zh5zAmqfobsoYwq;u^!UD!OdL+w-Rx1$p_WS`4x8V1klsd*qDF1XDs=Z#Kor51I2P5 zD;T(#`IH%BH(W^OSoyPpNe~R1;xArYcA`XKx;iU_86Wrs(6NVsb%2TKWuvV|WNQnO zhl=4I-Cw&F2dO?k+aMCo8m6;(v)u6Cc z#zTPO4{r!ZStO^zshRof+qcEy3(oMq;^h?q<5f+K7Yvg`D;c=+!_ouv(57MND6JFG z(!hMnx=1CXO^}_T$xpBeiqdEb=AlSv2+C*^QX>!bYfDQ+W& zVZs64T*ais?44Qvpk*pSyX1QvzycRk=9f2glIdjDfF&*K%rc@;L zP@YTiI9{4vPSVpvMWHwl-q0*{ML!x$&)kzUzk7~}UXQl@`%~m0okd+Y3}y^=_K;;LDk=}p?zB2cmC1hW9Fv#oWip*tHC5AT|C$^#ffiud zV7c*v1_!{fqp4{uZaiL36!7Q6#kZnu#&WSopO6|Zn(I(*=lzI`%sCLLhr<~ob4(K9 zq_otqK^Jq$PVcWM@4#E>tqGhQkm{@Yup@o;wMmWD&8Z*hK1y*#_veqDgM}8B`FVrf($Z4MG*J=*NpJpk z#D9Dz^Xr;DYZ9z9ne1L(5eC6pzmo0Pw{7F#qV?DC`bgsD#(fOXDVl0@)OP{N6r^;W zCKrQjo#bQ^7qhws70GZnB}tc>%_IDwpah_wN0<2rUarou2c+R#RSK7vAG;epZ@ zg0}~MR{?Z_^gsT6?2x5p#q2~W_-)|n5xj><=@jc2(ZTnUHcxoO#dFU1pnnYoYm1RN zYRdzSb*b~Pz5$ea6gl7u3h$bl;3Q=&%8|UF3)gzwMxY)oFE4}5QS0qR5Jy}LalR%z zbCj@rBr79>0S*RyF$QIrLzb0c(vtRj+kfJ)7rx5Z{NNCY1peLgQGcokgIt)}7RR`OSjj}lT&;thmy5(cxY>|;k@eDd5S@$1)RR=K|)@PtX%|Mm`p(_%KFsuWzKRPmk z+6cNV@NPmwLdtu8e*1aF^S>`AefKKy)@f{*^USX$vkOLKsO9e202a^BNyiTHk`5bj z&QXvU*jR;_8!lie zmz61osWZ|hRk&Lo5Y^y!t87v^B(;frcGDW!ZRXnGYm<-FY~S9NJc0SSD}FvP_Q&** zFrE_D#slgEo2~kj5Cav7l_r56>{8ulO$`g5dcV}1_C$G&QIRv$0k~bVZr%i2s~v8A zov)*mFSXEy|Dihb>p8F)VLeUh?<-T%P%dL&gPREB0$4~QpTo43LlBTX4(slI$&oHu zS(*xdoj)9XkG3%0etc&H>lzbdh*Z!JfZdmL@ghj-yC+qQKUfwz%(~=!5Zi`~a5{RL z+j*2$mYb@|>)q3$zPul8Qga&v`y$~tKJV1Pr{$w&&*SwAiCj3-#b z^xLK#pZ;P^JOl3=dWsebTU2zkfZ!-eKSDI;KC{w}T+c&;{zH?GhW;+)<_dU4iGn6b z=q)ig!Ia}?s=1EX9_BrLw5-vcR_gw=S5h6HlP<FSf$}!hsxtK@xABO@O z0MJ!2RRqn4krq8lzry1R`KO$>GCY5Xp}hw|?aQ*)G5p%doc-~p1Oa4M+s(2_pq9)71_1l3o=rdR}9S_mzItG@qyCI z9~Dh$W3#gpKBJO%U?_ZU^$1He1q4!dSel{hgGUPd{-Efsw_h(dL7F1&0qh7IGN?7+ zkF1m~hw5lDE^IWma@k~hbt*cVhJ<5>po9ZF2gwJpPI;h>aVt>2>7e%fcnG8*M~9Oc zsDH7s61&MPO_8DzW(>I;-49|;sP$pFoq!z`JX;rlvN$M{!9K(6tR6Oposy5dJ3RDD z8A3z9cgEEnSITHRq;Rwksw?}_aG*EPEyXFFO_55z&cKHvJYx7^xNcA=@H9Qw$@QJP z3}hiqfm9w}rAMDgU2lcDf*8%WUd&gJ(bhgXQ_GgJ=%aZQm=tUjAXA`g(}|*guZe1U 
z>sFF47%^Z2K#QD1W_mBJ&R$dV(3#<~00V`Rbm8p1B-@hyXW{D@TY+@kUUWzSoZA-t z$9CIZPkEi{^Z>?zQ=w1Up$ys@XxBmC|BvG(8f63tx$xKV>)YG!rYp9g;$+$^sc*s2 z?l!$LCNCC$2rfoeGVp73Dg)dGQA~&1OTSy7q^S!6@ab@CE^DBA?xy^0Yz!PYd|_|~ zW+kKo78c2H_DD?~#*}qpVkbP9jvYIOA)@QkwCbNGzJaPFJvV6*L6^d|xJQ4v(>yog zmf#KD0WVcdJbf|Z#vdwoV>yMW)UE5#(rTS3f=?$gEF~d<$UVK4**#C0y!>O!pugj1 z^&ATTX@vS2*D|^%sH*c)P9(3Jh*MSYqbYnvPtSh-rQt)D&br^^hgLH3a&qtyM~^uI z%`X&b<7SjyW?y%4&}!s@wO#VyL3LS~*e1T_LRzLM(Sl)m5*|E+V==L@uU*P<8-ibh zs|(i{bi+rM0?UpcirOF)b%T$Q4MY)ep->c62I?8_Z5+iM>G|-%v}>@#!Xf9$aK2Lb z9G!?&okQbNFiLduoS8;fW-Xfoks~koVk004qNe zt-k$%CeVM!Iud;#fJpmNwTw0p=&%IwWqyK|O-K_K zbv#N&maPwHPn>AT@S%Q=Nh(g_)!UhC$*Simq8T&rgat*vBii`R{yPVJeS^-?D3S!B zq{HrmigXGH1xt;ug!SCCSi2jCAnV#1+!+uI2gi|hWDsTD_`$1hf5nQqx*AHj!5=F! zI>HSPF@DEK1XKkIW}WQ+DVo5x#Fw~)1pGrjfB7P?E`6sw{BfFx@<(okZVl~pn8!trJ{5YM zFx3N51s)RmanHPu?qicXLjU3JKgZWf=wm>x#;FqHnJ&+xZ*QPa_45mhpC3$s&tKRT zq^{hNPv4*zXGnPN+S!@DEw#6^dow(Ps{tEq9?V-*rT~8(6#Kk#d!x>LdPOf~N{Yan zeEeAbD%)kPR0;{<1tRBc-Z+3bilt(8Sf6ZhfO?{*;WGcbLMq^3NSi_41gH$JAg}D( zDA8@i;XBS16r^Womv20e;+GJt;8t{FB#-E}yp4@v$=!Rs$#Rq0?xEwV8E_gWY$MXr zq*9ELp6uY@01^fC{@5_~0UTvLpJa<^GfeN^zAcW8+R71!N*`7{<&lU0gdgzqDIjd% zr=FOkiK(-N-wD6kvXp(j(Vsc z%PFlR`%eF2B`nGb<)$#wKx^MQUna+*BF-`A4Ma8E4#P z2#WCq8@$b&FcAj_;w;c|^n7SMO!9A6=Z4hZH&N8*0NVsQ&~+%xdTUSXv?jWB^eDjs!l~lB684(?B+WpW(={LMt$0Jh3XX=FWZZ&m5cs zQ{)0v%9Hi__S{i00c$Kj-g?(O8`l;)2^5n*AdJE;t0ofE2oOhxjoGtT!m_f=3qB92 zuc1}r;^LyEr5za?1H~GadHifU-jlHoZEPOHS^_MfxBfbR@T|aWOS?^^mQDQqIW(QM zW+=+Y$of*z3!G29VF&|q$H;jhAM!ARMgbn9}ZFFLs$JLa$>i6&%b>ijo&fFYO? 
zrBIocv|Ent@Go}Dg7L~eNLxEkt)G@mF~)KuOgbUr+55rg_;KJiw^TfNYTP_M_f~VL zp$WPzelhW!(*n3okhX&v3s+Mx6r1CEQ>+UrE3e8$0AAGRNL#?u7F2S%c$~QfnF74a z?Vs(W{wtWQcJlfA zd{6-tw4Y>H$-vxq9$kp1hsO<5S1g4zg=4=~Jbu5g@3;R?QXP{faFcL5{_>5(;vf+A z?GVK+;*6S+e*%#3!D%y^i^|4@R^WzLpPXYQ!Ki~d<^{BOh!uLJ`%;YY1*Dv#WAzW9 z&4kS4rBm5xm?-0IXvPMRxbf^9Uu?Zz#h(2e1$F)yN}{V@#^+6Faj^tXOrG7nmWp+Z z#S4$U+B1fD32?E-B@1Pcf)$AhHUh9f96h3@9KF z?20+z+@GdPmze!Uccr#eLTLakQ0JAbh4Z)}AT4?AqC`S~%eyhP`<9^5BCQT|qvI|v z$utGeJOGPWS!u!c{NY23Tnkvw5=ty+4Eqx>hBynwYf~`SdZNreRI6~M0acuDhb!xN zXuB7vVn?B`^;$RE%!&7b+Y)Cj8s9Jkf1`*o_c7vl3CcIx*hO7W8tTE-sRX%h5Y=H9 z>^@4<^9r{Sc4-e>O+STiX32rk6J|d6lW_cdNGX1_!PN;kHl(!V)fyv-Tk2GM)AWW- z4gj#h75S~-G(&Zg5U`t@!!G$9q@K@~zp_}-oU=7Edy8Kqr%e_pT!}SCcEC+FAA8W< zkPc^Ck_9q4*89OJ(lA;8T-Iq}7IB7A@5PD{Uk~eExRt9M`KyF`1VtX6Bt{l642GH# z%xR)R%~OO<#4TH1R>lvpaRu9X*k|AjLuMJ+_7331+Q0hzh(dV(?doE6>G;GzP300{pT>w0lBnoQ3L^N9Q^#fDBq=03GW6h zgL$kxmkht7rc$S1P@#~lefp0&TC%xsDz5#`IL!eO;K}0{Ff=m44&A*Po`f6}Ak0uS_c|zQ>2~RFyACft3z<@5y z$$ws6JywwVuqvkJRBrAs&`B0SfQP^;{_^F`tH*AOk&hL+dJw)SgaE(v*cRTcT+FeK z0h;v`B8TpEf4{JHp4U#8LJc)sK&jKRn>hwzl*4k?(fjJ?qjPI($(b_VVAi1)-SZ`P z9!^Gizc-+QbK1VbM0E174102VBKGaA9=`Msr-}J~*$f>A7k8uql~>^I-HoWxaiK@O zP3_5mFBk;Y@Y$&`&b4SRGTEf%41OP_aL`Jo3p__+Xygg&^$J(GnPN3}kr(Q}s-ql4 z_L;uYU%G~efO7w5dR|s!U4mEdF%4PbZ>%8bf3KiGyJmtbZEJ|Zhj9O zcAN}s=Zv9HxXCqEwFeP~1Qk_h_%XPwNibn%4;w5Bi5}B*n zu<61}Fy5>kx_EiZ7UgqHv{V`z0QQknSfM8NKX1X~O1qHQU$RybV# z64o2|<5OqOz_S%63>zz}U71r1gaMIyAW)Bv=gsVDkER5yScbGC|6}-|gB)E+^r*@EEb$ zH$63Y8Ql|_le<;OdzXY;%KkG%)q^eE+T09Lfj90VXZr79Qi2y6E{JLHXKf$es?UK7 z`dGEsVT=!8GjaKf$jxI7*9P%+3atov@2nMqLg2cCTl^V3KfoD`1>*zq)`IP4UEP&! 
zeSIGuf06YHq9M21$ErB~>)aH65;bIGIUA1j;Gx1VwE@5&8_=Ml0`i{J*1Orl!mFn# zboF}lv+#Z1B^_>tAN;C>Sn2aXXx=xi&&7J1>ZU%!5R`O>hp!Sf&Y<=Fl(a1q?@vfkMaIR#ptsysd0x|`&Cg;Wvf zVFjByd^4JNjE=(M%|R7qwWOFB)xMU*Iz<`oKYu4O1^An00J|ZJ(g}DWMl^aV4xa9h zIp2)>gI!=-BE^ei+Z_q;`~n-A{L1IniWuSRmT7h&%TFRv-TJjX3Wrq}I-`wEfR?KOnQvK8gJw+t%aAO9yzBQP|U1D|)2p$QW z_olz9)yP7_5ZE1Eod2P$3kayJ%G0;a$lN)^uYD6z0ODLk`=PvasY~Uv|F^-xX|!F0 zp9o}RaB69B0ql3nBp(CNUyyX52*-n9^mh9*b{$r0Fezy_d~EEDawlh|BRQ5Sru{)? zi_7bl7X71{$zwlyH%VWeorl%PnNDY>;O#Qp(p=M>^_vohi6id%Hux{ON?c&jvoDO5 zrJx!IBugDRh^<1a4_Pe8>@bRf2Q`8JL3RkC?pxWL-8MyC_pM}6p$P#m87)iK?bHMBoNA5A4mE5Rl!6Xn_C@be?K9gJ!Z%`>tD3uy(B z{542i+}ydIEVE7~dvj~aYia{<%|KA%LljleRP0h2Ov@YkjcG6@&jdOKFhS?{cL}M~ z8VDOux4JT7Wae9A*I9Q#`QH8e%-Y-a$kzLD7?Oy%88{pUycK16W-qUJnwnI^Hk3mC-TU;br*w|d;)J_@WS7FAeSymRlY;N5LM(8+bKR;nO zec+s*pl;amL;KQfWp_@GPEU98nhp^2XbL#u+AE_t(-3*5X1@2Kja;CgpZ*rM_o65^ zZ5NVb>1ZRK=@Px$)Ar75#u>w-bflZ*O1TzplNQO=jV@7Y1LrlJli~X{32hjZ^_Wog zQN+wc@W`&2cvtsrq{43tgW5+TnSq{7#HVC@!#1FxqG|UzGfCh za_$}4aV{fp0pOiX)(TP#vF=q;xi9(AP5BqPHWc0@2h|KHBf-5pP*~}uc>ML%Lw-+S z1ipW-;(JlSriBpgo9~(2H-n$^>p{vZaIo&N8rbU{m;AFRAZN!Y$WGWL;qXd8GEJYO z*gM>4H;nS&s}2weU)tw3WInKv-gfM%S1-I+wwH>lU;Q4ZO#hEZaIaPVy)k&n3(9(A zK$px%ZIXFGD5a%Zx3|YT!1^CQ*M3FCqi+OyJ)k4EyzM(^kcYINjE^Ep6vS$n-dEA+ z+cKoKoPRHd=SfaLxkEfN?MmD4-E9Y$fQyHtUPt#()nJfSIAo%D&7`W+4+(jQ(S^@c zQq$bhQh3*f9LUStm)(q`6sL zZ3y?igpMmJ+OF4U?{WgpwxuvpzefhfJbhV2Yr1&l(gHf((V zen|bTQ<=p&_nj0lwX?IciR*$YngYA0ot}FlD}@7U^ZA$mr%HAst@1qBe{B$#E#X(> zph%Qac$4VUVeC*`?1deOCU*yEv!tX-aAGEQ#nRHF#Tk8DD#0dP8696#-y8n}iEUtF zm=D`(&Nx~)GA3M0qi<|fbfT>Z7dhAe6!c7KN8xXt0Ze4x?MG`zn z5ca-+?3WQBKCbVmb!yayRXJjf9NT@g1H*m1}Mbk;$xZ_XHfsWvpFbKmyC>E&>= zBf>S;0NkMJ5U?b{hdxTwBCE@@w6<1LLL%=M8P=gt#p9O8SoT%m?fe56U`#L4lWOb; z*LPL{Rn{;1RbdHap|2W;sG>v_Swl6&_?l+mz5o;jyXfhJ27Ac}&h~21P8W0R!@S%R?TgE#PFI=xul9Go*wv4^N>1Wk%Sqh+_9&(+FVm* zA9;TYgpR$OG>*#}H+YTRuTwg-4T{^Y$B%C>RTFH2y&PIwiB`8m0B(U&)t{c=IZVPa z@f(&CUdz!pOqZ870nPnBI2b`6I>uGtIca!lBn|^LP>_@R14}Sk 
zY6?(XC~7fJ`RpKD3R_F7%gH_FNF{(38xohCU^=-q0{sgo}w;~PR>8%CqQ!1J)!LA1%adWHBcl3KRdye3>qce5>%r=v-)SG9GuGN*#)s0e9_~? z>Q0Owd>qI|LRXKAAMpuQZjU)l^A8M!PHFSR!;0q^mijSlWr%73*=wK(!Zw8^^bXvQ z5Wm*?{AX-@W9ONZ=x4_tATV%jCTL`mE477{m9l z>e1_X=9pc=ORZRFtJ84|44@6>Q}MW(zlN|rydwt((VFj5Q$EUMfgMjGJKVz7PWj$B zIDxcb{Ca4851u)0Z!Zk~yyvR~xrjmlOeMD)?#mbPtD)B^$Y0Rj0xV2FEb?dv&N-&-4eM04|Qy;$>9MKw0s5 zfTPfPL*E4+hy)w*J}hE!B^rtsp4kf%`q9_iCDxvk(a|IzziDs)%F62ZO7(KDvPHzf zw|>+$Jn<+}A1)SWv3=)G9QJ`n)LW&u(fGM7F1QmHPNt`( zp2B%IEBMM!@L*KQFZiYa?`&I~!mB_r$07>nMQ3MAA1XtRmuQSMofT^(rD z(fJO&J}O!yuNGiB2>&@3C5RAovZtn|-i^(nPKj>}iR-6%_qqx^0jT%yjhV2Z(+6&g zuab}YBZmtv;j@JrA*Aw)0c68)l1FpqnZN&l^wkr8J+Xs9{Fp5TP~GKKybVE1FcP># z)fsx~$=#DsAbn4rDPP`?Sl$hTSweMr6i)cCmVR2uknaLqj! zb%YTt%O5|saxmVUv>NiSN!yO0H+O0asO$?EUNg4{Xp}#BQK7o)WH&5D3Xi1gTe)&NpwEB+ zr?$?9E~zOruDf=(14z*g$sX=mKZEg*2j!a^@m z@hGOH2#7I!%M(Q+L6Vc0SpB9G8v>s0*#|4iq!kqQoZ07Xfs(;W21pq-3C@Y&EYPzM zMYxfOPqdS>GuRYhg~G%e5hY$ff~HLy8y|WLkU*GXzYr~~TqWMN6!8SbDO3V*_=GwD z9Uripu;B!?V&vcM?Zl`Aj)TB?Vc{aYGtjie8q@d>efnbZ>5^?rjB19xEnqhYB7+`}9vYUe3BpB` zt*)R>(BGek#cala=Zotj@zUvYyQR5+Y1p2OJ8v)qJu_hq3hzM<+U+R9>EmVZB;fD_ zJvz_l1XV$s!27|v9B z@Wtl-SDo?-=fm#2dU3V}KaO|`We9*Y7KXX$!Hbo?=we}?i5to}J+j_jnoUqY**X_< zJ^bQ-uO-$IqVbuwSH_?6yIz>LM7UD?pL0~M2^zk7`=s}-$**Q-I^MhC9V&F{Qn@psG{f&?ZIFux zoQ|2}O7BKQ56dH&VOPbVc!F4pF+xj;9)@P;&gJhCLJ=T-k}XTS^xawU)4AW>q~g}k zf}`yI7L*tl;F_wkC0ZY9sZ^!}i$vwuKRyh8zvrkF ztR;-IPZQlezKyy;v2#Gg9R`-8dC zr+j@MBTE9d(#1s~`0vaK?$XLLf2( zys!$D{;U?35Ecp10MjA9rC5miy$4hqsLhdXH>rnTbkF2rhnp7AdvN>Vu1Z8~K3n^Z zQWfA3{1Kx>VL=VAoQpC@ov!pne^nO9$QV_3674v40MtHcx9x56VvTpg_!=dHww4w+ zIV~!%U(IrIrIbuvf~{P!;_RWNqN#ew~ykT;Yzhz$sRrq@SBU{zbz3=2MBYy;H? 
z4nyZjgk#a-KlhU;K?&=7;-Bo7r|`w1HBX7hL75#k%Y^VRULdLo3H$u~%Bs!~{{RmD zHC>&ZIq6Ea1U87<4P)Ty>NGnSMLv|wY0ti@BZY&XHTM61`{5z=Z7If>5taxs&MJ}3 zE=?4UyI-&gF6s;DYw>`Ogd!S-_#HQ|^?E%(csO%RH#==GKaK)eU>!K{ z9$gk7dYm?L$obHP?Fwoi#FoMjt)%|{rF*v_2m_lB&=5Yq>mB#}qVTa59w@}Ws|G-Z zy1fS6{_bu79nWCMj_zU3avLuPi4;Nl&rsj$TN@al2`jhI=%I(Q<-vLU&s0VtqEcNc z-K*<9ZsCI?;xPuW$q*yQZU`m@u$VD)d8pKU|MhFFWlOf!ml$4sk^@|mfTgB2;^5t& z-C&r7Q57l}@AQqAjq=W-Ab+lUwqj_5J86Wd!zXy4nOeW(^Yjo920Dphbm6OJRD zg_~-O105a)S{SpO2167j*z&TDjquu(>Eu>nl4;~NP-TUdl=RcOaLLO_&hR$}E` zun(+YG$Qy?_?rzD5J)AUg4)^Ls#+qucR-f}$_M;N&KubqW<`r$M288^2u=Xoojm_g z6TChU7S{8*cJ)Oe4^KGh5dxx&bS!~%087o^Re2zxgm(Q53Sn0IT~N@Bjlon4uW#FR zXlIJg(kAm1+`jz+F2IQKI&^4`(7YL|GD#R8rPaS^D2^BidwUGwgn)Ak3Vxse_T;Wu z$we?;mC{|PkZFcy#e(X5XVLB3QBA`zr75%de1jS^68{=ISmNj3EPal8skEfzFB)8A zWa2XgNB}a!B8FjOx@lLh!gmQn0%-LC9-|C} z(|gbA!$R}~sO(`@3qY%4(a!+7Ai^FA9Si{VwrOl`+;&8huq8VTW{%cHe-THEDYETU zMJtLb!m;Vs=am(6)fCrL|5AiHz1AO~a!2eG81l#k!>fdd1SQzHko!lXL{Uy8U%SS= zaiim;#CZ}4B{eEHl@m8DQ2-##0H_4qu<#}bmzi1;(KVd>JUu@zCo7BdT922f2&Y&* z$`%WYc9#006&VUbTU1pS;T;Ogc7!0^(hXk&ONC&B6g+}ADaL&gAN)9qfC_^O0<5xa zMPjPEl<5BV!b>Ds8rvMxz~LXZ>hX=`Z;oHIK}(0{3$)2S%Q6O$gutN6p^W^9SAqHu#__*@ z5Z3eUdRg+9Qd2EY-PM2tZu@~G=sp0we1|FpIg1Mc`mjwtkGEjVjdquVv0aib zn4CC_BZXk;h#x+ShJ{w&W~cI<4O(K$#Aqj~RN-5eJB%w%Dqzp+YiU{A=r+q)i^Zf&! 
zpB~3?4|`*^*805P*Kl6vb)LrorkK=r7FxtJ8q4w*-L~QvLy#9dCkor+5SYYifye&n z^1{MC?kY=DU5e0Xp0jW}7MhU2zxGgzFyF0tJ9&{#0$c1?xtZKjj*AY@o7;}>2Q%mE zZJ&mWk`1`6IL@U|Frl%w=8yqUdn<+L;+MR(An>Xl*^e_SxDN1^R*!$qfQb5DY0#hz zO*g_7f~TJzI(&F3pd}i$96H7xTn6?uPIrEL!y@&uA%TEO4-BP}5D-%20+`XFzhS`m z)erY?$~UzOeOr7}P>ACbAfSziFWgtWt`Oy1B?R?T=nA9Oe?oub*{fH*a2R1eZuXB) zVHp?(mLIqlu>9bv8v_ULIfhW>v*eR70e?mThLGqFaw(yhpmbVXI%yG7Xpqke#aE*+ z$;)j2^L{Ff{j^Pmg~e;1K3K*3=hxGk6=xEd8VcAOgG^yWU|?_mf+s}id+~giTC(vL z;nxkJXf;5EiA0PcgWV*#Gy3sj0skd!oQwW16QloM{NLc3<9`pRNki;^v>JCD+V{iz z&c8DM?XH5&bf+>E355!+X-~L9_dIPJh2(a_-raxKG_iY&mZS-cHY_(*rh#hB6WeXr zL(}hh=*nbC23_lFvh>*z!y&hN^UFEr=J91Q6GUegkZcxRa;QY?-b}q)&z=~qY|hKe zOCH$wu{_FkfJ1}y-0>Sil9^7FBVs>k#F@ZMur96^+Y;4PRrz7`T!R>9UgR&mhGYgu zcSB`r!nJE~yxcICK6&!wQ`zoFi+)}xP$}%LGQ$?g#OUb$gcwP4+1cvT5eUo_6xy`x zugQpRVO$eyxk_-F30B*Kj_(FWHM|;hS$-|OzCIRh*i8=Rv(7BMKpR=OGEEpc(z*&F z;|w7Ak!$wi>V=KOO+qvd9Wr^Bp+KM^$Y-cuu;!M<7g7kgCP^>dHGqaJaC-BnKL zd8jWIRX%?X%>76}D;^OU*f--^U@k`md`{#OGGXb^9{?`^z^LO1#?0s&pn|rxsrv6+ zup6|QH}&lazONop_EZt>g*VlFv_$p-bm0uW&K{YR6xBBJeu!6egEv=59JXBJWk}^Es?6Vr-R$KIjuK@u}+v3lA=A$%A z6e6`B*)}P(j%tO57-msPpUYQ3=jb64^8$p#pRDH--c+l|}8CT0)0|6^+GphR1q2s^z>}UUX$uAS=-c9t!dehM~C#5)dc5Z_=de9rFX#kPrC#_wj<+f-cuR0!)9OMPHwc z0{|5ovhXqPD%Z!?meWgPa8&L#Z}2EN2E)hs*?lDEZMbj%hbuT`Ynz;UuT>v2tbVOb zo}%Cdy~9Um)v9-b8Oih8>QBoH#S8D=T|r7l-1y2_%2W0Zsb{Yt2<-c0d+91hr{%5_ z+AlLt#I6O^Q~Pb-BceuVY0cJiId;mF6F6EAejM(D zupe`r=-E<0sMrtoU$P+Fhwlk1c-GlP&?<1(X#4s#Z4*qy$J-CT0#{42ITjtgW?VFv zRgj1T`&j#V@?-g-=u*N6(NbWQ1i~owJ~2y(wRfp~IMUDqY%V+jC8hrq-=&6DnGkE{ zkrTlfAS6PWu1;Z_(}BCRz%gS$n*3 zV+*Gft{?iU2hu&eMnv7m%4*1vItI6Fv2aBMUDKtRiw3nQ;c5p&F??=Z7H%P2h|<_e z6B+416cGv-Ge8yfh}Gn7rp9!WGP1Hee>GjV-*RNwznVx|g6+D9jmJ@l;gzireNT>m z=hqh=DcvU{qFypG2g1S_0FBCu?7pm@q{)cG_qYyB)cJk1>?>m4?^T0Iy>MaNq|KHo z6@mb_w)W+~%@Jl7%`ZHG$T~?!NAPu|`Z^~v{V+|AWpML3b8v^*6BgDB)?k;1YcDPy z31k8ms3!eb{_3|~vp{}?I^s+bxZ3~z{oDBe_y1mf|4-$`9(md-1&wWK(EeS<@fQT+ zz+(yXy9k0C{$N=a2?rY+msDPr_RrP6@1{F{L|?o%q7Q!$e63LsGJMKzhYd{_-^aMqjQRY#}(7M$kQ 
z=XPsZTK>CY)Tjp)%3DR4Un>ShEc%rh{M=2qPOs+Zg&k$;In1!Ze%^ej5U<;w9Z~Ti zU&WOL!$;pw=#>j*=npq)YK|IgY13`zSb`lboiMS4X=ssNq*5rCMx@4ga9YH%bh&-oMR zq$RIIG)>s>rU98(0rAttC|Y6$Bx60WbZq4q(Mg{d*8MX)|&#VfK%(vQtEO7)ei zR{i?%18&$PTEP_inLj^sFKAq7aV#t>kbdD!BClY4(u%1i*G#F{m&EMRO05$7a|htb4#OA?-~sv(6e3Yk3_?mtfoD z<8yE=um}q0aS;)<_4UH|9epOKfRHT5jRTMxCchW>u!g${CCum@?&-jW{P~^l9ssph zu$+mL(h-^TtgP)kZni^76r2G}D2EJqM{UkaOt_Ntyi!{Ip0cv4IxA96%}xjs41+6iRyxPm%xh9{J@#l=0U zLnlgw*0Hk3!-la$O%6HNNF*oe0gWGuiR^%Jf}AZ?P{r!;j3@O%Q}D=SY$gzNtVUE_ zTNi`A2TB;7Ew0pI3t`M@t5t*AFJE+H?88?2>V%RLQ*yYFJX1z(1dkO+(>zBYvyR`tkr-oQM16xR4FAo) z$B9OxZ}-OEYbG}BC?pLEr5ZOFUQ9w_;t2z9O*Rrr-_c^c0SMwlcwycyG4UPGYq#m(tvMBvgXIh7 z(-cM>JSab4fIjN@#rJ?Q7>Y8`k&}#o*#M!I+JLP9ZXB$VgHb{DEEvCY`zP0=ULUHhE@3P+SDK!A&-{jw+p>> zjBbb_vcXLZj0IOzZAJXBGPg7@^NfKVEW`E1uL8x_p+kl9TmO@TW*MW4s1ESwPNrat zLnwgy?jsogfkE^1kCa^q@EuwwH$ZF&Mu7T57a?MgFZX*PO(V|yM;PcNSZ`${1_*Wk z{%n6WFN11C_wjO^WmuP3ebobz^%wypTZ-{250#Nj+L<#hjE9Nc>J`GqB}QT3(A*2) zT;ah;L-aiWjC5q_PJAvH{O~1RBuh%aCM6RmZ+SOHrRVHgIAp>E!DAe8p}CfpxZkQT zV!k#69GIDvgP?9DIOoaz3)(g#Ly9bI7tu>^BiO4)>`~hT=EiGA20q|5Cja`V;~6tQ(ULE#{_R`AD0{aF>@^|`xLU-1 zxO;oBYUsojIRsRZ@VCIlFyjNa=w%`Y<>;ue@Y>8>#I*r35;-AAb|AROPS4*U`NQWN z(}*lOreW$;ydaFz{+EmyH6!W_c?89o@KIlCY6RU<2;-FW6ahTddR_1k`UQo(X4KwM zhsp3rJ25a?695tq#ftdcl~wLLZ#;{}9|NU7cW#8I=41jZ1p-eT*$*o#r*26EM1+gA z6c#O9$PBkw!=BgX=ePuoe5|G#Hj#lLj@c1=$7(=yjEPyRUru$>>3NuS{=CKLBL#Qw zMl>?vVxrJfYU-v<*C5}uYhVw711CF;Q%2AwaH=F?>*Ql%;A8R6E%ElXZ2&eFLIWDDG~>#~UXv zeD~%JX6G$uKzP9g)p~goyUvE_Q0e2uQ@@r$jhtks_R?tjFjPsblj$^@7qJ8L`txUBYBQOBp3>T8~>akzzG(NZ*z?F5b{ z>TyLC-gO92P!<3Itv?Pk%Ob&A5WOA;V|Hd<%lGey^WD|H|NNzKBQ9{lcJY!WjVtFZ zS42H#R4`sH_80KHe2Onm$n>dy-il%Mn8W)*!PZP&HhAg-b9iLTp@k(B-3QQ;eF<~* zk*6u00Xk?ABnp}{S0&;7?wE4KB7q;CVErT{4g6sBcp#qCO1IQuCG12R83`vy5#}<1 z*VFU-uX8A%u>N1X=+)=XOMx=96)F0_Yy>?6=wr+QTC6I5m92lS2PXJy&mL(JuI@}{ z(HS!|>}b$~mE@R`a{fGvc2Rl|{vI1C$X3&9Q?OS zd()KIKJ6E${@!Qznn?~xeiM(0)ItnZmI?By+x|18L}isd#`g1G_KhnwC{KAkGS|a! 
z;zX>Ws)^h`-=7YHc0dJOBVOlL=lI?#=al%K?Z)3If;9W?qZR=;HO%g*-xA>c-|tUp zO#D6QpTqZ{SgvOU_R%8`y3~-c$nA;?HgwGC2f;2!W1K{7Lo54RSuc|@;2ys(G|Z0V(2(iB zK%lYtEOgS2yCgZKQR|JnUhAd>Yg+Vr>&8_75Ss5cUuWGI!P*@rGV=`#5Al(E^5SGj z$;yhGe-yStCzO*cT$c=kB{h3RXv2DuQNsI=tw7RL0uVhcH#0E}rK{i;^4p@2K!XM_?4 zLRuqPw+80Yzdxgo`mZiM0`Jf_hAK_CXDtB$!{}Dya}I(zsJ>!n3@NjobuNG%>n8n>B{aac*J$I~Jd6uQK zfZNhCG5~af(dajy9fXv~u;Jb`hk`}``I3`JwKXaF;>^>JsARP~ATb~UiZlklVC>G4 za^{5~NV2q3aVgd=84Gj~gI2Jehc_1ejo_+5DIB^$O-HA!x;nPgVG|3!zso=0syn9l zx2vCTY9DEhkG*xJ$F1n$zimU4d-~Ib_P-adTX$iJt9`nw{i5XLCC{Hf&Az_C#zA^S zp>^dpDfvRRNf|d^4k}<2y6?3}yd~@J`(lSNwHuEV8=cs;a{l~2071$I?}I~U9T;#K#THR` zth(x+v|G304x~}70NSDqYLT$X#H8{Ck%gFf*rJPwU7ys{gB_dL@CZrFJaa};1n)q# zwd_;w7L+HT@vPTYq*iWz8?!+vWh(XL6v4!($h1P z8@@Q)Xv(2ahDkW5V6B^Ay|-@5%$a-5HHggYLogQ1#9m$BXp5(3Dp@f&c*d43dKnQH zcc)#wIzDvGd*CvE(IKpf1UB)-1k+W=siF@&m9KXsdb}ug6x1sY_v{I{K6Py?kwLkB)tSe-ljOrlwJH(on?dzC%2M zf`enCqe=Tz6{!U$O;2Kn`th0Gw!s`t9_E=d8mVi>QKu58A3siIn^b=9nHx703p~>8 zV&md0<38y6J|Cz^y~lBnG2H5DW+oY0*U4Pr4sy2Xfq9Ju%)azFlBS}+8p|bsKMIV_-hB-@(tUt1 znccC&sB%&UTR63PBv>#-ejto&4nm_IoTo{y7Hi4}zMd)Uh|nkC%d^loQJ(qa&g`^8$$wXpa}h@~chU69T`1p{g%R0!{*Z7;`{{IM=Y7K;HIGUC0Ib?@JJ zR9V^1YhoCi3BoMU)cW>i@u|ylK`rIahYKmZ@LeZuRu?BDPzb#mxdi(1-~0@oFz5pb zkFovw?3Mv7SC7tqyEY}ejep*W=OiHs=wlmgLE(oGL5>A>i`GB?Bmt5TJT5zl{pSG$ zTFFXIP_IEd8=sEC%06_?XfRC9FG@L}RtJ;|LX^K_09$%&LL;3Z!U_q2fp_VXYj%8J z!f_BCrJ_TE8hdw5c~taA(x~B)jN}fii5$KKK#a2q39Suf!CE!pt5$B(FhSlYRJ$10HBdoL1 zL2+kSj6)5z26;Nv0zg-&E3^J{u3`kt{3zT5wTRaM1q^91!?QFuS7eR8V5L)bI5jW> zv59Wa1Z_Av1LWSJCx{fOUa6jLlD7aX>0X#v{`&b7;+JrcED#$o*YE)_wcEfS#0FVt zN#Xq*e@H8BoxZAWA86;){X>`~xX7xQrx(*_KW+A^;f{Xm$X6Z`}Bm-fB^@-IIA9)&{5h zeA|CAJ{~tQcu!z71U)M!m!Ve#G!Z`{5@p(68>P(;FZk{gG7;?gFEW7y0=s0Xc+( zJn7&0gMUdqV@qJeOTEx{Z{NmE@kMVtGV$2QoyirA=n_U9J9e)VG5>`8RGcfDjhWmy#I3BY?@vtSSiEV%qQ z%s<9+&8@A1-Dp96rSkZR6OTkiQT8e}+G)$&-1*yw5O<`QW4N0Anyu$KWz4Ec?Cex< zXoxVZBc9&*eBiVMYa_!W(afjGoI%bU1kB@fCr*TDw3=RMKq6yw@KMakTUwMRxY{)8 z747}hRYnmJ>i~rXTkTw42C?*|_+mTxWb1i&BvAV5K7M4csGI+}loS}4ZcLcCspJC6 
zsQ>-c1Skeq8R>-7yQ~8_C5$lHK+qQCfd9_dV^{@yPgU3->%Zzi+Wa2(YT~o99Fi9* z&+6l^Mmpty+gn@9D=6q9ut8*Lu{m<+5Kxa@f^|=&@zj|6vaSlbSI))WYR8V>5-*Cp z4gOu$OymQ)0G?*a$K?P?@*7VaQ#(LoPD{QXfFmpXb7X^cXhZ=aL-+L=`o9^Jc}l>C zj{nhE9J~Hs|MsBM|He`kCNuw^u~cKchGOTeO0vP*N}ln&Fs}M$FrwS^|6ajo;30_5 zoinU#SquOF23z6(-@fcRn5bt%Pj+SX2!sk8@BV`aq&(U_wt(|4v;PSH0!y~S2M;bm zg}8IazsLnAIsV{B>O+0~=&EQ7L70wii5(~!3XS2zU%YtH*!q*p0g^+V+(XhI`9tu- zBpX5Gl$xV!*RG&iTs|m;s9NqaW^^-qhf?^e!SZ`|{%&0aCIYo#eu4it4K5nxJjV*Z z@43Hf7bF?kbxQ3UvK@1HXGA8r0I0Shx8aO*1_LqxcyuKl zu)kce;0f6R7NH(04PcL~%<_u-VYqwF5Lg&WIO9(Y1~&}eAV)-?MWBquz8;C}?Bt$> zItR@)1dSMTjd9*;=Dmi!^_^&81R2<3oQbnNakq#}AKg}g~#L|kutYI|#x%y^*6&2PuUvvJkQ@_u5 zG-(nP=(5#U8N9lnX^R)WP~40!4F2a;i&HB4Nbah;F?-l@ZN;dlD810)H=Z(i+L>PO;1Ab&mXzFYBU@W&BHW3e80{2uvNvU zpmoJ`N5%TXWj`Z}?wPV|mv~xvXK7H~4bziWNj5Mv3hH`GvNpljr@#G&JFW^Rt=Mq} zI(Btc|2*6$JI*qZ?WusA!>Y5fT(xZ!x4DVqTxe_0(>vH_Ah<|NkGgGH3;nk z46+lfAxP9Ug9)=j*4T;%M9IpSpK>SqXAZxX5+4uw1m%~`UBTAofmyx3MVcJBnY(P1PpiW+M5h9X(hN_ga>d{L3acYo~n)l zneuI;msdxr?;mRBjZ}rn3kfV$KWtg6eDM z&RaP-e?etgoFZXR8!%Bha`B=T10BYu2=c5|k$nf8FA6`w@$YEoPecydU-yG0EMZ?2 zSv`=X`GbbGI0&neG>R~Ze70=^Gas*~_XCypa~CfBqA;Y^rQ_mX*MIz&#blDG0Y7d# zQ%3qACI+CWDOoKRl*|l?tnKabd2gn8GuJrCVBlG0Wm8Qysr$jP3B}A~xOXV0lSY(v z-7d2l)!KVVsb!K4m^Pf!zY8@FgX{8tZdoVc;*~=(VsiVPWdHy}LFbO&_3da5=7anG z`FaB0+b|raowRw5zAZ(=5sIYm;aeTjxux%>`-^!NlqImSHEwPy>~~>S%0)}+^(`a* zlU&n$&BFxx{O-Ce?iLoB?=`!#?8<>kBvaJ}9b@&5!Aa2H>&#bhwip)(*(fZ>fz{RX z`Hhc&@S&Z!tzUl&W+yw>f}Qib(i6k~R!~Q>y0(!8e~iuH=@!0e0YwL8q4gv33RGu7 zE9>v?i6TMA3?vC(1&R&vT~6pBpb5*l;xtvUpiOTG5F)f6%=h3d+S}S9g(4>+MO1Li z{!nJanH56oeBQ0l2+&K@^rKgPf6T2PuQ@!T_$RJmgQnafs*#VDE4C3J5!XjW6TU1U zUmX1(>|5>;7Skg{Kw;ZisbFDIPz;}EiqpdAxlEM_XjwWRP0;Bm`x#|Qy z%!!vSeG2F``Ac(iGz7&HrRZUW;^b5O24YXuW#_4|>8N0Yzr)j72XYSE0I@+b`|!{@ zqNlKj+1Luqrr5bNKZ_<&mkj1obdV=@@bC^JrSoH52OTM9kEZ^Wgb49Y3rkC;c+4@*O4)$Zzs#oA0+v>A9RJmeY-P7cU`w`psK1P)Bo~vGe+QMBj40YR;F|J`HbRtUiW*7elX#D5u#@~)HSO0;+pIkq&T0?GIt79q7I zUo;N!kVCMKoHAF_~8Lu;#E~K_C0MZrG4R90g+l?*nAbYR*>% zV)D#jSfJvq-1LYgCS4R+uQ 
zxiEX>(Uq?rFNCnmmuZhTf!v@H-pmrl$iw$I$!o?diI78i=b4ey(NfIxRa@r8)iv{} z@oj7lCG`?-!1i=n{S(t$9ZdB}+|#CY5t+@H5vg2sZ>LON_Z~eEjqV)hZDW_vT;qWwZd|QAiwT-DPFg`^ z#%Q|}>&o(EJBUV-+KMMt*o11>xab2}o}{J&+20B;U?0vRWO+tfA|~<-rup)L7la3d z^Dv@KLiy*zYTZQkR_kWLN&{=Eo7G|o@hPPL-oU_Vjgx!j4}{v?c{`~#QYwN210#k` zT=(hd@ z@ni-&BFxkVPTA>j;R)pe8Rck946J82ZZV|~rPjTB`>3JxRM<@DHTlOY0#_v~X}6tL zySa5Rzb$;B&~gWcm$z@#)6FW&AGlVPIcZZR2+JZj9^JoB+aX##afM0Iryg(TEs_m1 z-TLY#`T}{{j3Da{>(DqYZ>#drhEHM#Q)J$}N38NJ^=TM7KJ{86v*axiEa@+wd zPjAaC=)GemJj7afoBsarIOG}F;W9ZNoM0yq%pJ#fH%zEFy zn}z?28Hr^}xjwad!JJs)Jcj|78I=O=CzO@LadB+cFSfRBA$rnXP>~8FS{*}lGsrjm zd8-i|va_>~jo*8JH#i~k>RTs4GiK{kn@BWLMw4W~m@R z0-NC7l)sBFJKZKD5}aD$DMbI&GZ<)*9LMrGE4n_sA%Hv>d9bZ$idgaHZy_nDCMV6& z6rh6Cf!GV^O`4R`a&M#{qkr|P#-Nk%Fk&r-|DAKv>VQN8!p^aliPlL?pPZ<~uWq0f zCV+tyRzjK+_O*~z6Nn4*wl*Q}*!bZC`pg3FM#x@{*~dOU@%yHW*HnFB1d$cl6V5$x zhlwhc={B3DVqmY-v^4OV!v_xvD4_tbhbi5ii@C=3Ka^tZ|1?n!Ml{Ko&t>XJRDMFw zK<2DwJBy(Uk~L%q^cD&{uGr2FRWO#5hcx`=@FKAED^xIGGQvyN`w1ewPh6*|_paPk z25trTD_DY?Z1qIsjh_L=(nQO3yp3A(z#Ews21W1y;|cUn@i;KGyS;degnc9ug|Qb4T)e?mZ0@R);c(-EDm68g?<-o2*wJy z2(|E4DhY6Hp8lWeXT5;h^Q-fN?Iu;oH1>Ur6oB@Cx(f45XlG?x9YItda zgsP3m$WP0RB5@m38IrA_?L`fI@7@Q9tI)9#L1IO5Y}6bcgQn&O?m8bddwY8-I4Z^S z#box`CLAfo$QC*S4Q7%JcPJ{a_LUx6qE_w}f3XJgj6wbp6M}B6p~v zR5BmD3^WfLu$@ap`t~h4YMOe4trnfJOKG#}&-r8Z{ng;bU3ag&L)~mPZ5jkpO0?kP z&H*JWsY$kRl+T7X{}Vcm6;=&-*2b+E< zk&1~>pAs4w!fl`_w*gjOJ{as`z>Xfp>XO;WWWWpLxaJq^@#;ax01>yXwRJ&~jbhMmq(5j~kdA~J;y_mQ z@6x02O{iQwwRO*K-J-Rf#oJTSc!E%cVsVUJ@~!Rdo(>LG!iv$rA&*Cq%*U#!IT6w7 zXN{YZv8f5w&>Gl@Fhwb0gg>&_Ai)Tt5~=_k00f3#-WZE|krt4d*^uMfR=TzA?$+l&$$>cohb{Agicc715w(66+wS~if*WZJCJAV8 z_Sv&RA587?$;XnU0aMF_B9Y%m~ zA8OC5l+nRbKTmG& zXcQQ_>oa`+-nxE$Vx=iWGMHik>sOs(bdDY{lbrQ>{^?id_FE6OW^(FXasPi*Qb2tu6+cVp3?sLAjm{kwZ zcl;22b3Zx)=(dcj1oQ$nnWTG-ha*54Y%H_dGhsb#EgMiS8|-vr=4|@C{YN%JCLCS6 z6NpDRgIT9FY#DXlk@soYgMnc=4|Mx9rfLhm8P9B{ZGlNe`cdG z*^)2aAa*@$NQ#+>>#=Bt0%$8*Y-e|EF_MU<=nXS6GY_^-s8$HABMtuAEIv#cT@Rh; z`>WzQ$P5a7`yv#xfic2BWpuQEfhECWfa?Tg=u%ZvRds*al|tn%!FkSUmFpCYX$Qe_ 
znB2N%hnr30LGj~G59{`6GP{aNHl_2eFqCFmMWxoYOBbW|n+B7+-SYkWhcb+*z_U+t zcCTJLTQ#(f`wKcksIJbs0gw~Q7fn-_B(Y`&Gh9ncS^ZtPX=8PCctX(H2F6GN81qTk zyb+ef8kZPZ#2ndYmyF)w{=IvzT)q17?eZEhA_#cpzaxbeH!y`CKYpwd7R1;K$H*zc z+S^UOs=WLvOgX|U<=*78YYu*_YyGk*!{L&35^1Zbn?WY*$TL(*#|-whp*@CPBw=FJ zA4%Y7Zk%kdk6a7Tl1i$In{{1YWZ6jj0~|8xt~Sh};~|_gd>HGk)|BhfL2@R&ml|&G zBW!Gvk7{anv7^H|SXXu9O0{Hm$*gI^BKa%ce>IOX@@5*(CRnsV_$DAffOQHEqm3qO zoSjYH-X3P@FZkV6G*ZDW5+Rw|r+Xj@;na0v>}@S7g39D_@8w(fEi6h&9c~m3^wKEA#QVwSNCc&6OWUJO{U9Ev9h6rV}BUA)2W- ze90vOVofV>p9M`Xw9SOwWj4N8ZXGML_PXdl2{D@6C0jfuVi zZtdD%h7%%4*F>j=XSv{^g$tGtm+b`=ad!KJi6B}CY_Q|0q+slmT_Yrwx$sgLg2N+R zvTgc_W3=)h5MQgF+zhfz1b*a6Y>)E?7KcWi=rgNe_-%@ke$tZlIYmkKT)XAe-=9C9 zh>pf#gR;rXU^-}roGB5dz^6;e7s{4}MC04iGfTf=zXGEbRAJR6Kg(#N5f! z%(mEvjhnpGHmuN3EdE8Cf}*Y)9iVdih75R;srUTu{K%d?hyLYcHgRPmLS~_I zLgX2M=wEbnB|~-U8Wwox4Afd?k^J>A37Mc7*+nj&fXStRn+QwrJxO)CJzX;WUX zfwOGcDTmP~bk`k&9Ff%PuWxwB3ZeM4zy3~M5*69PCozSiTykVva=_kiI~i6j$o+w8 zJD$0TWoX4+x%pdXNWj>jlNF6fLP&GfndcnelN-|P_3QpSEGDDV9KS7i9G@=UKZR)&y`RQf}+UNmN zCrUSadDVYyhx~KxnnHm`ok30K&fdX*Qd*F$W9tM>UC-9#@(LOB1r}fxiB1(X({LrXNY88l@l4W5hGRqEpPH6|)sRby`~bBKZDf=E6G|P9?6av-E6Xb2R3}(dZSfnJOv&%1jbuv~a;oNGJ|Wt!PoFB}X)o_3 zH-sTdggL*>Ahhx_kPbwHB3WyfHsebL=4bo5_}pHZ_Pg^JAO-J+-9)VGyDr1G>D&|7 zfZJ9}me4zMKq+aLEWwsOpm~ZqQkgiQJF0(fZiOc@O;3Us zGp`X`!B0P_U^e6fi1+FhBwd$ri@^{{u0+Q4^kjmytE-PdjNe*Vu2WcA3iqW~_wGJ6 zIaA>fu)f3GCM|6UTn_LymjaoSR?i+jBpd>J6#Yrz^&JufG6B{2oja#8GNNa%EBcc} zCSy`o|7!PHF4tbu8~;R;GC0sQ=>9F$vbBwfSBjqSA65= z&ue-{uV7&qx|_oO42SiNWiS%#Cfc)}z&D<}@I|I?98WbMAWx$DGS=3Oe~xJ>6hji) zRXcuwAae>LuTWEx%4Zl=pAn=A+rFg0!K|Qyo@Kg|F*R ze;h~Orb#X~L8pk0{I(N#J&iA`r+1fi3%mmntfdqkV_#oixM`ymJ(vJ6*`njFEoGQDcB0wxLH`yo zncaTp92&xX{+#>flWm>kw(mN4?CnjOi(fhnj*C0iz9fPfsXbtT}?K zdQu#8l4o|a0~7qMKyA?}-}BcEcd9-O{`zs#yrlB}AGaa3x|=vO=-DajBsQiNPhz6{ ze7n}fYwp+&1$rDf)?5FxUVGT5DS7=LPt8`FiT54tF5e_TD1%zA0wW}vk0i_qg7XH? 
z+TnR`b?W9iow-%j;EU6*-}Box-t#TWu@E51SP0wlENBruJ;&|83!Utt{f2CA*nvuy zFvX1_S?BDc(l#`jOiNxk3lT4y^JNW;(?tvq0hc^4P5wl=N$}T&LtHST{~#DJ z;R^fd&p8u33}WW>>zkdN;@KQ2T6KmQEWe0BA0Ob2j+6o%e>ZJKswUnO`!K>5#=Rk> zSP`~CxZ_$dOGv)nLsA?7o*=`5415-rDGARPE)0<-tU`L=X(U(+P9kg9A}xTlv#}aL z8TZID15_(c;MDk}8X|EECnDMYICVIR4WJ;YsUK2%FDGG~o&2V7f6|O$fwYed>TZDj z!j)x?2?oG?M<|I9HaT@%YAnOZYB(dym)Eh$NDT#-Zj8eX?VTMz5My|E;;e*8T$Duu zGx{OH+6x)S961sxAqUyv!i5V72`iXc3cN=+u;7Drxn{g!&qLb!8+G+pp+IrcY6O?u zNt^1!*4F&35n#+650o?=I}s_uK4Kxut}vC76c?-9Tuf7JWAjRH{c*c9*RCy0u#Pyq zo#+8-M(86qgIc+aQz_`wme3e^vgjOXiKha`1ELvk@%(u{9}P;qlMMDg@4g}@9J02Q zl0le@q9w6+l8GF;f#(*Unu8k*f1GI!Rm{ngpdCjJWB;lSXpY-mA?g`q%vc{$Vl*3$IXKU0e9|u-s$t z7`@af5}|eQhB;7#9e9F}Tkqf3f*{XB5_o0#`N%QCB;qTgcI&nInA=A4_j!#+!zIRw z3G$HY>f*SDPX+aBoM+~8p0CfAm3&62mv+2?OZW6k#7uV{A6N0ffUw(6oG+IL8UjIu zK?pnbu1f|&3AWEUV>JT8M!m{ba0vkL>QA5RKgyl`lQ}!6Bebx}Mnj&{bCbm_c7Mvt z$Ls1M_y(sFl^o=NCZ9ku%-8dQdP`-H=}BNJ=8t#p+<85(xM%pf9&-L_@pA1VgqC7VkTW*X#cFYcT7loFk;{Y#d+tZIg>mtrvJ=x8+6V$rrxqEyBfwDC0RuCJ$|sK-0lTe#-B^%N&acM`wsfU0iZl8BxYHv_vw$L-$&53WBnF1M}GiRvUD-RnG zq93a)3omeQ2%RqLF7Xaw!eq!^K=i(GGi4PE2nDL{84l@oJw^4R&6G<>hO=8k)k(BZ zj8zuCT@+N~fM-8k4l{I|Kfer6N6?lkmk;sMinZK~922wBtp5gD+n;;S4R;hTXa?wG zK34hSh0s1?+V|#GVUMY!k5ICgS5%BVY#~Baf#Mp_Q$0euz(WK>2TmdMFE9|k#QfSc zM4Qk|s;P;yNI<|bT<1H=6!hchhc8dVV5s!tOIuZwEZ3)_^AX0nqeq#Z=WGXIr-S04~AE-j&2uaM#M;i8cJ8Wh9wv z9;O3yXaZ0;{&5YPOAk9ye}Xb!SAV0yjmwG-RRD{Xm2HGQkUuzwrm@K7af25fp36A< zxp@^vzOqJY-B>yhhVty}&+9+8zL_G716JeHILfBg-zy${1gQ5@*tcZdG~w7!(N-v` zG!Jy?SVxD^RzVG5b15mnH2fM|H&68sgk0h@+MEje1MBoEZ{`x@o;*1`+ZOW|`W-a+ zhmX2)`SSbMl`Auidzt&Q^Y8(h|I*JG2=ddpl zA3U<&8EX}&Hi(0qfZ9RMlWk)BVq1e{8^LcMy z^9@rkwGo$Gc;Otgr8Sd&*=sJ8W=` zP{*MWLq#Zs#Yx5;geCeDT0Yeew_CUHJfqAtVzckdw}?M%Y{i?g)%ucj3$Udgj25{0 z_;9<_%isi1Y3@{(W88PVFVi*U#jDai_ z57>&;{`Y-8h|Rq0Fd8<;*--I@PY@;{{PamzFa1adA&j$Gyiie94bA+k9@Y#i31GM+ zc2fTYCG-AyLIuW()pw?Al)FTb4cV;=A%3NND15QY$5{TN)d;IO+K)$jHrSPKR@th{ zAA;P_&SD57&e;QdJl@e;(u2bX@t2tk@v6asT^uABVWBXBfW-rLxsw4t#o41izrMZ~ 
z8!(lu?z+MBc|utOdQD2XyA5L)#p}aS_Woq1I`==a9RxkI1v=(O{uw!vw$5VqY$%)z zf|MCtx~hI2QEk+1^vIDAcdoP5FWlZlK@e)#hdu`kN2wIBJqXSN%`HIcm3r7N=V!Dr zm2R76&pwc}+{6)>^#cKqxgnq&s3Wr2UI?W&Y$*8fX{I~Vtz(;|w#svIiHF|7#&HV? zv%u%j_SRwq_`x>((|$EU{8SKVm$`sYkqHFuN`vBuT)7(fw~pVg-qGYy6=$K(%A zm%;v5td6pHBm%wdxwEfUy5Jv&+LOd51X#>0-NP?oHmF>PAN0G*)H zccAuMSQuE#6rbk!{$)ltdEGp|4~CTp^1OQUX2pl8W(Lza)OzPoZzKd@tl5hMw=>Nup|BT^_v0>j3GN86cJsyNtC_4r0hH6+n@; zSVW@%%LZk>OCuxYkW1y9P&x$U)xv?Eh-qb2l}fsmcUTTY3vWe90Lzmk9bdn zq-zKZ2{D4T1tI1{{b`c!hlKB^9yhiMB8YI+o~JCG!n%>b-6Q8K2GLXb!YzdBTT$_y z3dqp2zOZbk;P+itROkNuR32%WcR(RJ3cs40O^U+evOG-FV|zI{a^hDCzM{WDk; zFaQ&ZP2q(Kj2GUi5YpFIrVceQIL3ah%#Hcp+vm8$^x?uG1ZDoThNg=QPj5|Ii1wZ^ zdeqa~La7SCg8K6vWrn*y=F$`@8TnjIDwTh{7)_{gOiBpigkQ{R(1ch?uKu2a z>6S~O90QNx9kvP9T#N_&2MtTO(|mj@D|a+k_*NskgrhV?6y#bUCqib#x(Ji^@pMe^ zgO|W`Pp@#?b|(z&Vuk`0C=yvsZ@GipJ1a5OSzC4BQ5tS-0qouZT?uvsBXFyvavaf` zw;1w~pg!=Cyxa1qwQs;Lg#b5p&Eqp@2=#5f-Y{<9(mrl$*G_`t$d`tQuAF47Wr6kq zwLO%LkjLPWA^K43ekU)Fd#d_(o?;M$lT(v#kGafq@ol6*mZVp6Cw`afx z2z@f5YT9|oI?2pvm7@*3HKqsDj6>$j8N`Ox=><7cy#PRwtMES_KOQYBkYc0u0%UI} zS_w?2@4pRnUbX7GvDof9Jrg+q01h}EN8j#u=|_-Qt`USlhk>l4$2!j;vCRpgDc#z>eAxo;Cf#lw ztuzUhObShS!}|4rj1Owv3vcyAsS^S)v*!mn8?%o&4|c#c$4N$zuYLOh$~#nBj{UzY)0CBztmNJ7 z?2>hLbjaKXKmIA-1@S>1G! 
z6&Mz{yl;%#OHv-*(EIyUfMtH!^$)(okM%5QQWlLMCOYUp6IsoN0X!py^XM% zK}2qCV{*xMWAxACtRuVCRidGzKWeh_&F#Wm=6GI^>s zCMgxy+o$WsU|_;*(M+9bB7A`IRTmaP3ZYYFh!yLVgv(Jxgao!;Uj zU$xodA=q{mhhHHhEp0q2PeS6kmLHSUkdTtE_h%!CM61MtB$4zm z>nf5fa!ZTIO;4Y;t@->e*84lca!_$6K^^94Q%j-^rCRfD-lAUUPN1YYE}HQ^`J$)o z*iqzf((n8C&$edVq3~qbm_d)|DRnA%^Oo}J+l1DvgKdXf-x&Y|MH$6xn@t z0RX0U66E1s_}++=37NjF{Pj#A3X*6qhdj__>guzT*JU3{OkDMDjtKjtDR|vO*g^M6 z@Dbx7)$UMTASlcNb6l@sEmrVvv6GqYe|Oo|EnB>5B)`yyK%L`*T!svNIzKmku?R?c zm+_bTi@KP|v5qb5>at=i#51Xm4$3*&lCqTt2LAOD7W^sIIf)V?(8ICI{-r-Z`^2@2 zxCygB)01ebBTmB+sNTDh4h|=tZyL{>MCnTJy|iok6pexpD~gp>lv+0Sa%Ju^vHe~8 zc3+)t{8U|AF?XdzXuxy{MJc78&z_41itEYt?XEaq%wH5DRcN5Oabvu-Nuor8yx#Gs zUXr5p%oSY~dK{gd>tU7Tr{*_ZLRa~m%|lm{WUR99m+aCqyjQ$yx`dR(#pGQ@lF15( zEaI(qnTpMt6>nv%A)c)&BXv1VLbmHwyK|S;>yJ)dc=}53GuBo~xp%u-TBKI(mRww0#I2uac9t8N*Sr>R?M@-^7M zh#L8BSC6S0{%==`pzB?}%ww^bWs=fWwrfX+NTj3`e@#J@ThvWO#rX8D7SD&PMFmM= z=dSsx3hZ^rm;x}O$kD7rxyR{q$& zbNhBf5!e)xG+!t$#6_thI6L$vNl7!+Bd$);`n9H0&A64ih?gy(Mm{(8-tIW{Zc#-z zs)5d_h?JC-;d?ORke2?^+PW^=e#w$*p=e_5ICWam<;xF|Ke+7#T&#k675pN!c}_n< z9yRi}$)KrhtIpaitr8q0b?$=1m@%)KjV}b=vk-|!v*6}u5ja0lZD=sHlo&&e~A z`JsW=YdGXMKYwst?Y4W6ucVa3?Orv%O1_?3+{0p4 zZP*m!(ZR2rY_k+auj*&s*w%mbc*QBIKfiZP>U(Wnz$tY_ZBfk@2`w$PQ9Uh>i}p&c zyzdt=L`5u*JuhiJm9xQJ4V&YC z4bT4Z{u|9G3Oe-7-2<%|4cKTF*}cOpMDwu4f@Dsg4LcTN?yn5Hw zbnx~sV$zbi%hDYb_4VC@{$l@R?~koW3# z{F6IL^TT6%S`5AD*z>^ytHs?`mHJzDUB6y?o36;%T3$oLFWU&UzaKyS9b_kMHeEML zZ&Kd2&thR+kBl~1IZTwhr#D8cqTIG7EBhYeVp@(emX=oYC4zee+!p=$^X2g@bcE%khYzh%$7_xr{nj&svIp@7_V_inwUCza zN(IUq-f>?LJirF?{M#G-Z)=}R34Ux%P@6I(*Mk$wY&f?%`_16Eex<=;5~4h{ ziTyo2IaeNa#5pD9!8(03t#oMkh{XP$qaOC%tbGMoAe3B2NY~I6rDtJrl9d?8TzKkS;J8-Bsa2 z!s~@eZbg&3L7@|gqO`=4`f~lEb#@g5R5{pQmsONGYS3@kTq%VfhK7D6Qi~R;O|qI~ zaA#k?__AGgJ?rPqjut;XLn2nz#p6NU?fy&s9D7NMxtyNupt3t~mt|L%*LUCCxgR~M z-#M47w{II?dk|@IEc*7O&*A@=&RPgMKsAi}yn9~->ltwj5uc;>sh8V{2Q6z}NpD+q|0=v;u=X;8$F z+oiqE+1%LJH8qt)wemizsjoZ->nhwYIXU-4RLYv?<8!q+({AEv*MpPocNMuMC3p4P zCBd)^^!#@hokZ9lonagr`5RZ(o(>{L^bo`-&DX5z<}h<+qh%dJmf$H1D+C 
zC3VrOug*mWqlSsMY&&5%AV0K=Nc)=N&Ocj=15Iw-eW4sRO2S{%J>kvgTVG|v2f7vw zbZ~Hbp&M`ISF(Q1ZN-VQLA|;vCBMn#n`-xrFZ-+fCr<2E{y!~!_Ts7w{GE4JgclWrEd)qUAlxHml-#}cJb|t zi*ucFqjlWdv{sMT-aJ}HMY^^1|1kHa;aIhA-~TB>5~8HYl#7xisf2_isZeP`LYg!x zGDJj^%bXOYkRr{R45dhgN=2Fn6%Cpt6)OMF>bmdk|LS@1ynA|aZ@;@M&ULPJtmD}C z@BST5_|BLF?R=BxO8Ir)OS8t6zPj|Jh)s{p^?Sd{SI?h|mfnxF65T&ue*B_MhNXgP z_x0-|yjHatZ{N{H#D;kGEW2vwFYz5atnTen&{mmXn*7@uJ(NEnChN7KVN)lM+Y)-7~Q>B?_O>bCzG@EgoH?kt`;7X6b({r0sH)5OWTsk_BD)z(JJbk={df93`KOnlKIPa{{? z{V5I#(xv6%vXX`C3SJG#&o5m#Zeln0qAttMrnwugyLQT9$(qZh;#jGMAhAIk<+#fP z>)uw3avrivN@;{XZ)jKpPymi5_!F*hh3PRCE+RHjfG)7rx5)Oj`F=rBW)lXh5O=>f zHwQNz*a>@DeFeL?EJD)QxVV6|Yw4CRVy@h2as2SzFV}~zdL?6OcBw)m>iEzn#a%nT zi|y{Aq<7Y0#mEt2t|nR8Nqv185>~7~LV_&2F?2{Kdd&5?C+{(kbUX;Seu18t`g6ij z6}=>T$GsL9ny&0p3jX)@-8(Z(>^lE^{uTQ3yPM}JEnUfHp-Ljpr)CN=Wz{_>ZYXw- zOG%d3y=fhveg8ea3w<>->R${GFflPftzb5Mc(P5_H*6l@+A~;J*-VS|o~K4mE`wsB9%}@a@5MmY2EzIHg1mG)1M_e zQ`u8x_8jZkDko+;d&n90KrBA0X!OsYqL{AL)tPrA&jd``YppQk_w3Zk zC(cUVi7WhHVpIIFZjuogJF&kobb`&1*j3@-i<78+d2$d*uy(}u6ho=?g`#cOUkPx{ zQwRKb_)~}@xS`q5fPlnZGNy7XRw(rDDmt-t$oB0K4qY`uMfv%+Z;R@~q^heqsBi4) zt|q6WYh%;p!GqH6gKp=_tH`cu))t9&cAq+x6&E(f#YH44EQ~m{W|Gv{llQLH)NH(8 zs&we!L8h2DfpF_yWFRXFfhWpKuBVp^=!+tg<+IwR1iXg0KUo!{=Tiq?r4mB+v({~p z2ChoQXc>iLSk3grgzwnFY`JPD61lmZD3FSiT=8t%*}2a2v-~L% zm0pvv{N3xw5zPVO$K1uDB_!oJkLVU-`fNy-Ui^A8rT^E}g<+}&248LWg^ViD8n@Y^ zP3%mD1gN;MwFxlk;p2jXCV*1qw;wS>FTkT?EC_pa_pter54)w>3a%}U50+zp6tZ0w zC@y~b!0i)Szb)S*A-F-IPh(&zRiq}7d8-rdO`K?i zA-MHTY4{Ot^}Q;a4rn}huxnU1wAq3@fy0;CIQ>WJa_%@OfrHcgN)LF$IZAAAYZKUrtB^syog|poa&X z4Ybht^AopAxEt~={_Yqzr|*$D;;pUEo4P$N5Erq$9b`&7f4oh;{-Q2#huv4Lai!1H zeT)YV>8d6*c(CUEefNFc`l(lSSE+DY5p}gg{h_;67NCcVzJl%b)WHpMjdOcuK2?Pj zo-CLd6b>;h~ z4a&I1LYsX`vWvKznYhL6Hwkv*npP&*!5aJU{dc3dwx1e^1QX}PRB3Kn_dbz_2Y+e% zV%1&u(2tKV9-Uuj@vc4C;;-)H(aMIaex=PjTpCKn3WIw3E`1Rt@A+-9^T6ldmfU`? 
zrI~p^zUs&vqeVBvS564Eyphp4Hp4D=+qmK4#;Z69P5Z2To?$dAF{;X#?rzc*-qesTTv z_;8PR772L?@4xhH<7GgSr9OIfBZ#}%KjI@TU6dq@(yR{HOzftkH*Uc{KYx~fYs$P+ zmNUh9mejJ52J_^)h#kC3`!#c>RlHt~v!+Qei7wAqyO|jm6<^AkVwDx)S6k{;?)l(E z=bx#QCI$U&?d%p+5hZoCr=`&;sg(BTJGQKhmH%)%E%3_s?(#=x#xxY>r;?0=&yLe$ zi7(3_;ALkI(9yk=I_%54!$aTAH~gB8Nzh^wwG#DsjV%H1pGR-8(<*_2OCsIfhMaAs zCwAQIHf@l>oGuF&QYrQ9>FGYM>`}iRLpw7HGAG7+*b2m3pH26Yo%1qtBtJb_Qne!3 zf^vU{;qR4M>b>vQR&BWKnb|R8CEz;)CRMgTu(rSlo=;vcth^qzS&}_%_4jQ(R5Opr zN_V;Nvx|iVO`3|VbdTjt)wjIM#7jcb_q@nX_|vh?PJG_H#ZOJ`a?6!&72FxeMwA@W z6JZx1pr?x92Ei1p;YrY6dPVW@M}tB>)lrg3tc(RfMQ%(d9E<5l;Vl|`7265pCOGA8`sR;|MhVjz zxb}Nj#P^hq+NM}p-PKY=Q7e zu#J?EePJ?->Y&dxq^1_JRDDXYPAmR}Sh7tGl_?0*y(OdEgBt zrL4P^dJ@6CWXs7Mk7`j$*;&402mp%H@Pl1~^GA#tbuMww01d0}=l9pJJ%eALKcCrM z`Mq^WCV(w7#3A!8od2DFNkYtbgvluJ1YVyfg<|K{T6;bZ?|8nxdHkeB^Y%AbO#SCD z%xa39ckkbKi^S_xPf8i@QRJisWDAv*Wlh)4zVq=;Esd$k?lE6yn;lqfKV8YSRTOLk z!NhQ9!MCPqhqcCPT)ITDd3O1@sM^GPzy^;~{hf|(4(lkb*N~L8YQ0c7pEbzXn5p&H znrquD;>(KGcrNYU@_NJZp_i}PxIgKXjkG%<8R4TrSsim^QrGU8_&g_A>g(iRE^ zsR>T^>PmOci!2!Bf5~2@u`3>0fb*vyLr(aVRbw?{#`W4*aGtp`v@nBx#~6+4QagI@ zz-|{9h$2G)3a2Bxdw3S!TEH~3;{E(G9WVLNo8KON_eN#R#%tcg#&)WiWjV=xwP6!! 
zth)rZV`^Gz{yEi5d;f0V#fwis4#I<);z?IYx{F6hH@jG?(K5BiK+867+B9bM0t%m> zt+Q@7d0vt6G=LRy7tV&cTU_d<>pIz}3MHcnLO z<3B!84?`~3LqBqyq|_$F#a3pUj2N+L^X4w1509n%&49Y3B{UT>AO4%T7PkvM9UUaW z4P_frB*lv^S*V$cw6u%@^xw5CvHAGJWbuj>SN%dkZs9gVurV;Wap%sdBS&u3CE+P& z5d)YP>E1P}T5|aC>06@wr~i$^g$C(-7#}9hQ(mjRdPyu1!YQE?z@zIt7wtYXtkt*F z&(nxd2hT?)p82}1zJ_7ng0Ye7( z$atTyrDRVSXTLRXfKA9Yj0Olp|MO!+eECtgAu;olm%3-?sTYfxbR8P0SrX%+|LW<5 z9^t14U`+j$QcQAk8udv*LBVsgYww;CP2Ii}o9+Ix%4C~jutl65j?9Z~v}VuF^w?cF zeKfNk#zS)&-%dQ!&v$9r9i`3kqXOJ3qu9JiFpF;xEOP)4MUk4jcJF53%@(%&$s1to z@PF^##h`C;u1l_xfveBEfr+0`*yS%VdAD=krSBg-7cF|XDp7r$ehOBhL)ZNO^dfL;(#?lgV)N3l4q@3n}e zT<8kR=FcyGwOXv=#TTzqqo6naGqa!JBid0fv!%Q*)AP2KP{xz&PrD9@Dl_73qtOZY z#jkYIJ9y+q&ly#kAK~=v2pO~d+v4d`k>89!^`|zzoYG4MR3Y?Fr@^ny{(4UBixIhl zq^*TF!6d54oslgn7>sGjzW8^VmxOglV(EDc<@nuZGW*Z*(vC1RlyL1U7wo{OOR%XB zF&?>aAxBOsI9<;nlHRWX5(Y+_h%SHr3}Ot9C2vX!)T3jtK3%yN)b&e0+7E^j&=bEmG?;X7j1sb&iME9ie9{MKY;t@@%#&GYFKjISjVI!2TM zWHN4J_8WUPWG(IqK_MYnYQuU{8KnpdIaTn(Lp9}Yx?58^stj=`wmQjGwqph=h)Vt3 zbMc>p*2bwFJVQak>xq5totguYB2n-xE$Mx&GcehkdAKV~8N$sT6}hZBpHrx^pmgZj zc>RdefuCK!HAP9bUM#JgWP`s#o{MtK_}`uSor?@3$g-MwINVnnYRfd$1F(Br+^>OO9qpMTx5xO%gPE#jY57iv?I(kp=Pu}sWh8*`7*SAnsMRm zU?QZV^7EeCN~Y)&4&NnxQ%G`A5u%qcMtF=VkzdBN84J!VNfsUb(`s`VE**>N>A65Z zwSW(p-qnYdkM$S~tz&KlmERR%&bKMj|8_0ID?V;%iY+{Uc?91+^h3#6-Y zjpz7uE(hB~yX-k;dDU$6$S8+rda4>y%JFox!}t zs^=on_LLMHUmjtWo}FNMC@*ggVyWalR$tpgZYk-dDn{qY%LPyScwYkEEXD#i9)(aJ}f3PP#qpU3c*_kRzXDr0{?ZeM<54{3OKQ~iZn6E zRp<{t^C{c=-I7n1{>G=E+Mp8PE*>=lCl^ zTZQUpPQ$v+KfkH@{7ltEtNi@%0tN$f`k9sz_}R>9kBnOU-!&Bs(~ghd>*>;aWYa4iu8LoY=_=i?f8@;Y8^3F;}#QtMazIe95S;_H_n4 zaQ18Dy8b5GS*oRe=b;?-1Q}i^LSV+1i<9kPyDM(8|34vP$r5s-D#lpg#6(5~b_U0jUV1K>&w&y79z#A^Yo^sk_HUeUu z=wM!6H+gyV^A_sIPoDhxq9A!rB0_^x{b~a?dEYp_#NFgURjFx18e04JecP#lZ@K+?)Xu?M&Md@(W-%3V^uT9>G2(=Im?LaF@?x&bZEXapB(2 z+b$6(^;c3%|5{vB6i}N%QmfD@czeJ6@bDD8YK>&zpk=cKakpXa#fC zain?am0i7ctGXJ7>>}bqFp1XqG#MWHX0PQJFw5AHl7ip{84qhL_Rrv&i)sv>KFci;hk#T5j=C;BphP1c$-CU;-FHS$mesHvy 
zi=2LjOAs4C8zaaw>f6!QV$8k@Rt`uhq~fVxwq8k7pK%JE4qy85`Q=)3Q2;XxT#5B;TJ9a>=bx+wId!vW#gVKd3$gvPCC zLR{3cQzEQ>u)E;P7gbk>nE0PlJ#qT98WbWJdu9#g7;sYCO`AJ+43!o}gG_+V7P;;2 zst{F1=+^SEfczED(yuiHTzx7ZtO(f+R38FZ~@h%7r z8b@`5#5ye0(TONVU#7Rpy+gMB*tXLHDr4S3YGcUOfh#I@_{)w|qZ?R>&>Bkv<20-V z&;_&_)Y%V(l_ErE?%LH3Ll1+IWG4rhj`!~u?AsT)Yz!-ne~_S2Mc#N#ox;$Om{fqv&%s{!$pBMQbzkbcw6Qa`!C^?6A&M$ zfDH#~i$#Y}W-nfBBxez%mU8X;>nZg!=NcrcXEkWdf#1xFNTKh8931NmNHt$9Nb0D}E64&to55ZK-Jg zS!?HY;r(6ndNMMtGpe;36f={3pWEi1=mR`5?cxudmoZ`7v??t>Pq0AU%(V)t;NN{!vH+ zVGKr`2yluG=Uqs@^#)`;j1Hyoq;vuV4A+F0<2@`mQdk3c_$?`E{?DidPZ2)7KN;g8 zA;q|PAq7-#8BM*vTyBB2hb*bZ6tW!Z{YS6f@0U%@sZZ*E&Vm{b-hTX8-OzmR_&hXPqvPya0M*Yt zUT$}E=Ga4zHNJYQ3|}ZadVJVYpE)gMQW~UHV1Ts<7ywi*?AN=86zW4<+PdJbP05rx za=GjKIO2yzaBAsa)=<*W@K6PP?$%wi$n|LBst0fTp37Y_w%b*s$p*?}eIpf=A;Vsg4ITw{b*&q$OP@Jx|0OzGdXR$NxyLS@Qrh4vgod;L1F2qI;KJBoM%)VFX@c+PBros5{XEZFI)M;ss+2UG#@ZfF1e8e;{ z_uhKN8?RP#*5d$O-?#f_E$K-Az74*`G|2t!4wrmx7&q=t^f#-IPD$v0i^$;QpC%85 zRxGb>u{Ff{Kc{|CM)=+89#iycl|W~gn)D;72b++clotX;{eS)Xtqx3^q$AKiUvQmVb@WThB~ zX#Q^wfQ6E5^7FzmyVm9`@(o`rjWIx$^Zsc|rcCJ}+G?vUTMky|c54f%PzgB|+?ki3?^wFXwKhPFz^NBUZKb5y$Z*r>D ziVt^kZ&7u(aQws(kICzaf6(8?o}aHhfSL}|dgHVlIljV2y)tN(RLck-Hh&(s#u{kj zV(UFO{suFF982L205ljQ7<0bO?Os>w47Ydv^)`HYFO>8)2hzvZ-fe* z?qK;l@&Ac~nAR^&vchk>#1H3>^0DxrDduzUBn{#%}1!XsX@1}jsP zp`CD;jbw+f(39Smlk*xn|B16_;|Y>#-SGEkBK-#?*>iTy2jW^(w?2?X_#OEPw`&oW zw79yWfyc@d!(z`22b0(xsCwfa^i1Bf4wgz)I`{ zY_d|#toFM(AHFL{DeEte#Ydq4MtVlZyxFtgzI>^OT2@ALBeoB8**%q%=0DkY|2?;R zh;j_}Ei4v|c_Kl3vFrKX_tDDNzkK^fOmzhyhLE0$yC-6|+&fshVl0WNbUCsJv)^U6 zaM43V1@b2^Ep4P*CIBH9JO8|Y`BEF8VB9#tep<+|0QdODmP;$!{%Gm(Btb_f99J~5 zd-gnqr=~k)mkIiFtO--q|7tOWkASu@QMZf-N-c$^k%FGmW*)6wqSm;{f)k)=JV^tH zV+JEGPn8hfXV^zH3p$FdKq0@H0z4I^}t}2atk^C!c#0xbPkWtM?Ya@@SOQ53U!vZDqm0t zEJg!0%Om&|+6C03YIstB%}fcCRG1EiE*REwlTcm&os3^G8M_S(H#jA6a*_Yfoe9_8 zZbK?-uM=?g3w8<0jGa|JjdMxr0E@vhm%0vzUq}M4XvMLOdezdFBd0v<0OCZNicwlx zWe?m%B3cjFX!9Vxga09^5r*Cj@getB)|F}J_;ZYd29m{Ga6yr9S8A|u?o(-LDWs(j zd@E!|34j~+h&@PbWcS4l}gLyB}HGyKgi0pCJM 
zNt)k}AEVBiC!Go7unbtenwS%4L%1#TApbtWJ&x34tyW_GWFZ5af^i^@<+4%#yNI-K z;}E=adX2XFfLK4A}0eB6_|HdpNP@9c^iwvVQUk{Qa z|LrX=qyO5qy1At%De6#HkiPn}r19gi2_qt_Qk-CJzJVd$Q|2}N_-T&F1v`>3C4RnUt#fiFI5S@nW(Za2Rp9vQh*RRN8b&jKfC}|b^Xs1C1opYI2#+0 z?cTzDXbIIhwlwv?A`8Nk3?eGE&NARE^E^FAL?|N5^4R&!Nfm>K4yB7|zhhapcCO;D z6BFVJ$x>l;+P**TuVm712*Mvi2VT5Ln7euMR*l!spBtfPK$d^Na3^zAd{efBhirJ;{j%9)t^pY zT#Rz{LnCC@pPR+6}`=kvz3J-w|yq(K8^P zH6A~(Au(0EdFNTAHt1bRSNF*)p~eIOfPCPOot4eUAD!Wfz3?wtDf zV`~=E6P&~Ah8;9GxQziSs1ygwl>U8c`_tnvaFW0G((R{CtzpenYV8o=ax&|fdmCMA2~aLa2ET!w^SE%ZDpPMU zc#j+Gge3lg5CcIY0OOk`2li_0ou>mz)G}^S{^^-ZPaC*>VzMq;Q2=yB3Bg*1knVmf zP3?#3DM*8W#~!^KYS{_>>8Idi;ZZ{N0x0(8y*m}A;Z*7tSU`az;PdtD&0NfAGc)4v z4h>!>u08jNQu6bubWr_99BSS1_U&Sr@VEvbaFY^}BFt;&N=roYsr}~FHhlvSVGu$_ z4m-!`LrUJgP?jhvdWT|IP4G!4TqtHAh3C#?pOhKaPYCWFHs5t6{UUq|CqMvBb^hdu z+ZM-+k5N9e9+ZlaRlnZ7H_2o^%RF4sfp;c~LAWE{hpj>qV` z;Sv7k64sTkzOsyPBg251oMj5x=~A2gCz!@Vs^_ftR8i(FLT2e{BzOxp_vtBU^=ylXcGpHSe?6k)d*sm!BXQFiu(;N&WGd(+PSxNg{%SG6+xImIVI^ ziqSq;q@(KW9c^2WYdh%ml>R+yr-1wjwjCIo78d>_=3w-???)IT7ayllU-{^CUVgqG zD-fmA0T=xd0k-ymbq^l=ME<-7okLcZ7ef)m;o1Cz+0^@YCF)DojiP|#_w)r~S+L+0 zH9+#VZPa5vf4@9yi5be+-`|pRbKju#*%}!%Mh1mx(TlKN)&*wGR6z^g1aR2Vz<@FaR$A6<} zB<1HRjDUB7WWF%bu_j&lio2{y;^-Q{KTQ_g|I3G*twu&+CeZ>~xJkYbfFHO&A`~>m!84fd!2X(t{2Qctg#_i1eQqOc@L zDMzkx5MIiMN%ys26RLS z*z{B6KietTVl&7@2-Vr!Z-)H~e&C^bjmiM4g30~u&FQ=_8-(h2wjeoMg1Ho!Xzd6w zRlzvZA%0F46=J_B=*`ZnZuX*NW7jK5NtrR~q6@qf8n@!kf$5m00jk^x*4gaEox%2i zPy4(pVwfz*M&m`&B{zEkKD~H~@(n8koFoBjirWej_wJ4NN)>J%7Lh1P_gWNknhjp} z%F;^=39>(hCL_lsOCB!BA0xiUk(l*d=Yhf{6GuOAATj5`uUl~31d0Z2IcM-%N=8#Z z<(Cww;z2`r7TZ!vXyvL*nSC4+W+XS9iEw zU3RNu#@O)n?+>4NtS#L|R2-am)j31YQLUm|v*jv-< zp4;}eY||BbRW_AFWpyt4XR0f-DDEgL$(+&IFs8_3b)HDnc74PPEy|&y!op?5gr^z? 
zF1i&w>+?0!7e$}-b0|crFRHWEC@9?iJt^Z^^1;NN*X~#*`_EOp@umgNC0qw;1r=G` zd8(?KUOb&VJ5u}UM%a3;M|B6c!2?8C45K}06b{H#CW8Mg^F`yCo1?FIGa+AF7$SNOcbSU_8DfqzsH2pv@V~CF?|ku;!TYYn=B~E` z~(f9>yE8fO<17dOShL3^+`nr*{sA4YaiM&{6C^c73`*1lK_qxro2MX`|H< z;~&R|Ard0xRz@NcmIT%y0iEk-+21es`^A_Y2#WU?3!Z_y79}wTbvOE~0{x!%)%*V@X zL;KbN30q_`J@y?={-01ziD9GbR5tzxZ~UL~H~%LnbN3+(1p`0t1@Q5U@i=~wSwTpM z8H$iunr%Z}zl2F7>J@_DJP^iV{=F>*FNEHqa^2g`mr z&E)H^SPBg4Fux5*2h{ROr?WfSa2{U=W~L-`>s@KPB+vrHuMfl897rUOEEHC1;lKf# zy})=lC^@+)mV*vykfmy+7~Kd^PWP#@8J=ceRObP$_<+D3VC}@d@9U+qgS{#>^+8?T zH>-3=3Ql}+=Ye~Ay@}~j(M(lAWM1F!u)pcHdb+w8Wo@v!-R?pIQ(qj=y}}O^wjpmt zOYYNr%(EaLFf4(3hUr$EA4W;;7eeRgD2~Q7Vb&_fUt9)>D@fq`)tKyb z%%cSAE3*dN&Z@edCQqS&oHhQixeM$vY-=7qEIN4bEg?YXVH4o?0zygbGS+WqBW=#O z{)W@>hhc(bl-8IrLdZ~aON)}U1ng@#1cT;;@~>whXB<4`30NMK1XiPN!?%_eq}yT< zRL93>=QGO$iPP25*|=p3J`d9~{_M5YQN->go|tMTWNvI6L`g+4Ut*i&sw${b%FD$? zWIFz&)EPBm1SWwUR7~ib8bdt<9~mx$K_Zd7feh!F2FHi>$~$(<#?@E6Cf;Vr;s187{7h)xB05E?=VIP4TqDU)5%YeL5Y2Z>dRPGigF&D;F`0IV@$9MEun%^SJCxmdQ5H%)>i zu8P2FLw8=UIoZnzoC+Qz0yhf1y236m*}ssG(&~q{LIc^ponS@UTlVT#*LPxH;i#XY zZV}|Hqj=@&ReCKUZ5wih;AIY$&84}_oht;Hi>$1yM0*@NdC)@I^3^ob-mm644@A1} zM^qi}{?9L7xNcs3*tP4cm>%VV!@ppq#;*EL#wcKIT*Ul7JQ!BwzMer82qaiQzc{+i z@3eET8yk6*$)zNu`s)4reMCYC@<0-6aLB-YtH$>X?=_E|1~Cf$)Xc-bv06@08(qtn zg6hK=WqgyiT8Kd;b>DAwioobSn15my~DMvBcX!g$oCACo>vU76gbh-KJfkLVTX>WS2%WVq}JTN zo#{M~>DX8}OB%3jF(ObsMx`!S6KB#nsKtoS`BwfqpL;cqUMOxY`;|qttfQw|$uyqk08=KZI>WEf$Zjl4-)G9Q-~ZsLH$=|JYiVI6BrJT9W#!k;KEb`3zU;F<Us(~`^ z=?_>Z5MBtCfjYz+b$77|RXX!lQ)pblj~W^pNJ$-M@sf?!PrwY2c+rsI-pPr=zXE8r z!SGJN8#n{0kdT4f_$|YHV#4{~01_-;VYl&2I(`UT=c<*0G=wxChgiqiD^ae%ddta1PzHk|FFxU*jQO( zM>HFF8l`OC&Yl8hHA!ci5U{d)x4Ee)LE2haY(rJ#B^qX`=7{$Sb+liMV=fiTZYIlM zbp*9xPJ&blDh*Wvdlkck-T<2`SB}L{3+Kc*yW|}^Fj-XB(75Gk1hWPm99URQzBLBM zvDm*c&7tQvH8s`L%shC|{`h@>fZgXEirfu@!wo0_1V5)^f-w(KFw9*DC19T-u^J`- z<~V#uCQvXoCQqKcrJ}71)kvu2SYzYb2QS1#8*RM&{F=d=gr0Eua$e6}J9g;qyiUPK z&9SA2DmWBQP}IuME(3xtJDr=~01%l5=ah;es!YWq$Y^)L 
zUUK{1y|lRa;-QTOhs6jP>v|YQ$m#5^j%H;;JxSDf`SxwiCJ+sI6nQwdBOil4`sK?PP1Ul;zl0S?`*U>WMe>l*z_RbK8hP4m zPHgf_3xLiHlqYE_#P%EUj#t|!ojwb)2B|u_x>v7XA56ykvNNAQM1JjxML1J+joAY& z!C-{m)DF^-@JJxwvY|%m4ZLb3!*sCj?9W&!r#S6NuqvLo=g6foVpvEC9V$ z&wV)$msL2f3T zqe61D_vNjiI@>?RKZ_lW%2?tiUO zsKly1GTGs6QGs#A*}xn{U{^A*g~a|#KeVD(>?E^vcUD?f}rft_V>@uTW$&hvYIA;H1$);LaxE{nFUHRH%# z%@75XqyD`xt^@?s={&Yp1hO>`e#;BWEg+y^@L(cd8pkuk1Zu9F z{`9r5Lfx4`PM$1_+;pm|tHbv}+Z_%-myy4mH!TmZ;W-A%ef4mEVj^DdhtD*B{Dabm z8ij`oMjY(s%xdt2A^S92j<+#3_fc8d27|7g_&Ae6C>8pEQcdm@W7=gaSIT$RhwrIq zU?YO?LQea9^MKuckg24kw$Qz(Rp2%qy5UMig+Si@FvJsr6J5@PwP|MvXI#5>57fl(-g zw}5nZeE90)wD$+zENJNhu?acBhwN%0EWwq`dTRv+{iZjzmcIbYiL(Q>i52g{AP@`f z!*~0u$_I;giJ;%%)F=2MjKf&IVq)%{$Fbxh_?q;KDvM|+9OfdoW!$G_?~mWyHic%0)kn@LEC=NS$;yw@yxWm%SMxr-=9vZq6`0rM6$i_MutD?ud4WPibDAb>Psq?%^NqO zo;IS}+hB?Shk@Q#Q}eSc^7?4Iy@`!_VX7A2?1HoPYTKSn6OB|F|F%yXnz2~Yy3`H<2u!^_wTp6B;V?N)M-S_l}kpXr;G{huFWN=B&4MD z9!;E9UXx%TKrY|g?1ajoSU(QghJUO*^sk!wsp)V91 zPggbALOSLbY6KiI|8?I%!IqUFgs(e0hgg%bYN-ghp6c48YFf@aRVhk@qZ z+qaDveF$%Awbx1A1f?Y{E~*;yG`<|n_})VFCe5e-3yVTIkC-ez)pvA>9WzKhA-87n z-_$IQ-otm<#%?b*;dokR$r%qF1p)&sxWcL0Nz5%9H0$G`7lgMr}{v^C*3V4czM}TehVAn+mYWapm~BJHoMT6B(XZL%RXTICiUt&#QZI?N zW`kze=iO##UFAdG&+2!E(=$Hojl1HF7ZI5RD^|#fyz?L2yGJePinG9s&n(*tzt!J| z{XNUZUAuK_Ucrhz1ai|+)KGV?9zgCxgN~B-Um6HGB%HGG4X_iBoS?-GHFZpJI56|D z#^U_(dzFJXCFb@M2{A9zjqxoO1nDV?jBC=v*ed7+ceO{H^=cqsmTDq`|_Q zjQ>4_p3l)Q$rTf`YtXI5`EAQ52w>^b_wl=SOCKAZA{7ySpT59D|{=+2t)M#;&M#8Ybq(JSKfz4vlzbnKtj1biK1v*6NF?{Un)1=_c- ziwGJu@Kx|3p-@<653aE`FcAJ?!25;&{AW@+KOOw!*|TTvhO`@}sTj_mSHme|Y5zBn zC9Bqcq*|jioiJjWmJ^`!*7V4zP+a+aE$;G9Wj1vY%BSGq(WA8wRcEY~x;ov>t%Gj; z7M&Bo)>T;*5yR*nl2Q5HgWj8x_+E9aA4`?`h|&Ude6F+e_mKo;QbTyZ@+Qi)8J`z1!n9H?1b=lXf_C^V95K{znw;f>Agn({!?=A z@b_e*hBF~Y0ERMW9j&K@`e*(L4NeqZoLsK?r22>Qz`mA z?EwP-3YeII_rRIx8_qGDT!nPTiZut6?Q#4^bMt5b-Rx{3_rt;A8q=Y;F)I@nmHHU7 z43KlYD2wV$DCvk_`tpy4)Sqj*8%xTcX;$`ur%npz`9BGzWlb!C1Qfvu+a~#dwIDVJXc-^TnhyG6XSjAUVpRS-emUU1H0 zD9M5WY%x##Y*=R#XlRf)b5jKT 
zDKwN9wWapcLC?R54tYn9{uWHcJ3fzn`W^b$NPT@@)N_JeHJ&)a?^2B3T9%2QI?H<0 z)Qli*5F5U*eB-h&ZNI#q!AkWi;lh8J0F-z z$b#}YiRA#!jg$wN?o)q~|2AAVxe4z1v^?w~SABdm0#Xy*Z(!Z@?%1BtKPrU}?3DL& z;{44&b;=x*Y%2WJ$*&Q12xp+r$&3sQA_ZQ4!xs3Z&O*H+-aAGePzMDn9ikBzrC-zF znba6q#m>dflN}Fb3{4e>J65f37eb4Yl6s~%^yuFG9JK}exdt6t$hWuPGFU^v<9hQZ z5abT89mvc8*C2swJrjEEI~H$~HP8*ZksNML-@g-U!rOQr?`TxbNexbw#&s}Vv7o^OTsXC` zTH7%I^snaWMQ{rsX*P9ESz#%c3oMsIhfw7nP5u{T4jWkvk9lIao1$>wO%M#HwQuqO zfzGkw;CyYo<207IYHi(No~IsC>Wv9&H;F0}%(RE8t1TIuWRLQJeQs;K0+H9G)sPL$ z6+>gq2>n_GAhLCbH)o7M1P8*g)jsN8R7;BZWZi;bddncbnRou|!*|a*>T{W#fM`Uh z`Yi}i@bGvBdsKxZ%HYJi4~U2rh(E({Rkv{2Y>;4!rLKr&B=ZY+E}Hf%|t%d*S1|3D=RupUnm zcfS$WXjRI~8-BI5u!m7S9&JF+`aka>dCCkX=zF8>q9DYu^HA&+P&m+ky+61@RmY@v z>7*B{iOOy?r#l^%YOi&g3!`5!|)QC(_A2Ml>( zJ~l>9UjqAp|0n0xG)MZ0N`HK^?4cMk={C#|E4@~n@ z!&{nZfQ}*_X?LOeP;#ZX8=|nM@(?}$Bq;ac8X6?oGlj$u{jZMzmM+wb;K1Zo+GLT+ zK{Mbj6_z2{*c*;PZLL65QjR(BnfL}<{S*+kbs-_fX*YI;4fX5JIe7brpc?R>dr;w* zHx(L38z_765D30~ErM3jzCgqR)MPqk#wdWP3#CH)&!41o4Rt$vBr`L@R+}nlIkWLA zS6=aY^CFKhHZ}|YV-an;i{wj62tYm-A50z&NbIOz42WrKfBRV*d?V+@ z+GQh3hf}_H?|RM#(j%@uetZroEDRzxOSU~)It>K6;I$%6pm|^~NYIcxt9%USob*BP zgnm=Rp)3p=L|bEHB_gO&K}7KLfqjb;JDmu_e&(!AYg30WiU+HQQb0XO!}w!~mB(aF z1@7QGhas|(*;v?I7UZ5>BgHuo2V2>wkt1Q5@RvMA>U?lrqzrOoM<-V?)CB?5QKX^@ z$IF><5PKj4XSh2;awsSg75Zi`b_U^y^z-XGdbLh-!GBq}oNYi)Khmr`9D)9D3p`d7 zUFD-VNUFjwe|9J%J=O+fWD4(7vkk~~O0xftTMe5Ap5Fr4`nt&=y%Tk&F$25U`A1b* z*)FtKb2pA5IKhNF-}|va`#57qK%eKPj^|;65g7o9gJegB(i|!9a<9K~b#>)1^T8J% zVmrKe8Z5?o@H+VBTyYzNFB&LlnBMc})6)Wc;c5s}#eipHHzjad>i2nb=Je&*QBoS0 zUGpvMcI9D0uE`axv{MhRF?8EDe)Bj1QgIR!5nUo2E|05EGIO`UFpi-N{N zvqo^A!-2hH!}nhCj^mZH$wG~F;Otq!8HZUy)2D|$^d$;rVfaV|&$J425qHIC7iVWH z-DGxV5wA2K2P!0j1ObadFwAH+q?=Bz+Nyno1;E4oBp!X?65lgufQXbSCDn`vph<5# z-X^WBx^;{8(7YxLeixx_%w6Mu_D4uZ;de$vY*&!V0(BId=&jF@GsbQP4g&e( z$H&Lqhf@^kG)*Yw7?%U;Q3u_o=q2O+#ECBt^5=K5U>If+LGlX0=JV&PcrEB-*h}%X zVl>R47NTbj>3_mLGMdFq1u8Z5Lh!e@;LGhi6EU2Hg|IWgg)81i=bX|VH7XFzJZvX^ z^-u!?H6zbb%C5i<4|<|nR8%~P5)hw}w8;z6Mh=!}Y5R4{vO`vc_nlst#y?+One6!S z!oU1XYAP 
z=+rza<_ZFv$Ax~`E6$NPeC=AL+fZZUuk1Ib@oP-e2|w_G_y2KBM8=%WtUYxlC2^5SzvX-^MJ&Be2W$oWmAQ?<94Z(Hq8^eJ z9dvoe7DJW6w-Ch3^k4ww)R_izi?P#G>>YV-542`@Kw?%5P6+TNjjW-m3HRWwUb9)U z>^uZR@=SAq8^|W3szU_uo3?~sg>%OrP(-Hd7uWq*Arf@p;eXDbJI7&a7T>_da$qK1 ze+9TrN7`=b(0E3Gd>A!|R?+q`2xO0|hkVKqm|gNWN)~nmgt1V+3%LKtI8xKnD9Q*| z<_M-9MgL43oKu?d0>*aJ^8C42UP`V6luJ-hk}#K0K}vi&MJ)iBmVz`k1`KP)xk!Jy zx1Np*z-;pR5U<*oC;?VMV+6DHLEH(aFK&M;Xr?Z2TD-^f|(=SL*#?9IpT`l4Bp0{ZdJ zW+lhjF;PZ+1RBXXq>pPj=ZldnmF4$u-y|Zqujyt1`etUX^->%BB?V-=Mxsj@&$03O z^DivWrmL+^yG)`FDz5eS*HKRsst<4P$OcsR+M=KB?NC+IAK_#A&p&=Vsw(Q!zdy8n+tc1eMGNHzPm*Q&kFQkHia5Cb zDniODO*O^MTE?(WpBo6Ahp$XV!OH_w#j|ItkYaNBv_u5WHu+Eug;>7$>5*#!?N}?h z>arVK1Fnp4bcyPE-z_Qe)7HY3>E?keSU6Nay6UGgMiS%5;Vok7KrYoMx>inQ*zGy0 zqrnO6Vypj|*0aV*@V$BRZ|diyy4!Fp#0_mVL;Ki6}B9rJf{U0umD6H2<04M$JJcP&9*H8RkM(`g^4B2}*&fzKZ>H#ISu?c~;5zEEA z&oR4(h$v-4cl)}oyHZn^Jw97qRwgC_2oTJSDbWv1qhN_%geZ6S^2cX;I?cs}4>GrP zOW4{za6!oQs^#}_aQ#Vt$_tpjXAcblq=x7r)I%h%EwCRWj095B(}mXZ&HYD;r?#1! znei4RKbVFy#FHnJ_ov@Wuw!5fMKT;f31cuJTN?h%yAFk<`>h-wNY(^Xh&l7sARdNT z*hw?o=DmxJiRr*|Dr5R#8y!|9BQ`S%0PuWq-Lge$*vb7P z2(t3Mdx-r>0KQZ<6VY{rm#%Qw99LKL6Alo%>~+xXXg70QH|cix z`9ql`mzW1=F?cda1s+#hQa688aAvM zf>-U3Zpe^sfAt)5o2Fpv)CU|~1I~tWhi$h!&821P0Z$)75uFU3V3p6=hgI^y7-{X| z#5#0Hf8Trce8Ea1z|U{VnoeA4?8c8D!fh)r&pvkSCc<(+i(Boh^i#|@E|@f9N3w|h z;$U9Z?rl{*9A#+aImYfSM*}Eb^RJNRDga#-Tm7|I3h+cYwNLg|QcA`iiGB$s1zQ-Z znc~*V(|H2a-_Lk`ApuShtZipJ4$YB#ke+M|kgJK?~htmY4 zoL;?tu|l{;o*1ItcZTI5r(HiPiFbt>PsQ(h=CTInM=G1WE>HUFae{-(#}6OSnAz(l zNAD4f_^DcMM9BEJ3RHuK?{Y6*?z;PtAMiV@z4mrkjhmg0;@k_Nqv2lACF$r09T19d zl9kwOfo2%CAwb}RL-t86FNkO`Z8ksq?B}KJEg+FhH^60~5RRN4Z|GIpIb~kU*_=2% z1Y9UNA}b@f2vcxAZSfFZN(e1N&f$|Mlvc0bzkeL}>A`YcMY#FeNzCCHdk~k4r_TOK zT3+%xA)hZ=ARYb9|WRi zyv;sMSz<_1B7~Wzej_Zk&4czufKhYKF%-tLN+3%6`nF&0k3Q@>^)+HWZS6}pZ#o9* zE!ktg)g||OLUGObx}5l-LMF0M=i?7S^F$%c_r*;ERCY9EKPsQ?4zIQkn!dHK$NE2Q zi}sC>YRU(AW}TT+I#63sbAw7UrW&4hL$GwNhhlYcJs`C4$dOo{%?i9&n|L2l0?LaZ z+7RxJ+nhPvwo8v5g;3Psqk*u(858CGh2axKY3_UH6 
zdqgdMeEfq_SID)+H*9MheWTi_74C>SE<}z0Hptw39c>N0%jN6L(iua;)MsnpQgDlV zZYn^NL=Jg7tlVEwyz0dzm|M}~fAXdX_&MwwwF)6ph4UPpaD(Yy*i6iyX_LaFV$Kg* z+4$mWef_*qyEicI5C-{2unMFXT6}+`k&$$d9@SM5|RU#-02NtaA??a;Q(z4!OsT1Qt>X ze{Y=0R0p2G>7}d@Mw9-tnqJuJOk>Pcqc%l53Es`C)uRP3Jc+Jddk!Dbb9g>Cy~%ub zC>UlJ$O*4=l}0V<)5J!P&%2@)kxyIf;eY&>8ak-U4jWY1fJc{66T3JUhDZs!pM{h=lRJXr@-i$BF% z2JRWTs1xkMEacn&$K9L%bG5dA|8GM`NR%W=DoH|;R1&f!2}zO=Qb`j&zFf#IlTN1p`BZ12<{s=-7>xIr@(Q z^M?|r{qq*ZH8g@?G?*|USlq0(n|jN z-q<_U=`v-@qj%GqFC>X!oZ&+&jxs99iLVi>3J77<*Q882;xD;2FmU~A*}jMoSL6Iz3y%2=z(Z2)D@aSG(s4jjGnyC`er^TA%zk}CjM&l=R&%`$u-)7 z>q_1@!EO*}2g<{`j~(wspm_7`oj@EoM&RkZI+UfLb}6pL_eu89uji9EJ3ru)2yh{; zd6RX^zMSDMO~hKvo_;;<{G-&;gZuXz2@r+!XK|)mjvL1`+;0=cp1AJBn_y#eD_Q0& zlh=_V-zzH{IAzlh{`3*In%O7sja$I%5^#!?aT1GbiW)!B=sic*A^aW?ym8|~*bvXg z&tei2BID~-^CuQ4A}PsT@r*2-4!RlwrCS2lczSs7gJ6D$CQ&TRoA>0rY-#Nu-rnB* z&(6_8uz0Jaq+U_Nyy^Q1c#MGpli|)iv*eQO6@Igp z`k#y#cYDXLVD*AM`f0K2{;g~|x9&Q$HYXRVU;zG01+6IWKd(3Qea;Jf`|jO;AFb=B z6jL>NKh<3t%8K)Sh#H-xpD5fhHoYw6H2~3!k+#>C5Gm}K4U*QL8J3b>R#%;3F)ZPJ zxz40tRh$_VVUsIAf9G)Ek8p4=e|ljlhb>4Y5z{6nm*V%Tv^X;Xkd;pl1rHrOkTKa~ z>Fy9HtwIw|A9RHbK*Xb7C#(B~_Chnv7b2z&08)9oapW9kTo85e_!UnvYkdN8vrjf* z=JdNsZq_N(XvqAiA0KaMq>$dmtFv{hFt7{$7W@oBAx`^dzH=46O+FSQLD?ii1|r;L z1PAjUD-gP4*&RZV2YqIVom=Og{-LvZ!-W-4Yr1j}eUwqs4|ahCdBH8DQUIfY>a@HZ z+I!#ldleOdaD|R?j>zi)7M`M#hMfK%^~};-IcxXM{0V4)9P8&GY=IN6$(m2dz!e@Y z%~s3taq=zB6r|m4>y_BTjFJQ%AEWAI#Niy@?5Q63`VTe6ng4(ma6KtI_YG1i2b`Wem3a)pUtGa8d* zk7wg#;zcB18_gG>*nRr+(uCfxdo`GPEm+`~UYYl6=V*aQd{BMVDsZlghsWK1Vkj`{ikpkb*mG`SNJ3)wHLdpamN#?x=l884J9E*9&RsN^TyN9VHakjyeUd z$81c~btfHRxHZk~JiSNQj9?dzIE*&k^QB{>)HWYqU#sQt0jg0gu78qAiH=k7K}@#m zS8S-T14t0(9J`5YW;;z8E)3X=I4`S1RKVwSq0i-@jS; zT_{X2N*F+OXlQU`un4o9?UVO3TBRC}9ZQnSix)yTz(x0a4T0Y`VtTujn_J^=%wYr4 z+Wz_T&X|~->}(n=no7)Q);}n3xRquave>4%WXkd$66LM4~Tt>wpOWZRw&mQ_jgn|dz9p6ta zx!x?nZrqBeLR_bA!lA>5g}L7>ImHzGJsw_O4th7yl?mL*(!CLLhlhEJKsRWm|Xlmyz@{dP0dn49}%Y zu3B-ApFL}Ra=w&>Qd@hCx(6*_vEgnTB*QTi)+O1$0eYY$Wh>+JP@}-y3kc`vwzt(~ 
zwEn;YXhlndAWxPrH={KP@6H$2*Jtrnp+nq=-a=HzwuVC*t{QFKMR$4&@Lbvp6ga4$ zc>B?0BiWC$jh0%5S8U$A3Fw&UZ2K?k0Z@nJ;GS?pAfSnF z8b6)~f)R%vlg19bwWqACmaGkL64LARN&t6`8|-&v!vY=7)K5%4bO`Q`0Wv-Zv5|}& z+e<;=2+@fIv=DwD-6S1jt)!}Ysu6U$6 z0X_t*Sb?A&9Uec8IoWIh7u~huOealJ zE{{NlaOR9cF|)eMfG&@!$O*B5?}mn4jGD3AW(`XKcJgG?hK)niODWgJ!ZdyPfRGi3l6gsf~|p**6PLHBrntz zF(P|9&{oXiHdKL-hH}M$>I{%$G4uXlXv$&uNt`X~gf%%frF`$MT?G6&LSipnN{-+=Vqf==PKmqFt;g%C*~37Z*@tN> z@J&Xas=?ugNaEbZi+>!?twDo@08f!{x+P0awz~Z~i4ThE!}0WsZDy9b371x`;{}D5 z?kKxK>VMN0PJOz}hzPORo2!8?n`(#^qma0lGCbK+2i7)J9(e^4Ef0>^>Keeq z_jy0-{qyVi)Q1jjjOpRjrjHCd!)5Keo3HB3iF2j1HMp4eeg6K}$~OfNTYd6|s+%{5 z>+0HNRpRp8+3)YWNj%ajw9{r4^%+Qv>zp~170RoGbO;jgVJ9zM9A4K> zgzz5KF)w3&$xY;m5X0R8Bpy=#%X$tc_AYSDojW%?vg9Xma($XG>H{0c#R3Deo*qSV zzQ29*=IdXbGZ2aWc%}#-8KUw2(2Qf3q{l>qGd*X^%*pw5wY8wwa5+UKC$HAdd^xGc zdCE3gwz(E&MdDjNc%_C-7fxtC8Y_*|2gkEMnNB+7AMD?^uYz(A_77b_9okOFZir91 zYw9|B88(8OzLX9xCD0G>wzk3vBqg=HctL(CLT*r93yVUkT8QSs<{LpYs8QZEHli!A z)&lE4H3$0#>*)t}>f?nZ+p`AxR!d51k~0Ov=UHvqnCAvl)C5>#)EB(fNVo z4pW+*;Zn!+f#2s~#3}oc1_tB?P)rloSsJ9UV+~vq<~#6l)qw*AZ7Jf--my0ygj>Da z@>|tpXKzonAtx_C{Lr!BI|D5zKA-O_gJnzzablfdN5}hHH6#Uby!*Od1W`JUUos_A z(*Eefma|v#ubz?HnXe{S4CUVrC=U6Pc6$Lm*&N!$3`0gG`KO6iEEm ztvA$-dI+pWX4&U@;NdjUInHn{0*s$jX;m?YS8t7(tHtgUK+hnl# zks3xjPZT*j8DzQ>4R+(+DgSvV@tgq_$rVS>l1D8gDW4s&hJvcA)BKZd zQv)_0g_E%R@0Ta9I2^UWb-cc$@-?MUGcUB`^v{^4$Dl|(IqmNulL3S#AQ_vrw9mkS z&%#HYkIRj5&BWCUiOCs)`!6vjW;#Dc^#VoWqAR%UnfRWW2_|W;{gBe+)-7$e513IN zi|PrsLQu!~7`7HK27Mo;63eogwp$H}&TvH;LOqxk!$5~1_01H3Xu;B|1D4!7)Q@5; zy^^yEosQjct*t(Wsx~$@NF?yTvNv$2L88xCXliO(iESZvNM`iCg1PWQxJydXntY9jDPrX@JXO5mput?*TZ(W6GiHlJ9vVnuCd-Awzrb6s5i zJaE9(&F!{t4kRqEC1*F94FTCoyXvrZ?ImzuyMatBC~u;K1MGr?q!mHpfhB+~hM-Ht zLB3IwM5-LxobLsWU&<`Qw=O6`^CZYfI48fX&xchxVv|l+F?`rC9N$djrM3_RLu?nE zKTy3Fv1H=A1?EJ6P3`ES`})&*^ohMHZ-l~)@G!uU_GnCg`tadc-f>KsR9K+b+MQjW zJAQCn0j8&Qas~)v_`<^0ZJMe4R{8lXV>S7p@*#1y?6R~VB}0u<%4=rUgE;+at{hsM z|DA<$Z*lIKxCGl_;t_&QALBq@+2a`YfyHP_z=VW^G9E580J*Td=Y;phb?Elp$+X^* 
z5^GdJxXt|iO@S|1c*?yWumRv!<+Fl$9Cs!=VZNS<0W=9~5mJSRi0T5(3c#IS#;dW=cc{|5o_OW8pqr$MooaoBEUcs6nAg&^~H%2 zud;^4-_iHDob7i<$wl0zPRDmnUk)M=a_(w&b~e|@c2^5V!Gjv61Y`(;%+^OaOJvLD z&E1Gwnos=Fe7ZH}$E^nsR&xW&JID~oPJaXnIr4I5| zLA--g0(3`MaRNY8f4v@oC@Oi{iq%&(q9mgk_A?S59_aWa@C*(RF#0*Va~OTXn{xjA z*iTF8;N+kH#*Q7TTduCIZnIYd79=HcWN8-S~;E1{2jB-F|yZnFE4oiDId?_HcPquSlyH>zAy2qKTI*8NmS1zyB6( zu}XbmieY)XRVr44F&WK>)HpNIy`cFbxn7QNAg~@3u7aE@z}D6lFq;4oW+?D&czzQS zs@d6)*ijIm*yF>^sPY-3y6ZRl=Ye*KCKZSzF)^2<2s$PfA<9N1Dx9w{ySHDjMx{gI zse6{S7V@RPP@? z12l~bRiSA~b>aY_ryWxzCqhCXd=tJlcM_a-k1hjwjG+GK zc>tXdTSI4u6zn&K9ua)r$}q|YHN{vH6E8X@U9q9%xK#rY+o7OIe>U+!!}pE9phzKD z--B&5G>j2uaV%Lvll&;$kM3W2)Q=%DBK`@TT`)o$Vn2q)*@{QJ;GyA>n|E!)Mb2)r z?09b2IfT!}Yw_&y0&ua&%ZG~g*-&9fB#%yIk{qjhftTg!i(xHJTvMu$<%3=zCP80=?HKgKeV4CVR+aYc@Cf4n6=U>a@F?m-zX$1q@mn_w}7d6Z96RYID~}<;Y~yogx@K90H}2cxjMiXTjedkhwxBrZIrM=Bm1W-aIJyMNIR=aN zMU~X2&nyp*d`v~YN$>_Wn?K(fA+X3-!HiKBr?x(hyC2oKo*XT%o885B*aMvIQu1os zpBhe@JzmxSuap!bzFx9Mgq)n(mHPI_9xBH6I$G_#Pa&ewG_ec{>pa06;H9&AsxIw) z#p4$b;IMhPyI>;b(+ONCCOQx?P{DkrYvh^#2^yWJWcWa$!Ssx4rZE1S8BHp&7s2~^ zX2Kkcge{ZijX%}X!QLK~=35jKPA@HvFduT~OGt@mUSvfx0vu7Rmo)G|O~vN5A3uNI;Ct?aNy32x zX&KRV8)&8|b4v0D)8fP3J7@g)-9Ra(ue$n=8%eV{6oQ9l#N`T(HrS2mE5ks{sds4- zew7ik#E9{b1N)%zF{3d+;)wCm2gv)|!cX7ML{&zZl$@;WgNq6j{LILEyVie+RMjR5 z%d*!}($WkGq_f^kfH=FxspyC8(51>#)IaTayE!*ejI~X$(^tpZ_KgBN?%?2cY56?T z1l<3s-zW#mWty@&obYg71H#CgHfH*J%$=+czFcJ_zjs z;wCNGB5FDIcy_Fkf9w5>I)r4Xh6{B#fN1wS&HKKRzY&!G_um1$T)gvqKS~jDkE-rT zzI^*Ok27|z#sqAcu4lb!5k&%u9X&TAH_FG2c8d}%hgIP@^vlw9ADg_gVs5qN!ti*(b7TF@8G2zNGT&%msCfL=YWP0cLXh?z` zT4qqw4h#g?0@822*MGUY?^fbakb;LYx|5z%y)@(*FaVw0@%fVF%uW2C7i=0v%KL$|Af^54WV8c&YAZ* zng*UcT3-5?MARCp3Ei?t4R{IB8=L=Zn{BMTBv(*OL34t_g6!ws-vacBk}Cb(<|jww zPW|MaB~ReZ{lhc*p5qh(`^{(_8x#-eo)`vC?`u$n4yb79pg0WVwSral4 zV1|HsD#d@AzHp0~o$~QU&%enSiHYeMFlK!$;c5w1;Igi>pmd+%F&2M_Msx9x}AsmVeL z!=Xd`S>_N>O2w|UtnD@C^F>I9cJwQ1(5akjzx$4pRABjw(~Bsom6eo`Vn_qWhYud} z`PlvB5eJbFREMCtGu?=XJ$@~U-|a1i|%CLFybS|On;lq*+a*R 
zwg;JxlT!t+r(lk5`ts~~2c0W0EyD*KyJ0K@zZ|j*cK=z^t~sosNTu3z=*r`xrNtOc zoXBy5m+Td%sADU?6n-7u`m--dY%u^=-#BAgW0~B0D4XD?4Q=X#-ogrH?WnrG6oe=* z@6b(O)CWz(XuD+}1z>KeC58o15}{;D_fk4Xbl|lYmMXi8f1Ei=PcMIC$2r1)GuDOF zIdyWl3CY^v9q6@R-Hr>m*hw+X5fnbL1}dEH*l651fGvdj{zbt*kr)uCK(pIr zTNqNpgP!jgVJ&m+$CAK52_qypl;ulZK&g$s6?e`hIG6u~2 zYu6a$8XXN%i19}7#iuyvg2utjFa5$G2E1uMkojLhWZu0tW0WaIM*F`f!<%$= zV#Eb=OZ+J?XiRfcN;izlrK>%EM)!V2#lGh3Z$IRkBdNZ}Qgzj|&W;Ek$z6W*bH%Dv zgE>+8#|d-;K$=V*Khe6->PXA*O)mIA!s-b7^UYB$4bqN(a-`!63MN2TP^X|Wl8Q7H z8PM4@S0B)y)B0M`h~>V!CM3zJO6m4{VY=^*#dGCxYEGFE*5bPTPPr5YIqA;-_3QdW zG2bzwEfHdhODy!I6J&4udFO1gm&#J{GpYzEs!;Sd){NsHjw=PK@7lG+Mj}haT1zC7 z?GZg(W0;bY4)fGidNl`E1XyZ`WvLjeX>NE}<~Mr5KfV!heY{nT$8WRiF-*zLO_U(Z zk62rzinfW_*o;n@t}YW>{rR|3PjA(Icf6O&`58su)QlGssSXg!IksfB@(WWZokCx+ zNO7f}w#Td9ELQpyvFf0QKI-W}zO1b!Ar}0NISAR2%BM>~is|%S`@m>`se-DML4tNj zMy!?C#6f}TRRNk4Tie`CsTBZg^46SncS;2&<(}EHis#sYE~N-B;F0$9Pm;5- zvC!UTCANBkw<>=kwuh>e^GL-(a(C};u@+B|RU4S$Jo19t{_ZzSf)~b6vwtG;5FK^V zg|k|)cCCzgN=kC_)kS3oj?DS9$dIS8Uphw8ee|j7?ip!(n|PF1jAU?U_qrX&oZLSw zw%#h~KmCGrfSAZ=Ky$EWocqeuyg?#Sspav2=~E8RZ=qU+h_X2OP%yM;$B&W2lfThN zstR+dwVc3mW1ZN&J*@2Z>(4BH2EUCkGxJCkOJe?&)r9TkO@e?9#4YUg$YuK+D$ZDC zyJn(4n}Kr_P)kMzc>tvjBDrqD#>sTzi1>bEWh4KOexh&+>Zz2WVxxTJ zk{|2BUNlKpR}*C!X!_!V!m=81k;gUJCFSCg;V~Ei zOWnw5nS)~_-1IK5l|SRvBT4RnUW9o#pRa!Nj;-R^=eC}`F12u>?aau`xqRiGeJXr>`Sr*P=adZPzHe-<9eB{*KR{fg_uF?x#S-dRVgAl!n~M-_-MZ!G z*1yzkXZBE54qO|x>8C|zI1B&k%|B$2Z&`B-*m+r>4HbFs10dJ9f!spWoJRRN-^=)oXmp zHJmc%@7EE5bocGYtIjKSEo|zF(WmE2n<`jZ(jj>J%OvU(J9Em#aP_dZchb83^xP%a zu6@xU7C&MeYhzK-op{Z@H!cT#ygx}Zk++(qC?~c2vFObksp{$j9UX`FIS3R44t9!WU28Q(T9r^iscT}2|x1sls^&dZqB1A76Ewv@4P2(Ns)!8)YL%sCG zK@)?X4%Utb*x$q!#{3&n-;5xFCQa(<7*KJZm;xhBP0_u6Ki!Sh$bHO^_A|85iinaD ziDu8%eA9L+YemMPoJ&&5%I9pR{<_}N_*sT$jKq0c2 zuH*;6f@6IziXh;OkkK;A+V=E%6lymOcUfy;Zffi3Wt1T#?)m})6=ok$lc}AaaY_^; zsnfT)y4NX}+jk;THi@hBl2X|I!rMtlbHadEtFtdjrNzwFi+UT7eNH@Wid0mDt)zlN zMeeFHm4O3UNv}R1m^fC{{H(u+2ZxZa-ISr}2L>Iyd*E;1!Gqm$75t4W=4^LNmU18M 
zq+{`yY*?6ELH_G&)*qEeT>8@}W=PtjM<4Hvw>Z9)YM`G|`n0JY9%a4dL~CHA@Ms$9G51Y9=FVkp2j;&Xp&4Iv zU*YY$`2qJug?BQJ>si@*czcWDM@ZzT_le6>SAWsC*D_tq(=+U>*XWFBH?5KfO3@<} zJ)L8ANv>KY^6zj~>80(kJ!fHr-HOnKfq^p$^YYHLJhV*xzQE4=d_$RmWtNnbMD&8S z6`IWtr7t`^uvl6`;^4%7lpxOY_uJJMSlrFA%QrS&enltv{fN4{r|s`TYtCNH&k5;r z(>s95rLq&)=6c7QWI*aQP@krI&byta@w*;4E8!6|ES&^R(6e8wUj7;t|MRS2+ySYU z7Vnh`Q+I4VJjliMM$*{AMoTxCD69#$;cmKbq2qhuw80V1Pn7!Ymcq*G zpF$BwoCinyJce~ywqgYX zmt_LHVHP#mD!2O0~MR%zqjA<^FCknY@o~E2kzd!EwgnSe! zL2`>&CpGyWoARo+2fBLhe)L(Qr{qy5%6AhJ4w2Op`t}nOWxKt2D{HJ4Xj<`D#%{Rj z1nL`|gnj$Pt%gv`>>9xLt@~i%tR%AV*V(1s({~I7RY-_PL?IL*_UC~cG0K|p5$DHl z{4vJ%_{TDXz*Y@W;m&>)*QfMVxv_W3Rtr%o8eE?TUnZ2SmkKvcPxYUp~lFm9c%tncUrL$*~Jt-)ORqmcd9Rc%X{l_0k z2M;3e$R|FI7eKl}$G1_sa2iL>T(xD*P}H}sfj-k)_P?A~4YT6~AZuU{`-ifi>z zH)TdV1)8|Gs61^r>q8qOl3D&Fqs~GeBL(&PWbSU~#H=GTu;JGFb}jIbV+-yS?wbJL?63kbb;&+?m@z45$! zVE_t9gK3&kPPmtc&+K{gMykZthqX@D8d=2@hW`Btsz^9^1({PdQ;Z1Fct27y{?4mM z#f?Yy9-N^nD!e77ZNFvnuZV%;y~{NW4YxQ>Sze=j^!Ln7i%D zmCzYI-E<@G+JApPK0A9$!lTR|>R|I(9NQIDKb~*!c0^-;VM&{8e+aBc=$6a794sNRh}mWnq6T*Y3UdhK*8%RWQdsa?4qY?kCz7qiD@fq&)hmaev93TJNX;F zN%`ii)9Rz9T6jCIyMBrL{9@&-(~hewC#r#o4&RhG_xCoucQ8ZJ|6Z7uaVF}4b74^6 z8i~1c=|SR044E?cufJZzI~Lv$-x)hU?~v+%h#k^S8Qwl(uC5kZ7yItEvT}D>-)kPwQq@Z&CH3*vh9J$_xtLIZkf5S`s0w=ae9ci%ZPUE1Y>mE`CJ! zDn~W>BhBFssrk=!<{z76iUtQoyI0W5J`p14*+UF^4^!f35ot~#Pp7c1VECtl2Q56* z+gf{@n2c^bHdO0qNThL$q=fXZ%_(>M%%Auleb;PN5wi4MC?6J)1j=ymMC-0uOl{$R ztn6Ws=A;sEL12I+Zt}KiLr(iONg86bn5nYQ)p$=ihyIBk1zR z0aEf?9g>oIwcPsCj)rM%UGP2)M?IxGhmtY1OixN;PolhRx_fp<@OV^R*e{7OBSCg) zhJ>rmi~EBt5FJD>2qq)p?Wu`Yo^ffq`2t|;dWwm3 zMvCe8@BFr^#mswIFGu^5?ME`|j;*bh$rFg~^u& zu1tJz#)^^C@C&FlR|Ev0{VUnE0oS=I1z+-Rg@i3}Uo936k4wa)cxYDrf@6d5$+=5K z&YdffEI(w(=$VgY9vq;9LS;E(N6G6~g$3cPuz};nW7}W-X%y{sF;mgQ-8~NqgnOf2 zI@9S;AchUJkGr`$u z>aYh2gl6g9C=4d?hVL(lF>9kQ`Kp|f^{r;R;JH8SM#VVUq58EI_*2-z)4(P{iHml_rsILV-J}Ap(q~ZgC3DP z_q|?EE7_!|BK4zDX5x^;tv6?{@>Ug$+a4a#X9_udP(ATcJNKwtD!v+CU1j00=b*LG z)MfTs23p8ajgsOL`>$W$OsrofE&;mRA=ZgOSt<@kBokzc9fMpe4h>I)xFB-4+fO$-@YX! 
zf6dd@18t}t4y3cHA(WbmG8y88|A&H9UpLP5F#&l1X`aM<)hVaC#fI1 z#;;+FwJ@ZoWs(C|p}=Tfwo}I>I2#og1S!&Ie6r|3DG(9C;(JtcKi$i6-%2t+^A^b)N`@-P#Rr-ah1a=9qD6y8 zLed6RbK6@Xgfj6z?DZOi4N(OLkgqdG1?z*0E=es01d;$HBZcaprLjZPHkXkbY&&g? zzC{vwX32e-xp)ZGZNF(M$`GDc1OgGEX>igrvIZ5@*@Di05MrY>tgEXFo;ax3Fv21b zW5f6F>$6;8Ts))###$*JD}&6$^3}OVyyf$QhaI{Jkwg_XTI5Mv2IYh-9r^vcsE7Qq ze;vHc3~?1Au}Q(~gX#qqk#Yi)3Xyyt+uA(H&qb?<0NQ=-+>@;mxa2~|poT=m(5kaf z|Lh6vZ8lSDhwS^RC?lzFb|l17?C6}ndCC2|z2TGiVi1piQjffT2-eVL>@D(UBDZ6` zwBU>KMM{DtNVvea7rrz#B_%z1`t+^v#_^QnZr$RX9=4yI)$4w2uObM0G^t4bsvezP zM&-nekcPk`zYNfVAf90oMy*PgYZvV!YEDL@%|~I|^*By{7~{oHpSJ(zoaKsB>I-Zf z@jYtB^Cmy|2OiLU{AA`V3x1->VV6i_FV1edJ~zV!pn_U3e6Zx_TL5av(*E%?5`KL$ zjx^Y6ZZQ*KVv@EG!N)!vCIS{@X%h``YVI@%$ zgbE`w7$*n6S>7#~ve>sF z+%4EE5eiLk4=VyXu#?ZU`pPEi%{#{?Oiw~e;Bu%-gCB}HZg=Hn#j8rHr7?MOWX2OIkAxolI@@Q#=j{?2Q^$79H zNO_42`+T}LK2U^&HQ!P-S3M#ejV&$&0U(Oo+FOXbFw6e?{K~_!vUiBgSuuwM=A@=1 z!Fb%fq=U^}{`=;BdU?nj;|S*ywjM!YmG!Jgp!(S}XX?%^nn4@|Hfs0lk?g=>VI;d= zs3QjF=Hl~MUvTQ8d__6Hz@)bD@6kuFb+9;5AU$+V&nCF2|9}C+dH<1K2`N5EU7cgA zjS!Y4SR3o=3J~t*#oVFzg0%4Poi*dvK`8`%4C*E_cDg{T(Bm7YAwaGKbd7@zY`4Ak5E7QR+Eb>`8x zO3kxrp0khLJ2)nttXQli(6{dx?}rJ+HFLCq!S&UzcY5eAz{Y`V%$3BzCjvt?58ss~D`W;kdAp^-Bk(N`1851kyLq2`V=u%rD zdF2g7$=kOZ-aQ)a8O5X#m@a*ZH6bB$GF+GoFkvu7jjjKyf1WW*g)iF-7nIC%atW%d zVkqJI$0r`PpdQA@vf=&?n!v8_-i&sQ>?EvJ^w#fLI()l()}ySdVsArYvOV-yttl6TYuCb!bMYR^psQGr~Z3D6^CJc0)OI#5)cLQg{K~@~e7HnRwnK)}vfwF73=(a;4BEq!3 zR$CVSZS13@g*k3G`Tg5>>u zuAI9f^UR-Lgg?(@WaPnu?B~ePSKX3QQnHfblcg#aOL5*}XW$E_GDga2ZI9nW*s}BU zJ@$^d5XuBjrskTXjkC-8dg@Atl}efC-XU5dHXC{ygXFIKl861UdGn^CfB?5%vxX^6 zTs`4M=&V*a#C?_I=*MSxch`yPfF9!1{k@gBJ}jW_JFmoQ=DkPV6XJg77MVb)PKpmb z@Aw%BD%)c>ybTdH%yMqgVww!!9eeZbg=2_N(*}+p^JJ;<)AMgG5EgYQJDU*?nraq( zt`Qjd;jZ)IcXGDrl_WDSshxw@2G2c;&5GT}M-hB|cw_vk!(yCrN%oHxI}@hJJ?34K zu23B~dzR;}f&pr3I9~FXrx(358G~t3V{groB#+`7H%Nwxeu;>LT1KvC&&v81tGqKv zFnIF5E7K{;6?Q1AtDkN?x=(+HRchP&XLD;(3~1U2M&GR9EiEm5!Na4xuZ9LenF#3R zMVOOEjq0_?!Zw*H8k{xJC&@e)=8qtSZTi=7!;i-~Iejh;44!?srabC$<6naRvEUa0 
z(DWwU=1yj(kHV&&!JJ&Q+ACJP#{x(+5~AUD#@D~zcjuMmHmO>)DKz1%H*zLOk0p<| zVx5QW+`ejRc6T%8E?CgbNwxZl$Dd+DR`AP0e|!%D9*Pq~+>E!A%eWH$Y-WaaN_k{9|Vo;l+6X~4s3<9S_8K$s;NEAlNX9Y$_jIGu2J5#ynp|bR2Q=;63&bKWL7M% zF#WaiE&YgkY07!kmNl+@Niz(4IqMliuK|Kr`TOIq-h6-iS6j#8#pmZ{xgQUkUiGn~ zs|(MhN%HQM`q2)YCrrxWzo%C&#(ahtJIl3f@%hSPrL?ig6S>D1>y#7~Kay(nrDG#X zDq~OMw6XX+9vRT1G2Ib=eu;0+wQJYN5dBi`^}eBKu1=e2pnB*aY8YmPptb9qH_HB! z*PrJmqh+t``-g29X#khJKK8VHO2b+au+?@_pM@LSDQM*{Op(z2_}M7yY|Xm-+oLuF z4LSH58DN}^*)nOa*rrnJcMiqFj~yG0pI_?Q5Y$l3XGMwCxkveT7RIq<Ed@ngajGbWKG@JH4GG`>Nly%ZeVeX5~=5(7nApPJ{%~;x)I5 z|I5#B6G~!UDT>a}Klm&pP|=vPTj#T831pN0U%LK-;w<80@&44$(mZ?0GjJB7JhL?;&I zLk*jiID#t1A~WIlN8YMgdz0E0lL_-yu&I^2jG*?^j%Nle++xeu4ytlue%iwIgc~Ge zb?9@==b!wGklK9n5$_91M*sWoyYnwAF8|L=6kZzh`+K))4tm2|bQt*tF&$))uxgl# za=Cr;y(5zm;-TKD= zy^xz1m+ibDDD#0IF;KMDoE@CA2*8^pzQ+V3@7OVoa~{&PE&P8MBlEz^;7m_~K1}hm zawbwd`g(f{_{MSub*q|hynOulYbsRKcf?Gz|GyvkOixnG_zd{b*`P%hL?|#F&0hN# z50a#lYn!_Z3VQ$VDOf&zcXJ#L2_|A-Ra7=WOIXnHW*{@>^RZFMNo^}_Z0WN2FI7eH zzgBVia-^=IW*TEUnJYjB#C#Icx)~1rhgKh)z274K_W^9fc(pJDl2H`?>qlsk{nrw~ zKTNU0S@urq?*9ff*w%b|`9YD22@5=o-zr$EX};#(#I$feD&2wgBYkMh+JD{XD!*<_ zBFiYVIxmr2gqkdI%V0yfp%WXLFOlAa2P?cZGI#=-DBT2KRhxVwVXey&3K$cw#m@IW z!sw{RanCQ3z156)$^J!N3x&wC$CyTK!Gpu=#@ZMB5R#i@Vo=uwMrpGjy%2vq%T zfJL5qgvCcV+P4!AfpPQ7{a!)($I3?P>5=$$l4NVIi4!J}45W7NarA|yXqB0bmX`?a z@^Ah1>jeyw7v}%c+zP=T>p7eRA0lT4m!03|y{_cLmNrzNZSQIT6vs`Te7Lv)7Z0*_ zZ4;A6cp)n)vJWqyc~((XZGQREDxVbihr(VQXRRenGazZ;Q{W$3aZV@K9Gi)H^4z)A z3>GVy!EWV}yTfb-LQsUkg2C(o_&|N1A)aw=diVeNM~#f*S(i8>-y6KrtaS)WoT+`Q zn87p*GM#?*>r8xOs!=nOj4&x$*H7CWycCCWNr@jP!YRAnp$h7yp=Jc~*NjNnm_`$S z<_xneN`&Ce zZ-pT_29^B?NX8$7isb%uUDc?dE6-_BxMUIg8SU~{y6IC@CM%cGC^6d!eY>TEM_x1l z!rMI0wc?&qC5z89a1*;div6Znui8phn(moGTpqs3nwq)wOJu}ERGoNz4|iNP#i$P~jEnL#Ss38y`WfRGD&aH3reX2+cU={)G+wuybyD zwEKU${#Ne|VKvjhqO+!c#Pk^*HRX4wUu0(kpqI507l`BIM*G zsea^8M-cgu^Ukr=MmwJ0JaMf2(DEgq0@$U`RjVJKt1O_=wN4(mPZ;aWb$DdCPVwA2 z#?a6r^2M2Ll!B2j97b6X)HU;4E+bbduve~R!wH`lq#WrTo-*WEj|98-cLlJQ;oHMM 
zjZd=QLqsxqKR%qgvhtT;p)mH}xGoXb1N$aj!gwS?LcOyYl_Lp(pNU&ovDkvL>6GI0 z5JOcte8Q}Lhpt-;qiX*3^~;xD`0B?>(6I0g$JxM`kwYfu^{T5AGgj{PcdnV~I8`f- z^lXCX`i;F!R!%kdC|YWzh?%;4fO|i7n$6UXhoOD9G8CKv;o0zhua+`L4}SR6sp$0> zJh1N1lG<)I^SCp`VbB$im%}5i#vthxBDqHVyf91&%ws7i3Hyy~WLiOiITBtuzEl1{q3&Sypy;@}S1W`A$83}OFeWK~8 z8GlExVqID>FtKcoPG9;j8YR#Z9SN%=7W_)I{MdI`!K6(89%L9}bd-y!6-UfKl}M{} zm9#Z(7K6G(ZkLuP74#n5vKsw9k6SQQ5`m0vnU@k7(_Xc4cJ)C+JoiLI6rG-To_B+& zAKnONn-Rx4P$<-K0PV2X=9T=#>yFWm*&Di)bRrm!|BNSLjF2@Bmkcma9y75Q^*lPOx6u@FXdih0Tuq|A(_F?N*s zY>d;Dp?OfeU|#e-6V|*Tmp<*L(LR$amoNYO^x~+YL(_4|Ztng?ug_-n-~?0vkF~BK z{3<3K%-=c^8?J@~EZj#4jB30+{I>1fY4%2jA#Yu9SOtd=9m-KqS~^Xb39|(pH(<7v zSM=>I?t9B^>U>XK9sj@jrtzqi;s3w>r_-b_WrlK%UphE4w6(PPoFL^#B_DPq+~ls%OtI1u^6ROQzOcok`Em^_hdO$K{@#j z6V&(EYfl|eG5_rBUPaW+Aao>{jR8wRT){Iy_fSwSdhnVUTcibpCU0=jQz3N}OM<46 zk#oHz2Pui2v?ho$0@c5L{pywD8yO!TFC6%%cn^eZzTZPveMtbO^qa;Rg&8`8jB7Sm zyJ1SCS)&d$#TfN91VAi#{^i)|#Lg%37=&<`2GD$q21VTkqz|wNM%TVJ$^RxPjmnUdtbh00i0P z54P;&?Pj)vFg%yRG!~HmoHGrHABKDI^gFuBCezJw59%oW}_;=g6ATfkY z30SChJgFqsAl95OoVCZwNGIW7RGS#=FW|CSS;RdfS3!t)h6Q;>XYypG?FjSjK_K+? 
z?-v(K2pw{?oQ}43)a@m~qNyq&Z&abI#Wp*9Qs{bfY|H7&K*D#>l>Um7F?XcAmddrC3J$IZ*j%hp!r z(}VJI$5bcAV!fSorbbAtXjOcQyNf_JKThjs{QOaEw1DSDekK z#xT}u;bnzmipG&0THuTbS)+dnmu?r10}ki5K{sOwJfa2=qA1TgX5-&wTF1meqTe%f z{~-D9;Z?krPCArAqTo+s^bt}rA_S?t5UxuiWkA5+3@RSAd8@gtHQKupYrUGF0e;1{PN{L+E!AWkWTYpU3Rqj#jz6Sa&kIIRA>{vmxn=DULEl4 zw1^xo%jvI2SA;R(a3~fOYywRdwyD(=2w0|YCjSnq zXrM6!FSfAQB6N6ZM7~j75y66`o#KG^H%(4v=|g^iTxfuBW^7>xT)1#(z~N1?Aztfb zNM4;DuZ=&Y38V?Gh+bLes?}41(^UHO`9z@yYQWQ_SYwRUJ7H6!MPhixeKt9aU@%;v zW_9em)$LbTYi(#~ZZ@T1s^8T09xo`rj*BKW<@ff=09d~8O)g^=`1el>{}j7cYkSyi z$RG5aQ0q+g-PCnus3*>twJb&E)bcI~`Uw4t?%{;>9Csd|p|KJ;v^8*_eiljTQ>N6D zyFuGsp92w%_#v^3E!p6$Py5<;|Nh@oAFV**k$Q#y;1$a{$^S1IRsH|@pZ_3%Ya2AR zKfzP(2LH&4RHI0>l!>%rhdcQ&B({rW56~W@BFCLWW_Yr^sMeyxYdh1qEIJS}kq*1#Rv%a9!9x-OmoJdUN*dq-t_q+FK(V z*W=cNC9_GVAYZRMJ<=^CBUK169;+%>J29nv=WPAN+yMZc8`6Uc+KKKY_3m2sN&V>; z3OWPT`^Vn=O>XA3XKZ@JfLij;7&B?1Ss3&eS5jb=IPB?w5h>-H`Po_+qs+{(mv0Ff zplkpIW1fb2H8;4VwYt+~ot2dJFrk|?N;-YDtIw!iNFIfxK0p4BkZhN@R{BBh@DkrKhaFBjXXKeiTKN!up&SJ7%3Z^L zl3_fp3DLP=hJbLphHBWh@ii?)Y(rlIM?Z0%j>zW%LH++3M&x1uH(x0TxI27YK3$QRLTBazpm< zHnHZQ0wKM|_gTWITOvbZOcV7z?evC}ypG^1?U4uBH}9T7;7E|fc53|pS1QgBH!UdY2C79gc<~-8RJ6m z>tn(s0&&DkT5mHz+0f7{oGtt?X(#m&>(YdVf;l@YKq}~`hg|wk2|JEBr#pVaJPGI} z8YK}XU}h?iPworg2l+t)wr)b_X{OV{l;S?6h0SsM_vK3?LZ2ZhweGW=$b7(4$at0e z$>aA%k2Nr$Ityo4V{b>lHo!Dcvk(0yK^gf@I&GgmB~96Een|eyA|hpQM-Uk^PB2s+ zU~dB)Os9n>bA;+oh_pR0!`{x0QMgTCI`Fx}`jOqa=|1l<@CEm*gG3FGk$3N|qV)B> zu^XF&mIG}pvCRqtO&IG4goaz4IubmXw)JlEc!OcXwnv{=Hu@)bzJtSkWD+Rn2TnY! zz;b4bRR<{~R0aVAP9VZZNchlCOrb8Np(2Id(C{M=S5RQ!mL&_G4$>JqbRh>6kDNg) zB=>W9m^X_J|AjQ((R=Wl)mPY`g=_BJd&El;^nRcKV^xqY2mnO)Eh7RinzZ&`UIj76 zY=OfA{7lkwP!F{O_aLVYEZy~tN5->*OdY0S=IIjLi#5d;bOO#Uz4HI`b>?w3?(G^c zwFyasAsRGMl3FEMVv|}DB~;o;(S%BAFf`aZL!^*YhBgfwQDjb1GVLNHN!S#d%Fu|A zUSd1nhtE0ZJ?HZ|r$64;uC<=^tmpYX_wT;1`?{~IjTDbEh%0dey#$NkS{nRVjIhS} z{7f{KXU`S^yfb->+|;&yr?a#D2;HO)xNai^hf_!ySVer}zlhUeH$@kc6V{P#}_2uWU}3y%Yv9C16ouE=uzsxX_`EF4L6TYOu=_? 
z2wEYSIL45FqPb6(8W!%=pp8u%X`V%8dG6b7Pi-nvzm6&4T?lup-P1LIe&1d=@n zIZo#D+S-C(!y)>>>O_4X9?5F19-)UMMNb*6k1&QeAhqz&4Keq>!!1D`=jP4LiHSe2 zYQv{t9<6`$Ff1qDE=;r1qR-}56#jxQLR#;d;}kXt z#{%RzO{=23gRo1qK~;ZWZmvIN87-8DlZNDf<_+K92E^)+UC=S{dOkkky()kHG-fR&+IsHkUJ=F_tE@l$raDZ zdS<~6Ac%2D=4ak!_yNaqM#0-jM^=9fO19gPnu_KU)eT|#KB*IuAptdq0NZw#*0Xyc zN#?~-y|9~Kyb?a`YzlL_QUj~XH94?dffHJNXLaLl=L=fFwm$8WSw#*oZQM3+uTxu`l} zkn&8kKB?R>Xo6#sUr>;N7BaCBiH|wQ0530t%i#erGi%b<`qE7lJN)oeao@m5BN&h% zug+kijoL%F@qPNxQXPUM zIT;#2{PpXA$NYX(cz85p`6#S_1k?}Ny{i0L6e4h#Bb1kwfw*GXMdA4B1?;tw4r!|+h*S9+47pV;Ae z-QcOuFULI|fvBro3Sx_6tA^)*4qA@G?&vF!KC74#Z+aifUTf^qiUm^EJGmKtY~?*Lr38_CzZoL zg!)tV5%>`F0qz3H8B&TNA^oMkQ#C{soW6}8y1MW$Jbk13BG^Zaz##Z@gPgNoh(6 z6eetC(HVogh?hG^skD^gP5^)z=`9u}?b{NCZHv4^{~y}4-?=&DYM`{9f~+|=&*xN{ z|GQPom+Lmj_wB3O07CC=WhJGQ6B`>#5xxn_c<%ND&|enUgJ|EdFw@iAs#sV~{D&Vfa=n(^&zgzuexzXNAa>}>gvI0Sf0t0898Ce)5#3LS2g3)2->$Wvg?6H%OX!ge({Njsw zVahggnFC*`-6Y^hoT7^V_|wlUVP7F?zetv!`z+gI-D?*-S0VGyVl5)>1cMy(7UG{{tbb@YU7jUl;y zH(rDbHm+nd#8cwdtFev0KDd3oHS&=&Sjtn0D$zoa__}`mUkDEg*iu{LUU-CIJ zCLOkb!w6@D*OUAuSB3I??AWQP_eu^u$ZfN1@8N6ZDIGOr70%8D_iL} zA~j6?ufT$1^KVVqFymk`^QUYSm@P9vRFCiJm$KkPPTX>d(EXOwUEjMcKL;$OZ0eB- z_fb(sI$c~^?0gJiZs)JU6G;t4I{Jka6J9m>H8eAU!jL&oU#djpC|^ppn0cu4aL=`z z)$UQ=s<6#yEk9$A?RCw@r2#TVlEC^Mpi|bSBI(m?8TQeted2xEXNctp;pAFuVbPnMY7kt<9M)q9aV$k5hHU4K5vi2RqX^tX^VvABJJ|UsHz={L?GIy?0Y#sD3 zLMo&2xWjDPw8{83p&s5Gdw|8>U_ZaI`q21xdf$-jra1p}NdeJ0yoS3w!JqvgfCdBv z5b9`Ews!2on5?OZO#m+m^3-@+D>`Z(9&N8*|9wiEy6+4ArquqhxbrazFiln1XdR%j z1b2nlE66XSGYX*&)GSiR5wT*ximijN5+E*VXfB!M%c&#l-`$%UpA?2gM3F;!1b!yP zUcY{Qj|LavK@GBlzH$!g5WFq?Yn@178eF}+y{|0JpI?4saejjF5g^T~S2BRZl~q-L zUBw(<>MHK=$?tVz5finZ6bc5GyTK4$WHm z{{3g!*&)xyV4vVy1G$q1=-am^&z^O(vm*oOu}+INuCej)P6vAQJ|b1NV>quA;y_89 z6?qCj`t;t( zE&_kVoJ!RN*u&XbMM-~e<;RYWFs@KoSP=|vbZ`4?1bAYPqgO#LMW`Y<1#>H{UlE!9 z^LE&ABz+{%Oy_N|uBUi|6A4B|P{B3~;`q)P!;{J}(VP@^q^{)51d%3g9w_CJyu)(@ z_|!QpC)-2E+oLEPnhRKu=cO2OEXGcZ;HKall#|1FE&Tx+c#y9)#E(G{nZ_W_W{VeN 
zsK}6()l>cRtNY5!qmj>(#9j(rk6NVcFhd+SuxQB5#S(_4_yZ5FD&{0IX*m11V_WpO z=^KqSpN19`X_aS9ZAp|&b&f~c8VZ|_p#3*<9Iv_}> z>v^V=s*+5MNTu;t0l7s>ixlZN%b{3&!t?xt!jOeNv!ApwS5i9){dQnRAOh|TX;e(t zlL#Dmq`l}1Leu<;r6}isj6W~p;xh?wf%%v~+Tq-_lP5!8+}x>=c2WvtjX5>^)1XAz z6yn_S53lDQ&BECV_qn{hrN}%pw}Hj3lov&|B;CajZt#b_As05JbJdlJ%4k`eehJaDH6h`loPBJ zka%9b!uR+D33A0iNpRc&Ggqon*A^ft?P&;bZDeTp89melF{#xhX6RTlKEq3(RN*%d zOCD7-OApv@t$B=Ta|EaG4G_fEs4}1K$#J^gnXKZ~-izNUH zb*34grk8UR!keMeyoBsw}8fycta2oDc+r6maBvcW;i z1qEe}VcMB7CCyt;_d>X|1qvJ-e6)Vi;s;!FJX7ib({9wLR=^FksYe?axH0Qd?R=2@ z?!bDQT;QJ=NG3?za|5tlrQDV@`XSy*42_?@3*E!4yJc-wI#0GrSXse|Kjqx)rwOqZ-99sO=p*HIac;m@+)hTc|jGs$tR4?vkVf>XAzl{MZR3?%M8$QH4(Q zcJuSoQ_}yk&G1W=%W| zuZoWHJa82LMvw(KE*8_sgdc`DppCV)rN!+`E1a^ff>Qc6JdlKg?e(eT29wN}>WZd86N%~&%cf67KL?OpG?uW75VS~YIYAeb)w3ozujiv$vPo>jUSqJN`?!g24X!JP$kbB|3FRK&_+q53Trl$yI z@hD0NW`W=n&c7@zsAZgtP&f{kZ^NB;Uo08cc|XP-N`8gxT)s5&YWEy}fEf(Y1uh{< zB8N@|=3;noRQR{LxNtM*aXEjr&Wx+`H)ntX17%{yD0T$kyW(QHYljXoPKp*`^y<|$ z&aHD6w&(Q7Si-Ng-a%8m+aBca|3{bLB7A5S`#g>9`DgQ~FE3vF8+8T-nrEHLcqS_& z_5CMRA-;FdAO9So@l1BI+Ux;Sf4Q026eo37!C|1+hC5M}T4P4Nm3YmSww23x(>7#} zq3|m9DjJe|oT{WQAAk)gL}mBk!`FQiG5$7Y%zGquGlWCj7Ca14(#JBOx}S7V2H6{2 zyva{$E53a{43#MZH}iX@$w>W4DCRO|Eo-)aHu(n4SsNRfjHba-nlh?dUK_CLWBRLe zh?&*Do%wZ@tp(_!Ql-)9C752iv=HL-ywln2fy6&Av)_Rvd8A$MCEaQ5Bu85DAc?vu zRgYN-oF(s`dbJcY0-zQmG%UnJ9`={Tp^!m!b!a6T}F54OK5xU(6D- zs!YhPT3V=K*#x07_o;;M44O*9!Vg166)!L*pecE$S)u&lOHxa^0fJS*kyVLm^rd(y zBMMZJgv(ziP1e>%)!-R~YMixx_>8#tEp-uqOLKEf7TKLn=jByYR*oJ&K8lUacb3su zvDj_S98Br*wOr*7_fHO&NeHcPOMtYwKI z&k_K7Oo-NCJ`9$^ue-*6|EjT(785g8!l)yco-N0HpX-5YGLD+zHvFD)f~qVvwMMmd z9pMZ=3gPgv{U@b0W$yLZt`|is3uPu}@?E_Y5-y=rSl&2QB8iNSrU;4vzpE>(`hK72 zbxx&i$J06%L{k$7p|;Lc!1&aR48dY1XZ~F2$ccO;g@-Y^UDyd%;#;G|>DgqSZvZAVV0 zo%1GgziS?s27z1#-DfJ6k!CYM{Hm!5*Bl^^uC_-A<1Mbr4Bohy89wR?C0U{wl!xYj zVU4k*^~~?%BDuO$;U~jnWETuT#X2P94gGy|f4jT8S)>roC~`4A_&{<%#>a$=uH#$g zYiFkD?JuJ=I9&^qgy>e7m6f&N#GAIZ?7lts*rYtCe*v|=X!{i}#%&nQS>8ML3cxWM 
z!Dt}D+}XT!t9o}we~~*~%$;CxEOQyn_rf?}i^~&GGzvXD!9FNk{Jo;+2(r1%#ozQ9EY>S2i7{CnCOb z>&coaRKYfd51clw7Ed?^Cbv(ODG$4_VBx%ZQMBmTV2d}`OS&FEXW$FnX5uoeE)Myo zwxAWbmV%k~KhQJ+!!Z~bRqozGQsO>owZ^K@y1Kfj(XGemORJXy5kew`lBJ}7E-%kM zr$)P$5;ZyLclp_*A{1XIa4FT%xh$19hKC<2+(PJhT>IA6JcdWP6>#wa3xF2*i|ykF zpx0Uh3m_JTiWxwdvH`mYOpwNo-C6ajkizKfS&MPw_AcejxafA(!{tj6CSdaX*6%#`TtYc1`)(K*?NAyGwVR|R$x}J`(cJf{swzEgIc7`X%5|}YgITC{57GA0fWUG_e@e(c#^Rbn zOg&|KKN{)@$%iB$D)M0aebI3>|v(k;zNl*RB(2>rhNUdI8 z3QP9JjgcrEmWs$)fYI%zy@1Y1SsE`wt6x`FMyWxp3A|lCrQg!EpS!!^L)@>f_BQqt zsR|_#4V|LxTJeNVp4MH+J1kyYdr^A*@T#FC?ifEQ)t_9q;$@Dwbf_wJ4~r@Wl|}s- z_84l8hxMF$s82I<1RL*WMaAf@U)T2gv#~=Mj4MgK8ulyO%5{D4pMWEGHf`Kh%e8>Y zpCG19S;1Nn-cF9g?q_ZNCk&&3g9mT=RqEsv29$|w$yW;U^KBkT%a^7(92%*J%dPjU zMkGR#&fR9{K&-BJy~^L*X#iVM0V|Ddirtt>Y1WEaQnS2hA}c4zD#3ky-`=m^maAqn zlcoiS(2!@UI9%c%b#VHe9a#ACSUETtnV5h((2F4GOJ!b0Or``wC5bqwM8#*V(Jupy zcb})3U12X{-^dBmd+NPDJ}GH3ExDi|yzbJQG9%TW$<9XrVb_qC$0kVj>1VyrICxM^ zq9_4Q2SXK13L%XT81THUZDRG$(i0UJI#QHCzpCraM+nVv!eHVS6Oi@77coVNys zX362JbI+n}IgQ&?=4aH^AK$&3!bdAAu6-OUbN1TxUJ-h3w@vX#ZvLp6Xv0o~dtJ4H`lUsq|od`}yrL6ic3 zt2qY`4jeS-I0He)O@fC&=VDGC@8;#&NHp;jQf5td52gqw64;41&V4UTysNy(;Z zXs6fk{?myIA8UzO*yG%?St_NLjbMg}N&ljS;j32t&D7Y7Q%_Zj;E4dxurq7~ynZ}% zNTO|u9(;Uo8~7H2lm-XO%-r_)h>XQmDsm4Omkn}|iT3Q=Ikntn>tC;VDhIaN@pF&4uSu4n2LRbjJW??cqqVDI`^Ga<*(VBTsT9u zyIgyE3OJKavS9rU#0`Ukx`~`p<;jpFSr3Dy59xJ>FOHpeesI^@x6lN4?Q>6>cY87H z01XY#*|Se_jEXv^%d4KfM}FhxyStak!TSUY+kF0#{$_LeBwvqRu!`tJ^V#61u}gd_ z*h|)YsGfFU=;QK}jFLmT(6&}DAC_-Ro&Y5~1{K7r8yUfVIoJ6%fqGg38mWuS`KeG+UvqRPU%gK?42U*m8JaGG) zJxnV)#U%QlrzP5|sH)mc{%x$Pym*`L#Sz;YOC1M3|B^XrduqjtLyu2=_@u6EdCSji zQ--0P5f1R%^YU7mB9mM1mH!9^a%kxG^2RCe{tAoLq`$lH^Bki%ZxS#HUGjp5-;!FW zO>gyIG);VNz|=Vd1}ZCK7UuX(_rii`|S0DPTHEZ?j@q#G>6lVSD zDf z!V|3_8ln~biFW17qFc@9D)np4@Jxz}15jQc8{7QtyCwPPtg6CC;ymxnEok%zKbP02 zU4P%}$G;scFaLWn_g^Jm=z8hzFIYRhm-xz$mtM|pT(iUHuA6fIJb!*Oe8p6^6@K0; z7Fqg+FXI2C%tU6UqebIJn_9S-i7d^;mSZeNiA0tn(Vye=5Yqg&Hw1_H1uTyGzu%A_ zn6Aqkbc9a`^;>Ld7vQvLxpm0OWqtw6mRN^GN|}xsZxLx~YBpw+Xp|^&^$gUvrBwd& 
z)vk-emj{F_lbS8D9c^kn?t*wiJAXD1{u~(`YH9QRY$gBej7`T{nvJzIA2V7smNOnv zc1<_|i~sSSkZ_lfkRT}rQD!@9nHrlLn;B^t*o@Rd2xRO<^S?iJ=J&7s|9@(V<>#TiM{4?1N6B$(ul4@{)dDF+ literal 0 HcmV?d00001 diff --git a/test/3/octave/plot_l3_perf.m b/test/3/octave/plot_l3_perf.m index 02f3eaef32..4ac7ab73b6 100644 --- a/test/3/octave/plot_l3_perf.m +++ b/test/3/octave/plot_l3_perf.m @@ -55,8 +55,7 @@ % Set the legend strings. blis_legend = sprintf( 'BLIS' ); -%open_legend = sprintf( 'OpenBLAS' ); -open_legend = sprintf( 'ARMPL' ); +open_legend = sprintf( 'OpenBLAS' ); eige_legend = sprintf( 'Eigen' ); %vend_legend = sprintf( 'MKL' ); %vend_legend = sprintf( 'ARMPL' ); diff --git a/test/3/octave/runthese.m b/test/3/octave/runthese.m index 8c84725adb..6a88d8b32e 100644 --- a/test/3/octave/runthese.m +++ b/test/3/octave/runthese.m @@ -27,3 +27,7 @@ plot_panel_4x5(2.20,32,1, 'st','../results/a64fx/20210520/st', 'a64fx','Fujitsu SSL2'); close all; clear all; plot_panel_4x5(2.20,32,12,'1s','../results/a64fx/20210520/jc1ic1jr12','a64fx','Fujitsu SSL2'); close all; clear all; plot_panel_4x5(2.20,32,48,'2s','../results/a64fx/20210520/jc1ic4jr12','a64fx','Fujitsu SSL2'); close all; clear all; + +% nn1 +plot_panel_4x5(2.50,8,1, 'st','../results/neoverse_n1/20210715/st', 'nn1','ARMPL'); close; clear all; +plot_panel_4x5(2.50,8,64,'1s','../results/neoverse_n1/20210715/nt64','nn1','ARMPL'); close; clear all; From 8dba1e752c6846a85dea50907135bbc5cbc54ee5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 27 Jul 2021 12:38:24 -0500 Subject: [PATCH 043/226] CREDITS file update. 
--- CREDITS | 1 + 1 file changed, 1 insertion(+) diff --git a/CREDITS b/CREDITS index 626874faf1..75b994c9a1 100644 --- a/CREDITS +++ b/CREDITS @@ -41,6 +41,7 @@ but many others have contributed code and feedback, including Shivaprashanth H (Global Edge) Jean-Michel Hautbois @jhautbois Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin) + Greg Henry (Intel) Minh Quan Ho @hominhquan Matthew Honnibal @honnibal Stefan Husmann @stefanhusmann From 868b90138e64c873c780d9df14150d2a370a7a42 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 4 Aug 2021 18:31:01 -0500 Subject: [PATCH 044/226] Fixed one-time use property of bli_init() (#525). Details: - Fixes a rather obvious bug that resulted in segmentation fault whenever the calling application tried to re-initialize BLIS after its first init/finalize cycle. The bug resulted from the fact that the bli_init.c APIs made no effort to allow bli_init() to be called subsequent times at all due to it, and bli_finalize(), being implemented in terms of pthread_once(). This has been fixed by resetting the pthread_once_t control variable for initialization at the end of bli_finalize_apis(), and by resetting the control variable for finalization at the end of bli_init_apis(). Thanks to @lschork2 for reporting this issue (#525), and to Minh Quan Ho and Devin Matthews for suggesting the chosen solution. - CREDITS file update. 
--- CREDITS | 1 + frame/base/bli_init.c | 41 +++++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/CREDITS b/CREDITS index 75b994c9a1..b77e08098e 100644 --- a/CREDITS +++ b/CREDITS @@ -52,6 +52,7 @@ but many others have contributed code and feedback, including Kyungmin Lee @kyungminlee (Ohio State University) Michael Lehn @michael-lehn Shmuel Levine @ShmuelLevine + @lschork2 Dave Love @loveshack Tze Meng Low (The University of Texas at Austin) Ye Luo @ye-luo (Argonne National Laboratory) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 1e28ace096..b8c53edd83 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -64,6 +64,24 @@ void bli_finalize_auto( void ) // ----------------------------------------------------------------------------- +// A pthread_once_t variable is a pthread structure used in pthread_once(). +// pthread_once() is guaranteed to execute exactly once among all threads that +// pass in this control object (until/unless the variable is reset). +static bli_pthread_once_t once_init = BLIS_PTHREAD_ONCE_INIT; +static bli_pthread_once_t once_finalize = BLIS_PTHREAD_ONCE_INIT; + +void bli_init_once( void ) +{ + bli_pthread_once( &once_init, bli_init_apis ); +} + +void bli_finalize_once( void ) +{ + bli_pthread_once( &once_finalize, bli_finalize_apis ); +} + +// ----------------------------------------------------------------------------- + void bli_init_apis( void ) { // Initialize various sub-APIs. @@ -72,6 +90,9 @@ void bli_init_apis( void ) bli_thread_init(); bli_pack_init(); bli_memsys_init(); + + // Reset the control variable that will allow finalization. 
+ once_finalize = BLIS_PTHREAD_ONCE_INIT; } void bli_finalize_apis( void ) @@ -82,24 +103,8 @@ void bli_finalize_apis( void ) bli_thread_finalize(); bli_ind_finalize(); bli_gks_finalize(); -} - -// ----------------------------------------------------------------------------- - -// A pthread_once_t variable is a pthread structure used in pthread_once(). -// pthread_once() is guaranteed to execute exactly once among all threads that -// pass in this control object. Thus, we need one for initialization and a -// separate one for finalization. -static bli_pthread_once_t once_init = BLIS_PTHREAD_ONCE_INIT; -static bli_pthread_once_t once_finalize = BLIS_PTHREAD_ONCE_INIT; -void bli_init_once( void ) -{ - bli_pthread_once( &once_init, bli_init_apis ); -} - -void bli_finalize_once( void ) -{ - bli_pthread_once( &once_finalize, bli_finalize_apis ); + // Reset the control variable that will allow (re-)initialization. + once_init = BLIS_PTHREAD_ONCE_INIT; } From c8728cfbd19ecde9d43af05829e00bcfe7d86eed Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 5 Aug 2021 15:17:09 -0500 Subject: [PATCH 045/226] Fixed configure breakage on OSX clang. Details: - Accept either 'clang' or 'LLVM' in vendor string when grepping for the version number (after determining that we're working with clang). Thanks to Devin Matthews for this fix. --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index 27986204a2..fd4f490386 100755 --- a/configure +++ b/configure @@ -1422,7 +1422,7 @@ get_compiler_version() cc_version=$(${cc} -dumpversion) # If compiler is AOCC, first grep for clang and then the version number.
elif [ "${cc_vendor}" = "clang" ]; then - cc_version=$(echo "${vendor_string}" | egrep -o 'clang version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*') + cc_version=$(echo "${vendor_string}" | egrep -o '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*') elif [ "${cc_vendor}" = "oneAPI" ]; then # Treat Intel oneAPI's clang as clang, not icc. cc_vendor="clang" From a32257eeab2e9946e71546a05a1847a39341ec6b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 5 Aug 2021 16:23:02 -0500 Subject: [PATCH 046/226] Fixed bli_init.c compile-time error on OSX clang. Details: - Fixed a compile-time error in bli_init.c when compiling with OSX's clang. This error was introduced in 868b901, which introduced a post-declaration struct assignment where the RHS was a struct initialization expression (i.e. { ... }). This use of struct initializer expressions apparently works with gcc despite it not being strict C99. The fix included in this commit declares a temporary variable for the purposes of being initialized to the desired value, via the struct initializer, and then copies the temporary struct (via '=' struct assignment) to the persistent struct. Thanks to Devin Matthews for his help with this. --- frame/base/bli_init.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index b8c53edd83..e616ac2d7b 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -92,7 +92,13 @@ void bli_init_apis( void ) bli_memsys_init(); // Reset the control variable that will allow finalization. - once_finalize = BLIS_PTHREAD_ONCE_INIT; + // NOTE: We must initialize a fresh pthread_once_t object and THEN copy the + // contents to the static control variable because some implementations of + // pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as + // a struct initializer expression (i.e. { ... 
}), which cannot be used in + // post-declaration struct assignment in strict C99. + const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT; + once_finalize = once_new; } void bli_finalize_apis( void ) @@ -105,6 +111,12 @@ void bli_finalize_apis( void ) bli_gks_finalize(); // Reset the control variable that will allow (re-)initialization. - once_init = BLIS_PTHREAD_ONCE_INIT; + // NOTE: We must initialize a fresh pthread_once_t object and THEN copy the + // contents to the static control variable because some implementations of + // pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as + // a struct initializer expression (i.e. { ... }), which cannot be used in + // post-declaration struct assignment in strict C99. + const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT; + once_init = once_new; } From 64a1f786d58001284aa4f7faf9fae17f0be7a018 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 11 Aug 2021 17:53:12 -0500 Subject: [PATCH 047/226] Implement proposed new function pointer fields for obj_t. The added fields: 1. `pack_t schema`: storing the pack schema on the object allows the macrokernel to act accordingly without side-channel information from the rntm_t and cntx_t. The pack schema and "pack_[ab]" fields could be removed from those structs. 2. `void* user_data`: this field can be used to store any sort of additional information provided by the user. The pointer is propagated to submatrix objects and copies, but is otherwise ignored by the framework and the default implementations of the following three fields. User-specified pack, kernel, or ukr functions can do whatever they want with the data, and the user is 100% responsible for allocating, assigning, and freeing this buffer. 3. `obj_pack_fn_t pack`: the function called when a matrix is packed. 
This function receives the expected arguments, as well as a mdim_t and mem_t* as memory must be allocated inside this function, and behavior may differ based on which matrix is being packed (i.e. transposition for B). This could also be achieved by passing a desired pack schema, but this would require additional information to travel down the control tree. 4. `obj_ker_fn_t ker`: the function called when we get to the "second loop", or the macro-kernel. Behavior may depend on the pack schemas of the input matrices. The default implementation would perform the inner two loops around the ukr, and then call either the default ukr or a user-supplied one (next field). 5. `obj_ukr_fn_t ukr`: the function called by the default macrokernel. This would replace the various current "virtual" microkernels, and could also be used to supply user-defined behavior. Users could supply both a custom kernel (above) and microkernel, although the user-specified kernel does **not** necessarily have to call the ukr function specified on the obj_t. Note that no macros or functions for accessing these new fields have been defined yet. That is next once these are finalized. Addresses https://github.com/flame/blis/projects/1#card-62357687. --- frame/include/bli_type_defs.h | 86 +++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index cba112256f..566ad5f507 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -150,7 +150,7 @@ typedef uint32_t objbits_t; // object information bit field // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX - typedef struct + typedef struct scomplex { float real; float imag; @@ -161,7 +161,7 @@ typedef uint32_t objbits_t; // object information bit field // interoperability with BLIS.
#ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX - typedef struct + typedef struct dcomplex { double real; double imag; @@ -1232,6 +1232,47 @@ typedef struct constdata_s // -- BLIS object type definitions --------------------------------------------- // +// Forward declarations for function pointer types +struct obj_s; +struct cntx_s; +struct rntm_s; +struct thrinfo_s; + +typedef void (*obj_pack_fn_t) + ( + mdim_t mat, + mem_t* mem, + struct obj_s* a, + struct obj_s* ap, + struct cntx_s* cntx, + struct rntm_s* rntm, + struct thrinfo_s* thread + ); + +typedef void (*obj_ker_fn_t) + ( + struct obj_s* a, + struct obj_s* b, + struct obj_s* c, + struct cntx_s* cntx, + struct rntm_s* rntm, + struct thrinfo_s* thread + ); + +typedef void (*obj_ukr_fn_t) + ( + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, + void* restrict b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + struct cntx_s* restrict cntx + ); + typedef struct obj_s { // Basic fields @@ -1261,6 +1302,15 @@ typedef struct obj_s // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel + pack_t schema; // pack schema, which may be unpacked + + // User data pointer + void* user_data; + + // Function pointers + obj_pack_fn_t pack; + obj_ker_fn_t ker; + obj_ukr_fn_t ukr; } obj_t; // Pre-initializors. 
Things that must be set afterwards: @@ -1297,7 +1347,14 @@ typedef struct obj_s .ps = 0, \ .pd = 0, \ .m_panel = 0, \ - .n_panel = 0 \ + .n_panel = 0, \ + .schema = BLIS_NOT_PACKED, \ +\ + .user_data = NULL, \ +\ + .pack = NULL, \ + .ker = NULL, \ + .ukr = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ @@ -1325,7 +1382,14 @@ typedef struct obj_s .ps = 0, \ .pd = 0, \ .m_panel = 0, \ - .n_panel = 0 \ + .n_panel = 0, \ + .schema = BLIS_NOT_PACKED, \ +\ + .user_data = NULL, \ +\ + .pack = NULL, \ + .ker = NULL, \ + .ukr = NULL \ } // Define these macros here since they must be updated if contents of @@ -1359,6 +1423,13 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) b->pd = a->pd; b->m_panel = a->m_panel; b->n_panel = a->n_panel; + b->schema = a->schema; + + b->user_data = a->user_data; + + b->pack = a->pack; + b->ker = a->ker; + b->ukr = a->ukr; } BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) @@ -1392,6 +1463,13 @@ BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) b->pd = a->pd; b->m_panel = a->m_panel; b->n_panel = a->n_panel; + b->schema = a->schema; + + b->user_data = a->user_data; + + b->pack = a->pack; + b->ker = a->ker; + b->ukr = a->ukr; } // Initializors for global scalar constants. From e366665cd2b5ae8d7683f5ba2de345df0a41096f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 12 Aug 2021 14:06:53 -0500 Subject: [PATCH 048/226] Fixed stale API calls to membrk API in gemmlike. Details: - Updated stale calls to the bli_membrk API within the 'gemmlike' sandbox. This API is now called bli_pba (packed block allocator). Ideally, this forgotten update would have been included as part of 21911d6, which is when the branch where the membrk->pba changes was introduced was merged into 'master'. - Comment updates. 
--- sandbox/gemmlike/bls_l3_packm_a.c | 14 +++++++------- sandbox/gemmlike/bls_l3_packm_b.c | 14 +++++++------- sandbox/gemmlike/thread/bls_l3_decor_openmp.c | 2 +- sandbox/gemmlike/thread/bls_l3_decor_pthreads.c | 2 +- sandbox/gemmlike/thread/bls_l3_decor_single.c | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index c55a19c7b7..0dcc531fdb 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -67,7 +67,7 @@ void PASTECH2(bls_,ch,opname) \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ + then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ @@ -79,7 +79,7 @@ void PASTECH2(bls_,ch,opname) \ the current function before the other threads have a chance to copy from it. (A barrier would fix that race condition, but then again, I prefer to keep barriers to a minimum.) */ \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -104,8 +104,8 @@ void PASTECH2(bls_,ch,opname) \ else /* if ( bli_mem_is_alloc( mem ) ) */ \ { \ /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ + buffer, then a block has already been acquired from the packed + block allocator and cached by the caller. */ \ \ /* As a sanity check, we should make sure that the mem_t object isn't associated with a block that is too small compared to the size of @@ -123,12 +123,12 @@ void PASTECH2(bls_,ch,opname) \ above for why the acquisition needs to be directly to the chief thread's passed-in mem_t and not a local (temporary) mem_t. 
*/ \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ ); \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -182,7 +182,7 @@ void PASTECH2(bls_,ch,opname) \ is allocated, which it should be. */ \ if ( bli_mem_is_alloc( mem ) ) \ { \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index cae93df012..9d563109a6 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -67,7 +67,7 @@ void PASTECH2(bls_,ch,opname) \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ + then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ @@ -79,7 +79,7 @@ void PASTECH2(bls_,ch,opname) \ the current function before the other threads have a chance to copy from it. (A barrier would fix that race condition, but then again, I prefer to keep barriers to a minimum.) */ \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -104,8 +104,8 @@ void PASTECH2(bls_,ch,opname) \ else /* if ( bli_mem_is_alloc( mem ) ) */ \ { \ /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ + buffer, then a block has already been acquired from the packed + block allocator and cached by the caller. */ \ \ /* As a sanity check, we should make sure that the mem_t object isn't associated with a block that is too small compared to the size of @@ -123,12 +123,12 @@ void PASTECH2(bls_,ch,opname) \ above for why the acquisition needs to be directly to the chief thread's passed-in mem_t and not a local (temporary) mem_t. 
*/ \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ ); \ - bli_membrk_acquire_m \ + bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ @@ -182,7 +182,7 @@ void PASTECH2(bls_,ch,opname) \ is allocated, which it should be. */ \ if ( bli_mem_is_alloc( mem ) ) \ { \ - bli_membrk_release \ + bli_pba_release \ ( \ rntm, \ mem \ diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c index 851a29e52b..bf0d4d8bcd 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c @@ -75,7 +75,7 @@ void bls_l3_thread_decorator // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c index f87d79fd6c..0a4012029a 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c @@ -140,7 +140,7 @@ void bls_l3_thread_decorator // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. 
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c index 7d9017dcd5..8bb04817fb 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.c @@ -68,7 +68,7 @@ void bls_l3_thread_decorator bli_sba_rntm_set_pool( 0, array, rntm ); // Set the packing block allocator field of the rntm. - bli_membrk_rntm_set_membrk( rntm ); + bli_pba_rntm_set_pba( rntm ); #ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. From 20a1c4014c999063e6bc1cfa605b152454c5cbf4 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 12 Aug 2021 14:44:04 -0500 Subject: [PATCH 049/226] Disabled sanity check in bli_pool_finalize(). Details: - Disabled a sanity check in bli_pool_finalize() that was meant to alert the user if a pool_t was being finalized while some blocks were still checked out. However, this is exactly the situation that might happen when a pool_t is re-initialized for a larger blocksize, and currently bli_pool_reinit() is implemented as _finalize() followed by _init(). So, this sanity check is not universally appropriate. Thanks to AMD-India for reporting this issue. --- frame/base/bli_pool.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 08cbbbf2e7..350d59b739 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -127,6 +127,12 @@ void bli_pool_finalize // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); + // NOTE: This sanity check has been disabled because bli_pool_reinit() + // is currently implemented in terms of bli_pool_finalize() followed by + // bli_pool_init(). If that _reinit() takes place when some blocks are + // checked out, then we would expect top_index != 0, and therefore this + // check is not universally appropriate.
+#if 0 // Sanity check: The top_index should be zero. if ( top_index != 0 ) { @@ -135,6 +141,7 @@ void bli_pool_finalize printf( "bli_pool_finalize(): Implication: not all blocks were checked back in!\n" ); bli_abort(); } +#endif // Query the free() function pointer for the pool. free_ft free_fp = bli_pool_free_fp( pool ); From ec06b6a503a203fa0cdb23273af3c0e3afeae7fa Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 12 Aug 2021 19:27:31 -0500 Subject: [PATCH 050/226] Add dependency on the "flat" blis.h file for the BLIS and BLAS testsuite objects. This fixes a bug where "make -j check" may fail after a change to one or more header files, or where testsuite code doesn't get properly recompiled after internal changes. --- Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 9a69fca8b2..be9fd69121 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -36,7 +36,7 @@ # Makefile # # Field G. Van Zee -# +# # Top-level makefile for libflame linear algebra library. # # @@ -688,7 +688,7 @@ endif # --- BLAS test suite rules --- -testblas: blastest-run +testblas: blastest-run blastest-f2c: check-env $(BLASTEST_F2C_LIB) @@ -697,7 +697,7 @@ blastest-bin: check-env blastest-f2c $(BLASTEST_DRV_BIN_PATHS) blastest-run: $(BLASTEST_DRV_BINS_R) # f2c object file rule. -$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c +$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(BLIS_H_FLAT) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@ else @@ -706,7 +706,7 @@ else endif # driver object file rule. 
-$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c +$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(BLIS_H_FLAT) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@ else @@ -793,7 +793,7 @@ testsuite: testsuite-run testsuite-bin: check-env $(TESTSUITE_BIN) # Object file rule. -$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c +$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(BLIS_H_FLAT) ifeq ($(ENABLE_VERBOSE),yes) $(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) -c $< -o $@ else @@ -1243,7 +1243,7 @@ endif changelog: @echo "Updating '$(DIST_PATH)/$(CHANGELOG)' via '$(GIT_LOG)'" - @$(GIT_LOG) > $(DIST_PATH)/$(CHANGELOG) + @$(GIT_LOG) > $(DIST_PATH)/$(CHANGELOG) # --- Uninstall rules --- From 3cddce1e2a021be6064b90af30022b99cbfea986 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 12 Aug 2021 22:32:34 -0500 Subject: [PATCH 051/226] Remove schema field on obj_t (redundant) and add new API functions. 
--- frame/include/bli_obj_macro_defs.h | 51 ++++++++++++++++++++++++++++++ frame/include/bli_type_defs.h | 23 ++++++-------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 855384425e..bb7045099d 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1187,6 +1187,57 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) } +// -- User-provided information macros -- + +// User data query + +BLIS_INLINE void* bli_obj_user_data( obj_t* obj ) +{ + return obj->user_data; +} + +// User data modification + +BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj ) +{ + obj->user_data = data; +} + +// Function pointer query + +BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) +{ + return obj->pack; +} + +BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) +{ + return obj->ker; +} + +BLIS_INLINE obj_ukr_fn_t bli_obj_ukf_fn( obj_t* obj ) +{ + return obj->ukr; +} + +// Function pointer modification + +BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj ) +{ + obj->pack = pack; +} + +BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) +{ + obj->ker = ker; +} + +BLIS_INLINE void bli_obj_set_ukf_fn( obj_ukr_fn_t ukr, obj_t* obj ) +{ + obj->ukr = ukr; +} + + // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 566ad5f507..2abcf35fed 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -385,7 +385,7 @@ typedef void (*free_ft) ( void* p ); #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define BLIS_BITVAL_FLOAT_TYPE 0x0 -#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT +#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE 
BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 @@ -395,10 +395,10 @@ typedef void (*free_ft) ( void* p ); #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) -#define BLIS_BITVAL_ZEROS 0x0 +#define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) -#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS +#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT @@ -1242,7 +1242,7 @@ typedef void (*obj_pack_fn_t) ( mdim_t mat, mem_t* mem, - struct obj_s* a, + struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, @@ -1251,8 +1251,8 @@ typedef void (*obj_pack_fn_t) typedef void (*obj_ker_fn_t) ( - struct obj_s* a, - struct obj_s* b, + struct obj_s* a, + struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, @@ -1302,7 +1302,6 @@ typedef struct obj_s // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel - pack_t schema; // pack schema, which may be unpacked // User data pointer void* user_data; @@ -1348,7 +1347,6 @@ typedef struct obj_s .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ - .schema = BLIS_NOT_PACKED, \ \ .user_data = NULL, \ \ @@ -1383,7 +1381,6 @@ typedef struct obj_s .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ - .schema = BLIS_NOT_PACKED, \ \ .user_data = NULL, \ \ @@ -1423,7 +1420,6 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) b->pd = a->pd; b->m_panel = a->m_panel; b->n_panel = a->n_panel; - b->schema = a->schema; b->user_data = a->user_data; @@ -1463,7 +1459,6 @@ BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) b->pd = a->pd; b->m_panel = 
a->m_panel; b->n_panel = a->n_panel; - b->schema = a->schema; b->user_data = a->user_data; @@ -1624,13 +1619,13 @@ typedef enum BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), - // Structure-specific errors + // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), - // Storage-specific errors + // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors @@ -1644,7 +1639,7 @@ typedef enum // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), - // Buffer-specific errors + // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors From 4f70eb7913ad3ded193870361b6da62b20ec3823 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 13 Aug 2021 11:12:43 -0500 Subject: [PATCH 052/226] Clean up some warnings that show up on clang/OSX. --- common.mk | 4 +-- frame/base/bli_pool.c | 6 ++--- .../haswell/1m/bli_packm_haswell_asm_c3xk.c | 26 +++++++++---------- .../haswell/1m/bli_packm_haswell_asm_c8xk.c | 26 +++++++++---------- .../haswell/1m/bli_packm_haswell_asm_z3xk.c | 26 +++++++++---------- .../haswell/1m/bli_packm_haswell_asm_z4xk.c | 24 ++++++++--------- 6 files changed, 56 insertions(+), 56 deletions(-) diff --git a/common.mk b/common.mk index 4a5c5b8d5e..712482d82d 100644 --- a/common.mk +++ b/common.mk @@ -1,6 +1,6 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -626,7 +626,7 @@ endif # Disable tautological comparision warnings in clang. 
ifeq ($(CC_VENDOR),clang) -CWARNFLAGS += -Wno-tautological-compare +CWARNFLAGS += -Wno-tautological-compare -Wno-pass-failed endif $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c)))) diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 350d59b739..08876c68a9 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -124,15 +124,15 @@ void bli_pool_finalize // Query the total number of blocks currently allocated. const siz_t num_blocks = bli_pool_num_blocks( pool ); - // Query the top_index of the pool. - const siz_t top_index = bli_pool_top_index( pool ); - // NOTE: This sanity check has been disabled because bli_pool_reinit() // is currently implemented in terms of bli_pool_finalize() followed by // bli_pool_init(). If that _reinit() takes place when some blocks are // checked out, then we would expect top_index != 0, and therefore this // check is not universally appropriate. #if 0 + // Query the top_index of the pool. + const siz_t top_index = bli_pool_top_index( pool ); + // Sanity check: The top_index should be zero. if ( top_index != 0 ) { diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c index 273caeb3db..843335ad5d 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c @@ -104,10 +104,10 @@ void bli_cpackm_haswell_asm_3xk // ------------------------------------------------------------------------- - if ( cdim0 == mnr && !gs && !bli_does_conj( conja ) && unitk ) + if ( cdim0 == mnr && !gs && !conja && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -122,14 +122,14 @@ void bli_cpackm_haswell_asm_3xk mov(var(one), rdx) // load address of 1.0 constant vbroadcastss(mem(rdx, 0), ymm1) // load 1.0 and duplicate vxorps(ymm0, ymm0, ymm0) // set ymm0 to 0.0. 
- + mov(var(kappa), rcx) // load address of kappa vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate - + // now branch on kappa == 1.0 - + vucomiss(xmm1, xmm10) // set ZF if kappa_r == 1.0. sete(r12b) // r12b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm11) // set ZF if kappa_i == 0.0. @@ -143,7 +143,7 @@ void bli_cpackm_haswell_asm_3xk cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.CCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.CROWNONU) @@ -156,7 +156,7 @@ void bli_cpackm_haswell_asm_3xk label(.CCOLNONU) jmp(.CDONE) // jump to end. - + @@ -167,7 +167,7 @@ void bli_cpackm_haswell_asm_3xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.CROWUNIT) //lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -251,7 +251,7 @@ void bli_cpackm_haswell_asm_3xk // -- kappa unit, column storage on A -------------------------------------- label(.CCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -315,8 +315,8 @@ void bli_cpackm_haswell_asm_3xk label(.CDONE) - - + + end_asm( : // output operands (none) @@ -370,7 +370,7 @@ void bli_cpackm_haswell_asm_3xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -390,7 +390,7 @@ void bli_cpackm_haswell_asm_3xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c index be6877e71a..862a33b86a 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c @@ -104,10 +104,10 @@ void bli_cpackm_haswell_asm_8xk // ------------------------------------------------------------------------- - if ( cdim0 == mnr && !gs && !bli_does_conj( conja ) && unitk ) + if ( cdim0 == mnr && !gs && !conja && unitk ) { begin_asm() - + mov(var(a), rax) // load 
address of a. mov(var(inca), r8) // load inca @@ -122,14 +122,14 @@ void bli_cpackm_haswell_asm_8xk mov(var(one), rdx) // load address of 1.0 constant vbroadcastss(mem(rdx, 0), ymm1) // load 1.0 and duplicate vxorps(ymm0, ymm0, ymm0) // set ymm0 to 0.0. - + mov(var(kappa), rcx) // load address of kappa vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate - + // now branch on kappa == 1.0 - + vucomiss(xmm1, xmm10) // set ZF if kappa_r == 1.0. sete(r12b) // r12b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm11) // set ZF if kappa_i == 0.0. @@ -143,7 +143,7 @@ void bli_cpackm_haswell_asm_8xk cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.CCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.CROWNONU) @@ -156,7 +156,7 @@ void bli_cpackm_haswell_asm_8xk label(.CCOLNONU) jmp(.CDONE) // jump to end. - + @@ -167,7 +167,7 @@ void bli_cpackm_haswell_asm_8xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.CROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -271,7 +271,7 @@ void bli_cpackm_haswell_asm_8xk // -- kappa unit, column storage on A -------------------------------------- label(.CCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -335,8 +335,8 @@ void bli_cpackm_haswell_asm_8xk label(.CDONE) - - + + end_asm( : // output operands (none) @@ -390,7 +390,7 @@ void bli_cpackm_haswell_asm_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -408,7 +408,7 @@ void bli_cpackm_haswell_asm_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c index 26b98f4daf..1a714abe26 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c @@ -104,10 +104,10 @@ void 
bli_zpackm_haswell_asm_3xk // ------------------------------------------------------------------------- - if ( cdim0 == mnr && !gs && !bli_does_conj( conja ) && unitk ) + if ( cdim0 == mnr && !gs && !conja && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -124,14 +124,14 @@ void bli_zpackm_haswell_asm_3xk mov(var(one), rdx) // load address of 1.0 constant vbroadcastsd(mem(rdx, 0), ymm1) // load 1.0 and duplicate vxorpd(ymm0, ymm0, ymm0) // set ymm0 to 0.0. - + mov(var(kappa), rcx) // load address of kappa vbroadcastsd(mem(rcx, 0), ymm10) // load kappa_r and duplicate vbroadcastsd(mem(rcx, 8), ymm11) // load kappa_i and duplicate - + // now branch on kappa == 1.0 - + vucomisd(xmm1, xmm10) // set ZF if kappa_r == 1.0. sete(r12b) // r12b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm11) // set ZF if kappa_i == 0.0. @@ -145,7 +145,7 @@ void bli_zpackm_haswell_asm_3xk cmp(imm(16), r8) // set ZF if (16*inca) == 16. jz(.ZCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.ZROWNONU) @@ -158,7 +158,7 @@ void bli_zpackm_haswell_asm_3xk label(.ZCOLNONU) jmp(.ZDONE) // jump to end. 
- + @@ -169,7 +169,7 @@ void bli_zpackm_haswell_asm_3xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.ZROWUNIT) //lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -257,7 +257,7 @@ void bli_zpackm_haswell_asm_3xk // -- kappa unit, column storage on A -------------------------------------- label(.ZCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -321,8 +321,8 @@ void bli_zpackm_haswell_asm_3xk label(.ZDONE) - - + + end_asm( : // output operands (none) @@ -376,7 +376,7 @@ void bli_zpackm_haswell_asm_3xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -394,7 +394,7 @@ void bli_zpackm_haswell_asm_3xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c index 6552317541..4e11872afb 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c @@ -104,10 +104,10 @@ void bli_zpackm_haswell_asm_4xk // ------------------------------------------------------------------------- - if ( cdim0 == mnr && !gs && !bli_does_conj( conja ) && unitk ) + if ( cdim0 == mnr && !gs && !conja && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -128,10 +128,10 @@ void bli_zpackm_haswell_asm_4xk mov(var(kappa), rcx) // load address of kappa vbroadcastsd(mem(rcx, 0), ymm10) // load kappa_r and duplicate vbroadcastsd(mem(rcx, 8), ymm11) // load kappa_i and duplicate - + // now branch on kappa == 1.0 - + vucomisd(xmm1, xmm10) // set ZF if kappa_r == 1.0. sete(r12b) // r12b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm11) // set ZF if kappa_i == 0.0. @@ -145,7 +145,7 @@ void bli_zpackm_haswell_asm_4xk cmp(imm(16), r8) // set ZF if (16*inca) == 16. 
jz(.ZCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.ZROWNONU) @@ -158,7 +158,7 @@ void bli_zpackm_haswell_asm_4xk label(.ZCOLNONU) jmp(.ZDONE) // jump to end. - + @@ -169,7 +169,7 @@ void bli_zpackm_haswell_asm_4xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.ZROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -267,7 +267,7 @@ void bli_zpackm_haswell_asm_4xk // -- kappa unit, column storage on A -------------------------------------- label(.ZCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -331,8 +331,8 @@ void bli_zpackm_haswell_asm_4xk label(.ZDONE) - - + + end_asm( : // output operands (none) @@ -386,7 +386,7 @@ void bli_zpackm_haswell_asm_4xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -404,7 +404,7 @@ void bli_zpackm_haswell_asm_4xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } From 1772db029e10e0075b5a59d3fb098487b1ad542a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 13 Aug 2021 14:46:35 -0500 Subject: [PATCH 053/226] Add row- and column-strides for A/B in obj_ukr_fn_t. 
--- frame/include/bli_obj_macro_defs.h | 4 ++-- frame/include/bli_type_defs.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index bb7045099d..fbf979c8a6 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1215,7 +1215,7 @@ BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) return obj->ker; } -BLIS_INLINE obj_ukr_fn_t bli_obj_ukf_fn( obj_t* obj ) +BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj ) { return obj->ukr; } @@ -1232,7 +1232,7 @@ BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) obj->ker = ker; } -BLIS_INLINE void bli_obj_set_ukf_fn( obj_ukr_fn_t ukr, obj_t* obj ) +BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj ) { obj->ukr = ukr; } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 2abcf35fed..5a180e2e97 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1265,8 +1265,8 @@ typedef void (*obj_ukr_fn_t) dim_t n, dim_t k, void* restrict alpha, - void* restrict a, - void* restrict b, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict b, inc_t rs_b, inc_t cs_b, void* restrict beta, void* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, From 4b8ed99d926876fbf54c15468feae4637268eb6b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 13 Aug 2021 15:31:10 -0500 Subject: [PATCH 054/226] Whitespace tweaks. 
--- frame/include/bli_obj_macro_defs.h | 16 +++++----- frame/include/bli_type_defs.h | 51 +++++++++++++++--------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index fbf979c8a6..84c977289c 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1193,48 +1193,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) BLIS_INLINE void* bli_obj_user_data( obj_t* obj ) { - return obj->user_data; + return obj->user_data; } // User data modification BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj ) { - obj->user_data = data; + obj->user_data = data; } // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { - return obj->pack; + return obj->pack; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { - return obj->ker; + return obj->ker; } BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj ) { - return obj->ukr; + return obj->ukr; } // Function pointer modification BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj ) { - obj->pack = pack; + obj->pack = pack; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) { - obj->ker = ker; + obj->ker = ker; } BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj ) { - obj->ukr = ukr; + obj->ukr = ukr; } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 5a180e2e97..f03fc72acd 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1240,22 +1240,22 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - mdim_t mat, - mem_t* mem, - struct obj_s* a, - struct obj_s* ap, - struct cntx_s* cntx, - struct rntm_s* rntm, + mdim_t mat, + mem_t* mem, + struct obj_s* a, + struct obj_s* ap, + struct cntx_s* cntx, + struct rntm_s* rntm, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( - struct obj_s* a, - struct obj_s* b, - struct 
obj_s* c, - struct cntx_s* cntx, - struct rntm_s* rntm, + struct obj_s* a, + struct obj_s* b, + struct obj_s* c, + struct cntx_s* cntx, + struct rntm_s* rntm, struct thrinfo_s* thread ); @@ -1303,13 +1303,14 @@ typedef struct obj_s dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel - // User data pointer - void* user_data; + // User data pointer + void* user_data; + + // Function pointers + obj_pack_fn_t pack; + obj_ker_fn_t ker; + obj_ukr_fn_t ukr; - // Function pointers - obj_pack_fn_t pack; - obj_ker_fn_t ker; - obj_ukr_fn_t ukr; } obj_t; // Pre-initializors. Things that must be set afterwards: @@ -1348,11 +1349,11 @@ typedef struct obj_s .m_panel = 0, \ .n_panel = 0, \ \ - .user_data = NULL, \ + .user_data = NULL, \ \ - .pack = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack = NULL, \ + .ker = NULL, \ + .ukr = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ @@ -1382,11 +1383,11 @@ typedef struct obj_s .m_panel = 0, \ .n_panel = 0, \ \ - .user_data = NULL, \ + .user_data = NULL, \ \ - .pack = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack = NULL, \ + .ker = NULL, \ + .ukr = NULL \ } // Define these macros here since they must be updated if contents of From 4a955e939044cfd2048cf9f3e33024e3ad1fbe00 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 16 Aug 2021 13:49:27 -0500 Subject: [PATCH 055/226] Tweaks to gemmlike to facilitate 3rd party mods. Details: - Changed the implementation in the 'gemmlike' sandbox to more easily allow others to provide custom implementations of packm. These changes include: - Calling a local version of packm_cxk() that can be modified. This version of packm_cxk() uses inlined loops in packm_cxk() rather than querying the context for packm kernels (or even using scal2m). - Providing two variants of packm, one of which calls the aforementioned packm_cxk(), the other of which inlines the contents of packm_cxk() into the variant itself, making it self-contained. 
To switch from one to the other, simply change which function gets called within bls_packm_a() and bls_packm_b(). - Simplified and cleaned up some variable names in both variants of packm, relative to their parent code. --- sandbox/gemmlike/bli_sandbox.h | 2 + sandbox/gemmlike/bls_l3_packm_a.c | 2 +- sandbox/gemmlike/bls_l3_packm_b.c | 2 +- sandbox/gemmlike/bls_l3_packm_var.h | 5 + ...bls_l3_packm_var.c => bls_l3_packm_var1.c} | 35 ++- sandbox/gemmlike/bls_l3_packm_var2.c | 244 ++++++++++++++++++ sandbox/gemmlike/bls_packm_cxk.c | 161 ++++++++++++ sandbox/gemmlike/bls_packm_cxk.h | 58 +++++ 8 files changed, 487 insertions(+), 22 deletions(-) rename sandbox/gemmlike/{bls_l3_packm_var.c => bls_l3_packm_var1.c} (90%) create mode 100644 sandbox/gemmlike/bls_l3_packm_var2.c create mode 100644 sandbox/gemmlike/bls_packm_cxk.c create mode 100644 sandbox/gemmlike/bls_packm_cxk.h diff --git a/sandbox/gemmlike/bli_sandbox.h b/sandbox/gemmlike/bli_sandbox.h index d6e6522e8c..a396c97690 100644 --- a/sandbox/gemmlike/bli_sandbox.h +++ b/sandbox/gemmlike/bli_sandbox.h @@ -50,6 +50,8 @@ #include "bls_l3_packm_b.h" #include "bls_l3_packm_var.h" +#include "bls_packm_cxk.h" + #include "bls_l3_decor.h" diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 0dcc531fdb..ebad20f356 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -300,7 +300,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Pack matrix A to the destination buffer chosen above. Here, the packed matrix is stored to column-stored MR x k micropanels. */ \ - PASTECH2(bls_,ch,packm_var1) \ + PASTECH2(bls_,ch,packm_var2) \ ( \ conj, \ schema, \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index 9d563109a6..4a4918ac1b 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -300,7 +300,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Pack matrix B to the destination buffer chosen above. 
Here, the packed matrix is stored to row-stored k x NR micropanels. */ \ - PASTECH2(bls_,ch,packm_var1) \ + PASTECH2(bls_,ch,packm_var2) \ ( \ conj, \ schema, \ diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h index 0e8eb9ee8a..c2c0520c64 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.h +++ b/sandbox/gemmlike/bls_l3_packm_var.h @@ -61,3 +61,8 @@ GENTPROT( double, d, packm_var1 ) GENTPROT( scomplex, c, packm_var1 ) GENTPROT( dcomplex, z, packm_var1 ) +//INSERT_GENTPROT_BASIC0( packm_var2 ) +GENTPROT( float, s, packm_var2 ) +GENTPROT( double, d, packm_var2 ) +GENTPROT( scomplex, c, packm_var2 ) +GENTPROT( dcomplex, z, packm_var2 ) diff --git a/sandbox/gemmlike/bls_l3_packm_var.c b/sandbox/gemmlike/bls_l3_packm_var1.c similarity index 90% rename from sandbox/gemmlike/bls_l3_packm_var.c rename to sandbox/gemmlike/bls_l3_packm_var1.c index 3265ef834d..c0649a9ec4 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -35,7 +35,7 @@ #include "blis.h" // -// Define BLAS-like interfaces to the variants. +// Variant 1 provides basic support for packing by calling packm_cxk(). // #undef GENTFUNC @@ -66,13 +66,11 @@ void PASTECH2(bls_,ch,varname) \ dim_t it, ic; \ dim_t ic0; \ doff_t ic_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ + dim_t panel_len; \ dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ + dim_t panel_dim; \ dim_t panel_dim_max; \ - inc_t vs_c; \ + inc_t incc; \ inc_t ldc; \ inc_t ldp; \ conj_t conjc; \ @@ -95,10 +93,10 @@ void PASTECH2(bls_,ch,varname) \ { \ /* Prepare to pack to row-stored column panels. */ \ iter_dim = n; \ - panel_len_full = m; \ + panel_len = m; \ panel_len_max = m_max; \ panel_dim_max = pd_p; \ - vs_c = cs_c; \ + incc = cs_c; \ ldc = rs_c; \ ldp = rs_p; \ } \ @@ -106,10 +104,10 @@ void PASTECH2(bls_,ch,varname) \ { \ /* Prepare to pack to column-stored row panels. 
*/ \ iter_dim = m; \ - panel_len_full = n; \ + panel_len = n; \ panel_len_max = n_max; \ panel_dim_max = pd_p; \ - vs_c = rs_c; \ + incc = rs_c; \ ldc = cs_c; \ ldp = cs_p; \ } \ @@ -147,31 +145,28 @@ void PASTECH2(bls_,ch,varname) \ for ( ic = ic0, it = 0; it < n_iter; \ ic += ic_inc, it += 1 ) \ { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ \ - ctype* restrict c_begin = c_cast + (ic )*vs_c; \ + ctype* restrict c_begin = c_cast + (ic )*incc; \ \ ctype* restrict c_use = c_begin; \ ctype* restrict p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ \ /* The definition of bli_packm_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. (The default is slab.) */ \ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ - PASTEMAC(ch,packm_cxk) \ + PASTECH2(bls_,ch,packm_cxk) \ ( \ conjc, \ schema, \ - panel_dim_i, \ + panel_dim, \ panel_dim_max, \ - panel_len_i, \ - panel_len_max_i, \ + panel_len, \ + panel_len_max, \ kappa_cast, \ - c_use, vs_c, ldc, \ + c_use, incc, ldc, \ p_use, ldp, \ cntx \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c new file mode 100644 index 0000000000..bf02f67112 --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -0,0 +1,244 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk(). 
+// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + dim_t panel_dim; \ + dim_t panel_dim_max; \ + inc_t incc; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to incidate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. 
*/ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_begin = c_cast + (ic )*incc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.) */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + /* NOTE: We assume here that kappa = 1 and therefore ignore it. If + we're wrong, this will get someone's attention. */ \ + if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + bli_abort(); \ +\ + /* Perform the packing, taking conjc into account. 
*/ \ + if ( bli_is_conj( conjc ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copyjs)( *cld, *pld ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copys)( *cld, *pld ); \ + } \ + } \ + } \ +\ + /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \ + if ( panel_dim < panel_dim_max ) \ + { \ + const dim_t i = panel_dim; \ + const dim_t m_edge = panel_dim_max - panel_dim; \ + const dim_t n_edge = panel_len_max; \ + ctype* restrict p_edge = p_use + (i )*1; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ +\ + /* If panel_len < panel_len_max, then we zero those unused columns. 
*/ \ + if ( panel_len < panel_len_max ) \ + { \ + const dim_t j = panel_len; \ + const dim_t m_edge = panel_dim_max; \ + const dim_t n_edge = panel_len_max - panel_len; \ + ctype* restrict p_edge = p_use + (j )*ldp; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ + } \ +\ +/* +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ + p_begin += ps_p; \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var1 ) +GENTFUNC( float, s, packm_var2 ) +GENTFUNC( double, d, packm_var2 ) +GENTFUNC( scomplex, c, packm_var2 ) +GENTFUNC( dcomplex, z, packm_var2 ) + diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c new file mode 100644 index 0000000000..1258e0af54 --- /dev/null +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -0,0 +1,161 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bls_,ch,opname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_dim_max, \ + dim_t panel_len, \ + dim_t panel_len_max, \ + ctype* kappa, \ + ctype* a, inc_t inca, inc_t lda, \ + ctype* p, inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + /* Note that we use panel_dim_max, not panel_dim, to query the packm + kernel function pointer. This means that we always use the same + kernel, even for edge cases. */ \ + num_t dt = PASTEMAC(ch,type); \ + l1mkr_t ker_id = panel_dim_max; \ +\ + PASTECH2(ch,opname,_ker_ft) f; \ +\ + /* Query the context for the packm kernel corresponding to the current + panel dimension, or kernel id. If the id is invalid, the function will + return NULL. */ \ + f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + /* NOTE: We've disabled calling packm micro-kernels from the context for + this implementation. To re-enable, change FALSE to TRUE in the + conditional below. 
*/ \ + if ( f != NULL && FALSE ) \ + { \ + f \ + ( \ + conja, \ + schema, \ + panel_dim, \ + panel_len, \ + panel_len_max, \ + kappa, \ + a, inca, lda, \ + p, ldp, \ + cntx \ + ); \ + } \ + else \ + { \ + /* NOTE: We assume here that kappa = 1 and therefore ignore it. If + we're wrong, this will get someone's attention. */ \ + if ( !PASTEMAC(ch,eq1)( *kappa ) ) \ + bli_abort(); \ +\ + /* Perform the packing, taking conja into account. */ \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* ald = a + (l )*lda + (d )*inca; \ + ctype* pld = p + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copyjs)( *ald, *pld ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* ald = a + (l )*lda + (d )*inca; \ + ctype* pld = p + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copys)( *ald, *pld ); \ + } \ + } \ + } \ +\ + /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \ + if ( panel_dim < panel_dim_max ) \ + { \ + const dim_t i = panel_dim; \ + const dim_t m_edge = panel_dim_max - panel_dim; \ + const dim_t n_edge = panel_len_max; \ + ctype* restrict p_edge = p + (i )*1; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ +\ + /* If panel_len < panel_len_max, then we zero those unused columns. 
*/ \ + if ( panel_len < panel_len_max ) \ + { \ + const dim_t j = panel_len; \ + const dim_t m_edge = panel_dim_max; \ + const dim_t n_edge = panel_len_max - panel_len; \ + ctype* restrict p_edge = p + (j )*ldp; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_cxk ) +GENTFUNC( float, s, packm_cxk ) +GENTFUNC( double, d, packm_cxk ) +GENTFUNC( scomplex, c, packm_cxk ) +GENTFUNC( dcomplex, z, packm_cxk ) + diff --git a/sandbox/gemmlike/bls_packm_cxk.h b/sandbox/gemmlike/bls_packm_cxk.h new file mode 100644 index 0000000000..f6582d64a7 --- /dev/null +++ b/sandbox/gemmlike/bls_packm_cxk.h @@ -0,0 +1,58 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_dim_max, \ + dim_t panel_len, \ + dim_t panel_len_max, \ + ctype* kappa, \ + ctype* a, inc_t inca, inc_t lda, \ + ctype* p, inc_t ldp, \ + cntx_t* cntx \ + ); + +//INSERT_GENTPROT_BASIC0( packm_cxk ) +GENTPROT( float, s, packm_cxk ) +GENTPROT( double, d, packm_cxk ) +GENTPROT( scomplex, c, packm_cxk ) +GENTPROT( dcomplex, z, packm_cxk ) + From 7144230cdb0653b70035ddd91f7f41e06ad8d011 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 18 Aug 2021 13:25:39 -0500 Subject: [PATCH 056/226] README.md citation updates (e.g. BLIS7 bibtex). 
--- README.md | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 25ebf73025..b0865012ac 100644 --- a/README.md +++ b/README.md @@ -580,7 +580,7 @@ Citations For those of you looking for the appropriate article to cite regarding BLIS, we recommend citing our -[first ACM TOMS journal paper]( https://dl.acm.org/doi/10.1145/2764454?cid=81314495332) +[first ACM TOMS journal paper](https://dl.acm.org/doi/10.1145/2764454?cid=81314495332) ([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis1_toms_rev3.pdf)): ``` @@ -594,12 +594,12 @@ recommend citing our month = {June}, year = {2015}, issue_date = {June 2015}, - url = {http://doi.acm.org/10.1145/2764454}, + url = {https://doi.acm.org/10.1145/2764454}, } ``` You may also cite the -[second ACM TOMS journal paper]( https://dl.acm.org/doi/10.1145/2755561?cid=81314495332) +[second ACM TOMS journal paper](https://dl.acm.org/doi/10.1145/2755561?cid=81314495332) ([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis2_toms_rev3.pdf)): ``` @@ -616,7 +616,7 @@ You may also cite the month = {June}, year = {2016}, issue_date = {June 2016}, - url = {http://doi.acm.org/10.1145/2755561}, + url = {https://doi.acm.org/10.1145/2755561}, } ``` @@ -653,12 +653,12 @@ for determining blocksize parameters in BLIS month = {August}, year = {2016}, issue_date = {August 2016}, - url = {http://doi.acm.org/10.1145/2925987}, + url = {https://doi.acm.org/10.1145/2925987}, } ``` A fifth paper, submitted to ACM TOMS, begins the study of so-called -[induced methods for complex matrix multiplication]( https://dl.acm.org/doi/10.1145/3086466?cid=81314495332) +[induced methods for complex matrix multiplication](https://dl.acm.org/doi/10.1145/3086466?cid=81314495332) ([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf)): ``` @@ -672,7 +672,7 @@ A fifth paper, submitted to ACM TOMS, begins the study of so-called month = 
{July}, year = {2017}, issue_date = {July 2017}, - url = {http://doi.acm.org/10.1145/3086466}, + url = {https://doi.acm.org/10.1145/3086466}, } ``` @@ -697,7 +697,7 @@ article and derives a ``` A seventh paper, submitted to ACM TOMS, explores the implementation of `gemm` for -[mixed-domain and/or mixed-precision](https://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf) operands +[mixed-domain and/or mixed-precision](https://dl.acm.org/doi/10.1145/3402225?cid=81314495332) operands ([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf)): ``` @@ -706,7 +706,13 @@ A seventh paper, submitted to ACM TOMS, explores the implementation of `gemm` fo title = {Supporting Mixed-domain Mixed-precision Matrix Multiplication within the BLIS Framework}, journal = {ACM Transactions on Mathematical Software}, - note = {submitted} + volume = {47}, + number = {2}, + pages = {12:1--12:26}, + month = {April}, + year = {2021}, + issue_date = {April 2021}, + url = {https://doi.org/10.1145/3402225}, } ``` From 3eccfd456e7e84052c9a429dcde1183a7ecfaa48 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 19 Aug 2021 13:22:10 -0500 Subject: [PATCH 057/226] Added local _check() code to gemmlike sandbox. Details: - Added code to the gemmlike sandbox that handles parameter checking. Previously, the gemmlike implementation called bli_gemm_check(), which resides within the BLIS framework proper. Certain modifications that a user may wish to perform on the sandbox, such as adding a new matrix or vector operand, would have required additional checks, and so these changes make it easier for such a person to implement those checks for their custom gemm-like operation. 
--- sandbox/gemmlike/bli_sandbox.h | 1 + sandbox/gemmlike/bls_gemm.c | 2 +- sandbox/gemmlike/bls_gemm_check.c | 122 ++++++++++++++++++++++++++++++ sandbox/gemmlike/bls_gemm_check.h | 49 ++++++++++++ 4 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 sandbox/gemmlike/bls_gemm_check.c create mode 100644 sandbox/gemmlike/bls_gemm_check.h diff --git a/sandbox/gemmlike/bli_sandbox.h b/sandbox/gemmlike/bli_sandbox.h index a396c97690..f3782b3dbc 100644 --- a/sandbox/gemmlike/bli_sandbox.h +++ b/sandbox/gemmlike/bli_sandbox.h @@ -44,6 +44,7 @@ // made available to applications (or the framework) during compilation. #include "bls_gemm.h" +#include "bls_gemm_check.h" #include "bls_gemm_var.h" #include "bls_l3_packm_a.h" diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index 3e4c9b2a33..4ee3a773f2 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -94,7 +94,7 @@ void bls_gemm_ex // Check parameters. if ( bli_error_checking_is_enabled() ) { - bli_gemm_check( alpha, a, b, beta, c, cntx ); + bls_gemm_check( alpha, a, b, beta, c, cntx ); } // If C has a zero dimension, return early. diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c new file mode 100644 index 0000000000..bd6c2647e2 --- /dev/null +++ b/sandbox/gemmlike/bls_gemm_check.c @@ -0,0 +1,122 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bls_gemm_check + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ) +{ + //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_noninteger_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_noninteger_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( c ); + bli_check_error_code( e_val ); + + // Check scalar/vector/matrix type. 
+ + e_val = bli_check_scalar_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( c ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( c ); + bli_check_error_code( e_val ); + + // Check for sufficiently sized stack buffers + + e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_level3_dims( a, b, c ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + // NOTE: We only perform these tests when mixed datatype support is + // disabled. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); +} + diff --git a/sandbox/gemmlike/bls_gemm_check.h b/sandbox/gemmlike/bls_gemm_check.h new file mode 100644 index 0000000000..8b97069911 --- /dev/null +++ b/sandbox/gemmlike/bls_gemm_check.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based check functions. +// + +void bls_gemm_check + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ); + From 3b275f810b2479eb5d6cf2296e97a658cf1bb769 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 19 Aug 2021 16:06:46 -0500 Subject: [PATCH 058/226] Minor tweaks to gemmlike sandbox. 
Details: - In the gemmlike sandbox, changed the loop index variable of inner loop of packm_cxk() from 'd' to 'i' (and likewise for the corresponding inlined code within packm_var2()). - Pack matrices A and B using packm_var1() instead of packm_var2(). --- sandbox/gemmlike/bls_l3_packm_a.c | 2 +- sandbox/gemmlike/bls_l3_packm_b.c | 2 +- sandbox/gemmlike/bls_l3_packm_var2.c | 16 ++++++++-------- sandbox/gemmlike/bls_packm_cxk.c | 16 ++++++++-------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index ebad20f356..0dcc531fdb 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -300,7 +300,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Pack matrix A to the destination buffer chosen above. Here, the packed matrix is stored to column-stored MR x k micropanels. */ \ - PASTECH2(bls_,ch,packm_var2) \ + PASTECH2(bls_,ch,packm_var1) \ ( \ conj, \ schema, \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index 4a4918ac1b..9d563109a6 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -300,7 +300,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Pack matrix B to the destination buffer chosen above. Here, the packed matrix is stored to row-stored k x NR micropanels. 
*/ \ - PASTECH2(bls_,ch,packm_var2) \ + PASTECH2(bls_,ch,packm_var1) \ ( \ conj, \ schema, \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index bf02f67112..8d2b90cac1 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -167,12 +167,12 @@ void PASTECH2(bls_,ch,varname) \ { \ for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - for ( dim_t d = 0; d < panel_dim; ++d ) \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ { \ - ctype* cld = c_use + (l )*ldc + (d )*incc; \ - ctype* pld = p_use + (l )*ldp + (d )*1; \ + ctype* cli = c_use + (l )*ldc + (i )*incc; \ + ctype* pli = p_use + (l )*ldp + (i )*1; \ \ - PASTEMAC(ch,copyjs)( *cld, *pld ); \ + PASTEMAC(ch,copyjs)( *cli, *pli ); \ } \ } \ } \ @@ -180,12 +180,12 @@ void PASTECH2(bls_,ch,varname) \ { \ for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - for ( dim_t d = 0; d < panel_dim; ++d ) \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ { \ - ctype* cld = c_use + (l )*ldc + (d )*incc; \ - ctype* pld = p_use + (l )*ldp + (d )*1; \ + ctype* cli = c_use + (l )*ldc + (i )*incc; \ + ctype* pli = p_use + (l )*ldp + (i )*1; \ \ - PASTEMAC(ch,copys)( *cld, *pld ); \ + PASTEMAC(ch,copys)( *cli, *pli ); \ } \ } \ } \ diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c index 1258e0af54..ca11c207c0 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -96,12 +96,12 @@ void PASTECH2(bls_,ch,opname) \ { \ for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - for ( dim_t d = 0; d < panel_dim; ++d ) \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ { \ - ctype* ald = a + (l )*lda + (d )*inca; \ - ctype* pld = p + (l )*ldp + (d )*1; \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ \ - PASTEMAC(ch,copyjs)( *ald, *pld ); \ + PASTEMAC(ch,copyjs)( *ali, *pli ); \ } \ } \ } \ @@ -109,12 +109,12 @@ void PASTECH2(bls_,ch,opname) \ { \ for ( dim_t l = 0; l < panel_len; ++l ) \ { \ - 
for ( dim_t d = 0; d < panel_dim; ++d ) \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ { \ - ctype* ald = a + (l )*lda + (d )*inca; \ - ctype* pld = p + (l )*ldp + (d )*1; \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ \ - PASTEMAC(ch,copys)( *ald, *pld ); \ + PASTEMAC(ch,copys)( *ali, *pli ); \ } \ } \ } \ From e320ec6d5cd44e03cb2e2faa1d7625e84f76d668 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 20 Aug 2021 17:15:20 -0500 Subject: [PATCH 059/226] Moved lang defs from _macro_def.h to _lang_defs.h. Details: - Moved miscellaneous language-related definitions, including defs related to the handling of the 'restrict' keyword, from the top half of bli_macro_defs.h into a new file, bli_lang_defs.h, which is now #included immediately after "bli_system.h" in blis.h. This change is an attempt to fix a report of recent breakage of C++ compilers due to the recent introduction of 'restrict' in bli_type_defs.h (which previously was being included *before* bli_macro_defs.h and its restrict handling therein). Thanks to Ivan Korostelev for reporting this issue in #527. - CREDITS file update. 
--- CREDITS | 1 + frame/include/bli_lang_defs.h | 111 +++++++++++++++++++++++++++++++++ frame/include/bli_macro_defs.h | 71 --------------------- frame/include/blis.h | 5 +- 4 files changed, 115 insertions(+), 73 deletions(-) create mode 100644 frame/include/bli_lang_defs.h diff --git a/CREDITS b/CREDITS index b77e08098e..827d63e686 100644 --- a/CREDITS +++ b/CREDITS @@ -49,6 +49,7 @@ but many others have contributed code and feedback, including Tony Kelman @tkelman Lee Killough @leekillough (Cray) Mike Kistler @mkistler (IBM, Austin Research Laboratory) + Ivan Korostelev @ivan23kor (University of Alberta) Kyungmin Lee @kyungminlee (Ohio State University) Michael Lehn @michael-lehn Shmuel Levine @ShmuelLevine diff --git a/frame/include/bli_lang_defs.h b/frame/include/bli_lang_defs.h new file mode 100644 index 0000000000..8cf3f99862 --- /dev/null +++ b/frame/include/bli_lang_defs.h @@ -0,0 +1,111 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_LANG_DEFS_H +#define BLIS_LANG_DEFS_H + + +// -- Undefine restrict for C++ and C89/90 -- + +#ifdef __cplusplus + // Language is C++; define restrict as nothing. + #ifndef restrict + #define restrict + #endif +#elif __STDC_VERSION__ >= 199901L + // Language is C99 (or later); do nothing since restrict is recognized. +#else + // Language is pre-C99; define restrict as nothing. + #ifndef restrict + #define restrict + #endif +#endif + + +// -- Define typeof() operator if using non-GNU compiler -- + +#ifndef __GNUC__ + #define typeof __typeof__ +#else + #ifndef typeof + #define typeof __typeof__ + #endif +#endif + + +// -- BLIS Thread Local Storage Keyword -- + +// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. +// There is a small risk here as __GNUC__ can also be defined by some other +// compiler (other than ICC and CLANG which we know define it) that +// doesn't support __thread, as __GNUC__ is not quite unique to GCC. +// But the possibility of someone using such non-main-stream compiler +// for building BLIS is low. 
+#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) + #define BLIS_THREAD_LOCAL __thread +#else + #define BLIS_THREAD_LOCAL +#endif + + +// -- BLIS constructor/destructor function attribute -- + +// __attribute__((constructor/destructor)) is supported by GCC only. +// There is a small risk here as __GNUC__ can also be defined by some other +// compiler (other than ICC and CLANG which we know define it) that +// doesn't support this, as __GNUC__ is not quite unique to GCC. +// But the possibility of someone using such non-main-stream compiler +// for building BLIS is low. + +#if defined(__ICC) || defined(__INTEL_COMPILER) + // ICC defines __GNUC__ but doesn't support this + #define BLIS_ATTRIB_CTOR + #define BLIS_ATTRIB_DTOR +#elif defined(__clang__) + // CLANG supports __attribute__, but its documentation doesn't + // mention support for constructor/destructor. Compiling with + // clang and testing shows that it does support. + #define BLIS_ATTRIB_CTOR __attribute__((constructor)) + #define BLIS_ATTRIB_DTOR __attribute__((destructor)) +#elif defined(__GNUC__) + #define BLIS_ATTRIB_CTOR __attribute__((constructor)) + #define BLIS_ATTRIB_DTOR __attribute__((destructor)) +#else + #define BLIS_ATTRIB_CTOR + #define BLIS_ATTRIB_DTOR +#endif + + +#endif diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 907a5a26c8..03451d4407 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -37,77 +37,6 @@ #define BLIS_MACRO_DEFS_H -// -- Undefine restrict for C++ and C89/90 -- - -#ifdef __cplusplus - // Language is C++; define restrict as nothing. - #ifndef restrict - #define restrict - #endif -#elif __STDC_VERSION__ >= 199901L - // Language is C99 (or later); do nothing since restrict is recognized. -#else - // Language is pre-C99; define restrict as nothing. 
- #ifndef restrict - #define restrict - #endif -#endif - - -// -- Define typeof() operator if using non-GNU compiler -- - -#ifndef __GNUC__ - #define typeof __typeof__ -#else - #ifndef typeof - #define typeof __typeof__ - #endif -#endif - - -// -- BLIS Thread Local Storage Keyword -- - -// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. -// There is a small risk here as __GNUC__ can also be defined by some other -// compiler (other than ICC and CLANG which we know define it) that -// doesn't support __thread, as __GNUC__ is not quite unique to GCC. -// But the possibility of someone using such non-main-stream compiler -// for building BLIS is low. -#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) - #define BLIS_THREAD_LOCAL __thread -#else - #define BLIS_THREAD_LOCAL -#endif - - -// -- BLIS constructor/destructor function attribute -- - -// __attribute__((constructor/destructor)) is supported by GCC only. -// There is a small risk here as __GNUC__ can also be defined by some other -// compiler (other than ICC and CLANG which we know define it) that -// doesn't support this, as __GNUC__ is not quite unique to GCC. -// But the possibility of someone using such non-main-stream compiler -// for building BLIS is low. - -#if defined(__ICC) || defined(__INTEL_COMPILER) - // ICC defines __GNUC__ but doesn't support this - #define BLIS_ATTRIB_CTOR - #define BLIS_ATTRIB_DTOR -#elif defined(__clang__) - // CLANG supports __attribute__, but its documentation doesn't - // mention support for constructor/destructor. Compiling with - // clang and testing shows that it does support. 
- #define BLIS_ATTRIB_CTOR __attribute__((constructor)) - #define BLIS_ATTRIB_DTOR __attribute__((destructor)) -#elif defined(__GNUC__) - #define BLIS_ATTRIB_CTOR __attribute__((constructor)) - #define BLIS_ATTRIB_DTOR __attribute__((destructor)) -#else - #define BLIS_ATTRIB_CTOR - #define BLIS_ATTRIB_DTOR -#endif - - // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" diff --git a/frame/include/blis.h b/frame/include/blis.h index 61b7a0f82f..a42c7cce84 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -48,10 +48,11 @@ extern "C" { // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. -// -- System headers -- -// NOTE: This header must be included before bli_config_macro_defs.h. +// -- System and language-related headers -- +// NOTE: bli_system.h header must be included before bli_config_macro_defs.h. #include "bli_system.h" +#include "bli_lang_defs.h" // -- configure definitions -- From 5fc65cdd9e4134c5dcb16d21cd4a79ff426ca9f3 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 21 Aug 2021 15:59:27 -0500 Subject: [PATCH 060/226] Add test to Travis using C++ compiler to make sure blis.h is C++-compatible. --- .travis.yml | 1 + travis/cxx/Makefile | 38 +++++++++++++++++++++++++++ travis/cxx/cxx-test.cxx | 50 +++++++++++++++++++++++++++++++++++ travis/cxx/cxx-test.sh | 58 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+) create mode 100644 travis/cxx/Makefile create mode 100644 travis/cxx/cxx-test.cxx create mode 100755 travis/cxx/cxx-test.sh diff --git a/.travis.yml b/.travis.yml index 51e9cf75fc..22b9fda033 100644 --- a/.travis.yml +++ b/.travis.yml @@ -81,6 +81,7 @@ script: - $CC --version - make -j 2 - make install +- $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include) # Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx). 
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi diff --git a/travis/cxx/Makefile b/travis/cxx/Makefile new file mode 100644 index 0000000000..d7f401c1a4 --- /dev/null +++ b/travis/cxx/Makefile @@ -0,0 +1,38 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2021, Southern Methodist University +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# + +.PHONY: all cxx-test + +all: cxx-test + $(CXX) -std=c++0x -o cxx-test.x -I$(INCLUDE_DIR) cxx-test.cxx -L$(LIB_DIR) -lblis diff --git a/travis/cxx/cxx-test.cxx b/travis/cxx/cxx-test.cxx new file mode 100644 index 0000000000..bccbd9e430 --- /dev/null +++ b/travis/cxx/cxx-test.cxx @@ -0,0 +1,50 @@ +// +// +// BLIS +// An object-based framework for developing high-performance BLAS-like +// libraries. +// +// Copyright (C) 2021, Southern Methodist University +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// - Neither the name(s) of the copyright holder(s) nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// + +#include <vector> + +#include "blis.h" + +int main() +{ + const int N = 5; + std::vector<scomplex> A(N*N), B(N*N), C(N*N); + scomplex one{1.0, 0.0}; + scomplex zero{0.0, 0.0}; + + bli_cgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, N, N, N, + &one, A.data(), 1, N, + B.data(), 1, N, + &zero, C.data(), 1, N); +} diff --git a/travis/cxx/cxx-test.sh b/travis/cxx/cxx-test.sh new file mode 100755 index 0000000000..f62937d89c --- /dev/null +++ b/travis/cxx/cxx-test.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2021, Southern Methodist University +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +SOURCE_DIR=$1 +CONFIG=$2 + +if [ -z $SOURCE_DIR ] || [ -z $CONFIG ]; then + echo "usage: cxx-test.sh " + exit 1 +fi + +BUILD_DIR=$(pwd) +INCLUDE_DIR=$BUILD_DIR/include/$CONFIG +LIB_DIR=$BUILD_DIR/lib/$CONFIG + +if [ ! -e $INCLUDE_DIR/blis.h ]; then + echo "could not find blis.h" + exit 1 +fi + +if [ ! -e $SOURCE_DIR/travis/cxx/Makefile ]; then + echo "could not find cxx-test Makefile" + exit 1 +fi + +make -C $SOURCE_DIR/travis/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR From 2f7325b2b770a15ff8aaaecc087b22238f0c67b7 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 23 Aug 2021 15:04:05 -0500 Subject: [PATCH 061/226] Blacklist clang10/gcc9 and older for 'armsve'. Details: - Prohibit use of clang 10.x and older or gcc 9.x and older for the 'armsve' subconfiguration. Addresses issue #535. --- configure | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configure b/configure index fd4f490386..eede1e782d 100755 --- a/configure +++ b/configure @@ -1488,6 +1488,8 @@ check_compiler() # cortexa15: any # cortexa9: any # + # armsve: clang11+, gcc10+ + # # generic: any # # Note: These compiler requirements were originally modeled after similar @@ -1533,6 +1535,9 @@ check_compiler() # gcc 5.x may support POWER9 but it is unverified. 
blacklistcc_add "power9" fi + if [ ${cc_major} -lt 10 ]; then + blacklistcc_add "armsve" + fi fi # icc @@ -1595,6 +1600,9 @@ check_compiler() #blacklistcc_add "zen" : # explicit no-op since bash can't handle empty loop bodies. fi + if [ ${cc_major} -lt 11 ]; then + blacklistcc_add "armsve" + fi fi fi } From d6eb70fbc382ad7732dedb4afa01cf9f53e3e027 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 26 Aug 2021 13:12:39 -0500 Subject: [PATCH 062/226] Updated stale calls to malloc_intl() in gemmlike. Details: - Updated two out-of-date calls to bli_malloc_intl() within the gemmlike sandbox. These calls to malloc_intl(), which resided in bls_l3_decor_pthreads.c, were missing the err_t argument that the function uses to report errors. Thanks to Jeff Diamond for helping isolate this issue. --- sandbox/gemmlike/thread/bls_l3_decor_pthreads.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c index 0a4012029a..ff723a4ce4 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c @@ -121,6 +121,8 @@ void bls_l3_thread_decorator rntm_t* rntm ) { + err_t r_val; + // Query the total number of threads from the context. 
const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -151,12 +153,12 @@ void bls_l3_thread_decorator #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); // NOTE: We must iterate backwards so that the chief thread (thread id 0) // can spawn all other threads before proceeding with its own computation. From 8e0c4255de52a0a5cffecbebf6314aa52120ebe4 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 26 Aug 2021 15:29:18 -0500 Subject: [PATCH 063/226] Define BLIS_OS_NONE when using --disable-system. Details: - Modified bli_system.h so that the cpp macro BLIS_OS_NONE is defined when BLIS_DISABLE_SYSTEM is defined. Otherwise, the previous OS- detecting macro conditionals are considered. This change is to accommodate a solution to a cross-compilation issue described in #532. --- frame/include/bli_system.h | 48 +++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index db326dd874..79333017b9 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -70,28 +70,32 @@ #endif // Determine the target operating system. 
-#if defined(_WIN32) || defined(__CYGWIN__) - #define BLIS_OS_WINDOWS 1 -#elif defined(__gnu_hurd__) - #define BLIS_OS_GNU 1 -#elif defined(__APPLE__) || defined(__MACH__) - #define BLIS_OS_OSX 1 -#elif defined(__ANDROID__) - #define BLIS_OS_ANDROID 1 -#elif defined(__linux__) - #define BLIS_OS_LINUX 1 -#elif defined(__bgq__) - #define BLIS_OS_BGQ 1 -#elif defined(__bg__) - #define BLIS_OS_BGP 1 -#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__bsdi__) || defined(__DragonFly__) || \ - defined(__FreeBSD_kernel__) || defined(__HAIKU__) - #define BLIS_OS_BSD 1 -#elif defined(EMSCRIPTEN) - #define BLIS_OS_EMSCRIPTEN -#else - #error "Cannot determine operating system" +#if defined(BLIS_ENABLE_SYSTEM) + #if defined(_WIN32) || defined(__CYGWIN__) + #define BLIS_OS_WINDOWS 1 + #elif defined(__gnu_hurd__) + #define BLIS_OS_GNU 1 + #elif defined(__APPLE__) || defined(__MACH__) + #define BLIS_OS_OSX 1 + #elif defined(__ANDROID__) + #define BLIS_OS_ANDROID 1 + #elif defined(__linux__) + #define BLIS_OS_LINUX 1 + #elif defined(__bgq__) + #define BLIS_OS_BGQ 1 + #elif defined(__bg__) + #define BLIS_OS_BGP 1 + #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__FreeBSD_kernel__) || defined(__HAIKU__) + #define BLIS_OS_BSD 1 + #elif defined(EMSCRIPTEN) + #define BLIS_OS_EMSCRIPTEN + #else + #error "Cannot determine operating system" + #endif +#else // #if defined(BLIS_DISABLE_SYSTEM) + #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. From 2be78fc97777148c83d20b8509e38aa1fc1b4540 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 27 Aug 2021 12:17:26 -0500 Subject: [PATCH 064/226] Disabled (at least temporarily) commit 8e0c425. Details: - Reverted changes in 8e0c425 due to AppVeyor build failures that we do not yet understand. 
--- frame/include/bli_system.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 79333017b9..2541018ac1 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -70,7 +70,7 @@ #endif // Determine the target operating system. -#if defined(BLIS_ENABLE_SYSTEM) +//#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) @@ -94,9 +94,9 @@ #else #error "Cannot determine operating system" #endif -#else // #if defined(BLIS_DISABLE_SYSTEM) - #define BLIS_OS_NONE -#endif +//#else // #if defined(BLIS_DISABLE_SYSTEM) +// #define BLIS_OS_NONE +//#endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS From ade10f427835d5274411cafc9618ac12966eb1e7 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 27 Aug 2021 12:47:12 -0500 Subject: [PATCH 065/226] Updated travis-ci.org link in README.md to .com. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b0865012ac..ced473e594 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png) -[![Build Status](https://travis-ci.org/flame/blis.svg?branch=master)](https://travis-ci.org/flame/blis) +[![Build Status](https://travis-ci.com/flame/blis.svg?branch=master)](https://travis-ci.com/flame/blis) [![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master) Contents From fbb3560cb8e2aeab205c47c2b096d4fa306d93db Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 10 Sep 2021 13:38:27 -0500 Subject: [PATCH 066/226] Attempt to fix cxx-test for OOT builds. 
--- travis/cxx/Makefile | 2 +- travis/cxx/cxx-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/cxx/Makefile b/travis/cxx/Makefile index d7f401c1a4..0f8da14e3b 100644 --- a/travis/cxx/Makefile +++ b/travis/cxx/Makefile @@ -35,4 +35,4 @@ .PHONY: all cxx-test all: cxx-test - $(CXX) -std=c++0x -o cxx-test.x -I$(INCLUDE_DIR) cxx-test.cxx -L$(LIB_DIR) -lblis + $(CXX) -std=c++0x -o $(BUILD_DIR)/cxx-test.x -I$(INCLUDE_DIR) cxx-test.cxx -L$(LIB_DIR) -lblis diff --git a/travis/cxx/cxx-test.sh b/travis/cxx/cxx-test.sh index f62937d89c..c0036611f4 100755 --- a/travis/cxx/cxx-test.sh +++ b/travis/cxx/cxx-test.sh @@ -55,4 +55,4 @@ if [ ! -e $SOURCE_DIR/travis/cxx/Makefile ]; then exit 1 fi -make -C $SOURCE_DIR/travis/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR +make -C $SOURCE_DIR/travis/cxx INCLUDE_DIR=$INCLUDE_DIR LIB_DIR=$LIB_DIR BUILD_DIR=$BUILD_DIR From e486d666ffefee790d5e39895222b575886ac1ea Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 10 Sep 2021 13:50:16 -0500 Subject: [PATCH 067/226] Use C++ cross-compiler for ARM tests. 
--- .travis.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 22b9fda033..8b4c22939e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -53,19 +53,22 @@ matrix: - os: linux compiler: arm-linux-gnueabihf-gcc env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \ - PACKAGES="gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \ + CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ \ + PACKAGES="gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/" # cortexa57 build and fast testsuite (qemu) - os: linux compiler: aarch64-linux-gnu-gcc env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \ - PACKAGES="gcc-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ + CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ \ + PACKAGES="gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/" # armsve build and fast testsuite (qemu) - os: linux compiler: aarch64-linux-gnu-gcc-10 env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \ - PACKAGES="gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ + CC=arm-linux-gnueabihf-gcc-10 CXX=arm-linux-gnueabihf-g++-10 \ + PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/" install: - if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi From c76fcad0c2836e7140b6bef3942e0a632a5f2cda Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 10 Sep 2021 13:57:02 -0500 Subject: [PATCH 068/226] Fix AArch64 tests and consolidate some other tests. 
--- .travis.yml | 34 +++++++--------------------------- travis/do_testsuite.sh | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8b4c22939e..b3e0563d58 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,45 +3,25 @@ sudo: required dist: focal matrix: include: - # full testsuite (all tests except for mixed datatype) + # full testsuite (all tests + mixed datatype (gemm_nn only) + salt + SDE + OOT) - os: linux compiler: gcc - env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" \ - PACKAGES="gcc-8 binutils" - # mixed-datatype testsuite (gemm_nn only) - - os: linux - compiler: gcc - env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto" \ - PACKAGES="gcc-8 binutils" - # salt testsuite (fast set of operations+parameters) - - os: linux - compiler: gcc - env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto" \ - PACKAGES="gcc-8 binutils" - # test x86_64 ukrs with SDE - - os: linux - compiler: gcc - env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" \ + env: OOT=1 TEST=ALL SDE=1 THR="none" CONF="x86_64" \ PACKAGES="gcc-8 binutils" # openmp build - os: linux compiler: gcc - env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" \ + env: OOT=0 TEST=FAST SDE=0 THR="openmp" CONF="auto" \ PACKAGES="gcc-8 binutils" # pthreads build - os: linux compiler: gcc - env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" \ - PACKAGES="gcc-8 binutils" - # out-of-tree build - - os: linux - compiler: gcc - env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" \ + env: OOT=0 TEST=FAST SDE=0 THR="pthreads" CONF="auto" \ PACKAGES="gcc-8 binutils" # clang build - os: linux compiler: clang - env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto" # There seems to be some difficulty installing 2 Clang toolchains of different versions. # Use the TravisCI default. 
# PACKAGES="clang-8 binutils" @@ -60,14 +40,14 @@ matrix: - os: linux compiler: aarch64-linux-gnu-gcc env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \ - CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ \ + CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ \ PACKAGES="gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/" # armsve build and fast testsuite (qemu) - os: linux compiler: aarch64-linux-gnu-gcc-10 env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \ - CC=arm-linux-gnueabihf-gcc-10 CXX=arm-linux-gnueabihf-g++-10 \ + CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \ PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/" install: diff --git a/travis/do_testsuite.sh b/travis/do_testsuite.sh index bb176b6819..6778f81d85 100755 --- a/travis/do_testsuite.sh +++ b/travis/do_testsuite.sh @@ -8,19 +8,28 @@ export BLIS_IC_NT=2 export BLIS_JR_NT=1 export BLIS_IR_NT=1 -if [ "$TEST" = "FAST" ]; then +if [ "$TEST" = "FAST" -o "$TEST" = "ALL" ]; then make testblis-fast -elif [ "$TEST" = "MD" ]; then + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite +fi + +if [ "$TEST" = "MD" -o "$TEST" = "ALL" ]; then make testblis-md -elif [ "$TEST" = "SALT" ]; then + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite +fi + +if [ "$TEST" = "SALT" -o "$TEST" = "ALL" ]; then # Disable multithreading within BLIS. 
export BLIS_JC_NT=1 BLIS_IC_NT=1 BLIS_JR_NT=1 BLIS_IR_NT=1 make testblis-salt -else + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite +fi + +if [ "$TEST" = "1" -o "$TEST" = "ALL" ]; then make testblis + $DIST_PATH/testsuite/check-blistest.sh ./output.testsuite fi -$DIST_PATH/testsuite/check-blistest.sh ./output.testsuite make testblas $DIST_PATH/blastest/check-blastest.sh From 98ce6e8bc916e952510872caa60d818d62a31e69 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 10 Sep 2021 14:12:13 -0500 Subject: [PATCH 069/226] Do a fast test on OSX. [ci skip] --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b3e0563d58..6bd123ec04 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,7 @@ matrix: # macOS with system compiler (clang) - os: osx compiler: clang - env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" + env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto" # cortexa15 build and fast testsuite (qemu) - os: linux compiler: arm-linux-gnueabihf-gcc From 5191c43faccf45975f577c60b9089abee25722c9 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 16 Sep 2021 10:16:17 -0500 Subject: [PATCH 070/226] Fix more copy-paste errors in the haswell gemmsup code. Fixes #486. --- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 466 +++++++++--------- 1 file changed, 233 insertions(+), 233 deletions(-) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index 4c6094b1cd..21dd3b895b 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -101,7 +101,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -172,19 +172,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4 prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -250,7 +250,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -312,27 +312,27 @@ void bli_dgemmsup_rd_haswell_asm_6x4 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -343,7 +343,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -365,21 +365,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -387,12 +387,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -414,12 +414,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -427,11 +427,11 @@ void bli_dgemmsup_rd_haswell_asm_6x4 label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -469,7 +469,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 // xmm6[0:3] = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -477,73 +477,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -560,7 +560,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 label(.DRETURN) - + end_asm( : // output operands (none) @@ -629,7 +629,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. 
mov(var(a), rax) // load address of a. @@ -649,7 +649,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -682,7 +682,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 //lea(mem(r14), rax) // rax = a; //lea(mem(rdx), rbx) // rbx = b; - + #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -690,18 +690,18 @@ void bli_dgemmsup_rd_haswell_asm_2x4 prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -730,7 +730,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -756,7 +756,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -807,27 +807,27 @@ void bli_dgemmsup_rd_haswell_asm_2x4 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -836,7 +836,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -854,21 +854,21 @@ void bli_dgemmsup_rd_haswell_asm_2x4 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -876,11 +876,11 @@ void bli_dgemmsup_rd_haswell_asm_2x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -898,12 +898,12 @@ void bli_dgemmsup_rd_haswell_asm_2x4 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -911,10 +911,10 @@ void bli_dgemmsup_rd_haswell_asm_2x4 label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -943,75 +943,75 @@ void bli_dgemmsup_rd_haswell_asm_2x4 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -1079,7 +1079,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
@@ -1099,7 +1099,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1128,26 +1128,26 @@ void bli_dgemmsup_rd_haswell_asm_1x4 //lea(mem(r14), rax) // rax = a; //lea(mem(rdx), rbx) // rbx = b; - + #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c + //prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1170,7 +1170,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -1191,7 +1191,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1231,27 +1231,27 @@ void bli_dgemmsup_rd_haswell_asm_1x4 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1259,7 +1259,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -1273,21 +1273,21 @@ void bli_dgemmsup_rd_haswell_asm_1x4 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1295,10 +1295,10 @@ void bli_dgemmsup_rd_haswell_asm_1x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -1312,12 +1312,12 @@ void bli_dgemmsup_rd_haswell_asm_1x4 add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -1325,9 +1325,9 @@ void bli_dgemmsup_rd_haswell_asm_1x4 label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 - + + // ymm4 ymm7 ymm10 ymm13 + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1339,15 +1339,15 @@ void bli_dgemmsup_rd_haswell_asm_1x4 vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) - vhaddpd( ymm8, ymm5, ymm0 ) - vextractf128(imm(1), ymm0, xmm1 ) - vaddpd( xmm0, xmm1, xmm0 ) + //vhaddpd( ymm8, ymm5, ymm0 ) + //vextractf128(imm(1), ymm0, xmm1 ) + //vaddpd( xmm0, xmm1, xmm0 ) - vhaddpd( ymm14, ymm11, ymm2 ) - vextractf128(imm(1), ymm2, xmm1 ) - vaddpd( xmm2, xmm1, xmm2 ) + //vhaddpd( ymm14, ymm11, ymm2 ) + //vextractf128(imm(1), ymm2, xmm1 ) + //vaddpd( xmm2, xmm1, xmm2 ) - vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) + //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) @@ -1355,67 +1355,67 @@ void bli_dgemmsup_rd_haswell_asm_1x4 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) label(.DRETURN) - - + + end_asm( : // output operands (none) From e3dc1954ffb5eee2a8b41fce85ba589f75770eea Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 16 Sep 2021 10:59:37 -0500 Subject: [PATCH 071/226] Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell. The fix is to use the same (valid) source register twice in the horizontal addition. --- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c | 624 +++++++++--------- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 11 - 2 files changed, 312 insertions(+), 323 deletions(-) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c index 6e3c1a0e85..457ef9f22d 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c @@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x1 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -163,19 +163,19 @@ void bli_dgemmsup_rd_haswell_asm_6x1 prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -206,7 +206,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -233,7 +233,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -287,27 +287,27 @@ void bli_dgemmsup_rd_haswell_asm_6x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -336,21 +336,21 @@ void bli_dgemmsup_rd_haswell_asm_6x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -358,7 +358,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -381,12 +381,12 @@ void bli_dgemmsup_rd_haswell_asm_6x1 add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -399,28 +399,28 @@ void bli_dgemmsup_rd_haswell_asm_6x1 // ymm10 // ymm12 // ymm14 - - vhaddpd( ymm5, ymm4, ymm0 ) + + vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) - vhaddpd( ymm7, ymm6, ymm0 ) + vhaddpd( ymm6, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) - vhaddpd( ymm9, ymm8, ymm0 ) + vhaddpd( ymm8, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) - vhaddpd( ymm11, ymm10, ymm0 ) + vhaddpd( ymm10, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) - vhaddpd( ymm13, ymm12, ymm0 ) + vhaddpd( ymm12, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) - vhaddpd( ymm15, ymm14, ymm0 ) + vhaddpd( ymm14, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) @@ -435,114 +435,114 @@ void bli_dgemmsup_rd_haswell_asm_6x1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -613,9 +613,9 @@ void bli_dgemmsup_rd_haswell_asm_3x1 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -633,7 +633,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -671,19 +671,19 @@ void bli_dgemmsup_rd_haswell_asm_3x1 prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -705,7 +705,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -723,7 +723,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -759,27 +759,27 @@ void bli_dgemmsup_rd_haswell_asm_3x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -799,21 +799,21 @@ void bli_dgemmsup_rd_haswell_asm_3x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -821,7 +821,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -835,12 +835,12 @@ void bli_dgemmsup_rd_haswell_asm_3x1 add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -850,16 +850,16 @@ void bli_dgemmsup_rd_haswell_asm_3x1 // ymm4 // ymm6 // ymm8 - - vhaddpd( ymm5, ymm4, ymm0 ) + + vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) - vhaddpd( ymm7, ymm6, ymm0 ) + vhaddpd( ymm6, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) - vhaddpd( ymm9, ymm8, ymm0 ) + vhaddpd( ymm8, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) @@ -871,87 +871,87 @@ void bli_dgemmsup_rd_haswell_asm_3x1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -1022,9 +1022,9 @@ void bli_dgemmsup_rd_haswell_asm_2x1 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1042,7 +1042,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1078,19 +1078,19 @@ void bli_dgemmsup_rd_haswell_asm_2x1 prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1109,7 +1109,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1124,7 +1124,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1154,27 +1154,27 @@ void bli_dgemmsup_rd_haswell_asm_2x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1191,21 +1191,21 @@ void bli_dgemmsup_rd_haswell_asm_2x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1213,7 +1213,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1224,12 +1224,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1 add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -1238,12 +1238,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1 // ymm4 // ymm6 - - vhaddpd( ymm5, ymm4, ymm0 ) + + vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) - vhaddpd( ymm7, ymm6, ymm0 ) + vhaddpd( ymm6, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) @@ -1254,78 +1254,78 @@ void bli_dgemmsup_rd_haswell_asm_2x1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -1396,9 +1396,9 @@ void bli_dgemmsup_rd_haswell_asm_1x1 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1416,7 +1416,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1450,19 +1450,19 @@ void bli_dgemmsup_rd_haswell_asm_1x1 //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1478,7 +1478,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1490,7 +1490,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1514,27 +1514,27 @@ void bli_dgemmsup_rd_haswell_asm_1x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1548,21 +1548,21 @@ void bli_dgemmsup_rd_haswell_asm_1x1 add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1570,7 +1570,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1578,12 +1578,12 @@ void bli_dgemmsup_rd_haswell_asm_1x1 add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1591,8 +1591,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1 label(.DPOSTACCUM) // ymm4 - - vhaddpd( ymm5, ymm4, ymm0 ) + + vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -1602,69 +1602,69 @@ void bli_dgemmsup_rd_haswell_asm_1x1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - vmovsd(mem(rcx), xmm0) + + vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index 21dd3b895b..516bfced54 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -1338,17 +1338,6 @@ void bli_dgemmsup_rd_haswell_asm_1x4 vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) - - //vhaddpd( ymm8, ymm5, ymm0 ) - //vextractf128(imm(1), ymm0, xmm1 ) - //vaddpd( xmm0, xmm1, xmm0 ) - - //vhaddpd( ymm14, ymm11, ymm2 ) - //vextractf128(imm(1), ymm2, xmm1 ) - //vaddpd( xmm2, xmm1, xmm2 ) - - //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) - // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) From 849aae09f4fbf8d7abf11f4df1471f1d057e874b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 16 Sep 2021 14:47:45 -0500 Subject: [PATCH 072/226] Added new packm var3 to 'gemmlike'. Details: - Defined a new packm variant for the 'gemmlike' sandbox. This new variant (bls_l3_packm_var3.c) parallelizes the packing operation over the k dimension rather than the m or n dimensions. Note that the gemmlike implementation still uses var1 by default, and use of the new code would require changing bls_l3_packm_a.c and/or bls_l3_packm_b.c so that var3 is called instead. Thanks to Jeff Diamond for proposing this (perhaps NUMA-friendly) solution. 
--- sandbox/gemmlike/bls_l3_packm_var.h | 6 + sandbox/gemmlike/bls_l3_packm_var3.c | 200 +++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 sandbox/gemmlike/bls_l3_packm_var3.c diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h index c2c0520c64..98300536bc 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.h +++ b/sandbox/gemmlike/bls_l3_packm_var.h @@ -66,3 +66,9 @@ GENTPROT( float, s, packm_var2 ) GENTPROT( double, d, packm_var2 ) GENTPROT( scomplex, c, packm_var2 ) GENTPROT( dcomplex, z, packm_var2 ) + +//INSERT_GENTPROT_BASIC0( packm_var3 ) +GENTPROT( float, s, packm_var3 ) +GENTPROT( double, d, packm_var3 ) +GENTPROT( scomplex, c, packm_var3 ) +GENTPROT( dcomplex, z, packm_var3 ) diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c new file mode 100644 index 0000000000..5ea80ff424 --- /dev/null +++ b/sandbox/gemmlike/bls_l3_packm_var3.c @@ -0,0 +1,200 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Variant 3 is similar to variant 1, except that it parallelizes packing +// along the k dimension. (Our current hypothesis is that this method of +// parallelizing the operation may perform better on some NUMA systems.) +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bls_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + dim_t panel_dim; \ + dim_t panel_dim_max; \ + inc_t incc; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to indicate row or column storage.
Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t pr_start, pr_end; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. */ \ + bli_thread_range_sub( thread, panel_len, 1, FALSE, &pr_start, &pr_end ); \ +\ + /* Define instances of panel_len and panel_len_max that are specific to + the local thread. 
*/ \ + dim_t panel_len_loc = pr_end - pr_start; \ + dim_t panel_len_max_loc = panel_len_loc; \ +\ + /* If panel_len_max > panel_len, then there are some columns in p that + need to be zeroed. Of course, only the last thread will be responsible + for this edge region. */ \ + dim_t panel_len_zero = panel_len_max - panel_len; \ + if ( tid == nt - 1 ) panel_len_max_loc += panel_len_zero; \ +\ + /* Shift the pointer for c and p to the appropriate locations within the + first micropanel. */ \ + dim_t off_loc = pr_start; \ + ctype* restrict c_begin_loc = c_cast + off_loc * ldc; \ + ctype* restrict p_begin_loc = p_cast + off_loc * ldp; \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_use = c_begin_loc + (ic )*incc; \ + ctype* restrict p_use = p_begin_loc + (it )*ps_p; \ +\ + { \ + PASTECH2(bls_,ch,packm_cxk) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_dim_max, \ + panel_len_loc, \ + panel_len_max_loc, \ + kappa_cast, \ + c_use, incc, ldc, \ + p_use, ldp, \ + cntx \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var3 ) +GENTFUNC( float, s, packm_var3 ) +GENTFUNC( double, d, packm_var3 ) +GENTFUNC( scomplex, c, packm_var3 ) +GENTFUNC( dcomplex, z, packm_var3 ) + +/* +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var3: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var3: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ + From 52f29f739dbbb878c4cde36dbe26b82847acd4e9 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 17 Sep 2021 08:38:29 -0500 Subject: [PATCH 073/226] Removed last vestige of #define BLIS_NUM_ARCHS. Details: - Removed the commented-out #define BLIS_NUM_ARCHS in bli_type_defs.h and its associated (now outdated) comments. 
BLIS_NUM_ARCHS has been part of the arch_t enum for some time now, and so this change is mostly about removing any opportunity for confusion for people who may be reading the code. Thanks to Minh Quan Ho for leading me to cleanup. --- frame/include/bli_type_defs.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index f03fc72acd..adad202574 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1032,10 +1032,6 @@ typedef enum } arch_t; -// NOTE: This value must be updated to reflect the number of enum values -// listed above for arch_t! -//#define BLIS_NUM_ARCHS 25 - // // -- BLIS misc. structure types ----------------------------------------------- From fb93d242a4fef4694ce2680436da23087bbdd5fe Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 20 Sep 2021 15:42:08 -0500 Subject: [PATCH 074/226] Re-enable and fix 8e0c425 (BLIS_ENABLE_SYSTEM). Details: - Re-enable the changes originally made in 8e0c425 but quickly reverted in 2be78fc. - Moved the #include of bli_config.h so that it occurs before the #include of bli_system.h. This allows the #define BLIS_ENABLE_SYSTEM or #define BLIS_DISABLE_SYSTEM in bli_config.h to be processed by the time it is needed in bli_system.h. This change should have been in the original 8e0c425, but was accidentally omitted. Thanks to Minh Quan Ho for catching this. - Add #define BLIS_ENABLE_SYSTEM to config_detect.c so that the proper cpp conditional branch executes in bli_system.h when compiling the hardware detection binary. The changes made in 8e0c425 were an attempt to support the definition of BLIS_OS_NONE when configuring with --disable-system (in issue #532). That commit failed because, aside from the required but omitted header reordering (second bullet above), AppVeyor was unable to compile the hardware detection binary as a result of missing Windows headers. This commit, which builds on PR #546, should help fix that issue. 
Thanks to Minh Quan Ho for his assistance and patience on this matter. --- build/detect/config/config_detect.c | 12 ++++++++++++ frame/include/bli_system.h | 8 ++++---- frame/include/blis.h | 12 ++++++++++-- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 2b59f78bf9..8ffb300ace 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -33,8 +33,20 @@ */ +// The BLIS_ENABLE_SYSTEM macro must be defined so that the proper branches in +// bli_system.h are processed. (This macro is normally defined in bli_config.h.) +#define BLIS_ENABLE_SYSTEM + +// Use C-style static inline functions for the static inline functions that are +// defined by the headers below. (This macro is normally defined in +// bli_config_macro_defs.h.) #define BLIS_INLINE static + +// Since we're not building a shared library, we can forego the use of the +// BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro is +// normally defined in bli_config_macro_defs.h.) #define BLIS_EXPORT_BLIS + #include "bli_system.h" #include "bli_type_defs.h" #include "bli_arch.h" diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 2541018ac1..79333017b9 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -70,7 +70,7 @@ #endif // Determine the target operating system. -//#if defined(BLIS_ENABLE_SYSTEM) +#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) @@ -94,9 +94,9 @@ #else #error "Cannot determine operating system" #endif -//#else // #if defined(BLIS_DISABLE_SYSTEM) -// #define BLIS_OS_NONE -//#endif +#else // #if defined(BLIS_DISABLE_SYSTEM) + #define BLIS_OS_NONE +#endif // A few changes that may be necessary in Windows environments. 
#if BLIS_OS_WINDOWS diff --git a/frame/include/blis.h b/frame/include/blis.h index a42c7cce84..b374e85398 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -48,6 +48,15 @@ extern "C" { // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. +// -- configure definitions -- + +// NOTE: bli_config.h header must be included before any BLIS header. +// It is bootstrapped by ./configure and does not depend on later +// headers. Moreover, these configuration variables are necessary to change +// some default behaviors (e.g. disable OS-detection in bli_system.h in case +// of --disable-system). +#include "bli_config.h" + // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. @@ -55,9 +64,8 @@ extern "C" { #include "bli_lang_defs.h" -// -- configure definitions -- +// -- configure default definitions -- -#include "bli_config.h" #include "bli_config_macro_defs.h" From 7b39c1492067de941f81b49a3b6c1583290336fd Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 20 Sep 2021 16:13:50 -0500 Subject: [PATCH 075/226] Reverted fb93d24. Details: - The latest changes in fb93d24 are still causing problems. Reverting and preparing to move them to a branch. --- build/detect/config/config_detect.c | 12 ------------ frame/include/bli_system.h | 8 ++++---- frame/include/blis.h | 12 ++---------- 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 8ffb300ace..2b59f78bf9 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -33,20 +33,8 @@ */ -// The BLIS_ENABLE_SYSTEM macro must be defined so that the proper branches in -// bli_system.h are processed. (This macro is normally defined in bli_config.h.) 
-#define BLIS_ENABLE_SYSTEM - -// Use C-style static inline functions for the static inline functions that are -// defined by the headers below. (This macro is normally defined in -// bli_config_macro_defs.h.) #define BLIS_INLINE static - -// Since we're not building a shared library, we can forego the use of the -// BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro is -// normally defined in bli_config_macro_defs.h.) #define BLIS_EXPORT_BLIS - #include "bli_system.h" #include "bli_type_defs.h" #include "bli_arch.h" diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 79333017b9..2541018ac1 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -70,7 +70,7 @@ #endif // Determine the target operating system. -#if defined(BLIS_ENABLE_SYSTEM) +//#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) @@ -94,9 +94,9 @@ #else #error "Cannot determine operating system" #endif -#else // #if defined(BLIS_DISABLE_SYSTEM) - #define BLIS_OS_NONE -#endif +//#else // #if defined(BLIS_DISABLE_SYSTEM) +// #define BLIS_OS_NONE +//#endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS diff --git a/frame/include/blis.h b/frame/include/blis.h index b374e85398..a42c7cce84 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -48,15 +48,6 @@ extern "C" { // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. -// -- configure definitions -- - -// NOTE: bli_config.h header must be included before any BLIS header. -// It is bootstrapped by ./configure and does not depend on later -// headers. Moreover, these configuration variables are necessary to change -// some default behaviors (e.g. disable OS-detection in bli_system.h in case -// of --disable-system). 
-#include "bli_config.h" - // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. @@ -64,8 +55,9 @@ extern "C" { #include "bli_lang_defs.h" -// -- configure default definitions -- +// -- configure definitions -- +#include "bli_config.h" #include "bli_config_macro_defs.h" From 1f527a93b996093e06ef7a8e94fb47ee7e690ce0 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 20 Sep 2021 17:56:36 -0500 Subject: [PATCH 076/226] Re-enable and fix fb93d24. Details: - Re-enabled the changes made in fb93d24. - Defined BLIS_ENABLE_SYSTEM in bli_arch.c, bli_cpuid.c, and bli_env.c, all of which needed the definition (in addition to config_detect.c) in order for the configure-time hardware detection binary to be compiled properly. Thanks to Minh Quan Ho for helping identify these additional files as needing to be updated. - Added additional comments to all four source files, most notably to prompt the reader to remember to update all of the files when updating any of the files. Also made the cpp code in each of the files as consistent/similar as possible. - Refer to issues #532 and PR #546 for more history. 
--- build/detect/config/config_detect.c | 39 ++++++++++++++++++++++++----- frame/base/bli_arch.c | 18 +++++++++++++ frame/base/bli_cpuid.c | 21 +++++++++++++++- frame/base/bli_env.c | 20 +++++++++++++++ frame/include/bli_system.h | 8 +++--- frame/include/blis.h | 12 +++++++-- 6 files changed, 105 insertions(+), 13 deletions(-) diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 2b59f78bf9..5e29defe15 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -33,12 +33,39 @@ */ -#define BLIS_INLINE static -#define BLIS_EXPORT_BLIS -#include "bli_system.h" -#include "bli_type_defs.h" -#include "bli_arch.h" -#include "bli_cpuid.h" +// NOTE: This file will likely only ever get compiled as part of the BLIS +// configure script, and therefore BLIS_CONFIGURETIME_CPUID is guaranteed to +// be #defined. However, we preserve the cpp conditional for consistency with +// the other three files mentioned above. +#ifdef BLIS_CONFIGURETIME_CPUID + + // NOTE: If you need to make any changes to this cpp branch, it's probably + // the case that you also need to modify bli_arch.c, bli_cpuid.c, and + // bli_env.c. Don't forget to update these other files as needed! + + // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp + // branch in bli_system.h is processed. (This macro is normally defined in + // bli_config.h.) + #define BLIS_ENABLE_SYSTEM + + // Use C-style static inline functions for any static inline functions that + // happen to be defined by the headers below. (This macro is normally defined + // in bli_config_macro_defs.h.) + #define BLIS_INLINE static + + // Since we're not building a shared library, we can forgo the use of the + // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro + // is normally defined in bli_config_macro_defs.h.) 
+ #define BLIS_EXPORT_BLIS + + #include "bli_system.h" + #include "bli_type_defs.h" + #include "bli_arch.h" + #include "bli_cpuid.h" + //#include "bli_env.h" +#else + #include "blis.h" +#endif int main( int argc, char** argv ) { diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 7fe69919f6..e1061985ec 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -34,8 +34,26 @@ */ #ifdef BLIS_CONFIGURETIME_CPUID + + // NOTE: If you need to make any changes to this cpp branch, it's probably + // the case that you also need to modify bli_arch.c, bli_cpuid.c, and + // bli_env.c. Don't forget to update these other files as needed! + + // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp + // branch in bli_system.h is processed. (This macro is normally defined in + // bli_config.h.) + #define BLIS_ENABLE_SYSTEM + + // Use C-style static inline functions for any static inline functions that + // happen to be defined by the headers below. (This macro is normally defined + // in bli_config_macro_defs.h.) #define BLIS_INLINE static + + // Since we're not building a shared library, we can forgo the use of the + // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro + // is normally defined in bli_config_macro_defs.h.) #define BLIS_EXPORT_BLIS + #include "bli_system.h" #include "bli_type_defs.h" #include "bli_arch.h" diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index bc04f55861..5360d39174 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -47,12 +47,31 @@ #endif #ifdef BLIS_CONFIGURETIME_CPUID + + // NOTE: If you need to make any changes to this cpp branch, it's probably + // the case that you also need to modify bli_arch.c, bli_cpuid.c, and + // bli_env.c. Don't forget to update these other files as needed! + + // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp + // branch in bli_system.h is processed. (This macro is normally defined in + // bli_config.h.) 
+ #define BLIS_ENABLE_SYSTEM + + // Use C-style static inline functions for any static inline functions that + // happen to be defined by the headers below. (This macro is normally defined + // in bli_config_macro_defs.h.) #define BLIS_INLINE static + + // Since we're not building a shared library, we can forgo the use of the + // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro + // is normally defined in bli_config_macro_defs.h.) #define BLIS_EXPORT_BLIS + #include "bli_system.h" #include "bli_type_defs.h" - #include "bli_cpuid.h" #include "bli_arch.h" + #include "bli_cpuid.h" + //#include "bli_env.h" #else #include "blis.h" #endif diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c index 23b8e059e1..92aba69700 100644 --- a/frame/base/bli_env.c +++ b/frame/base/bli_env.c @@ -34,10 +34,30 @@ */ #ifdef BLIS_CONFIGURETIME_CPUID + + // NOTE: If you need to make any changes to this cpp branch, it's probably + // the case that you also need to modify bli_arch.c, bli_cpuid.c, and + // bli_env.c. Don't forget to update these other files as needed! + + // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp + // branch in bli_system.h is processed. (This macro is normally defined in + // bli_config.h.) + #define BLIS_ENABLE_SYSTEM + + // Use C-style static inline functions for any static inline functions that + // happen to be defined by the headers below. (This macro is normally defined + // in bli_config_macro_defs.h.) #define BLIS_INLINE static + + // Since we're not building a shared library, we can forgo the use of the + // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro + // is normally defined in bli_config_macro_defs.h.) 
#define BLIS_EXPORT_BLIS + #include "bli_system.h" #include "bli_type_defs.h" + //#include "bli_arch.h" + //#include "bli_cpuid.h" #include "bli_env.h" #else #include "blis.h" diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 2541018ac1..79333017b9 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -70,7 +70,7 @@ #endif // Determine the target operating system. -//#if defined(BLIS_ENABLE_SYSTEM) +#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) @@ -94,9 +94,9 @@ #else #error "Cannot determine operating system" #endif -//#else // #if defined(BLIS_DISABLE_SYSTEM) -// #define BLIS_OS_NONE -//#endif +#else // #if defined(BLIS_DISABLE_SYSTEM) + #define BLIS_OS_NONE +#endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS diff --git a/frame/include/blis.h b/frame/include/blis.h index a42c7cce84..b374e85398 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -48,6 +48,15 @@ extern "C" { // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. +// -- configure definitions -- + +// NOTE: bli_config.h header must be included before any BLIS header. +// It is bootstrapped by ./configure and does not depend on later +// headers. Moreover, these configuration variables are necessary to change +// some default behaviors (e.g. disable OS-detection in bli_system.h in case +// of --disable-system). +#include "bli_config.h" + // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. @@ -55,9 +64,8 @@ extern "C" { #include "bli_lang_defs.h" -// -- configure definitions -- +// -- configure default definitions -- -#include "bli_config.h" #include "bli_config_macro_defs.h" From 1fc23d2141189c7b583a5bff2cffd87fd5261444 Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Tue, 21 Sep 2021 14:54:20 -0500 Subject: [PATCH 077/226] Safelist 'master', 'dev', 'amd' branches. Details: - Modified .travis.yml so that only commits to 'master', 'dev', and 'amd' branches get built by Travis CI. Thanks to Devin Matthews for helping to track down the syntax for this change. --- .travis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index 6bd123ec04..a61a879fa1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,11 @@ language: c sudo: required dist: focal +branches: + only: + - master + - dev + - amd matrix: include: # full testsuite (all tests + mixed datatype (gemm_nn only) + salt + SDE + OOT) From 89aaf00650d6cc19b83af2aea6c8d04ddd3769cb Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 28 Sep 2021 18:34:33 -0500 Subject: [PATCH 078/226] Updates to FAQ.md, Sandboxes.md, and README.md. Details: - Updated FAQ.md to include two new questions, reordered an existing question, and also removed an outdated and redundant question about BLIS vs. AMD BLIS. - Updated Sandboxes.md to use 'gemmlike' as its main example, along with other smaller details. - Added ARM as a funder to README.md. --- README.md | 3 ++- docs/FAQ.md | 51 ++++++++++++++++++++++++++++----------------- docs/Sandboxes.md | 53 +++++++++++++++++++++++------------------------ 3 files changed, 60 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index ced473e594..1fe4e6dd49 100644 --- a/README.md +++ b/README.md @@ -727,8 +727,9 @@ This project and its associated research were partially sponsored by grants from [HPE](https://www.hpe.com/), [Oracle](https://www.oracle.com/), [Huawei](https://www.huawei.com/), -and [Facebook](https://www.facebook.com/), +and +[ARM](https://www.arm.com/), as well as grants from the [National Science Foundation](https://www.nsf.gov/) (Awards CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493). 
diff --git a/docs/FAQ.md b/docs/FAQ.md index aaa8f89839..528504dfbc 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -9,6 +9,7 @@ project, as well as those we think a new user or developer might ask. If you do * [Why should I use BLIS instead of GotoBLAS / OpenBLAS / ATLAS / MKL / ESSL / ACML / Accelerate?](FAQ.md#why-should-i-use-blis-instead-of-gotoblas--openblas--atlas--mkl--essl--acml--accelerate) * [How is BLIS related to FLAME / libflame?](FAQ.md#how-is-blis-related-to-flame--libflame) * [What is the difference between BLIS and the AMD fork of BLIS found in AOCL?](FAQ.md#what-is-the-difference-between-blis-and-the-amd-fork-of-blis-found-in-aocl) + * [Who do I contact if I have a question about the AMD version of BLIS?](FAQ.md#who-do-i-contact-if-i-have-a-question-about-the-amd-version-of-blis) * [Does BLIS automatically detect my hardware?](FAQ.md#does-blis-automatically-detect-my-hardware) * [I understand that BLIS is mostly a tool for developers?](FAQ.md#i-understand-that-blis-is-mostly-a-tool-for-developers) * [How do I link against BLIS?](FAQ.md#how-do-i-link-against-blis) @@ -17,6 +18,7 @@ project, as well as those we think a new user or developer might ask. If you do * [What is a macrokernel?](FAQ.md#what-is-a-macrokernel) * [What is a context?](FAQ.md#what-is-a-context) * [I am used to thinking in terms of column-major/row-major storage and leading dimensions. What is a "row stride" / "column stride"?](FAQ.md#im-used-to-thinking-in-terms-of-column-majorrow-major-storage-and-leading-dimensions-what-is-a-row-stride--column-stride) + * [I'm somewhat new to this matrix stuff. 
Can you remind me, what is the difference between a matrix row and a matrix column?](FAQ.md#im-somewhat-new-to-this-matrix-stuff-can-you-remind-me-what-is-the-difference-between-a-matrix-row-and-a-matrix-column) * [Why does BLIS have vector (level-1v) and matrix (level-1m) variations of most level-1 operations?](FAQ.md#why-does-blis-have-vector-level-1v-and-matrix-level-1m-variations-of-most-level-1-operations) * [What does it mean when a matrix with general stride is column-tilted or row-tilted?](FAQ.md#what-does-it-mean-when-a-matrix-with-general-stride-is-column-tilted-or-row-tilted) * [I am not really interested in all of these newfangled features in BLIS. Can I just use BLIS as a BLAS library?](FAQ.md#im-not-really-interested-in-all-of-these-newfangled-features-in-blis-can-i-just-use-blis-as-a-blas-library) @@ -36,8 +38,7 @@ project, as well as those we think a new user or developer might ask. If you do * [Who funded the development of BLIS?](FAQ.md#who-funded-the-development-of-blis) * [I found a bug. How do I report it?](FAQ.md#i-found-a-bug-how-do-i-report-it) * [How do I request a new feature?](FAQ.md#how-do-i-request-a-new-feature) - * [What is the difference between this version of BLIS and the one that AMD maintains?](FAQ.md#what-is-the-difference-between-this-version-of-blis-and-the-one-that-amd-maintains) - * [Who do I contact if I have a question about the AMD version of BLIS?](FAQ.md#who-do-i-contact-if-i-have-a-question-about-the-amd-version-of-blis) + * [I'm a developer and I'd like to study the way matrix multiplication is implemented in BLIS. Where should I start?](FAQ.md#im-a-developer-and-id-like-to-study-the-way-matrix-multiplication-is-implemented-in-blis-where-should-i-start) * [Where did you get the photo for the BLIS logo / mascot?](FAQ.md#where-did-you-get-the-photo-for-the-blis-logo--mascot) ### Why did you create BLIS? @@ -60,7 +61,9 @@ homepage](https://github.com/flame/blis#key-features). 
But here are a few reason ### How is BLIS related to FLAME / `libflame`? -As explained [above](FAQ.md#why-did-you-create-blis?), BLIS was initially a layer within `libflame` that allowed more convenient interfacing to the BLAS. So in some ways, BLIS is a spin-off project. Prior to developing BLIS, [its author](http://www.cs.utexas.edu/users/field/) worked as the primary maintainer of `libflame`. If you look closely, you can also see that the design of BLIS was influenced by some of the more useful and innovative aspects of `libflame`, such as internal object abstractions and control trees. Also, various members of the [SHPC research group](http://shpc.ices.utexas.edu/people.html) and its [collaborators](http://shpc.ices.utexas.edu/collaborators.html) routinely provide insight, feedback, and also contribute code (especially kernels) to the BLIS project. +As explained [above](FAQ.md#why-did-you-create-blis?), BLIS was initially a layer within `libflame` that allowed more convenient interfacing to the BLAS. So in some ways, BLIS is a spin-off project. Prior to developing BLIS, [its primary author](http://www.cs.utexas.edu/users/field/) worked as the primary maintainer of `libflame`. If you look closely, you can also see that the design of BLIS was influenced by some of the more useful and innovative aspects of `libflame`, such as internal object abstractions and control trees. + +Note that various members of the [SHPC research group](http://shpc.ices.utexas.edu/people.html) and its [collaborators](http://shpc.ices.utexas.edu/collaborators.html) routinely provide insight, feedback, and also contribute code (especially kernels) to the BLIS project. ### What is the difference between BLIS and the AMD fork of BLIS found in AOCL? @@ -68,6 +71,10 @@ BLIS, also known as "vanilla BLIS" or "upstream BLIS," is maintained by its [ori AMD BLIS sometimes contains certain optimizations specific to AMD hardware. 
Many of these optimizations are (eventually) merged back into upstream BLIS. However, for various reasons, some changes may remain unique to AMD BLIS for quite some time. Thus, if you want the latest optimizations for AMD hardware, feel free to try AMD BLIS. However, please note that neither The University of Texas at Austin nor BLIS's developers can endorse or offer direct support for any outside fork of BLIS, including AMD BLIS. +### Who do I contact if I have a question about the AMD version of BLIS? + +For questions or support regarding [AMD's fork of BLIS](https://github.com/amd/blis), please contact the [AMD Optimizing CPU Libraries](https://developer.amd.com/amd-aocl/) group at aoclsupport@amd.com. + ### Does BLIS automatically detect my hardware? On certain architectures (most notably x86_64), yes. In order to use auto-detection, you must specify `auto` as your configuration when running `configure` (Please see the BLIS [Build System](BuildSystem.md) guide for more info.) A runtime detection option is also available. (Please see the [Configuration Guide](ConfigurationHowTo.md) for a comprehensive walkthrough.) @@ -76,9 +83,9 @@ If automatic hardware detection is requested at configure-time and the build pro ### I understand that BLIS is mostly a tool for developers? -Yes. In order to achieve high performance, BLIS requires that hand-coded kernels and microkernels be written and referenced in a valid [BLIS configuration](ConfigurationHowTo.md). These components are usually written by developers and then included within BLIS for use by others. +It is certainly the case that BLIS began as a tool targeted at developers. In order to achieve high performance, BLIS requires that hand-coded kernels and microkernels be written and referenced in a valid [BLIS configuration](ConfigurationHowTo.md). These components are usually written by developers and then included within BLIS for use by others. -The good news, however, is that end-users can use BLIS too. 
Once the aforementioned kernels are integrated into BLIS, they can be used without any developer-level knowledge, and many kernels have already been added! Usually, `./configure auto; make; make install` is sufficient for the typical users with typical hardware. +The good news, however, is that BLIS has matured to the point where end-users can use it too! Once the aforementioned kernels are integrated into BLIS, they can be used without any developer-level knowledge, and many kernels have already been added! Usually, `./configure auto; make; make install` is sufficient for the typical users with typical hardware. ### How do I link against BLIS? @@ -98,9 +105,9 @@ For a more thorough explanation of the microkernel and its role in the overall l ### What is a macrokernel? -The macrokernels are portable codes within the BLIS framework that implement relatively small subproblems within an overall level-3 operation. The overall problem (say, general matrix-matrix multiplication, or `gemm`) is partitioned down, according to cache blocksizes, such that its operands are (1) a suitable size and (2) stored in a special packed format. At that time, the macrokernel is called. The macrokernel is implemented as two loops around the microkernel. +The macrokernels are portable codes within the BLIS framework that implement relatively small subproblems within an overall level-3 operation. The overall problem (say, general matrix-matrix multiplication, or `gemm`) is partitioned down, according to cache blocksizes, such that its `A` and `B` operands are (1) a suitable size and (2) stored in a special packed format. At that time, the macrokernel is called. The macrokernel is implemented as two loops around the microkernel. -The macrokernels in BLIS correspond to the so-called "inner kernels" (or simply "kernels") that formed the fundamental unit of computation in Kazushige Goto's GotoBLAS (and now in the successor library, OpenBLAS). 
+The macrokernels, along with the microkernel that they call, correspond to the so-called "inner kernels" (or simply "kernels") that formed the fundamental unit of computation in Kazushige Goto's GotoBLAS (and now in the successor library, OpenBLAS). For more information on macrokernels, please read our [ACM TOMS papers](https://github.com/flame/blis#citations). @@ -118,6 +125,18 @@ In generalized storage, we have a row stride and a column stride. The row stride BLIS also supports situations where both the row stride and column stride are non-unit. We call this situation "general stride". +### I'm somewhat new to this matrix stuff. Can you remind me, what is the difference between a matrix row and a matrix column? + +Of course! (BLIS's primary author remembers what it was like to get columns and rows confused.) + +Matrix columns consist of elements that are vertically aligned. Matrix rows consist of elements that are horizontally aligned. (One way to remember is that real-life columns are vertical structures that hold up buildings.) + +Furthermore, it is helpful to know that the number of rows in a matrix constitutes its so-called *m* dimension, and the number of columns constitutes its *n* dimension. + +Matrix dimensions are always stated as *m x n*: the number of rows *by* the number of columns. + +So, a *3 x 4* matrix contains three rows (each of length four) and four columns (each of length three). + ### Why does BLIS have vector (level-1v) and matrix (level-1m) variations of most level-1 operations? At first glance, it might appear that an element-wise operation such as `copym` or `axpym` would be sufficiently general purpose to cover the cases where the operands are vectors. After all, an *m x 1* matrix can be viewed as a vector of length m and vice versa. But in BLIS, operations on vectors are treated slightly differently than operations on matrices. @@ -214,31 +233,25 @@ Lots of people! 
For a full list of those involved, see the ### Who funded the development of BLIS? -BLIS was primarily funded by grants from [Microsoft](https://www.microsoft.com/), -[Intel](https://www.intel.com/), [Texas -Instruments](https://www.ti.com/), [AMD](https://www.amd.com/), [Huawei](https://www.hauwei.com/us/), [Oracle](https://www.oracle.com/), and [Facebook](https://www.facebook.com/) as well as grants from the [National Science Foundation](http://www.nsf.gov/) (Awards CCF-0917167 ACI-1148125/1340293, and CCF-1320112). +BLIS was primarily funded by a variety of gifts/grants from industry and the National Science Foundation. Please see the "Funding" section of the [BLIS homepage](https://github.com/flame/blis#funding) for more details. Reminder: _Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation (NSF)._ ### I found a bug. How do I report it? -If you think you've found a bug, we request that you [open an issue](http://github.com/flame/blis/issues). Don't be shy! Really, it's the best and most convenient way for us to track your issues/bugs/concerns. Other discussions that are not primarily bug-reports should take place via the [blis-devel](http://groups.google.com/group/blis-devel) mailing list. +If you think you've found a bug, we request that you [open an issue](http://github.com/flame/blis/issues). Don't be shy! Really, it's the best and most convenient way for us to track your issues/bugs/concerns. ### How do I request a new feature? Feature requests should also be submitted by [opening a new issue](http://github.com/flame/blis/issues). -### What is the difference between this version of BLIS and the one that AMD maintains? +### I'm a developer and I'd like to study the way matrix multiplication is implemented in BLIS. Where should I start? 
-AMD has chosen BLIS as the open-source foundation for the BLAS component of their [AMD Optimizing CPU Libraries (AOCL)](https://developer.amd.com/amd-aocl/) toolkit. Our group enjoys a great collaboration and partnership with AMD, and we are pleased to have their enthusiastic support for our project. +Great question! The first thing you should know is that the core framework of [level-3 operations](https://github.com/flame/blis/blob/master/docs/BLISTypedAPI.md#operation-index) was *not* designed to be used to teach or explain a high-performance implementation of matrix multiplication. Rather, it was designed to encode the family of level-3 operations with as little code duplication as possible. Because of this, and also for historical/evolutionary reasons, it can be a little difficult to trace the execution of, say, `gemm` from within the core framework. -At a technical level, AMD's fork of BLIS is considered to be a downstream variant. AMD uses their fork to develop optimizations specific to AMD hardware. Occasionally, AMD will submit pull requests to merge their features, enhancements, and fixes back into our "plain vanilla" upstream repository. So our upstream BLIS will eventually contain most of the modifications originally developed by AMD in their fork, but with a lag. Similarly, features introduced into the upstream BLIS may not be immediately available in AMD's fork, but eventually their team will perform a merge and synchronize with our latest code. +Thankfully, we have an alternative environment in which experts, application developers, and other curious individuals can study BLIS's matrix multiplication implementation. This so-called "sandbox" is a simplified collection of code that strips away much of the framework complexity while also maintaining local definitions for many of the interesting bits. You may find this `gemmlike` sandbox in `sandbox/gemmlike`. 
-AMD also uses a different versioning system for AOCL which is independent of the versions used by the [upstream BLIS](http://github.com/flame/blis) project. - -### Who do I contact if I have a question about the AMD version of BLIS? - -For questions or support regarding [AMD's fork of BLIS](https://github.com/amd/blis), please contact the [AMD Optimizing CPU Libraries](https://developer.amd.com/amd-aocl/) group at aoclsupport@amd.com. +Sandboxes go beyond the scope of this FAQ. For an introduction, please refer to the [Sandboxes](https://github.com/flame/blis/blob/master/docs/Sandboxes.md) document, and/or contact the BLIS developers for more information. ### Where did you get the photo for the BLIS logo / mascot? diff --git a/docs/Sandboxes.md b/docs/Sandboxes.md index ce1548f6e0..eff98906ed 100644 --- a/docs/Sandboxes.md +++ b/docs/Sandboxes.md @@ -37,11 +37,11 @@ utility functions. To enable a sandbox at configure-time, you simply specify it as an option to `configure`. Either of the following usages are accepted: ``` -$ ./configure --enable-sandbox=ref99 auto -$ ./configure -s ref99 auto +$ ./configure --enable-sandbox=gemmlike auto +$ ./configure -s gemmlike auto ``` -Here, we tell `configure` that we want to use the `ref99` sandbox, which -corresponds to a sub-directory of `sandbox` named `ref99`. (Reminder: the +Here, we tell `configure` that we want to use the `gemmlike` sandbox, which +corresponds to a sub-directory of `sandbox` named `gemmlike`. (Reminder: the `auto` argument is the configuration target and thus unrelated to sandboxes.) @@ -50,7 +50,7 @@ sizes and shapes, you'll need to disable the skinny/unpacked "sup" sub-framework within BLIS, which is enabled by default. 
This can be done by passing the `--disable-sup-handling` option to configure: ``` -$ ./configure --enable-sandbox=ref99 --disable-sup-handling auto +$ ./configure --enable-sandbox=gemmlike --disable-sup-handling auto ``` If you leave sup enabled, the sup implementation will, at runtime, detect and handle certain smaller problem sizes upstream of where BLIS calls @@ -62,13 +62,14 @@ As `configure` runs, you should get output that includes lines similar to: ``` configure: configuring for alternate gemm implementation: -configure: sandbox/ref99 +configure: sandbox/gemmlike ``` And when you build BLIS, the last files to be compiled will be the source code in the specified sandbox: ``` -Compiling obj/haswell/sandbox/ref99/blx_gemm_ref_var2.o ('haswell' CFLAGS for sandboxes) -Compiling obj/haswell/sandbox/ref99/oapi/bli_gemmnat.o ('haswell' CFLAGS for sandboxes) +Compiling obj/haswell/sandbox/gemmlike/bli_gemmnat.o ('haswell' CFLAGS for sandboxes) +Compiling obj/haswell/sandbox/gemmlike/bls_gemm.o ('haswell' CFLAGS for sandboxes) +Compiling obj/haswell/sandbox/gemmlike/bls_gemm_bp_var1.o ('haswell' CFLAGS for sandboxes) ... ``` That's it! After the BLIS library is built, it will contain your chosen @@ -92,16 +93,19 @@ will be found! 2. Your sandbox must be written in C99 or C++11. If you write your sandbox in C++11, you must use one of the BLIS-approved file extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your header files (`.hh`, `.hpp`, `.hxx`). -Note that `blis.h` -already contains all of its definitions inside of an `extern "C"` block, so -you should be able to `#include "blis.h"` from your C++11 source code without -any issues. +Note that `blis.h` already contains all of its definitions inside of an +`extern "C"` block, so you should be able to `#include "blis.h"` from your +C++11 source code without any issues. 3. 
All of your code to replace BLIS's default implementation of `bli_gemmnat()` should reside in the named sandbox directory, or some directory therein. -(Obviously.) For example, the "reference" sandbox is located in -`sandbox/ref99`. All of the code associated with this sandbox will be -contained within `sandbox/ref99`. +(Obviously.) For example, the "gemmlike" sandbox is located in +`sandbox/gemmlike`. All of the code associated with this sandbox will be +contained within `sandbox/gemmlike`. Note that you absolutely *may* include +additional code and interfaces within the sandbox, if you wish -- code and +interfaces that are not directly or indirectly needed for satisfying the +the "contract" set forth by the sandbox (i.e., including a local definition +of`bli_gemmnat()`). 4. The *only* header file that is required of your sandbox is `bli_sandbox.h`. It must be named `bli_sandbox.h` because `blis.h` will `#include` this file @@ -116,16 +120,17 @@ you should only place things (e.g. prototypes or type definitions) in Usually, neither of these situations will require any of your local definitions since those local definitions are only needed to define your sandbox implementation of `bli_gemmnat()`, and this function is already prototyped by -BLIS. +BLIS. *But if you are adding additional APIs and/or operations to the sandbox +that are unrelated to `bli_gemmnat()`, then you'll want to #include those +function prototypes from within `bli_sandbox.h`* 5. Your definition of `bli_gemmnat()` should be the **only function you define** in your sandbox that begins with `bli_`. If you define other functions that begin with `bli_`, you risk a namespace collision with existing framework functions. To guarantee safety, please prefix your locally-defined sandbox -functions with another prefix. Here, in the `ref99` sandbox, we use the prefix -`blx_`. (The `x` is for sandbox. Or experimental.) 
Also, please avoid the -prefix `bla_` since that prefix is also used in BLIS for BLAS compatibility -functions. +functions with another prefix. Here, in the `gemmlike` sandbox, we use the prefix +`bls_`. (The `s` is for sandbox.) Also, please avoid the prefix `bla_` since that +prefix is also used in BLIS for BLAS compatibility functions. If you follow these rules, you will be much more likely to have a pleasant experience integrating your BLIS sandbox into the larger framework. @@ -207,15 +212,9 @@ enabled in `input.general`. However, if those options *are* enabled and BLIS was built with mixed datatype support, then BLIS assumes that the implementation of `gemm` will support mixing of datatypes. BLIS *must* assume this, because there's no way for it to confirm at runtime that an implementation was written -to support mixing datatypes. Note that even the `ref99` sandbox included with +to support mixing datatypes. Note that even the `gemmlike` sandbox included with BLIS does not support mixed-datatype computation. -* **Multithreading in ref99.** The current reference sandbox, `ref99`, does not -currently implement multithreading. - -* **Packing matrices in ref99.** The current reference sandbox, `ref99`, does not -currently implement packing of matrices A or B. - ## Conclusion If you encounter any problems, or are really bummed-out that `gemm` is the From 3442d4002b3bfffd8848f72103b30691df2b19b1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 28 Sep 2021 18:43:23 -0500 Subject: [PATCH 079/226] More minor fixes to FAQ.md and Sandboxes.md. --- docs/FAQ.md | 8 ++++---- docs/Sandboxes.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 528504dfbc..4120483e1d 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -229,7 +229,7 @@ If this feature is important or useful to your work, we would love to hear from ### Who is involved in the project? Lots of people! 
For a full list of those involved, see the -[CREDITS](https://github.com/flame/blis/blob/master/CREDITS) file within the BLIS framework source distribution. +[CREDITS](CREDITS) file within the BLIS framework source distribution. ### Who funded the development of BLIS? @@ -247,13 +247,13 @@ Feature requests should also be submitted by [opening a new issue](http://github ### I'm a developer and I'd like to study the way matrix multiplication is implemented in BLIS. Where should I start? -Great question! The first thing you should know is that the core framework of [level-3 operations](https://github.com/flame/blis/blob/master/docs/BLISTypedAPI.md#operation-index) was *not* designed to be used to teach or explain a high-performance implementation of matrix multiplication. Rather, it was designed to encode the family of level-3 operations with as little code duplication as possible. Because of this, and also for historical/evolutionary reasons, it can be a little difficult to trace the execution of, say, `gemm` from within the core framework. +Great question! The first thing you should know is that the core framework of [level-3 operations](docs/BLISTypedAPI.md#operation-index) was *not* designed to be used to teach or explain a high-performance implementation of matrix multiplication. Rather, it was designed to encode the family of level-3 operations with as little code duplication as possible. Because of this, and also for historical/evolutionary reasons, it can be a little difficult to trace the execution of, say, `gemm` from within the core framework. Thankfully, we have an alternative environment in which experts, application developers, and other curious individuals can study BLIS's matrix multiplication implementation. This so-called "sandbox" is a simplified collection of code that strips away much of the framework complexity while also maintaining local definitions for many of the interesting bits. You may find this `gemmlike` sandbox in `sandbox/gemmlike`. 
-Sandboxes go beyond the scope of this FAQ. For an introduction, please refer to the [Sandboxes](https://github.com/flame/blis/blob/master/docs/Sandboxes.md) document, and/or contact the BLIS developers for more information. +Sandboxes go beyond the scope of this FAQ. For an introduction, please refer to the [Sandboxes](docs/Sandboxes.md) document, and/or contact the BLIS developers for more information. ### Where did you get the photo for the BLIS logo / mascot? -The sleeping ["BLIS cat"](https://github.com/flame/blis/blob/master/README.md) photo was taken by Petar Mitchev and is used with his permission. +The sleeping ["BLIS cat"](README.md) photo was taken by Petar Mitchev and is used with his permission. diff --git a/docs/Sandboxes.md b/docs/Sandboxes.md index eff98906ed..8f404d0a6b 100644 --- a/docs/Sandboxes.md +++ b/docs/Sandboxes.md @@ -121,7 +121,7 @@ Usually, neither of these situations will require any of your local definitions since those local definitions are only needed to define your sandbox implementation of `bli_gemmnat()`, and this function is already prototyped by BLIS. *But if you are adding additional APIs and/or operations to the sandbox -that are unrelated to `bli_gemmnat()`, then you'll want to #include those +that are unrelated to `bli_gemmnat()`, then you'll want to `#include` those function prototypes from within `bli_sandbox.h`* 5. Your definition of `bli_gemmnat()` should be the **only function you define** From b36fb0fbc5fda13d9a52cc64953341d3d53067ee Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 28 Sep 2021 18:47:45 -0500 Subject: [PATCH 080/226] Fixed newly broken link to CREDITS in FAQ.md. --- docs/FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 4120483e1d..6fee25e7d4 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -229,7 +229,7 @@ If this feature is important or useful to your work, we would love to hear from ### Who is involved in the project? Lots of people! 
For a full list of those involved, see the -[CREDITS](CREDITS) file within the BLIS framework source distribution. +[CREDITS](https://github.com/flame/blis/blob/master/CREDITS) file within the BLIS framework source distribution. ### Who funded the development of BLIS? From 5013a6cb7110746c417da96e4a1308ef681b0b88 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 29 Sep 2021 10:38:50 -0500 Subject: [PATCH 081/226] More edits and fixes to docs/FAQ.md. --- docs/FAQ.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 6fee25e7d4..3d0852d36f 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -129,9 +129,9 @@ BLIS also supports situations where both the row stride and column stride are no Of course! (BLIS's primary author remembers what it was like to get columns and rows confused.) -Matrix columns consist of elements that are vertically aligned. Matrix rows consist of element that are horizontally aligned. (One way to remember is that real-life columns are vertical structures that hold up buildings.) +Matrix columns consist of elements that are vertically aligned. Matrix rows consist of elements that are horizontally aligned. (One way to remember this distinction is that real-life columns are vertical structures that hold up buildings. A row of seats in a stadium, by contrast, is horizontal to the ground.) -Furthermore, it is helpful to know that the number of rows in a matrix constitutes its so-called *m* dimension, and the number of columns consistutes its *n* dimension. +Furthermore, it is helpful to know that the number of rows in a matrix constitutes its so-called *m* dimension, and the number of columns constitutes its *n* dimension. Matrix dimension are always stated as *m x n*: the number of rows *by* the number of columns. 
@@ -145,15 +145,13 @@ If an application wishes to perform an element-wise operation on two objects, an However, if an application instead decides to perform an element-wise operation on two objects, and the application calls a level-1v operation, the dimension constraints are slightly relaxed. In this scenario, BLIS only checks that the vector *lengths* are equal. This allows for the vectors to have different orientations (row vs column) while still being considered conformal. So, you could perform a `copyv` operation to copy from an *m x 1* vector to a *1 x m* vector. A `copym` operation on such objects would not be allowed (unless it was executed with the source object containing an implicit transposition). -Another way to think about level-1v operations is that they will work with any two matrix objects in situations where (a) the corresponding level-1m operation *would have* worked if the input had been transposed, and (b) all operands happen to be vectors (i.e., have one unit dimension). - ### What does it mean when a matrix with general stride is column-tilted or row-tilted? When a matrix is stored with general stride, both the row stride and column stride (let's call them `rs` and `cs`) are non-unit. When `rs` < `cs`, we call the general stride matrix "column-tilted" because it is "closer" to being column-stored (than row-stored). Similarly, when `rs` > `cs`, the matrix is "row-tilted" because it is closer to being row-stored. ### I'm not really interested in all of these newfangled features in BLIS. Can I just use BLIS as a BLAS library? -Absolutely. Just link your application to BLIS the same way you would link to a BLAS library. For a simple linking example, see the [Linking to BLIS](KernelsHowTo.md#linking-to-blis) section of the BLIS [Build System](BuildSystem.md) guide. +Absolutely! Just link your application to BLIS the same way you would link to a BLAS library. 
For a simple linking example, see the [Linking to BLIS](KernelsHowTo.md#linking-to-blis) section of the BLIS [Build System](BuildSystem.md) guide. ### What about CBLAS? @@ -163,11 +161,13 @@ BLIS also contains an optional CBLAS compatibility layer, which leverages the BL In principle, BLIS's native (and BLAS-like) [typed API](BLISTypedAPI) can be called from Fortran. However, you must ensure that the size of the integer in BLIS is equal to the size of integer used by your Fortran program/compiler/environment. The size of BLIS integers is determined at configure-time. Please see `./configure --help` for the syntax for options related to integer sizes. +You may also want to confirm that your Fortran compiler doesn't perform any name-mangling of called functions or subroutines (such as with additional underscores beyond the single trailing underscore found in the BLAS APIs), and if so, take steps to disable this additional name-mangling. For example, if your source code calls `dgemm()` but your Fortran compiler name-mangles that call to `_dgemm_()` or `dgemm__()`, your program will fail to link against BLIS since BLIS only defines `dgemm_()`. + As for bindings to other languages, please contact the [blis-devel](http://groups.google.com/group/blis-devel) mailing list. ### Do I need to call initialization/finalization functions before being able to use BLIS from my application? -Originally, BLIS did indeed require the application to explicitly setup (initialize) various internal data structures via `bli_init()`. Likewise, calling `bli_finalize()` was recommended to cleanup (finalize) the library. However, since commit 9804adf (circa December 2017), BLIS has implemented self-initialization. These explicit calls to `bli_init()` and `bli_finalize()` are no longer necessary, though experts may still use them in special cases to control the allocation and freeing of resources. 
This topic is discussed in the BLIS [typed API reference](BLISTypedAPI.md#initialization-and-cleanup). +Originally, BLIS did indeed require the application to explicitly setup (initialize) various internal data structures via `bli_init()`. Likewise, calling `bli_finalize()` was recommended to cleanup (finalize) the library. However, since commit `9804adf` (circa December 2017), BLIS has implemented self-initialization. These explicit calls to `bli_init()` and `bli_finalize()` are no longer necessary, though experts may still use them in special cases to control the allocation and freeing of resources. This topic is discussed in the BLIS [typed API reference](BLISTypedAPI.md#initialization-and-cleanup). ### Does BLIS support multithreading? @@ -181,7 +181,7 @@ We have integrated some early foundational support for NUMA *development*, but c ### Does BLIS work with GPUs? -BLIS does not currently support graphical processing units (GPUs). However, others have applied the BLIS approach towards frameworks that provide BLAS-like functionality on GPUs. To see how NVIDIA's implementation compares to an analagous approach based on the principles that underlie BLIS, please see a paper by some of our collaborators, ["Implementing Strassen’s Algorithm with CUTLASSon NVIDIA Volta GPUs"](https://apps.cs.utexas.edu/apps/sites/default/files/tech_reports/GPUStrassen.pdf). +BLIS does not currently support graphical processing units (GPUs). However, others have applied the BLIS approach towards frameworks that provide BLAS-like functionality on GPUs. To see how NVIDIA's implementation compares to an analogous approach based on the principles that underlie BLIS, please see a paper by some of our collaborators, ["Implementing Strassen’s Algorithm with CUTLASS on NVIDIA Volta GPUs"](https://apps.cs.utexas.edu/apps/sites/default/files/tech_reports/GPUStrassen.pdf). ### Does BLIS work on _(some architecture)_? @@ -193,7 +193,7 @@ No. 
BLIS is a framework for sequential and shared-memory/multicore implementatio ### Can I build BLIS on Mac OS X? -BLIS was designed for use in a GNU/Linux environment. However, we've gone to greath lengths to keep BLIS compatible with other UNIX-like systems as well, such as BSD and OS X. System software requirements for UNIX-like systems are discussed in the BLIS [Build System](BuildSystem.md) guide. +BLIS was designed for use in a GNU/Linux environment. However, we've gone to great lengths to keep BLIS compatible with other UNIX-like systems as well, such as BSD and OS X. System software requirements for UNIX-like systems are discussed in the BLIS [Build System](BuildSystem.md) guide. ### Can I build BLIS on Windows? @@ -222,7 +222,7 @@ Yes. By default, most configurations output only a static library archive (e.g. ### Can I use the mixed domain / mixed precision support in BLIS? -Yes! As of 5fec95b (circa October 2018), BLIS supports mixed-datatype (mixed domain and/or mixed precision) computation via the `gemm` operation. Documentation on utilizing this new functionality is provided via the [MixedDatatype.md](docs/MixedDatatypes.md) document in the source distribution. +Yes! As of 5fec95b (circa October 2018), BLIS supports mixed-datatype (mixed domain and/or mixed precision) computation via the `gemm` operation. Documentation on utilizing this new functionality is provided via the [MixedDatatype.md](MixedDatatypes.md) document in the source distribution. If this feature is important or useful to your work, we would love to hear from you. Please contact us via the [blis-devel](http://groups.google.com/group/blis-devel) mailing list and tell us about your application and why you need/want support for BLAS-like operations with mixed-domain/mixed-precision operands. @@ -247,11 +247,11 @@ Feature requests should also be submitted by [opening a new issue](http://github ### I'm a developer and I'd like to study the way matrix multiplication is implemented in BLIS. 
Where should I start? -Great question! The first thing you should know is that the core framework of [level-3 operations](docs/BLISTypedAPI.md#operation-index) was *not* designed to be used to teach or explain a high-performance implementation of matrix multiplication. Rather, it was designed to encode the family of level-3 operations with as little code duplication as possible. Because of this, and also for historical/evolutionary reasons, it can be a little difficult to trace the execution of, say, `gemm` from within the core framework. +Great question! The first thing you should know is that the core framework of [level-3 operations](BLISTypedAPI.md#operation-index) was *not* designed to be used to teach or explain a high-performance implementation of matrix multiplication. Rather, it was designed to encode the family of level-3 operations with as little code duplication as possible. Because of this, and also for historical/evolutionary reasons, it can be a little difficult to trace the execution of, say, `gemm` from within the core framework. Thankfully, we have an alternative environment in which experts, application developers, and other curious individuals can study BLIS's matrix multiplication implementation. This so-called "sandbox" is a simplified collection of code that strips away much of the framework complexity while also maintaining local definitions for many of the interesting bits. You may find this `gemmlike` sandbox in `sandbox/gemmlike`. -Sandboxes go beyond the scope of this FAQ. For an introduction, please refer to the [Sandboxes](docs/Sandboxes.md) document, and/or contact the BLIS developers for more information. +Sandboxes go beyond the scope of this FAQ. For an introduction, please refer to the [Sandboxes](Sandboxes.md) document, and/or contact the BLIS developers for more information. ### Where did you get the photo for the BLIS logo / mascot? 
From 828ac8e2dda7c331ffc2479f957b4974f6cc8a30 Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Wed, 26 Apr 2023 18:05:56 +0530 Subject: [PATCH 082/226] Partial completion of work in L1 APIs - Partial completion of compute was happening since BLIS was unable to launch the required number of threads. This was because rntm was returning a thread count greater than the maximum number of threads that can be launched in the subsequent parallel region. - Added 'omp_get_num_threads' inside the parallel regions to get the actual number of threads spawned. The work distribution happens based on the actual number of threads launched in that region. AMD-Internal: [CPUPL-3268] Change-Id: I086ad4b9b644f966b7bab439e43222396f0c2bf0 --- frame/compat/bla_axpy_amd.c | 10 ++++++++-- frame/compat/bla_dot_amd.c | 27 +++++++++++++++++++-------- frame/compat/bla_scal_amd.c | 20 ++++++++++++++++---- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/frame/compat/bla_axpy_amd.c b/frame/compat/bla_axpy_amd.c index f6a64c40aa..0e24d7d4a5 100644 --- a/frame/compat/bla_axpy_amd.c +++ b/frame/compat/bla_axpy_amd.c @@ -398,11 +398,17 @@ void daxpy_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c index 3d0648d27b..213fd14a42 100644 --- a/frame/compat/bla_dot_amd.c +++ b/frame/compat/bla_dot_amd.c @@ -466,11 +466,17 @@ double ddot_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate 
the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); @@ -492,13 +498,18 @@ double ddot_blis_impl ); } - // Accumulating the nt thread outputs to rho - for ( dim_t i = 0; i < nt; i++ ) - rho += rho_temp[i]; - - // Releasing the allocated memory if it was allocated - if( bli_mem_is_alloc(&mem_buf_rho)) + /* + Accumulate the values in rho_temp only when mem is allocated. + When the memory cannot be allocated rho_temp will point to + rho + */ + if (bli_mem_is_alloc(&mem_buf_rho)) { + // Accumulating the nt thread outputs to rho + for (dim_t i = 0; i < nt; i++) + rho += rho_temp[i]; + + // Releasing the allocated memory if it was allocated bli_membrk_release(&rntm, &mem_buf_rho); } #endif diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index bec3515a0d..041c1b6a87 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -383,11 +383,17 @@ void dscal_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); @@ -563,11 +569,17 @@ void zdscal_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); From bf26b8ffbc81ba13c9e2cb38349fd410b0b1e299 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Fri, 28 Apr 2023 
15:16:17 +0530 Subject: [PATCH 083/226] Removing /arch:AVX2 flag from high-level CMake - Previously, this flag was set as a default at the high-level CMakeLists.txt which means that this flag is used to build everything, all files and all subdirectories, including ref_kernels and testsuite. Also, all files are added as target sources for this project and compiled with the same flags. - Now, we create object files using the source in kernels/ directory and add to the object files the AVX2 flag explicitly. So, now only those files will have this flag and it should not be used to compile ref_kernels, etc. - This is a quick solution to enable runs on non-AVX2 machines. AMD-Internal: [CPUPL-3241] Change-Id: Id569b26ffeea40eaa36ab4465b0c52b6446d7650 --- CMakeLists.txt | 16 +++++++++++++--- kernels/zen/1/CMakeLists.txt | 5 +++-- kernels/zen/1f/CMakeLists.txt | 5 +++-- kernels/zen/2/CMakeLists.txt | 11 +++++++---- kernels/zen/3/CMakeLists.txt | 5 +++-- kernels/zen/3/sup/CMakeLists.txt | 5 +++-- 6 files changed, 32 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91132b8105..075309c9db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -347,7 +347,7 @@ SET(ENABLE_SIMD_FLAGS "AVX2" CACHE STRING "Set compiler SIMD flags") SET_PROPERTY(CACHE ENABLE_SIMD_FLAGS PROPERTY STRINGS none SSE2 AVX AVX2) if(${ENABLE_SIMD_FLAGS} MATCHES "AVX2") - add_definitions(/arch:AVX2) + #add_definitions(/arch:AVX2) elseif(${ENABLE_SIMD_FLAGS} MATCHES "AVX") add_definitions(/arch:AVX) elseif(${ENABLE_SIMD_FLAGS} MATCHES "SSE2") @@ -642,7 +642,12 @@ add_definitions(-DBLIS_VERSION_STRING="AOCL-BLIS ${BLIS_VERSION_STRING} Build ${ if(BUILD_SHARED_LIBS) add_library("${PROJECT_NAME}" SHARED ${CMAKE_SOURCE_DIR}/bli_config.h ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h - ${headers}) + ${headers} + $<TARGET_OBJECTS:zen_1> + $<TARGET_OBJECTS:zen_1f> + $<TARGET_OBJECTS:zen_2> + $<TARGET_OBJECTS:zen_3> + $<TARGET_OBJECTS:zen_3_sup>) if(ENABLE_OPENMP) target_link_libraries("${PROJECT_NAME}" PRIVATE OpenMP::OpenMP_CXX) endif() @@ -652,7 +657,12 @@ endif() if(NOT BUILD_SHARED_LIBS)
add_library("${PROJECT_NAME}" STATIC ${CMAKE_SOURCE_DIR}/bli_config.h ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h - ${headers}) + ${headers} + $<TARGET_OBJECTS:zen_1> + $<TARGET_OBJECTS:zen_1f> + $<TARGET_OBJECTS:zen_2> + $<TARGET_OBJECTS:zen_3> + $<TARGET_OBJECTS:zen_3_sup>) if(ENABLE_OPENMP) set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${OpenMP_libomp_LIBRARY}") else() diff --git a/kernels/zen/1/CMakeLists.txt b/kernels/zen/1/CMakeLists.txt index 1a1fa9929a..1a0b644a99 100644 --- a/kernels/zen/1/CMakeLists.txt +++ b/kernels/zen/1/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen_1 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_zen_int.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_zen_int.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_zen_int10.c @@ -18,3 +18,4 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_norm2_zen_int.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_scal2v_zen_int.c ) +target_compile_options(zen_1 PRIVATE /arch:AVX2) \ No newline at end of file diff --git a/kernels/zen/1f/CMakeLists.txt b/kernels/zen/1f/CMakeLists.txt index 3a77f69ef1..4a110392da 100644 --- a/kernels/zen/1f/CMakeLists.txt +++ b/kernels/zen/1f/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen_1f + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_8.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxf_zen_int_8.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_5.c @@ -10,3 +10,4 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_zen_int.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c ) +target_compile_options(zen_1f PRIVATE /arch:AVX2) \ No newline at end of file diff --git a/kernels/zen/2/CMakeLists.txt b/kernels/zen/2/CMakeLists.txt index 791eafb97b..9618ce256c 100644 --- a/kernels/zen/2/CMakeLists.txt +++ b/kernels/zen/2/CMakeLists.txt @@ -1,14 +1,17 @@ ##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen_2 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_ref.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_zen_int_4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_int_4.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c ) +target_compile_options(zen_2 PRIVATE /arch:AVX2) +# For any other TARGET_ARCH, it would fail to configure. # Select AMD specific sources for AMD configurations. -if(${TARGET_ARCH} STREQUAL zen OR +#[=[if(${TARGET_ARCH} STREQUAL zen OR ${TARGET_ARCH} STREQUAL zen2 OR ${TARGET_ARCH} STREQUAL zen3 OR ${TARGET_ARCH} STREQUAL zen4 OR @@ -17,4 +20,4 @@ ${TARGET_ARCH} STREQUAL amdzen) PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c ) -endif() +endif()]=] diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt index d90e4e3902..b7187e59e5 100644 --- a/kernels/zen/3/CMakeLists.txt +++ b/kernels/zen/3/CMakeLists.txt @@ -1,11 +1,12 @@ ##Copyright (C) 2020-2022, Advanced Micro Devices, Inc. 
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen_3 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_small.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_ref_k1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_ref_k1.c ) +target_compile_options(zen_3 PRIVATE /arch:AVX2) add_subdirectory(sup) diff --git a/kernels/zen/3/sup/CMakeLists.txt b/kernels/zen/3/sup/CMakeLists.txt index 8f773d7775..b1f829eebf 100644 --- a/kernels/zen/3/sup/CMakeLists.txt +++ b/kernels/zen/3/sup/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen_3_sup + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16m.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16n.c @@ -18,3 +18,4 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4m.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4n.c ) +target_compile_options(zen_3_sup PRIVATE /arch:AVX2) \ No newline at end of file From a6621f12416f0b854f2fd73bb68f89f29ea3d93d Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Thu, 4 May 2023 10:44:15 +0530 Subject: [PATCH 084/226] Incorrect accumulation of results in DDOTV - When the number of threads launched is not equal to the number of threads requested the garbage value in the created buffer will not be overwritten by valid values. - To handle the above scenario, the created temporary buffer is initialized with zeroes. 
AMD-Internal: [CPUPL-3268] Change-Id: I439a1da18eb1b380491fea14f42b0ede05ccf5a9 --- frame/compat/bla_dot_amd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c index 213fd14a42..0b6651d8fc 100644 --- a/frame/compat/bla_dot_amd.c +++ b/frame/compat/bla_dot_amd.c @@ -452,6 +452,19 @@ double ddot_blis_impl if ((bli_mem_is_alloc(&mem_buf_rho))) { rho_temp = bli_mem_buffer(&mem_buf_rho); + + /* + This is done to handle cases when the + number of threads launched is not equal + to the number of threads requested. In + such cases, the garbage value in the created + buffer will not be overwritten by valid values. + + This will ensure that garbage value will + not get accumulated with the final result. + */ + for (dim_t i = 0; i < nt; i++) + rho_temp[i] = 0.0; } else { From 9164427e86ede2de4988eb2b5c021819a273f111 Mon Sep 17 00:00:00 2001 From: vignbala Date: Wed, 26 Apr 2023 05:55:46 +0530 Subject: [PATCH 085/226] Code cleanup: Mismatch in assembly macros - In the bli_x86_asm_macros.h file, the set of vinsertf?x? and vextractf?x? instructions are facing macro expansion errors due to ambiguous macro redirection. The lower-case macro definitions of these instructions are not properly redirected to their corresponding upper-case macro definitions. - This error occurs due to ambiguity in the upper-case macro name. At the place of lower-case macro definition, the redirection is to macros of the form VINSERTF?x? and VEXTRACTF?x?, while at the place of upper-case macro definition, they are of the form VINSERTF?X? and VEXTRACTF?X?. This causes a mismatch of the upper-case macro due to different case sensitive 'x' being used. - This patch corrects this issue, by changing the lower-case 'x' to upper-case, among the upper case macros at the place of redirection. This provides uniformity and facilitates the expected macro-expansion. 
AMD-Internal: [CPUPL-3276] Change-Id: Id1f45f8e4bb083cd4b87632b713ff6baba616ff2 --- frame/include/bli_x86_asm_macros.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index ccc2989eed..84bc76c21d 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -1220,18 +1220,18 @@ #define vunpckhps(_0, _1, _2) VUNPCKHPS(_0, _1, _2) #define vunpcklpd(_0, _1, _2) VUNPCKLPD(_0, _1, _2) #define vunpckhpd(_0, _1, _2) VUNPCKHPD(_0, _1, _2) -#define vshuff32x4(_0, _1, _2, _3) VSHUFF32x4(_0, _1, _2, _3) -#define vshuff64x2(_0, _1, _2, _3) VSHUFF64x2(_0, _1, _2, _3) +#define vshuff32x4(_0, _1, _2, _3) VSHUFF32X4(_0, _1, _2, _3) +#define vshuff64x2(_0, _1, _2, _3) VSHUFF64X2(_0, _1, _2, _3) #define vinsertf128(_0, _1, _2, _3) VINSERTF128(_0, _1, _2, _3) -#define vinsertf32x4(_0, _1, _2, _3) VINSERTF32x4(_0, _1, _2, _3) -#define vinsertf32x8(_0, _1, _2, _3) VINSERTF32x8(_0, _1, _2, _3) -#define vinsertf64x2(_0, _1, _2, _3) VINSERTF64x2(_0, _1, _2, _3) -#define vinsertf64x4(_0, _1, _2, _3) VINSERTF64x4(_0, _1, _2, _3) +#define vinsertf32x4(_0, _1, _2, _3) VINSERTF32X4(_0, _1, _2, _3) +#define vinsertf32x8(_0, _1, _2, _3) VINSERTF32X8(_0, _1, _2, _3) +#define vinsertf64x2(_0, _1, _2, _3) VINSERTF64X2(_0, _1, _2, _3) +#define vinsertf64x4(_0, _1, _2, _3) VINSERTF64X4(_0, _1, _2, _3) #define vextractf128(_0, _1, _2) VEXTRACTF128(_0, _1, _2) -#define vextractf32x4(_0, _1, _2) VEXTRACTF32x4(_0, _1, _2) -#define vextractf32x8(_0, _1, _2) VEXTRACTF32x8(_0, _1, _2) -#define vextractf64x2(_0, _1, _2) VEXTRACTF64x2(_0, _1, _2) -#define vextractf64x4(_0, _1, _2) VEXTRACTF64x4(_0, _1, _2) +#define vextractf32x4(_0, _1, _2) VEXTRACTF32X4(_0, _1, _2) +#define vextractf32x8(_0, _1, _2) VEXTRACTF32X8(_0, _1, _2) +#define vextractf64x2(_0, _1, _2) VEXTRACTF64X2(_0, _1, _2) +#define vextractf64x4(_0, _1, _2) VEXTRACTF64X4(_0, _1, _2) #define vblendps(_0, 
_1, _2, _3) VBLENDPS(_0, _1, _2, _3) #define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3) #define vblendmps(_0, _1, _2) VBLENDMSD(_0, _1, _2) From 7739a3fbfed78e85f7c155736597ad509010ad19 Mon Sep 17 00:00:00 2001 From: Mangala V Date: Thu, 27 Apr 2023 03:46:39 +0530 Subject: [PATCH 086/226] Bug fix for 4xk AVX512 packing kernel A few tests failed on Windows OS because some registers were not added to the clobber list. Added the following registers to the clobber list: In function bli_zpackm_zen4_asm_12xk : ZMM12-ZMM15 In function bli_zpackm_zen4_asm_4xk : ZMM4-ZMM7 AMD-Internal: [CPUPL-3253] Change-Id: I3e42130bf1a3b48717c4b437179ae3f116e5cf1d --- kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c | 3 ++- kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c index 6f83940b4f..3145801e11 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -313,6 +313,7 @@ void bli_zpackm_zen4_asm_12xk "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", + "zmm12", "zmm13", "zmm14", "zmm15", "memory" ) } diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c index 387445d599..02f2776c17 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -50,8 +50,8 @@ /* ZMM6 = Ar8 Ai8 Ar10 Ai10 Ar12 Ai12 Ar14 Ai14 */ /* ZMM7 = Ar9 Ai9 Ar11 Ai11 Ar13 Ai13 Ar15 Ai15 */ /* Output R0 = Ar0 Ai0 Ar4 Ai4 Ar8 Ai8 Ar12 Ai12 */ -/* Output R2 = Ar1 Ai1 Ar5 Ai5 Ar9 Ai9 Ar13 Ai13 */ -/* Output R1 = Ar2 Ai2 Ar6 Ai6 Ar10 Ai10 Ar14 Ai14 */ +/* Output R1 = Ar1 Ai1 Ar5 Ai5 Ar9 Ai9 Ar13 Ai13 */ +/* Output R2 = Ar2 Ai2 Ar6 Ai6 Ar10 Ai10 Ar14 Ai14 */ /* Output R3 = Ar3 Ai3 Ar7 Ai7 Ar11 Ai11 Ar15 Ai15 */ /******************************************************/ #define TRANSPOSE(R0, R1, R2, R3) \ @@ -247,6 +247,7 @@ void bli_zpackm_zen4_asm_4xk "r8", "r10", "r12", "r13", "r14", "xmm0", "xmm1", "xmm2", "xmm3", "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "memory" ) } From b167e470910e7c72e8bf06f4b91e07155c94d1d0 Mon Sep 17 00:00:00 2001 From: mkadavil Date: Tue, 9 May 2023 18:15:13 +0530 Subject: [PATCH 087/226] LPGEMM frame and micro-kernel updates to fix gcc9.4 compilation issue. -Micro-kernel: Some AVX512 intrinsics(eg: _mm512_loadu_epi32) were introduced in later versions of gcc (>10) in addition to already existing masked intrinsic(eg: _mm512_mask_loadu_epi32). In order to support compilation using gcc 9.4, either the masked intrinsic or other gcc 9.4 compatible intrinsic needs to be used (eg: _mm512_loadu_si512) in LPGEMM Zen4 micro-kernels. -Frame: BF16 LPGEMM api's (aocl_gemm_bf16bf16f32obf16/bf16bf16f32of32) needs to be disabled if aocl_gemm (LPGEMM) addon is compiled using gcc 9.4. BF16 intrinsics are not supported in gcc 9.4, and the micro-kernels for BF16 LPGEMM is excluded from compilation based on GNUC macro. 
AMD-Internal: [CPUPL-3396] Change-Id: I096b05cdceea77e3e7fec18a5e41feccdf47f0e7 --- addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c | 13 + addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c | 13 + bench/bench_aocl_gemm/bench_lpgemm.c | 81 ++- .../lpgemm_6x64rowmajor_bf16_amd512vnni.c | 10 + .../bf16bf16f32/lpgemm_f32_kern_macros.h | 6 + .../lpgemm_m_fringe_bf16_amd512vnni.c | 2 + .../lpgemm_mn_fringe_bf16_amd512vnni.c | 2 + .../lpgemm_n_fringe_bf16_amd512vnni.c | 2 + .../lpgemm_packb_bf16_amd512vnni.c | 146 +++-- .../lpgemm_6x64rowmajor_s8_amd512vnni.c | 88 +-- .../s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c | 320 +++++------ .../s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c | 510 +++++++++--------- .../s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c | 138 ++--- .../s8s8s32/lpgemm_packa_s8_amd512vnni.c | 121 +++-- .../s8s8s32/lpgemm_packb_s8_amd512vnni.c | 208 +++---- .../u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c | 88 +-- .../u8s8s32/lpgemm_m_fringe_amd512vnni.c | 280 +++++----- .../u8s8s32/lpgemm_mn_fringe_amd512vnni.c | 440 +++++++-------- .../lpgemm_n_extMR_fringe_amd512vnni.c | 88 +-- .../u8s8s32/lpgemm_n_fringe_amd512vnni.c | 124 ++--- .../lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c | 121 +++-- .../lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c | 164 +++--- .../lpgemm/u8s8s32/lpgemm_s32_kern_macros.h | 52 +- 23 files changed, 1594 insertions(+), 1423 deletions(-) diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c index b759b51115..0e0f93e191 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c @@ -46,6 +46,19 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) trans_t blis_transa; trans_t blis_transb; + // There is this use case where lpgemm will be compiled using gcc9.4 + // (where bf16 ISA is not supported), but deployed on a zen4+ sustem + // (which supports bf16 ISA). 
Here the bf16 kernels will be concealed + // and not compiled, and subsequently this api should error out and + // return early, even if bf16 ISA is supported by machine. +#if defined( BLIS_GCC ) && ( __GNUC__ < 10 ) + { + bli_print_msg("bf16bf16f32obf16 compiled using a compiler not " + "supporting BF16 ISA.", __FILE__, __LINE__ ); + return; // Error. + } +#endif + // Check if avx512_vnni ISA is supported, lpgemm matmul only works with it. if ( bli_cpuid_is_avx512bf16_supported() == FALSE ) { diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c index e0ffa4a0dd..ca8b160220 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c @@ -46,6 +46,19 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) trans_t blis_transa; trans_t blis_transb; + // There is this use case where lpgemm will be compiled using gcc9.4 + // (where bf16 ISA is not supported), but deployed on a zen4+ sustem + // (which supports bf16 ISA). Here the bf16 kernels will be concealed + // and not compiled, and subsequently this api should error out and + // return early, even if bf16 ISA is supported by machine. +#if defined( BLIS_GCC ) && ( __GNUC__ < 10 ) + { + bli_print_msg("bf16bf16f32of32 compiled using a compiler not " + "supporting BF16 ISA.", __FILE__, __LINE__ ); + return; // Error. + } +#endif + // Check if avx512_vnni ISA is supported, lpgemm matmul only works with it. 
if ( bli_cpuid_is_avx512bf16_supported() == FALSE ) { diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index d8ddc40023..7dd049b159 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -70,6 +70,17 @@ inline void float_to_bf16( float* float_value, bfloat16* bf16_val ) memcpy( ( bf16_val ), (char *)( float_value ) + 2, sizeof ( bfloat16 ) ); } +inline float bf16_to_float + ( + bfloat16 bf16_val + ) +{ + int32_t inter_temp = *( ( int16_t* ) &bf16_val ); + inter_temp = inter_temp << 16; + float float_value = *( float* ) ( &inter_temp ); + return float_value; +} + inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size ) { for (int i=0; i< size; i++) @@ -97,7 +108,10 @@ GEN_FILL_ARRAY_FUNC(int32_t) void fill_array_bfloat16( void* arr, dim_t size ) { float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size ); - fill_array_float( c_float, size ); + for ( dim_t i = 0; i < size; ++i ) + { + c_float[i] = 2.0; + } convert_float_arr_to_bf16( c_float, arr, size ); if ( c_float != NULL ) { @@ -337,35 +351,32 @@ int min (int a, int b) return ( a < b ? 
a : b ); } -#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(C_type,ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \ -inline C_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \ +#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \ +inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \ (\ ACCUM_type temp_accum,\ - C_type out_temp_accum, \ aocl_post_op* post_op, \ dim_t j \ )\ {\ - out_temp_accum = ( C_type ) min ( max ( nearbyintf( ( SCALE_type )temp_accum * \ + ACCUM_type out_temp_accum = ( ACCUM_type ) min ( max ( nearbyintf( ( SCALE_type )temp_accum * \ ( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ), S8_MIN ), S8_MAX ) ; \ return out_temp_accum; \ }\ -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,u8s8s16os8) -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,u8s8s32os8) -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,s8s8s32os8) -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,s8s8s16os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,u8s8s16os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,u8s8s32os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,s8s8s32os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,s8s8s16os8) -inline bfloat16 mat_mul_accuracy_check_downscale_bf16bf16f32obf16 +inline float mat_mul_accuracy_check_downscale_bf16bf16f32obf16 ( float temp_accum, - bfloat16 out_temp_accum, aocl_post_op* post_op, dim_t j ) { - float_to_bf16( ( &temp_accum ), ( &out_temp_accum ) ); - return out_temp_accum; + return temp_accum; } #define GEN_MAT_MUL_ACC_CHK_ACCUM(A_type, B_type, C_type,ACCUM_type,BLAS_SFX) \ @@ -409,17 +420,6 @@ GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) -inline float bf16_to_float - ( - bfloat16 bf16_val - ) -{ - int32_t inter_temp = *( ( int16_t* ) &bf16_val ); - inter_temp = inter_temp << 16; - float float_value = *( 
float* ) ( &inter_temp ); - return float_value; -} - inline float mat_mul_accuracy_check_accum_bf16bf16f32of32 ( bfloat16* a, @@ -553,6 +553,31 @@ GEN_GELU_ERF_POSTOP_FLOAT(f32f32f32of32) GEN_GELU_ERF_POSTOP_FLOAT(bf16bf16f32of32) GEN_GELU_ERF_POSTOP_FLOAT(bf16bf16f32obf16) +#define GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(C_type, ACCUM_type) \ +void mat_mul_get_output_type_val ## ACCUM_type ## C_type \ + ( \ + C_type* out_temp_accum, \ + ACCUM_type* temp_accum \ + ) \ +{ \ + ( *out_temp_accum ) = ( C_type )( *temp_accum ); \ +} \ + +GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int32_t,int32_t) +GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int8_t,int32_t) +GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int16_t,int16_t) +GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int8_t,int16_t) +GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(float,float) + +void mat_mul_get_output_type_valfloatbfloat16 + ( + bfloat16* out_temp_accum, + float* temp_accum + ) +{ + float_to_bf16( temp_accum, out_temp_accum ); +} + #define GEN_MAT_MUL_ACC_CHK_DRV_FUNC(A_type,B_type,C_type,ACCUM_type,SCALE_type,BLAS_SFX,BLAS_DOWNSCALE_SFX) \ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \ ( \ @@ -666,13 +691,17 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \ else if ( post_op->seq_vector[op_id] == SCALE ) \ { \ temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_downscale_,BLAS_DOWNSCALE_SFX) \ - (temp_accum, out_temp_accum, post_op, j); \ + (temp_accum, post_op, j); \ } \ else \ {} \ } \ } \ - out_temp_accum = ( C_type )temp_accum; \ + /* Need to convert to downscaled type if required.*/ \ + mat_mul_get_output_type_val ## ACCUM_type ## C_type \ + ( \ + &out_temp_accum, &temp_accum \ + ); \ \ if ( *( c + ( rs_c * i ) + ( cs_c * j ) ) != out_temp_accum ) \ { \ diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c index 21ab6bfb9c..53df235a2d 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c +++ 
b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c @@ -39,6 +39,14 @@ #include "lpgemm_f32_kern_macros.h" +#ifdef LPGEMM_BF16_NOT_SUPPORTED + +// BF16 ISA is not supported by gcc < 10. Use a dummy kernel here. +LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) +{} + +#else + // 6x64 bf16 kernel LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) { @@ -1497,4 +1505,6 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) } } } + +#endif //LPGEMM_BF16_NOT_SUPPORTED #endif diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h index 1e2f82280b..92980193c4 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h @@ -38,6 +38,12 @@ #include "../gelu_avx512.h" #include "../math_utils_avx512.h" +// Disable BF16 kernel in cases where compilers support other avx 512 +// features except BF16 ISA. 
+#if defined( BLIS_GCC ) && ( __GNUC__ < 10 ) +#define LPGEMM_BF16_NOT_SUPPORTED +#endif + /* ReLU scale (Parametric ReLU): f(x) = x, when x > 0 and f(x) = a*x when x <= 0 */ #define RELU_SCALE_OP_F32_AVX512(reg) \ /* Generate indenx of elements <= 0.*/ \ diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c index 0d8d24152b..d364ba247b 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c @@ -40,6 +40,7 @@ #include "lpgemm_f32_kern_macros.h" +#ifndef LPGEMM_BF16_NOT_SUPPORTED // 5x64 bf16 kernel LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) { @@ -3249,3 +3250,4 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64) _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); } #endif +#endif diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c index 780f2ac10b..1eab70432c 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c @@ -40,6 +40,7 @@ #include "lpgemm_f32_kern_macros.h" +#ifndef LPGEMM_BF16_NOT_SUPPORTED // 5xlt16 bf16 fringe kernel LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16) { @@ -7261,3 +7262,4 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48) _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); } #endif +#endif diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c index ede6823545..56795a00ba 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c @@ -39,6 +39,7 @@ #include 
"lpgemm_f32_kern_macros.h" +#ifndef LPGEMM_BF16_NOT_SUPPORTED // 6xlt16 bf16 fringe kernel LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16) { @@ -3324,3 +3325,4 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) } } #endif +#endif diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c index fbe3e281c5..fe39c8c038 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c @@ -115,10 +115,10 @@ void packb_nr64_bf16bf16f32of32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) { // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. - a0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) ) + jc ); - b0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) ) + jc + 32 ); - c0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) + jc ); - d0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) + jc + 32 ); + a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc + 32 ); + c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc ); + d0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc + 32 ); a01 = _mm512_unpacklo_epi16( a0, c0 ); a0 = _mm512_unpackhi_epi16( a0, c0 ); @@ -132,16 +132,16 @@ void packb_nr64_bf16bf16f32of32 c0 = _mm512_permutex2var_epi64( c01, selector1_1, c0 ); //store to pack_b buffer - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ), b0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) + 32, a0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ), d0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) + 32, c0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( 
kr + 0 ) * NR ), b0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) + 32, a0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ), d0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) + 32, c0 ); } // Handle k remainder. if( k_partial_pieces > 0) { - a0 = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); - b0 = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) + jc + 32 ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc + 32 ); c0 = _mm512_setzero_si512(); d0 = _mm512_setzero_si512(); @@ -157,10 +157,10 @@ void packb_nr64_bf16bf16f32of32 c0 = _mm512_permutex2var_epi64( c01, selector1_1, c0 ); //store to pack_b buffer - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ), b0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) + 32, a0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ), d0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) + 32, c0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ), b0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) + 32, a0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ), d0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) + 32, c0 ); } } @@ -256,8 +256,8 @@ void packb_nr48_bf16bf16f32of32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) { // Rearrange for dpbf16_ps, read 2 rows from B with 32 elements in 
each row. - a0x = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) ) ); - c0x = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) ); + a0x = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) ); + c0x = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) ); a01x = _mm512_unpacklo_epi16( a0x, c0x ); a0x = _mm512_unpackhi_epi16( a0x, c0x ); @@ -266,12 +266,12 @@ void packb_nr48_bf16bf16f32of32 a0x = _mm512_permutex2var_epi64( a01x, selector1_1, a0x ); //First 2x32 elements - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); // Rearrange for dpbf16_ps, read 2 rows from B with next 16 elements in each row. - a0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 0 ) ) + NR1 ); - c0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 1 ) ) + NR1 ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + NR1 ); + c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + NR1 ); a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); @@ -280,15 +280,23 @@ void packb_nr48_bf16bf16f32of32 a0 = _mm256_permute2f128_si256(a01, a0, 0x31); //Last 2x16 elements - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), b0 ); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2, a0 ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), + 0xFF, b0 + ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2, + 0xFF, a0 + ); kr_new += 3; } // Handle k remainder. 
if ( k_partial_pieces > 0 ) { - a0x = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0x = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) ); c0x = _mm512_setzero_si512(); a01x = _mm512_unpacklo_epi16( a0x, c0x ); @@ -298,10 +306,10 @@ void packb_nr48_bf16bf16f32of32 a0x = _mm512_permutex2var_epi64( a01x, selector1_1, a0x ); //First 2x32 elements - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); - a0 = _mm256_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) + NR1 ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + NR1 ); c0 = _mm256_setzero_si256(); a01 = _mm256_unpacklo_epi16( a0, c0 ); @@ -311,8 +319,16 @@ void packb_nr48_bf16bf16f32of32 a0 = _mm256_permute2f128_si256(a01, a0, 0x31); //Last 2x16 elements - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), b0 ); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2, a0 ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), + 0xFF, b0 + ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2, + 0xFF, a0 + ); } } @@ -344,8 +360,8 @@ void packb_nr32_bf16bf16f32of32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) { // Rearrange for dpbf16_ps, read 2 rows from B with 32 elements in each row. 
- a0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) ) ); - c0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) ); + a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) ); + c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) ); a01 = _mm512_unpacklo_epi16( a0, c0 ); a0 = _mm512_unpackhi_epi16( a0, c0 ); @@ -353,15 +369,15 @@ void packb_nr32_bf16bf16f32of32 b0 = _mm512_permutex2var_epi64( a01, selector1, a0 ); a0 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); kr_new += 2; } // Handle k remainder. if ( k_partial_pieces > 0 ) { - a0 = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) ); c0 = _mm512_setzero_si512(); a01 = _mm512_unpacklo_epi16( a0, c0 ); @@ -370,8 +386,8 @@ void packb_nr32_bf16bf16f32of32 b0 = _mm512_permutex2var_epi64( a01, selector1, a0 ); a0 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 ); - _mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); } } @@ -399,8 +415,8 @@ void packb_nr16_bf16bf16f32of32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) { // Rearrange for dpbf16_ps, read 2 rows from B with 16 elements in each row. 
- a0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 0 ) ) ); - c0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 1 ) ) ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) ) ); + c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) ) ); a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); @@ -408,15 +424,23 @@ void packb_nr16_bf16bf16f32of32 b0 = _mm256_permute2f128_si256(a01, a0, 0x20); a0 = _mm256_permute2f128_si256(a01, a0, 0x31); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 ); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), + 0xFF, b0 + ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), + 0xFF, a0 + ); kr_new += 2; } // Handle k remainder. if ( k_partial_pieces > 0 ) { - a0 = _mm256_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); c0 = _mm256_setzero_si256(); a01 = _mm256_unpacklo_epi16( a0, c0 ); @@ -425,8 +449,16 @@ void packb_nr16_bf16bf16f32of32 b0 = _mm256_permute2f128_si256(a01, a0, 0x20); a0 = _mm256_permute2f128_si256(a01, a0, 0x31); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 ); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), + 0xFF, b0 + ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), + 0xFF, a0 + ); } } @@ -460,8 +492,8 @@ void packb_nrlt16_bf16bf16f32of32 memcpy( buf0, ( b + ( ldb * ( kr + 0 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) ); memcpy( buf1, ( b + ( ldb * ( kr + 1 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) ); // Rearrange for dpbf16_ps, read 2 rows from B with next 16 elements in each row. 
- a0 = _mm256_loadu_epi16( buf0 ); - c0 = _mm256_loadu_epi16( buf1 ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf0 ); + c0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf1 ); a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); @@ -469,8 +501,16 @@ void packb_nrlt16_bf16bf16f32of32 b0 = _mm256_permute2f128_si256(a01, a0, 0x20); a0 = _mm256_permute2f128_si256(a01, a0, 0x31); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 ); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), + 0xFF, b0 + ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), + 0xFF, a0 + ); kr_new += 2; } @@ -478,7 +518,7 @@ void packb_nrlt16_bf16bf16f32of32 if ( k_partial_pieces > 0 ) { memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) ); - a0 = _mm256_loadu_epi16( buf0 ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf0 ); c0 = _mm256_setzero_si256(); a01 = _mm256_unpacklo_epi16( a0, c0 ); @@ -487,8 +527,16 @@ void packb_nrlt16_bf16bf16f32of32 b0 = _mm256_permute2f128_si256(a01, a0, 0x20); a0 = _mm256_permute2f128_si256(a01, a0, 0x31); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 ); - _mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), + 0xFF, b0 + ); + _mm256_mask_storeu_epi64 + ( + pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), + 0xFF, a0 + ); } } #endif diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c index ae79b149cd..a2e487bcb3 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c @@ -211,7 
+211,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -219,9 +219,9 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -306,7 +306,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -319,9 +319,9 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -433,7 +433,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr ); + b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -442,7 +442,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 ); c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -451,7 +451,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 ); c_int32_5p1 = _mm512_sub_epi32( c_int32_5p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -460,7 +460,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) c_int32_4p2 = _mm512_sub_epi32( c_int32_4p2 , b0 ); c_int32_5p2 = _mm512_sub_epi32( c_int32_5p2 , b0 ); - 
b0 = _mm512_loadu_epi32( bsumptr + 48 ); + b0 = _mm512_loadu_si512( bsumptr + 48 ); c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 ); c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 ); @@ -562,16 +562,16 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) POST_OPS_BIAS_6x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -1048,16 +1048,16 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) POST_OPS_DOWNSCALE_6x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -1226,76 +1226,76 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) { // 
Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), 
c_int32_2p3 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); // c[3,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); // c[4,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); // c[5,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); // c[5,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_int32_5p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), 
c_int32_5p3 ); } a = a + ( MR * ps_a ); diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c index 991923b1ae..a338484df6 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c @@ -99,7 +99,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -107,9 +107,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -180,7 +180,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -193,9 +193,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -288,7 +288,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -296,7 +296,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 ); c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -304,7 +304,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 ); c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -312,7 +312,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 ); c_int32_4p2 = _mm512_sub_epi32( 
c_int32_4p2 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 48 ); + b0 = _mm512_loadu_si512( bsumptr + 48 ); c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 ); c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 ); @@ -400,16 +400,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) POST_OPS_BIAS_5x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -814,16 +814,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) POST_OPS_DOWNSCALE_5x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -963,64 +963,64 @@ 
LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), 
c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); // c[3,48-63] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); // c[4,48-63] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 ); } } @@ -1077,7 +1077,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1085,9 +1085,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1144,7 +1144,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1157,9 +1157,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. 
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1233,28 +1233,28 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 ); c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 ); c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 ); c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 48 ); + b0 = _mm512_loadu_si512( bsumptr + 48 ); c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 ); c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 ); @@ -1331,16 +1331,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) POST_OPS_BIAS_4x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + 
_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -1673,16 +1673,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) POST_OPS_DOWNSCALE_4x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -1798,52 +1798,52 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 
); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); // c[3,48-63] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); } } @@ -1895,7 +1895,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1903,9 +1903,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1948,7 +1948,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1961,9 +1961,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2018,25 +2018,25 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 48 ); + b0 = _mm512_loadu_si512( bsumptr + 48 ); c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 ); c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 ); @@ -2100,16 +2100,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) POST_OPS_BIAS_3x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + 
_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -2370,16 +2370,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) POST_OPS_DOWNSCALE_3x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -2471,40 +2471,40 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); } } @@ -2551,7 +2551,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = 
_mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2559,9 +2559,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2590,7 +2590,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2603,9 +2603,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. 
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2641,22 +2641,22 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 48 ); + b0 = _mm512_loadu_si512( bsumptr + 48 ); c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 ); c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 ); @@ -2708,16 +2708,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) POST_OPS_BIAS_2x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -2907,16 +2907,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) POST_OPS_DOWNSCALE_2x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + 
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -2984,28 +2984,28 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); } } @@ -3047,7 +3047,7 
@@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr] a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -3055,9 +3055,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -3072,7 +3072,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -3085,9 +3085,9 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) //convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -3103,19 +3103,19 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 48 ); + b0 = _mm512_loadu_si512( bsumptr + 48 ); c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 ); } @@ -3155,16 +3155,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) POST_OPS_BIAS_1x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + 
_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -3281,16 +3281,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) POST_OPS_DOWNSCALE_1x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -3334,16 +3334,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) { // Store the accumulated results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); } } #endif diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c index 96573f9fb8..c009bdeaf3 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c @@ -75,7 +75,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -133,7 +133,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -198,7 +198,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5xlt16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr ); + __m512i b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -529,7 +529,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -577,7 +577,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -631,7 +631,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4xlt16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr ); + __m512i b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -921,7 +921,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -959,7 +959,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -1002,7 +1002,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3xlt16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr ); + __m512i b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -1252,7 +1252,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1280,7 +1280,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -1312,7 +1312,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2xlt16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -1522,7 +1522,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1540,7 +1540,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -1561,7 +1561,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1xlt16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); } @@ -1740,7 +1740,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1798,7 +1798,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -1862,7 +1862,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -1935,7 +1935,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) POST_OPS_BIAS_5x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -2070,7 +2070,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) POST_OPS_DOWNSCALE_5x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -2120,19 +2120,19 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); } } @@ -2167,7 +2167,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2215,7 +2215,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -2268,7 +2268,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -2332,7 +2332,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) POST_OPS_BIAS_4x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -2449,7 +2449,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) POST_OPS_DOWNSCALE_4x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -2493,16 +2493,16 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); } } @@ -2535,7 +2535,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2573,7 +2573,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -2615,7 +2615,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -2670,7 +2670,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) POST_OPS_BIAS_3x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -2769,7 +2769,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) POST_OPS_DOWNSCALE_3x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -2807,13 +2807,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); } } @@ -2844,7 +2844,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2872,7 +2872,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -2903,7 +2903,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -2949,7 +2949,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) POST_OPS_BIAS_2x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -3030,7 +3030,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) POST_OPS_DOWNSCALE_2x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -3062,10 +3062,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); } } @@ -3094,7 +3094,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -3112,7 +3112,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -3132,7 +3132,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - __m512i b0 = _mm512_loadu_epi32( bsumptr); + __m512i b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); } @@ -3169,7 +3169,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) POST_OPS_BIAS_1x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -3232,7 +3232,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) POST_OPS_DOWNSCALE_1x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -3258,7 +3258,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); } } @@ -3307,8 +3307,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -3371,8 +3371,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -3441,7 +3441,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -3449,7 +3449,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 ); c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -3527,10 +3527,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) POST_OPS_BIAS_5x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -3755,10 +3755,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) 
POST_OPS_DOWNSCALE_5x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -3838,34 +3838,34 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 
); } } @@ -3911,8 +3911,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -3964,8 +3964,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -4022,14 +4022,14 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 ); c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -4097,10 +4097,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) POST_OPS_BIAS_4x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -4289,10 +4289,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) POST_OPS_DOWNSCALE_4x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -4360,28 +4360,28 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); } } @@ -4424,8 +4424,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -4466,8 +4466,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -4512,13 +4512,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -4576,10 +4576,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) POST_OPS_BIAS_3x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -4732,10 +4732,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) POST_OPS_DOWNSCALE_3x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* 
)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -4791,22 +4791,22 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); } } @@ -4846,8 +4846,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -4877,8 +4877,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -4911,12 +4911,12 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -4964,10 +4964,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) POST_OPS_BIAS_2x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -5084,10 +5084,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) POST_OPS_DOWNSCALE_2x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 
* 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -5131,16 +5131,16 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); } } @@ -5177,8 +5177,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -5197,8 +5197,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -5219,11 +5219,11 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16); + b0 = _mm512_loadu_si512( bsumptr + 16); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); } @@ -5261,10 +5261,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) POST_OPS_BIAS_1x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -5345,10 +5345,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) POST_OPS_DOWNSCALE_1x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -5380,10 +5380,10 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); } } @@ -5438,9 +5438,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -5508,9 +5508,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -5584,7 +5584,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -5592,7 +5592,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 ); c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -5600,7 +5600,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 ); c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -5683,13 +5683,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) POST_OPS_BIAS_5x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -6004,13 +6004,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) POST_OPS_DOWNSCALE_5x48: { selector1 = - 
_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -6120,49 +6120,49 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), 
c_int32_2p2 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); } } @@ -6213,9 +6213,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -6271,9 +6271,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -6334,21 +6334,21 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 ); c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 ); c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -6420,13 +6420,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) POST_OPS_BIAS_4x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* 
)post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -6687,13 +6687,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) POST_OPS_DOWNSCALE_4x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -6785,40 +6785,40 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); } } @@ -6865,9 +6865,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = 
_mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -6911,9 +6911,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -6961,19 +6961,19 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -7034,13 +7034,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) POST_OPS_BIAS_3x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -7247,13 +7247,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) POST_OPS_DOWNSCALE_3x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( 
( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -7327,31 +7327,31 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); } } @@ -7394,9 +7394,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = 
_mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -7428,9 +7428,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -7466,17 +7466,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -7526,13 +7526,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) POST_OPS_BIAS_2x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -7685,13 +7685,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) POST_OPS_DOWNSCALE_2x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - 
_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -7747,22 +7747,22 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); } } @@ -7801,9 +7801,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -7823,9 +7823,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) ); @@ -7848,15 +7848,15 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr); + b0 = _mm512_loadu_si512( bsumptr); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16); + b0 = _mm512_loadu_si512( bsumptr + 16); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32); + b0 = _mm512_loadu_si512( bsumptr + 32); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); } @@ -7895,13 +7895,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) POST_OPS_BIAS_1x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* 
)post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -8000,13 +8000,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) POST_OPS_DOWNSCALE_1x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -8044,13 +8044,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); } } #endif diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c index 9d4a73bf8c..b88ef512d6 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c @@ -94,7 +94,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6xlt16) // in vnni instructions and each load to ZMM register will have 4 // elements along k direction and 16 elements across n directions, // so 4x16 elements to a ZMM register. 
- b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -162,7 +162,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -262,7 +262,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6xlt16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr ); + b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -724,7 +724,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -792,7 +792,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -892,7 +892,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr ); + b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -974,7 +974,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) POST_OPS_BIAS_6x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -1127,7 +1127,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) POST_OPS_DOWNSCALE_6x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -1183,22 +1183,22 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); } a = a + ( MR * ps_a ); @@ -1336,8 +1336,8 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1411,8 +1411,8 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1518,7 +1518,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr ); + b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -1527,7 +1527,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 ); c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -1615,10 +1615,10 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) POST_OPS_BIAS_6x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -1879,10 +1879,10 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) POST_OPS_DOWNSCALE_6x32: { selector1 = - _mm512_loadu_epi32( ( float* 
)post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -1974,40 +1974,40 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( 
ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); } a = a + ( MR * ps_a ); @@ -2151,9 +2151,9 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2233,9 +2233,9 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2347,7 +2347,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; - b0 = _mm512_loadu_epi32( bsumptr ); + b0 = _mm512_loadu_si512( bsumptr ); c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 ); c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 ); @@ -2356,7 +2356,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 ); c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 16 ); + b0 = _mm512_loadu_si512( bsumptr + 16 ); c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 ); c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 ); @@ -2365,7 +2365,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 ); c_int32_5p1 = _mm512_sub_epi32( c_int32_5p1 , b0 ); - b0 = _mm512_loadu_epi32( bsumptr + 32 ); + b0 = _mm512_loadu_si512( bsumptr + 32 ); c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 ); c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 ); @@ -2459,13 +2459,13 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) POST_OPS_BIAS_6x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -2835,13 +2835,13 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) POST_OPS_DOWNSCALE_6x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + 
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -2969,58 +2969,58 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 
2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); // c[5,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); } a = a + ( MR * ps_a ); diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c index afb461b00b..cb663f7425 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c @@ -132,12 +132,12 @@ void packa_k64_s8s8s32os32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // 
Rearrange for vpdpbusd, read 6 rows from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * ( ic + 0 ) ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * ( ic + 1 ) ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * ( ic + 2 ) ) + kr ); - d0 = _mm512_loadu_epi8( a + ( lda * ( ic + 3 ) ) + kr ); - e0 = _mm512_loadu_epi8( a + ( lda * ( ic + 4 ) ) + kr ); - f0 = _mm512_loadu_epi8( a + ( lda * ( ic + 5 ) ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * ( ic + 0 ) ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * ( ic + 1 ) ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * ( ic + 2 ) ) + kr ); + d0 = _mm512_loadu_si512( a + ( lda * ( ic + 3 ) ) + kr ); + e0 = _mm512_loadu_si512( a + ( lda * ( ic + 4 ) ) + kr ); + f0 = _mm512_loadu_si512( a + ( lda * ( ic + 5 ) ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -170,12 +170,17 @@ void packa_k64_s8s8s32os32 d0 = _mm512_permutex2var_epi64( a0, selector5, e01 ); // 2nd 64 a0 = _mm512_permutex2var_epi64( a0, selector6, e0 ); // 2nd 32 - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si256( a0 ); - _mm256_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ), last_piece ); + _mm256_mask_storeu_epi64 + ( + pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ), + 0xFF, + last_piece + ); // Second half b0 = _mm512_permutex2var_epi64( c01, 
selector7, e01 ); // 3rd 64 @@ -183,12 +188,17 @@ void packa_k64_s8s8s32os32 d0 = _mm512_permutex2var_epi64( c0, selector9, e01 ); // 4th 64 c0 = _mm512_permutex2var_epi64( c0, selector10, e0 ); // 4th 32 - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si256( c0 ); - _mm256_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ), last_piece ); + _mm256_mask_storeu_epi64 + ( + pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ), + 0xFF, + last_piece + ); } //TODO: Handle kc < 64 case, 48,32,16 } @@ -280,11 +290,11 @@ void packa_m5_k64_s8s8s32os32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 5 rows from A with 64 elements in each row. 
- a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr ); - d0 = _mm512_loadu_epi8( a + ( lda * 3 ) + kr ); - e0 = _mm512_loadu_epi8( a + ( lda * 4 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr ); + d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr ); + e0 = _mm512_loadu_si512( a + ( lda * 4 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -314,12 +324,17 @@ void packa_m5_k64_s8s8s32os32 d0 = _mm512_permutex2var_epi32( a0, selector5, e0 ); a0 = _mm512_permutex2var_epi32( a0, selector6, e0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si128( a0 ); - _mm_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ), last_piece ); + _mm_mask_storeu_epi64 + ( + pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ), + 0xFF, + last_piece + ); // Second half b0 = _mm512_permutex2var_epi32( c01, selector7, e0 ); @@ -327,12 +342,17 @@ void packa_m5_k64_s8s8s32os32 d0 = _mm512_permutex2var_epi32( c0, selector9, e0 ); c0 = _mm512_permutex2var_epi32( c0, selector10, e0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 ); + _mm512_storeu_si512( 
pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si128( c0 ); - _mm_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ), last_piece ); + _mm_mask_storeu_epi64 + ( + pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ), + 0xFF, + last_piece + ); } } @@ -362,10 +382,10 @@ void packa_m4_k64_s8s8s32os32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 4 rows from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr ); - d0 = _mm512_loadu_epi8( a + ( lda * 3 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr ); + d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -389,10 +409,10 @@ void packa_m4_k64_s8s8s32os32 a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // a[1] c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // a[3] - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 ); } } @@ -427,9 +447,9 @@ void 
packa_m3_k64_s8s8s32os32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 3 rows from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -440,16 +460,21 @@ void packa_m3_k64_s8s8s32os32 a0 = _mm512_permutex2var_epi32( b0, selector3, c0 ); b0 = _mm512_permutex2var_epi32( b0, selector4, c0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 0 ) ), a0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 64 ) ) , b0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 0 ) ), a0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 64 ) ) , b0 ); a0 = _mm512_permutex2var_epi32( a01, selector5, c0 ); b0 = _mm512_permutex2var_epi32( a01, selector6, c0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 96 ) ), a0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 96 ) ), a0 ); // Last piece last_piece = _mm512_castsi512_si256( b0 ); - _mm256_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 160 ) ), last_piece ); + _mm256_mask_storeu_epi64 + ( + pack_a_buffer_s8s8s32o32 + ( ( kr * 3 ) + ( 160 ) ), + 0xFF, + last_piece + ); } } @@ -474,8 +499,8 @@ void packa_m2_k64_s8s8s32os32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 2 rows from A with 64 elements in each row. 
- a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -483,8 +508,8 @@ void packa_m2_k64_s8s8s32os32 b0 = _mm512_permutex2var_epi64( a01, selector1, a0 ); // a[0] a01 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); // a[1] - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 2 ) + ( 0 ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 2 ) + ( 64 ) ) , a01 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 2 ) + ( 0 ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 2 ) + ( 64 ) ) , a01 ); } } @@ -501,9 +526,9 @@ void packa_m1_k64_s8s8s32os32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 1 row from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); - _mm512_storeu_epi64( pack_a_buffer_s8s8s32o32 + ( ( kr * 1 ) + ( 0 ) ), a0 ); + _mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 1 ) + ( 0 ) ), a0 ); } } #endif diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c index c0a7419912..532f2c264b 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c @@ -127,18 +127,18 @@ void packb_nr64_s8s8s32os32 for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR ) { //load the temp buffer to compute column sum of B matrix - sum1 = _mm512_loadu_epi32( pack_b_column_sum + jc ); - sum2 = _mm512_loadu_epi32( pack_b_column_sum + 16 + jc ); //offset 16- as 16 int32 elements fit in 1 zmm register - sum3 = _mm512_loadu_epi32( pack_b_column_sum + 32 + jc ); - sum4 = _mm512_loadu_epi32( pack_b_column_sum + 48 + jc ); + sum1 = 
_mm512_loadu_si512( pack_b_column_sum + jc ); + sum2 = _mm512_loadu_si512( pack_b_column_sum + 16 + jc ); //offset 16- as 16 int32 elements fit in 1 zmm register + sum3 = _mm512_loadu_si512( pack_b_column_sum + 32 + jc ); + sum4 = _mm512_loadu_si512( pack_b_column_sum + 48 + jc ); for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with 64 elements in each row. - a0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 0 ) ) + jc ); - b0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 1 ) ) + jc ); - c0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 2 ) ) + jc ); - d0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 3 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc ); + c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 2 ) ) + jc ); + d0 = _mm512_loadu_si512( b + ( ldb * ( kr + 3 ) ) + jc ); //add all the columns : sum = add (sum, a0, b0, c0, d0) sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( @@ -187,19 +187,19 @@ void packb_nr64_s8s8s32os32 a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1] c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3] - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 ); 
} // Handle k remainder. if ( k_partial_pieces > 0 ) { if ( k_partial_pieces == 3 ) { - a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); - b0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); - c0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); + c0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 2 ) ) + jc ); d0 = _mm512_setzero_si512(); //add all the columns : sum = add (sum, a0, b0, c0) @@ -226,8 +226,8 @@ void packb_nr64_s8s8s32os32 } else if( k_partial_pieces == 2 ) { - a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); - b0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); c0 = _mm512_setzero_si512(); d0 = _mm512_setzero_si512(); @@ -250,7 +250,7 @@ void packb_nr64_s8s8s32os32 } else //k_partial_pieces == 1 { - a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); b0 = _mm512_setzero_si512(); c0 = _mm512_setzero_si512(); d0 = _mm512_setzero_si512(); @@ -291,16 +291,16 @@ void packb_nr64_s8s8s32os32 a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1] c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3] - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) ) , a0 ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 ); + _mm512_storeu_si512( 
pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) ) , a0 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 ); } //store the sum column - _mm512_storeu_epi32( pack_b_column_sum + jc, sum1 ); - _mm512_storeu_epi32( pack_b_column_sum + 16 + jc, sum2 ); - _mm512_storeu_epi32( pack_b_column_sum + 32 + jc, sum3 ); - _mm512_storeu_epi32( pack_b_column_sum + 48 + jc, sum4 ); + _mm512_storeu_si512( pack_b_column_sum + jc, sum1 ); + _mm512_storeu_si512( pack_b_column_sum + 16 + jc, sum2 ); + _mm512_storeu_si512( pack_b_column_sum + 32 + jc, sum3 ); + _mm512_storeu_si512( pack_b_column_sum + 48 + jc, sum4 ); } // Contiguous packing of fringe panel (n` < NR). @@ -401,17 +401,17 @@ void packb_nr48_s8s8s32os32 __m512i mul_128 = _mm512_set1_epi32 (7); //load the temp buffer to compute column sum of B matrix - sum1 = _mm512_loadu_epi32( pack_b_column_sum ); - sum2 = _mm512_loadu_epi32( pack_b_column_sum + 16 ); //offset 16- as 16 int32 elements fit in 1 zmm register - sum3 = _mm512_loadu_epi32( pack_b_column_sum + 32 ); + sum1 = _mm512_loadu_si512( pack_b_column_sum ); + sum2 = _mm512_loadu_si512( pack_b_column_sum + 16 ); //offset 16- as 16 int32 elements fit in 1 zmm register + sum3 = _mm512_loadu_si512( pack_b_column_sum + 32 ); for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row. 
- a0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 2 ) ) ); - d0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 3 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) ); + d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) ); //add all the columns : sum = add (sum, a0, b0, c0, d0) sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( @@ -449,14 +449,14 @@ void packb_nr48_s8s8s32os32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); // Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row. 
- a0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 0 ) ) + ( 32 ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 1 ) ) + ( 32 ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 2 ) ) + ( 32 ) ); - d0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 3 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + ( 32 ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + ( 32 ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) + ( 32 ) ); + d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) + ( 32 ) ); //add all the columns : sum = add (sum, a0_32, b0_32, c0_32, d0_32) sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), @@ -480,7 +480,7 @@ void packb_nr48_s8s8s32os32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); // The 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. 
@@ -491,9 +491,9 @@ void packb_nr48_s8s8s32os32 { if ( k_partial_pieces == 3 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) ); d0_32 = _mm256_setzero_si256(); //add all the columns : sum = add (sum, a0, b0, c0) @@ -507,9 +507,9 @@ void packb_nr48_s8s8s32os32 _mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 1)), _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 1)))) , mul_128)); - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) ); d0_16 = _mm_setzero_si128(); sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (_mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), @@ -518,8 +518,8 @@ void packb_nr48_s8s8s32os32 } else if( k_partial_pieces == 2 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); @@ -532,8 +532,8 @@ void packb_nr48_s8s8s32os32 _mm512_add_epi32 ( 
_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)), _mm512_cvtepi8_epi32( _mm256_extracti32x4_epi32 ( b0_32, 1) )) , mul_128 )); - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -542,7 +542,7 @@ void packb_nr48_s8s8s32os32 } else //k_partial_pieces == 1 { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); b0_32 = _mm256_setzero_si256(); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); @@ -554,7 +554,7 @@ void packb_nr48_s8s8s32os32 sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)) , mul_128)); - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); b0_16 = _mm_setzero_si128(); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -586,8 +586,8 @@ void packb_nr48_s8s8s32os32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); @@ -606,12 +606,12 @@ void packb_nr48_s8s8s32os32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. 
- _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); } //store the sum column - _mm512_storeu_epi32( pack_b_column_sum, sum1 ); - _mm512_storeu_epi32( pack_b_column_sum + 16, sum2 ); - _mm512_storeu_epi32( pack_b_column_sum + 32, sum3 ); + _mm512_storeu_si512( pack_b_column_sum, sum1 ); + _mm512_storeu_si512( pack_b_column_sum + 16, sum2 ); + _mm512_storeu_si512( pack_b_column_sum + 32, sum3 ); } void packb_nr32_s8s8s32os32 @@ -643,16 +643,16 @@ void packb_nr32_s8s8s32os32 __m512i mul_128 = _mm512_set1_epi32 (7); //load the temp buffer to compute column sum of B matrix - sum1 = _mm512_loadu_epi32( pack_b_column_sum ); - sum2 = _mm512_loadu_epi32( pack_b_column_sum + 16 ); //offset 16- as 16 int32 elements fit in 1 zmm register + sum1 = _mm512_loadu_si512( pack_b_column_sum ); + sum2 = _mm512_loadu_si512( pack_b_column_sum + 16 ); //offset 16- as 16 int32 elements fit in 1 zmm register for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row. - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 2 ) ) ); - d0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 3 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) ); + d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) ); //add all the columns : sum = add (sum, a0, b0, c0, d0) sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( @@ -690,8 +690,8 @@ void packb_nr32_s8s8s32os32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. 
- _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); // The 3rd and 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. @@ -702,9 +702,9 @@ void packb_nr32_s8s8s32os32 { if ( k_partial_pieces == 3 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) ); d0_32 = _mm256_setzero_si256(); //add all the columns : sum = add (sum, a0, b0, c0) @@ -721,8 +721,8 @@ void packb_nr32_s8s8s32os32 } else if( k_partial_pieces == 2 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); @@ -737,7 +737,7 @@ void packb_nr32_s8s8s32os32 } else //k_partial_pieces == 1 { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); b0_32 = _mm256_setzero_si256(); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); @@ -773,12 +773,12 @@ void packb_nr32_s8s8s32os32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 
0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); } //store the sum column - _mm512_storeu_epi32( pack_b_column_sum, sum1 ); - _mm512_storeu_epi32( pack_b_column_sum + 16, sum2 ); + _mm512_storeu_si512( pack_b_column_sum, sum1 ); + _mm512_storeu_si512( pack_b_column_sum + 16, sum2 ); } void packb_nr16_s8s8s32os32 @@ -809,15 +809,15 @@ void packb_nr16_s8s8s32os32 __m512i mul_128 = _mm512_set1_epi32 (7); //load the temp buffer to compute column sum of B matrix - sum1 = _mm512_loadu_epi32( pack_b_column_sum ); + sum1 = _mm512_loadu_si512( pack_b_column_sum ); for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row. - a0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 0 ) ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 1 ) ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 2 ) ) ); - d0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 3 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) ); + d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) ); //add all the columns : sum = add (sum, a0, b0, c0, d0) sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), @@ -841,7 +841,7 @@ void packb_nr16_s8s8s32os32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. 
- _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); // The 2nd, 3rd, and 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. @@ -852,9 +852,9 @@ void packb_nr16_s8s8s32os32 { if ( k_partial_pieces == 3 ) { - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) ); d0_16 = _mm_setzero_si128(); sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (_mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), @@ -863,8 +863,8 @@ void packb_nr16_s8s8s32os32 } else if( k_partial_pieces == 2 ) { - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -873,7 +873,7 @@ void packb_nr16_s8s8s32os32 } else //k_partial_pieces == 1 { - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); b0_16 = _mm_setzero_si128(); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -898,10 +898,10 @@ void packb_nr16_s8s8s32os32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. 
- _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); } //store the sum column - _mm512_storeu_epi32( pack_b_column_sum, sum1 ); + _mm512_storeu_si512( pack_b_column_sum, sum1 ); } void packb_nrlt16_s8s8s32os32 @@ -938,7 +938,7 @@ void packb_nrlt16_s8s8s32os32 __m512i mul_128 = _mm512_set1_epi32 (7); //load the temp buffer to compute column sum of B matrix - sum1 = _mm512_loadu_epi32( pack_b_column_sum ); + sum1 = _mm512_loadu_si512( pack_b_column_sum ); for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { @@ -948,10 +948,10 @@ void packb_nrlt16_s8s8s32os32 memcpy( buf3, ( b + ( ldb * ( kr + 3 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); // Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row. - a0_16 = _mm_loadu_epi8( buf0 ); - b0_16 = _mm_loadu_epi8( buf1 ); - c0_16 = _mm_loadu_epi8( buf2 ); - d0_16 = _mm_loadu_epi8( buf3 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 ); + d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf3 ); //add all the columns : sum = add (sum, a0, b0, c0, d0) sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), @@ -975,7 +975,7 @@ void packb_nrlt16_s8s8s32os32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); // The 2nd, 3rd, and 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. 
@@ -990,9 +990,9 @@ void packb_nrlt16_s8s8s32os32 memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); memcpy( buf2, ( b + ( ldb * ( k_full_pieces + 2 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); - a0_16 = _mm_loadu_epi8( buf0 ); - b0_16 = _mm_loadu_epi8( buf1 ); - c0_16 = _mm_loadu_epi8( buf2 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 ); d0_16 = _mm_setzero_si128(); sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (_mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), @@ -1004,8 +1004,8 @@ void packb_nrlt16_s8s8s32os32 memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); - a0_16 = _mm_loadu_epi8( buf0 ); - b0_16 = _mm_loadu_epi8( buf1 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 ); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -1016,7 +1016,7 @@ void packb_nrlt16_s8s8s32os32 { memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); - a0_16 = _mm_loadu_epi8( buf0 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); b0_16 = _mm_setzero_si128(); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -1041,9 +1041,9 @@ void packb_nrlt16_s8s8s32os32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. 
- _mm512_storeu_epi64( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); } //store the sum column - _mm512_storeu_epi32( pack_b_column_sum, sum1 ); + _mm512_storeu_si512( pack_b_column_sum, sum1 ); } #endif diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c index 40fc6ccb1a..f79cd8775a 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c @@ -212,7 +212,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32 @@ -220,9 +220,9 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Broadcast a[1,kr:kr+4]. a_int32_1 = _mm512_set1_epi32 @@ -302,7 +302,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -312,9 +312,9 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) ); a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -496,16 +496,16 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) POST_OPS_BIAS_6x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -761,11 +761,11 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) for ( dim_t gelu_id = 0; gelu_id < temp_buf_4elem_len; ++gelu_id ) { - c_int32_0p0 = _mm512_loadu_epi32( temp_buf + ( gelu_id * 16 ) ); + c_int32_0p0 = _mm512_loadu_si512( temp_buf + ( gelu_id * 16 ) ); GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, selector1) - _mm512_storeu_epi32( temp_buf + ( gelu_id * 16 ), c_int32_0p0 ); + _mm512_storeu_si512( 
temp_buf + ( gelu_id * 16 ), c_int32_0p0 ); } S32_GELU_STORE1R_4C(temp_buf,0,16,c_int32_0) @@ -795,11 +795,11 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) for ( dim_t gelu_id = 0; gelu_id < temp_buf_4elem_len; ++gelu_id ) { - c_int32_0p0 = _mm512_loadu_epi32( temp_buf + ( gelu_id * 16 ) ); + c_int32_0p0 = _mm512_loadu_si512( temp_buf + ( gelu_id * 16 ) ); GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf) - _mm512_storeu_epi32( temp_buf + ( gelu_id * 16 ), c_int32_0p0 ); + _mm512_storeu_si512( temp_buf + ( gelu_id * 16 ), c_int32_0p0 ); } S32_GELU_STORE1R_4C(temp_buf,0,16,c_int32_0) @@ -894,16 +894,16 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) POST_OPS_DOWNSCALE_6x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -1072,76 +1072,76 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_int32_2p3 ); // 
c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); // c[3,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); // c[4,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); // c[5,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); // c[5,48-63] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_int32_5p3 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_int32_5p3 ); } a = a + ( 
MR * ps_a ); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c index 1b8547560a..bcaa2d81c3 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c @@ -96,14 +96,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -162,7 +162,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -172,9 +172,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) ); a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -327,16 +327,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) POST_OPS_BIAS_5x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -741,16 +741,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) POST_OPS_DOWNSCALE_5x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + 
post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -890,64 +890,64 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 
), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); // c[3,48-63] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); // c[4,48-63] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 ); } } @@ -1001,14 +1001,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1056,7 +1056,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1066,9 +1066,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) ); a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. 
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1194,16 +1194,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) POST_OPS_BIAS_4x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -1536,16 +1536,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) POST_OPS_DOWNSCALE_4x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -1661,52 +1661,52 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 
); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); // c[3,48-63] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 ); } } @@ -1755,14 +1755,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1799,7 +1799,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1809,9 +1809,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) ); a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -1910,16 +1910,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) POST_OPS_BIAS_3x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -2180,16 +2180,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) POST_OPS_DOWNSCALE_3x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + 
post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -2281,40 +2281,40 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 
1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[2,48-63] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 ); } } @@ -2358,14 +2358,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2391,7 +2391,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2401,9 +2401,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) ); a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2475,16 +2475,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) POST_OPS_BIAS_2x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -2674,16 +2674,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) POST_OPS_DOWNSCALE_2x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + 
post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -2751,28 +2751,28 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[1,48-63] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 ); } } @@ -2811,14 +2811,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // 
Broadcast a[0,kr] a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2833,7 +2833,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2843,9 +2843,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) ); a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); - b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) ); // Perform column direction mat-mul with k = 4. 
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2890,16 +2890,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) POST_OPS_BIAS_1x64: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0,0-15] @@ -3016,16 +3016,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) POST_OPS_DOWNSCALE_1x64: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); a_int32_1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] @@ -3069,16 +3069,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) { // Store the accumulated results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[0,48-63] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 ); } } #endif diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c index d6b94586e1..940d9e92fa 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c @@ -72,7 +72,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -115,7 +115,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -496,7 +496,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -532,7 +532,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -862,7 +862,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -891,7 +891,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1170,7 +1170,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1192,7 +1192,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1420,7 +1420,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1xlt16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1435,7 +1435,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1621,7 +1621,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1664,7 +1664,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1791,7 +1791,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) POST_OPS_BIAS_5x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -1926,7 +1926,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) POST_OPS_DOWNSCALE_5x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -1976,19 +1976,19 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); } } @@ -2020,7 +2020,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. __m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2056,7 +2056,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2163,7 +2163,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) POST_OPS_BIAS_4x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -2280,7 +2280,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) POST_OPS_DOWNSCALE_4x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -2324,16 +2324,16 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); } } @@ -2363,7 +2363,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2392,7 +2392,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2479,7 +2479,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) POST_OPS_BIAS_3x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -2578,7 +2578,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) POST_OPS_DOWNSCALE_3x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -2616,13 +2616,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); } } @@ -2650,7 +2650,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2672,7 +2672,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2739,7 +2739,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) POST_OPS_BIAS_2x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -2820,7 +2820,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) POST_OPS_DOWNSCALE_2x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -2852,10 +2852,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); } } @@ -2881,7 +2881,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2896,7 +2896,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - __m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + __m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2943,7 +2943,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) POST_OPS_BIAS_1x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -3006,7 +3006,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) POST_OPS_DOWNSCALE_1x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -3032,7 +3032,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); } } @@ -3078,8 +3078,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -3127,8 +3127,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -3265,10 +3265,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) POST_OPS_BIAS_5x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -3493,10 +3493,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) POST_OPS_DOWNSCALE_5x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -3576,34 +3576,34 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); } } @@ -3646,8 +3646,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -3687,8 +3687,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -3803,10 +3803,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) POST_OPS_BIAS_4x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -3995,10 +3995,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) POST_OPS_DOWNSCALE_4x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -4066,28 +4066,28 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); } } @@ -4127,8 +4127,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -4160,8 +4160,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -4254,10 +4254,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) POST_OPS_BIAS_3x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -4410,10 +4410,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) POST_OPS_DOWNSCALE_3x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -4469,22 +4469,22 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); } } @@ -4521,8 +4521,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -4546,8 +4546,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -4618,10 +4618,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) POST_OPS_BIAS_2x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -4738,10 +4738,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) POST_OPS_DOWNSCALE_2x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -4785,16 +4785,16 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); } } @@ -4828,8 +4828,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -4845,8 +4845,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -4895,10 +4895,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) POST_OPS_BIAS_1x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -4979,10 +4979,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) POST_OPS_DOWNSCALE_1x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -5014,10 +5014,10 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); } } @@ -5069,9 +5069,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -5124,9 +5124,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -5273,13 +5273,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) POST_OPS_BIAS_5x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -5594,13 +5594,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) POST_OPS_DOWNSCALE_5x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* 
)post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -5710,49 +5710,49 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c 
+ ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 ); } } @@ -5800,9 +5800,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -5846,9 +5846,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -5971,13 +5971,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) POST_OPS_BIAS_4x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -6238,13 +6238,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) POST_OPS_DOWNSCALE_4x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -6336,40 +6336,40 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 ); } } @@ -6413,9 +6413,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = 
_mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -6450,9 +6450,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -6551,13 +6551,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) POST_OPS_BIAS_3x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -6764,13 +6764,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) POST_OPS_DOWNSCALE_3x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -6844,31 +6844,31 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 ); } } @@ -6908,9 +6908,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -6936,9 +6936,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -7013,13 +7013,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) POST_OPS_BIAS_2x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -7172,13 +7172,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) POST_OPS_DOWNSCALE_2x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* 
)post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -7234,22 +7234,22 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 ); } } @@ -7285,9 +7285,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -7304,9 +7304,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -7357,13 +7357,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) POST_OPS_BIAS_1x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -7462,13 +7462,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) POST_OPS_DOWNSCALE_1x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* 
)post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -7506,13 +7506,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 ); } } #endif diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c index 359e29f2fe..f59c82721c 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c @@ -123,7 +123,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12xlt16) // in vnni instructions and each load to ZMM register will have 4 // elements along k direction and 16 elements across n directions, // so 4x16 elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -216,7 +216,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1041,7 +1041,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1134,7 +1134,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1401,7 +1401,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) POST_OPS_BIAS_12x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -1662,7 +1662,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) POST_OPS_DOWNSCALE_12x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -1754,40 +1754,40 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[6,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 6 ) ) + ( 0*16 ), c_int32_6p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 6 ) ) + ( 0*16 ), c_int32_6p0 ); // c[7,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 7 ) ) + ( 0*16 ), c_int32_7p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 7 ) ) + ( 0*16 ), c_int32_7p0 ); // c[8,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 8 ) ) + ( 0*16 ), c_int32_8p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 8 ) ) + ( 0*16 ), c_int32_8p0 ); // c[9,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 9 ) ) + ( 0*16 ), c_int32_9p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 9 ) ) + ( 0*16 ), c_int32_9p0 ); // c[10,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 10 ) ) + ( 0*16 ), c_int32_10p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 10 ) ) + ( 0*16 ), c_int32_10p0 ); // c[11,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 11 ) ) + ( 0*16 ), c_int32_11p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 11 ) ) + ( 0*16 ), c_int32_11p0 ); } a = a + 
( MR * ps_a ); @@ -1884,8 +1884,8 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1966,8 +1966,8 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2192,10 +2192,10 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) POST_OPS_BIAS_9x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -2564,10 +2564,10 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) POST_OPS_DOWNSCALE_9x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -2695,58 +2695,58 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); // c[6,0-15] 
- _mm512_storeu_epi32( c + ( rs_c * ( ir + 6 ) ) + ( 0*16 ), c_int32_6p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 6 ) ) + ( 0*16 ), c_int32_6p0 ); // c[6, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 6 ) ) + ( 1*16 ), c_int32_6p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 6 ) ) + ( 1*16 ), c_int32_6p1 ); // c[7,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 7 ) ) + ( 0*16 ), c_int32_7p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 7 ) ) + ( 0*16 ), c_int32_7p0 ); // c[7,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 7 ) ) + ( 1*16 ), c_int32_7p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 7 ) ) + ( 1*16 ), c_int32_7p1 ); // c[8,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 8 ) ) + ( 0*16 ), c_int32_8p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 8 ) ) + ( 0*16 ), c_int32_8p0 ); // c[8,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 8 ) ) + ( 1*16 ), c_int32_8p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 8 ) ) + ( 1*16 ), c_int32_8p1 ); } a = a + ( MR * ps_a ); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c index 3e9514c271..d5f86338a6 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c @@ -96,7 +96,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6xlt16) // in vnni instructions and each load to ZMM register will have 4 // elements along k direction and 16 elements across n directions, // so 4x16 elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -146,7 +146,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6xlt16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -674,7 +674,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -724,7 +724,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -871,7 +871,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) POST_OPS_BIAS_6x16: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ); // c[0,0-15] @@ -1024,7 +1024,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) POST_OPS_DOWNSCALE_6x16: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] @@ -1080,22 +1080,22 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) { // Store the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); } a = a + ( MR * ps_a ); @@ -1237,8 +1237,8 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM 
register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -1295,8 +1295,8 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+4]. a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -1455,10 +1455,10 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) POST_OPS_BIAS_6x32: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0,0-15] @@ -1719,10 +1719,10 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) POST_OPS_DOWNSCALE_6x32: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] @@ -1814,40 +1814,40 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) { // Store 
the results. // c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); 
} a = a + ( MR * ps_a ); @@ -1988,9 +1988,9 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) // instructions and each load to ZMM register will have 4 elements // along k direction and 16 elements across n directions, so 4x16 // elements to a ZMM register. - b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2052,9 +2052,9 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) __m128i a_kfringe_buf; __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); - b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); - b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); - b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); + b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); + b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); + b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. 
a_kfringe_buf = _mm_maskz_loadu_epi8 @@ -2225,13 +2225,13 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) POST_OPS_BIAS_6x48: { selector1 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 + + _mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0,0-15] @@ -2601,13 +2601,13 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) POST_OPS_DOWNSCALE_6x48: { selector1 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); selector2 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); a_int32_0 = - _mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor + + _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] @@ -2735,58 +2735,58 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) { // Store the results. 
// c[0,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 ); // c[0, 16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 ); // c[0,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 ); // c[1,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 ); // c[1,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 ); // c[1,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 ); // c[2,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 ); // c[2,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 ); // c[2,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 ); // c[3,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 ); // c[3,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 ); // c[3,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 ); // 
c[4,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 ); // c[4,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 ); // c[4,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 ); // c[5,0-15] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 ); // c[5,16-31] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 ); // c[5,32-47] - _mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); + _mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 ); } a = a + ( MR * ps_a ); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c index b1b424ec35..32cd7aef3d 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c @@ -132,12 +132,12 @@ void packa_k64_u8s8s32o32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 6 rows from A with 64 elements in each row. 
- a0 = _mm512_loadu_epi8( a + ( lda * ( ic + 0 ) ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * ( ic + 1 ) ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * ( ic + 2 ) ) + kr ); - d0 = _mm512_loadu_epi8( a + ( lda * ( ic + 3 ) ) + kr ); - e0 = _mm512_loadu_epi8( a + ( lda * ( ic + 4 ) ) + kr ); - f0 = _mm512_loadu_epi8( a + ( lda * ( ic + 5 ) ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * ( ic + 0 ) ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * ( ic + 1 ) ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * ( ic + 2 ) ) + kr ); + d0 = _mm512_loadu_si512( a + ( lda * ( ic + 3 ) ) + kr ); + e0 = _mm512_loadu_si512( a + ( lda * ( ic + 4 ) ) + kr ); + f0 = _mm512_loadu_si512( a + ( lda * ( ic + 5 ) ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -170,12 +170,17 @@ void packa_k64_u8s8s32o32 d0 = _mm512_permutex2var_epi64( a0, selector5, e01 ); // 2nd 64 a0 = _mm512_permutex2var_epi64( a0, selector6, e0 ); // 2nd 32 - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si256( a0 ); - _mm256_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ), last_piece ); + _mm256_mask_storeu_epi64 + ( + pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ), + 0xFF, + last_piece + ); // Second half b0 = _mm512_permutex2var_epi64( c01, selector7, e01 ); // 3rd 64 @@ -183,12 +188,17 @@ void packa_k64_u8s8s32o32 d0 = 
_mm512_permutex2var_epi64( c0, selector9, e01 ); // 4th 64 c0 = _mm512_permutex2var_epi64( c0, selector10, e0 ); // 4th 32 - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si256( c0 ); - _mm256_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ), last_piece ); + _mm256_mask_storeu_epi64 + ( + pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ), + 0xFF, + last_piece + ); } //TODO: Handle kc < 64 case, 48,32,16 } @@ -280,11 +290,11 @@ void packa_m5_k64_u8s8s32o32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 5 rows from A with 64 elements in each row. 
- a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr ); - d0 = _mm512_loadu_epi8( a + ( lda * 3 ) + kr ); - e0 = _mm512_loadu_epi8( a + ( lda * 4 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr ); + d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr ); + e0 = _mm512_loadu_si512( a + ( lda * 4 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -314,12 +324,17 @@ void packa_m5_k64_u8s8s32o32 d0 = _mm512_permutex2var_epi32( a0, selector5, e0 ); a0 = _mm512_permutex2var_epi32( a0, selector6, e0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si128( a0 ); - _mm_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ), last_piece ); + _mm_mask_storeu_epi64 + ( + pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ), + 0xFF, + last_piece + ); // Second half b0 = _mm512_permutex2var_epi32( c01, selector7, e0 ); @@ -327,12 +342,17 @@ void packa_m5_k64_u8s8s32o32 d0 = _mm512_permutex2var_epi32( c0, selector9, e0 ); c0 = _mm512_permutex2var_epi32( c0, selector10, e0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 ); + _mm512_storeu_si512( 
pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 ); // Last piece last_piece = _mm512_castsi512_si128( c0 ); - _mm_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ), last_piece ); + _mm_mask_storeu_epi64 + ( + pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ), + 0xFF, + last_piece + ); } } @@ -362,10 +382,10 @@ void packa_m4_k64_u8s8s32o32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 4 rows from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr ); - d0 = _mm512_loadu_epi8( a + ( lda * 3 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr ); + d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -389,10 +409,10 @@ void packa_m4_k64_u8s8s32o32 a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // a[1] c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // a[3] - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 ); } } @@ -427,9 +447,9 @@ void 
packa_m3_k64_u8s8s32o32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 3 rows from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); - c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); + c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -440,16 +460,21 @@ void packa_m3_k64_u8s8s32o32 a0 = _mm512_permutex2var_epi32( b0, selector3, c0 ); b0 = _mm512_permutex2var_epi32( b0, selector4, c0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 0 ) ), a0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 64 ) ) , b0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 0 ) ), a0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 64 ) ) , b0 ); a0 = _mm512_permutex2var_epi32( a01, selector5, c0 ); b0 = _mm512_permutex2var_epi32( a01, selector6, c0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 96 ) ), a0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 96 ) ), a0 ); // Last piece last_piece = _mm512_castsi512_si256( b0 ); - _mm256_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 160 ) ), last_piece ); + _mm256_mask_storeu_epi64 + ( + pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 160 ) ), + 0xFF, + last_piece + ); } } @@ -474,8 +499,8 @@ void packa_m2_k64_u8s8s32o32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 2 rows from A with 64 elements in each row. 
- a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); - b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); + b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr ); a01 = _mm512_unpacklo_epi32( a0, b0 ); a0 = _mm512_unpackhi_epi32( a0, b0 ); @@ -483,8 +508,8 @@ void packa_m2_k64_u8s8s32o32 b0 = _mm512_permutex2var_epi64( a01, selector1, a0 ); // a[0] a01 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); // a[1] - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 0 ) ), b0 ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 64 ) ) , a01 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 0 ) ), b0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 64 ) ) , a01 ); } } @@ -501,9 +526,9 @@ void packa_m1_k64_u8s8s32o32 for ( dim_t kr = 0; kr < KC; kr += NR ) { // Rearrange for vpdpbusd, read 1 row from A with 64 elements in each row. - a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr ); + a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr ); - _mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 1 ) + ( 0 ) ), a0 ); + _mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 1 ) + ( 0 ) ), a0 ); } } #endif diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c index 60fae67e36..539386f5d0 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c @@ -120,10 +120,10 @@ void packb_nr64_u8s8s32o32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with 64 elements in each row. 
- a0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 0 ) ) + jc ); - b0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 1 ) ) + jc ); - c0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 2 ) ) + jc ); - d0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 3 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc ); + c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 2 ) ) + jc ); + d0 = _mm512_loadu_si512( b + ( ldb * ( kr + 3 ) ) + jc ); a01 = _mm512_unpacklo_epi8( a0, b0 ); a0 = _mm512_unpackhi_epi8( a0, b0 ); @@ -147,32 +147,32 @@ void packb_nr64_u8s8s32o32 a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1] c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3] - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 ); } // Handle k remainder. 
if ( k_partial_pieces > 0 ) { if ( k_partial_pieces == 3 ) { - a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); - b0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); - c0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); + c0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 2 ) ) + jc ); d0 = _mm512_setzero_si512(); } else if( k_partial_pieces == 2 ) { - a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); - b0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc ); c0 = _mm512_setzero_si512(); d0 = _mm512_setzero_si512(); } else //k_partial_pieces == 1 { - a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); + a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc ); b0 = _mm512_setzero_si512(); c0 = _mm512_setzero_si512(); d0 = _mm512_setzero_si512(); @@ -200,10 +200,10 @@ void packb_nr64_u8s8s32o32 a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1] c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3] - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) ) , a0 ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * 
NR ) ) , a0 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 ); } } @@ -298,10 +298,10 @@ void packb_nr48_u8s8s32o32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row. - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 2 ) ) ); - d0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 3 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) ); + d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) ); a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 ); a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 ); @@ -326,14 +326,14 @@ void packb_nr48_u8s8s32o32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); // Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row. 
- a0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 0 ) ) + ( 32 ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 1 ) ) + ( 32 ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 2 ) ) + ( 32 ) ); - d0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 3 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + ( 32 ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + ( 32 ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) + ( 32 ) ); + d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) + ( 32 ) ); a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); @@ -352,7 +352,7 @@ void packb_nr48_u8s8s32o32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); // The 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. 
@@ -363,37 +363,37 @@ void packb_nr48_u8s8s32o32 { if ( k_partial_pieces == 3 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) ); d0_32 = _mm256_setzero_si256(); - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) ); d0_16 = _mm_setzero_si128(); } else if( k_partial_pieces == 2 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) ); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); } else //k_partial_pieces == 1 { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0_32 = 
_mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); b0_32 = _mm256_setzero_si256(); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) ); b0_16 = _mm_setzero_si128(); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -422,8 +422,8 @@ void packb_nr48_u8s8s32o32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); @@ -442,7 +442,7 @@ void packb_nr48_u8s8s32o32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm ); } } @@ -472,10 +472,10 @@ void packb_nr32_u8s8s32o32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row. 
- a0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 2 ) ) ); - d0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 3 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) ); + d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) ); a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 ); a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 ); @@ -500,8 +500,8 @@ void packb_nr32_u8s8s32o32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); // The 3rd and 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. 
@@ -512,22 +512,22 @@ void packb_nr32_u8s8s32o32 { if ( k_partial_pieces == 3 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); - c0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); + c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) ); d0_32 = _mm256_setzero_si256(); } else if( k_partial_pieces == 2 ) { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); } else //k_partial_pieces == 1 { - a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); b0_32 = _mm256_setzero_si256(); c0_32 = _mm256_setzero_si256(); d0_32 = _mm256_setzero_si256(); @@ -556,8 +556,8 @@ void packb_nr32_u8s8s32o32 b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // First 4x32 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm ); } } @@ -586,10 +586,10 @@ void packb_nr16_u8s8s32o32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 ) { // Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row. 
- a0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 0 ) ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 1 ) ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 2 ) ) ); - d0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 3 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) ); + d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) ); a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); @@ -608,7 +608,7 @@ void packb_nr16_u8s8s32o32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); // The 2nd, 3rd, and 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. 
@@ -619,22 +619,22 @@ void packb_nr16_u8s8s32o32 { if ( k_partial_pieces == 3 ) { - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); - c0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) ); d0_16 = _mm_setzero_si128(); } else if( k_partial_pieces == 2 ) { - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); - b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) ); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); } else //k_partial_pieces == 1 { - a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); b0_16 = _mm_setzero_si128(); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -657,7 +657,7 @@ void packb_nr16_u8s8s32o32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); } } @@ -697,10 +697,10 @@ void packb_nrlt16_u8s8s32o32 memcpy( buf3, ( b + ( ldb * ( kr + 3 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); // Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row. 
- a0_16 = _mm_loadu_epi8( buf0 ); - b0_16 = _mm_loadu_epi8( buf1 ); - c0_16 = _mm_loadu_epi8( buf2 ); - d0_16 = _mm_loadu_epi8( buf3 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 ); + d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf3 ); a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); @@ -719,7 +719,7 @@ void packb_nrlt16_u8s8s32o32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); // The 2nd, 3rd, and 4th 16byte chunk will be ignored, since its not part of the original data, // but is here due to the packing in 4 16byte chunks format. @@ -734,9 +734,9 @@ void packb_nrlt16_u8s8s32o32 memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); memcpy( buf2, ( b + ( ldb * ( k_full_pieces + 2 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); - a0_16 = _mm_loadu_epi8( buf0 ); - b0_16 = _mm_loadu_epi8( buf1 ); - c0_16 = _mm_loadu_epi8( buf2 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 ); + c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 ); d0_16 = _mm_setzero_si128(); } @@ -745,8 +745,8 @@ void packb_nrlt16_u8s8s32o32 memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); - a0_16 = _mm_loadu_epi8( buf0 ); - b0_16 = _mm_loadu_epi8( buf1 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); + b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 ); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); } @@ -754,7 +754,7 @@ void packb_nrlt16_u8s8s32o32 { memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) ); - 
a0_16 = _mm_loadu_epi8( buf0 ); + a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 ); b0_16 = _mm_setzero_si128(); c0_16 = _mm_setzero_si128(); d0_16 = _mm_setzero_si128(); @@ -777,7 +777,7 @@ void packb_nrlt16_u8s8s32o32 a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 ); // Last 4x16 elements. - _mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); + _mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm ); } } #endif diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h index f5f40d6ce0..deb35e8e09 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h @@ -43,7 +43,7 @@ reg = _mm512_add_epi32( scratch1, reg ); \ #define S32_S32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ - scratch1 = _mm512_loadu_epi32( c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \ + scratch1 = _mm512_loadu_si512( c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \ S32_BETA_FMA(reg,scratch1,scratch2) \ #define S32_S32_BETA_OP2(m_ir,m_ind,scratch1,scratch2) \ @@ -66,8 +66,9 @@ scratch1 = \ _mm512_cvtepi8_epi32 \ ( \ - _mm_loadu_epi8 \ + _mm_maskz_loadu_epi8 \ ( \ + 0xFFFF, \ ( int8_t* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ) \ @@ -92,7 +93,7 @@ // Default n < 16 beta macro #define S32_S32_BETA_OP_NLT16F(reg,buf_,scratch1,scratch2) \ - scratch1 = _mm512_loadu_epi32( buf_ ); \ + scratch1 = _mm512_loadu_si512( buf_ ); \ S32_BETA_FMA(reg,scratch1,scratch2) \ // Default n < 16 mask load beta macro @@ -100,11 +101,6 @@ scratch1 = _mm512_maskz_loadu_epi32( lmask, c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \ S32_BETA_FMA(reg,scratch1,scratch2) \ -// Downscale n < 16 beta macro -#define S8_S32_BETA_OP_NLT16F(reg,buf_,scratch1,scratch2) \ - scratch1 = _mm512_cvtepi8_epi32( _mm_loadu_epi8( ( 
int8_t* )buf_ ) ); \ - S32_BETA_FMA(reg,scratch1,scratch2) \ - // Downscale n < 16 mask load beta macro #define S8_S32_BETA_OP_NLT16F_MASK(lmask,reg,m_ind,n_ind,scratch1,scratch2) \ scratch1 = _mm512_cvtepi8_epi32 \ @@ -187,40 +183,40 @@ // Load helper macros. #define S32_GELU_LOAD1R_1C(temp_buf,offset,stride,reg_base) \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ #define S32_GELU_LOAD1R_2C(temp_buf,offset,stride,reg_base) \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \ #define S32_GELU_LOAD1R_3C(temp_buf,offset,stride,reg_base) \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ), reg_base ## p2); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ), reg_base ## p2); \ #define S32_GELU_LOAD1R_4C(temp_buf,offset,stride,reg_base) \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ), reg_base ## p2); \ - _mm512_storeu_epi32( ( temp_buf ) + ( ( 3 + offset ) * ( stride ) ), reg_base ## p3); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 0 + 
offset ) * ( stride ) ), reg_base ## p0); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ), reg_base ## p2); \ + _mm512_storeu_si512( ( temp_buf ) + ( ( 3 + offset ) * ( stride ) ), reg_base ## p3); \ // Store helper macros. #define S32_GELU_STORE1R_1C(temp_buf,offset,stride,reg_base) \ - reg_base ## p0 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ + reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ #define S32_GELU_STORE1R_2C(temp_buf,offset,stride,reg_base) \ - reg_base ## p0 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ - reg_base ## p1 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \ + reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ + reg_base ## p1 = _mm512_loadu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \ #define S32_GELU_STORE1R_3C(temp_buf,offset,stride,reg_base) \ - reg_base ## p0 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ - reg_base ## p1 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \ - reg_base ## p2 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ) ); \ + reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ + reg_base ## p1 = _mm512_loadu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \ + reg_base ## p2 = _mm512_loadu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ) ); \ #define S32_GELU_STORE1R_4C(temp_buf,offset,stride,reg_base) \ - reg_base ## p0 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ - reg_base ## p1 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \ - reg_base ## p2 = _mm512_loadu_epi32( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ) ); \ - reg_base ## p3 = _mm512_loadu_epi32( ( 
temp_buf ) + ( ( 3 + offset ) * ( stride ) ) ); \ + reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \ + reg_base ## p1 = _mm512_loadu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \ + reg_base ## p2 = _mm512_loadu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ) ); \ + reg_base ## p3 = _mm512_loadu_si512( ( temp_buf ) + ( ( 3 + offset ) * ( stride ) ) ); \ #endif // LPGEMM_S32_KERN_MACROS_H From 30b931ae609d0dd7c5e7a8c14b12256d0552b4b2 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Fri, 12 May 2023 00:18:57 -0500 Subject: [PATCH 088/226] Fixed compilation error due to inconsistent compiler behavior towards AVX512 zero masking instruction syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Since the code used whitespace variant of AVX512 mask instruction. But some compilers accept whitespace variant and some don't - to be safe, we removed whitespace. - Whitespace variant of masked instruction "vmovupd    (%rax,%r8,1),%zmm8{%k2} {z}" is replaced with this instruction "vmovupd    (%rax,%r8,1),%zmm8{%k2}{z}" to resolve the compilation failure issue. - Thanks to Shubham Sharma for identifying issue. 
AMD-Internal: [CPUPL-1963] Change-Id: I290589132e8cce25cab0d1e4c195a7dd0a014937 --- kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c | 48 ++-- kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c | 16 +- .../3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c | 16 +- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c | 228 +++++++-------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c | 234 ++++++++-------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c | 240 ++++++++-------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c | 246 ++++++++-------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c | 252 ++++++++--------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c | 258 ++++++++--------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c | 264 +++++++++--------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c | 198 ++++++------- 11 files changed, 1000 insertions(+), 1000 deletions(-) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c index 34ccb63d54..e355da8bff 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c @@ -294,14 +294,14 @@ void bli_dpackm_zen4_asm_24xk label(.DKLEFTROWU) // EDGE LOOP (k_left) - vmovupd(mem(rax, 0), zmm6 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_(k(2)) MASK_(z)) + vmovupd(mem(rax, 0), zmm6 MASK_KZ(2)) + vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_KZ(2)) + vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_KZ(2)) + vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2)) + vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_KZ(2)) + vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2)) + vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2)) + vmovupd(mem(rax, rdx, 1, 0), zmm20 
MASK_KZ(2)) UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3) SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31) @@ -386,14 +386,14 @@ void bli_dpackm_zen4_asm_24xk LABEL(.UPDATEDONE) - vmovupd(mem(rax, 0), zmm6 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_(k(2)) MASK_(z)) + vmovupd(mem(rax, 0), zmm6 MASK_KZ(2)) + vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_KZ(2)) + vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_KZ(2)) + vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2)) + vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_KZ(2)) + vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2)) + vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2)) + vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2)) UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3) SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31) @@ -479,14 +479,14 @@ void bli_dpackm_zen4_asm_24xk LABEL(.UPDATEDONEL2) - vmovupd(mem(rax, 0), zmm6 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_(k(2)) MASK_(z)) + vmovupd(mem(rax, 0), zmm6 MASK_KZ(2)) + vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_KZ(2)) + vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_KZ(2)) + vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2)) + vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_KZ(2)) + vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2)) + vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2)) + vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2)) 
UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3) SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c index e486681bc0..5ab39dae5f 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c @@ -197,14 +197,14 @@ void bli_dpackm_zen4_asm_8xk label(.DKLEFTROWU) // EDGE LOOP (k_left) - vmovupd(mem(rax, 0), zmm6 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_(k(2)) MASK_(z)) - vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_(k(2)) MASK_(z)) + vmovupd(mem(rax, 0), zmm6 MASK_KZ(2)) + vmovupd(mem(rax, r8, 1, 0), zmm8 MASK_KZ(2)) + vmovupd(mem(rax, r8, 2, 0), zmm10 MASK_KZ(2)) + vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2)) + vmovupd(mem(rax, r8, 4, 0), zmm14 MASK_KZ(2)) + vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2)) + vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2)) + vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2)) UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3) SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31) diff --git a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c index 67df513fe9..c7b618fb7e 100644 --- a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c +++ b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c @@ -130,28 +130,28 @@ */ #define UPDATE_MASKED_C \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(2)) MASK_(z) ) \ + 
vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(2)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(2) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(2))) /*Stores back to C*/\ diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c index 47ba859263..4f82f8895b 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c @@ -182,28 +182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( 
mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - 
vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ 
- vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -487,11 +487,11 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) 
@@ -504,7 +504,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -516,7 +516,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -528,7 +528,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -540,7 +540,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -552,7 +552,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -564,7 +564,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -593,11 +593,11 @@ void bli_dgemmsup_rv_zen4_asm_24x1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -610,7 +610,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -622,7 +622,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -633,7 +633,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += 
rs_b @@ -644,7 +644,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -655,7 +655,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -666,7 +666,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -693,11 +693,11 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -709,7 +709,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 2 
vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -720,7 +720,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -731,7 +731,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -742,7 +742,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -753,7 +753,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -764,7 +764,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // 
---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -791,7 +791,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -833,7 +833,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask @@ -1180,10 +1180,10 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1194,7 +1194,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 
MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1204,7 +1204,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1214,7 +1214,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1224,7 +1224,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1234,7 +1234,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1244,7 +1244,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a 
vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1270,10 +1270,10 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1284,7 +1284,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1293,7 +1293,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1302,7 +1302,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1311,7 +1311,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- 
iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1320,7 +1320,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1329,7 +1329,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1353,10 +1353,10 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1366,7 +1366,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a 
vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1375,7 +1375,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1384,7 +1384,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1393,7 +1393,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1402,7 +1402,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1411,7 +1411,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1435,7 +1435,7 @@ 
void bli_dgemmsup_rv_zen4_asm_16x1 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1472,7 +1472,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask @@ -1792,9 +1792,9 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1803,7 +1803,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1811,7 +1811,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( 
mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1819,7 +1819,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1827,7 +1827,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1835,7 +1835,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1843,7 +1843,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1866,9 +1866,9 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load 
A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1876,42 +1876,42 @@ void bli_dgemmsup_rv_zen4_asm_8x1 vfmadd231pd( zmm0,zmm30,zmm6 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm3,zmm30,zmm6 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm0,zmm30,zmm6 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm3,zmm30,zmm6 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm0,zmm30,zmm6 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm3,zmm30,zmm6 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 
MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1932,9 +1932,9 @@ void bli_dgemmsup_rv_zen4_asm_8x1 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1942,42 +1942,42 @@ void bli_dgemmsup_rv_zen4_asm_8x1 vfmadd231pd( zmm0,zmm30,zmm6 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm3,zmm30,zmm6 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm0,zmm30,zmm6 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm3,zmm30,zmm6 ) // ---------------------------------- iteration 5 - vmovupd( 
mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm0,zmm30,zmm6 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b vfmadd231pd( zmm3,zmm30,zmm6 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -1998,7 +1998,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b @@ -2030,7 +2030,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c index bb12cff698..bdada9417a 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c @@ -182,28 
+182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + 
vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ 
+ vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( 
mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -491,11 +491,11 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -512,7 +512,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -529,7 +529,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -545,7 +545,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -561,7 +561,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 
0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -577,7 +577,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -593,7 +593,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -630,11 +630,11 @@ void bli_dgemmsup_rv_zen4_asm_24x2 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -651,7 +651,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a 
prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -668,7 +668,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -683,7 +683,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -698,7 +698,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -713,7 +713,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -728,7 +728,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -763,11 +763,11 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -783,7 +783,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -799,7 +799,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -814,7 +814,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) 
// a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -829,7 +829,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -844,7 +844,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -859,7 +859,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -894,7 +894,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -943,7 +943,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C 
using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -952,7 +952,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask @@ -1304,10 +1304,10 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1321,7 +1321,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1335,7 +1335,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1348,7 +1348,7 @@ void 
bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1361,7 +1361,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1374,7 +1374,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1387,7 +1387,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1419,10 +1419,10 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // 
Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1436,7 +1436,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1449,7 +1449,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1461,7 +1461,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1473,7 +1473,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1485,7 +1485,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) 
MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1497,7 +1497,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1527,10 +1527,10 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1543,7 +1543,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1556,7 +1556,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) @@ -1568,7 +1568,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1580,7 +1580,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1592,7 +1592,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1604,7 +1604,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1634,7 +1634,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1676,13 +1676,13 @@ void 
bli_dgemmsup_rv_zen4_asm_16x2 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask @@ -2005,9 +2005,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2018,7 +2018,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2029,7 +2029,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 
) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2039,7 +2039,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2049,7 +2049,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2059,7 +2059,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2069,7 +2069,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2096,9 +2096,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 
MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2108,7 +2108,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm0,zmm31,zmm8 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2118,7 +2118,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm3,zmm31,zmm8 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2127,7 +2127,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm0,zmm31,zmm8 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2136,7 +2136,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm3,zmm31,zmm8 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2145,7 +2145,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm0,zmm31,zmm8 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask 
and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2154,7 +2154,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm3,zmm31,zmm8 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2179,9 +2179,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2191,7 +2191,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm0,zmm31,zmm8 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2201,7 +2201,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm3,zmm31,zmm8 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) @@ -2210,7 +2210,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm0,zmm31,zmm8 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2219,7 +2219,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm3,zmm31,zmm8 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2228,7 +2228,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm0,zmm31,zmm8 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2237,7 +2237,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vfmadd231pd( zmm3,zmm31,zmm8 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2262,7 +2262,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a 
vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2297,10 +2297,10 @@ void bli_dgemmsup_rv_zen4_asm_8x2 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c index d84e41597d..fa3b4cf042 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c @@ -182,28 +182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( 
mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + 
vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ 
-343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -495,11 +495,11 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -520,7 +520,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -541,7 +541,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -562,7 +562,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -582,7 +582,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -602,7 +602,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -622,7 +622,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) @@ -667,11 +667,11 @@ void bli_dgemmsup_rv_zen4_asm_24x3 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -692,7 +692,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -713,7 +713,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -733,7 +733,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -752,7 +752,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // 
---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -771,7 +771,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -790,7 +790,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -833,11 +833,11 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -857,7 +857,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A 
vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -877,7 +877,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -897,7 +897,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -916,7 +916,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -935,7 +935,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -954,7 +954,7 @@ void 
bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -997,7 +997,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1053,7 +1053,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -1062,7 +1062,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) @@ -1071,7 +1071,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vmovupd( 0x40(rcx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2)) - vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( 
zmm2,zmm31,zmm26) vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask @@ -1422,10 +1422,10 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1442,7 +1442,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1459,7 +1459,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1476,7 +1476,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1492,7 +1492,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 5 
vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1508,7 +1508,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1524,7 +1524,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1562,10 +1562,10 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1582,7 +1582,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1598,7 +1598,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1614,7 +1614,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1629,7 +1629,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1644,7 +1644,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1659,7 +1659,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 
MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1695,10 +1695,10 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1714,7 +1714,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1730,7 +1730,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1746,7 +1746,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1761,7 +1761,7 @@ void 
bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1776,7 +1776,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1791,7 +1791,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1827,7 +1827,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1874,19 +1874,19 @@ void bli_dgemmsup_rv_zen4_asm_16x3 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - 
vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2)) - vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask @@ -2208,9 +2208,9 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2223,7 +2223,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2236,7 +2236,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2249,7 +2249,7 @@ void 
bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2261,7 +2261,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2273,7 +2273,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2285,7 +2285,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2316,9 +2316,9 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( 
r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2330,7 +2330,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm0,zmm30,zmm10 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2342,7 +2342,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm3,zmm30,zmm10 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2354,7 +2354,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm0,zmm30,zmm10 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2365,7 +2365,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm3,zmm30,zmm10 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2376,7 +2376,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm0,zmm30,zmm10 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) 
// load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2387,7 +2387,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm3,zmm30,zmm10 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2416,9 +2416,9 @@ void bli_dgemmsup_rv_zen4_asm_8x3 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2430,7 +2430,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm0,zmm30,zmm10 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2442,7 +2442,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm3,zmm30,zmm10 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2454,7 +2454,7 @@ void 
bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm0,zmm30,zmm10 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2465,7 +2465,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm3,zmm30,zmm10 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2476,7 +2476,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm0,zmm30,zmm10 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2487,7 +2487,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vfmadd231pd( zmm3,zmm30,zmm10 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2516,7 +2516,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) @@ -2554,13 +2554,13 @@ void bli_dgemmsup_rv_zen4_asm_8x3 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c index e77e380548..52dbf10912 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c @@ -182,28 +182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( 
mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - 
vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ 
- vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -498,11 +498,11 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) 
@@ -527,7 +527,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -552,7 +552,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -577,7 +577,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -602,7 +602,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -626,7 +626,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -650,7 +650,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 
MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -703,11 +703,11 @@ void bli_dgemmsup_rv_zen4_asm_24x4 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -732,7 +732,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -757,7 +757,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -781,7 +781,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load 
A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -805,7 +805,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -828,7 +828,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -851,7 +851,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -902,11 +902,11 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) 
// prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -930,7 +930,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -954,7 +954,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -978,7 +978,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1002,7 +1002,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1025,7 +1025,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1048,7 +1048,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1099,7 +1099,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1163,7 +1163,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -1172,7 +1172,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) @@ -1181,7 +1181,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( 0x40(rcx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2)) - 
vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm26) vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) @@ -1190,7 +1190,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 vmovupd( 0x40(rcx,r13,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1)) - vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm27) vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask @@ -1541,10 +1541,10 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1564,7 +1564,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1584,7 +1584,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load 
A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1604,7 +1604,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1624,7 +1624,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1643,7 +1643,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1662,7 +1662,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1706,10 +1706,10 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 
MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1729,7 +1729,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1748,7 +1748,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1767,7 +1767,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1786,7 +1786,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ 
-1804,7 +1804,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1822,7 +1822,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1864,10 +1864,10 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1886,7 +1886,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1905,7 +1905,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A 
with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1924,7 +1924,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1943,7 +1943,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1961,7 +1961,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1979,7 +1979,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2021,7 +2021,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2074,25 +2074,25 @@ void bli_dgemmsup_rv_zen4_asm_16x4 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2)) - vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1)) - vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask @@ -2418,9 +2418,9 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero 
hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2435,7 +2435,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2450,7 +2450,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2465,7 +2465,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2480,7 +2480,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2494,7 +2494,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // 
a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2508,7 +2508,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2543,9 +2543,9 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2559,7 +2559,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm0,zmm31,zmm12 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2573,7 +2573,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm3,zmm31,zmm12 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2587,7 +2587,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm0,zmm31,zmm12 ) // 
---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2601,7 +2601,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm3,zmm31,zmm12 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2614,7 +2614,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm0,zmm31,zmm12 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2627,7 +2627,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm3,zmm31,zmm12 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2660,9 +2660,9 @@ void bli_dgemmsup_rv_zen4_asm_8x4 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // 
load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2676,7 +2676,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm0,zmm31,zmm12 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2690,7 +2690,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm3,zmm31,zmm12 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2704,7 +2704,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm0,zmm31,zmm12 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2718,7 +2718,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm3,zmm31,zmm12 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2731,7 +2731,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm0,zmm31,zmm12 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask 
and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2744,7 +2744,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vfmadd231pd( zmm3,zmm31,zmm12 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2777,7 +2777,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2819,16 +2819,16 @@ void bli_dgemmsup_rv_zen4_asm_8x4 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 
mem(rcx,r13,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c index eba43abe68..05cfa12441 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c @@ -182,28 +182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) 
\ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 
MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores 
back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -507,11 +507,11 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -541,7 +541,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -571,7 +571,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( 
mem(rbx),zmm30 ) @@ -601,7 +601,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -631,7 +631,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -661,7 +661,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -690,7 +690,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -754,11 +754,11 @@ void bli_dgemmsup_rv_zen4_asm_24x5 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -788,7 +788,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -818,7 +818,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -847,7 +847,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -876,7 +876,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -905,7 +905,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - 
vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -933,7 +933,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -995,11 +995,11 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1028,7 +1028,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1057,7 +1057,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and 
zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1086,7 +1086,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1115,7 +1115,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1144,7 +1144,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1172,7 +1172,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1234,7 +1234,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A 
vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1307,7 +1307,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -1316,7 +1316,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) @@ -1325,7 +1325,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( 0x40(rcx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2)) - vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm26) vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) @@ -1334,7 +1334,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( 0x40(rcx,r13,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1)) - vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm27) vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with 
mask vmovupd( mem(rdx),zmm0) @@ -1343,7 +1343,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 vmovupd( 0x40(rdx),zmm1) vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx)) - vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm24) vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask @@ -1719,10 +1719,10 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1746,7 +1746,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1770,7 +1770,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1794,7 +1794,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 
MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1818,7 +1818,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1842,7 +1842,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1865,7 +1865,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1918,10 +1918,10 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a 
prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1945,7 +1945,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1968,7 +1968,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1991,7 +1991,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2014,7 +2014,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2037,7 +2037,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint 
add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2059,7 +2059,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2110,10 +2110,10 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2136,7 +2136,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2159,7 +2159,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2182,7 +2182,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- 
iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2205,7 +2205,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2228,7 +2228,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2250,7 +2250,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2301,7 +2301,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2361,31 +2361,31 @@ void bli_dgemmsup_rv_zen4_asm_16x5 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - 
vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2)) - vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1)) - vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx),zmm0) vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx)) - vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask @@ -2728,9 +2728,9 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load 
A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2748,7 +2748,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2766,7 +2766,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2784,7 +2784,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2802,7 +2802,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2820,7 +2820,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask 
and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2837,7 +2837,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2879,9 +2879,9 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2898,7 +2898,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm0,zmm30,zmm14 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2915,7 +2915,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm3,zmm30,zmm14 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2932,7 +2932,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( 
zmm0,zmm30,zmm14 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2949,7 +2949,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm3,zmm30,zmm14 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2966,7 +2966,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm0,zmm30,zmm14 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2982,7 +2982,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm3,zmm30,zmm14 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3022,9 +3022,9 @@ void bli_dgemmsup_rv_zen4_asm_8x5 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( 
mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3041,7 +3041,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm0,zmm30,zmm14 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3058,7 +3058,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm3,zmm30,zmm14 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3075,7 +3075,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm0,zmm30,zmm14 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3092,7 +3092,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm3,zmm30,zmm14 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3109,7 +3109,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm0,zmm30,zmm14 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) 
) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3125,7 +3125,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vfmadd231pd( zmm3,zmm30,zmm14 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3165,7 +3165,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3212,19 +3212,19 @@ void bli_dgemmsup_rv_zen4_asm_8x5 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using 
mask and zero hint + vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c index 449b7b2329..42335cac7f 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c @@ -182,28 +182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) 
MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( 
mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + 
vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -510,11 +510,11 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -548,7 +548,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -582,7 +582,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm3 ) // 
load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -616,7 +616,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -650,7 +650,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -684,7 +684,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -718,7 +718,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -790,11 +790,11 @@ void bli_dgemmsup_rv_zen4_asm_24x6 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and 
zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -828,7 +828,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -862,7 +862,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -895,7 +895,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -928,7 +928,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint 
add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -961,7 +961,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -994,7 +994,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1064,11 +1064,11 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1101,7 +1101,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B 
vbroadcastsd( mem(rbx),zmm30 ) @@ -1134,7 +1134,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1167,7 +1167,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1200,7 +1200,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1233,7 +1233,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1266,7 +1266,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1336,7 +1336,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1416,7 +1416,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -1425,7 +1425,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) @@ -1434,7 +1434,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( 0x40(rcx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2)) - vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm26) vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) @@ -1443,7 +1443,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( 0x40(rcx,r13,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( 
zmm13,0x40(rcx,r13,1)) - vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm27) vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx),zmm0) @@ -1452,7 +1452,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( 0x40(rdx),zmm1) vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx)) - vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm24) vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,1),zmm3) @@ -1461,7 +1461,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 vmovupd( 0x40(rdx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm17) vmovupd( zmm17,0x40(rdx,rdi,1)) - vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm25) vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask @@ -1840,10 +1840,10 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1870,7 +1870,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + 
vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1897,7 +1897,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1924,7 +1924,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1951,7 +1951,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1978,7 +1978,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2005,7 +2005,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 
0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2064,10 +2064,10 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2094,7 +2094,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2120,7 +2120,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2146,7 +2146,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) 
) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2172,7 +2172,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2198,7 +2198,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2224,7 +2224,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2281,10 +2281,10 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2310,7 +2310,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - 
vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2336,7 +2336,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2362,7 +2362,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2388,7 +2388,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2414,7 +2414,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2440,7 +2440,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 
0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2497,7 +2497,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2562,37 +2562,37 @@ void bli_dgemmsup_rv_zen4_asm_16x6 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2)) - vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1)) - vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm13) 
vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx),zmm0) vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx)) - vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm16) vmovupd( zmm16,(rdx,rdi,1)) - vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm17) vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask @@ -2938,9 +2938,9 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2960,7 +2960,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2980,7 +2980,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( 
r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3000,7 +3000,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3020,7 +3020,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3040,7 +3040,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3060,7 +3060,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3106,9 +3106,9 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) 
) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3127,7 +3127,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm0,zmm31,zmm16 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3146,7 +3146,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm3,zmm31,zmm16 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3165,7 +3165,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm0,zmm31,zmm16 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3184,7 +3184,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm3,zmm31,zmm16 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3203,7 +3203,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm0,zmm31,zmm16 ) // ---------------------------------- 
iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3222,7 +3222,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm3,zmm31,zmm16 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3266,9 +3266,9 @@ void bli_dgemmsup_rv_zen4_asm_8x6 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3287,7 +3287,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm0,zmm31,zmm16 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3306,7 +3306,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm3,zmm31,zmm16 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask 
and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3325,7 +3325,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm0,zmm31,zmm16 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3344,7 +3344,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm3,zmm31,zmm16 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3363,7 +3363,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm0,zmm31,zmm16 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3382,7 +3382,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vfmadd231pd( zmm3,zmm31,zmm16 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3426,7 +3426,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + 
vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3476,22 +3476,22 @@ void bli_dgemmsup_rv_zen4_asm_8x6 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm16) vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c index 95b3ca452d..c3471d8d68 100644 --- 
a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c @@ -182,28 +182,28 @@ */ #define UPDATE_MASKED_C_8 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ - vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm8 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -223,25 +223,25 @@ */ #define UPDATE_MASKED_C_7 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, 
r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ - vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm3 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -259,22 +259,22 @@ */ #define UPDATE_MASKED_C_6 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ - vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm18,zmm5 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -291,19 +291,19 @@ */ #define UPDATE_MASKED_C_5 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( 
mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ - vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm14,zmm1 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -319,16 +319,16 @@ */ #define UPDATE_MASKED_C_4 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ - vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm16,zmm6 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -343,13 +343,13 @@ */ #define UPDATE_MASKED_C_3 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm10,zmm4 ) \ \ - vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm12,zmm2 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -363,10 +363,10 @@ */ #define UPDATE_MASKED_C_2 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ - vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \ vfmadd231pd( 
zmm31,zmm10,zmm4 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/\ @@ -379,7 +379,7 @@ */ #define UPDATE_MASKED_C_1 \ \ - vmovupd( mem(rcx), zmm30 MASK_(k(3)) MASK_(z) ) \ + vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \ vfmadd231pd( zmm31,zmm30,zmm0 ) \ \ vmovupd( zmm0, (rcx) MASK_(k(3))) /*Stores back to C*/ @@ -513,11 +513,11 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -555,7 +555,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -593,7 +593,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -631,7 +631,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 
0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -669,7 +669,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -707,7 +707,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -745,7 +745,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -826,11 +826,11 @@ void bli_dgemmsup_rv_zen4_asm_24x7 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -868,7 +868,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 
0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -906,7 +906,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -943,7 +943,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -980,7 +980,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1017,7 +1017,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1054,7 +1054,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // 
---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1133,11 +1133,11 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1174,7 +1174,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1211,7 +1211,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1248,7 +1248,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 4 vmovupd( 
mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1285,7 +1285,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1322,7 +1322,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1359,7 +1359,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1438,7 +1438,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1525,7 
+1525,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -1534,7 +1534,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) @@ -1543,7 +1543,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rcx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2)) - vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm26) vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) @@ -1552,7 +1552,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rcx,r13,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1)) - vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm27) vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx),zmm0) @@ -1561,7 +1561,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rdx),zmm1) vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx)) - vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx),zmm2 
MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm24) vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,1),zmm3) @@ -1570,7 +1570,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rdx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm17) vmovupd( zmm17,0x40(rdx,rdi,1)) - vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm25) vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,2),zmm0) @@ -1579,7 +1579,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 vmovupd( 0x40(rdx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm19) vmovupd( zmm19,0x40(rdx,rdi,2)) - vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm22) vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2))) // store to C with mask @@ -1959,10 +1959,10 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1992,7 +1992,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 
0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2022,7 +2022,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2052,7 +2052,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2082,7 +2082,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2112,7 +2112,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2142,7 +2142,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 
0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2208,10 +2208,10 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2241,7 +2241,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2270,7 +2270,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2299,7 +2299,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2328,7 +2328,7 @@ void 
bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2357,7 +2357,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2386,7 +2386,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2450,10 +2450,10 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2482,7 +2482,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero 
hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2511,7 +2511,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2540,7 +2540,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2569,7 +2569,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2598,7 +2598,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2627,7 +2627,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + 
vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2691,7 +2691,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2762,43 +2762,43 @@ void bli_dgemmsup_rv_zen4_asm_16x7 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2)) - vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1)) - vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask 
vmovupd( mem(rdx),zmm0) vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx)) - vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm16) vmovupd( zmm16,(rdx,rdi,1)) - vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm17) vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm18) vmovupd( zmm18,(rdx,rdi,2)) - vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm19) vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2))) // store to C with mask @@ -3143,9 +3143,9 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3167,7 +3167,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ 
-3189,7 +3189,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3211,7 +3211,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3233,7 +3233,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3255,7 +3255,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3277,7 +3277,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3328,9 +3328,9 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- 
iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3351,7 +3351,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm0,zmm30,zmm18 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3372,7 +3372,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm3,zmm30,zmm18 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3393,7 +3393,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm0,zmm30,zmm18 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3414,7 +3414,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm3,zmm30,zmm18 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + 
vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3435,7 +3435,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm0,zmm30,zmm18 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3456,7 +3456,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm3,zmm30,zmm18 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3505,9 +3505,9 @@ void bli_dgemmsup_rv_zen4_asm_8x7 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3528,7 +3528,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm0,zmm30,zmm18 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( 
mem(rbx),zmm30 ) @@ -3549,7 +3549,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm3,zmm30,zmm18 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3570,7 +3570,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm0,zmm30,zmm18 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3591,7 +3591,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm3,zmm30,zmm18 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3612,7 +3612,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm0,zmm30,zmm18 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3633,7 +3633,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vfmadd231pd( zmm3,zmm30,zmm18 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += 
cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3682,7 +3682,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3735,25 +3735,25 @@ void bli_dgemmsup_rv_zen4_asm_8x7 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx,rdi,1),zmm3 
MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm16) vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm18) vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2))) // store to C with mask diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c index 60f23206b0..32471c0d25 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c @@ -472,11 +472,11 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -518,7 +518,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -560,7 +560,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 
0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -602,7 +602,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -644,7 +644,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -686,7 +686,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -728,7 +728,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -818,11 +818,11 @@ void bli_dgemmsup_rv_zen4_asm_24x8 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) 
// Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -864,7 +864,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -906,7 +906,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 prefetchw0( mem(rdx, 128)) // prefetch C vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -947,7 +947,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -988,7 +988,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1029,7 +1029,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 6 vmovupd( 
mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1070,7 +1070,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1158,11 +1158,11 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1203,7 +1203,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1244,7 +1244,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - 
vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1285,7 +1285,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1326,7 +1326,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1367,7 +1367,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1408,7 +1408,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A vmovupd( 0x40(rax),zmm4 ) - vmovupd( 0x80(rax),zmm5 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm5 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -1496,7 +1496,7 @@ void 
bli_dgemmsup_rv_zen4_asm_24x8 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A vmovupd( 0x40(rax),zmm1 ) - vmovupd( 0x80(rax),zmm2 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x80(rax),zmm2 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -1590,7 +1590,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rcx),zmm1) vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx)) - vmovupd( 0x80(rcx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm28) vmovupd( zmm28,0x80(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) @@ -1599,7 +1599,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rcx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1)) - vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm29) vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) @@ -1608,7 +1608,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rcx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2)) - vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm26) vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) @@ -1617,7 +1617,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rcx,r13,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1)) - vmovupd( 0x80(rcx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2)) // Load C using mask and zero 
hint vfmadd231pd( zmm5,zmm31,zmm27) vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx),zmm0) @@ -1626,7 +1626,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rdx),zmm1) vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx)) - vmovupd( 0x80(rdx),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm24) vmovupd( zmm24,0x80(rdx) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,1),zmm3) @@ -1635,7 +1635,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rdx,rdi,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm17) vmovupd( zmm17,0x40(rdx,rdi,1)) - vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm25) vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,2),zmm0) @@ -1644,7 +1644,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rdx,rdi,2),zmm1) vfmadd231pd( zmm1,zmm31,zmm19) vmovupd( zmm19,0x40(rdx,rdi,2)) - vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm2,zmm31,zmm22) vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,r13,1),zmm3) @@ -1653,7 +1653,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 vmovupd( 0x40(rdx,r13,1),zmm4) vfmadd231pd( zmm4,zmm31,zmm21) vmovupd( zmm21,0x40(rdx,r13,1)) - vmovupd( 0x80(rdx,r13,1),zmm5 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x80(rdx,r13,1),zmm5 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm5,zmm31,zmm23) vmovupd( zmm23,0x80(rdx,r13,1) MASK_(k(2))) // store to C with mask @@ -2034,10 +2034,10 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // 
load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2070,7 +2070,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2103,7 +2103,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2136,7 +2136,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2169,7 +2169,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += 
cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2202,7 +2202,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2235,7 +2235,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2308,10 +2308,10 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2344,7 +2344,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 2 prefetchw0( mem(rdx, 64)) // prefetch C vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2376,7 +2376,7 @@ void 
bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2408,7 +2408,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2440,7 +2440,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2472,7 +2472,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2504,7 +2504,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2575,10 +2575,10 @@ void 
bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2610,7 +2610,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 2 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2642,7 +2642,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2674,7 +2674,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 4 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2706,7 +2706,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and 
zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2738,7 +2738,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 6 vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2770,7 +2770,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A - vmovupd( 0x40(rax),zmm4 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm4 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -2841,7 +2841,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8 label(.DLOOPKLEFT) // k_left loop vmovupd( mem(rax),zmm0 ) // load A - vmovupd( 0x40(rax),zmm1 MASK_(k(2)) MASK_(z) ) // Load A with mask and zero hint + vmovupd( 0x40(rax),zmm1 MASK_KZ(2) ) // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -2918,49 +2918,49 @@ void bli_dgemmsup_rv_zen4_asm_16x8 vmovupd( mem(rcx),zmm0) vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx)) - vmovupd( 0x40(rcx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm7) vmovupd( zmm7,0x40(rcx) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1)) - vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint 
vfmadd231pd( zmm4,zmm31,zmm9) vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2)) - vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm11) vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rcx,r13,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1)) - vmovupd( 0x40(rcx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm13) vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx),zmm0) vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx)) - vmovupd( 0x40(rdx),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm15) vmovupd( zmm15,0x40(rdx) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm16) vmovupd( zmm16,(rdx,rdi,1)) - vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm17) vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,rdi,2),zmm0) vfmadd231pd( zmm0,zmm31,zmm18) vmovupd( zmm18,(rdx,rdi,2)) - vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm1,zmm31,zmm19) vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2))) // store to C with mask vmovupd( mem(rdx,r13,1),zmm3) vfmadd231pd( zmm3,zmm31,zmm20) vmovupd( zmm20,(rdx,r13,1)) - vmovupd( 0x40(rdx,r13,1),zmm4 MASK_(k(2)) MASK_(z)) // Load C 
using mask and zero hint + vmovupd( 0x40(rdx,r13,1),zmm4 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm4,zmm31,zmm21) vmovupd( zmm21,0x40(rdx,r13,1) MASK_(k(2))) // store to C with mask @@ -3300,9 +3300,9 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3326,7 +3326,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3350,7 +3350,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3374,7 +3374,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3398,7 +3398,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // 
---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3422,7 +3422,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3446,7 +3446,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3502,9 +3502,9 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // ---------------------------------- iteration 1 prefetchw0( mem(rdx)) // prefetch C - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3527,7 +3527,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm0,zmm31,zmm20 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero 
hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3550,7 +3550,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm3,zmm31,zmm20 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3573,7 +3573,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm0,zmm31,zmm20 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3596,7 +3596,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm3,zmm31,zmm20 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3619,7 +3619,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm0,zmm31,zmm20 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3642,7 +3642,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm3,zmm31,zmm20 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( 
mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3696,9 +3696,9 @@ void bli_dgemmsup_rv_zen4_asm_8x8 label(.LOOP3) // ---------------------------------- iteration 1 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3721,7 +3721,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm0,zmm31,zmm20 ) // ---------------------------------- iteration 2 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3744,7 +3744,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm3,zmm31,zmm20 ) // ---------------------------------- iteration 3 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3767,7 +3767,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm0,zmm31,zmm20 ) // ---------------------------------- iteration 4 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( 
mem(rbx),zmm30 ) @@ -3790,7 +3790,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm3,zmm31,zmm20 ) // ---------------------------------- iteration 5 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3813,7 +3813,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm0,zmm31,zmm20 ) // ---------------------------------- iteration 6 - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3836,7 +3836,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 vfmadd231pd( zmm3,zmm31,zmm20 ) // ---------------------------------- iteration 7 - vmovupd( mem(rax),zmm3 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a prefetch( 0,mem(r15,r9,2) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) @@ -3890,7 +3890,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 je(.DPOSTACCUM) // if i == 0, jump to post-accumulation label(.DLOOPKLEFT) // k_left loop - vmovupd( mem(rax),zmm0 MASK_(k(2)) MASK_(z) ) // load A // Load A with mask and zero hint + vmovupd( mem(rax),zmm0 MASK_KZ(2) ) // load A // Load A with mask and zero hint add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) @@ -3946,28 +3946,28 @@ void bli_dgemmsup_rv_zen4_asm_8x8 jz(.DROWSTORED) // jump to row storage case label(.DCOLSTORED) - vmovupd( mem(rcx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm6) vmovupd( zmm6,(rcx) 
MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm8) vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm10) vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2))) // store to C with mask - vmovupd( mem(rcx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm12) vmovupd( zmm12,(rcx,r13,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm14) vmovupd( zmm14,(rdx) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx,rdi,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx,rdi,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm16) vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx,rdi,2),zmm0 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx,rdi,2),zmm0 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm0,zmm31,zmm18) vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2))) // store to C with mask - vmovupd( mem(rdx,r13,1),zmm3 MASK_(k(2)) MASK_(z)) // Load C using mask and zero hint + vmovupd( mem(rdx,r13,1),zmm3 MASK_KZ(2)) // Load C using mask and zero hint vfmadd231pd( zmm3,zmm31,zmm20) vmovupd( zmm20,(rdx,r13,1) MASK_(k(2))) // store to C with mask From 07df6ec46b1f3d7029c6cd2520643b6ddc7d1a69 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Fri, 12 May 2023 06:22:56 -0500 Subject: [PATCH 089/226] 
Ticket id correction for previous commit. Previous commit (30b931ae609d0dd7c5e7a8c14b12256d0552b4b2) is having incorrect ticket id. Correct ticket id for that commit is AMD-Internal:[CPUPL-3328] Change-Id: If3242714984ae3d3d9bbb0198bda91b4dd9a4bdc From 1a7f60ff5b6906c48989716eccd4d5c566d1bec0 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Fri, 12 May 2023 02:05:45 +0530 Subject: [PATCH 090/226] Update CMake system to use object libraries for haswell, skx and zen4. - AVX2 and AVX512 flags are set up locally for each object library that requires them. - Default ENABLE_SIMD_FLAGS value is set to none and for AVX2 option the corresponding compiler flag is set globally. - To be able to build zen4 codepath when ENABLE_SIMD_FLAGS=AVX2, the compiler option is removed by removing the definition before building the corresponding object library. AMD-Internal: [CPUPL-3241] Change-Id: Ia570e60f06c4c72b7c58f4c9ca73bac4c060ae73 --- CMakeLists.txt | 70 +++++++++-------------- kernels/haswell/1m/CMakeLists.txt | 6 +- kernels/haswell/3/CMakeLists.txt | 8 ++- kernels/haswell/3/sup/CMakeLists.txt | 11 ++-- kernels/haswell/3/sup/d6x8/CMakeLists.txt | 8 ++- kernels/skx/3/CMakeLists.txt | 7 ++- kernels/skx/CMakeLists.txt | 6 +- kernels/zen4/1/CMakeLists.txt | 6 +- kernels/zen4/1m/CMakeLists.txt | 6 +- kernels/zen4/3/CMakeLists.txt | 8 ++- kernels/zen4/3/sup/CMakeLists.txt | 7 ++- kernels/zen4/3/sup/d24x8/CMakeLists.txt | 6 +- kernels/zen4/CMakeLists.txt | 4 +- 13 files changed, 76 insertions(+), 77 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 075309c9db..3ee512e40e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -343,57 +343,17 @@ message(DISABLE_BLIS_ARCH_TYPE : ${DISABLE_BLIS_ARCH_TYPE}) message(RENAME_BLIS_ARCH_TYPE : ${RENAME_BLIS_ARCH_TYPE}) message(RENAME_BLIS_MODEL_TYPE : ${RENAME_BLIS_MODEL_TYPE}) -SET(ENABLE_SIMD_FLAGS "AVX2" CACHE STRING "Set compiler SIMD flags") +SET(ENABLE_SIMD_FLAGS "none" CACHE STRING "Set compiler SIMD flags") 
SET_PROPERTY(CACHE ENABLE_SIMD_FLAGS PROPERTY STRINGS none SSE2 AVX AVX2) if(${ENABLE_SIMD_FLAGS} MATCHES "AVX2") - #add_definitions(/arch:AVX2) + add_definitions(/arch:AVX2) elseif(${ENABLE_SIMD_FLAGS} MATCHES "AVX") add_definitions(/arch:AVX) elseif(${ENABLE_SIMD_FLAGS} MATCHES "SSE2") add_definitions(/arch:SSE2) endif() -if(${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1/bli_amaxv_zen_int_avx512.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1/bli_scalv_zen_int_avx512.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1/bli_dotv_zen_int_avx512.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1/bli_axpyv_zen_int_avx512.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_dgemm_skx_asm_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - 
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_trsm_small_AVX512.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.h PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.h PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_gemmsup_cv_zen_z12x4m.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c PROPERTIES COMPILE_FLAGS 
/arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c PROPERTIES COMPILE_FLAGS /arch:AVX512) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c PROPERTIES COMPILE_FLAGS /arch:AVX512) -endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Oi") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP") @@ -647,7 +607,18 @@ if(BUILD_SHARED_LIBS) $ $ $ - $) + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) if(ENABLE_OPENMP) target_link_libraries("${PROJECT_NAME}" PRIVATE OpenMP::OpenMP_CXX) endif() @@ -662,7 +633,18 @@ 
if(NOT BUILD_SHARED_LIBS) $ $ $ - $) + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) if(ENABLE_OPENMP) set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${OpenMP_libomp_LIBRARY}") else() diff --git a/kernels/haswell/1m/CMakeLists.txt b/kernels/haswell/1m/CMakeLists.txt index 58ce19c61a..effa9d22d0 100644 --- a/kernels/haswell/1m/CMakeLists.txt +++ b/kernels/haswell/1m/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(haswell_1m + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_c3xk.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_c8xk.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_d6xk.c @@ -11,3 +11,5 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_z3xk.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_z4xk.c ) + +target_compile_options(haswell_1m PRIVATE /arch:AVX2) \ No newline at end of file diff --git a/kernels/haswell/3/CMakeLists.txt b/kernels/haswell/3/CMakeLists.txt index c3bd3b2ee5..0f491c84e8 100644 --- a/kernels/haswell/3/CMakeLists.txt +++ b/kernels/haswell/3/CMakeLists.txt @@ -1,11 +1,13 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(haswell_3 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_haswell_asm_d6x8.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_haswell_asm_d8x6.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_haswell_asm_d6x8.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_haswell_asm_d6x8.c ) +target_compile_options(haswell_3 PRIVATE /arch:AVX2) + add_subdirectory(sup) diff --git a/kernels/haswell/3/sup/CMakeLists.txt b/kernels/haswell/3/sup/CMakeLists.txt index 6d13252de5..d0bf6a16f9 100644 --- a/kernels/haswell/3/sup/CMakeLists.txt +++ b/kernels/haswell/3/sup/CMakeLists.txt @@ -1,7 +1,7 @@ -##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(haswell_3sup + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_d6x8m.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_d6x8n.c #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_s6x16m.c @@ -11,6 +11,7 @@ target_sources("${PROJECT_NAME}" #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16m.c #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16n.c ) -add_subdirectory(d6x8) -#add_subdirectory(s6x16) +target_compile_options(haswell_3sup PRIVATE /arch:AVX2) +add_subdirectory(d6x8) +#add_subdirectory(s6x16) \ No newline at end of file diff --git a/kernels/haswell/3/sup/d6x8/CMakeLists.txt b/kernels/haswell/3/sup/d6x8/CMakeLists.txt index ce3bade013..2ad43e0a2f 100644 --- a/kernels/haswell/3/sup/d6x8/CMakeLists.txt +++ b/kernels/haswell/3/sup/d6x8/CMakeLists.txt @@ -1,7 +1,7 @@ -##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(haswell_3supd6x8 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_r_haswell_ref_dMx1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx2.c @@ -13,4 +13,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx6.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx8.c ) +target_compile_options(haswell_3supd6x8 PRIVATE /arch:AVX2) + diff --git a/kernels/skx/3/CMakeLists.txt b/kernels/skx/3/CMakeLists.txt index 30857ba975..4faf5cfefc 100644 --- a/kernels/skx/3/CMakeLists.txt +++ b/kernels/skx/3/CMakeLists.txt @@ -1,7 +1,8 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(skx_3 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_skx_asm_16x14.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_sgemm_skx_asm_32x12_l2.c ) +target_compile_options(skx_3 PRIVATE /arch:AVX2 /arch:AVX512) \ No newline at end of file diff --git a/kernels/skx/CMakeLists.txt b/kernels/skx/CMakeLists.txt index bc8f1eaab3..a9ba638da8 100644 --- a/kernels/skx/CMakeLists.txt +++ b/kernels/skx/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -add_subdirectory(3) +##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## +remove_definitions(/arch:AVX2) +add_subdirectory(3) \ No newline at end of file diff --git a/kernels/zen4/1/CMakeLists.txt b/kernels/zen4/1/CMakeLists.txt index 8f787a0c4e..a8d2c80097 100644 --- a/kernels/zen4/1/CMakeLists.txt +++ b/kernels/zen4/1/CMakeLists.txt @@ -1,9 +1,11 @@ ##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen4_1 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_zen_int_avx512.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_scalv_zen_int_avx512.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotv_zen_int_avx512.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_zen_int_avx512.c ) + +target_compile_options(zen4_1 PRIVATE /arch:AVX2 /arch:AVX512) \ No newline at end of file diff --git a/kernels/zen4/1m/CMakeLists.txt b/kernels/zen4/1m/CMakeLists.txt index 27a8e8c4cb..7e7cfda5f8 100644 --- a/kernels/zen4/1m/CMakeLists.txt +++ b/kernels/zen4/1m/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen4_1m + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d8xk.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d16xk.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d24xk.c @@ -9,3 +9,5 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z12xk.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z4xk.c ) + +target_compile_options(zen4_1m PRIVATE /U__PRFCHW__ /arch:AVX2 /arch:AVX512) \ No newline at end of file diff --git a/kernels/zen4/3/CMakeLists.txt b/kernels/zen4/3/CMakeLists.txt index c1c715c5e6..2590a6aa34 100644 --- a/kernels/zen4/3/CMakeLists.txt +++ b/kernels/zen4/3/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2022-23, Advanced Micro Devices, Inc. 
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen4_3 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen_16x14.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_zen_16x14.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen4_8x24.c @@ -12,4 +12,6 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_12x4.c ) -add_subdirectory(sup) +target_compile_options(zen4_3 PRIVATE /arch:AVX2 /arch:AVX512) + +add_subdirectory(sup) \ No newline at end of file diff --git a/kernels/zen4/3/sup/CMakeLists.txt b/kernels/zen4/3/sup/CMakeLists.txt index d56ee33239..642cf9da6a 100644 --- a/kernels/zen4/3/sup/CMakeLists.txt +++ b/kernels/zen4/3/sup/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen4_3sup + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.h ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64m.c @@ -13,5 +13,6 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_cv_zen4_z12x4m.c ) +target_compile_options(zen4_3sup PRIVATE /arch:AVX2 /arch:AVX512) -add_subdirectory(d24x8) +add_subdirectory(d24x8) \ No newline at end of file diff --git a/kernels/zen4/3/sup/d24x8/CMakeLists.txt b/kernels/zen4/3/sup/d24x8/CMakeLists.txt index 640067f6cf..254a031866 100644 --- a/kernels/zen4/3/sup/d24x8/CMakeLists.txt +++ b/kernels/zen4/3/sup/d24x8/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
All rights reserved.## -target_sources("${PROJECT_NAME}" - PRIVATE +add_library(zen4_3supd24x8 + OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx2.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx3.c @@ -11,3 +11,5 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx6.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx7.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx8.c ) + +target_compile_options(zen4_3supd24x8 PRIVATE /arch:AVX2 /arch:AVX512) \ No newline at end of file diff --git a/kernels/zen4/CMakeLists.txt b/kernels/zen4/CMakeLists.txt index b6bd12d444..7878918053 100644 --- a/kernels/zen4/CMakeLists.txt +++ b/kernels/zen4/CMakeLists.txt @@ -1,7 +1,7 @@ ##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## +remove_definitions(/arch:AVX2) add_subdirectory(1) add_subdirectory(1m) add_subdirectory(3) -add_subdirectory(aocl_smart) - +add_subdirectory(aocl_smart) \ No newline at end of file From 1e266bbcbcbfa52cbaeb569b14250f3308b9fc2f Mon Sep 17 00:00:00 2001 From: mkadavil Date: Wed, 17 May 2023 18:07:16 +0530 Subject: [PATCH 091/226] LPGEMM framework updates to avoid unnecessary pack buffer allocation. -Currently when any of the downscale API is called, a temporary pack buffer is allocated (with bli_membrk_acquire_m) by each thread. It is used to persist intermediate higher precision output accumulated by the micro-kernel across pc loop when the number of pc iterations is more than 1 (k > KC). The bli_membrk_acquire_m is a thread safe operation and uses locks (pthread_mutex) to ensure thread safe checkout of memory/ block from the memory pool. -However when k < KC, this temporary buffer is not required. But since this pack buffer is allocated by default in downscale API, the overhead from locks affects performance when k < KC, m or n is sufficiently small and the number of threads involved is high. 
This default allocation is removed and the pack buffer is now only allocated if k > KC. AMD-Internal: [CPUPL-3430] Change-Id: I492586ff4c47bc7480d364efb7af3674e31bd2c1 --- .../frame/f32f32f32/lpgemm_f32f32f32.c | 2 +- .../aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c | 54 +++++++++++-------- .../aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c | 26 +++++---- .../aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c | 24 ++++++--- .../aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c | 26 +++++---- 5 files changed, 82 insertions(+), 50 deletions(-) diff --git a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c index 62c85d905f..1864d78330 100644 --- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c +++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c @@ -176,7 +176,7 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32) dim_t jc_cur_loop = jc; dim_t jc_cur_loop_rem = 0; - dim_t n_sub_updated; + dim_t n_sub_updated = 0; if ( mtag_b == REORDERED ) { diff --git a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c index 6bb2278f66..86ee194eb5 100644 --- a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c +++ b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c @@ -163,23 +163,31 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) // Temp accumulaton buffer for C allocation. else if ( c_downscale == TRUE ) { - mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start ); + // Buffer memory is only required if output needs to be + // persisted across iterations of the pc/KC loop. + // It was observed that the locks used while checking out + // a buffer from memory pool had an impact on performance + // and is better to not checkout if k <= KC. 
+ if ( k > KC ) + { + mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start ); - lpgemm_alloc_mem_panel - ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, - &mem_scale_c, rntm - ); + lpgemm_alloc_mem_panel + ( + mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + &mem_scale_c, rntm + ); - temp_scal_c_buffer_s8s8s16o16 = bli_mem_buffer( &mem_scale_c ); + temp_scal_c_buffer_s8s8s16o16 = bli_mem_buffer( &mem_scale_c ); - c_use_jc = ( int16_t* )temp_scal_c_buffer_s8s8s16o16; + c_use_jc = ( int16_t* )temp_scal_c_buffer_s8s8s16o16; + } // The temp c buffer stride is modified as opposed to original C matrix. rs_c_use = nc0; } - int16_t* pack_b_column_sum = NULL; + int16_t* pack_b_column_sum = NULL; for (dim_t pc = 0; pc < k; pc += KC) { @@ -248,7 +256,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) &jc_packb_start, &jc_packb_end ); - if ( pc == 0) + if ( pc == 0) { pack_b_column_sum = ( int16_t* )( pack_b_buffer_s8s8s16o16 + ( sizeof( int8_t ) * nc0_updated * kc0_updated ) ); } @@ -259,7 +267,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) if ((jc_packb_end > jc_packb_start) && (jc_packb_start < (jc + nc0))) { - if ( pc == 0 ) + if ( pc == 0 ) { for (int idx = jc_packb_start; idx < jc_packb_end; idx++ ) { @@ -269,15 +277,15 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) ( ( packb_s16_s8 )lcntx->packb_fun_ptr ) ( - pack_b_buffer_s8s8s16o16 + - (jc_packb_start * kc0_updated), - pack_b_column_sum + ( cs_b * jc_packb_start ), - (b + (rs_b * pc) + (cs_b * jc) + - (cs_b * jc_packb_start)), - rs_b, - (jc_packb_end - jc_packb_start), kc0, - &rs_b_use, &cs_b_use - ); + pack_b_buffer_s8s8s16o16 + + (jc_packb_start * kc0_updated), + pack_b_column_sum + ( cs_b * jc_packb_start ), + (b + (rs_b * pc) + (cs_b * jc) + + (cs_b * jc_packb_start)), + rs_b, + (jc_packb_end - jc_packb_start), kc0, + &rs_b_use, &cs_b_use + ); } else { @@ -293,7 +301,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) ); b_use = pack_b_buffer_s8s8s16o16; - 
post_ops_attr.b_col_sum_vec_s16 = pack_b_column_sum; + post_ops_attr.b_col_sum_vec_s16 = pack_b_column_sum; } else if (mtag_b == REORDERED) { @@ -307,7 +315,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use ); - post_ops_attr.b_col_sum_vec_s16 = ( ( int16_t* )( b + ( k_updated * n_updated ) ) ) + jc; + post_ops_attr.b_col_sum_vec_s16 = ( ( int16_t* )( b + ( k_updated * n_updated ) ) ) + jc; } else { @@ -356,7 +364,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) alpha, beta0, post_op_list, post_ops_attr ); - post_ops_attr.b_sum_offset += NR; + post_ops_attr.b_sum_offset += NR; } } } diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c index bee2eb8ea7..98b8081b51 100644 --- a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c +++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c @@ -151,7 +151,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) dim_t jc_cur_loop = jc; dim_t jc_cur_loop_rem = 0; - dim_t n_sub_updated; + dim_t n_sub_updated = 0; if ( mtag_b == REORDERED ) { @@ -170,17 +170,25 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) // Temp accumulaton buffer for C allocation. else if ( c_downscale == TRUE ) { - mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start ); + // Buffer memory is only required if output needs to be + // persisted across iterations of the pc/KC loop. + // It was observed that the locks used while checking out + // a buffer from memory pool had an impact on performance + // and is better to not checkout if k <= KC. 
+ if ( k > KC ) + { + mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start ); - lpgemm_alloc_mem_panel - ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, - &mem_scale_c, rntm - ); + lpgemm_alloc_mem_panel + ( + mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + &mem_scale_c, rntm + ); - temp_scal_c_buffer_s8s8s32o32 = bli_mem_buffer( &mem_scale_c ); + temp_scal_c_buffer_s8s8s32o32 = bli_mem_buffer( &mem_scale_c ); - c_use_jc = ( int32_t* )temp_scal_c_buffer_s8s8s32o32; + c_use_jc = ( int32_t* )temp_scal_c_buffer_s8s8s32o32; + } // The temp c buffer stride is modified as opposed to original C matrix. rs_c_use = nc0; diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c index 43b27f8ad9..5a03493a44 100644 --- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c +++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c @@ -160,17 +160,25 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16) // Temp accumulaton buffer for C allocation. else if ( c_downscale == TRUE ) { - mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start ); + // Buffer memory is only required if output needs to be + // persisted across iterations of the pc/KC loop. + // It was observed that the locks used while checking out + // a buffer from memory pool had an impact on performance + // and is better to not checkout if k <= KC. 
+ if ( k > KC ) + { + mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start ); - lpgemm_alloc_mem_panel - ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, - &mem_scale_c, rntm - ); + lpgemm_alloc_mem_panel + ( + mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + &mem_scale_c, rntm + ); - temp_scal_c_buffer_u8s8s16o16 = bli_mem_buffer( &mem_scale_c ); + temp_scal_c_buffer_u8s8s16o16 = bli_mem_buffer( &mem_scale_c ); - c_use_jc = ( int16_t* )temp_scal_c_buffer_u8s8s16o16; + c_use_jc = ( int16_t* )temp_scal_c_buffer_u8s8s16o16; + } // The temp c buffer stride is modified as opposed to original C matrix. rs_c_use = nc0; diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c index 3474ae0a90..feedda0212 100644 --- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c +++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c @@ -150,7 +150,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) dim_t jc_cur_loop = jc; dim_t jc_cur_loop_rem = 0; - dim_t n_sub_updated; + dim_t n_sub_updated = 0; if ( mtag_b == REORDERED ) { @@ -169,17 +169,25 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) // Temp accumulaton buffer for C allocation. else if ( c_downscale == TRUE ) { - mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start ); + // Buffer memory is only required if output needs to be + // persisted across iterations of the pc/KC loop. + // It was observed that the locks used while checking out + // a buffer from memory pool had an impact on performance + // and is better to not checkout if k <= KC. 
+ if ( k > KC ) + { + mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start ); - lpgemm_alloc_mem_panel - ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, - &mem_scale_c, rntm - ); + lpgemm_alloc_mem_panel + ( + mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + &mem_scale_c, rntm + ); - temp_scal_c_buffer_u8s8s32o32 = bli_mem_buffer( &mem_scale_c ); + temp_scal_c_buffer_u8s8s32o32 = bli_mem_buffer( &mem_scale_c ); - c_use_jc = ( int32_t* )temp_scal_c_buffer_u8s8s32o32; + c_use_jc = ( int32_t* )temp_scal_c_buffer_u8s8s32o32; + } // The temp c buffer stride is modified as opposed to original C matrix. rs_c_use = nc0; From 9ee95e171aa3537bbc33c72b11f44f871373c84d Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Thu, 18 May 2023 14:18:53 +0530 Subject: [PATCH 092/226] Control flow issue reported during static code analysis - Missing break statement will result in unexpected control flow. This function will not launch the threads for the API in question according to the AOCL dynamic logic without the break statement. AMD-Internal: [CPUPL-3436] Change-Id: Ic47d773169c09e84086a27b50cd59dba33529698 --- frame/base/bli_rntm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 8425ae8060..98131623d8 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -1685,6 +1685,8 @@ static void aocl_zdscalv_dynamic else *nt_ideal = 64; + break; + default: /* Without this default condition, compiler will throw From 26e120ea25bfc63fd1d9fa6d1eda969b996b9306 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Wed, 17 May 2023 03:02:26 +0530 Subject: [PATCH 093/226] Fixed diagonal packing for C/Z TRSM small - In C/Z TRSM small, packing in case of unit diagonal is not handled properly. - Diagonal elements are still being read even in case of unit diagonal. - This causes "Conditional jump or move depends on uninitialised value" error during valgrind tests. 
- To fix this, diagonal elements should not be read in case of unit diagonal. AMD-Internal: [CPUPL-3406] Change-Id: If3d6965299998a83d87f3a032f654fc7f8c43d4e --- kernels/zen/3/bli_trsm_small.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index 6999a225e1..d08dbb2279 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -8188,6 +8188,16 @@ BLIS_INLINE void ztrsm_small_pack_diag_element dim_t size ) { + if ( is_unitdiag ) + { + dcomplex ones = {1.0, 0.0}; + for( dim_t i = 0; i < size; i++) + { + d11_pack[i].real = ones.real; + d11_pack[i].imag = ones.imag; + } + return; + } #ifdef BLIS_ENABLE_TRSM_PREINVERSION // If Preinversion is enabled, inverse the diaganol // elements from A and pack into diagonal buffer. @@ -39478,6 +39488,16 @@ BLIS_INLINE void ctrsm_small_pack_diag_element dim_t size ) { + if ( is_unitdiag ) + { + scomplex ones = {1.0, 0.0}; + for( dim_t i = 0; i < size; i++) + { + d11_pack[i].real = ones.real; + d11_pack[i].imag = ones.imag; + } + return; + } #ifdef BLIS_ENABLE_TRSM_PREINVERSION // If Preinversion is disabled, inverse the diaganol // elements from A and pack into diagonal buffer. From 061a68ff0d85df34b7dbf0b96fe39940b3813dd0 Mon Sep 17 00:00:00 2001 From: eashdash Date: Sun, 14 May 2023 17:57:09 +0000 Subject: [PATCH 094/226] BF16 Downscale and Performance fix for bf16 API This change contains the following: 1. Downscale optimization fix a. Similar to downscale optimizations made for s32 and s16 gemm, the following optimizations are done to improve the downscale performance for BF16 gemm b. The store to temporary float buffer can be avoided when k < KC since intermediate accumulation will not be required for the pc loop (only 1 iteration). The downscaled values (bf16) are written directly to the output C matrix. c. 
Within the micro-kernel, when beta != 0, the bf16 data from the original C output matrix is loaded to a register, converted to float and beta scaling is applied on it at register level. This eliminates the requirement in the previous design of copying the bf16 value to the temporary float buffer inside the jc loop. 2. Alpha scaling a. Alpha scaling (multiply instruction) by default was resulting in performance regression when the k dimension is small and alpha=1 in bf16 micro-kernels. b. Alpha scaling is now only done when alpha != 1. 3. K Fringe optimization a. Previously memcpy was used for K fringe case to load elements from A matrix in the microkernels. b. Now, masked stores are used to store the downscaled and non-downscaled outputs without the need to use memcpy functions. 4. N LT-16 fringe optimization a. Previously memcpy was used for N LT 16 fringe case in the microkernels for storing the downscaled and non-downscaled output. b. Now, masked stores are used to store the downscaled and non-downscaled outputs of BF16 without the need to use memcpy functions. 5. Framework updates to avoid unnecessary pack buffer allocation a. The default allocation of the temporary pack buffer is removed and the pack buffer is now only allocated if k > KC.
AMD-Internal: [CPUPL-3437] Change-Id: I71ff862e7d250559409a12a3533678c7a7951044 --- .../aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 76 +- .../lpgemm_6x64rowmajor_bf16_amd512vnni.c | 583 ++-- .../bf16bf16f32/lpgemm_f32_kern_macros.h | 65 +- .../lpgemm_m_fringe_bf16_amd512vnni.c | 1524 ++++---- .../lpgemm_mn_fringe_bf16_amd512vnni.c | 3070 +++++++++++------ .../lpgemm_n_fringe_bf16_amd512vnni.c | 1219 ++++--- 6 files changed, 3969 insertions(+), 2568 deletions(-) diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index ac56763d19..1ece1db727 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -102,10 +102,21 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) dim_t k_updated = k; k_updated += (k_updated & 0x1); - // Is required to decide whether to apply post ops or not. + // To decide whether to apply post ops or not. bool is_last_k = FALSE; + // To decide whether to use original s8 C or temp buffer for beta scale. + bool is_first_k = FALSE; + lpgemm_post_op_attr post_ops_attr; + if ( c_downscale == TRUE ) + { + post_ops_attr.buf_downscale = c; + } + else + { + post_ops_attr.buf_downscale = NULL; + } // Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t. thrinfo_t thread_jc; @@ -126,7 +137,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) dim_t jc_cur_loop = jc; dim_t jc_cur_loop_rem = 0; - dim_t n_sub_updated; + dim_t n_sub_updated = 0; if ( mtag_b == REORDERED ) { @@ -145,45 +156,24 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) // Temp accumulaton buffer for C allocation. else if ( c_downscale == TRUE ) { - mem_scale_c_size_req = sizeof( float ) * nc0 * ( ic_end - ic_start ); - - lpgemm_alloc_mem_panel - ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, - &mem_scale_c, rntm - ); + // Buffer memory is only required if output needs to be + // persisted across iterations of the pc/KC loop. 
+ // It was observed that the locks used while checking out + // a buffer from memory pool had an impact on performance + // and is better to not checkout if k <= KC. + if ( k > KC ) + { + mem_scale_c_size_req = sizeof( float ) * nc0 * ( ic_end - ic_start ); - temp_scal_c_buffer_bf16 = bli_mem_buffer( &mem_scale_c ); + lpgemm_alloc_mem_panel + ( + mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + &mem_scale_c, rntm + ); - c_use_jc = ( float* )temp_scal_c_buffer_bf16; + temp_scal_c_buffer_bf16 = bli_mem_buffer( &mem_scale_c ); - if ( beta != 0 ) - { - dim_t i_temp = 0; - dim_t j_temp = 0; - int32_t temp_conv_buf = 0; - // Upscale out C to temporary C matrix. - for ( dim_t i_dscale = ic_start; i_dscale < ic_end; ++i_dscale ) - { - j_temp = 0; - for ( dim_t j_dscale = jc; j_dscale < ( jc + nc0 ); ++j_dscale ) - { - // Implemented with the idea sizeof(float)=4. - temp_conv_buf = 0; - temp_conv_buf = *( ( int16_t* )( ( bfloat16* )c + - ( rs_c * i_dscale ) + j_dscale ) ); - - // Add 16 bits in the fractional part. - temp_conv_buf = temp_conv_buf << 16; - - // Store the bytes in float format. - *( temp_scal_c_buffer_bf16 + ( nc0 * i_temp ) + j_temp ) - = *( ( float* )( &temp_conv_buf ) ); - - j_temp++; - } - i_temp++; - } + c_use_jc = ( float* )temp_scal_c_buffer_bf16; } // The temp c buffer stride is modified as opposed to original C matrix. @@ -195,6 +185,13 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) float beta0 = ( pc == 0 ) ? beta : 1; dim_t kc0 = bli_min( ( k - pc ), KC ); + // No parallelization in k dim, k always starts at 0. + is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE ); + post_ops_attr.is_first_k = is_first_k; + + is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE ); + post_ops_attr.is_last_k = is_last_k; + // kc0 needs to be a multiple of 2 so that it can be // used with dpbf16_ps instruction. 
Padding is added in // cases this condition is not satisfied, and therefore @@ -203,8 +200,6 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) dim_t kc0_updated = kc0; kc0_updated += (kc0_updated & 0x1); - is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE ); - if ( mtag_b == PACK ) { // Pack B chunks are based on jc work id. @@ -330,7 +325,6 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) post_ops_attr.post_op_c_i = ic; post_ops_attr.post_op_c_j = ( jc + jr ); post_ops_attr.rs_c_downscale = rs_c_downscale; - post_ops_attr.is_last_k = is_last_k; // Reorder/Packed B, Reorder/Packed/Unpacked A call. ( ( lpgemm_rowvar_bf16 )lcntx->kern_fun_ptr ) diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c index 53df235a2d..592af7f042 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c @@ -71,7 +71,7 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; if ( n0 < NR ) { @@ -414,13 +414,8 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); @@ -431,13 +426,8 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); @@ -448,13 +438,8 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 ); c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 ); @@ -465,13 +450,8 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 ); c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 ); @@ -482,13 +462,8 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 ); // Broadcast a[4,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 ); c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 ); @@ -499,13 +474,8 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 ); // Broadcast a[5,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 ); c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 ); @@ -523,159 +493,197 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) __m512 selector1 = _mm512_set1_ps ( alpha ); __m512 selector2 = _mm512_set1_ps ( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); - - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); - - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); - c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); - - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); - c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 ); - - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); - c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); - c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 ); - - c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); - c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 ); - c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 ); - c_float_5p3 = _mm512_mul_ps( selector1, 
c_float_5p3 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); + + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); + + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 ); + + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); + c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 ); + + c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 ); + c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 ); + c_float_5p3 = _mm512_mul_ps( selector1, c_float_5p3 ); + + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // For the downscaled api (C-bf16), the output C matrix values + // needs to be upscaled to float to be used for beta scale. 
+ if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2) - // c[0,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_0p3,ir,0,3,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2) - // c[1,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 ); + 
// c[1,48-63] + BF16_F32_BETA_OP(c_float_1p3,ir,1,3,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2) - // c[2,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 ); + // c[2,48-63] + BF16_F32_BETA_OP(c_float_2p3,ir,2,3,selector1,selector2) - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2) - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,16-31] + BF16_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2) - // c[3,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 ); + // c[3,32-47] + BF16_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2) - // c[3,48-63] - 
selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p3 = _mm512_add_ps( selector1, c_float_3p3 ); + // c[3,48-63] + BF16_F32_BETA_OP(c_float_3p3,ir,3,3,selector1,selector2) - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[4,0-15] + BF16_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2) - // c[4,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 ); + // c[4,16-31] + BF16_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2) - // c[4,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 ); + // c[4,32-47] + BF16_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2) - // c[4,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p3 = _mm512_add_ps( selector1, c_float_4p3 ); + // c[4,48-63] + BF16_F32_BETA_OP(c_float_4p3,ir,4,3,selector1,selector2) - // c[5,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 ); + // c[5,0-15] + BF16_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2) - // c[5,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p1 = _mm512_add_ps( selector1, c_float_5p1 ); + // c[5,16-31] + BF16_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2) - // c[5,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( 
selector2, selector1 ); - c_float_5p2 = _mm512_add_ps( selector1, c_float_5p2 ); + // c[5,32-47] + BF16_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2) + + // c[5,48-63] + BF16_F32_BETA_OP(c_float_5p3,ir,5,3,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_0p3,ir,0,3,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2) + + // c[1,48-63] + F32_F32_BETA_OP(c_float_1p3,ir,1,3,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2) + + // c[2,48-63] + F32_F32_BETA_OP(c_float_2p3,ir,2,3,selector1,selector2) + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2) + + // c[3,16-31] + F32_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2) + + // c[3,32-47] + F32_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2) + + // c[3,48-63] + F32_F32_BETA_OP(c_float_3p3,ir,3,3,selector1,selector2) + + // c[4,0-15] + F32_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2) + + // c[4,16-31] + F32_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2) + + // c[4,32-47] + F32_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2) + + // c[4,48-63] + F32_F32_BETA_OP(c_float_4p3,ir,4,3,selector1,selector2) + + // c[5,0-15] + F32_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2) + + // c[5,16-31] + F32_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2) + + // c[5,32-47] + F32_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2) + + // 
c[5,48-63] + F32_F32_BETA_OP(c_float_5p3,ir,5,3,selector1,selector2) + + } - // c[5,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p3 = _mm512_add_ps( selector1, c_float_5p3 ); } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1276,76 +1284,76 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) POST_OPS_DOWNSCALE_6x64: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[0, 48-63] - CVT_F32_BF16(c_float_0p3,0,3); + MULRND_F32(c_float_0p3,0,3); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[1, 48-63] - CVT_F32_BF16(c_float_1p3,1,3); + MULRND_F32(c_float_1p3,1,3); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); // c[2, 48-63] - CVT_F32_BF16(c_float_2p3,2,3); + MULRND_F32(c_float_2p3,2,3); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); // c[3, 32-47] - CVT_F32_BF16(c_float_3p2,3,2); + MULRND_F32(c_float_3p2,3,2); // c[3, 48-63] - CVT_F32_BF16(c_float_3p3,3,3); + MULRND_F32(c_float_3p3,3,3); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); // c[4, 16-31] - CVT_F32_BF16(c_float_4p1,4,1); + MULRND_F32(c_float_4p1,4,1); // c[4, 32-47] - CVT_F32_BF16(c_float_4p2,4,2); + MULRND_F32(c_float_4p2,4,2); // c[4, 48-63] - CVT_F32_BF16(c_float_4p3,4,3); 
+ MULRND_F32(c_float_4p3,4,3); // c[5, 0-15] - CVT_F32_BF16(c_float_5p0,5,0); + MULRND_F32(c_float_5p0,5,0); // c[5, 16-31] - CVT_F32_BF16(c_float_5p1,5,1); + MULRND_F32(c_float_5p1,5,1); // c[5, 32-47] - CVT_F32_BF16(c_float_5p2,5,2); + MULRND_F32(c_float_5p2,5,2); // c[5, 48-63] - CVT_F32_BF16(c_float_5p3,5,3); + MULRND_F32(c_float_5p3,5,3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1353,78 +1361,169 @@ LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64) POST_OPS_6x64_DISABLE: ; - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 ); + // Store the results in downscaled type (bf16 instead of float). 
- // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 ); + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[0,48-63] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_float_0p3 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 ); + // c[0, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 ); + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[1,48-63] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_float_1p3 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 ); + // c[1, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 ); + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[2,48-63] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_float_2p3 ); + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 ); + // c[2, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3); - // c[3,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 ); + // c[3, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); - // c[3,48-63] - _mm512_storeu_ps( c + ( rs_c * ( ir + 
3 ) ) + ( 3*16 ), c_float_3p3 ); + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); + // c[3, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2); - // c[4,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 ); + // c[3, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_3p3,3,3); - // c[4,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 ); + // c[4, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); - // c[4,48-63] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_float_4p3 ); + // c[4, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1); - // c[5,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + // c[4, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2); - // c[5,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 ); + // c[4, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_4p3,4,3); - // c[5,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 ); + // c[5, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0); - // c[5,48-63] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_float_5p3 ); + // c[5, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_5p1,5,1); + + // c[5, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_5p2,5,2); + + // c[5, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_5p3,5,3); + + } + + // Case where the output C matrix is float + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 ); + + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 ); + + // c[0,48-63] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_float_0p3 ); + + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); + + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 ); + + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 ); + + // c[1,48-63] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_float_1p3 ); + + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); + + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 ); + + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 ); + + // c[2,48-63] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_float_2p3 ); + + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); + + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 ); + + // c[3,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 ); + + // c[3,48-63] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_float_3p3 ); + + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); + + // c[4,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 ); + + // c[4,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 ); + + // c[4,48-63] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_float_4p3 ); + + // c[5,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + + // c[5,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 ); + + 
// c[5,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 ); + + // c[5,48-63] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_float_5p3 ); + + } a = a + ( MR * ps_a ); post_ops_attr.post_op_c_i += MR; diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h index 92980193c4..325be38bf7 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h @@ -52,28 +52,63 @@ /* Apply scaling on for <= 0 elements.*/ \ reg = _mm512_mask_mul_ps( reg, relu_cmp_mask, reg, selector2 ); \ -#define CVT_F32_BF16(reg,m_ind,n_ind) \ - _mm256_storeu_epi16 \ +// F32 fma macro +#define F32_BETA_FMA(reg,scratch1,scratch2) \ + scratch1 = _mm512_mul_ps( scratch2, scratch1 ); \ + reg = _mm512_add_ps( scratch1, reg ); \ + +// Beta scale macro, scratch2=beta +#define F32_F32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ + scratch1 = \ + _mm512_loadu_ps \ ( \ - ( bfloat16* )post_ops_list_temp->op_args3 + \ - ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ - post_ops_attr.post_op_c_j + ( n_ind * 16 ), \ - (__m256i) _mm512_cvtneps_pbh( reg ) \ - ) \ + ( c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ) \ + ); \ + F32_BETA_FMA(reg,scratch1,scratch2) \ -#define CVT_F32_BF16_LT16(reg,m_ind,n_ind) \ - _mm256_storeu_epi16 \ +// Downscale beta scale macro, scratch2=beta +#define BF16_F32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ + scratch1 = \ + _mm512_cvtpbh_ps \ ( \ - buf0, \ - (__m256i) _mm512_cvtneps_pbh( reg ) \ + (__m256bh)_mm256_loadu_epi16 \ + ( \ + ( ( bfloat16* )post_ops_attr.buf_downscale + \ + ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ + post_ops_attr.post_op_c_j + ( n_ind * 16 ) )\ + ) \ ); \ - memcpy \ + F32_BETA_FMA(reg,scratch1,scratch2) \ + +// Default n < 16 mask load beta macro +#define 
F32_F32_BETA_OP_NLT16F_MASK(lmask,reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ + scratch1 = _mm512_maskz_loadu_ps( lmask, c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \ + F32_BETA_FMA(reg,scratch1,scratch2) \ + +// Downscale n < 16 mask load beta macro +#define BF16_F32_BETA_OP_NLT16F_MASK(lmask,reg,m_ind,n_ind,scratch1,scratch2) \ + scratch1 = _mm512_cvtpbh_ps \ ( \ - ( bfloat16* )post_ops_list_temp->op_args3 + \ + (__m256bh)_mm256_maskz_loadu_epi16 \ + ( \ + lmask, \ + ( bfloat16* )post_ops_attr.buf_downscale + \ + ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ + post_ops_attr.post_op_c_j + ( n_ind * 16 ) \ + ) \ + ); \ + F32_BETA_FMA(reg,scratch1,scratch2) \ + +#define MULRND_F32(reg,m_ind,n_ind) \ + +#define CVT_STORE_F32_BF16_MASK(reg,m_ind,n_ind) \ + _mm256_mask_storeu_epi16 \ + ( \ + ( bfloat16* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ), \ - buf0, ( n0_rem * sizeof( bfloat16 ) ) \ - ); \ + mask_all1, (__m256i) _mm512_cvtneps_pbh( reg ) \ + ) \ /* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ) */ #define GELU_TANH_F32_AVX512(reg, r, r2, x, z, dn, x_tanh, q) \ diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c index d364ba247b..e3e3bc2869 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c @@ -58,7 +58,7 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -164,13 +164,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( 
cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); @@ -181,13 +176,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); @@ -198,13 +188,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 ); c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 ); @@ -215,13 +200,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+4]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 ); c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 ); @@ -232,13 +212,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 ); // Broadcast a[4,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 ); c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 ); @@ -256,134 +231,166 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); - - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); - - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); - c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); - - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 
); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); - c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); + + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); + + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 ); + + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); + c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); - c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); - c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // For the downscaled api (C-bf16), the output C matrix values + // needs to be upscaled to float to be used for beta scale. 
+ if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[0,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[1,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 ); + // c[1,48-63] + 
BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) - // c[2,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 ); + // c[2,48-63] + BF16_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2) - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,16-31] + BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) - // c[3,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 ); + // c[3,32-47] + BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) - // c[3,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( 
selector2, selector1 ); - c_float_3p3 = _mm512_add_ps( selector1, c_float_3p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2) - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[4,0-15] + BF16_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2) - // c[4,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 ); + // c[4,16-31] + BF16_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2) - // c[4,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 ); + // c[4,32-47] + BF16_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2) - // c[4,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p3 = _mm512_add_ps( selector1, c_float_4p3 ); + // c[4,48-63] + BF16_F32_BETA_OP(c_float_4p3,0,4,3,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[1,48-63] + F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) + + // c[2,32-47] + 
F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + + // c[2,48-63] + F32_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2) + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) + + // c[3,16-31] + F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) + + // c[3,32-47] + F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2) + + // c[4,0-15] + F32_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2) + + // c[4,16-31] + F32_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2) + + // c[4,32-47] + F32_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2) + + // c[4,48-63] + F32_F32_BETA_OP(c_float_4p3,0,4,3,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -896,130 +903,208 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64) POST_OPS_DOWNSCALE_5x64: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[0, 48-63] - CVT_F32_BF16(c_float_0p3,0,3); + MULRND_F32(c_float_0p3,0,3); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[1, 48-63] - CVT_F32_BF16(c_float_1p3,1,3); + MULRND_F32(c_float_1p3,1,3); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); // c[2, 48-63] - CVT_F32_BF16(c_float_2p3,2,3); + MULRND_F32(c_float_2p3,2,3); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + 
MULRND_F32(c_float_3p1,3,1); // c[3, 32-47] - CVT_F32_BF16(c_float_3p2,3,2); + MULRND_F32(c_float_3p2,3,2); // c[3, 48-63] - CVT_F32_BF16(c_float_3p3,3,3); + MULRND_F32(c_float_3p3,3,3); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); // c[4, 16-31] - CVT_F32_BF16(c_float_4p1,4,1); + MULRND_F32(c_float_4p1,4,1); // c[4, 32-47] - CVT_F32_BF16(c_float_4p2,4,2); + MULRND_F32(c_float_4p2,4,2); // c[4, 48-63] - CVT_F32_BF16(c_float_4p3,4,3); + MULRND_F32(c_float_4p3,4,3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_5x64_DISABLE: ; + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). 
+ + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[0, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3); + + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); + + // c[1, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3); + + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); + + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); + + // c[2, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3); + + // c[3, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); + + // c[3, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2); + + // c[3, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_3p3,3,3); + + // c[4, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // c[4, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[4, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[4, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_4p3,4,3); - // c[0,48-63] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + } - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // Case where the output C matrix is float + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[1,48-63] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); + // c[0,48-63] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); - // c[2,48-63] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 ); + // c[1,48-63] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + + // c[2,48-63] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 ); + + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); - // c[3,32-47] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 ); + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + + // c[3,32-47] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 
); - // c[3,48-63] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 ); + // c[3,48-63] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 ); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); + + // c[4,16-31] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 ); - // c[4,16-31] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 ); + // c[4,32-47] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 ); - // c[4,32-47] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 ); + // c[4,48-63] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 3*16 ), c_float_4p3 ); - // c[4,48-63] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 3*16 ), c_float_4p3 ); + } } // 4x64 bf16 kernel @@ -1039,7 +1124,7 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1130,13 +1215,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); @@ -1147,13 +1227,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); @@ -1164,13 +1239,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 ); c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 ); @@ -1181,13 +1251,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 ); c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 ); @@ -1205,109 +1270,137 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); - - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); - - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); - c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); - - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); - c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 
); + c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); + + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); + + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // For the downscaled api (C-bf16), the output C matrix values + // needs to be upscaled to float to be used for beta scale. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[0,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( 
selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[1,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 ); + // c[1,48-63] + BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) - // c[2,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 ); + // c[2,48-63] + BF16_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2) - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 
0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,16-31] + BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) - // c[3,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 ); + // c[3,32-47] + BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) - // c[3,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p3 = _mm512_add_ps( selector1, c_float_3p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2) + + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[1,48-63] + F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + + // c[2,48-63] + F32_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2) + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) + + // c[3,16-31] + 
F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) + + // c[3,32-47] + F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) + + // c[3,48-63] + F32_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1733,52 +1826,52 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) POST_OPS_DOWNSCALE_4x64: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[0, 48-63] - CVT_F32_BF16(c_float_0p3,0,3); + MULRND_F32(c_float_0p3,0,3); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[1, 48-63] - CVT_F32_BF16(c_float_1p3,1,3); + MULRND_F32(c_float_1p3,1,3); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); // c[2, 48-63] - CVT_F32_BF16(c_float_2p3,2,3); + MULRND_F32(c_float_2p3,2,3); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); // c[3, 32-47] - CVT_F32_BF16(c_float_3p2,3,2); + MULRND_F32(c_float_3p2,3,2); // c[3, 48-63] - CVT_F32_BF16(c_float_3p3,3,3); + MULRND_F32(c_float_3p3,3,3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1786,54 +1879,119 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64) POST_OPS_4x64_DISABLE: ; - // Store the results.
- // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[0, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3); + + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); + + // c[1, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); - // c[0,48-63] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[2, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[3, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); - // c[1,48-63] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); + // 
c[3, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[3, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_3p3,3,3); + } + + // Case where the output C matrix is float + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + + // c[0,48-63] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + + // c[1,48-63] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); + + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); - // c[2,48-63] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 ); + // c[2,48-63] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); - // c[3,32-47] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 ); + // c[3,32-47] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 ); - // c[3,48-63] - 
_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 ); + // c[3,48-63] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 ); + } } // 3x64 bf16 kernel @@ -1853,7 +2011,7 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1928,13 +2086,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); @@ -1945,13 +2098,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64) c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); @@ -1962,13 +2110,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64) c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 ); // Broadcast a[2,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 ); c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 ); @@ -1986,84 +2129,107 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); - - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); - - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); - c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); + + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 
); + c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // For the downscaled api (C-bf16), the output C matrix values + // needs to be upscaled to float to be used for beta scale. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[0,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = 
_mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[1,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 ); + // c[1,48-63] + BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) - // c[2,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 ); + // c[2,48-63] + BF16_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + 
F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[1,48-63] + F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + + // c[2,48-63] + F32_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -2402,82 +2568,134 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64) POST_OPS_DOWNSCALE_3x64: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[0, 48-63] - CVT_F32_BF16(c_float_0p3,0,3); + MULRND_F32(c_float_0p3,0,3); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[1, 48-63] - CVT_F32_BF16(c_float_1p3,1,3); + MULRND_F32(c_float_1p3,1,3); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); // c[2, 48-63] - CVT_F32_BF16(c_float_2p3,2,3); + MULRND_F32(c_float_2p3,2,3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_3x64_DISABLE: ; + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. 
+ __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[0, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3); + + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); + + // c[1, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3); + + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); + + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); + + // c[2, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3); + } + + // Case where the output C matrix is float + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[0,48-63] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + // c[0,48-63] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); - // c[1,48-63] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); + // c[1,48-63] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); - // c[2,48-63] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 ); + // c[2,48-63] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 ); + } } // 2x64 bf16 kernel @@ -2497,7 +2715,7 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64) dim_t k_full_pieces = k0 / 2; dim_t 
k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; __m512bh b1; @@ -2555,13 +2773,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); @@ -2572,13 +2785,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64) c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); @@ -2596,59 +2804,78 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); - - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // For the downscaled api (C-bf16), the output C matrix values + // needs to be upscaled to float to be used for beta scale. 
+ if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[0,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[1,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 ); + // c[1,48-63] + 
BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[1,48-63] + F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -2900,58 +3127,99 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64) POST_OPS_DOWNSCALE_2x64: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[0, 48-63] - CVT_F32_BF16(c_float_0p3,0,3); + MULRND_F32(c_float_0p3,0,3); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[1, 48-63] - CVT_F32_BF16(c_float_1p3,1,3); + MULRND_F32(c_float_1p3,1,3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_2x64_DISABLE: ; - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. 
+ __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[0, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); - // c[0,48-63] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3); + } + + // Case where the output C matrix is float + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[0,48-63] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[1,48-63] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + + // c[1,48-63] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 ); + } } // 1x64 bf16 kernel @@ -2971,7 +3239,7 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // Registers to use for accumulating C. __m512 c_float_0p0 = _mm512_setzero_ps(); @@ -3004,13 +3272,8 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64) __m512bh b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - __m512bh a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + __m512bh a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); __m512bh b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); __m512bh b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); @@ -3028,34 +3291,49 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 ); + } // Scale C by beta. if ( beta != 0) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // For the downscaled api (C-bf16), the output C matrix values + // needs to be upscaled to float to be used for beta scale. 
+ if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[0,48-63] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 ); + // c[0,48-63] + BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[0,48-63] + F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -3220,34 +3498,62 @@ LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64) POST_OPS_DOWNSCALE_1x64: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[0, 48-63] - CVT_F32_BF16(c_float_0p3,0,3); + MULRND_F32(c_float_0p3,0,3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_1x64_DISABLE: ; + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. 
+ if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). - // Store the accumulated results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[0, 48-63] + CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3); + } + + // Case where the output C matrix is float + else + { + // Store the accumulated results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[0,48-63] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + // c[0,48-63] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 ); + } } #endif #endif diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c index 1eab70432c..01b59d38cf 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c @@ -58,7 +58,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type 
__m512bh b0; @@ -68,10 +68,6 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16) // For corner cases. float buf0[16]; - float buf1[16]; - float buf2[16]; - float buf3[16]; - float buf4[16]; // Registers to use for accumulating C. __m512 c_float_0p0 = _mm512_setzero_ps(); @@ -129,40 +125,40 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15] c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 ); // Broadcast a[4,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15] @@ -173,50 +169,72 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf3, ( c + ( rs_c * 3 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf4, ( c + ( rs_c * 4 ) ), ( n0_rem * sizeof( float ) ) ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[0,0-15] - selector1 = _mm512_loadu_ps( buf0 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[0,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \ + selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( buf1 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( buf2 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \ + selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( buf3 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_3p0, 3, 0, \ + selector1, selector2 ); - // c[4,0-15] - selector1 = _mm512_loadu_ps( buf4 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[4,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_4p0, 4, 0, \ + selector1, selector2 ); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // c[0,0-15] + 
F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, 0, 2, 0, \ + selector1, selector2); + + // c[3,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_3p0, 0, 3, 0, \ + selector1, selector2); + + // c[4,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_4p0, 0, 4, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -396,57 +414,65 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16) POST_OPS_DOWNSCALE_5xLT16: { // c[0, 0-15] - CVT_F32_BF16_LT16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16_LT16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16_LT16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[3, 0-15] - CVT_F32_BF16_LT16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[4, 0-15] - CVT_F32_BF16_LT16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_5xLT16_DISABLE: - ; + ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + __mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( buf0, c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). 
+ // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[1,0-15] - _mm512_storeu_ps( buf1, c_float_1p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[2,0-15] - _mm512_storeu_ps( buf2, c_float_2p0 ); + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[3,0-15] - _mm512_storeu_ps( buf3, c_float_3p0 ); + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); - // c[4,0-15] - _mm512_storeu_ps( buf4, c_float_4p0 ); + // c[4,0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + } - // Memcpy partial parts. - // c[0,0-15] - memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) ); + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[1,0-15] - memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) ); + // Store the results. + // c[0,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 ); - // c[2,0-15] - memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) ); + // c[1,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 ); - // c[3,0-15] - memcpy( c + ( rs_c * 3 ) + ( 0*16 ), buf3, ( n0_rem * sizeof( float ) ) ); + // c[2,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 2 ), load_mask, c_float_2p0 ); - // c[4,0-15] - memcpy( c + ( rs_c * 4 ) + ( 0*16 ), buf4, ( n0_rem * sizeof( float ) ) ); + // c[3,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 3 ), load_mask, c_float_3p0 ); + // c[4,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 4 ), load_mask, c_float_4p0 ); + } } // 4xlt16 bf16 fringe kernel @@ -466,7 +492,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -476,9 +502,6 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16) // For corner cases. 
float buf0[16]; - float buf1[16]; - float buf2[16]; - float buf3[16]; // Registers to use for accumulating C. __m512 c_float_0p0 = _mm512_setzero_ps(); @@ -528,32 +551,32 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15] @@ -564,42 +587,62 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf3, ( c + ( rs_c * 3 ) ), ( n0_rem * sizeof( float ) ) ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[0,0-15] - selector1 = _mm512_loadu_ps( buf0 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[0,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \ + selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( buf1 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( buf2 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \ + selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( buf3 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_3p0, 3, 0, \ + selector1, selector2 ); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // c[0,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, 0, 2, 0, \ + selector1, selector2); + + // c[3,0-15] + 
F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_3p0, 0, 3, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -755,47 +798,55 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16) POST_OPS_DOWNSCALE_4xLT16: { // c[0, 0-15] - CVT_F32_BF16_LT16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16_LT16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16_LT16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[3, 0-15] - CVT_F32_BF16_LT16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_4xLT16_DISABLE: - ; + ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + __mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( buf0, c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[1,0-15] - _mm512_storeu_ps( buf1, c_float_1p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[2,0-15] - _mm512_storeu_ps( buf2, c_float_2p0 ); + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[3,0-15] - _mm512_storeu_ps( buf3, c_float_3p0 ); + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // Memcpy partial parts. - // c[0,0-15] - memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) ); + // Store the results. 
+ // c[0,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 ); - // c[1,0-15] - memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) ); + // c[1,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 ); - // c[2,0-15] - memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) ); + // c[2,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 2 ), load_mask, c_float_2p0 ); - // c[3,0-15] - memcpy( c + ( rs_c * 3 ) + ( 0*16 ), buf3, ( n0_rem * sizeof( float ) ) ); + // c[3,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 3 ), load_mask, c_float_3p0 ); + } } @@ -816,7 +867,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -826,8 +877,6 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16) // For corner cases. float buf0[16]; - float buf1[16]; - float buf2[16]; // Registers to use for accumulating C. __m512 c_float_0p0 = _mm512_setzero_ps(); @@ -867,24 +916,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] @@ -895,34 +944,52 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( float) ) ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[0,0-15] - selector1 = _mm512_loadu_ps( buf0 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[0,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \ + selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( buf1 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( buf2 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \ + selector1, selector2 ); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // c[0,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, 0, 2, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1054,38 +1121,47 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16) POST_OPS_DOWNSCALE_3xLT16: { // c[0, 0-15] - CVT_F32_BF16_LT16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16_LT16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - 
CVT_F32_BF16_LT16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_3xLT16_DISABLE: - ; + ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + __mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( buf0, c_float_0p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[1,0-15] - _mm512_storeu_ps( buf1, c_float_1p0 ); + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + } - // c[2,0-15] - _mm512_storeu_ps( buf2, c_float_2p0 ); + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // Memcpy partial parts. - // c[0,0-15] - memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) ); + // Store the results. + // c[0,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 ); - // c[1,0-15] - memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) ); + // c[1,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 ); - // c[2,0-15] - memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) ); + // c[2,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 2 ), load_mask, c_float_2p0 ); + } } @@ -1106,7 +1182,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1116,7 +1192,6 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16) // For corner cases. float buf0[16]; - float buf1[16]; // Registers to use for accumulating C. 
__m512 c_float_0p0 = _mm512_setzero_ps(); @@ -1147,16 +1222,16 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] @@ -1167,26 +1242,42 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float) ) ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[0,0-15] - selector1 = _mm512_loadu_ps( buf0 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[0,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \ + selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( buf1 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \ + selector1, selector2 ); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // c[0,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1294,29 +1385,38 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16) POST_OPS_DOWNSCALE_2xLT16: { // c[0, 0-15] - CVT_F32_BF16_LT16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16_LT16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_2xLT16_DISABLE: - ; + ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + __mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // Store the results. 
- // c[0,0-15] - _mm512_storeu_ps( buf0, c_float_0p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + } - // c[1,0-15] - _mm512_storeu_ps( buf1, c_float_1p0 ); + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // Memcpy partial parts. - // c[0,0-15] - memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) ); + // Store the results. + // c[0,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 ); - // c[1,0-15] - memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) ); + // c[1,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 ); + } } @@ -1337,7 +1437,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1xlt16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1368,8 +1468,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1xlt16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] @@ -1380,18 +1480,32 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1xlt16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[0,0-15] - selector1 = _mm512_loadu_ps( buf0 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[0,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \ + selector1, selector2 ); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // c[0,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1475,20 +1589,29 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1xlt16) POST_OPS_DOWNSCALE_1xLT16: { // c[0, 0-15] - CVT_F32_BF16_LT16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_1xLT16_DISABLE: - ; + ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + __mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( buf0, c_float_0p0 ); + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // Memcpy partial parts. - // c[0,0-15] - memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) ); + // Store the results. 
+ // c[0,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 ); + } } @@ -1509,7 +1632,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1573,40 +1696,40 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15] c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 ); // Broadcast a[4,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15] @@ -1617,44 +1740,69 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \ + selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, \ + selector1, selector2 ); - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[3,0-15] + BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, \ + selector1, selector2 ); + + // c[4,0-15] + BF16_F32_BETA_OP( c_float_4p0, 0, 4, 0, \ + selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0, 0, 2, 0, \ + selector1, selector2); + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0, 0, 3, 0, \ + selector1, selector2); + + // c[4,0-15] + F32_F32_BETA_OP(c_float_4p0, 0, 4, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1834,40 +1982,66 
@@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x16) POST_OPS_DOWNSCALE_5x16: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_5x16_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[4,0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); + } } // 4x16 bf16 kernel @@ -1887,7 +2061,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1942,32 +2116,32 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15] @@ -1978,37 +2152,59 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \ + selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, \ + selector1, selector2 ); + + // c[3,0-15] + BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, \ + selector1, selector2 ); + } + else + { + // 
c[0,0-15] + F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0, 0, 2, 0, \ + selector1, selector2); + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0, 0, 3, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -2164,34 +2360,57 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x16) POST_OPS_DOWNSCALE_4x16: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_4x16_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + } } // 3x16 bf16 kernel @@ -2211,7 +2430,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -2257,24 +2476,24 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] @@ -2285,45 +2504,65 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); - - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); - - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); - } - // Post Ops - lpgemm_post_op* post_ops_list_temp = post_ops_list; - POST_OP_LABEL_LASTK_SAFE_JUMP -POST_OPS_BIAS_3x16: - { - if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) || - ( *( char* )post_ops_list_temp->op_args2 == 'R' ) ) + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) { - selector1 = - _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ); // c[0,0-15] - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \ + selector1, selector2 ); + + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \ + selector1, selector2 ); + + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, \ + selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0, 0, 2, 0, \ + selector1, selector2); + } + + } + // Post Ops + lpgemm_post_op* post_ops_list_temp = post_ops_list; + POST_OP_LABEL_LASTK_SAFE_JUMP +POST_OPS_BIAS_3x16: + { + if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) || + ( *( char* )post_ops_list_temp->op_args2 == 'R' ) ) + { + selector1 = + _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ); + + // c[0,0-15] + c_float_0p0 = 
_mm512_add_ps( selector1, c_float_0p0 ); // c[1,0-15] c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); @@ -2440,28 +2679,48 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x16) POST_OPS_DOWNSCALE_3x16: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_3x16_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + } } // 2x16 bf16 kernel @@ -2481,7 +2740,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -2518,16 +2777,16 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] @@ -2538,23 +2797,40 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \ + selector1, selector2 ); + + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \ + selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \ + selector1, selector2); + } + } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -2662,22 +2938,39 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x16) POST_OPS_DOWNSCALE_2x16: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_2x16_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. 
+ __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + } } // 1x16 bf16 kernel @@ -2697,7 +2990,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -2725,8 +3018,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] @@ -2737,16 +3030,30 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \ + selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \ + selector1, selector2); + } + } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -2830,16 +3137,29 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x16) POST_OPS_DOWNSCALE_1x16: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_1x16_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + } + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + } } // 5x32 bf16 kernel @@ -2859,7 +3179,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -2936,8 +3256,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -2945,8 +3265,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31] @@ -2954,8 +3274,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 ); // Broadcast a[2,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31] @@ -2963,8 +3283,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 ); // Broadcast a[3,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31] @@ -2972,8 +3292,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 ); // Broadcast a[4,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31] @@ -2983,76 +3303,96 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) // Load alpha and beta __m512 selector1 = _mm512_set1_ps( alpha ); - __m512 selector2 = _mm512_set1_ps( beta ); + __m512 selector2 = _mm512_set1_ps( beta );\ - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0, 16-31] + BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1, 16-31] + BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[2, 16-31] + BF16_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 ); - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) ); - 
selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,0-15] + BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 ); - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[3, 16-31] + BF16_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 ); - // c[4,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 ); + // c[4,0-15] + BF16_F32_BETA_OP( c_float_4p0, 0, 4, 0, selector1, selector2 ); + + // c[4, 16-31] + BF16_F32_BETA_OP( c_float_4p1, 0, 4, 1, selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); + + // c[1,0-15] + F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); + + // c[1, 16-31] + F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); + + // c[2,0-15] + F32_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 ); + + // c[2, 16-31] + F32_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 ); + + // c[3,0-15] + F32_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 ); + + // c[3, 16-31] + F32_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 ); + + // c[4,0-15] + F32_F32_BETA_OP( c_float_4p0, 0, 4, 0, selector1, selector2 ); + + // c[4, 16-31] + F32_F32_BETA_OP( c_float_4p1, 0, 4, 1, selector1, selector2 ); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -3340,70 +3680,111 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32) POST_OPS_DOWNSCALE_5x32: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + 
MULRND_F32(c_float_0p1,0,1); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); // c[4, 16-31] - CVT_F32_BF16(c_float_4p1,4,1); + MULRND_F32(c_float_4p1,4,1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_5x32_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). 
+ // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); + + // c[4,0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + + // c[4, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1); + } + + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[4,16-31] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); + + // c[4,16-31] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 ); + } } // 4x32 bf16 kernel @@ -3423,7 +3804,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -3489,8 +3870,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -3498,8 +3879,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31] @@ -3507,8 +3888,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31] @@ -3516,8 +3897,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 ); // Broadcast a[3,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31] @@ -3529,61 +3910,79 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0, 16-31] + BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1, 16-31] + BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[2, 16-31] + BF16_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 ); - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) ); - 
selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,0-15] + BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 ); + + // c[3, 16-31] + BF16_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); + + // c[1,0-15] + F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); + + // c[1, 16-31] + F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); + + // c[2,0-15] + F32_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 ); + + // c[2, 16-31] + F32_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 ); + + // c[3,0-15] + F32_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 ); + + // c[3, 16-31] + F32_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 ); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -3826,58 +4225,93 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32) POST_OPS_DOWNSCALE_4x32: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_4x32_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. 
+ __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); + } - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + } } // 3x32 bf16 kernel @@ -3897,7 +4331,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -3952,8 +4386,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32) b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -3961,8 +4395,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32) c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); // Broadcast a[1,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31] @@ -3970,8 +4404,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32) c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31] @@ -3983,48 +4417,64 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0, 16-31] + BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1, 16-31] + BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 ); + + // c[2, 16-31] + BF16_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); + + // c[1,0-15] + F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); + + 
// c[1, 16-31] + F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); + + // c[2,0-15] + F32_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 ); + + // c[2, 16-31] + F32_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 ); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -4222,46 +4672,75 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32) POST_OPS_DOWNSCALE_3x32: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_3x32_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). 
+ // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); + } - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + } } // 2x32 bf16 kernel @@ -4281,7 +4760,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -4325,8 +4804,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32) b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -4334,8 +4813,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32) c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31] @@ -4347,35 +4826,49 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0, 16-31] + BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); + + // c[1, 16-31] + BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); + + // c[1,0-15] + F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 ); + + // c[1, 16-31] + F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 ); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -4528,34 +5021,56 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32) POST_OPS_DOWNSCALE_2x32: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - 
CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_2x32_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + } + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + } } // 1x32 bf16 kernel @@ -4575,7 +5090,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x32) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -4608,8 +5123,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x32) b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -4621,22 +5136,34 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x32) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 ); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -4744,22 +5271,39 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x32) POST_OPS_DOWNSCALE_1x32: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); 
// c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_1x32_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + } - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + } } // 5x48 bf16 kernel @@ -4779,7 +5323,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -4869,8 +5413,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47] @@ -4879,8 +5423,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47] @@ -4889,8 +5433,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47] @@ -4899,8 +5443,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 ); // Broadcast a[3,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47] @@ -4909,8 +5453,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 ); // Broadcast a[4,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47] @@ -4923,104 +5467,128 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p2 = _mm512_mul_ps( selector1, 
c_float_3p2 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); - c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // 
c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,16-31] + BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) - // c[3,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 ); + // c[3,32-47] + BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[4,0-15] + BF16_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2) - // c[4,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 1*16 ) ); - selector1 = 
_mm512_mul_ps( selector2, selector1 ); - c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 ); + // c[4,16-31] + BF16_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2) - // c[4,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 ); + // c[4,32-47] + BF16_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) + + // c[3,16-31] + F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) + + // c[3,32-47] + F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) + + // c[4,0-15] + F32_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2) + + // c[4,16-31] + F32_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2) + + // c[4,32-47] + F32_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -5371,147 +5939,205 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48) CLIP_F32_AVX512(c_float_0p0, min, max) // c[0, 16-31] - CLIP_F32_AVX512(c_float_0p1, min, max) + CLIP_F32_AVX512(c_float_0p1, min, max) + + // c[0, 32-47] + CLIP_F32_AVX512(c_float_0p2, min, max) + + // c[1, 0-15] + CLIP_F32_AVX512(c_float_1p0, min, max) + + // c[1, 
16-31] + CLIP_F32_AVX512(c_float_1p1, min, max) + + // c[1, 32-47] + CLIP_F32_AVX512(c_float_1p2, min, max) + + // c[2, 0-15] + CLIP_F32_AVX512(c_float_2p0, min, max) + + // c[2, 16-31] + CLIP_F32_AVX512(c_float_2p1, min, max) + + // c[2, 32-47] + CLIP_F32_AVX512(c_float_2p2, min, max) + + // c[3, 0-15] + CLIP_F32_AVX512(c_float_3p0, min, max) + + // c[3, 16-31] + CLIP_F32_AVX512(c_float_3p1, min, max) + + // c[3, 32-47] + CLIP_F32_AVX512(c_float_3p2, min, max) + + // c[4, 0-15] + CLIP_F32_AVX512(c_float_4p0, min, max) + + // c[4, 16-31] + CLIP_F32_AVX512(c_float_4p1, min, max) + + // c[4, 32-47] + CLIP_F32_AVX512(c_float_4p2, min, max) + + POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR + } + +POST_OPS_DOWNSCALE_5x48: + { + // c[0, 0-15] + MULRND_F32(c_float_0p0,0,0); + + // c[0, 16-31] + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CLIP_F32_AVX512(c_float_0p2, min, max) + MULRND_F32(c_float_0p2,0,2); // c[1, 0-15] - CLIP_F32_AVX512(c_float_1p0, min, max) + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CLIP_F32_AVX512(c_float_1p1, min, max) + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CLIP_F32_AVX512(c_float_1p2, min, max) + MULRND_F32(c_float_1p2,1,2); // c[2, 0-15] - CLIP_F32_AVX512(c_float_2p0, min, max) + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CLIP_F32_AVX512(c_float_2p1, min, max) + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CLIP_F32_AVX512(c_float_2p2, min, max) + MULRND_F32(c_float_2p2,2,2); // c[3, 0-15] - CLIP_F32_AVX512(c_float_3p0, min, max) + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CLIP_F32_AVX512(c_float_3p1, min, max) + MULRND_F32(c_float_3p1,3,1); // c[3, 32-47] - CLIP_F32_AVX512(c_float_3p2, min, max) + MULRND_F32(c_float_3p2,3,2); // c[4, 0-15] - CLIP_F32_AVX512(c_float_4p0, min, max) + MULRND_F32(c_float_4p0,4,0); // c[4, 16-31] - CLIP_F32_AVX512(c_float_4p1, min, max) + MULRND_F32(c_float_4p1,4,1); // c[4, 32-47] - CLIP_F32_AVX512(c_float_4p2, min, max) + MULRND_F32(c_float_4p2,4,2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } 
- -POST_OPS_DOWNSCALE_5x48: +POST_OPS_5x48_DISABLE: + ; + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); // c[3, 32-47] - CVT_F32_BF16(c_float_3p2,3,2); + CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); // c[4, 16-31] - CVT_F32_BF16(c_float_4p1,4,1); + CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1); // c[4, 32-47] - CVT_F32_BF16(c_float_4p2,4,2); - - POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR + CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2); } -POST_OPS_5x48_DISABLE: - ; - // Store the results. 
- // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); - // c[3,32-47] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 ); + // c[3,32-47] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 ); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * 4 
) + ( 0*16 ), c_float_4p0 ); + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 ); - // c[4,16-31] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 ); + // c[4,16-31] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 ); - // c[4,32-47] - _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 ); + // c[4,32-47] + _mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 ); + } } // 4x48 bf16 kernel @@ -5531,7 +6157,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -5608,8 +6234,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47] @@ -5618,8 +6244,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47] @@ -5628,8 +6254,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47] @@ -5638,8 +6264,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 ); // Broadcast a[3,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47] @@ -5652,85 +6278,106 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( 
rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,16-31] + BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) - // c[3,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 ); + // c[3,32-47] + BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + + // c[3,0-15] + 
F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2) + + // c[3,16-31] + F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2) + + // c[3,32-47] + F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -6062,82 +6709,131 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48) POST_OPS_DOWNSCALE_4x48: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); // c[3, 32-47] - CVT_F32_BF16(c_float_3p2,3,2); + MULRND_F32(c_float_3p2,3,2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_4x48_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). 
+ + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); + + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); + + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); + + // c[3, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); + + // c[3, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), 
c_float_2p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 ); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 ); // c[3,32-47] _mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 ); + } } // 3x48 bf16 kernel @@ -6157,7 +6853,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -6221,8 +6917,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48) b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47] @@ -6231,8 +6927,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48) c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); // Broadcast a[1,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47] @@ -6241,8 +6937,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48) c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 ); // Broadcast a[2,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47] @@ -6255,66 +6951,84 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( 
selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) - // c[2,0-15] - selector1 = 
_mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -6580,64 +7294,104 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48) POST_OPS_DOWNSCALE_3x48: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + 
MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_3x48_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); + + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); + + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 ); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 ); + } } // 2x48 bf16 kernel @@ -6657,7 +7411,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -6708,8 +7462,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48) b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+2]. 
- memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47] @@ -6718,8 +7472,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48) c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); // Broadcast a[1,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47] @@ -6732,47 +7486,62 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2) + + 
// c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2) + + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -6972,46 +7741,77 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48) POST_OPS_DOWNSCALE_2x48: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_2x48_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); + } - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 ); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 ); + } } // 1x48 bf16 kernel @@ -7031,7 +7831,7 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -7069,8 +7869,8 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48) b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+2]. - memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47] @@ -7083,28 +7883,40 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2) + + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -7238,28 +8050,50 @@ LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48) POST_OPS_DOWNSCALE_1x48: { // c[0, 0-15] 
- CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_1x48_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); + + // Store the results in downscaled type (bf16 instead of float). + + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); + } + + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 ); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 ); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 ); + } } #endif #endif diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c index 56795a00ba..c95c0090ae 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c @@ -62,7 +62,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh 
b0; @@ -72,11 +72,6 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16) // For corner cases. float buf0[16]; - float buf1[16]; - float buf2[16]; - float buf3[16]; - float buf4[16]; - float buf5[16]; dim_t value; @@ -238,78 +233,48 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15] c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 ); // Broadcast a[4,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15] c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 ); // Broadcast a[5,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15] @@ -320,58 +285,82 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - memcpy( buf0, ( c + ( rs_c * ( ir + 0 ) ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf1, ( c + ( rs_c * ( ir + 1 ) ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf2, ( c + ( rs_c * ( ir + 2 ) ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf3, ( c + ( rs_c * ( ir + 3 ) ) ), ( n0_rem * sizeof( float) ) ); - memcpy( buf4, ( c + ( rs_c * ( ir + 4 ) ) ), ( n0_rem * sizeof( float ) ) ); - memcpy( buf5, ( c + ( rs_c * ( ir + 5 ) ) ), ( n0_rem * sizeof( float ) ) ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[0,0-15] - selector1 = _mm512_loadu_ps( buf0 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[0,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \ + selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( buf1 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[1,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( buf2 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[2,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \ + selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( buf3 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[3,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_3p0, 3, 0, \ + selector1, selector2 ); - // c[4,0-15] - selector1 = _mm512_loadu_ps( buf4 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[4,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_4p0, 4, 0, \ + selector1, selector2 
); - // c[5,0-15] - selector1 = _mm512_loadu_ps( buf5 ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 ); + // c[5,0-15] + BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_5p0, 5, 0, \ + selector1, selector2 ); + } + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // c[0,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, ir, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, ir, 1, 0, \ + selector1, selector2); + + // c[2,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, ir, 2, 0, \ + selector1, selector2); + + // c[3,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_3p0, ir, 3, 0, \ + selector1, selector2); + + // c[4,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_4p0, ir, 4, 0, \ + selector1, selector2); + + // c[5,0-15] + F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_5p0, ir, 5, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -575,65 +564,76 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16) POST_OPS_DOWNSCALE_6xLT16: { // c[0, 0-15] - CVT_F32_BF16_LT16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16_LT16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16_LT16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[3, 0-15] - CVT_F32_BF16_LT16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[4, 0-15] - CVT_F32_BF16_LT16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); // c[5, 0-15] - CVT_F32_BF16_LT16(c_float_5p0,5,0); + MULRND_F32(c_float_5p0,5,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_6xLT16_DISABLE: ; - // Store the results. 
- // c[0,0-15] - _mm512_storeu_ps( buf0, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + __mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); - // c[1,0-15] - _mm512_storeu_ps( buf1, c_float_1p0 ); + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[2,0-15] - _mm512_storeu_ps( buf2, c_float_2p0 ); + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[3,0-15] - _mm512_storeu_ps( buf3, c_float_3p0 ); + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[4,0-15] - _mm512_storeu_ps( buf4, c_float_4p0 ); + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[4,0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + + // c[5,0-15] + CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0); + } - // c[5,0-15] - _mm512_storeu_ps( buf5, c_float_5p0 ); + else + { + __mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); + + // Store the results. + // c[0,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * ( ir + 0 ) ), load_mask, c_float_0p0 ); - // Memcpy partial parts. 
- // c[0,0-15] - memcpy( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) ); + // c[1,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * ( ir + 1 ) ), load_mask, c_float_1p0 ); - // c[1,0-15] - memcpy( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) ); + // c[2,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * ( ir + 2 ) ), load_mask, c_float_2p0 ); - // c[2,0-15] - memcpy( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) ); + // c[3,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * ( ir + 3 ) ), load_mask, c_float_3p0 ); - // c[3,0-15] - memcpy( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), buf3, ( n0_rem * sizeof( float ) ) ); + // c[4,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * ( ir + 4 ) ), load_mask, c_float_4p0 ); - // c[4,0-15] - memcpy( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), buf4, ( n0_rem * sizeof( float ) ) ); + // c[5,0-15] + _mm512_mask_storeu_ps( c + ( rs_c * ( ir + 5 ) ), load_mask, c_float_5p0 ); - // c[5,0-15] - memcpy( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), buf5, ( n0_rem * sizeof( float ) ) ); + } a = a + ( MR * ps_a ); post_ops_attr.post_op_c_i += MR; @@ -731,7 +731,7 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x16) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -899,78 +899,48 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x16) b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15] c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15] c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 ); // Broadcast a[2,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15] c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 ); // Broadcast a[3,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15] c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 ); // Broadcast a[4,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15] c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 ); // Broadcast a[5,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15] @@ -981,51 +951,79 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x16) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + } // Scale C by beta. 
if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, ir, 0, 0, \ + selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, ir, 1, 0, \ + selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, ir, 2, 0, \ + selector1, selector2 ); - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[3,0-15] + BF16_F32_BETA_OP( c_float_3p0, ir, 3, 0, \ + selector1, selector2 ); - // c[5,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 ); + // c[4,0-15] + BF16_F32_BETA_OP( c_float_4p0, ir, 4, 0, \ + selector1, selector2 ); + + // c[5,0-15] + BF16_F32_BETA_OP( c_float_5p0, ir, 5, 0, \ + selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0, ir, 0, 0, \ + selector1, selector2); + + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0, ir, 1, 0, \ + selector1, selector2); + + // 
c[2,0-15] + F32_F32_BETA_OP(c_float_2p0, ir, 2, 0, \ + selector1, selector2); + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0, ir, 3, 0, \ + selector1, selector2); + + // c[4,0-15] + F32_F32_BETA_OP(c_float_4p0, ir, 4, 0, \ + selector1, selector2); + + // c[5,0-15] + F32_F32_BETA_OP(c_float_5p0, ir, 5, 0, \ + selector1, selector2); + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -1229,46 +1227,75 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x16) POST_OPS_DOWNSCALE_6x16: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); // c[5, 0-15] - CVT_F32_BF16(c_float_5p0,5,0); + MULRND_F32(c_float_5p0,5,0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_6x16_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); + + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[4,0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + + // c[5,0-15] + CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0); + } + + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); - // c[5,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + // c[5,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + } a = a + ( MR * ps_a ); post_ops_attr.post_op_c_i += MR; @@ -1366,7 +1393,7 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -1562,13 +1589,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) ); // Broadcast a[0,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. 
// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -1576,13 +1598,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31] @@ -1590,13 +1607,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 ); // Broadcast a[2,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31] @@ -1604,13 +1616,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 ); // Broadcast a[3,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31] @@ -1618,13 +1625,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 ); // Broadcast a[4,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31] @@ -1632,13 +1634,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 ); // Broadcast a[5,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[5,0-31] = a[5,kr:kr+2]*b[kr:kr+2,0-31] @@ -1649,87 +1646,110 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = 
_mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); - c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); - c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 ); + c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 ); + } // Scale C by beta. if ( beta != 0 ) { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[0,0-15] + BF16_F32_BETA_OP( c_float_0p0, ir, 0, 0, selector1, selector2 ); - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[0, 16-31] + BF16_F32_BETA_OP( c_float_0p1, ir, 0, 1, selector1, selector2 ); - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[1,0-15] + BF16_F32_BETA_OP( c_float_1p0, ir, 1, 0, selector1, selector2 ); - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[1, 16-31] + 
BF16_F32_BETA_OP( c_float_1p1, ir, 1, 1, selector1, selector2 ); - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[2,0-15] + BF16_F32_BETA_OP( c_float_2p0, ir, 2, 0, selector1, selector2 ); - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[2, 16-31] + BF16_F32_BETA_OP( c_float_2p1, ir, 2, 1, selector1, selector2 ); - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[3,0-15] + BF16_F32_BETA_OP( c_float_3p0, ir, 3, 0, selector1, selector2 ); - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[3, 16-31] + BF16_F32_BETA_OP( c_float_3p1, ir, 3, 1, selector1, selector2 ); - // c[4,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 ); + // c[4,0-15] + BF16_F32_BETA_OP( c_float_4p0, ir, 4, 0, selector1, selector2 ); - // c[5,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 ); + // c[4, 16-31] + BF16_F32_BETA_OP( c_float_4p1, ir, 4, 1, selector1, selector2 ); - // c[5,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p1 = _mm512_add_ps( selector1, c_float_5p1 ); + // c[5,0-15] + BF16_F32_BETA_OP( c_float_5p0, ir, 5, 0, 
selector1, selector2 ); + + // c[5, 16-31] + BF16_F32_BETA_OP( c_float_5p1, ir, 5, 1, selector1, selector2 ); + } + else + { + // c[0,0-15] + F32_F32_BETA_OP( c_float_0p0, ir, 0, 0, selector1, selector2 ); + + // c[0, 16-31] + F32_F32_BETA_OP( c_float_0p1, ir, 0, 1, selector1, selector2 ); + + // c[1,0-15] + F32_F32_BETA_OP( c_float_1p0, ir, 1, 0, selector1, selector2 ); + + // c[1, 16-31] + F32_F32_BETA_OP( c_float_1p1, ir, 1, 1, selector1, selector2 ); + + // c[2,0-15] + F32_F32_BETA_OP( c_float_2p0, ir, 2, 0, selector1, selector2 ); + + // c[2, 16-31] + F32_F32_BETA_OP( c_float_2p1, ir, 2, 1, selector1, selector2 ); + + // c[3,0-15] + F32_F32_BETA_OP( c_float_3p0, ir, 3, 0, selector1, selector2 ); + + // c[3, 16-31] + F32_F32_BETA_OP( c_float_3p1, ir, 3, 1, selector1, selector2 ); + + // c[4,0-15] + F32_F32_BETA_OP( c_float_4p0, ir, 4, 0, selector1, selector2 ); + + // c[4, 16-31] + F32_F32_BETA_OP( c_float_4p1, ir, 4, 1, selector1, selector2 ); + + // c[5,0-15] + F32_F32_BETA_OP( c_float_5p0, ir, 5, 0, selector1, selector2 ); + + // c[5, 16-31] + F32_F32_BETA_OP( c_float_5p1, ir, 5, 1, selector1, selector2 ); + } + } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -2062,82 +2082,129 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32) POST_OPS_DOWNSCALE_6x32: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); 
+ MULRND_F32(c_float_4p0,4,0); // c[4, 16-31] - CVT_F32_BF16(c_float_4p1,4,1); + MULRND_F32(c_float_4p1,4,1); // c[5, 0-15] - CVT_F32_BF16(c_float_5p0,5,0); + MULRND_F32(c_float_5p0,5,0); // c[5, 16-31] - CVT_F32_BF16(c_float_5p1,5,1); + MULRND_F32(c_float_5p1,5,1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_6x32_DISABLE: ; + if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. + __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (int8 instead of int32). + // c[0,0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); + + // c[1,0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); + + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); + + // c[2,0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); + + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); + + // c[3,0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); + + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); + + // c[4,0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + + // c[4, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1); + + // c[5,0-15] + CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0); + + // c[5, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_5p1,5,1); + } + + else + { + // Store the results. 
+ // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 ); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 ); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 ); + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 ); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 ); + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 ); - // c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); - // c[4,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 ); + // c[4,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 ); - // c[5,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + // c[5,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); - // c[5,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 ); + // c[5,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 ); + } a = a + ( MR * ps_a ); 
post_ops_attr.post_op_c_i += MR; @@ -2235,7 +2302,7 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) dim_t k_full_pieces = k0 / 2; dim_t k_partial_pieces = k0 % 2; - int32_t a_kfringe_buf = 0; + int16_t a_kfringe_buf = 0; // B matrix storage bfloat type __m512bh b0; @@ -2462,13 +2529,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) ); // Broadcast a[0,kr:kr+4]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47] @@ -2477,13 +2539,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 ); // Broadcast a[1,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47] @@ -2492,13 +2549,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 ); // Broadcast a[2,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47] @@ -2507,13 +2559,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 ); // Broadcast a[3,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47] @@ -2522,13 +2569,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 ); // Broadcast a[4,kr:kr+2]. - memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47] @@ -2537,13 +2579,8 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 ); // Broadcast a[5,kr:kr+2]. 
- memcpy - ( - &a_kfringe_buf, - ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ), - ( k_partial_pieces * sizeof( bfloat16 ) ) - ); - a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf ); + a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces ))); + a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf ); // Perform column direction mat-mul with k = 2. // c[5,0-47] = a[5,kr:kr+2]*b[kr:kr+2,0-47] @@ -2556,123 +2593,150 @@ LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) __m512 selector1 = _mm512_set1_ps( alpha ); __m512 selector2 = _mm512_set1_ps( beta ); - // Scale by alpha - c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); - c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); - c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + if ( alpha != 1 ) + { + // Scale by alpha + c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 ); + c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 ); + c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 ); + + c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); + c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); + c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 ); + + c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); + c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); + c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + + c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); + c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); + c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + + c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); + c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); + c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); + + c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); + c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 ); + c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 ); + } - c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 ); - c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 ); - c_float_1p2 = 
_mm512_mul_ps( selector1, c_float_1p2 ); + // Scale C by beta. + if ( beta != 0 ) + { + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_first_k == TRUE ) ) + { + // c[0,0-15] + BF16_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2) - c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 ); - c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 ); - c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 ); + // c[0, 16-31] + BF16_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2) - c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 ); - c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 ); - c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 ); + // c[0,32-47] + BF16_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2) - c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 ); - c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 ); - c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 ); + // c[1,0-15] + BF16_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2) - c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 ); - c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 ); - c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 ); + // c[1,16-31] + BF16_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2) - // Scale C by beta. 
- if ( beta != 0 ) - { - // c[0,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 ); + // c[1,32-47] + BF16_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2) - // c[0, 16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 ); + // c[2,0-15] + BF16_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2) - // c[0,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 ); + // c[2,16-31] + BF16_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2) - // c[1,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 ); + // c[2,32-47] + BF16_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2) - // c[1,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 ); + // c[3,0-15] + BF16_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2) - // c[1,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 ); + // c[3,16-31] + BF16_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2) - // c[2,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 ); + // c[3,32-47] + BF16_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2) - // c[2,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 
1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 ); + // c[4,0-15] + BF16_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2) - // c[2,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 ); + // c[4,16-31] + BF16_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2) - // c[3,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 ); + // c[4,32-47] + BF16_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2) - // c[3,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 ); + // c[5,0-15] + BF16_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2) - // c[3,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 ); + // c[5,16-31] + BF16_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2) - // c[4,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 ); + // c[5,32-47] + BF16_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2) + } + else + { + // c[0,0-15] + F32_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2) - // c[4,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 ); + // c[0, 16-31] + F32_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2) - // c[4,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ) ); - 
selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 ); + // c[0,32-47] + F32_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2) - // c[5,0-15] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 ); + // c[1,0-15] + F32_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2) - // c[5,16-31] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p1 = _mm512_add_ps( selector1, c_float_5p1 ); + // c[1,16-31] + F32_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2) - // c[5,32-47] - selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ) ); - selector1 = _mm512_mul_ps( selector2, selector1 ); - c_float_5p2 = _mm512_add_ps( selector1, c_float_5p2 ); + // c[1,32-47] + F32_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2) + + // c[2,0-15] + F32_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2) + + // c[2,16-31] + F32_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2) + + // c[2,32-47] + F32_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2) + + // c[3,0-15] + F32_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2) + + // c[3,16-31] + F32_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2) + + // c[3,32-47] + F32_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2) + + // c[4,0-15] + F32_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2) + + // c[4,16-31] + F32_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2) + + // c[4,32-47] + F32_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2) + + // c[5,0-15] + F32_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2) + + // c[5,16-31] + F32_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2) + + // c[5,32-47] + F32_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2) + } } // Post Ops lpgemm_post_op* post_ops_list_temp = post_ops_list; @@ -3137,118 +3201,187 @@ 
LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48) POST_OPS_DOWNSCALE_6x48: { // c[0, 0-15] - CVT_F32_BF16(c_float_0p0,0,0); + MULRND_F32(c_float_0p0,0,0); // c[0, 16-31] - CVT_F32_BF16(c_float_0p1,0,1); + MULRND_F32(c_float_0p1,0,1); // c[0, 32-47] - CVT_F32_BF16(c_float_0p2,0,2); + MULRND_F32(c_float_0p2,0,2); // c[1, 0-15] - CVT_F32_BF16(c_float_1p0,1,0); + MULRND_F32(c_float_1p0,1,0); // c[1, 16-31] - CVT_F32_BF16(c_float_1p1,1,1); + MULRND_F32(c_float_1p1,1,1); // c[1, 32-47] - CVT_F32_BF16(c_float_1p2,1,2); + MULRND_F32(c_float_1p2,1,2); // c[2, 0-15] - CVT_F32_BF16(c_float_2p0,2,0); + MULRND_F32(c_float_2p0,2,0); // c[2, 16-31] - CVT_F32_BF16(c_float_2p1,2,1); + MULRND_F32(c_float_2p1,2,1); // c[2, 32-47] - CVT_F32_BF16(c_float_2p2,2,2); + MULRND_F32(c_float_2p2,2,2); // c[3, 0-15] - CVT_F32_BF16(c_float_3p0,3,0); + MULRND_F32(c_float_3p0,3,0); // c[3, 16-31] - CVT_F32_BF16(c_float_3p1,3,1); + MULRND_F32(c_float_3p1,3,1); // c[3, 32-47] - CVT_F32_BF16(c_float_3p2,3,2); + MULRND_F32(c_float_3p2,3,2); // c[4, 0-15] - CVT_F32_BF16(c_float_4p0,4,0); + MULRND_F32(c_float_4p0,4,0); // c[4, 16-31] - CVT_F32_BF16(c_float_4p1,4,1); + MULRND_F32(c_float_4p1,4,1); // c[4, 32-47] - CVT_F32_BF16(c_float_4p2,4,2); + MULRND_F32(c_float_4p2,4,2); // c[5, 0-15] - CVT_F32_BF16(c_float_5p0,5,0); + MULRND_F32(c_float_5p0,5,0); // c[5, 16-31] - CVT_F32_BF16(c_float_5p1,5,1); + MULRND_F32(c_float_5p1,5,1); // c[5, 32-47] - CVT_F32_BF16(c_float_5p2,5,2); + MULRND_F32(c_float_5p2,5,2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } POST_OPS_6x48_DISABLE: ; + // Case where the output C matrix is bf16 (downscaled) and this is the + // final write for a given block within C. + if ( ( post_ops_attr.buf_downscale != NULL ) && + ( post_ops_attr.is_last_k == TRUE ) ) + { + // Generate a mask16 of all 1's. 
+ __m512i selector_a = _mm512_setzero_epi32(); + __m512i selector_b = _mm512_set1_epi32( 10 ); + __mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b ); - // Store the results. - // c[0,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + // Store the results in downscaled type (bf16 instead of float). - // c[0, 16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 ); + // c[0, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0); - // c[0,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 ); + // c[0, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1); - // c[1,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); + // c[0, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2); - // c[1,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 ); + // c[1, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0); - // c[1,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 ); + // c[1, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1); - // c[2,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); + // c[1, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2); - // c[2,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 ); + // c[2, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0); - // c[2,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 ); + // c[2, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1); - // c[3,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); + // c[2, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2); - // c[3,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 ); + // c[3, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0); - // c[3,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 ); + // c[3, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1); - // 
c[4,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); + // c[3, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2); + + // c[4, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0); + + // c[4, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1); + + // c[4, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2); + + // c[5, 0-15] + CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0); + + // c[5, 16-31] + CVT_STORE_F32_BF16_MASK(c_float_5p1,5,1); + + // c[5, 32-47] + CVT_STORE_F32_BF16_MASK(c_float_5p2,5,2); + } + + else + { + // Store the results. + // c[0,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 ); + + // c[0, 16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 ); - // c[4,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 ); + // c[0,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 ); - // c[4,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 ); + // c[1,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 ); - // c[5,0-15] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + // c[1,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 ); - // c[5,16-31] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 ); + // c[1,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 ); - // c[5,32-47] - _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 ); + // c[2,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 ); + + // c[2,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 ); + + // c[2,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 ); + + // c[3,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 ); + + // c[3,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 ); + + // 
c[3,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 ); + + // c[4,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 ); + + // c[4,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 ); + + // c[4,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 ); + + // c[5,0-15] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 ); + + // c[5,16-31] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 ); + + // c[5,32-47] + _mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 ); + } a = a + ( MR * ps_a ); post_ops_attr.post_op_c_i += MR; From 2c4f032e0f9181629e037298b993b4bfc798a001 Mon Sep 17 00:00:00 2001 From: eashdash Date: Fri, 19 May 2023 09:11:26 +0000 Subject: [PATCH 095/226] Fix for lack of BF16 instruction when compiled with GCC-11 GCC-11 and below support AVX512-BF16. However, it doesn't support all the bf16 instructions required. For bf16 downscale APIs, when beta scaling is done, C output elements must be upscaled from BF16 type to Float type for beta scaling operation. For this upscaling operation of bf16 to float, _mm512_cvtpbh_ps is used. This however is not supported by GCC-11 and below (but is supported on GCC 12 onwards). Lack of this instruction support in GCC-11 and below leads to compilation issues with this instruction (_mm512_cvtpbh_ps) not being recognized. To fix this, we use a set of instructions: 1. register containing bf16 type __m256bh a1 2.
Convert bf16 to float with shift left ops __m512 float_a1 = (__m512) (_mm512_sllv_epi32 (_mm512_cvtepi16_epi32 ((__m256i) a1), _mm512_set1_epi32 (16))); AMD-Internal: [CPUPL-3454] Change-Id: Ie4a9f04881c59ced088608633774b27f22b4ab8e --- .../lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h index 325be38bf7..f3875647eb 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h @@ -69,15 +69,12 @@ // Downscale beta scale macro, scratch2=beta #define BF16_F32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ scratch1 = \ - _mm512_cvtpbh_ps \ - ( \ - (__m256bh)_mm256_loadu_epi16 \ + (__m512)( _mm512_sllv_epi32( _mm512_cvtepi16_epi32( (__m256i)_mm256_loadu_epi16 \ ( \ ( ( bfloat16* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ) )\ - ) \ - ); \ + ) ), _mm512_set1_epi32 (16) ) );\ F32_BETA_FMA(reg,scratch1,scratch2) \ // Default n < 16 mask load beta macro @@ -87,16 +84,14 @@ // Downscale n < 16 mask load beta macro #define BF16_F32_BETA_OP_NLT16F_MASK(lmask,reg,m_ind,n_ind,scratch1,scratch2) \ - scratch1 = _mm512_cvtpbh_ps \ - ( \ - (__m256bh)_mm256_maskz_loadu_epi16 \ + scratch1 = \ + (__m512)( _mm512_sllv_epi32( _mm512_cvtepi16_epi32( (__m256i)_mm256_maskz_loadu_epi16 \ ( \ lmask, \ ( bfloat16* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ) \ - ) \ - ); \ + ) ), _mm512_set1_epi32 (16) ) );\ F32_BETA_FMA(reg,scratch1,scratch2) \ #define MULRND_F32(reg,m_ind,n_ind) \ From 5f5bc2498937d7ac5a64ff97fb48464e4dc4005a Mon Sep 17 00:00:00 2001 From: Mangala V Date: Mon, 15 May 2023 23:58:24 +0530 Subject: 
[PATCH 096/226] Bug fix: AVX2 code being invoked on non-avx2 machine for ZGEMM API Prevented calling avx2 based bli_zgemm_ref_k1_nn code on non-supported systems. Changed the name of the function bli_zgemm_ref_k1_nn to bli_zgemm_4x6_avx2_k1_nn(). Changed the name of the function bli_dgemm_ref_k1_nn to bli_dgemm_8x6_avx2_k1_nn(). Thanks to Kiran Varaganti for identifying and helping to fix the issue. AMD-Internal: [CPUPL-3352] Change-Id: I02530ab197ed84c96cbad4f7dd56eedca0109c35 --- frame/compat/bla_gemm_amd.c | 296 ++++++++++-------- kernels/zen/3/CMakeLists.txt | 6 +- ...bli_dgemm_ref_k1.c => bli_dgemm_avx2_k1.c} | 4 +- ...bli_zgemm_ref_k1.c => bli_zgemm_avx2_k1.c} | 4 +- kernels/zen/bli_kernels_zen.h | 4 +- 5 files changed, 169 insertions(+), 145 deletions(-) rename kernels/zen/3/{bli_dgemm_ref_k1.c => bli_dgemm_avx2_k1.c} (99%) rename kernels/zen/3/{bli_zgemm_ref_k1.c => bli_zgemm_avx2_k1.c} (99%) diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index 68f765976a..afbecd2a58 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -432,41 +432,41 @@ void dgemm_blis_impl double* c, const f77_int* ldc ) { - trans_t blis_transa; - trans_t blis_transb; - dim_t m0, n0, k0; + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; - /* Initialize BLIS. */ - bli_init_auto(); + /* Initialize BLIS. */ + bli_init_auto(); - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \ - (void*)alpha, *lda, *ldb, (void*)beta, *ldc); + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \ + (void*)alpha, *lda, *ldb, (void*)beta, *ldc); - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemm) - ( - MKSTR(d), - MKSTR(gemm), - transa, - transb, - m, - n, - k, - lda, - ldb, - ldc - ); + /* Perform BLAS parameter checking. 
*/ + PASTEBLACHK(gemm) + ( + MKSTR(d), + MKSTR(gemm), + transa, + transb, + m, + n, + k, + lda, + ldb, + ldc + ); - /* Quick return if possible. */ - if ( *m == 0 || *n == 0 || ((*alpha == 0.0 || *k == 0) && *beta == 1.0)) - { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } + /* Quick return if possible. */ + if ( *m == 0 || *n == 0 || ((*alpha == 0.0 || *k == 0) && *beta == 1.0)) + { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } /* If alpha is zero scale C by beta and return early. */ if( PASTEMAC(d,eq0)( *alpha )) @@ -494,7 +494,7 @@ void dgemm_blis_impl return; } - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans(*transa, &blis_transa); bli_param_map_netlib_to_blis_trans(*transb, &blis_transb); @@ -564,92 +564,92 @@ void dgemm_blis_impl if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb)) { - bli_dgemm_ref_k1_nn( m0, n0, k0, - (double*)alpha, - (double*)a, *lda, - (double*)b, *ldb, - (double*)beta, - c, *ldc - ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS */ - bli_finalize_auto(); - return; + bli_dgemm_8x6_avx2_k1_nn( m0, n0, k0, + (double*)alpha, + (double*)a, *lda, + (double*)b, *ldb, + (double*)beta, + c, *ldc + ); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS */ + bli_finalize_auto(); + return; } if (n0 == 1) { - if (bli_is_notrans(blis_transa)) - { - bli_dgemv_unf_var2( - BLIS_NO_TRANSPOSE, - bli_extract_conj(blis_transb), - m0, k0, - (double*)alpha, - (double*)a, rs_a, cs_a, - (double*)b, 
bli_is_notrans(blis_transb) ? rs_b : cs_b, - (double*)beta, - c, rs_c, - ((void*)0) - ); - } - else - { - bli_dgemv_unf_var1( - blis_transa, - bli_extract_conj(blis_transb), - k0, m0, - (double*)alpha, - (double*)a, rs_a, cs_a, - (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, - (double*)beta, - c, rs_c, - ((void*)0) - ); - } + if (bli_is_notrans(blis_transa)) + { + bli_dgemv_unf_var2( + BLIS_NO_TRANSPOSE, + bli_extract_conj(blis_transb), + m0, k0, + (double*)alpha, + (double*)a, rs_a, cs_a, + (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, + (double*)beta, + c, rs_c, + ((void*)0) + ); + } + else + { + bli_dgemv_unf_var1( + blis_transa, + bli_extract_conj(blis_transb), + k0, m0, + (double*)alpha, + (double*)a, rs_a, cs_a, + (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, + (double*)beta, + c, rs_c, + ((void*)0) + ); + } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS */ - bli_finalize_auto(); - return; + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS */ + bli_finalize_auto(); + return; } else if (m0 == 1) { - if (bli_is_notrans(blis_transb)) - { - bli_dgemv_unf_var1( - blis_transb, - bli_extract_conj(blis_transa), - n0, k0, - (double*)alpha, - (double*)b, cs_b, rs_b, - (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, - (double*)beta, - c, cs_c, - ((void*)0) - ); - } - else - { - bli_dgemv_unf_var2( - blis_transb, - bli_extract_conj(blis_transa), - k0, n0, - (double*)alpha, - (double*)b, cs_b, rs_b, - (double*)a, bli_is_notrans(blis_transa) ? 
cs_a : rs_a, - (double*)beta, - c, cs_c, - ((void*)0) - ); - } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS */ - bli_finalize_auto(); - return; + if (bli_is_notrans(blis_transb)) + { + bli_dgemv_unf_var1( + blis_transb, + bli_extract_conj(blis_transa), + n0, k0, + (double*)alpha, + (double*)b, cs_b, rs_b, + (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, + (double*)beta, + c, cs_c, + ((void*)0) + ); + } + else + { + bli_dgemv_unf_var2( + blis_transb, + bli_extract_conj(blis_transa), + k0, n0, + (double*)alpha, + (double*)b, cs_b, rs_b, + (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, + (double*)beta, + c, cs_c, + ((void*)0) + ); + } + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS */ + bli_finalize_auto(); + return; } const num_t dt = BLIS_DOUBLE; @@ -687,26 +687,26 @@ void dgemm_blis_impl if (is_parallel) #endif { - // Will call parallelized dgemm code - sup & native - PASTEMAC(gemm, BLIS_OAPI_EX_SUF) - ( - &alphao, - &ao, - &bo, - &betao, - &co, - NULL, - NULL - ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + // Will call parallelized dgemm code - sup & native + PASTEMAC(gemm, BLIS_OAPI_EX_SUF) + ( + &alphao, + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; } - // The code below will be called when number of threads = 1. +// The code below will be called when number of threads = 1. 
#ifdef BLIS_ENABLE_SMALL_MATRIX @@ -813,18 +813,18 @@ void zgemm_blis_impl dcomplex* c, const f77_int* ldc ) { - trans_t blis_transa; - trans_t blis_transb; - dim_t m0, n0, k0; + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; - /* Initialize BLIS. */ - bli_init_auto(); + /* Initialize BLIS. */ + bli_init_auto(); - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k, + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k, (void*)alpha, *lda, *ldb, (void*)beta, *ldc); - /* Perform BLAS parameter checking. */ + /* Perform BLAS parameter checking. */ PASTEBLACHK(gemm) ( MKSTR(z), @@ -924,6 +924,30 @@ void zgemm_blis_impl //dim_t nt = bli_thread_get_num_threads(); // get number of threads bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked. + // This function is invoked on all architectures including 'generic'. + // Non-AVX2+FMA3 platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx2fma3_supported() == FALSE) + { + + // Will call parallelized zgemm code - sup & native + PASTEMAC(gemm, BLIS_OAPI_EX_SUF) + ( + &alphao, + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + /* Invoking the API for input sizes with k=1. - For single thread, the API has no constraints before invoking. 
@@ -933,7 +957,7 @@ void zgemm_blis_impl && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb)) { - bli_zgemm_ref_k1_nn( m0, n0, k0, + bli_zgemm_4x6_avx2_k1_nn( m0, n0, k0, (dcomplex*)alpha, (dcomplex*)a, *lda, (dcomplex*)b, *ldb, diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt index b7187e59e5..741d46e2ca 100644 --- a/kernels/zen/3/CMakeLists.txt +++ b/kernels/zen/3/CMakeLists.txt @@ -1,11 +1,11 @@ -##Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## add_library(zen_3 OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_small.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_ref_k1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_ref_k1.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_avx2_k1.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_avx2_k1.c ) target_compile_options(zen_3 PRIVATE /arch:AVX2) diff --git a/kernels/zen/3/bli_dgemm_ref_k1.c b/kernels/zen/3/bli_dgemm_avx2_k1.c similarity index 99% rename from kernels/zen/3/bli_dgemm_ref_k1.c rename to kernels/zen/3/bli_dgemm_avx2_k1.c index 14fa99ada3..b225fdad1a 100644 --- a/kernels/zen/3/bli_dgemm_ref_k1.c +++ b/kernels/zen/3/bli_dgemm_avx2_k1.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ #define D_MR 8 #define D_NR 6 -void bli_dgemm_ref_k1_nn +void bli_dgemm_8x6_avx2_k1_nn ( dim_t m, dim_t n, diff --git a/kernels/zen/3/bli_zgemm_ref_k1.c b/kernels/zen/3/bli_zgemm_avx2_k1.c similarity index 99% rename from kernels/zen/3/bli_zgemm_ref_k1.c rename to kernels/zen/3/bli_zgemm_avx2_k1.c index 60353cced0..a6a92f9a54 100644 --- a/kernels/zen/3/bli_zgemm_ref_k1.c +++ b/kernels/zen/3/bli_zgemm_avx2_k1.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -107,7 +107,7 @@ NEG_PERM_M_FRINGE(rin_0,rn); \ rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0); \ -void bli_zgemm_ref_k1_nn +void bli_zgemm_4x6_avx2_k1_nn ( dim_t m, dim_t n, diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index c90dbc0e00..e6a2f33f92 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -303,7 +303,7 @@ err_t bli_zgemm_small_At cntl_t* cntl ); -void bli_dgemm_ref_k1_nn +void bli_dgemm_8x6_avx2_k1_nn ( dim_t m, dim_t n, @@ -315,7 +315,7 @@ void bli_dgemm_ref_k1_nn double* c, const inc_t ldc ); -void bli_zgemm_ref_k1_nn +void bli_zgemm_4x6_avx2_k1_nn ( dim_t m, dim_t n, From 6911d2dd21ec5c71f485e015008511216e652ab8 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 12 May 2023 12:27:25 -0400 Subject: [PATCH 097/226] zen config make_defs.mk improvements Improvements to zen make_defs.mk files: * Add -znver4 flag for GCC 13 and later. 
* Add AVX512 flags or -znver4 as appropriate for upstream LLVM in config/zen4/make_defs.mk to enable BLIS to be built with LLVM rather than AOCC. * zen make_defs.mk files were inheriting settings from the previous one (zen->zen2->zen3->zen4), when they should be independent of each other. Correct by including config/zen/amd_config.mk in all zen make_defs.mk files to reinitialize the compiler flags. * Update zen2 and zen3 make_defs.mk for recent AOCC compiler releases, rather than rely on LLVM settings. * Remove -mfpmath=sse flag in config/zen4/make_defs.mk as this is already specified in amd_config.mk (and should be the default setting anyway). * Tidy files to simplify nested if structures and be more consistent with one another. AMD-Internal: [CPUPL-3399] Change-Id: Ice64ccedd90c2660fdee8b485348a6b405cfc5ac --- config/zen/make_defs.mk | 45 ++++++++--- config/zen2/make_defs.mk | 88 +++++++++++++-------- config/zen3/make_defs.mk | 112 +++++++++++++------------- config/zen4/make_defs.mk | 166 ++++++++++++++++++++------------------- 4 files changed, 234 insertions(+), 177 deletions(-) diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index 2dc30a229a..59fc7b0a67 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -33,8 +33,9 @@ # # -# FLAGS specific to zen architecture are added here. -# FLAGS that are common for all the AMD architectures are present in amd_config.mk +# FLAGS that are specific to the 'zen' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# config/zen/amd_config.mk.
# Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. @@ -46,10 +47,27 @@ AMD_CONFIG_FILE := amd_config.mk AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen -include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. + +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) + CDBGFLAGS := -g +endif + ifeq ($(DEBUG_TYPE),noopt) -COPTFLAGS := -O0 + COPTFLAGS := -O0 else -COPTFLAGS := -O3 + COPTFLAGS := -O3 endif # @@ -61,20 +79,21 @@ endif # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS += -march=znver1 -GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) -ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) -CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse -endif # GCC 9 -endif + CKVECFLAGS += -march=znver1 + GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) + + ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse + endif +endif# gcc # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) + CRVECFLAGS := $(CKVECFLAGS) else -CRVECFLAGS := $(CKVECFLAGS) + CRVECFLAGS := $(CKVECFLAGS) endif # Store all of the variables here to new variables containing the diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk index 03d34c7072..180c201b06 100644 --- a/config/zen2/make_defs.mk +++ b/config/zen2/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2022, Advanced Micro Devices, Inc. 
All rights reserved. +# Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -42,6 +42,11 @@ THIS_CONFIG := zen2 #CONFIGS_INCL += $(THIS_CONFIG) +# Include file containing common flags for all AMD architectures +AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + # # --- Determine the C compiler and related flags --- # @@ -56,49 +61,68 @@ CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) -CDBGFLAGS := -g + CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) -COPTFLAGS := -O0 + COPTFLAGS := -O0 else -COPTFLAGS := -O3 + COPTFLAGS := -O3 endif # Flags specific to optimized kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer + +# gcc or clang version must be at least 4.0 ifeq ($(CC_VENDOR),gcc) -GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) -#gcc or clang version must be atleast 4.0 -# gcc 9.0 or later: -ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse -else -# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 -# as the fallback option. -CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -endif -else + GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. 
-f1)) + + ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) + # gcc 9.0 or later + CKVECFLAGS += -march=znver2 + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse + else + # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. + CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + endif +endif # gcc + ifeq ($(CC_VENDOR),clang) -ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) -CKVECFLAGS += -march=znver2 -else -#if compiling with clang -VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) -CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) -#clang 9.0 or later: -ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -else -CKVECFLAGS += -march=znver1 -endif -endif -endif -endif + # AOCC clang has various formats for the version line + + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) + + # For our purpose we just want to know if it version 2x or 3x or 4x + + # But also set these in case we are using upstream LLVM clang + VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) + CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. 
-f1)) + + ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1) + # AOCC version 4x we will enable znver2 + CKVECFLAGS += -march=znver2 + else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) + # AOCC version 3x we will enable znver2 + CKVECFLAGS += -march=znver2 + else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) + # AOCC version 2x we will enable znver2 + CKVECFLAGS += -march=znver2 + else ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) + # LLVM clang 9.0 or later + CKVECFLAGS += -march=znver2 + else + CKVECFLAGS += -march=znver1 + endif +endif # clang # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk index bcf91e7484..7ec1ee32e9 100644 --- a/config/zen3/make_defs.mk +++ b/config/zen3/make_defs.mk @@ -42,6 +42,11 @@ THIS_CONFIG := zen3 #CONFIGS_INCL += $(THIS_CONFIG) +# Include file containing common flags for all AMD architectures +AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + # # --- Determine the C compiler and related flags --- # @@ -56,76 +61,77 @@ CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) -CDBGFLAGS := -g + CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) -COPTFLAGS := -O0 + COPTFLAGS := -O0 else -COPTFLAGS := -O3 + COPTFLAGS := -O3 endif # Flags specific to optimized kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer + +# gcc or clang version must be at least 4.0 ifeq ($(CC_VENDOR),gcc) -GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. 
-f1)) -# gcc or clang version must be atleast 4.0 -# gcc 9.0 or later: -ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0) -CKVECFLAGS += -march=znver3 -# Update CKOPTFLAGS for gcc to use O3 optimization without -# -ftree-pre and -ftree-partial-pre flag. These flag results -# in suboptimal code gen for instrinsic based kernels. -# The -ftree-loop-vectorize results in ineffecient code gen -# for amd optimized l1 kernels based on instrinsics. -CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse -else -ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse -else -# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 -# as the fallback option. -CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -endif # GCC 9 -endif # GCC 11 -else + GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) + + ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0) + # gcc 11.0 or later + CKVECFLAGS += -march=znver3 + # Update CKOPTFLAGS for gcc to use O3 optimization without + # -ftree-pre and -ftree-partial-pre flag. These flag results + # in suboptimal code generation for instrinsic based kernels. + # The -ftree-loop-vectorize results in inefficient code gen + # for amd optimized l1 kernels based on instrinsics. + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse + else ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) + # gcc 9.0 or later + CKVECFLAGS += -march=znver2 + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse + else + # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. 
+ CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + endif +endif # gcc + ifeq ($(CC_VENDOR),clang) + # AOCC clang has various formats for the version line -# AOCC clang has various formats for the version line + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) -# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) -# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) -# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) -# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) -# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # For our purpose we just want to know if it version 2x or 3x or 4x -# For our purpose we just want to know if it version 2x or 3x + # But also set these in case we are using upstream LLVM clang + VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) + CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. 
-f1)) -# for version 3x we will enable znver3 -ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) -CKVECFLAGS += -march=znver3 -else -# for version 2x we will enable znver2 -ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) -CKVECFLAGS += -march=znver2 -else -#if compiling with clang -VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) -CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) -#clang 9.0 or later: -ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -else -CKVECFLAGS += -march=znver1 -endif # ge 9 -endif # aocc 2 -endif # aocc 3 + ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1) + # AOCC version 4x we will enable znver3 + CKVECFLAGS += -march=znver3 + else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) + # AOCC version 3x we will enable znver3 + CKVECFLAGS += -march=znver3 + else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) + # AOCC version 2x we will enable znver2 + CKVECFLAGS += -march=znver2 + else ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) + # LLVM clang 9.0 or later + CKVECFLAGS += -march=znver2 + else + CKVECFLAGS += -march=znver1 + endif endif # clang -endif # gcc # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk index c2b78d06ca..5a058e2fbc 100644 --- a/config/zen4/make_defs.mk +++ b/config/zen4/make_defs.mk @@ -41,6 +41,11 @@ THIS_CONFIG := zen4 #CONFIGS_INCL += $(THIS_CONFIG) +# Include file containing common flags for all AMD architectures +AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + # # --- Determine the C compiler and related flags --- # @@ -55,98 +60,101 @@ CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) -CDBGFLAGS := -g + CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) -COPTFLAGS := -O0 + COPTFLAGS := -O0 else -COPTFLAGS := -O3 + COPTFLAGS := -O3 endif # Flags specific to optimized kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer -ifeq ($(CC_VENDOR),gcc) -GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) - - -# gcc 11.0 or later: -ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0) -CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -mfpmath=sse -CRVECFLAGS += -march=znver3 -# Update CKOPTFLAGS for gcc to use O3 optimization without -# -ftree-pre and -ftree-partial-pre flag. These flag results -# in suboptimal code generation for instrinsic based kernels. -# The -ftree-loop-vectorize results in ineffecient code gen -# for amd optimized l1 kernels based on instrinsics. 
-CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -else -# gcc 9.0 or later: -ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse -CRVECFLAGS += -march=znver2 -CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -else -ifeq ($(shell test $(GCC_VERSION) -ge 8; echo $$?),0) -CKVECFLAGS += -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse -CRVECFLAGS += -march=znver1 -else -# If gcc is older than 8.0.0 but at least 6.1.0, then we can use -march=znver1 -# as the fallback option. -CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -endif # GCC 8 -endif # GCC 9 -endif # GCC 11 -else -ifeq ($(CC_VENDOR),clang) - -# AOCC clang has various formats for the version line - -# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) -# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) -# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) -# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) -# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) -# AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) - -# For our purpose we just want to know if it version 2x or 3x or 4x -# for version 4x we will enable znver4 -ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1) -CKVECFLAGS += -march=znver4 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512bf16 -mfpmath=sse -falign-loops=64 - -CRVECFLAGS += -march=znver4 -else -# for version 3x we will enable znver3 -ifeq ($(strip 
$(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) -CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -mfpmath=sse -falign-loops=64 +# gcc or clang version must be at least 4.0 +ifeq ($(CC_VENDOR),gcc) + GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) + + ifeq ($(shell test $(GCC_VERSION) -ge 13; echo $$?),0) + # gcc 13.0 or later + CKVECFLAGS += -march=znver4 + CRVECFLAGS += -march=znver4 + # Update CKOPTFLAGS for gcc to use O3 optimization without + # -ftree-pre and -ftree-partial-pre flag. These flag results + # in suboptimal code generation for instrinsic based kernels. + # The -ftree-loop-vectorize results in inefficient code gen + # for amd optimized l1 kernels based on instrinsics. + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize + else ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0) + # gcc 11.0 or later + CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 + CRVECFLAGS += -march=znver3 + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize + else ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) + # gcc 9.0 or later + CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni + CRVECFLAGS += -march=znver2 + CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize + else ifeq ($(shell test $(GCC_VERSION) -ge 8; echo $$?),0) + # gcc 8.0 or later + CKVECFLAGS += -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni + CRVECFLAGS += -march=znver1 + else + # If gcc is older than 8.0.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. 
+ CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + endif +endif # gcc -CRVECFLAGS += -march=znver3 -else -# for version 2x we will enable znver2 -ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) -CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse -CRVECFLAGS += -march=znver2 -else -#if compiling with clang -VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) -CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) -#clang 9.0 or later: -ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -CRVECFLAGS += -march=znver2 -else -CKVECFLAGS += -march=znver1 -CRVECFLAGS += -march=znver1 -endif # ge 9 -endif # aocc 2 -endif # aocc 3 -endif # aocc 4 +ifeq ($(CC_VENDOR),clang) + # AOCC clang has various formats for the version line + + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) + + # For our purpose we just want to know if it version 2x or 3x or 4x + + # But also set these in case we are using upstream LLVM clang + VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) + CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. 
-f1)) + + ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1) + # AOCC version 4x we will enable znver4 + CKVECFLAGS += -march=znver4 -falign-loops=64 + CRVECFLAGS += -march=znver4 + else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) + # AOCC version 3x we will enable znver3 + CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64 + CRVECFLAGS += -march=znver3 + else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) + # AOCC version 2x we will enable znver2 + CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni + CRVECFLAGS += -march=znver2 + else ifeq ($(shell test $(CC_MAJOR) -ge 16; echo $$?),0) + # LLVM clang 16.0 or later + CKVECFLAGS += -march=znver4 -falign-loops=64 + CRVECFLAGS += -march=znver4 + else ifeq ($(shell test $(CC_MAJOR) -ge 13; echo $$?),0) + # LLVM clang 13.0 or later + CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64 + CRVECFLAGS += -march=znver3 + else ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) + # LLVM clang 9.0 or later + CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64 + CRVECFLAGS += -march=znver2 + else + CKVECFLAGS += -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -falign-loops=64 + CRVECFLAGS += -march=znver1 + endif endif # clang -endif # gcc # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) From ea2eea50979c3d387850c87dfef0507feb193975 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 19 May 2023 10:26:26 -0400 Subject: [PATCH 098/226] BLIS: Missing clobbers (batch 1) Add missing clobbers in first batch of assembly kernels: - zen3 bli_gemmsup* - bli_zgemm_zen4_asm_12x4 - bli_gemmsup_rv_haswell_asm_sMx6 AMD-Internal: [CPUPL-3456] Change-Id: I33c321043a197b2b885cfd6cd589532fc633a6a1 --- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c | 16 +++- .../zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c | 33 ++++++- .../zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c | 18 +++- .../zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c | 16 +++- .../zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c | 12 ++- .../zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c | 10 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c | 9 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c | 11 ++- .../zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c | 9 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c | 93 ++++++++++++++++++- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c | 9 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c | 22 ++++- .../zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c | 11 ++- .../zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c | 9 +- kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c | 9 +- 15 files changed, 268 insertions(+), 19 deletions(-) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c index ac4e1ee0b0..1d80111ea8 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -662,6 +662,9 @@ void bli_sgemmsup_rv_haswell_asm_6x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) } @@ -1236,6 +1239,9 @@ void bli_sgemmsup_rv_haswell_asm_5x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "memory" ) } @@ -1723,6 +1729,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } @@ -2211,6 +2219,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "memory" ) } @@ -2611,6 +2621,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", + "ymm6", "memory" ) } @@ -3000,6 +3012,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "memory" ) } @@ -3389,6 +3402,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "memory" ) } diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c index c309c8c0cd..3c47a910bb 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c +++ 
b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c @@ -2,8 +2,10 @@ BLIS An object-based framework for developing high-performance BLAS-like libraries. + Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2022 , Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -15,6 +17,7 @@ - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -27,7 +30,9 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ + #include "blis.h" + #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* @@ -328,6 +333,9 @@ void bli_sgemmsup_rd_zen_asm_2x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -560,6 +568,8 @@ void bli_sgemmsup_rd_zen_asm_1x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", + "ymm7", "ymm10", "ymm13", "memory" ) } @@ -858,6 +868,9 @@ void bli_sgemmsup_rd_zen_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1088,6 +1101,8 @@ void bli_sgemmsup_rd_zen_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", + "ymm7", "ymm10", "ymm13", "memory" ) } @@ -1354,6 +1369,9 @@ void bli_sgemmsup_rd_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1568,6 +1586,8 @@ void bli_sgemmsup_rd_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", + "ymm7", "ymm10", "ymm13", "memory" ) } @@ -1792,6 +1812,8 @@ void bli_sgemmsup_rd_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", + "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1979,6 +2001,8 @@ void bli_sgemmsup_rd_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + 
"ymm0", "ymm1", "ymm3", "ymm4", + "ymm5", "memory" ) } @@ -2370,6 +2394,10 @@ void bli_sgemmsup_rd_zen_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", + "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -2664,6 +2692,9 @@ void bli_sgemmsup_rd_zen_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", + "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "memory" ) } diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c index 00773b3b58..6d1d001b50 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -556,6 +556,10 @@ void bli_sgemmsup_rd_zen_asm_6x16m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -1035,6 +1039,10 @@ void bli_sgemmsup_rd_zen_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -1517,6 +1525,10 @@ void bli_sgemmsup_rd_zen_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -1923,6 +1935,10 @@ void bli_sgemmsup_rd_zen_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c index dfe5ca28af..6b84594e39 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c +++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -594,6 +594,10 @@ void bli_sgemmsup_rd_zen_asm_6x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1061,6 +1065,10 @@ void bli_sgemmsup_rd_zen_asm_3x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1471,6 +1479,10 @@ void bli_sgemmsup_rd_zen_asm_2x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", + "ymm15", "memory" ) @@ -1828,6 +1840,8 @@ void bli_sgemmsup_rd_zen_asm_1x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", + "ymm7", "ymm10", "ymm13", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c index bdbdfd0455..d07ee3ec07 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c +++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c @@ -3,7 +3,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022 , Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -396,6 +396,9 @@ void bli_zgemmsup_rd_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) @@ -686,6 +689,8 @@ void bli_zgemmsup_rd_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) @@ -1025,6 +1030,9 @@ void bli_zgemmsup_rd_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) @@ -1301,6 +1309,8 @@ void bli_zgemmsup_rd_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c index 9cf359af05..b8243a04ed 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022 , Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -605,6 +605,10 @@ void bli_zgemmsup_rd_zen_asm_3x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1060,6 +1064,10 @@ void bli_zgemmsup_rd_zen_asm_3x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c index 6d628027d3..8223e756f3 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c +++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022 , Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -587,6 +587,10 @@ void bli_zgemmsup_rd_zen_asm_3x4n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1030,6 +1034,9 @@ void bli_zgemmsup_rd_zen_asm_2x4n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c index 03c1627f15..386c2ca8f0 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -548,6 +548,9 @@ void bli_cgemmsup_rv_zen_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -910,6 +913,8 @@ void bli_cgemmsup_rv_zen_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1286,6 +1291,8 @@ void bli_cgemmsup_rv_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } @@ -1604,6 +1611,8 @@ void bli_cgemmsup_rv_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c index 07fbd26296..f92b1cc17b 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -739,6 +739,10 @@ void bli_cgemmsup_rv_zen_asm_3x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1230,6 +1234,9 @@ void bli_cgemmsup_rv_zen_asm_3x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c index 7befbb69bb..2cb3a844cc 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-2022, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -717,12 +717,16 @@ void bli_sgemmsup_rv_zen_asm_5x16 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1213,12 +1217,16 @@ void bli_sgemmsup_rv_zen_asm_4x16 [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1779,6 +1787,10 @@ void bli_sgemmsup_rv_zen_asm_3x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2172,6 +2184,10 @@ void bli_sgemmsup_rv_zen_asm_2x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2533,6 +2549,10 @@ void bli_sgemmsup_rv_zen_asm_1x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", 
"xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2981,6 +3001,10 @@ void bli_sgemmsup_rv_zen_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -3434,6 +3458,10 @@ void bli_sgemmsup_rv_zen_asm_5x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -3800,6 +3828,10 @@ void bli_sgemmsup_rv_zen_asm_4x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -4195,6 +4227,10 @@ void bli_sgemmsup_rv_zen_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -4504,6 +4540,10 @@ void bli_sgemmsup_rv_zen_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -4767,6 +4807,10 @@ void bli_sgemmsup_rv_zen_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", 
"ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5168,6 +5212,10 @@ void bli_sgemmsup_rv_zen_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5556,6 +5604,10 @@ void bli_sgemmsup_rv_zen_asm_5x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5894,6 +5946,9 @@ void bli_sgemmsup_rv_zen_asm_4x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6219,6 +6274,9 @@ void bli_sgemmsup_rv_zen_asm_3x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6492,6 +6550,9 @@ void bli_sgemmsup_rv_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6746,6 +6807,10 @@ void bli_sgemmsup_rv_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "memory" ) } @@ -7133,6 +7198,9 @@ void bli_sgemmsup_rv_zen_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", 
"ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -7506,6 +7574,9 @@ void bli_sgemmsup_rv_zen_asm_5x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -7842,6 +7913,10 @@ void bli_sgemmsup_rv_zen_asm_4x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8144,6 +8219,10 @@ void bli_sgemmsup_rv_zen_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8406,6 +8485,10 @@ void bli_sgemmsup_rv_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8643,6 +8726,10 @@ void bli_sgemmsup_rv_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2","ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "memory" ) } diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c index d5e2135a66..19acd5a1b6 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-2022, Advanced Micro Devices, Inc. 
+ Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -895,6 +895,10 @@ void bli_sgemmsup_rv_zen_asm_6x16m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1439,6 +1443,9 @@ void bli_sgemmsup_rv_zen_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c index f46244d668..eb690e9f6c 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2021, Advanced Micro Devices, Inc. + Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -855,6 +855,10 @@ void bli_sgemmsup_rv_zen_asm_6x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1621,6 +1625,10 @@ void bli_sgemmsup_rv_zen_asm_5x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "memory" ) @@ -2230,6 +2238,9 @@ void bli_sgemmsup_rv_zen_asm_4x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) @@ -2876,6 +2887,10 @@ void bli_sgemmsup_rv_zen_asm_3x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -3366,6 +3381,9 @@ void bli_sgemmsup_rv_zen_asm_2x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm11", "ymm12", "memory" ) @@ -3821,6 +3839,8 @@ void bli_sgemmsup_rv_zen_asm_1x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c index 787d3f772b..298ede7204 100644 --- 
a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-2021, Advanced Micro Devices, Inc. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -515,6 +515,9 @@ void bli_zgemmsup_rv_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) @@ -875,6 +878,8 @@ void bli_zgemmsup_rv_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) @@ -1236,6 +1241,8 @@ void bli_zgemmsup_rv_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } @@ -1543,6 +1550,8 @@ void bli_zgemmsup_rv_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "memory" ) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c index 1c3c386a0c..804e196e12 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -798,6 +798,10 @@ void bli_zgemmsup_rv_zen_asm_3x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1260,6 +1264,9 @@ void bli_zgemmsup_rv_zen_asm_3x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) diff --git a/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c b/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c index 795f61849b..4fc69acd15 100644 --- a/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c +++ b/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c @@ -1071,12 +1071,15 @@ void bli_zgemm_zen4_asm_12x4( [beta_mul_type] "m"(beta_mul_type) : // register clobber list "rax", "rbx", "rcx", "rdi", "rsi", "r9", "r10", "r12", "r14", - "xmm8", "xmm9", "xmm10", "zmm0", "zmm1", "zmm2", + "xmm8", "xmm9", "xmm10", + "ymm8", "ymm9", + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", - "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", - "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory") + "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" + ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } From e960141fe2712e22a8a8f2daa3d6dc866b12d171 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Mon, 22 May 2023 11:41:37 -0400 Subject: [PATCH 099/226] BLIS: Missing clobbers (batch 2) Add missing clobbers in other zen4 kernels. 
AMD-Internal: [CPUPL-3456] Change-Id: I5cceb44fe100e03269cfe21d8c4c0d2171b921c3 --- kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c | 2 +- kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c | 2 +- kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c | 6 +- kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c | 6 +- .../3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c | 16 +++++ kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c | 68 +++++++++++++++++++ .../zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c | 18 +++++ .../zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c | 14 ++++ kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c | 6 ++ .../zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c | 16 +++++ .../zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c | 6 ++ .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c | 6 ++ 19 files changed, 204 insertions(+), 4 deletions(-) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c index e355da8bff..081ba3a4a8 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c @@ -731,7 +731,7 @@ void bli_dpackm_zen4_asm_24xk "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", - "memory" + "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c index 5ab39dae5f..9cbafadb24 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c @@ -383,7 +383,7 @@ void bli_dpackm_zen4_asm_8xk "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", 
"zmm13", "zmm14", "zmm15", - "memory" + "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c index f6e67e8c6c..1bc1dd3530 100644 --- a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c @@ -1659,10 +1659,14 @@ void bli_dgemmtrsm_l_zen_asm_16x14 : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "xmm0", "xmm1", + "ymm0", "ymm1", "ymm4", "ymm6", "ymm20", "ymm21", "ymm22", "ymm23", + "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", - "zmm30", "zmm31", "memory" + "zmm30", "zmm31", + "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9); diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c index 4a033103af..b0700b76de 100644 --- a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c @@ -1696,10 +1696,14 @@ void bli_dgemmtrsm_u_zen_asm_16x14 : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "xmm0", "xmm1", + "ymm0", "ymm1", "ymm4", "ymm6", "ymm20", "ymm21", "ymm22", "ymm23", + "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", - "zmm30", "zmm31", "memory" + "zmm30", "zmm31", + "memory" ) 
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9); diff --git a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c index c7b618fb7e..503f615a3b 100644 --- a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c +++ b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c @@ -1786,6 +1786,8 @@ void bli_dgemmsup_rv_zen4_asm_24x8m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3281,6 +3283,8 @@ void bli_dgemmsup_rv_zen4_asm_24x7m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4664,6 +4668,8 @@ void bli_dgemmsup_rv_zen4_asm_24x6m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -5930,6 +5936,8 @@ void bli_dgemmsup_rv_zen4_asm_24x5m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -7030,6 +7038,8 @@ void bli_dgemmsup_rv_zen4_asm_24x4m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", 
"zmm12", "zmm13", "zmm14", "zmm15", @@ -8011,6 +8021,8 @@ void bli_dgemmsup_rv_zen4_asm_24x3m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -8877,6 +8889,8 @@ void bli_dgemmsup_rv_zen4_asm_24x2m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -9627,6 +9641,8 @@ void bli_dgemmsup_rv_zen4_asm_24x1m : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c index 9852be166a..96fa63e95d 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c @@ -452,6 +452,12 @@ void bli_sgemmsup_rd_zen_asm_5x64_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm23", "ymm24", "ymm26", "ymm27", + "ymm29", "ymm30", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -852,6 +858,11 @@ void 
bli_sgemmsup_rd_zen_asm_4x64_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm23", "ymm26", "ymm29", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1234,6 +1245,10 @@ void bli_sgemmsup_rd_zen_asm_3x64_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1599,6 +1614,10 @@ void bli_sgemmsup_rd_zen_asm_2x64_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm17", "ymm18", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1950,6 +1969,10 @@ void bli_sgemmsup_rd_zen_asm_1x64_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm6", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", + "ymm14", "ymm17", "zmm0", "zmm1", 
"zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2368,6 +2391,12 @@ void bli_sgemmsup_rd_zen_asm_5x48_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm23", "ymm24", "ymm26", "ymm27", + "ymm29", "ymm30", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2775,6 +2804,11 @@ void bli_sgemmsup_rd_zen_asm_4x48_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm23", "ymm26", "ymm29", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3156,6 +3190,10 @@ void bli_sgemmsup_rd_zen_asm_3x48_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3522,6 +3560,10 @@ void bli_sgemmsup_rd_zen_asm_2x48_avx512 : // register clobber list "rax", "rbx", 
"rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm17", "ymm18", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3873,6 +3915,9 @@ void bli_sgemmsup_rd_zen_asm_1x48_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm6", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "ymm17", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4290,6 +4335,12 @@ void bli_sgemmsup_rd_zen_asm_5x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm23", "ymm24", "ymm26", "ymm27", + "ymm29", "ymm30", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4696,6 +4747,11 @@ void bli_sgemmsup_rd_zen_asm_4x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm23", "ymm26", "ymm29", "zmm0", "zmm1", 
"zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -5078,6 +5134,10 @@ void bli_sgemmsup_rd_zen_asm_3x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -5443,6 +5503,10 @@ void bli_sgemmsup_rd_zen_asm_2x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm17", "ymm18", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -5794,6 +5858,10 @@ void bli_sgemmsup_rd_zen_asm_1x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm6", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", + "ymm14", "ymm17", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c index fa6a40bd70..1e0ce1c4c4 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c @@ -590,6 +590,12 @@ void bli_sgemmsup_rd_zen_asm_6x64m_avx512 : // register clobber 
list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", + "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1111,6 +1117,12 @@ void bli_sgemmsup_rd_zen_asm_6x48m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", + "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1632,6 +1644,12 @@ void bli_sgemmsup_rd_zen_asm_6x32m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", + "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c 
b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c index 5aa21cff2c..145d3b5201 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c @@ -530,6 +530,12 @@ void bli_sgemmsup_rd_zen_asm_6x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", + "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", + "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -940,6 +946,10 @@ void bli_sgemmsup_rd_zen_asm_3x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1343,6 +1353,10 @@ void bli_sgemmsup_rd_zen_asm_2x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm17", "ymm18", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c 
b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c index e47bbd6a7b..a69d016b38 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c @@ -354,6 +354,7 @@ void bli_sgemmsup_rv_zen_asm_5x48_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -668,6 +669,7 @@ void bli_sgemmsup_rv_zen_asm_5x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -973,6 +975,7 @@ void bli_sgemmsup_rv_zen_asm_5x16_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1273,6 +1276,7 @@ void bli_sgemmsup_rv_zen_asm_3x48_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1562,6 +1566,7 @@ void bli_sgemmsup_rv_zen_asm_3x32_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", 
"xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1840,6 +1845,7 @@ void bli_sgemmsup_rv_zen_asm_3x16_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c index 9fe45581a7..23f43052e8 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c @@ -621,6 +621,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1071,6 +1072,7 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1505,6 +1507,7 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1928,6 +1931,7 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 : // register clobber list "rax", "rbx", 
"rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2302,6 +2306,7 @@ void bli_sgemmsup_rv_zen_asm_4x64m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm1", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2604,6 +2609,7 @@ void bli_sgemmsup_rv_zen_asm_4x48m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm1", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2897,6 +2903,7 @@ void bli_sgemmsup_rv_zen_asm_4x32m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm1", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3182,6 +3189,7 @@ void bli_sgemmsup_rv_zen_asm_4x16m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm1", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3466,6 +3474,7 @@ void bli_sgemmsup_rv_zen_asm_2x64m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3741,6 +3750,7 @@ 
void bli_sgemmsup_rv_zen_asm_2x48m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4007,6 +4017,7 @@ void bli_sgemmsup_rv_zen_asm_2x32m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4263,6 +4274,7 @@ void bli_sgemmsup_rv_zen_asm_2x16m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4513,6 +4525,7 @@ void bli_sgemmsup_rv_zen_asm_1x64m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4756,6 +4769,7 @@ void bli_sgemmsup_rv_zen_asm_1x48m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4991,6 +5005,7 @@ void bli_sgemmsup_rv_zen_asm_1x32m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", 
"r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -5221,6 +5236,7 @@ void bli_sgemmsup_rv_zen_asm_1x16m_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c index 51b152f0c0..e4ce3d1490 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c @@ -487,6 +487,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1022,6 +1023,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1515,6 +1517,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm1", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", 
@@ -2009,6 +2012,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2462,6 +2466,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2896,6 +2901,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c index 4f82f8895b..da76bab068 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c @@ -1075,6 +1075,8 @@ void bli_dgemmsup_rv_zen4_asm_24x1 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1689,6 +1691,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", 
"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2221,6 +2225,8 @@ void bli_dgemmsup_rv_zen4_asm_8x1 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c index bdada9417a..442cfdde31 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c @@ -1197,6 +1197,8 @@ void bli_dgemmsup_rv_zen4_asm_24x2 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -1901,6 +1903,8 @@ void bli_dgemmsup_rv_zen4_asm_16x2 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2492,6 +2496,8 @@ void bli_dgemmsup_rv_zen4_asm_8x2 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c index fa3b4cf042..9f17bed812 100644 --- 
a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c @@ -1313,6 +1313,8 @@ void bli_dgemmsup_rv_zen4_asm_24x3 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2103,6 +2105,8 @@ void bli_dgemmsup_rv_zen4_asm_16x3 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2752,6 +2756,8 @@ void bli_dgemmsup_rv_zen4_asm_8x3 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c index 52dbf10912..2a5f6a8b03 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c @@ -1429,6 +1429,8 @@ void bli_dgemmsup_rv_zen4_asm_24x4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2311,6 +2313,8 @@ void bli_dgemmsup_rv_zen4_asm_16x4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", 
"zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3020,6 +3024,8 @@ void bli_dgemmsup_rv_zen4_asm_8x4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c index 05cfa12441..7326688cde 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c @@ -1599,6 +1599,8 @@ void bli_dgemmsup_rv_zen4_asm_24x5 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2614,6 +2616,8 @@ void bli_dgemmsup_rv_zen4_asm_16x5 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3421,6 +3425,8 @@ void bli_dgemmsup_rv_zen4_asm_8x5 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c index 42335cac7f..95d6588854 100644 --- 
a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c @@ -1718,6 +1718,8 @@ void bli_dgemmsup_rv_zen4_asm_24x6 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -2823,6 +2825,8 @@ void bli_dgemmsup_rv_zen4_asm_16x6 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3689,6 +3693,8 @@ void bli_dgemmsup_rv_zen4_asm_8x6 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c index c3471d8d68..42cb6d648e 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c @@ -1835,6 +1835,8 @@ void bli_dgemmsup_rv_zen4_asm_24x7 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3027,6 +3029,8 @@ void bli_dgemmsup_rv_zen4_asm_16x7 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", 
"zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3950,6 +3954,8 @@ void bli_dgemmsup_rv_zen4_asm_8x7 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c index 32471c0d25..d7497c3db0 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c @@ -1915,6 +1915,8 @@ void bli_dgemmsup_rv_zen4_asm_24x8 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -3190,6 +3192,8 @@ void bli_dgemmsup_rv_zen4_asm_16x8 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", @@ -4164,6 +4168,8 @@ void bli_dgemmsup_rv_zen4_asm_8x8 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm2", "xmm31", + "ymm2", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", From 03965a4f07f0181ac4d9fb3c2c090053268add07 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Mon, 22 May 2023 11:54:38 -0400 Subject: [PATCH 100/226] BLIS: Missing clobbers (batch 3) Add missing clobbers in 
haswell (non-sup) kernels. AMD-Internal: [CPUPL-3456] Change-Id: I68f6ad0c01557fcde73b1775d250d48b5162c521 --- kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c | 2 ++ kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c | 2 ++ kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c | 2 ++ kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c | 2 ++ kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c | 3 +++ kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c | 2 ++ kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c | 2 ++ kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c | 2 ++ kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 12 ++++++++++++ kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 12 ++++++++++++ kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 6 ++++++ kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 6 ++++++ 12 files changed, 53 insertions(+) diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c index ab42e06aa9..ba31a1aad1 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c @@ -337,6 +337,8 @@ void bli_cpackm_haswell_asm_3xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm4", "ymm6", + "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c index a101e66d18..2c888b98e1 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c @@ -357,6 +357,8 @@ void bli_cpackm_haswell_asm_8xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c index 
b64f26591d..3a192a0eaf 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c @@ -341,6 +341,8 @@ void bli_dpackm_haswell_asm_6xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c index 0cfa2e8d68..eae1507f7e 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c @@ -361,6 +361,8 @@ void bli_dpackm_haswell_asm_8xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c index 40ac22bc55..d0e0ba2b8c 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c @@ -510,6 +510,9 @@ void bli_spackm_haswell_asm_16xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c index 3a134bed8f..a9f05653f3 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c @@ -383,6 +383,8 @@ void bli_spackm_haswell_asm_6xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) } diff --git 
a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c index 06fcf1438a..672b797b96 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c @@ -343,6 +343,8 @@ void bli_zpackm_haswell_asm_3xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm11", "ymm12", "memory" ) } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c index 25a8b6181e..f9b0c6cdd8 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c @@ -353,6 +353,8 @@ void bli_zpackm_haswell_asm_4xk "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm12", "ymm14", "memory" ) } diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 1515f292ec..0d6cf99f2a 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -907,6 +907,9 @@ void bli_sgemm_haswell_asm_6x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -1664,6 +1667,9 @@ void bli_dgemm_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -2197,6 +2203,9 @@ void 
bli_cgemm_haswell_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2799,6 +2808,9 @@ void bli_zgemm_haswell_asm_3x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index 5df6d7a88a..682956db7e 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -635,6 +635,9 @@ void bli_sgemm_haswell_asm_16x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); @@ -1222,6 +1225,9 @@ void bli_dgemm_haswell_asm_8x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); @@ -1755,6 +1761,9 @@ void bli_cgemm_haswell_asm_8x3 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2283,6 +2292,9 @@ void bli_zgemm_haswell_asm_4x3 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + 
"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index 5fd21e883a..610274a389 100644 --- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -822,6 +822,9 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1569,6 +1572,9 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9); diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index f2032c69d8..429c085021 100644 --- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -826,6 +826,9 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1577,6 +1580,9 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) 
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9); From a3adfb68cfcdb26b202bd0e9c92b1540828ea9fa Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Mon, 22 May 2023 14:37:25 -0400 Subject: [PATCH 101/226] BLIS: Missing clobbers (batch 4) Add missing clobbers in haswell (sup) kernels. AMD-Internal: [CPUPL-3456] Change-Id: I19fa97b85f75c8b8fe15d31b13768f937cc5e4cc --- .../haswell/1m/bli_packm_haswell_asm_c3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_c8xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_d6xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_d8xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_s16xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_s6xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_z3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_z4xk.c | 2 +- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 2 +- kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 1 + .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 2 +- .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 2 +- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c | 59 ++++++++++++++++++- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c | 11 +++- .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c | 17 +++++- .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c | 11 +++- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 49 ++++++++++++++- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c | 16 ++++- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 16 ++++- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c | 16 ++++- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c | 7 ++- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c | 9 ++- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 8 ++- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c | 8 ++- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 9 ++- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c | 12 +++- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 14 ++++- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 13 +++- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c | 8 ++- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c | 8 ++-
.../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c | 8 ++- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c | 9 ++- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c | 8 ++- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c | 8 ++- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c | 13 +++- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c | 13 +++- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c | 11 +++- kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c | 2 +- kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c | 2 +- kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c | 2 +- 40 files changed, 350 insertions(+), 40 deletions(-) diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c index ba31a1aad1..78e76589dc 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c index 2c888b98e1..61ace6945d 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c index 3a192a0eaf..e2982dbfeb 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c index eae1507f7e..e3b00a71e7 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c index d0e0ba2b8c..b049fcdb5c 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c index a9f05653f3..c05c36b66f 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c index 672b797b96..cb025c1f01 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c index f9b0c6cdd8..e407fedf9f 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 0d6cf99f2a..f0a8fe34c3 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index 682956db7e..02ea97b155 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index 610274a389..939cab78f2 100644 --- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index 429c085021..bd9d338b3c 100644 --- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c index a16bd36386..dc81b2d913 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -684,6 +684,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1640,6 +1643,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1932,6 +1938,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -2914,6 +2923,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -3313,6 +3325,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -4197,6 +4212,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", 
"ymm13", "ymm14", "ymm15", "memory" ) @@ -4650,6 +4668,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -5530,6 +5551,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -6218,6 +6242,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -7020,6 +7047,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -7900,6 +7930,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -8437,6 +8470,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -9303,6 +9339,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L "xmm4", "xmm5", 
"xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -9711,6 +9750,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -10125,6 +10167,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -11442,6 +11487,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -12828,6 +12876,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -13367,6 +13418,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -13983,6 +14037,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", 
"xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c index 5908d80f2c..65c985ef1a 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -740,6 +740,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1326,6 +1329,9 @@ void bli_dgemmsup_rd_haswell_asm_3x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1858,6 +1864,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) @@ -2333,6 +2341,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) diff --git 
a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c index f764bc613e..9962e1a95e 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -717,6 +717,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1328,6 +1331,9 @@ void bli_sgemmsup_rd_haswell_asm_6x12m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1941,6 +1947,9 @@ void bli_sgemmsup_rd_haswell_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -2525,6 +2534,9 @@ void bli_sgemmsup_rd_haswell_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -3157,6 +3169,9 @@ void bli_sgemmsup_rd_haswell_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", 
"xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c index 1fe862a8d1..3af06075a8 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -752,6 +752,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1348,6 +1351,9 @@ void bli_sgemmsup_rd_haswell_asm_3x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1885,6 +1891,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) @@ -2362,6 +2370,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) diff --git 
a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index d20058c5b3..05c240d2d1 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -896,6 +896,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1593,6 +1596,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -1952,6 +1957,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) @@ -2197,7 +2204,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm3", "ymm4", "ymm12", "ymm14", + "memory" ) } @@ -2844,6 +2852,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", 
"ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -3463,6 +3474,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -4092,6 +4106,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -4696,6 +4713,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -5222,6 +5242,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5779,6 +5802,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6335,6 +6361,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", 
"ymm15", "memory" ) } @@ -6586,6 +6615,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm5", "ymm7", "memory" ) } @@ -6961,6 +6991,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm5", + "ymm7", "ymm9", "ymm11", "memory" ) } @@ -7448,6 +7480,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm13", "ymm15", "memory" ) } @@ -8087,6 +8122,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8783,6 +8821,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -9441,6 +9482,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) @@ -10074,6 +10117,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) diff --git 
a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c index 036338fbf7..4cdc763b67 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -872,6 +872,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1622,6 +1625,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "memory" ) @@ -2281,6 +2287,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) @@ -2961,6 +2969,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -3522,6 +3533,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", 
"xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm5", "ymm6", "ymm7", "memory" ) @@ -4047,6 +4060,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c index c299047ff9..d1c251bcbd 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022, Advanced Micro Devices, Inc. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1030,6 +1030,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1897,6 +1900,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -2610,6 +2616,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) @@ -3356,6 +3364,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", 
"xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) @@ -4025,6 +4035,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) @@ -4664,6 +4676,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c index 7463707cc9..af4ab52a02 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -982,6 +982,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1914,6 +1917,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "memory" ) @@ -2672,6 +2678,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) @@ -3491,6 +3499,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) @@ -4129,6 +4140,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm5", "ymm6", "ymm7", "memory" ) @@ -4780,6 +4793,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c index 08869010ba..6d9dd365ee 100644 --- 
a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -570,6 +570,8 @@ void bli_dgemmsup_rd_haswell_asm_6x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", "ymm4", "ymm6", "ymm8", + "ymm10", "ymm12", "ymm14", "memory" ) } @@ -979,6 +981,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", "ymm4", "ymm6", "ymm8", "memory" ) } @@ -1353,6 +1356,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", "ymm4", "ymm6", "memory" ) } @@ -1692,6 +1696,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", "ymm4", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c index af498eb0ee..94a8e9639e 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -612,6 +612,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1045,6 +1048,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "memory" ) } @@ -1437,6 +1442,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1788,6 +1794,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index bfc90d79a6..01e2d0a3dd 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -588,6 +588,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1038,6 +1041,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1441,6 +1446,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c index 03f9560952..9b97a40a45 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -679,6 +679,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1192,6 +1195,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1610,6 +1615,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c index 1a64eb9360..7c2fd21e1e 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -550,6 +550,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) } @@ -1000,6 +1001,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) } @@ -1408,6 +1410,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) } @@ -1813,6 +1816,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "memory" ) } @@ -2163,6 +2168,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "memory" ) } @@ -2490,6 +2496,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c index 607ed2b437..ad43e7ba57 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -572,6 +572,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) } @@ -1043,6 +1045,8 @@ void bli_dgemmsup_rv_haswell_asm_5x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "memory" ) } @@ -1463,6 +1467,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "memory" ) } @@ -1890,6 +1896,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "memory" ) } @@ -2253,6 +2261,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) } @@ -2594,6 +2603,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index 7e7de3fdb7..9f80ef2f0d 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -688,6 +688,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1263,6 +1266,8 @@ void bli_dgemmsup_rv_haswell_asm_5x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } @@ -1761,6 +1766,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2276,6 +2283,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2707,6 +2716,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -3084,6 +3095,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", 
"memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c index 49339445a5..2a04011f37 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -804,6 +804,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1399,6 +1402,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } @@ -1907,6 +1912,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2441,6 +2448,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2864,6 +2873,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", 
"xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -3254,6 +3264,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c index 1eb8d926c9..fe6d124d32 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -582,6 +582,9 @@ void bli_sgemmsup_rd_haswell_asm_6x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -997,6 +1000,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "memory" ) } @@ -1375,6 +1379,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1719,6 +1724,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git 
a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c index 1d3d88309f..b7b0b46a1b 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -628,6 +628,9 @@ void bli_sgemmsup_rd_haswell_asm_6x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1105,6 +1108,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1520,6 +1525,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c index bbb75a6fcd..9819671c7d 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. 
All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -703,6 +703,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) @@ -1222,6 +1225,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1642,6 +1647,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c index 1e3240350b..190eb9d1d7 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -630,6 +630,9 @@ void bli_sgemmsup_rd_haswell_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1072,6 +1075,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "memory" ) } @@ -1469,6 +1474,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1822,6 +1828,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c index 9d4e9d51d2..d167bc08fb 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -600,6 +600,9 @@ void bli_sgemmsup_rd_haswell_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1057,6 +1060,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1451,6 +1456,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c index 788912ecf6..498002da90 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -628,6 +628,9 @@ void bli_sgemmsup_rd_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13","ymm14", "ymm15", "memory" ) } @@ -1105,6 +1108,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", + "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1520,6 +1525,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c index 1bea78ee73..dd2c392e9c 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -775,6 +775,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1474,6 +1477,8 @@ void bli_sgemmsup_rv_haswell_asm_5x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } @@ -2031,6 +2036,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2619,6 +2626,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "memory" ) } @@ -3065,6 +3074,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -3512,6 +3522,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c index 6a08cecd43..f6443e8b50 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c +++ 
b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -916,6 +916,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1672,6 +1675,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } @@ -2257,6 +2262,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2898,6 +2905,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "memory" ) } @@ -3367,6 +3376,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -3849,6 +3859,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", 
"ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c index 2b1a221ada..43210cdc5a 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -628,6 +628,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", "memory" ) } @@ -1179,6 +1181,8 @@ void bli_sgemmsup_rv_haswell_asm_5x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "memory" ) } @@ -1636,6 +1640,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "memory" ) } @@ -2116,6 +2122,7 @@ void bli_sgemmsup_rv_haswell_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "memory" ) } @@ -2502,6 +2509,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) } @@ -2889,6 +2897,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", 
"xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm2", "ymm3", "ymm4", "memory" ) } diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c index 081ba3a4a8..bc7510eeb3 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c index 1bc1dd3530..e4c076c435 100644 --- a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c index b0700b76de..c04d393b76 100644 --- a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are From dea5fe4d12a2f7dd149b09dcd7fe33136f6dafbf Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Tue, 23 May 2023 15:40:29 -0400 Subject: [PATCH 102/226] BLIS: Missing clobbers (batch 5) Add missing clobbers for AVX512 mask registers k0-k7 in zen4 kernels. AMD-Internal: [CPUPL-3456] Change-Id: I5f28c725d7af1466df4db4cdfa2d456bbc6ab36d --- kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c | 2 +- kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c | 2 +- kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c | 2 +- kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c | 2 +- .../zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c | 14 +++++++------- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c | 6 +++--- .../3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c | 6 +++--- 13 files changed, 35 insertions(+), 35 deletions(-) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c index bc7510eeb3..ee9e128e41 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c +++ b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c @@ -731,7 +731,7 @@ void bli_dpackm_zen4_asm_24xk "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", - "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "memory" + "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "k2", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c index 9cbafadb24..ff18838aab 100644 --- a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c +++ 
b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c @@ -383,7 +383,7 @@ void bli_dpackm_zen4_asm_8xk "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", - "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "memory" + "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "k2", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c index e4c076c435..08edcb574f 100644 --- a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c @@ -1666,7 +1666,7 @@ void bli_dgemmtrsm_l_zen_asm_16x14 "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k0", "k1", "k2", "k3", "k4", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9); diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c index c04d393b76..401c6e7d23 100644 --- a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c @@ -1703,7 +1703,7 @@ void bli_dgemmtrsm_u_zen_asm_16x14 "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k0", "k1", "k2", "k3", "k4", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9); diff --git a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c index 503f615a3b..97ac0985dc 100644 --- a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c +++ b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c @@ -3291,7 +3291,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop @@ -4676,7 +4676,7 @@ void 
bli_dgemmsup_rv_zen4_asm_24x6m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop @@ -5944,7 +5944,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop @@ -7046,7 +7046,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop @@ -8029,7 +8029,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop @@ -8897,7 +8897,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop @@ -9649,7 +9649,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1m "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) } //mloop diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c index da76bab068..d8806362e8 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c @@ -1083,7 +1083,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ 
-1699,7 +1699,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2233,7 +2233,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c index 442cfdde31..d8b5c73ad8 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c @@ -1205,7 +1205,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -1911,7 +1911,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2504,7 +2504,7 @@ void bli_dgemmsup_rv_zen4_asm_8x2 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c index 9f17bed812..a739183e98 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c @@ -1321,7 
+1321,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2113,7 +2113,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2764,7 +2764,7 @@ void bli_dgemmsup_rv_zen4_asm_8x3 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c index 2a5f6a8b03..e5d70ae5fd 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c @@ -1437,7 +1437,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2321,7 +2321,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3032,7 +3032,7 @@ void bli_dgemmsup_rv_zen4_asm_8x4 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git 
a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c index 7326688cde..a41cbc4905 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c @@ -1607,7 +1607,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2624,7 +2624,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3433,7 +3433,7 @@ void bli_dgemmsup_rv_zen4_asm_8x5 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c index 95d6588854..fe638c320f 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c @@ -1726,7 +1726,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2833,7 +2833,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3701,7 
+3701,7 @@ void bli_dgemmsup_rv_zen4_asm_8x6 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c index 42cb6d648e..610871ab2e 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c @@ -1843,7 +1843,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3037,7 +3037,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3962,7 +3962,7 @@ void bli_dgemmsup_rv_zen4_asm_8x7 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "k3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c index d7497c3db0..8cf46b43c5 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c @@ -1923,7 +1923,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3200,7 +3200,7 @@ 
void bli_dgemmsup_rv_zen4_asm_16x8 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -4176,7 +4176,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8 "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k2", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } From 9c613c4c0368d6a7c771dabb5326dd8cdcbb119b Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Wed, 24 May 2023 17:26:18 +0530 Subject: [PATCH 103/226] Windows CMake bugfix in object libraries for shared library option Defining BLIS_IS_BUILDING_LIBRARY if BUILD_SHARED_LIBS=ON for the object libraries created in kernels/ directory. The macro definition was not propagated from high level CMake, so we need to define explicitly for the object libraries. AMD-Internal: [CPUPL-3241] Change-Id: Ifc5243861eb94670e7581367ef4bc7467c664d52 --- kernels/haswell/1m/CMakeLists.txt | 5 ++++- kernels/haswell/3/CMakeLists.txt | 3 +++ kernels/haswell/3/sup/CMakeLists.txt | 6 ++++-- kernels/haswell/3/sup/d6x8/CMakeLists.txt | 5 +++-- kernels/skx/3/CMakeLists.txt | 5 ++++- kernels/zen/1/CMakeLists.txt | 5 ++++- kernels/zen/1f/CMakeLists.txt | 7 +++++-- kernels/zen/2/CMakeLists.txt | 4 +++- kernels/zen/3/CMakeLists.txt | 4 +++- kernels/zen/3/sup/CMakeLists.txt | 5 ++++- kernels/zen4/1/CMakeLists.txt | 5 ++++- kernels/zen4/1m/CMakeLists.txt | 5 ++++- kernels/zen4/3/CMakeLists.txt | 5 ++++- kernels/zen4/3/sup/CMakeLists.txt | 5 ++++- kernels/zen4/3/sup/d24x8/CMakeLists.txt | 5 ++++- 15 files changed, 57 insertions(+), 17 deletions(-) diff --git a/kernels/haswell/1m/CMakeLists.txt b/kernels/haswell/1m/CMakeLists.txt index effa9d22d0..56abd13aec 100644 --- a/kernels/haswell/1m/CMakeLists.txt +++ b/kernels/haswell/1m/CMakeLists.txt @@ -12,4 +12,7 @@ 
add_library(haswell_1m ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_z4xk.c ) -target_compile_options(haswell_1m PRIVATE /arch:AVX2) \ No newline at end of file +target_compile_options(haswell_1m PRIVATE /arch:AVX2) +if(BUILD_SHARED_LIBS) + target_compile_definitions(haswell_1m PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/haswell/3/CMakeLists.txt b/kernels/haswell/3/CMakeLists.txt index 0f491c84e8..a42bdadf83 100644 --- a/kernels/haswell/3/CMakeLists.txt +++ b/kernels/haswell/3/CMakeLists.txt @@ -9,5 +9,8 @@ add_library(haswell_3 ) target_compile_options(haswell_3 PRIVATE /arch:AVX2) +if(BUILD_SHARED_LIBS) + target_compile_definitions(haswell_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() add_subdirectory(sup) diff --git a/kernels/haswell/3/sup/CMakeLists.txt b/kernels/haswell/3/sup/CMakeLists.txt index d0bf6a16f9..e5ed6183c2 100644 --- a/kernels/haswell/3/sup/CMakeLists.txt +++ b/kernels/haswell/3/sup/CMakeLists.txt @@ -12,6 +12,8 @@ add_library(haswell_3sup #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16n.c ) target_compile_options(haswell_3sup PRIVATE /arch:AVX2) - +if(BUILD_SHARED_LIBS) + target_compile_definitions(haswell_3sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() add_subdirectory(d6x8) -#add_subdirectory(s6x16) \ No newline at end of file +#add_subdirectory(s6x16) diff --git a/kernels/haswell/3/sup/d6x8/CMakeLists.txt b/kernels/haswell/3/sup/d6x8/CMakeLists.txt index 2ad43e0a2f..c74dff9372 100644 --- a/kernels/haswell/3/sup/d6x8/CMakeLists.txt +++ b/kernels/haswell/3/sup/d6x8/CMakeLists.txt @@ -14,5 +14,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx8.c ) target_compile_options(haswell_3supd6x8 PRIVATE /arch:AVX2) - - +if(BUILD_SHARED_LIBS) + target_compile_definitions(haswell_3supd6x8 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/skx/3/CMakeLists.txt b/kernels/skx/3/CMakeLists.txt index 4faf5cfefc..e4125f1b60 100644 --- a/kernels/skx/3/CMakeLists.txt +++ 
b/kernels/skx/3/CMakeLists.txt @@ -5,4 +5,7 @@ add_library(skx_3 ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_skx_asm_16x14.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_sgemm_skx_asm_32x12_l2.c ) -target_compile_options(skx_3 PRIVATE /arch:AVX2 /arch:AVX512) \ No newline at end of file +target_compile_options(skx_3 PRIVATE /arch:AVX2 /arch:AVX512) +if(BUILD_SHARED_LIBS) + target_compile_definitions(skx_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/zen/1/CMakeLists.txt b/kernels/zen/1/CMakeLists.txt index 1a0b644a99..87db4ac1c7 100644 --- a/kernels/zen/1/CMakeLists.txt +++ b/kernels/zen/1/CMakeLists.txt @@ -18,4 +18,7 @@ add_library(zen_1 ${CMAKE_CURRENT_SOURCE_DIR}/bli_norm2_zen_int.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_scal2v_zen_int.c ) -target_compile_options(zen_1 PRIVATE /arch:AVX2) \ No newline at end of file +target_compile_options(zen_1 PRIVATE /arch:AVX2) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen_1 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/zen/1f/CMakeLists.txt b/kernels/zen/1f/CMakeLists.txt index 4a110392da..5da0c9e7b0 100644 --- a/kernels/zen/1f/CMakeLists.txt +++ b/kernels/zen/1f/CMakeLists.txt @@ -8,6 +8,9 @@ add_library(zen_1f ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_6.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c ) -target_compile_options(zen_1f PRIVATE /arch:AVX2) \ No newline at end of file +target_compile_options(zen_1f PRIVATE /arch:AVX2) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen_1f PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/zen/2/CMakeLists.txt b/kernels/zen/2/CMakeLists.txt index 9618ce256c..c9c9220609 100644 --- a/kernels/zen/2/CMakeLists.txt +++ b/kernels/zen/2/CMakeLists.txt @@ -8,7 +8,9 @@ add_library(zen_2 ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c ) target_compile_options(zen_2 PRIVATE 
/arch:AVX2) - +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen_2 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() # For any other TARGET_ARCH, it would fail to configure. # Select AMD specific sources for AMD configurations. #[=[if(${TARGET_ARCH} STREQUAL zen OR diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt index 741d46e2ca..97a067bb64 100644 --- a/kernels/zen/3/CMakeLists.txt +++ b/kernels/zen/3/CMakeLists.txt @@ -8,5 +8,7 @@ add_library(zen_3 ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_avx2_k1.c ) target_compile_options(zen_3 PRIVATE /arch:AVX2) - +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() add_subdirectory(sup) diff --git a/kernels/zen/3/sup/CMakeLists.txt b/kernels/zen/3/sup/CMakeLists.txt index b1f829eebf..57f3ee01ff 100644 --- a/kernels/zen/3/sup/CMakeLists.txt +++ b/kernels/zen/3/sup/CMakeLists.txt @@ -18,4 +18,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4m.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4n.c ) -target_compile_options(zen_3_sup PRIVATE /arch:AVX2) \ No newline at end of file +target_compile_options(zen_3_sup PRIVATE /arch:AVX2) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen_3_sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/zen4/1/CMakeLists.txt b/kernels/zen4/1/CMakeLists.txt index a8d2c80097..9bfb5d650e 100644 --- a/kernels/zen4/1/CMakeLists.txt +++ b/kernels/zen4/1/CMakeLists.txt @@ -8,4 +8,7 @@ add_library(zen4_1 ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_zen_int_avx512.c ) -target_compile_options(zen4_1 PRIVATE /arch:AVX2 /arch:AVX512) \ No newline at end of file +target_compile_options(zen4_1 PRIVATE /arch:AVX2 /arch:AVX512) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen4_1 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/zen4/1m/CMakeLists.txt b/kernels/zen4/1m/CMakeLists.txt index 7e7cfda5f8..9dfbefc458 100644 --- 
a/kernels/zen4/1m/CMakeLists.txt +++ b/kernels/zen4/1m/CMakeLists.txt @@ -10,4 +10,7 @@ add_library(zen4_1m ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z4xk.c ) -target_compile_options(zen4_1m PRIVATE /U__PRFCHW__ /arch:AVX2 /arch:AVX512) \ No newline at end of file +target_compile_options(zen4_1m PRIVATE /U__PRFCHW__ /arch:AVX2 /arch:AVX512) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen4_1m PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() diff --git a/kernels/zen4/3/CMakeLists.txt b/kernels/zen4/3/CMakeLists.txt index 2590a6aa34..0b38920998 100644 --- a/kernels/zen4/3/CMakeLists.txt +++ b/kernels/zen4/3/CMakeLists.txt @@ -13,5 +13,8 @@ add_library(zen4_3 ) target_compile_options(zen4_3 PRIVATE /arch:AVX2 /arch:AVX512) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen4_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() -add_subdirectory(sup) \ No newline at end of file +add_subdirectory(sup) diff --git a/kernels/zen4/3/sup/CMakeLists.txt b/kernels/zen4/3/sup/CMakeLists.txt index 642cf9da6a..81e194ef64 100644 --- a/kernels/zen4/3/sup/CMakeLists.txt +++ b/kernels/zen4/3/sup/CMakeLists.txt @@ -14,5 +14,8 @@ add_library(zen4_3sup ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_cv_zen4_z12x4m.c ) target_compile_options(zen4_3sup PRIVATE /arch:AVX2 /arch:AVX512) +if(BUILD_SHARED_LIBS) + target_compile_definitions(zen4_3sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() -add_subdirectory(d24x8) \ No newline at end of file +add_subdirectory(d24x8) diff --git a/kernels/zen4/3/sup/d24x8/CMakeLists.txt b/kernels/zen4/3/sup/d24x8/CMakeLists.txt index 254a031866..004a07c085 100644 --- a/kernels/zen4/3/sup/d24x8/CMakeLists.txt +++ b/kernels/zen4/3/sup/d24x8/CMakeLists.txt @@ -12,4 +12,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx7.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx8.c ) -target_compile_options(zen4_3supd24x8 PRIVATE /arch:AVX2 /arch:AVX512) \ No newline at end of file +target_compile_options(zen4_3supd24x8 PRIVATE /arch:AVX2 /arch:AVX512) 
+if(BUILD_SHARED_LIBS) + target_compile_definitions(zen4_3supd24x8 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) +endif() From 655955dd3bf79180707022c075a10bf385b0db67 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Thu, 25 May 2023 06:08:09 -0500 Subject: [PATCH 104/226] Doxygen document generation from cmake build - Added support to generate doxygen documentation from cmake build. - If doxygen is already installed on machine, it will generate documentation and prompts the path for documentation. AMD-Internal: [CPUPL-3188] Change-Id: I6047f62df63844aa71836fd481b4df246b793696 --- CMakeLists.txt | 15 ++++++ docs/Doxyfile | 132 ++++++++++++++++++++++++------------------------- 2 files changed, 81 insertions(+), 66 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ee512e40e..7a9c3bc803 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -316,6 +316,21 @@ else() set(rename_blis_model_type "BLIS_MODEL_TYPE") endif() +find_package(Doxygen) +set(W_DIR "${CMAKE_CURRENT_SOURCE_DIR}/docs") +if(NOT (DOXYGEN_FOUND)) + message(STATUS "Doxygen not found please install and try again.") +else() + execute_process(COMMAND doxygen Doxyfile + WORKING_DIRECTORY ${W_DIR} + COMMAND_ECHO STDOUT) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/docs/html/index.html) + message(STATUS "Documentation generated successfully, to view documentation open docs/html/index.html .") +else() + message(STATUS "Document generation failed.") +endif() + set(CMAKE_BUILD_TYPE ${CMAKE_CONFIGURATION_TYPES}) #print configurations diff --git a/docs/Doxyfile b/docs/Doxyfile index 91ab628063..36ae286238 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -63,14 +63,14 @@ PROJECT_BRIEF = # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. 
-PROJECT_LOGO = ./docs/styling/AMD_Logo.png +PROJECT_LOGO = ./styling/AMD_Logo.png # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = ./docs +OUTPUT_DIRECTORY = ./ # If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 # sub-directories (in 2 levels) under the output directory of each output format @@ -919,7 +919,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ./ +INPUT = ../ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1019,63 +1019,63 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = ./addon \ - ./aocl_dtl \ - ./bench \ - ./blastest \ - ./build \ - ./config \ - ./examples \ - ./include \ - ./gtestsuite \ - ./kernels \ - ./lib \ - ./mpi_test \ - ./ref_kernels \ - ./sandbox \ - ./test \ - ./testsuite \ - ./travis \ - ./vendor \ - ./windows \ - ./frame/0 \ - ./frame/1 \ - ./frame/1d \ - ./frame/1f \ - ./frame/1m \ - ./frame/2 \ - ./frame/3 \ - ./frame/base \ - ./frame/include \ - ./frame/ind \ - ./frame/thread \ - ./frame/util \ - ./bli_addon.h \ - ./bli_config.h \ - ./configure \ - ./CONTRIBUTING.md \ - ./INSTALL \ - ./LICENSE \ - ./Makefile \ - ./README.md \ - ./RELEASING \ - ./docs/Addons.md \ - ./docs/BLISObjectAPI.md \ - ./docs/BLISTypedAPI.md \ - ./docs/BuildSystem.md \ - ./docs/CodingConventions.md \ - ./docs/ConfigurationHowTo.md \ - ./docs/Doxyfile \ - ./docs/FAQ.md \ - ./docs/HardwareSupport.md \ - ./docs/KernelsHowTo.md \ - ./docs/MixedDatatypes.md \ - ./docs/Multithreading.md \ - ./docs/Performance.md \ - ./docs/PerformanceSmall.md \ - ./docs/ReleaseNotes.md \ - ./docs/Sandboxes.md \ - ./docs/Testsuite.md +EXCLUDE = ../addon \ + ../aocl_dtl \ + ../bench \ + ../blastest \ + ../build \ + ../config \ + ../examples \ + ../include \ + ../gtestsuite \ + ../kernels \ + ../lib \ + ../mpi_test \ + ../ref_kernels \ + ../sandbox \ + ../test \ + ../testsuite \ + ../travis \ + ../vendor \ + ../windows \ + ../frame/0 \ + ../frame/1 \ + ../frame/1d \ + ../frame/1f \ + ../frame/1m \ + ../frame/2 \ + ../frame/3 \ + ../frame/base \ + ../frame/include \ + ../frame/ind \ + ../frame/thread \ + ../frame/util \ + ../bli_addon.h \ + ../bli_config.h \ + ../configure \ + ../CONTRIBUTING.md \ + ../INSTALL \ + ../LICENSE \ + ../Makefile \ + ../README.md \ + ../RELEASING \ + ../docs/Addons.md \ + ../docs/BLISObjectAPI.md \ + ../docs/BLISTypedAPI.md \ + ../docs/BuildSystem.md \ + ../docs/CodingConventions.md \ + ../docs/ConfigurationHowTo.md \ + ../docs/Doxyfile \ + ../docs/FAQ.md \ + ../docs/HardwareSupport.md \ + ../docs/KernelsHowTo.md \ + 
../docs/MixedDatatypes.md \ + ../docs/Multithreading.md \ + ../docs/Performance.md \ + ../docs/PerformanceSmall.md \ + ../docs/ReleaseNotes.md \ + ../docs/Sandboxes.md \ + ../docs/Testsuite.md # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -1388,7 +1388,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = ./docs/styling/header.html +HTML_HEADER = ./styling/header.html # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1398,7 +1398,7 @@ HTML_HEADER = ./docs/styling/header.html # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = ./docs/styling/footer.html +HTML_FOOTER = ./styling/footer.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1428,7 +1428,7 @@ HTML_STYLESHEET = # documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = ./docs/styling/doxygen-awesome.css +HTML_EXTRA_STYLESHEET = ./styling/doxygen-awesome.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1438,9 +1438,9 @@ HTML_EXTRA_STYLESHEET = ./docs/styling/doxygen-awesome.css # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. 
-HTML_EXTRA_FILES = ./docs/styling/AMD_Logo.png \ - ./docs/styling/doxygen-fragment-copy-button.js \ - ./docs/styling/doxygen-interactive-toc.js +HTML_EXTRA_FILES = ./styling/AMD_Logo.png \ + ./styling/doxygen-fragment-copy-button.js \ + ./styling/doxygen-interactive-toc.js # The HTML_COLORSTYLE tag can be used to specify if the generated HTML output # should be rendered with a dark or light theme. From 85eb7880f73996fd299c4a3004691f7ae98cb754 Mon Sep 17 00:00:00 2001 From: "sireesha.sanga" Date: Thu, 25 May 2023 14:46:33 +0000 Subject: [PATCH 105/226] README File Update Updated with latest and relevant details. AMD-Internal: [CPUPL-3007] Change-Id: I6d86c5f0c49fd8739c656bcc8187a5f8a4dc9beb --- README.md | 715 +----------------------------------------------------- 1 file changed, 6 insertions(+), 709 deletions(-) diff --git a/README.md b/README.md index 28179306c7..ce923198e4 100644 --- a/README.md +++ b/README.md @@ -1,714 +1,11 @@ -![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png) +# AOCL-BLAS library -[![Build Status](https://travis-ci.org/flame/blis.svg?branch=master)](https://travis-ci.org/flame/blis) -[![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master) +AOCL-BLAS is AMD's optimized version of BLAS targeted for AMD EPYC and Ryzen CPUs. It is developed as a forked version of BLIS (https://github.com/flame/blis), which is developed by members of the [Science of High-Performance Computing](http://shpc.oden.utexas.edu/) (SHPC) group in the [Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/) at [The University of Texas at Austin](https://www.utexas.edu/) and other collaborators (including AMD). All known features and functionalities of BLIS are retained and supported in AOCL-BLAS library. AOCL-BLAS is regularly updated with the improvements from the upstream repository. 
-Contents --------- +AOCL BLAS is optimized with SSE2, AVX2, AVX512 instruction sets which would be enabled based on the target Zen architecture using the dynamic dispatch feature. All prominent Level 3, Level 2 and Level 1 APIs are designed and optimized for specific paths targeting different size spectrums e.g., Small, Medium and Large sizes. These algorithms are designed and customized to exploit the architectural improvements of the target platform. -* **[Introduction](#introduction)** -* **[Education and Learning](#education-and-learning)** -* **[What's New](#whats-new)** -* **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)** -* **[Key Features](#key-features)** -* **[How to Download BLIS](#how-to-download-blis)** -* **[Getting Started](#getting-started)** -* **[Documentation](#documentation)** -* **[External Packages](#external-packages)** -* **[Discussion](#discussion)** -* **[Contributing](#contributing)** -* **[Citations](#citations)** -* **[Funding](#funding)** +For detailed instructions on how to configure, build, install, and link against AOCL-BLAS on AMD CPUs, please refer to the AOCL User Guide located on AMD developer [portal](https://www.amd.com/en/developer/aocl.html). -Introduction ------------- - -BLIS is a portable software framework for instantiating high-performance -BLAS-like dense linear algebra libraries. The framework was designed to isolate -essential kernels of computation that, when optimized, immediately enable -optimized implementations of most of its commonly used and computationally -intensive operations. BLIS is written in [ISO -C99](http://en.wikipedia.org/wiki/C99) and available under a -[new/modified/3-clause BSD -license](http://opensource.org/licenses/BSD-3-Clause). 
While BLIS exports a -[new BLAS-like API](docs/BLISTypedAPI.md), -it also includes a BLAS compatibility layer which gives application developers -access to BLIS implementations via traditional [BLAS routine -calls](http://www.netlib.org/lapack/lug/node145.html). -An [object-based API](docs/BLISObjectAPI.md) unique to BLIS is also available. - -For a thorough presentation of our framework, please read our -[ACM Transactions on Mathematical Software (TOMS)](https://toms.acm.org/) -journal article, ["BLIS: A Framework for Rapidly Instantiating BLAS -Functionality"](http://dl.acm.org/authorize?N91172). -For those who just want an executive summary, please see the -[Key Features](#key-features) section below. - -In a follow-up article (also in [ACM TOMS](https://toms.acm.org/)), -["The BLIS Framework: Experiments in -Portability"](http://dl.acm.org/authorize?N16240), -we investigate using BLIS to instantiate level-3 BLAS implementations on a -variety of general-purpose, low-power, and multicore architectures. - -An IPDPS'14 conference paper titled ["Anatomy of High-Performance Many-Threaded -Matrix -Multiplication"](http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf) -systematically explores the opportunities for parallelism within the five loops -that BLIS exposes in its matrix multiplication algorithm. - -For other papers related to BLIS, please see the -[Citations section](#citations) below. - -It is our belief that BLIS offers substantial benefits in productivity when -compared to conventional approaches to developing BLAS libraries, as well as a -much-needed refinement of the BLAS interface, and thus constitutes a major -advance in dense linear algebra computation. While BLIS remains a -work-in-progress, we are excited to continue its development and further -cultivate its use within the community. 
- -The BLIS framework is primarily developed and maintained by individuals in the -[Science of High-Performance Computing](http://shpc.ices.utexas.edu/) -(SHPC) group in the -[Oden Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/) -at [The University of Texas at Austin](https://www.utexas.edu/). -Please visit the [SHPC](http://shpc.ices.utexas.edu/) website for more -information about our research group, such as a list of -[people](http://shpc.ices.utexas.edu/people.html) -and [collaborators](http://shpc.ices.utexas.edu/collaborators.html), -[funding sources](http://shpc.ices.utexas.edu/funding.html), -[publications](http://shpc.ices.utexas.edu/publications.html), -and [other educational projects](http://www.ulaff.net/) (such as MOOCs). - -Education and Learning ----------------------- - -Want to understand what's under the hood? -Many of the same concepts and principles employed when developing BLIS are -introduced and taught in a basic pedagogical setting as part of -[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/), -one of several massive open online courses (MOOCs) in the -[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series, -all of which are available for free via the [edX platform](http://www.edx.org/). - -What's New ----------- - - * **Multithreaded small/skinny matrix support for sgemm now available!** Thanks to -funding and hardware support from Oracle, we have now accelerated `gemm` for -single-precision real matrix problems where one or two dimensions is exceedingly -small. This work is similar to the `gemm` optimization announced last year. -For now, we have only gathered performance results on an AMD Epyc Zen2 system, but -we hope to publish additional graphs for other architectures in the future. You may -find these Zen2 graphs via the [PerformanceSmall](docs/PerformanceSmall.md) document. 
- - * **BLIS awarded SIAM Activity Group on Supercomputing Best Paper Prize for 2020!** -We are thrilled to announce that the paper that we internally refer to as the -second BLIS paper, - - "The BLIS Framework: Experiments in Portability." Field G. Van Zee, Tyler Smith, Bryan Marker, Tze Meng Low, Robert A. van de Geijn, Francisco Igual, Mikhail Smelyanskiy, Xianyi Zhang, Michael Kistler, Vernon Austel, John A. Gunnels, Lee Killough. ACM Transactions on Mathematical Software (TOMS), 42(2):12:1--12:19, 2016. - - was selected for the [SIAM Activity Group on Supercomputing Best Paper Prize](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize) -for 2020. The prize is awarded once every two years to a paper judged to be -the most outstanding paper in the field of parallel scientific and engineering -computing, and has only been awarded once before (in 2016) since its inception -in 2015 (the committee did not award the prize in 2018). The prize -[was awarded](https://www.oden.utexas.edu/about/news/ScienceHighPerfomanceComputingSIAMBestPaperPrize/) -at the [2020 SIAM Conference on Parallel Processing for Scientific Computing](https://www.siam.org/conferences/cm/conference/pp20) in Seattle. Robert was present at -the conference to give -[a talk on BLIS](https://meetings.siam.org/sess/dsp_programsess.cfm?SESSIONCODE=68266) and accept the prize alongside other coauthors. -The selection committee sought to recognize the paper, "which validates BLIS, -a framework relying on the notion of microkernels that enables both productivity -and high performance." Their statement continues, "The framework will continue -having an important influence on the design and the instantiation of dense linear -algebra libraries." 
- - * **Multithreaded small/skinny matrix support for dgemm now available!** Thanks to -contributions made possible by our partnership with AMD, we have dramatically -accelerated `gemm` for double-precision real matrix problems where one or two -dimensions is exceedingly small. A natural byproduct of this optimization is -that the traditional case of small _m = n = k_ (i.e. square matrices) is also -accelerated, even though it was not targeted specifically. And though only -`dgemm` was optimized for now, support for other datatypes and/or other operations -may be implemented in the future. We've also added new graphs to the -[PerformanceSmall](docs/PerformanceSmall.md) document to showcase multithreaded -performance when one or more matrix dimensions are small. - - * **Performance comparisons now available!** We recently measured the -performance of various level-3 operations on a variety of hardware architectures, -as implemented within BLIS and other BLAS libraries for all four of the standard -floating-point datatypes. The results speak for themselves! Check out our -extensive performance graphs and background info in our new -[Performance](docs/Performance.md) document. - - * **BLIS is now in Debian Unstable!** Thanks to Debian developer-maintainers -[M. Zhou](https://github.com/cdluminate) and -[Nico Schlömer](https://github.com/nschloe) for sponsoring our package in Debian. -Their participation, contributions, and advocacy were key to getting BLIS into -the second-most popular Linux distribution (behind Ubuntu, which Debian packages -feed into). The Debian tracker page may be found -[here](https://tracker.debian.org/pkg/blis). - - * **BLIS now supports mixed-datatype gemm!** The `gemm` operation may now be -executed on operands of mixed domains and/or mixed precisions. Any combination -of storage datatype for A, B, and C is now supported, along with a separate -computation precision that can differ from the storage precision of A and B. 
-And even the 1m method now supports mixed-precision computation. -For more details, please see our [ACM TOMS](https://toms.acm.org/) journal -article submission ([current -draft](http://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf)). - - * **BLIS now implements the 1m method.** Let's face it: writing complex -assembly `gemm` microkernels for a new architecture is never a priority--and -now, it almost never needs to be. The 1m method leverages existing real domain -`gemm` microkernels to implement all complex domain level-3 operations. For -more details, please see our [ACM TOMS](https://toms.acm.org/) journal article -submission ([current -draft](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev2.pdf)). - -What People Are Saying About BLIS ---------------------------------- - -*["I noticed a substantial increase in multithreaded performance on my own -machine, which was extremely satisfying."](https://groups.google.com/d/msg/blis-discuss/8iu9B5KCxpA/uftpjgIsBwAJ)* ... *["[I was] happy it worked so well!"](https://groups.google.com/d/msg/blis-discuss/8iu9B5KCxpA/uftpjgIsBwAJ)* (Justin Shea) - -*["This is an awesome library."](https://github.com/flame/blis/issues/288#issuecomment-447488637)* ... *["I want to thank you and the blis team for your efforts."](https://github.com/flame/blis/issues/288#issuecomment-448074704)* ([@Lephar](https://github.com/Lephar)) - -*["Any time somebody outside Intel beats MKL by a nontrivial amount, I report it to the MKL team. It is fantastic for any open-source project to get within 10% of MKL... [T]his is why Intel funds BLIS development."](https://github.com/flame/blis/issues/264#issuecomment-428673275)* ([@jeffhammond](https://github.com/jeffhammond)) - -*["So BLIS is now a part of Elk."](https://github.com/flame/blis/issues/267#issuecomment-429303902)* ... 
*["We have found that zgemm applied to a 15000x15000 matrix with multi-threaded BLIS on a 32-core Ryzen 2990WX processor is about twice as fast as MKL"](https://github.com/flame/blis/issues/264#issuecomment-428373946)* ... *["I'm starting to like this a lot."](https://github.com/flame/blis/issues/264#issuecomment-428926191)* ([@jdk2016](https://github.com/jdk2016)) - -*["I [found] BLIS because I was looking for BLAS operations on C-ordered arrays for NumPy. BLIS has that, but even better is the fact that it's developed in the open using a more modern language than Fortran."](https://github.com/flame/blis/issues/254#issuecomment-423838345)* ([@nschloe](https://github.com/nschloe)) - -*["The specific reason to have BLIS included [in Linux distributions] is the KNL and SKX [AVX-512] BLAS support, which OpenBLAS doesn't have."](https://github.com/flame/blis/issues/210#issuecomment-393126303)* ([@loveshack](https://github.com/loveshack)) - -*["All tests pass without errors on OpenBSD. Thanks!"](https://github.com/flame/blis/issues/202#issuecomment-389691543)* ([@ararslan](https://github.com/ararslan)) - -*["Thank you very much for your great help!... Looking forward to benchmarking."](https://github.com/flame/blis/issues/180#issuecomment-375895449)* ([@mrader1248](https://github.com/mrader1248)) - -*["Thanks for the beautiful work."](https://github.com/flame/blis/issues/163#issue-286575452)* ([@mmrmo](https://github.com/mmrmo)) - -*["[M]y software currently uses BLIS for its BLAS interface..."](https://github.com/flame/blis/issues/129#issuecomment-302904805)* ([@ShadenSmith](https://github.com/ShadenSmith)) - -*["[T]hanks so much for your work on this! Excited to test."](https://github.com/flame/blis/issues/129#issuecomment-341565071)* ... 
*["[On AMD Excavator], BLIS is competitive to / slightly faster than OpenBLAS for dgemms in my tests."](https://github.com/flame/blis/issues/129#issuecomment-341608673)* ([@iotamudelta](https://github.com/iotamudelta)) - -*["BLIS provided the only viable option on KNL, whose ecosystem is at present dominated by blackbox toolchains. Thanks again. Keep on this great work."](https://github.com/flame/blis/issues/116#issuecomment-281225101)* ([@heroxbd](https://github.com/heroxbd)) - -*["I want to definitely try this out..."](https://github.com/flame/blis/issues/12#issuecomment-48086295)* ([@ViralBShah](https://github.com/ViralBShah)) - -Key Features ------------- - -BLIS offers several advantages over traditional BLAS libraries: - - * **Portability that doesn't impede high performance.** Portability was a top -priority of ours when creating BLIS. With virtually no additional effort on the -part of the developer, BLIS is configurable as a fully-functional reference -implementation. But more importantly, the framework identifies and isolates a -key set of computational kernels which, when optimized, immediately and -automatically optimize performance across virtually all level-2 and level-3 -BLIS operations. In this way, the framework acts as a productivity multiplier. -And since the optimized (non-portable) code is compartmentalized within these -few kernels, instantiating a high-performance BLIS library on a new -architecture is a relatively straightforward endeavor. - - * **Generalized matrix storage.** The BLIS framework exports interfaces that -allow one to specify both the row stride and column stride of a matrix. This -allows one to compute with matrices stored in column-major order, row-major -order, or by general stride. (This latter storage format is important for those -seeking to implement tensor contractions on multidimensional arrays.) 
-Furthermore, since BLIS tracks stride information for each matrix, operands of -different storage formats can be used within the same operation invocation. By -contrast, BLAS requires column-major storage. And while the CBLAS interface -supports row-major storage, it does not allow mixing storage formats. - - * **Rich support for the complex domain.** BLIS operations are developed and -expressed in their most general form, which is typically in the complex domain. -These formulations then simplify elegantly down to the real domain, with -conjugations becoming no-ops. Unlike the BLAS, all input operands in BLIS that -allow transposition and conjugate-transposition also support conjugation -(without transposition), which obviates the need for thread-unsafe workarounds. -Also, where applicable, both complex symmetric and complex Hermitian forms are -supported. (BLAS omits some complex symmetric operations, such as `symv`, -`syr`, and `syr2`.) Another great example of BLIS serving as a portability -lever is its implementation of the 1m method for complex matrix multiplication, -a novel mechanism of providing high-performance complex level-3 operations using -only real domain microkernels. This new innovation guarantees automatic level-3 -support in the complex domain even when the kernel developers entirely forgo -writing complex kernels. - - * **Advanced multithreading support.** BLIS allows multiple levels of -symmetric multithreading for nearly all level-3 operations. (Currently, users -may choose to obtain parallelism via either OpenMP or POSIX threads). This -means that matrices may be partitioned in multiple dimensions simultaneously to -attain scalable, high-performance parallelism on multicore and many-core -architectures. The key to this innovation is a thread-specific control tree -infrastructure which encodes information about the logical thread topology and -allows threads to query and communicate data amongst one another. 
BLIS also -employs so-called "quadratic partitioning" when computing dimension sub-ranges -for each thread, so that arbitrary diagonal offsets of structured matrices with -unreferenced regions are taken into account to achieve proper load balance. -More recently, BLIS introduced a runtime abstraction to specify parallelism on -a per-call basis, which is useful for applications that want to handle most of -the parallelism. - - * **Ease of use.** The BLIS framework, and the library of routines it -generates, are easy to use for end users, experts, and vendors alike. An -optional BLAS compatibility layer provides application developers with -backwards compatibility to existing BLAS-dependent codes. Or, one may adjust or -write their application to take advantage of new BLIS functionality (such as -generalized storage formats or additional complex operations) by calling one -of BLIS's native APIs directly. BLIS's typed API will feel familiar to many -veterans of BLAS since these interfaces use BLAS-like calling sequences. And -many will find BLIS's object-based APIs a delight to use when customizing -or writing their own BLIS operations. (Objects are relatively lightweight -`structs` and passed by address, which helps tame function calling overhead.) - - * **Multilayered API, exposed kernels, and sandboxes.** The BLIS framework -exposes its -implementations in various layers, allowing expert developers to access exactly -the functionality desired. This layered interface includes that of the -lowest-level kernels, for those who wish to bypass the bulk of the framework. -Optimizations can occur at various levels, in part thanks to exposed packing -and unpacking facilities, which by default are highly parameterized and -flexible. And more recently, BLIS introduced sandboxes--a way to provide -alternative implementations of `gemm` that do not use any more of the BLIS -infrastructure than is desired. 
Sandboxes provide a convenient and -straightforward way of modifying the `gemm` implementation without disrupting -any other level-3 operation or any other part of the framework. This works -especially well when the developer wants to experiment with new optimizations -or try a different algorithm. - - * **Functionality that grows with the community's needs.** As its name -suggests, the BLIS framework is not a single library or static API, but rather -a nearly-complete template for instantiating high-performance BLAS-like -libraries. Furthermore, the framework is extensible, allowing developers to -leverage existing components to support new operations as they are identified. -If such operations require new kernels for optimal efficiency, the framework -and its APIs will be adjusted and extended accordingly. - - * **Code re-use.** Auto-generation approaches to achieving the aforementioned -goals tend to quickly lead to code bloat due to the multiple dimensions of -variation supported: operation (i.e. `gemm`, `herk`, `trmm`, etc.); parameter -case (i.e. side, [conjugate-]transposition, upper/lower storage, unit/non-unit -diagonal); datatype (i.e. single-/double-precision real/complex); matrix -storage (i.e. row-major, column-major, generalized); and algorithm (i.e. -partitioning path and kernel shape). These "brute force" approaches often -consider and optimize each operation or case combination in isolation, which is -less than ideal when the goal is to provide entire libraries. BLIS was designed -to be a complete framework for implementing basic linear algebra operations, -but supporting this vast amount of functionality in a manageable way required a -holistic design that employed careful abstractions, layering, and recycling of -generic (highly parameterized) codes, subject to the constraint that high -performance remain attainable. 
- - * **A foundation for mixed domain and/or mixed precision operations.** BLIS -was designed with the hope of one day allowing computation on real and complex -operands within the same operation. Similarly, we wanted to allow mixing -operands' numerical domains, floating-point precisions, or both domain and -precision, and to optionally compute in a precision different than one or both -operands' storage precisions. This feature has been implemented for the general -matrix multiplication (`gemm`) operation, providing 128 different possible type -combinations, which, when combined with existing transposition, conjugation, -and storage parameters, enables 55,296 different `gemm` use cases. For more -details, please see the documentation on [mixed datatype](docs/MixedDatatypes.md) -support and/or our [ACM TOMS](https://toms.acm.org/) journal paper on -mixed-domain/mixed-precision `gemm` ([linked below](#citations)). - -How to Download BLIS --------------------- - -There are a few ways to download BLIS. We list the most common four ways below. -We **highly recommend** using either Option 1 or 2. Otherwise, we recommend -Option 3 (over Option 4) so your compiler can perform optimizations specific -to your hardware. - -1. **Download a source repository with `git clone`.** -Generally speaking, we prefer using `git clone` to clone a `git` repository. -Having a repository allows the user to periodically pull in the latest changes -and quickly rebuild BLIS whenever they wish. Also, implicit in cloning a -repository is that the repository defaults to using the `master` branch, which -contains the latest "stable" commits since the most recent release. (This is -in contrast to Option 3 in which the user is opting for code that may be -slightly out of date.) - - In order to clone a `git` repository of BLIS, please obtain a repository -URL by clicking on the green button above the file/directory listing near the -top of this page (as rendered by GitHub). 
Generally speaking, it will amount -to executing the following command in your terminal shell: - ``` - git clone https://github.com/flame/blis.git - ``` - -2. **Download a source repository via a zip file.** -If you are uncomfortable with using `git` but would still like the latest -stable commits, we recommend that you download BLIS as a zip file. - - In order to download a zip file of the BLIS source distribution, please -click on the green button above the file listing near the top of this page. -This should reveal a link for downloading the zip file. - -3. **Download a source release via a tarball/zip file.** -Alternatively, if you would like to stick to the code that is included in -official releases, you may download either a tarball or zip file of any of -BLIS's previous [tagged releases](https://github.com/flame/blis/releases). -We consider this option to be less than ideal for most people since it will -likely mean you miss out on the latest bugfix or feature commits (in contrast -to Options 1 or 2), and you also will not be able to update your code with a -simple `git pull` command (in contrast to Option 1). - -4. **Download a binary package specific to your OS.** -While we don't recommend this as the first choice for most users, we provide -links to community members who generously maintain BLIS packages for various -Linux distributions such as Debian Unstable and EPEL/Fedora. Please see the -[External Packages](#external-packages) section below for more information. 
- -Getting Started ---------------- - -*NOTE: This section assumes you've either cloned a BLIS source code repository -via `git`, downloaded the latest source code via a zip file, or downloaded the -source code for a tagged version release---Options 1, 2, or 3, respectively, -as discussed in [the previous section](#how-to-download-blis).* - -If you just want to build a sequential (not parallelized) version of BLIS -in a hurry and come back and explore other topics later, you can configure -and build BLIS as follows: -``` -$ ./configure auto -$ make [-j] -``` -You can then verify your build by running BLAS- and BLIS-specific test -drivers via `make check`: -``` -$ make check [-j] -``` -And if you would like to install BLIS to the directory specified to `configure` -via the `--prefix` option, run the `install` target: -``` -$ make install -``` -Please read the output of `./configure --help` for a full list of configure-time -options. -If/when you have time, we *strongly* encourage you to read the detailed -walkthrough of the build system found in our [Build System](docs/BuildSystem.md) -guide. - -Documentation -------------- - -We provide extensive documentation on the BLIS build system, APIs, test -infrastructure, and other important topics. All documentation is formatted in -markdown and included in the BLIS source distribution (usually in the `docs` -directory). Slightly longer descriptions of each document may be found via in -the project's [wiki](https://github.com/flame/blis/wiki) section. - -**Documents for everyone:** - - * **[Build System](docs/BuildSystem.md).** This document covers the basics of -configuring and building BLIS libraries, as well as related topics. - - * **[Testsuite](docs/Testsuite.md).** This document describes how to run -BLIS's highly parameterized and configurable test suite, as well as the -included BLAS test drivers. - - * **[BLIS Typed API Reference](docs/BLISTypedAPI.md).** Here we document the -so-called "typed" (or BLAS-like) API. 
This is the API that many users who are -already familiar with the BLAS will likely want to use. You can find lots of -example code for the typed API in the [examples/tapi](examples/tapi) directory -included in the BLIS source distribution. - - * **[BLIS Object API Reference](docs/BLISObjectAPI.md).** Here we document -the object API. This is API abstracts away properties of vectors and matrices -within `obj_t` structs that can be queried with accessor functions. Many -developers and experts prefer this API over the typed API. You can find lots of -example code for the object API in the [examples/oapi](examples/oapi) directory -included in the BLIS source distribution. - - * **[Hardware Support](docs/HardwareSupport.md).** This document maintains a -table of supported microarchitectures. - - * **[Multithreading](docs/Multithreading.md).** This document describes how to -use the multithreading features of BLIS. - - * **[Mixed-Datatypes](docs/MixedDatatypes.md).** This document provides an -overview of BLIS's mixed-datatype functionality and provides a brief example -of how to take advantage of this new code. - - * **[Performance](docs/Performance.md).** This document reports empirically -measured performance of a representative set of level-3 operations on a variety -of hardware architectures, as implemented within BLIS and other BLAS libraries -for all four of the standard floating-point datatypes. - - * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports -empirically measured performance of `gemm` on select hardware architectures -within BLIS and other BLAS libraries when performing matrix problems where one -or two dimensions is exceedingly small. - - * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of -changes included with each new version of BLIS, along with contributor credits -for key features. - - * **[Frequently Asked Questions](docs/FAQ.md).** If you have general questions -about BLIS, please read this FAQ. 
If you can't find the answer to your question, -please feel free to join the [blis-devel](https://groups.google.com/group/blis-devel) -mailing list and post a question. We also have a -[blis-discuss](https://groups.google.com/group/blis-discuss) mailing list that -anyone can post to (even without joining). - -**Documents for github contributors:** - - * **[Contributing bug reports, feature requests, PRs, etc](CONTRIBUTING.md).** -Interested in contributing to BLIS? Please read this document before getting -started. It provides a general overview of how best to report bugs, propose new -features, and offer code patches. - - * **[Coding Conventions](docs/CodingConventions.md).** If you are interested or -planning on contributing code to BLIS, please read this document so that you can -format your code in accordance with BLIS's standards. - -**Documents for BLIS developers:** - - * **[Kernels Guide](docs/KernelsHowTo.md).** If you would like to learn more -about the types of kernels that BLIS exposes, their semantics, the operations -that each kernel accelerates, and various implementation issues, please read -this guide. - - * **[Configuration Guide](docs/ConfigurationHowTo.md).** If you would like to -learn how to add new sub-configurations or configuration families, or are simply -interested in learning how BLIS organizes its configurations and kernel sets, -please read this thorough walkthrough of the configuration system. - - * **[Sandbox Guide](docs/Sandboxes.md).** If you are interested in learning -about using sandboxes in BLIS--that is, providing alternative implementations -of the `gemm` operation--please read this document. - -External Packages ------------------ - -Generally speaking, we **highly recommend** building from source whenever -possible using the latest `git` clone. 
(Tarballs of each -[tagged release](https://github.com/flame/blis/releases) are also available, but -we consider them to be less ideal since they are not as easy to upgrade as -`git` clones.) - -That said, some users may prefer binary and/or source packages through their -Linux distribution. Thanks to generous involvement/contributions from our -community members, the following BLIS packages are now available: - - * **Debian**. [M. Zhou](https://github.com/cdluminate) has volunteered to -sponsor and maintain BLIS packages within the Debian Linux distribution. The -Debian package tracker can be found [here](https://tracker.debian.org/pkg/blis). -(Also, thanks to [Nico Schlömer](https://github.com/nschloe) for previously -volunteering his time to set up a standalone PPA.) - - * **Gentoo**. [M. Zhou](https://github.com/cdluminate) also maintains the -[BLIS package](https://packages.gentoo.org/packages/sci-libs/blis) entry for -[Gentoo](https://www.gentoo.org/), a Linux distribution known for its -source-based [portage](https://wiki.gentoo.org/wiki/Portage) package manager -and distribution system. - - * **EPEL/Fedora**. There are official BLIS packages in Fedora and EPEL (for -RHEL7+ and compatible distributions) with versions for 64-bit integers, OpenMP, -and pthreads, and shims which can be dynamically linked instead of reference -BLAS. (NOTE: For architectures other than intel64, amd64, and maybe arm64, the -performance of packaged BLIS will be low because it uses unoptimized generic -kernels; for those architectures, [OpenBLAS](https://github.com/xianyi/OpenBLAS) -may be a better solution.) [Dave -Love](https://github.com/loveshack) provides additional packages for EPEL6 in a -[Fedora Copr](https://copr.fedorainfracloud.org/coprs/loveshack/blis/), and -possibly versions more recent than the official repo for other EPEL/Fedora -releases. The source packages may build on other rpm-based distributions. - - * **OpenSuSE**. 
The copr referred to above has rpms for some OpenSuSE releases; -the source rpms may build for others. - - * **GNU Guix**. Guix has BLIS packages, provides builds only for the generic -target and some specific x86_64 micro-architectures. - - * **Conda**. conda channel [conda-forge](https://github.com/conda-forge/blis-feedstock) -has Linux, OSX and Windows binary packages for x86_64. - -Discussion ----------- - -You can keep in touch with developers and other users of the project by joining -one of the following mailing lists: - - * [blis-devel](https://groups.google.com/group/blis-devel): Please join and -post to this mailing list if you are a BLIS developer, or if you are trying -to use BLIS beyond simply linking to it as a BLAS library. -**Note:** Most of the interesting discussions happen here; don't be afraid to -join! If you would like to submit a bug report, or discuss a possible bug, -please consider opening a [new issue](https://github.com/flame/blis/issues) on -github. - - * [blis-discuss](https://groups.google.com/group/blis-discuss): Please join and -post to this mailing list if you have general questions or feedback regarding -BLIS. Application developers (end users) may wish to post here, unless they -have bug reports, in which case they should open a -[new issue](https://github.com/flame/blis/issues) on github. - -Contributing ------------- - -For information on how to contribute to our project, including preferred -[coding conventions](docs/CodingConventions.md), please refer to the -[CONTRIBUTING](CONTRIBUTING.md) file at the top-level of the BLIS source -distribution. - -Citations ---------- - -For those of you looking for the appropriate article to cite regarding BLIS, we -recommend citing our -[first ACM TOMS journal paper]( https://dl.acm.org/doi/10.1145/2764454?cid=81314495332) -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis1_toms_rev3.pdf)): - -``` -@article{BLIS1, - author = {Field G. {V}an~{Z}ee and Robert A. 
{v}an~{d}e~{G}eijn}, - title = {{BLIS}: A Framework for Rapidly Instantiating {BLAS} Functionality}, - journal = {ACM Transactions on Mathematical Software}, - volume = {41}, - number = {3}, - pages = {14:1--14:33}, - month = {June}, - year = {2015}, - issue_date = {June 2015}, - url = {http://doi.acm.org/10.1145/2764454}, -} -``` - -You may also cite the -[second ACM TOMS journal paper]( https://dl.acm.org/doi/10.1145/2755561?cid=81314495332) -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis2_toms_rev3.pdf)): - -``` -@article{BLIS2, - author = {Field G. {V}an~{Z}ee and Tyler Smith and Francisco D. Igual and - Mikhail Smelyanskiy and Xianyi Zhang and Michael Kistler and Vernon Austel and - John Gunnels and Tze Meng Low and Bryan Marker and Lee Killough and - Robert A. {v}an~{d}e~{G}eijn}, - title = {The {BLIS} Framework: Experiments in Portability}, - journal = {ACM Transactions on Mathematical Software}, - volume = {42}, - number = {2}, - pages = {12:1--12:19}, - month = {June}, - year = {2016}, - issue_date = {June 2016}, - url = {http://doi.acm.org/10.1145/2755561}, -} -``` - -We also have a third paper, submitted to IPDPS 2014, on achieving -[multithreaded parallelism in BLIS](https://dl.acm.org/doi/10.1109/IPDPS.2014.110) -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf)): - -``` -@inproceedings{BLIS3, - author = {Tyler M. Smith and Robert A. {v}an~{d}e~{G}eijn and Mikhail Smelyanskiy and - Jeff R. Hammond and Field G. 
{V}an~{Z}ee}, - title = {Anatomy of High-Performance Many-Threaded Matrix Multiplication}, - booktitle = {28th IEEE International Parallel \& Distributed Processing Symposium - (IPDPS 2014)}, - year = {2014}, - url = {https://doi.org/10.1109/IPDPS.2014.110}, -} -``` - -A fourth paper, submitted to ACM TOMS, also exists, which proposes an -[analytical model](https://dl.acm.org/doi/10.1145/2925987) -for determining blocksize parameters in BLIS -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf)): - -``` -@article{BLIS4, - author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and - Enrique S. Quintana-Ort\'{\i}}, - title = {Analytical Modeling Is Enough for High-Performance {BLIS}}, - journal = {ACM Transactions on Mathematical Software}, - volume = {43}, - number = {2}, - pages = {12:1--12:18}, - month = {August}, - year = {2016}, - issue_date = {August 2016}, - url = {http://doi.acm.org/10.1145/2925987}, -} -``` - -A fifth paper, submitted to ACM TOMS, begins the study of so-called -[induced methods for complex matrix multiplication]( https://dl.acm.org/doi/10.1145/3086466?cid=81314495332) -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf)): - -``` -@article{BLIS5, - author = {Field G. {V}an~{Z}ee and Tyler Smith}, - title = {Implementing High-performance Complex Matrix Multiplication via the 3m and 4m Methods}, - journal = {ACM Transactions on Mathematical Software}, - volume = {44}, - number = {1}, - pages = {7:1--7:36}, - month = {July}, - year = {2017}, - issue_date = {July 2017}, - url = {http://doi.acm.org/10.1145/3086466}, -} -``` - -A sixth paper, submitted to ACM TOMS, revisits the topic of the previous -article and derives a -[superior induced method](https://epubs.siam.org/doi/10.1137/19M1282040) -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis6_sisc_rev3.pdf)): - -``` -@article{BLIS6, - author = {Field G. 
{V}an~{Z}ee}, - title = {Implementing High-Performance Complex Matrix Multiplication via the 1m Method}, - journal = {SIAM Journal on Scientific Computing}, - volume = {42}, - number = {5}, - pages = {C221--C244}, - month = {September} - year = {2020}, - issue_date = {September 2020}, - url = {https://doi.org/10.1137/19M1282040} -} -``` - -A seventh paper, submitted to ACM TOMS, explores the implementation of `gemm` for -[mixed-domain and/or mixed-precision](https://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf) operands -([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf)): - -``` -@article{BLIS7, - author = {Field G. {V}an~{Z}ee and Devangi N. Parikh and Robert A. van~de~{G}eijn}, - title = {Supporting Mixed-domain Mixed-precision Matrix Multiplication -within the BLIS Framework}, - journal = {ACM Transactions on Mathematical Software}, - note = {submitted} -} -``` - -Funding -------- - -This project and its associated research were partially sponsored by grants from -[Microsoft](https://www.microsoft.com/), -[Intel](https://www.intel.com/), -[Texas Instruments](https://www.ti.com/), -[AMD](https://www.amd.com/), -[HPE](https://www.hpe.com/), -[Oracle](https://www.oracle.com/), -[Huawei](https://www.huawei.com/), -and -[Facebook](https://www.facebook.com/), -as well as grants from the -[National Science Foundation](https://www.nsf.gov/) (Awards -CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493). - -_Any opinions, findings and conclusions or recommendations expressed in this -material are those of the author(s) and do not necessarily reflect the views of -the National Science Foundation (NSF)._ +The upstream repository (https://github.com/flame/blis) contains further information on BLIS, including background information on BLIS design, usage examples, and a complete BLIS API reference. +AOCL-BLAS is developed and maintained by AMD. You can contact us on the email-id toolchainsupport@amd.com. 
You can also raise any issue/suggestion on the GitHub repository at https://github.com/amd/blis/issues. From 7b2924c0792fc3fb0ce9ef7ccb16d2a0714c7733 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Thu, 1 Jun 2023 17:29:37 +0530 Subject: [PATCH 106/226] Updating object library targets in CMakeLists.txt for zen4 based on configuration AMD-Internal: [CPUPL-3516] Change-Id: Ibfe66f50fa77d4011829d8386f0a91f140d38335 --- CMakeLists.txt | 57 ++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a9c3bc803..a1f89a6682 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -614,25 +614,36 @@ set(BLIS_VERSION_STRING ${BLIS_VERSION}) string(TIMESTAMP BUILD_DATE "%Y%m%d") add_definitions(-DBLIS_VERSION_STRING="AOCL-BLIS ${BLIS_VERSION_STRING} Build ${BUILD_DATE}") +# Set object libraries created in kernels directory to be added into BLIS library. +set(OBJECT_LIBRARIES + $ + $ + $ + $ + $ + $ + $ + $ + $ +) +# Amend the list of object libraries to include zen4 paths as appropriate. 
+if(${TARGET_ARCH} STREQUAL zen4 OR + ${TARGET_ARCH} STREQUAL amdzen) + set(OBJECT_LIBRARIES ${OBJECT_LIBRARIES} + $ + $ + $ + $ + $ + $ + ) +endif() + if(BUILD_SHARED_LIBS) add_library("${PROJECT_NAME}" SHARED ${CMAKE_SOURCE_DIR}/bli_config.h ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h ${headers} - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ + ${OBJECT_LIBRARIES} ) if(ENABLE_OPENMP) target_link_libraries("${PROJECT_NAME}" PRIVATE OpenMP::OpenMP_CXX) @@ -644,21 +655,7 @@ if(NOT BUILD_SHARED_LIBS) add_library("${PROJECT_NAME}" STATIC ${CMAKE_SOURCE_DIR}/bli_config.h ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h ${headers} - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ - $ + ${OBJECT_LIBRARIES} ) if(ENABLE_OPENMP) set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${OpenMP_libomp_LIBRARY}") From 94a4abe2e5a856c290c5936c780c82995e9e6901 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 7 Jun 2023 06:32:11 -0400 Subject: [PATCH 107/226] BLIS: Incorrect ifdef in cblas.h and cblas_f77.h Remove unnecessary ifdef BLIS_ENABLE_CBLAS statement from cblas.h and cblas_f77.h. These were erroneously added when fixing the --disable-blas functionality but are not needed in the CBLAS headers, as these files will not be generated when BLAS or CBLAS is disabled. 
This is a fix to commit 5bd2a777ba7fb939420899cd968fec5cabbc894a AMD-Internal: [CPUPL-3541] Change-Id: If38bd795d31098a7023d575672b0a913338c0d2d --- frame/compat/cblas/src/cblas.h | 4 ---- frame/compat/cblas/src/cblas_f77.h | 4 ---- 2 files changed, 8 deletions(-) diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h index a880d4a5f4..dcccb07baa 100644 --- a/frame/compat/cblas/src/cblas.h +++ b/frame/compat/cblas/src/cblas.h @@ -28,8 +28,6 @@ */ -#ifdef BLIS_ENABLE_CBLAS - #ifndef CBLAS_H #define CBLAS_H #include @@ -999,5 +997,3 @@ BLIS_EXPORT_BLAS f77_int cblas_izamin(f77_int N, const void *X, f77_int incX); } #endif #endif - -#endif // BLIS_ENABLE_CBLAS diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h index 355a3a7418..be02986ae7 100644 --- a/frame/compat/cblas/src/cblas_f77.h +++ b/frame/compat/cblas/src/cblas_f77.h @@ -11,8 +11,6 @@ * */ -#ifdef BLIS_ENABLE_CBLAS - #ifndef CBLAS_F77_H #define CBLAS_F77_H @@ -394,5 +392,3 @@ #endif #endif /* CBLAS_F77_H */ - -#endif // BLIS_ENABLE_CBLAS From 7b35a1283bf5dcba4a5367f3989b7ec6e4540763 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Fri, 16 Jun 2023 22:04:09 +0530 Subject: [PATCH 108/226] Updating CMake to select the correct Windows runtime libraries. - Upgraded to 3.15 as minimum version of CMake. - Used CMAKE_MSVC_RUNTIME_LIBRARY instead of CMAKE_C_FLAGS to set MT and MD flags correctly. AMD-Internal: [CPUPL-3559] Change-Id: Ib82821d245b6acaa1399166219168ad2535d8d92 --- CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a1f89a6682..4051c79c9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ ##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. 
All rights reserved.## -cmake_minimum_required(VERSION 3.0.0) +cmake_minimum_required(VERSION 3.15.0) project(AOCL-LibBlis-Win C CXX) @@ -260,11 +260,13 @@ endif () include_directories(${PROJECT_SOURCE_DIR}/external/msvc) add_definitions(-D_CRT_SECURE_NO_WARNINGS) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MD ") -#add_definitions(-DBLIS_IS_BUILDING_LIBRARY) -if(NOT BUILD_SHARED_LIBS) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MT ") -add_definitions(-DBLIS_IS_BUILDING_LIBRARY) + +cmake_policy(SET CMP0091 NEW) +if(BUILD_SHARED_LIBS) + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") +else() + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + add_definitions(-DBLIS_IS_BUILDING_LIBRARY) endif() if(ENABLE_MULTITHREADING) From 003d1e9ae68ffc7742380accf95c3b1d6878e798 Mon Sep 17 00:00:00 2001 From: jagar Date: Wed, 21 Jun 2023 16:34:15 +0530 Subject: [PATCH 109/226] GTestSuite: Using ELEMENT_TYPE to specify generation of random numbers in tests. Since random numbers are specified from ELEMENT_TYPE and we never generate tests for both integer and floating point numbers at the same time, we update code as described below: - random vector/matrix generators are updated to use ELEMENT_TYPE as a default parameter. - ::testing::Values(ELEMENT_TYPE) is removed from all test generators. 
AMD-Internal: [CPUPL-2732] Change-Id: Ibc6b05044502f541c9e8a7687931b1ca2903fb0c --- gtestsuite/testinghelpers/CMakeLists.txt | 1 + .../inc/common/data_generators.h | 8 +-- .../src/common/data_generators.cpp | 11 ++-- gtestsuite/testsuite/level1/addv/addv.h | 2 +- .../testsuite/level1/addv/caddv_generic.cpp | 15 ++--- .../testsuite/level1/addv/daddv_generic.cpp | 15 ++--- .../testsuite/level1/addv/saddv_generic.cpp | 15 ++--- gtestsuite/testsuite/level1/addv/test_addv.h | 15 +++-- .../testsuite/level1/addv/zaddv_generic.cpp | 15 ++--- gtestsuite/testsuite/level1/amaxv/amaxv.h | 2 +- .../testsuite/level1/amaxv/camaxv_generic.cpp | 19 ++---- .../testsuite/level1/amaxv/damaxv_generic.cpp | 19 ++---- .../testsuite/level1/amaxv/samaxv_generic.cpp | 19 ++---- .../testsuite/level1/amaxv/test_amaxv.h | 12 ++-- .../testsuite/level1/amaxv/zamaxv_generic.cpp | 19 ++---- gtestsuite/testsuite/level1/axpbyv/axpbyv.h | 2 +- .../level1/axpbyv/caxpbyv_generic.cpp | 22 +++---- .../level1/axpbyv/daxpbyv_generic.cpp | 25 +++----- .../level1/axpbyv/saxpbyv_generic.cpp | 25 +++----- .../testsuite/level1/axpbyv/test_axpbyv.h | 14 ++--- .../level1/axpbyv/zaxpbyv_generic.cpp | 22 +++---- gtestsuite/testsuite/level1/axpyv/axpyv.h | 2 +- .../testsuite/level1/axpyv/caxpyv_generic.cpp | 22 +++---- .../testsuite/level1/axpyv/daxpyv_generic.cpp | 25 +++----- .../testsuite/level1/axpyv/saxpyv_generic.cpp | 25 +++----- .../testsuite/level1/axpyv/test_axpyv.h | 9 +-- .../testsuite/level1/axpyv/zaxpyv_generic.cpp | 22 +++---- .../testsuite/level1/copyv/ccopyv_generic.cpp | 22 +++---- gtestsuite/testsuite/level1/copyv/copyv.h | 2 +- .../testsuite/level1/copyv/dcopyv_generic.cpp | 25 +++----- .../testsuite/level1/copyv/scopyv_generic.cpp | 25 +++----- .../testsuite/level1/copyv/test_copyv.h | 13 ++-- .../testsuite/level1/copyv/zcopyv_generic.cpp | 22 +++---- .../testsuite/level1/dotv/cdotv_generic.cpp | 22 +++---- .../testsuite/level1/dotv/ddotv_generic.cpp | 23 +++---- 
gtestsuite/testsuite/level1/dotv/dotv.h | 2 +- .../testsuite/level1/dotv/sdotv_generic.cpp | 25 +++----- gtestsuite/testsuite/level1/dotv/test_dotv.h | 14 ++--- .../testsuite/level1/dotv/zdotv_generic.cpp | 22 +++---- .../testsuite/level1/dotxv/cdotxv_generic.cpp | 22 +++---- .../testsuite/level1/dotxv/ddotxv_generic.cpp | 21 +++---- gtestsuite/testsuite/level1/dotxv/dotxv.h | 2 +- .../testsuite/level1/dotxv/sdotxv_generic.cpp | 21 +++---- .../testsuite/level1/dotxv/test_dotxv.h | 12 ++-- .../testsuite/level1/dotxv/zdotxv_generic.cpp | 18 ++---- .../level1/scal2v/cscal2v_generic.cpp | 59 ++++++++--------- .../level1/scal2v/dscal2v_generic.cpp | 62 ++++++++---------- gtestsuite/testsuite/level1/scal2v/scal2v.h | 2 +- .../level1/scal2v/sscal2v_generic.cpp | 63 +++++++++---------- .../testsuite/level1/scal2v/test_scal2v.h | 10 +-- .../level1/scal2v/zscal2v_generic.cpp | 59 ++++++++--------- .../testsuite/level1/scalv/cscalv_generic.cpp | 58 ++++++++--------- .../testsuite/level1/scalv/dscalv_generic.cpp | 61 ++++++++---------- gtestsuite/testsuite/level1/scalv/scalv.h | 2 +- .../level1/scalv/scalv_extreme_cases.cpp | 6 +- .../testsuite/level1/scalv/sscalv_generic.cpp | 63 ++++++++----------- .../testsuite/level1/scalv/test_scalv.h | 10 +-- .../testsuite/level1/scalv/zscalv_generic.cpp | 58 ++++++++--------- .../testsuite/level1/setv/csetv_generic.cpp | 2 +- .../testsuite/level1/setv/dsetv_generic.cpp | 2 +- gtestsuite/testsuite/level1/setv/setv.h | 2 +- .../testsuite/level1/setv/ssetv_generic.cpp | 2 +- gtestsuite/testsuite/level1/setv/test_setv.h | 5 +- .../testsuite/level1/setv/zsetv_generic.cpp | 2 +- .../testsuite/level1/subv/csubv_generic.cpp | 15 ++--- .../testsuite/level1/subv/dsubv_generic.cpp | 15 ++--- .../testsuite/level1/subv/ssubv_generic.cpp | 15 ++--- gtestsuite/testsuite/level1/subv/subv.h | 2 +- gtestsuite/testsuite/level1/subv/test_subv.h | 15 +++-- .../testsuite/level1/subv/zsubv_generic.cpp | 15 ++--- .../testsuite/level1/xpbyv/cxpbyv_generic.cpp | 
17 ++--- .../testsuite/level1/xpbyv/dxpbyv_generic.cpp | 22 +++---- .../testsuite/level1/xpbyv/sxpbyv_generic.cpp | 22 +++---- .../testsuite/level1/xpbyv/test_xpbyv.h | 16 ++--- gtestsuite/testsuite/level1/xpbyv/xpbyv.h | 2 +- .../testsuite/level1/xpbyv/zxpbyv_generic.cpp | 19 ++---- .../testsuite/level2/gemv/cgemv_generic.cpp | 20 +++--- .../testsuite/level2/gemv/dgemv_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/gemv/gemv.h | 2 +- .../testsuite/level2/gemv/sgemv_generic.cpp | 18 ++---- gtestsuite/testsuite/level2/gemv/test_gemv.h | 15 +++-- .../testsuite/level2/gemv/zgemv_generic.cpp | 20 +++--- .../testsuite/level2/ger/cger_generic.cpp | 20 +++--- .../testsuite/level2/ger/dger_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/ger/ger.h | 2 +- .../testsuite/level2/ger/sger_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/ger/test_ger.h | 15 +++-- .../testsuite/level2/ger/zger_generic.cpp | 20 +++--- .../testsuite/level2/hemv/chemv_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/hemv/hemv.h | 2 +- gtestsuite/testsuite/level2/hemv/test_hemv.h | 15 +++-- .../testsuite/level2/hemv/zhemv_generic.cpp | 20 +++--- .../testsuite/level2/her/cher_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/her/her.h | 2 +- gtestsuite/testsuite/level2/her/test_her.h | 12 ++-- .../testsuite/level2/her/zher_generic.cpp | 20 +++--- .../testsuite/level2/her2/cher2_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/her2/her2.h | 2 +- gtestsuite/testsuite/level2/her2/test_her2.h | 15 +++-- .../testsuite/level2/her2/zher2_generic.cpp | 20 +++--- .../testsuite/level2/symv/dsymv_generic.cpp | 20 +++--- .../testsuite/level2/symv/ssymv_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/symv/symv.h | 2 +- gtestsuite/testsuite/level2/symv/test_symv.h | 15 +++-- .../testsuite/level2/syr/dsyr_generic.cpp | 20 +++--- .../testsuite/level2/syr/ssyr_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/syr/syr.h | 2 +- gtestsuite/testsuite/level2/syr/test_syr.h | 12 ++-- 
.../testsuite/level2/syr2/dsyr2_generic.cpp | 20 +++--- .../testsuite/level2/syr2/ssyr2_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/syr2/syr2.h | 2 +- gtestsuite/testsuite/level2/syr2/test_syr2.h | 15 +++-- .../testsuite/level2/trmv/ctrmv_generic.cpp | 20 +++--- .../testsuite/level2/trmv/dtrmv_generic.cpp | 20 +++--- .../testsuite/level2/trmv/strmv_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/trmv/test_trmv.h | 12 ++-- gtestsuite/testsuite/level2/trmv/trmv.h | 2 +- .../testsuite/level2/trmv/ztrmv_generic.cpp | 20 +++--- .../testsuite/level2/trsv/ctrsv_generic.cpp | 20 +++--- .../testsuite/level2/trsv/dtrsv_generic.cpp | 20 +++--- .../testsuite/level2/trsv/strsv_generic.cpp | 20 +++--- gtestsuite/testsuite/level2/trsv/test_trsv.h | 12 ++-- gtestsuite/testsuite/level2/trsv/trsv.h | 2 +- .../testsuite/level2/trsv/ztrsv_generic.cpp | 20 +++--- .../testsuite/level3/gemm/cgemm_generic.cpp | 19 +++--- .../testsuite/level3/gemm/dgemm_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/gemm/gemm.h | 2 +- .../testsuite/level3/gemm/sgemm_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/gemm/test_gemm.h | 21 +++---- .../testsuite/level3/gemm/zgemm_generic.cpp | 19 +++--- .../testsuite/level3/gemmt/cgemmt_generic.cpp | 20 +++--- .../testsuite/level3/gemmt/dgemmt_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/gemmt/gemmt.h | 3 +- .../testsuite/level3/gemmt/sgemmt_generic.cpp | 19 +++--- .../testsuite/level3/gemmt/test_gemmt.h | 20 +++--- .../testsuite/level3/gemmt/zgemmt_generic.cpp | 19 +++--- .../testsuite/level3/hemm/chemm_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/hemm/hemm.h | 2 +- gtestsuite/testsuite/level3/hemm/test_hemm.h | 22 +++---- .../testsuite/level3/hemm/zhemm_generic.cpp | 19 +++--- .../testsuite/level3/her2k/cher2k_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/her2k/her2k.h | 2 +- .../testsuite/level3/her2k/test_her2k.h | 22 +++---- .../testsuite/level3/her2k/zher2k_generic.cpp | 19 +++--- 
.../testsuite/level3/herk/cherk_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/herk/herk.h | 2 +- gtestsuite/testsuite/level3/herk/test_herk.h | 19 +++--- .../testsuite/level3/herk/zherk_generic.cpp | 35 +++++------ .../testsuite/level3/symm/csymm_generic.cpp | 19 +++--- .../testsuite/level3/symm/dsymm_generic.cpp | 19 +++--- .../testsuite/level3/symm/ssymm_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/symm/symm.h | 2 +- gtestsuite/testsuite/level3/symm/test_symm.h | 23 +++---- .../testsuite/level3/symm/zsymm_generic.cpp | 19 +++--- .../testsuite/level3/syr2k/csyr2k_generic.cpp | 37 +++++------ .../testsuite/level3/syr2k/dsyr2k_generic.cpp | 37 +++++------ .../testsuite/level3/syr2k/ssyr2k_generic.cpp | 41 ++++++------ .../testsuite/level3/syr2k/test_syr2k.h | 24 ++++--- .../testsuite/level3/syr2k/zsyr2k_generic.cpp | 19 +++--- .../testsuite/level3/syrk/csyrk_generic.cpp | 19 +++--- .../testsuite/level3/syrk/dsyrk_generic.cpp | 19 +++--- .../testsuite/level3/syrk/ssyrk_generic.cpp | 35 +++++------ gtestsuite/testsuite/level3/syrk/syrk.h | 2 +- gtestsuite/testsuite/level3/syrk/test_syrk.h | 19 +++--- .../testsuite/level3/syrk/zsyrk_generic.cpp | 19 +++--- .../testsuite/level3/trmm/ctrmm_generic.cpp | 19 +++--- .../testsuite/level3/trmm/dtrmm_generic.cpp | 19 +++--- .../testsuite/level3/trmm/strmm_generic.cpp | 19 +++--- gtestsuite/testsuite/level3/trmm/test_trmm.h | 17 +++-- gtestsuite/testsuite/level3/trmm/trmm.h | 2 +- .../testsuite/level3/trmm/ztrmm_generic.cpp | 19 +++--- .../testsuite/level3/trmm3/ctrmm3_generic.cpp | 19 +++--- .../testsuite/level3/trmm3/dtrmm3_generic.cpp | 19 +++--- .../testsuite/level3/trmm3/strmm3_generic.cpp | 19 +++--- .../testsuite/level3/trmm3/test_trmm3.h | 20 +++--- gtestsuite/testsuite/level3/trmm3/trmm3.h | 2 +- .../testsuite/level3/trmm3/ztrmm3_generic.cpp | 19 +++--- .../testsuite/level3/trsm/ctrsm_generic.cpp | 19 +++--- .../testsuite/level3/trsm/dtrsm_generic.cpp | 19 +++--- .../testsuite/level3/trsm/strsm_generic.cpp 
| 19 +++--- gtestsuite/testsuite/level3/trsm/test_trsm.h | 20 +++--- gtestsuite/testsuite/level3/trsm/trsm.h | 2 +- .../testsuite/level3/trsm/ztrsm_generic.cpp | 19 +++--- .../testsuite/util/nrm2/cnrm2_generic.cpp | 15 ++--- .../testsuite/util/nrm2/dnrm2_generic.cpp | 13 ++-- gtestsuite/testsuite/util/nrm2/nrm2.h | 2 +- .../testsuite/util/nrm2/nrm2_extreme_vals.cpp | 2 +- .../testsuite/util/nrm2/snrm2_generic.cpp | 15 ++--- gtestsuite/testsuite/util/nrm2/test_nrm2.h | 14 ++--- .../testsuite/util/nrm2/znrm2_generic.cpp | 15 ++--- 190 files changed, 1272 insertions(+), 1999 deletions(-) diff --git a/gtestsuite/testinghelpers/CMakeLists.txt b/gtestsuite/testinghelpers/CMakeLists.txt index ab120e52da..b7ef2cc3b4 100644 --- a/gtestsuite/testinghelpers/CMakeLists.txt +++ b/gtestsuite/testinghelpers/CMakeLists.txt @@ -52,5 +52,6 @@ if(INT_SIZE STREQUAL "32") else() target_compile_definitions(testinghelpers PUBLIC INT_SIZE=64) endif() +target_compile_definitions(testinghelpers PUBLIC ELEMENT_TYPE='${ELEMENT_TYPE}') target_include_directories(testinghelpers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc ${BLIS_INCLUDE}) target_link_libraries(testinghelpers pthread) diff --git a/gtestsuite/testinghelpers/inc/common/data_generators.h b/gtestsuite/testinghelpers/inc/common/data_generators.h index 01bae20650..9656cc219c 100644 --- a/gtestsuite/testinghelpers/inc/common/data_generators.h +++ b/gtestsuite/testinghelpers/inc/common/data_generators.h @@ -72,19 +72,19 @@ void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a template void randomgenerators(int from, int to, char storage, char uplo, gtint_t m, - T* a, gtint_t lda, char datatype); + T* a, gtint_t lda, char fp ); } //end of namespace datagenerators template std::vector get_random_matrix(int from, int to, char storage, char trans, gtint_t m, gtint_t n, - gtint_t lda, char datatype); + gtint_t lda, char datatype = ELEMENT_TYPE ); template std::vector get_random_matrix(int from, int to, char storage, char 
uplo, gtint_t k, - gtint_t lda, char datatype); + gtint_t lda, char datatype = ELEMENT_TYPE ); template -std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx, char datatype); +std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx,char datatype = ELEMENT_TYPE); template std::vector get_vector( gtint_t n, gtint_t incx, T value ); diff --git a/gtestsuite/testinghelpers/src/common/data_generators.cpp b/gtestsuite/testinghelpers/src/common/data_generators.cpp index c77af67cd5..afe5650e9d 100644 --- a/gtestsuite/testinghelpers/src/common/data_generators.cpp +++ b/gtestsuite/testinghelpers/src/common/data_generators.cpp @@ -313,7 +313,7 @@ void randomgenerators( int from, int to, char storage, gtint_t m, gtint_t n, template void randomgenerators(int from, int to, char storage, char uplo, gtint_t k, - T* a, gtint_t lda, char datatype) { + T* a, gtint_t lda, char datatype ) { randomgenerators(from, to, storage, k, k, a, lda, datatype); if( (storage=='c')||(storage=='C') ) { @@ -359,14 +359,15 @@ void randomgenerators(int from, int to, char storage, char uplo, gtint_t k, template std::vector get_random_matrix(int from, int to, char storage, char trans, gtint_t m, gtint_t n, - gtint_t lda, char datatype) + gtint_t lda, char datatype ) { std::vector a(matsize(storage, trans, m, n, lda)); testinghelpers::datagenerators::randomgenerators( from, to, storage, m, n, a.data(), trans, lda, datatype ); return a; } + template -std::vector get_random_matrix(int from, int to, char storage, char uplo, gtint_t k, gtint_t lda, char datatype) +std::vector get_random_matrix(int from, int to, char storage, char uplo, gtint_t k, gtint_t lda, char datatype ) { // Create matrix for the given sizes. 
std::vector a( testinghelpers::matsize( storage, 'n', k, k, lda ) ); @@ -375,7 +376,7 @@ std::vector get_random_matrix(int from, int to, char storage, char uplo, gtin } template -std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx, char datatype) +std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx, char datatype ) { // Create vector for the given sizes. std::vector x( testinghelpers::buff_dim(n, incx) ); @@ -383,8 +384,6 @@ std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx, char return x; } - - template void set_vector( gtint_t n, gtint_t incx, T* x, T value ) { diff --git a/gtestsuite/testsuite/level1/addv/addv.h b/gtestsuite/testsuite/level1/addv/addv.h index ed392dedc5..e28a91a99d 100644 --- a/gtestsuite/testsuite/level1/addv/addv.h +++ b/gtestsuite/testsuite/level1/addv/addv.h @@ -79,4 +79,4 @@ static void addv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) #else throw std::runtime_error("Error in testsuite/level1/addv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/addv/caddv_generic.cpp b/gtestsuite/testsuite/level1/addv/caddv_generic.cpp index 94f3621c5b..0cbf65b466 100644 --- a/gtestsuite/testsuite/level1/addv/caddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/caddv_generic.cpp @@ -36,7 +36,7 @@ #include "test_addv.h" class caddvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(caddvGenericTest); @@ -55,8 +55,6 @@ TEST_P( caddvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( caddvGenericTest, RandomData ) 
//---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_addv(conj_x, n, incx, incy, thresh, datatype); + test_addv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class caddvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_caddv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class caddvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n','c'), // n: not transpose for x, c: conjugate for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::caddvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/addv/daddv_generic.cpp b/gtestsuite/testsuite/level1/addv/daddv_generic.cpp index e9d5835cba..c700131423 100644 --- a/gtestsuite/testsuite/level1/addv/daddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/daddv_generic.cpp @@ -36,7 +36,7 @@ #include "test_addv.h" class daddvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(daddvGenericTest); @@ -55,8 +55,6 @@ TEST_P( daddvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( daddvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_addv(conj_x, n, incx, incy, thresh, datatype); + test_addv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class daddvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_daddv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class daddvGenericTestPrint { str_name += 
"_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: not transpose for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::daddvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/addv/saddv_generic.cpp b/gtestsuite/testsuite/level1/addv/saddv_generic.cpp index 0d1da47652..4b4820e8c6 100644 --- a/gtestsuite/testsuite/level1/addv/saddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/saddv_generic.cpp @@ -36,7 +36,7 @@ #include "test_addv.h" class saddvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(saddvGenericTest); @@ -55,8 +55,6 @@ TEST_P( saddvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( saddvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_addv(conj_x, n, incx, incy, thresh, datatype); + test_addv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class saddvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> 
str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_saddv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class saddvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: not transpose for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::saddvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/addv/test_addv.h b/gtestsuite/testsuite/level1/addv/test_addv.h index 0b66675b65..a535e404f3 100644 --- a/gtestsuite/testsuite/level1/addv/test_addv.h +++ b/gtestsuite/testsuite/level1/addv/test_addv.h @@ -43,29 +43,28 @@ */ template -void test_addv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, - double thresh, char datatype ) { - +void test_addv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. 
//---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- // Create a copy of y so that we can check reference results. std::vector y_ref(y); - testinghelpers::ref_addv(conjx, n, x.data(), incx, y_ref.data(), incy); + testinghelpers::ref_addv( conjx, n, x.data(), incx, y_ref.data(), incy ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - addv(conjx, n, x.data(), incx, y.data(), incy); + addv( conjx, n, x.data(), incx, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp b/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp index ed7796d36b..df4d60beb3 100644 --- a/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp @@ -36,7 +36,7 @@ #include "test_addv.h" class ZAddvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ZAddvGenericTest); @@ -55,8 +55,6 @@ TEST_P( ZAddvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( ZAddvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_addv(conj_x, n, incx, incy, thresh, datatype); + test_addv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class ZAddvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_zaddv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class ZAddvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? 
std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n','c'), // n: not transpose for x, c: conjugate for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::ZAddvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/amaxv/amaxv.h b/gtestsuite/testsuite/level1/amaxv/amaxv.h index 4479263e2b..04f76e42f3 100644 --- a/gtestsuite/testsuite/level1/amaxv/amaxv.h +++ b/gtestsuite/testsuite/level1/amaxv/amaxv.h @@ -114,4 +114,4 @@ static gtint_t amaxv(gtint_t n, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/amaxv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp index d6dcd7f282..27799b0965 100644 --- a/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp @@ -37,8 +37,7 @@ class camaxvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. 
TEST_P( camaxvGenericTest, RandomData ) @@ -52,8 +51,6 @@ TEST_P( camaxvGenericTest, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -61,7 +58,7 @@ TEST_P( camaxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_amaxv(n, incx, thresh, datatype); + test_amaxv( n, incx, thresh ); } // Used to generate a test case with a sensible name. @@ -71,10 +68,9 @@ TEST_P( camaxvGenericTest, RandomData ) class camaxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "icamax_"; #elif TEST_CBLAS @@ -85,7 +81,6 @@ class camaxvGenericTestPrint { str_name += "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name += "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -96,8 +91,7 @@ INSTANTIATE_TEST_SUITE_P( camaxvGenericTest, ::testing::Combine( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
- ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for x ), ::camaxvGenericTestPrint() ); @@ -110,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( camaxvGenericTest, ::testing::Combine( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector - ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::camaxvGenericTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp index f95871c8d2..1410daefa0 100644 --- a/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp @@ -37,8 +37,7 @@ class damaxvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( damaxvGenericTest, RandomData ) @@ -52,8 +51,6 @@ TEST_P( damaxvGenericTest, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -61,7 +58,7 @@ TEST_P( damaxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_amaxv(n, incx, thresh, datatype); + test_amaxv( n, incx, thresh ); } // Used to generate a test case with a sensible name. 
@@ -71,10 +68,9 @@ TEST_P( damaxvGenericTest, RandomData ) class damaxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "idamax_"; #elif TEST_CBLAS @@ -85,7 +81,6 @@ class damaxvGenericTestPrint { str_name += "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name += "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -96,8 +91,7 @@ INSTANTIATE_TEST_SUITE_P( damaxvGenericTest, ::testing::Combine( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for x ), ::damaxvGenericTestPrint() ); @@ -110,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( damaxvGenericTest, ::testing::Combine( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector - ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::damaxvGenericTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp index 11aa87c216..acd0f38bb7 100644 --- a/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp @@ -37,8 +37,7 @@ class samaxvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. 
TEST_P( samaxvGenericTest, RandomData ) @@ -52,8 +51,6 @@ TEST_P( samaxvGenericTest, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -61,7 +58,7 @@ TEST_P( samaxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_amaxv(n, incx, thresh, datatype); + test_amaxv( n, incx, thresh ); } // Used to generate a test case with a sensible name. @@ -71,10 +68,9 @@ TEST_P( samaxvGenericTest, RandomData ) class samaxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "isamax_"; #elif TEST_CBLAS @@ -85,7 +81,6 @@ class samaxvGenericTestPrint { str_name += "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name += "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -96,8 +91,7 @@ INSTANTIATE_TEST_SUITE_P( samaxvGenericTest, ::testing::Combine( ::testing::Range(gtint_t(10), gtint_t(101), 10), // n size of vector takes values from 10 to 100 with step size of 10. 
- ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for x ), ::samaxvGenericTestPrint() ); @@ -110,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( samaxvGenericTest, ::testing::Combine( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector - ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::samaxvGenericTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/amaxv/test_amaxv.h b/gtestsuite/testsuite/level1/amaxv/test_amaxv.h index e723cc33da..0d2ea890dc 100644 --- a/gtestsuite/testsuite/level1/amaxv/test_amaxv.h +++ b/gtestsuite/testsuite/level1/amaxv/test_amaxv.h @@ -43,25 +43,25 @@ */ template -void test_amaxv( gtint_t n, gtint_t incx, double thresh, char datatype ) { - +void test_amaxv( gtint_t n, gtint_t incx, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- - gtint_t idx_ref = testinghelpers::ref_amaxv(n, x.data(), incx); + gtint_t idx_ref = testinghelpers::ref_amaxv( n, x.data(), incx ); //---------------------------------------------------------- // Call BLIS function. 
//---------------------------------------------------------- - gtint_t idx = amaxv(n, x.data(), incx); + gtint_t idx = amaxv( n, x.data(), incx ); //---------------------------------------------------------- // Compute component-wise error. //---------------------------------------------------------- computediff( idx, idx_ref ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp index d8534c3da6..b6b1155273 100644 --- a/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp @@ -37,8 +37,7 @@ class zamaxvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( zamaxvGenericTest, RandomData ) @@ -52,8 +51,6 @@ TEST_P( zamaxvGenericTest, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -61,7 +58,7 @@ TEST_P( zamaxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_amaxv(n, incx, thresh, datatype); + test_amaxv( n, incx, thresh ); } // Used to generate a test case with a sensible name. 
@@ -71,10 +68,9 @@ TEST_P( zamaxvGenericTest, RandomData ) class zamaxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "izamax_"; #elif TEST_CBLAS @@ -85,7 +81,6 @@ class zamaxvGenericTestPrint { str_name += "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name += "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -96,8 +91,7 @@ INSTANTIATE_TEST_SUITE_P( zamaxvGenericTest, ::testing::Combine( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for x ), ::zamaxvGenericTestPrint() ); @@ -110,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( zamaxvGenericTest, ::testing::Combine( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector - ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::zamaxvGenericTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/axpbyv.h index 0c415e1b0c..7d955cd7e7 100644 --- a/gtestsuite/testsuite/level1/axpbyv/axpbyv.h +++ b/gtestsuite/testsuite/level1/axpbyv/axpbyv.h @@ -111,4 +111,4 @@ static void axpbyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T beta, #else throw std::runtime_error("Error in testsuite/level1/axpbyv.h: No interfaces are set to be tested."); #endif -} +} \ No newline 
at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp index e4a4c80c03..bb277c300a 100644 --- a/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp @@ -41,8 +41,7 @@ class caxpbyvGenericTest : gtint_t, gtint_t, scomplex, - scomplex, - char>> {}; + scomplex>> {}; // Tests using random integers as vector elements. TEST_P( caxpbyvGenericTest, RandomData ) { @@ -63,8 +62,6 @@ TEST_P( caxpbyvGenericTest, RandomData ) T alpha = std::get<4>(GetParam()); // beta T beta = std::get<5>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<6>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); @@ -72,7 +69,7 @@ TEST_P( caxpbyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpbyv(conj_x, n, incx, incy, alpha, beta, thresh, datatype); + test_axpbyv( conj_x, n, incx, incy, alpha, beta, thresh ); } // Used to generate a test case with a sensible name. @@ -82,14 +79,13 @@ TEST_P( caxpbyvGenericTest, RandomData ) class caxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); scomplex alpha = std::get<4>(str.param); scomplex beta = std::get<5>(str.param); - char datatype = std::get<6>(str.param); #ifdef TEST_BLAS std::string str_name = "caxpby_"; #elif TEST_CBLAS @@ -109,7 +105,6 @@ class caxpbyvGenericTestPrint { beta_str = beta_str + "pi" + (( beta.imag > 0) ? 
std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -128,8 +123,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // alpha - ::testing::Values(scomplex{1.0, 2.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{1.0, 2.0}) // beta ), ::caxpbyvGenericTestPrint() ); @@ -150,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2)), // stride size for x ::testing::Values(gtint_t(3)), // stride size for y ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(scomplex{1.0, -2.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{1.0, -2.0}) // beta ), ::caxpbyvGenericTestPrint() ); @@ -169,9 +162,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(-11), gtint_t(5)), // stride size for x ::testing::Values(gtint_t(-3), gtint_t(7)), // stride size for y ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(scomplex{1.0, -2.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{1.0, -2.0}) // beta ), ::caxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp index efc2770ab2..181466bf6e 100644 --- a/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp @@ -41,8 +41,7 @@ class daxpbyvGenericTest : gtint_t, gtint_t, double, - double, - char>> {}; + double>> {}; // Tests using random integers as vector elements. 
TEST_P( daxpbyvGenericTest, RandomData ) { @@ -63,8 +62,6 @@ TEST_P( daxpbyvGenericTest, RandomData ) T alpha = std::get<4>(GetParam()); // beta T beta = std::get<5>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<6>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -72,7 +69,7 @@ TEST_P( daxpbyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpbyv(conj_x, n, incx, incy, alpha, beta, thresh, datatype); + test_axpbyv( conj_x, n, incx, incy, alpha, beta, thresh ); } // Used to generate a test case with a sensible name. @@ -82,14 +79,13 @@ TEST_P( daxpbyvGenericTest, RandomData ) class daxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); double alpha = std::get<4>(str.param); double beta = std::get<5>(str.param); - char datatype = std::get<6>(str.param); #ifdef TEST_BLAS std::string str_name = "daxpby_"; #elif TEST_CBLAS @@ -107,7 +103,6 @@ class daxpbyvGenericTestPrint { str_name = str_name + "_a" + alpha_str; std::string beta_str = ( beta > 0) ? 
std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -122,8 +117,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(double(2.0), double(-2.0)), // alpha - ::testing::Values(double(-1.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(-1.0)) // beta ), ::daxpbyvGenericTestPrint() ); @@ -141,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(double(2.0)), // alpha - ::testing::Values(double(1.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(1.0)) // beta ), ::daxpbyvGenericTestPrint() ); @@ -164,8 +157,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(7)), // stride size for x ::testing::Values(gtint_t(3)), // stride size for y ::testing::Values(4.0), // alpha - ::testing::Values(-2.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(-2.0) // beta ), ::daxpbyvGenericTestPrint() ); @@ -183,9 +175,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(11), gtint_t(-11)), // stride size for x ::testing::Values(gtint_t(-3), gtint_t(4)), // stride size for y ::testing::Values(4.0), // alpha - ::testing::Values(-2.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(-2.0) // beta ), ::daxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp index 6f0cf3b8be..80f1fc478d 100644 --- a/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp +++ 
b/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp @@ -41,8 +41,7 @@ class saxpbyvGenericTest : gtint_t, gtint_t, float, - float, - char>> {}; + float>> {}; // Tests using random integers as vector elements. TEST_P( saxpbyvGenericTest, RandomData ) { @@ -63,8 +62,6 @@ TEST_P( saxpbyvGenericTest, RandomData ) T alpha = std::get<4>(GetParam()); // beta T beta = std::get<5>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<6>(GetParam()); // Set the threshold for the errors: float thresh = testinghelpers::getEpsilon(); @@ -72,7 +69,7 @@ TEST_P( saxpbyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpbyv(conj_x, n, incx, incy, alpha, beta, thresh, datatype); + test_axpbyv( conj_x, n, incx, incy, alpha, beta, thresh ); } // Used to generate a test case with a sensible name. @@ -82,14 +79,13 @@ TEST_P( saxpbyvGenericTest, RandomData ) class saxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); float alpha = std::get<4>(str.param); float beta = std::get<5>(str.param); - char datatype = std::get<6>(str.param); #ifdef TEST_BLAS std::string str_name = "saxpby_"; #elif TEST_CBLAS @@ -107,7 +103,6 @@ class saxpbyvGenericTestPrint { str_name = str_name + "_a" + alpha_str; std::string beta_str = ( beta > 0) ? 
std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -122,8 +117,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(float(2.0), float(-2.0)), // alpha - ::testing::Values(float(-1.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(-1.0)) // beta ), ::saxpbyvGenericTestPrint() ); @@ -141,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(float(2.0)), // alpha - ::testing::Values(float(1.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(1.0)) // beta ), ::saxpbyvGenericTestPrint() ); @@ -160,8 +153,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(11)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x ::testing::Values(gtint_t(3)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y ::testing::Values(float(4.0)), // alpha - ::testing::Values(float(2.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0)) // beta ), ::saxpbyvGenericTestPrint() ); @@ -179,9 +171,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(11), gtint_t(-11)), // stride size for x ::testing::Values(gtint_t(-3), gtint_t(4)), // stride size for y ::testing::Values(4.0), // alpha - ::testing::Values(-2.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(-2.0) // beta ), ::saxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h index cf6156f141..487b95c734 100644 --- 
a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h +++ b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h @@ -44,28 +44,28 @@ template static void test_axpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, - T alpha, T beta, double thresh, char datatype ) { - + T alpha, T beta, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- // Create a copy of y so that we can check reference results. std::vector y_ref(y); - testinghelpers::ref_axpbyv(conjx, n, alpha, x.data(), incx, beta, y_ref.data(), incy); + testinghelpers::ref_axpbyv( conjx, n, alpha, x.data(), incx, beta, y_ref.data(), incy ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - axpbyv(conjx, n, alpha, x.data(), incx, beta, y.data(), incy); + axpbyv( conjx, n, alpha, x.data(), incx, beta, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp index 690b7d4784..5447f57aff 100644 --- a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp @@ -41,8 +41,7 @@ class zaxpbyvGenericTest : gtint_t, gtint_t, dcomplex, - dcomplex, - char>> {}; + dcomplex>> {}; // Tests using random integers as vector elements. TEST_P( zaxpbyvGenericTest, RandomData ) { @@ -63,8 +62,6 @@ TEST_P( zaxpbyvGenericTest, RandomData ) T alpha = std::get<4>(GetParam()); // beta T beta = std::get<5>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<6>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); @@ -72,7 +69,7 @@ TEST_P( zaxpbyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpbyv(conj_x, n, incx, incy, alpha, beta, thresh, datatype); + test_axpbyv( conj_x, n, incx, incy, alpha, beta, thresh ); } // Used to generate a test case with a sensible name. 
@@ -82,14 +79,13 @@ TEST_P( zaxpbyvGenericTest, RandomData ) class zaxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); dcomplex alpha = std::get<4>(str.param); dcomplex beta = std::get<5>(str.param); - char datatype = std::get<6>(str.param); #ifdef TEST_BLAS std::string str_name = "zaxpby_"; #elif TEST_CBLAS @@ -109,7 +105,6 @@ class zaxpbyvGenericTestPrint { beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -128,8 +123,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(dcomplex{-3.0, 1.0}, dcomplex{1.0, 2.0}), // alpha - ::testing::Values(dcomplex{1.0, 2.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{1.0, 2.0}) // beta ), ::zaxpbyvGenericTestPrint() ); @@ -150,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x ::testing::Values(gtint_t(4)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y ::testing::Values(dcomplex{4.0, 3.1}), // alpha - ::testing::Values(dcomplex{1.0, 2.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{1.0, 2.0}) // beta ), ::zaxpbyvGenericTestPrint() ); @@ -169,9 +162,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(11), gtint_t(-11)), // stride size for x ::testing::Values(gtint_t(-3), gtint_t(4)), // stride size for y ::testing::Values(dcomplex{4.0, 3.1}), // alpha 
- ::testing::Values(dcomplex{1.0, -2.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{1.0, -2.0}) // beta ), ::zaxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpyv/axpyv.h b/gtestsuite/testsuite/level1/axpyv/axpyv.h index 10e56cae15..9081da1051 100644 --- a/gtestsuite/testsuite/level1/axpyv/axpyv.h +++ b/gtestsuite/testsuite/level1/axpyv/axpyv.h @@ -110,4 +110,4 @@ static void axpyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gti #else throw std::runtime_error("Error in testsuite/level1/axpyv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp index 77cd26c285..4cd74f4dc8 100644 --- a/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp @@ -40,8 +40,7 @@ class caxpyvGenericTest : gtint_t, gtint_t, gtint_t, - scomplex, - char>> {}; + scomplex>> {}; // Tests using random integers as vector elements. TEST_P( caxpyvGenericTest, RandomData ) { @@ -60,8 +59,6 @@ TEST_P( caxpyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // alpha T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); @@ -69,7 +66,7 @@ TEST_P( caxpyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpyv(conj_x, n, incx, incy, alpha, thresh, datatype); + test_axpyv( conj_x, n, incx, incy, alpha, thresh ); } // Used to generate a test case with a sensible name. 
@@ -79,13 +76,12 @@ TEST_P( caxpyvGenericTest, RandomData ) class caxpyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); scomplex alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "caxpy_"; #elif TEST_CBLAS @@ -102,7 +98,6 @@ class caxpyvGenericTestPrint { std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -120,8 +115,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // alpha ), ::caxpyvGenericTestPrint() ); @@ -141,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(2)), // stride size for x ::testing::Values(gtint_t(3)), // stride size for y - ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{4.0, 3.1}) // alpha ), ::caxpyvGenericTestPrint() ); @@ -159,9 +152,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(-4)), // stride size for x ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{4.0, 3.1}) // alpha ), ::caxpyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp index 792d582782..69e69f8c6e 100644 --- a/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp @@ -40,8 +40,7 @@ class daxpyvGenericTest : gtint_t, gtint_t, gtint_t, - double, - char>> {}; + double>> {}; // Tests using random integers as vector elements. 
TEST_P( daxpyvGenericTest, RandomData ) { @@ -60,8 +59,6 @@ TEST_P( daxpyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // alpha T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -69,7 +66,7 @@ TEST_P( daxpyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpyv(conj_x, n, incx, incy, alpha, thresh, datatype); + test_axpyv( conj_x, n, incx, incy, alpha, thresh ); } // Used to generate a test case with a sensible name. @@ -79,13 +76,12 @@ TEST_P( daxpyvGenericTest, RandomData ) class daxpyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); double alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "daxpy_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class daxpyvGenericTestPrint { str_name += "_" + incy_str; std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -115,8 +110,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(double(2.0), double(-2.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(2.0), double(-2.0)) // alpha ), ::daxpyvGenericTestPrint() ); @@ -133,8 +127,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(double(2.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(2.0)) // alpha ), ::daxpyvGenericTestPrint() ); @@ -151,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x ::testing::Values(gtint_t(3)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y - ::testing::Values(double(4.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(4.0)) // beta ), ::daxpyvGenericTestPrint() ); @@ -169,9 +161,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(-4)), // stride size for x ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(4.0), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(4.0) // alpha ), ::daxpyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp index 67699e8337..ff8cc67b64 100644 --- a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp @@ -40,8 +40,7 @@ class saxpyvGenericTest : gtint_t, gtint_t, gtint_t, - float, - char>> {}; + float>> {}; // Tests using random integers as vector elements. TEST_P( saxpyvGenericTest, RandomData ) { @@ -60,8 +59,6 @@ TEST_P( saxpyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // alpha T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -69,7 +66,7 @@ TEST_P( saxpyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpyv(conj_x, n, incx, incy, alpha, thresh, datatype); + test_axpyv( conj_x, n, incx, incy, alpha, thresh ); } // Used to generate a test case with a sensible name. 
@@ -79,13 +76,12 @@ TEST_P( saxpyvGenericTest, RandomData ) class saxpyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); float alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "saxpy_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class saxpyvGenericTestPrint { str_name += "_" + incy_str; std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -115,8 +110,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(float(2.0), float(-2.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0), float(-2.0)) // alpha ), ::saxpyvGenericTestPrint() ); @@ -133,8 +127,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(float(2.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0)) // alpha ), ::saxpyvGenericTestPrint() ); @@ -151,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(-2)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x ::testing::Values(gtint_t(3), gtint_t(-3)), /*(gtint_t(-12), 
gtint_t(-4))*/// stride size for y - ::testing::Values(float(4.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(4.0)) // alpha ), ::saxpyvGenericTestPrint() ); @@ -169,9 +161,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(-4)), // stride size for x ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(4.0), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(4.0) // alpha ), ::saxpyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpyv/test_axpyv.h b/gtestsuite/testsuite/level1/axpyv/test_axpyv.h index a2d6af583f..90f757ef7b 100644 --- a/gtestsuite/testsuite/level1/axpyv/test_axpyv.h +++ b/gtestsuite/testsuite/level1/axpyv/test_axpyv.h @@ -44,12 +44,13 @@ template static void test_axpyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, - T alpha, double thresh, char datatype ) { + T alpha, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. @@ -67,4 +68,4 @@ static void test_axpyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp index a8cf1a6983..d88596c881 100644 --- a/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp @@ -40,8 +40,7 @@ class zaxpyvGenericTest : gtint_t, gtint_t, gtint_t, - dcomplex, - char>> {}; + dcomplex>> {}; // Tests using random integers as vector elements. TEST_P( zaxpyvGenericTest, RandomData ) { @@ -60,15 +59,13 @@ TEST_P( zaxpyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // alpha T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpyv(conj_x, n, incx, incy, alpha, thresh, datatype); + test_axpyv( conj_x, n, incx, incy, alpha, thresh ); } // Used to generate a test case with a sensible name. @@ -78,13 +75,12 @@ TEST_P( zaxpyvGenericTest, RandomData ) class zaxpyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); dcomplex alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "zaxpy_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class zaxpyvGenericTestPrint { std::string alpha_str = ( alpha.real > 0) ? 
std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -119,8 +114,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(dcomplex{-3.0, 1.0}, dcomplex{1.0, 2.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{-3.0, 1.0}, dcomplex{1.0, 2.0}) // alpha ), ::zaxpyvGenericTestPrint() ); @@ -140,8 +134,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2)), // stride size for x ::testing::Values(gtint_t(3)), // stride size for y - ::testing::Values(dcomplex{-1.0, 2.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{-1.0, 2.0}) // alpha ), ::zaxpyvGenericTestPrint() ); @@ -158,9 +151,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(-4)), // stride size for x ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(dcomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{4.0, 3.1}) // alpha ), ::zaxpyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp index 5186cdecb5..beb0aced0c 100644 --- a/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp @@ -39,8 +39,7 @@ class ccopyvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( ccopyvGenericTest, RandomData ) @@ -58,8 +57,6 @@ TEST_P( ccopyvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -67,7 +64,7 @@ TEST_P( ccopyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_copyv(conjx, n, incx, incy, thresh, datatype); + test_copyv( conjx, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. 
@@ -77,12 +74,11 @@ TEST_P( ccopyvGenericTest, RandomData ) class ccopyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "ccopy_"; #elif TEST_CBLAS @@ -96,7 +92,6 @@ class ccopyvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::ccopyvGenericTestPrint() ); @@ -133,8 +127,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use x, c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::ccopyvGenericTestPrint() ); @@ -150,9 +143,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-5), gtint_t(7)), // stride size for x - ::testing::Values(gtint_t(13), gtint_t(-9)), // stride size for y - 
::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(13), gtint_t(-9)) // stride size for y ), ::ccopyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/copyv/copyv.h b/gtestsuite/testsuite/level1/copyv/copyv.h index cc8bf85af0..bd0298bc89 100644 --- a/gtestsuite/testsuite/level1/copyv/copyv.h +++ b/gtestsuite/testsuite/level1/copyv/copyv.h @@ -109,4 +109,4 @@ static void copyv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) #else throw std::runtime_error("Error in testsuite/level1/copyv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp index b97b992ba3..7957b02d01 100644 --- a/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp @@ -39,8 +39,7 @@ class dcopyvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( dcopyvGenericTest, RandomData ) @@ -58,8 +57,6 @@ TEST_P( dcopyvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -67,7 +64,7 @@ TEST_P( dcopyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_copyv(conjx, n, incx, incy, thresh, datatype); + test_copyv( conjx, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. 
@@ -77,12 +74,11 @@ TEST_P( dcopyvGenericTest, RandomData ) class dcopyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "dcopy_"; #elif TEST_CBLAS @@ -96,7 +92,6 @@ class dcopyvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -109,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, not conj(x) (since it is real) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::dcopyvGenericTestPrint() ); @@ -126,8 +120,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::dcopyvGenericTestPrint() ); @@ -143,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // use x, not conj(x) (since it is real) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - 
::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::dcopyvGenericTestPrint() ); @@ -160,9 +152,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-5), gtint_t(7)), // stride size for x - ::testing::Values(gtint_t(13), gtint_t(-9)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(13), gtint_t(-9)) // stride size for y ), ::dcopyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp index 2035f92d60..ca2c591b2f 100644 --- a/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp @@ -39,8 +39,7 @@ class scopyvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( scopyvGenericTest, RandomData ) @@ -58,8 +57,6 @@ TEST_P( scopyvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -67,7 +64,7 @@ TEST_P( scopyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_copyv(conjx, n, incx, incy, thresh, datatype); + test_copyv( conjx, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. 
@@ -77,12 +74,11 @@ TEST_P( scopyvGenericTest, RandomData ) class scopyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "scopy_"; #elif TEST_CBLAS @@ -96,7 +92,6 @@ class scopyvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -109,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, not conj(x) (since it is real) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::scopyvGenericTestPrint() ); @@ -126,8 +120,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::scopyvGenericTestPrint() ); @@ -143,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, not conj(x) (since it is real) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - 
::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::scopyvGenericTestPrint() ); @@ -160,9 +152,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-5), gtint_t(7)), // stride size for x - ::testing::Values(gtint_t(13), gtint_t(-9)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(13), gtint_t(-9)) // stride size for y ), ::scopyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/copyv/test_copyv.h b/gtestsuite/testsuite/level1/copyv/test_copyv.h index 95f27925e2..00f1995dd0 100644 --- a/gtestsuite/testsuite/level1/copyv/test_copyv.h +++ b/gtestsuite/testsuite/level1/copyv/test_copyv.h @@ -43,13 +43,12 @@ */ template -static void test_copyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, - double thresh, char datatype ) { - +static void test_copyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); std::vector y( testinghelpers::buff_dim(n, incy), T{-1} ); //---------------------------------------------------------- @@ -58,15 +57,15 @@ static void test_copyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, // Create a copy of y so that we can check reference results. 
std::vector y_ref(y); - testinghelpers::ref_copyv(conjx, n, x.data(), incx, y_ref.data(), incy); + testinghelpers::ref_copyv( conjx, n, x.data(), incx, y_ref.data(), incy ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - copyv(conjx, n, x.data(), incx, y.data(), incy); + copyv( conjx, n, x.data(), incx, y.data(), incy ); //---------------------------------------------------------- // Compute error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp index b76b11386e..3bd3aa64c7 100644 --- a/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp @@ -39,8 +39,7 @@ class zcopyvGenericTest : public ::testing::TestWithParam> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( zcopyvGenericTest, RandomData ) @@ -58,8 +57,6 @@ TEST_P( zcopyvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -67,7 +64,7 @@ TEST_P( zcopyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_copyv(conjx, n, incx, incy, thresh, datatype); + test_copyv( conjx, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. 
@@ -77,12 +74,11 @@ TEST_P( zcopyvGenericTest, RandomData ) class zcopyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "zcopy_"; #elif TEST_CBLAS @@ -96,7 +92,6 @@ class zcopyvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ), ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::zcopyvGenericTestPrint() ); @@ -133,8 +127,7 @@ INSTANTIATE_TEST_SUITE_P( ), ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::zcopyvGenericTestPrint() ); @@ -150,9 +143,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-5), gtint_t(7)), // stride size for x - ::testing::Values(gtint_t(13), gtint_t(-9)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + 
::testing::Values(gtint_t(13), gtint_t(-9)) // stride size for y ), ::zcopyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp index 3584be5f08..1f21f8433a 100644 --- a/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp @@ -40,8 +40,7 @@ class cdotvGenericTest : char, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( cdotvGenericTest, RandomData ) @@ -61,8 +60,6 @@ TEST_P( cdotvGenericTest, RandomData ) gtint_t incx = std::get<3>(GetParam()); // stride size for y: gtint_t incy = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*n*testinghelpers::getEpsilon(); @@ -70,7 +67,7 @@ TEST_P( cdotvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotv(conjx, conjy, n, incx, incy, thresh, datatype); + test_dotv( conjx, conjy, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. @@ -80,13 +77,12 @@ TEST_P( cdotvGenericTest, RandomData ) class cdotvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); char conjy = std::get<1>(str.param); gtint_t n = std::get<2>(str.param); gtint_t incx = std::get<3>(str.param); gtint_t incy = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "cdotu_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class cdotvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? 
std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -123,8 +118,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use y, c: use conj(y) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::cdotvGenericTestPrint() ); @@ -148,8 +142,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use y, c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3)) // stride size for y ), ::cdotvGenericTestPrint() ); @@ -166,9 +159,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-2)), // stride size for x - ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(-3)) // stride size for y ), ::cdotvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp index 250144e3f0..5af449fb32 100644 --- a/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp @@ -40,8 +40,7 @@ class ddotvGenericTest : char, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; // Tests using random integers as vector elements. 
TEST_P( ddotvGenericTest, RandomData ) @@ -61,8 +60,6 @@ TEST_P( ddotvGenericTest, RandomData ) gtint_t incx = std::get<3>(GetParam()); // stride size for y: gtint_t incy = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = n*testinghelpers::getEpsilon(); @@ -70,7 +67,7 @@ TEST_P( ddotvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotv(conjx, conjy, n, incx, incy, thresh, datatype); + test_dotv( conjx, conjy, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. @@ -80,13 +77,12 @@ TEST_P( ddotvGenericTest, RandomData ) class ddotvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); char conjy = std::get<1>(str.param); gtint_t n = std::get<2>(str.param); gtint_t incx = std::get<3>(str.param); gtint_t incy = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "ddot_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class ddotvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -115,8 +110,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, not conj(y) (since it is real) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::ddotvGenericTestPrint() ); @@ -133,8 +127,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::ddotvGenericTestPrint() ); @@ -151,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // use y, not conj(y) (since it is real) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::ddotvGenericTestPrint() ); @@ -169,8 +161,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-2)), // stride size for x - ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(-3)) // stride size for y ), ::ddotvGenericTestPrint() ); diff --git a/gtestsuite/testsuite/level1/dotv/dotv.h b/gtestsuite/testsuite/level1/dotv/dotv.h index dad9802345..c65f229695 100644 --- a/gtestsuite/testsuite/level1/dotv/dotv.h +++ b/gtestsuite/testsuite/level1/dotv/dotv.h @@ -114,4 +114,4 @@ static void dotv(char conjx, char conjy, gtint_t n, #else throw std::runtime_error("Error in 
testsuite/level1/dotv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp index ce57c4f59b..9f59e2ea00 100644 --- a/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp @@ -40,8 +40,7 @@ class sdotvGenericTest : char, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( sdotvGenericTest, RandomData ) @@ -61,8 +60,6 @@ TEST_P( sdotvGenericTest, RandomData ) gtint_t incx = std::get<3>(GetParam()); // stride size for y: gtint_t incy = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = n*testinghelpers::getEpsilon(); @@ -70,7 +67,7 @@ TEST_P( sdotvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotv(conjx, conjy, n, incx, incy, thresh, datatype); + test_dotv( conjx, conjy, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. @@ -80,13 +77,12 @@ TEST_P( sdotvGenericTest, RandomData ) class sdotvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); char conjy = std::get<1>(str.param); gtint_t n = std::get<2>(str.param); gtint_t incx = std::get<3>(str.param); gtint_t incy = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "sdot_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class sdotvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? 
std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -115,8 +110,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, not conj(y) (since it is real) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::sdotvGenericTestPrint() ); @@ -133,8 +127,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::sdotvGenericTestPrint() ); @@ -151,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, not conj(y) (since it is real) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::sdotvGenericTestPrint() ); @@ -169,9 +161,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-2)), // stride size for x - ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(-3)) // 
stride size for y ), ::sdotvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotv/test_dotv.h b/gtestsuite/testsuite/level1/dotv/test_dotv.h index 1faf3120a2..fa5abb5270 100644 --- a/gtestsuite/testsuite/level1/dotv/test_dotv.h +++ b/gtestsuite/testsuite/level1/dotv/test_dotv.h @@ -44,15 +44,13 @@ template static void test_dotv( char conjx, char conjy, gtint_t n, gtint_t incx, - gtint_t incy, double thresh, char datatype ) + gtint_t incy, double thresh ) { - - //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. @@ -63,16 +61,16 @@ static void test_dotv( char conjx, char conjy, gtint_t n, gtint_t incx, if constexpr (testinghelpers::type_info::is_real) testinghelpers::ref_dotv( n, x.data(), incx, y_ref.data(), incy, &rho_ref ); else - testinghelpers::ref_dotv(conjx, conjy, n, x.data(), incx, y_ref.data(), incy, &rho_ref); + testinghelpers::ref_dotv( conjx, conjy, n, x.data(), incx, y_ref.data(), incy, &rho_ref ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- T rho; - dotv(conjx, conjy, n, x.data(), incx, y.data(), incy, &rho); + dotv( conjx, conjy, n, x.data(), incx, y.data(), incy, &rho ); //---------------------------------------------------------- // Compute error. 
//---------------------------------------------------------- computediff( rho, rho_ref, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp index 4b0f3fbcdb..e37b3faa32 100644 --- a/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp @@ -40,8 +40,7 @@ class zdotvGenericTest : char, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; // Tests using random integers as vector elements. TEST_P( zdotvGenericTest, RandomData ) @@ -61,8 +60,6 @@ TEST_P( zdotvGenericTest, RandomData ) gtint_t incx = std::get<3>(GetParam()); // stride size for y: gtint_t incy = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*n*testinghelpers::getEpsilon(); @@ -70,7 +67,7 @@ TEST_P( zdotvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotv(conjx, conjy, n, incx, incy, thresh, datatype); + test_dotv( conjx, conjy, n, incx, incy, thresh ); } // Used to generate a test case with a sensible name. @@ -80,13 +77,12 @@ TEST_P( zdotvGenericTest, RandomData ) class zdotvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conjx = std::get<0>(str.param); char conjy = std::get<1>(str.param); gtint_t n = std::get<2>(str.param); gtint_t incx = std::get<3>(str.param); gtint_t incy = std::get<4>(str.param); - char datatype = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "zdotu_"; #elif TEST_CBLAS @@ -101,7 +97,6 @@ class zdotvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? 
std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -123,8 +118,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use y, c: use conj(y) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1)) // stride size for y ), ::zdotvGenericTestPrint() ); @@ -148,8 +142,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use y, c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x - ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(3), gtint_t(33)) // stride size for y ), ::zdotvGenericTestPrint() ); @@ -166,9 +159,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use y, c: use conj(y) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(-2)), // stride size for x - ::testing::Values(gtint_t(-3)), // stride size for y - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(-3)) // stride size for y ), ::zdotvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp index 17377a7f0c..e4ed5e636b 100644 --- a/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp @@ -36,7 +36,7 @@ #include "test_dotxv.h" class cdotxvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> 
{}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cdotxvGenericTest); @@ -62,8 +62,6 @@ TEST_P( cdotxvGenericTest, RandomData ) T alpha = std::get<5>(GetParam()); // beta T beta = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = n*testinghelpers::getEpsilon(); @@ -71,7 +69,7 @@ TEST_P( cdotxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotxv(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype); + test_dotxv( n, conj_x, conj_y, alpha, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. @@ -81,7 +79,7 @@ TEST_P( cdotxvGenericTest, RandomData ) class cdotxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -89,7 +87,6 @@ class cdotxvGenericTestPrint { gtint_t incy = std::get<4>(str.param); scomplex alpha = std::get<5>(str.param); scomplex beta = std::get<6>(str.param); - char datatype = std::get<7>(str.param); std::string str_name = "bli_cdotxv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conjx, 1); @@ -104,7 +101,6 @@ class cdotxvGenericTestPrint { beta_str = beta_str + "pi" + (( beta.imag > 0) ? 
std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -121,8 +117,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(scomplex{1.0, -1.0}), // alpha - ::testing::Values(scomplex{-1.0, 1.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{-1.0, 1.0}) // beta ), ::cdotxvGenericTestPrint() ); @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(scomplex{1.0, -1.0}), // alpha - ::testing::Values(scomplex{-1.0, 1.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{-1.0, 1.0}) // beta ), ::cdotxvGenericTestPrint() ); @@ -157,10 +151,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y ::testing::Values(scomplex{1.0, -1.0}), // alpha - ::testing::Values(scomplex{-1.0, 1.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{-1.0, 1.0}) // beta ), ::cdotxvGenericTestPrint() ); - -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp index 8cd33a861e..9ee8be98b8 100644 --- a/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp @@ -36,7 +36,7 @@ #include "test_dotxv.h" class ddotxvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; 
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ddotxvGenericTest); @@ -62,8 +62,6 @@ TEST_P( ddotxvGenericTest, RandomData ) T alpha = std::get<5>(GetParam()); // beta T beta = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = n*testinghelpers::getEpsilon(); @@ -71,7 +69,7 @@ TEST_P( ddotxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotxv(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype); + test_dotxv(n, conj_x, conj_y, alpha, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. @@ -81,7 +79,7 @@ TEST_P( ddotxvGenericTest, RandomData ) class ddotxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -89,7 +87,6 @@ class ddotxvGenericTestPrint { gtint_t incy = std::get<4>(str.param); double alpha = std::get<5>(str.param); double beta = std::get<6>(str.param); - char datatype = std::get<7>(str.param); std::string str_name = "bli_ddotxv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conjx, 1); @@ -102,7 +99,6 @@ class ddotxvGenericTestPrint { str_name = str_name + "_a" + alpha_str; std::string beta_str = ( beta > 0) ? 
std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -119,8 +115,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(2.0, 3.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(2.0, 3.0) // beta ), ::ddotxvGenericTestPrint() ); @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(2.0, 3.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(2.0, 3.0) // beta ), ::ddotxvGenericTestPrint() ); @@ -157,9 +151,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(2.0, 3.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(2.0, 3.0) // beta ), ::ddotxvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotxv/dotxv.h b/gtestsuite/testsuite/level1/dotxv/dotxv.h index 3bb01ad0a0..91a13400fc 100644 --- a/gtestsuite/testsuite/level1/dotxv/dotxv.h +++ b/gtestsuite/testsuite/level1/dotxv/dotxv.h @@ -85,4 +85,4 @@ static void dotxv( char conjx, char conjy, gtint_t n, T* alpha, #else throw std::runtime_error("Error in testsuite/level1/dotxv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp index ea0ad22b6b..4dd80401e3 100644 --- 
a/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp @@ -36,7 +36,7 @@ #include "test_dotxv.h" class sdotxvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sdotxvGenericTest); @@ -62,8 +62,6 @@ TEST_P( sdotxvGenericTest, RandomData ) T alpha = std::get<5>(GetParam()); // beta T beta = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = n*testinghelpers::getEpsilon(); @@ -71,7 +69,7 @@ TEST_P( sdotxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotxv(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype); + test_dotxv( n, conj_x, conj_y, alpha, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. @@ -81,7 +79,7 @@ TEST_P( sdotxvGenericTest, RandomData ) class sdotxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -89,7 +87,6 @@ class sdotxvGenericTestPrint { gtint_t incy = std::get<4>(str.param); float alpha = std::get<5>(str.param); float beta = std::get<6>(str.param); - char datatype = std::get<7>(str.param); std::string str_name = "bli_sdotxv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conjx, 1); @@ -102,7 +99,6 @@ class sdotxvGenericTestPrint { str_name = str_name + "_a" + alpha_str; std::string beta_str = ( beta > 0) ? 
std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -119,8 +115,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(2.0, 3.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(2.0, 3.0) // beta ), ::sdotxvGenericTestPrint() ); @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(2.0, 3.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(2.0, 3.0) // beta ), ::sdotxvGenericTestPrint() ); @@ -157,9 +151,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(2.0, 3.0), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(2.0, 3.0) // beta ), ::sdotxvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h index 6d0f74d5f0..1fe5b50614 100644 --- a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h +++ b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h @@ -44,13 +44,13 @@ template static void test_dotxv( gtint_t n, char conjx, char conjy, T alpha, - gtint_t incx, gtint_t incy, T beta, double thresh, char datatype ) + gtint_t incx, gtint_t incy, T beta, double thresh ) { //---------------------------------------------------------- // Initialize vectors with random numbers. 
//---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. @@ -59,17 +59,17 @@ static void test_dotxv( gtint_t n, char conjx, char conjy, T alpha, std::vector y_ref(y); T rho_ref; testinghelpers::initone(rho_ref); - testinghelpers::ref_dotxv(conjx, conjy, n, alpha, x.data(), incx, y.data(), incy, beta, &rho_ref); + testinghelpers::ref_dotxv( conjx, conjy, n, alpha, x.data(), incx, y.data(), incy, beta, &rho_ref ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- T rho; testinghelpers::initone(rho); - dotxv(conjx, conjy, n, &alpha, x.data(), incx, y.data(), incy, &beta, &rho); + dotxv( conjx, conjy, n, &alpha, x.data(), incx, y.data(), incy, &beta, &rho ); //---------------------------------------------------------- // Compute error. 
//---------------------------------------------------------- computediff( rho, rho_ref, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp index 829532afde..652c5d030c 100644 --- a/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp @@ -36,7 +36,7 @@ #include "test_dotxv.h" class zdotxvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zdotxvGenericTest); @@ -62,8 +62,6 @@ TEST_P( zdotxvGenericTest, RandomData ) T alpha = std::get<5>(GetParam()); // beta T beta = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = n*testinghelpers::getEpsilon(); @@ -71,7 +69,7 @@ TEST_P( zdotxvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_dotxv(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype); + test_dotxv(n, conj_x, conj_y, alpha, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. 
@@ -81,7 +79,7 @@ TEST_P( zdotxvGenericTest, RandomData ) class zdotxvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -89,7 +87,6 @@ class zdotxvGenericTestPrint { gtint_t incy = std::get<4>(str.param); dcomplex alpha = std::get<5>(str.param); dcomplex beta = std::get<6>(str.param); - char datatype = std::get<7>(str.param); std::string str_name = "bli_zdotxv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conjx, 1); @@ -104,7 +101,6 @@ class zdotxvGenericTestPrint { beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -121,8 +117,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y ::testing::Values(dcomplex{1.0, -1.0}), // alpha - ::testing::Values(dcomplex{-1.0, 1.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{-1.0, 1.0}) // beta ), ::zdotxvGenericTestPrint() ); @@ -140,9 +135,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(3), gtint_t(33)), // stride size for y ::testing::Values(dcomplex{1.0, -1.0}), // alpha - ::testing::Values(dcomplex{-1.0, 1.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{-1.0, 1.0}) // beta ), ::zdotxvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp index 
d25419606f..5d582ce7ce 100644 --- a/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp @@ -40,38 +40,35 @@ class cscal2vGenericTest : gtint_t, gtint_t, gtint_t, - scomplex, - char>> {}; + scomplex>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cscal2vGenericTest); // Tests using random integers as vector elements. TEST_P( cscal2vGenericTest, RandomData ) { - using T = scomplex; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // stride size for y: - gtint_t incy = std::get<3>(GetParam()); - // alpha - T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); + using T = scomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // stride size for y: + gtint_t incy = std::get<3>(GetParam()); + // alpha + T alpha = std::get<4>(GetParam()); - // Set the threshold for the errors: - double thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scal2v(conj_alpha, n, incx, incy, alpha, thresh, datatype); + // Set the threshold for the errors: + double thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scal2v( conj_alpha, n, incx, incy, alpha, thresh ); } // Used to generate a test case with a sensible name. @@ -81,13 +78,12 @@ TEST_P( cscal2vGenericTest, RandomData ) class cscal2vGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); scomplex alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_cscal2v"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -98,7 +94,6 @@ class cscal2vGenericTestPrint { std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? 
std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -112,8 +107,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // alpha ), ::cscal2vGenericTestPrint() ); @@ -130,9 +124,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(4)), // stride size for y - ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{4.0, 3.1}) // alpha ), ::cscal2vGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp index 396bf99ba1..790e8dc0ee 100644 --- a/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp @@ -40,38 +40,35 @@ class dscal2vGenericTest : gtint_t, gtint_t, gtint_t, - double, - char>> {}; + double>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dscal2vGenericTest); // Tests using random integers as vector elements. 
TEST_P( dscal2vGenericTest, RandomData ) { - using T = double; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // stride size for y: - gtint_t incy = std::get<3>(GetParam()); - // alpha - T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); + using T = double; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // stride size for y: + gtint_t incy = std::get<3>(GetParam()); + // alpha + T alpha = std::get<4>(GetParam()); - // Set the threshold for the errors: - float thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scal2v(conj_alpha, n, incx, incy, alpha, thresh, datatype); + // Set the threshold for the errors: + float thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scal2v( conj_alpha, n, incx, incy, alpha, thresh ); } // 
Used to generate a test case with a sensible name. @@ -81,13 +78,12 @@ TEST_P( dscal2vGenericTest, RandomData ) class dscal2vGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); double alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_dscal2v"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -97,7 +93,6 @@ class dscal2vGenericTestPrint { str_name += "_" + incy_str; std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -111,8 +106,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(double(2.0), double(-3.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(2.0), double(-3.0)) // alpha ), ::dscal2vGenericTestPrint() ); @@ -128,8 +122,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(double(-3.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(-3.0)) // alpha ), ::dscal2vGenericTestPrint() ); @@ -145,9 +138,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(5)), // stride size for y - ::testing::Values(double(3.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(3.0)) // alpha ), ::dscal2vGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scal2v/scal2v.h b/gtestsuite/testsuite/level1/scal2v/scal2v.h index ad1383b712..b90b2d9eef 100644 --- a/gtestsuite/testsuite/level1/scal2v/scal2v.h +++ b/gtestsuite/testsuite/level1/scal2v/scal2v.h @@ -80,4 +80,4 @@ static void scal2v(char conjx, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gti #else throw std::runtime_error("Error in testsuite/level1/scal2v.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp index ef02a4c225..f28670b0ef 100644 --- a/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp @@ -40,38 +40,35 @@ class sscal2vGenericTest : gtint_t, gtint_t, gtint_t, - float, - char>> {}; + float>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sscal2vGenericTest); // Tests using random integers as vector elements. 
TEST_P( sscal2vGenericTest, RandomData ) { - using T = float; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // stride size for y: - gtint_t incy = std::get<3>(GetParam()); - // alpha - T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); + using T = float; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // stride size for y: + gtint_t incy = std::get<3>(GetParam()); + // alpha + T alpha = std::get<4>(GetParam()); - // Set the threshold for the errors: - float thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scal2v(conj_alpha, n, incx, incy, alpha, thresh, datatype); + // Set the threshold for the errors: + float thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scal2v( conj_alpha, n, incx, incy, alpha, thresh ); } // Used 
to generate a test case with a sensible name. @@ -81,13 +78,12 @@ TEST_P( sscal2vGenericTest, RandomData ) class sscal2vGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); float alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_sscal2v"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -97,10 +93,10 @@ class sscal2vGenericTestPrint { str_name += "_" + incy_str; std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; + #ifdef TEST_BLIS_TYPED // Black box testing for generic and main use of sscal2. INSTANTIATE_TEST_SUITE_P( @@ -111,8 +107,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(float(3.0), float(-5.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(3.0), float(-5.0)) // alpha ), ::sscal2vGenericTestPrint() ); @@ -128,8 +123,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(float(9.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(9.0)) // alpha ), ::sscal2vGenericTestPrint() ); @@ -145,9 +139,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(7)), // stride size for y - ::testing::Values(float(2.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0)) // alpha ), ::sscal2vGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scal2v/test_scal2v.h b/gtestsuite/testsuite/level1/scal2v/test_scal2v.h index 8edb967ab2..c582688340 100644 --- a/gtestsuite/testsuite/level1/scal2v/test_scal2v.h +++ b/gtestsuite/testsuite/level1/scal2v/test_scal2v.h @@ -43,12 +43,12 @@ */ template -static void test_scal2v(char conjx, gtint_t n, gtint_t incx, gtint_t incy, T alpha, double thresh, char datatype) +static void test_scal2v(char conjx, gtint_t n, gtint_t incx, gtint_t incy, T alpha, double thresh ) { //---------------------------------------------------------- // Initialize vector with random numbers. 
//---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); std::vector y( testinghelpers::buff_dim(n, incy), T{-112} ); //---------------------------------------------------------- @@ -56,15 +56,15 @@ static void test_scal2v(char conjx, gtint_t n, gtint_t incx, gtint_t incy, T alp //---------------------------------------------------------- // Create a copy of y so that we can check reference results. std::vector y_ref(y); - testinghelpers::ref_scal2v(conjx, n, alpha, x.data(), incx, y_ref.data(), incy); + testinghelpers::ref_scal2v( conjx, n, alpha, x.data(), incx, y_ref.data(), incy ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - scal2v(conjx, n, alpha, x.data(), incx, y.data(), incy); + scal2v( conjx, n, alpha, x.data(), incx, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp index 0308cbd10b..0619265732 100644 --- a/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp @@ -40,8 +40,7 @@ class zscal2vGenericTest : gtint_t, gtint_t, gtint_t, - dcomplex, - char>> {}; + dcomplex>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zscal2vGenericTest); @@ -49,30 +48,28 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zscal2vGenericTest); // Tests using random integers as vector elements. 
TEST_P( zscal2vGenericTest, RandomData ) { - using T = dcomplex; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // stride size for y: - gtint_t incy = std::get<3>(GetParam()); - // alpha - T alpha = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); + using T = dcomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // stride size for y: + gtint_t incy = std::get<3>(GetParam()); + // alpha + T alpha = std::get<4>(GetParam()); - // Set the threshold for the errors: - float thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scal2v(conj_alpha, n, incx, incy, alpha, thresh, datatype); + // Set the threshold for the errors: + float thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scal2v( conj_alpha, n, incx, incy, alpha, thresh ); } // 
Used to generate a test case with a sensible name. @@ -82,13 +79,12 @@ TEST_P( zscal2vGenericTest, RandomData ) class zscal2vGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); dcomplex alpha = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_zscal2v"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -99,7 +95,6 @@ class zscal2vGenericTestPrint { std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(dcomplex{3.0, -2.0}, dcomplex{-1.0, 4.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{3.0, -2.0}, dcomplex{-1.0, 4.0}) // alpha ), ::zscal2vGenericTestPrint() ); @@ -131,9 +125,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(2), gtint_t(11)), // stride size for x ::testing::Values(gtint_t(3)), // stride size for y - ::testing::Values(dcomplex{1.0, 2.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{1.0, 2.1}) // alpha ), ::zscal2vGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp index 223fec91d7..eb4a03580f 100644 --- a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp @@ -39,35 +39,32 @@ class cscalvGenericTest : public ::testing::TestWithParam> {}; + scomplex>> {}; // Tests using random integers as vector elements. TEST_P( cscalvGenericTest, RandomData ) { - using T = scomplex; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // alpha - T alpha = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); + using T = scomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // alpha + T alpha = std::get<3>(GetParam()); - // Set the threshold for the errors: - double thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scalv(conj_alpha, n, incx, alpha, thresh, datatype); + // Set the threshold for the errors: + double thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scalv( conj_alpha, n, incx, alpha, thresh ); } // Used to generate a test case with a sensible name. @@ -77,12 +74,11 @@ TEST_P( cscalvGenericTest, RandomData ) class cscalvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); scomplex alpha = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "cscal_"; #elif TEST_CBLAS @@ -97,7 +93,6 @@ class cscalvGenericTestPrint { std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? 
std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -114,8 +109,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // alpha ), ::cscalvGenericTestPrint() ); @@ -135,8 +129,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x - ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{4.0, 3.1}) // alpha ), ::cscalvGenericTestPrint() ); @@ -152,9 +145,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x - ::testing::Values(scomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{4.0, 3.1}) // alpha ), ::cscalvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp index 6410481560..f2a08f340d 100644 --- a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp @@ -39,35 +39,32 @@ class dscalvGenericTest : public ::testing::TestWithParam> {}; + double>> {}; // Tests using random integers as vector elements. TEST_P( dscalvGenericTest, RandomData ) { - using T = double; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // alpha - T alpha = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); + using T = double; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // alpha + T alpha = std::get<3>(GetParam()); - // Set the threshold for the errors: - double thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scalv(conj_alpha, n, incx, alpha, thresh, datatype); + // Set the threshold for the errors: + double thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scalv( conj_alpha, n, incx, alpha, thresh ); } // Used to generate a test case with a sensible name. @@ -77,12 +74,11 @@ TEST_P( dscalvGenericTest, RandomData ) class dscalvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); double alpha = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "dscal_"; #elif TEST_CBLAS @@ -96,7 +92,6 @@ class dscalvGenericTestPrint { str_name += "_" + incx_str; std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -109,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, not conj(x) (since it is real) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(double(2.0), double(-3.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(2.0), double(-3.0)) // alpha ), ::dscalvGenericTestPrint() ); @@ -126,8 +120,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conjugate ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(double(-3.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(-3.0)) // alpha ), ::dscalvGenericTestPrint() ); @@ -143,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x - ::testing::Values(double(3.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(3.0)) // alpha ), ::dscalvGenericTestPrint() ); @@ -160,9 +152,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x - ::testing::Values(3), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(3) // alpha ), ::dscalvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/scalv.h b/gtestsuite/testsuite/level1/scalv/scalv.h index 0ae0125f52..a23fb24e5f 100644 --- a/gtestsuite/testsuite/level1/scalv/scalv.h +++ b/gtestsuite/testsuite/level1/scalv/scalv.h @@ -109,4 +109,4 @@ static void scalv(char conj_alpha, gtint_t n, T alpha, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/scalv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp index df350f91b5..660d0450d8 100644 --- a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp +++ b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp @@ -46,7 +46,7 @@ TYPED_TEST(xscalv, zero_alpha_x_fp) gtint_t n = 10, incx = 1; std::vector x(n); // Initialize x with random numbers. - testinghelpers::datagenerators::randomgenerators(n, incx, x.data(), 'f'); + testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), ELEMENT_TYPE ); std::vector x_ref(x); T alpha = T{0}; @@ -70,7 +70,7 @@ TYPED_TEST(xscalv, zero_alpha_x_inf) gtint_t n = 10, incx = 1; std::vector x(n); // Initialize x with random numbers. 
- testinghelpers::datagenerators::randomgenerators(n, incx, x.data(), 'f'); + testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), ELEMENT_TYPE ); x[3] = 1.0/0.0; std::vector x_ref(x); T alpha = T{0}; @@ -87,4 +87,4 @@ TYPED_TEST(xscalv, zero_alpha_x_inf) // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); computediff( n, x.data(), x_ref.data(), incx, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp index 7e37a0e8fc..1bcdd90903 100644 --- a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp @@ -39,35 +39,32 @@ class sscalvGenericTest : public ::testing::TestWithParam> {}; + float>> {}; // Tests using random integers as vector elements. TEST_P( sscalvGenericTest, RandomData ) { - using T = float; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
- //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // alpha - T alpha = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); - - // Set the threshold for the errors: - double thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scalv(conj_alpha, n, incx, alpha, thresh, datatype); + using T = float; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // alpha + T alpha = std::get<3>(GetParam()); + + // Set the threshold for the errors: + double thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scalv( conj_alpha, n, incx, alpha, thresh ); } // Used to generate a test case with a sensible name. 
@@ -77,12 +74,11 @@ TEST_P( sscalvGenericTest, RandomData ) class sscalvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); float alpha = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "sscal_"; #elif TEST_CBLAS @@ -96,7 +92,6 @@ class sscalvGenericTestPrint { str_name += "_" + incx_str; std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -109,8 +104,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, not conj(x) (since it is real) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(float(3.0), float(-5.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(3.0), float(-5.0)) // alpha ), ::sscalvGenericTestPrint() ); @@ -126,8 +120,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conjugate ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(float(9.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(9.0)) // alpha ), ::sscalvGenericTestPrint() ); @@ -143,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x - ::testing::Values(float(2.0)), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0)) // alpha ), ::sscalvGenericTestPrint() ); @@ -161,9 +153,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x - ::testing::Values(3), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(3) // alpha ), ::sscalvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/test_scalv.h b/gtestsuite/testsuite/level1/scalv/test_scalv.h index bfe7f9bfde..a90405d7c6 100644 --- a/gtestsuite/testsuite/level1/scalv/test_scalv.h +++ b/gtestsuite/testsuite/level1/scalv/test_scalv.h @@ -43,27 +43,27 @@ */ template -static void test_scalv(char conja_alpha, gtint_t n, gtint_t incx, T alpha, double thresh, char datatype) +static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, T alpha, double thresh ) { //---------------------------------------------------------- // Initialize vector with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- // Create a copy of y so that we can check reference results. 
std::vector x_ref(x); - testinghelpers::ref_scalv(conja_alpha, n, alpha, x_ref.data(), incx); + testinghelpers::ref_scalv( conja_alpha, n, alpha, x_ref.data(), incx ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - scalv(conja_alpha, n, alpha, x.data(), incx); + scalv( conja_alpha, n, alpha, x.data(), incx ); //---------------------------------------------------------- // Compute component-wise error. //---------------------------------------------------------- computediff( n, x.data(), x_ref.data(), incx, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp index 6ddf2489d9..6336a121cc 100644 --- a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp @@ -39,35 +39,32 @@ class zscalvGenericTest : public ::testing::TestWithParam> {}; + dcomplex>> {}; // Tests using random integers as vector elements. TEST_P( zscalvGenericTest, RandomData ) { - using T = dcomplex; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // denotes whether alpha or conj(alpha) will be used: - char conj_alpha = std::get<0>(GetParam()); - // vector length: - gtint_t n = std::get<1>(GetParam()); - // stride size for x: - gtint_t incx = std::get<2>(GetParam()); - // alpha - T alpha = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); + using T = dcomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // alpha + T alpha = std::get<3>(GetParam()); - // Set the threshold for the errors: - double thresh = testinghelpers::getEpsilon(); - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - test_scalv(conj_alpha, n, incx, alpha, thresh, datatype); + // Set the threshold for the errors: + double thresh = testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scalv( conj_alpha, n, incx, alpha, thresh ); } // Used to generate a test case with a sensible name. @@ -77,12 +74,11 @@ TEST_P( zscalvGenericTest, RandomData ) class zscalvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); dcomplex alpha = std::get<3>(str.param); - char datatype = std::get<4>(str.param); #ifdef TEST_BLAS std::string str_name = "zscal_"; #elif TEST_CBLAS @@ -97,7 +93,6 @@ class zscalvGenericTestPrint { std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? 
std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -114,8 +109,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(dcomplex{3.0, -2.0}, dcomplex{-1.0, 4.0}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{3.0, -2.0}, dcomplex{-1.0, 4.0}) // alpha ), ::zscalvGenericTestPrint() ); @@ -135,8 +129,7 @@ INSTANTIATE_TEST_SUITE_P( ), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x - ::testing::Values(dcomplex{1.0, 2.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{1.0, 2.1}) // alpha ), ::zscalvGenericTestPrint() ); @@ -152,9 +145,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x - ::testing::Values(dcomplex{4.0, 3.1}), // alpha - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{4.0, 3.1}) // alpha ), ::zscalvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/setv/csetv_generic.cpp b/gtestsuite/testsuite/level1/setv/csetv_generic.cpp index 2a2daf72fd..2d6a9d8320 100644 --- a/gtestsuite/testsuite/level1/setv/csetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/csetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::csetvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp b/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp index 6051169bbc..8a9bef8184 100644 --- a/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dsetvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/setv/setv.h b/gtestsuite/testsuite/level1/setv/setv.h index 651ec36b90..08a277dedb 100644 --- a/gtestsuite/testsuite/level1/setv/setv.h +++ b/gtestsuite/testsuite/level1/setv/setv.h @@ -77,4 +77,4 @@ static void setv(char conjalpha, gtint_t n, T* alpha, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/setv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp b/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp index 2590619ea2..2c94385e1e 100644 --- a/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ssetvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level1/setv/test_setv.h b/gtestsuite/testsuite/level1/setv/test_setv.h index 09bd121f6e..1fa4d3f6ab 100644 --- a/gtestsuite/testsuite/level1/setv/test_setv.h +++ b/gtestsuite/testsuite/level1/setv/test_setv.h @@ -43,7 +43,8 @@ */ template -void test_setv( char conjalpha, gtint_t n, T alpha, gtint_t incx ) { +void test_setv( char conjalpha, gtint_t n, T alpha, gtint_t incx ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- @@ -71,4 +72,4 @@ void test_setv( char conjalpha, gtint_t n, T alpha, gtint_t incx ) { i = (incx > 0) ? (idx * incx) : ( - ( n - idx - 1 ) * incx ); EXPECT_EQ(x[i], alpha_ref) << "blis_sol[" << i << "]="<< x[i] <<" ref = " << alpha_ref; } -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp b/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp index d12271612f..e54bdfa887 100644 --- a/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zsetvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/subv/csubv_generic.cpp b/gtestsuite/testsuite/level1/subv/csubv_generic.cpp index 7b98a8ebfb..c61b27e4ae 100644 --- a/gtestsuite/testsuite/level1/subv/csubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/csubv_generic.cpp @@ -36,7 +36,7 @@ #include "test_subv.h" class csubvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csubvGenericTest); @@ -55,8 +55,6 @@ TEST_P( csubvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double 
thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( csubvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_subv(conj_x, n, incx, incy, thresh, datatype); + test_subv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class csubvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_csubv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class csubvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n','c'), // n: not transpose for x, c: conjugate for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(1), gtint_t(4)), // stride size for x - ::testing::Values(gtint_t(1), gtint_t(7)), // stride size for y - ::testing::Values(ELEMENT_TYPE,'f') // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1), gtint_t(7)) // stride size for y ), ::csubvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp b/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp index 9b31bcb102..f34f4f28a3 100644 --- a/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp @@ -36,7 +36,7 @@ #include "test_subv.h" class dsubvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dsubvGenericTest); @@ -55,8 +55,6 @@ TEST_P( dsubvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( dsubvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_subv(conj_x, n, incx, incy, thresh, datatype); + test_subv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class dsubvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_dsubv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ 
class dsubvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: not transpose for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1), gtint_t(4)), // stride size for x - ::testing::Values(gtint_t(1), gtint_t(7)), // stride size for y - ::testing::Values(ELEMENT_TYPE,'f') // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1), gtint_t(7)) // stride size for y ), ::dsubvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp b/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp index 4d96efc4e1..5447b08699 100644 --- a/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp @@ -36,7 +36,7 @@ #include "test_subv.h" class ssubvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ssubvGenericTest); @@ -55,8 +55,6 @@ TEST_P( ssubvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( ssubvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_subv(conj_x, n, incx, incy, thresh, datatype); + test_subv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class 
ssubvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_ssubv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class ssubvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: not transpose for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1), gtint_t(4)), // stride size for x - ::testing::Values(gtint_t(1), gtint_t(7)), // stride size for y - ::testing::Values(ELEMENT_TYPE,'f') // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1), gtint_t(7)) // stride size for y ), ::ssubvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/subv/subv.h b/gtestsuite/testsuite/level1/subv/subv.h index ff5059d6ff..f0a9da4c65 100644 --- a/gtestsuite/testsuite/level1/subv/subv.h +++ b/gtestsuite/testsuite/level1/subv/subv.h @@ -78,4 +78,4 @@ static void subv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) #else throw std::runtime_error("Error in testsuite/level1/subv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/subv/test_subv.h b/gtestsuite/testsuite/level1/subv/test_subv.h index 9406823bd3..db9c64bbaf 100644 --- a/gtestsuite/testsuite/level1/subv/test_subv.h +++ b/gtestsuite/testsuite/level1/subv/test_subv.h @@ -43,29 +43,28 
@@ */ template -void test_subv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, - double thresh, char datatype ) { +void test_subv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- // Create a copy of y so that we can check reference results. std::vector y_ref(y); - testinghelpers::ref_subv(conjx, n, x.data(), incx, y_ref.data(), incy); + testinghelpers::ref_subv( conjx, n, x.data(), incx, y_ref.data(), incy ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - subv(conjx, n, x.data(), incx, y.data(), incy); + subv( conjx, n, x.data(), incx, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); - -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp b/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp index 2fa7236e64..270c2a1c83 100644 --- a/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp @@ -36,7 +36,7 @@ #include "test_subv.h" class zsubvGenericTest : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zsubvGenericTest); @@ -55,8 +55,6 @@ TEST_P( zsubvGenericTest, RandomData ) gtint_t incx = std::get<2>(GetParam()); // stride size for y: gtint_t incy = std::get<3>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<4>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -64,19 +62,18 @@ TEST_P( zsubvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_subv(conj_x, n, incx, incy, thresh, datatype); + test_subv( conj_x, n, incx, incy, thresh ); } // Prints the test case combination class zsubvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); - char datatype = std::get<4>(str.param); std::string str_name = "bli_zsubv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -84,7 +81,6 @@ class zsubvGenericTestPrint { str_name += "_" + incx_str; std::string incy_str = ( incy > 0) ? 
std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -98,9 +94,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n','c'), // n: not transpose for x, c: conjugate for x ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1), gtint_t(4)), // stride size for x - ::testing::Values(gtint_t(1), gtint_t(7)), // stride size for y - ::testing::Values(ELEMENT_TYPE,'f') // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(1), gtint_t(7)) // stride size for y ), ::zsubvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp index 7af0647138..6fb81b92aa 100644 --- a/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp @@ -40,8 +40,7 @@ class cxpbyvGenericTest : gtint_t, gtint_t, gtint_t, - scomplex, - char>> {}; + scomplex>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cxpbyvGenericTest); @@ -63,15 +62,13 @@ TEST_P( cxpbyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // beta T beta = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_xpbyv(conj_x, n, incx, incy, beta, thresh, datatype); + test_xpbyv( conj_x, n, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. 
@@ -81,13 +78,12 @@ TEST_P( cxpbyvGenericTest, RandomData ) class cxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); scomplex beta = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_cxpbyv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -98,7 +94,6 @@ class cxpbyvGenericTestPrint { std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // beta ), ::cxpbyvGenericTestPrint() ); @@ -130,8 +124,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(2), gtint_t(11)), /*(gtint_t(-5), gtint_t(-17))*/ // stride size for x ::testing::Values(gtint_t(3), gtint_t(33)), /*(gtint_t(-12), gtint_t(-4))*/ // stride size for y - ::testing::Values(scomplex{4.0, 3.1}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(scomplex{4.0, 3.1}) // beta ), ::cxpbyvGenericTestPrint() ); diff --git a/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp index 15e06808c0..fef51802f4 100644 --- a/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp @@ -40,8 +40,7 @@ class dxpbyvGenericTest : gtint_t, gtint_t, gtint_t, - double, - char>> {}; + double>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dxpbyvGenericTest); @@ -63,8 +62,6 @@ TEST_P( dxpbyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // beta T beta = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); @@ -72,7 +69,7 @@ TEST_P( dxpbyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_xpbyv(conj_x, n, incx, incy, beta, thresh, datatype); + test_xpbyv( conj_x, n, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. 
@@ -82,13 +79,12 @@ TEST_P( dxpbyvGenericTest, RandomData ) class dxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); double beta = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_dxpbyv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -98,7 +94,6 @@ class dxpbyvGenericTestPrint { str_name += "_" + incy_str; std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(double(2.0), double(-2.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(2.0), double(-2.0)) // beta ), ::dxpbyvGenericTestPrint() ); @@ -131,8 +125,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(double(2.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(2.0)) // beta ), ::dxpbyvGenericTestPrint() ); @@ -149,9 +142,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x 
::testing::Values(gtint_t(3), gtint_t(33)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y - ::testing::Values(double(4.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(double(4.0)) // beta ), ::dxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp index b424025ce7..7c9120e276 100644 --- a/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp @@ -40,8 +40,7 @@ class sxpbyvGenericTest : gtint_t, gtint_t, gtint_t, - float, - char>> {}; + float>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sxpbyvGenericTest); @@ -63,8 +62,6 @@ TEST_P( sxpbyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // beta T beta = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: float thresh = 2*testinghelpers::getEpsilon(); @@ -72,7 +69,7 @@ TEST_P( sxpbyvGenericTest, RandomData ) //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_xpbyv(conj_x, n, incx, incy, beta, thresh, datatype); + test_xpbyv( conj_x, n, incx, incy, beta, thresh ); } // Used to generate a test case with a sensible name. 
@@ -82,13 +79,12 @@ TEST_P( sxpbyvGenericTest, RandomData ) class sxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); float beta = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_sxpbyv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -98,7 +94,6 @@ class sxpbyvGenericTestPrint { str_name += "_" + incy_str; std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(float(2.0), float(-2.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0), float(-2.0)) // beta ), ::sxpbyvGenericTestPrint() ); @@ -130,8 +124,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(float(2.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(2.0)) // beta ), ::sxpbyvGenericTestPrint() ); @@ -148,9 +141,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector ::testing::Values(gtint_t(2), gtint_t(11)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x ::testing::Values(gtint_t(3), 
gtint_t(33)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y - ::testing::Values(float(4.0)), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(float(4.0)) // beta ), ::sxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h b/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h index 46af04c30e..5b1534582e 100644 --- a/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h +++ b/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h @@ -43,29 +43,29 @@ */ template -static void test_xpbyv(char conjx, gtint_t n, gtint_t incx, gtint_t incy, - T beta, double thresh, char datatype ) { - +static void test_xpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, + T beta, double thresh ) +{ //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-10, 10, n, incy, datatype); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- // Create a copy of y so that we can check reference results. std::vector y_ref(y); - testinghelpers::ref_xpbyv(conjx, n, x.data(), incx, beta, y_ref.data(), incy); + testinghelpers::ref_xpbyv( conjx, n, x.data(), incx, beta, y_ref.data(), incy ); //---------------------------------------------------------- // Call BLIS function. 
//---------------------------------------------------------- - xpbyv(conjx, n, x.data(), incx, beta, y.data(), incy); + xpbyv( conjx, n, x.data(), incx, beta, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/xpbyv/xpbyv.h b/gtestsuite/testsuite/level1/xpbyv/xpbyv.h index 2b3a15fbd5..21212f6834 100644 --- a/gtestsuite/testsuite/level1/xpbyv/xpbyv.h +++ b/gtestsuite/testsuite/level1/xpbyv/xpbyv.h @@ -79,4 +79,4 @@ static void xpbyv(char conj_x, gtint_t n, T* x, gtint_t incx, T beta, T* y, gtin #else throw std::runtime_error("Error in testsuite/level1/xpbyv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp index cea3e8a086..e648e83f0d 100644 --- a/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp @@ -40,8 +40,7 @@ class zxpbyvGenericTest : gtint_t, gtint_t, gtint_t, - dcomplex, - char>> {}; + dcomplex>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zxpbyvGenericTest); @@ -63,15 +62,13 @@ TEST_P( zxpbyvGenericTest, RandomData ) gtint_t incy = std::get<3>(GetParam()); // beta T beta = std::get<4>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<5>(GetParam()); // Set the threshold for the errors: double thresh = 2*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_xpbyv(conj_x, n, incx, incy, beta, thresh, datatype); + test_xpbyv( conj_x, n, incx, incy, beta, thresh ); } // Used to generate a test case with a 
sensible name. @@ -81,13 +78,12 @@ TEST_P( zxpbyvGenericTest, RandomData ) class zxpbyvGenericTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char conj = std::get<0>(str.param); gtint_t n = std::get<1>(str.param); gtint_t incx = std::get<2>(str.param); gtint_t incy = std::get<3>(str.param); dcomplex beta = std::get<4>(str.param); - char datatype = std::get<5>(str.param); std::string str_name = "bli_zxpbyv"; str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); @@ -98,7 +94,6 @@ class zxpbyvGenericTestPrint { std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_b" + beta_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -113,8 +108,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), /*(gtint_t(-5), gtint_t(-17))*/ // stride size for x ::testing::Values(gtint_t(1)), /*(gtint_t(-12), gtint_t(-4))*/ // stride size for y - ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}) // beta ), ::zxpbyvGenericTestPrint() ); @@ -130,9 +124,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
::testing::Values(gtint_t(2), gtint_t(11)), /*(gtint_t(-5), gtint_t(-17))*/ // stride size for x ::testing::Values(gtint_t(3), gtint_t(33)), /*(gtint_t(-12), gtint_t(-4))*/ // stride size for y - ::testing::Values(dcomplex{4.0, 3.1}), // beta - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(dcomplex{4.0, 3.1}) // beta ), ::zxpbyvGenericTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp index 8c0cb5200a..03b7762d79 100644 --- a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp @@ -45,12 +45,11 @@ class cgemvTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(cgemvTest, RandomData) { +TEST_P(cgemvTest, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(cgemvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(cgemvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemv(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_gemv( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh ); } class cgemvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char transa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -103,7 +100,6 @@ class cgemvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "cgemv_"; #elif TEST_CBLAS @@ -126,7 +122,6 @@ class cgemvTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -149,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{-1.0, 1.0}), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::cgemvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp index 
4fc91b1f46..7357097204 100644 --- a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp @@ -45,12 +45,11 @@ class dgemvTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dgemvTest, RandomData) { +TEST_P(dgemvTest, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(dgemvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(dgemvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemv(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_gemv( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh ); } class dgemvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char transa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -103,7 +100,6 @@ class dgemvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "dgemv_"; #elif TEST_CBLAS @@ -124,7 +120,6 @@ class dgemvTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; str_name = str_name 
+ "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -147,8 +142,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0 ), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::dgemvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/gemv/gemv.h b/gtestsuite/testsuite/level2/gemv/gemv.h index d6cc12f2db..d7d66d6264 100644 --- a/gtestsuite/testsuite/level2/gemv/gemv.h +++ b/gtestsuite/testsuite/level2/gemv/gemv.h @@ -147,4 +147,4 @@ static void gemv( char storage, char trans, char conj_x, gtint_t m, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/gemv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp index a6906559eb..ec0d19bd4a 100644 --- a/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp @@ -45,12 +45,11 @@ class sgemvTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(sgemvTest, RandomData) { +TEST_P(sgemvTest, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(sgemvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(sgemvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemv(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_gemv( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh ); } class sgemvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char transa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -103,7 +100,6 @@ class sgemvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "sgemv_"; #elif TEST_CBLAS @@ -124,7 +120,6 @@ class sgemvTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -147,8 +142,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0 ), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::sgemvTestPrint() ); diff --git a/gtestsuite/testsuite/level2/gemv/test_gemv.h b/gtestsuite/testsuite/level2/gemv/test_gemv.h index 7d3dfc14d6..a3b3ccf653 100644 --- 
a/gtestsuite/testsuite/level2/gemv/test_gemv.h +++ b/gtestsuite/testsuite/level2/gemv/test_gemv.h @@ -43,11 +43,10 @@ template void test_gemv( char storage, char trnsa, char conjx, gtint_t m, gtint_t n, - T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy, - double thresh, char datatype ) { - + T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy, double thresh ) +{ // Compute the leading dimensions for matrix size calculation. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', m, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', m, n, lda_inc ); // Get correct vector lengths. gtint_t lenx = ( testinghelpers::chknotrans( trnsa ) ) ? n : m ; @@ -56,9 +55,9 @@ void test_gemv( char storage, char trnsa, char conjx, gtint_t m, gtint_t n, //---------------------------------------------------------- // Initialize matrics with random integer numbers. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(1, 5, storage, 'n', m, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(1, 3, lenx, incx, datatype); - std::vector y = testinghelpers::get_random_vector(1, 3, leny, incy, datatype); + std::vector a = testinghelpers::get_random_matrix( 1, 5, storage, 'n', m, n, lda ); + std::vector x = testinghelpers::get_random_vector( 1, 3, lenx, incx ); + std::vector y = testinghelpers::get_random_vector( 1, 3, leny, incy ); // Create a copy of c so that we can check reference results. std::vector y_ref(y); @@ -78,4 +77,4 @@ void test_gemv( char storage, char trnsa, char conjx, gtint_t m, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( leny, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp index 74d95b5b13..94700a36b7 100644 --- a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp @@ -45,12 +45,11 @@ class zgemvTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zgemvTest, RandomData) { +TEST_P(zgemvTest, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(zgemvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(zgemvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemv(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_gemv( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh ); } class zgemvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char transa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -103,7 +100,6 @@ class zgemvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t 
ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "zgemv_"; #elif TEST_CBLAS @@ -126,7 +122,6 @@ class zgemvTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -149,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{-1.0, 1.0}), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::zgemvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/ger/cger_generic.cpp b/gtestsuite/testsuite/level2/ger/cger_generic.cpp index 7dcd4fea70..29646c656e 100644 --- a/gtestsuite/testsuite/level2/ger/cger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/cger_generic.cpp @@ -44,12 +44,11 @@ class cgerTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(cgerTest, RandomData) { +TEST_P(cgerTest, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(cgerTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(cgerTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_ger(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_ger( storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh ); } class cgerTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -99,7 +96,6 @@ class cgerTestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "cger_"; #elif TEST_CBLAS @@ -119,7 +115,6 @@ class cgerTestPrint { alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? 
std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -141,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{1.0, -2.0}), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::cgerTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/ger/dger_generic.cpp b/gtestsuite/testsuite/level2/ger/dger_generic.cpp index 043a165407..b8142cb685 100644 --- a/gtestsuite/testsuite/level2/ger/dger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/dger_generic.cpp @@ -44,12 +44,11 @@ class dgerTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dgerTest, RandomData) { +TEST_P(dgerTest, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(dgerTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(dgerTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_ger(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_ger( storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh ); } class dgerTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -99,7 +96,6 @@ class dgerTestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "dger_"; #elif TEST_CBLAS @@ -118,7 +114,6 @@ class dgerTestPrint { std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( 1.0 ), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::dgerTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/ger/ger.h b/gtestsuite/testsuite/level2/ger/ger.h index c6747f6c7a..f211c4cbba 100644 --- a/gtestsuite/testsuite/level2/ger/ger.h +++ b/gtestsuite/testsuite/level2/ger/ger.h @@ -155,4 +155,4 @@ static void ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/ger.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/ger/sger_generic.cpp b/gtestsuite/testsuite/level2/ger/sger_generic.cpp index 113dee0342..0dc66d658b 100644 --- a/gtestsuite/testsuite/level2/ger/sger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/sger_generic.cpp @@ -44,12 +44,11 @@ class sgerTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(sgerTest, RandomData) { +TEST_P(sgerTest, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(sgerTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 4*std::max(m,n)*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(sgerTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_ger(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_ger( storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh ); } class sgerTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -99,7 +96,6 @@ class sgerTestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "sger_"; #elif TEST_CBLAS @@ -118,7 +114,6 @@ class sgerTestPrint { std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( 1.0 ), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::sgerTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/ger/test_ger.h b/gtestsuite/testsuite/level2/ger/test_ger.h index a85a13a7e9..fd43d8fb49 100644 --- a/gtestsuite/testsuite/level2/ger/test_ger.h +++ b/gtestsuite/testsuite/level2/ger/test_ger.h @@ -43,18 +43,17 @@ template void test_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, - T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh, - char datatype ) { - + T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh ) +{ // Compute the leading dimensions for matrix size calculation. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', m, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', m, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', m, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, m, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-3, 3, n, incy, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', m, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, m, incx ); + std::vector y = testinghelpers::get_random_vector( -3, 3, n, incy ); // Create a copy of c so that we can check reference results. std::vector a_ref(a); @@ -74,4 +73,4 @@ void test_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, a.data(), a_ref.data(), lda, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/ger/zger_generic.cpp b/gtestsuite/testsuite/level2/ger/zger_generic.cpp index 0f32161eaa..293d7c5f88 100644 --- a/gtestsuite/testsuite/level2/ger/zger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/zger_generic.cpp @@ -44,12 +44,11 @@ class zgerTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zgerTest, RandomData) { +TEST_P(zgerTest, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(zgerTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(zgerTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_ger(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_ger( storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh ); } class zgerTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char conjx = std::get<1>(str.param); char conjy = std::get<2>(str.param); @@ -99,7 +96,6 @@ class zgerTestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "zger_"; #elif TEST_CBLAS @@ -119,7 +115,6 @@ class zgerTestPrint { alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? 
std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -141,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{1.0, -2.0}), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::zgerTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp index ed650d0229..b59ee251fd 100644 --- a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp +++ b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp @@ -45,12 +45,11 @@ class chemvTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(chemvTest, RandomData) { +TEST_P(chemvTest, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(chemvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 4*std::sqrt(n)*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(chemvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_hemv(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_hemv( storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh ); } class chemvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conja = std::get<2>(str.param); @@ -103,7 +100,6 @@ class chemvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "chemv_"; #elif TEST_CBLAS @@ -125,7 +121,6 @@ class chemvTestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -148,8 +143,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{2.0, -1.0}), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::chemvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/hemv/hemv.h b/gtestsuite/testsuite/level2/hemv/hemv.h index 
90086336a7..7dbf7a961f 100644 --- a/gtestsuite/testsuite/level2/hemv/hemv.h +++ b/gtestsuite/testsuite/level2/hemv/hemv.h @@ -135,4 +135,4 @@ static void hemv( char storage, char uploa, char conja, char conjx, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/hemv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/hemv/test_hemv.h b/gtestsuite/testsuite/level2/hemv/test_hemv.h index 8f8357e96e..4985c4644e 100644 --- a/gtestsuite/testsuite/level2/hemv/test_hemv.h +++ b/gtestsuite/testsuite/level2/hemv/test_hemv.h @@ -43,18 +43,17 @@ template void test_hemv( char storage, char uploa, char conja, char conjx, gtint_t n, - T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy, - double thresh, char datatype ) { - + T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy, double thresh ) +{ // Compute the leading dimensions of a. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-3, 3, n, incy, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); + std::vector y = testinghelpers::get_random_vector( -3, 3, n, incy ); mkherm( storage, uploa, n, a.data(), lda ); mktrim( storage, uploa, n, a.data(), lda ); @@ -77,4 +76,4 @@ void test_hemv( char storage, char uploa, char conja, char conjx, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp index 1f60f25468..7ee8c9b21f 100644 --- a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp +++ b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp @@ -45,12 +45,11 @@ class zhemvTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zhemvTest, RandomData) { +TEST_P(zhemvTest, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(zhemvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 8*std::sqrt(n)*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(zhemvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_hemv(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_hemv( storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh ); } class zhemvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conja = std::get<2>(str.param); @@ -103,7 +100,6 @@ class zhemvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t 
ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "zhemv_"; #elif TEST_CBLAS @@ -125,7 +121,6 @@ class zhemvTestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -148,8 +143,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{2.0, -1.0}), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::zhemvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her/cher_generic.cpp b/gtestsuite/testsuite/level2/her/cher_generic.cpp index 2805f17f23..fc1797ec34 100644 --- a/gtestsuite/testsuite/level2/her/cher_generic.cpp +++ b/gtestsuite/testsuite/level2/her/cher_generic.cpp @@ -42,12 +42,11 @@ class cherTest : gtint_t, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(cherTest, RandomData) { +TEST_P(cherTest, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -68,8 +67,6 @@ TEST_P(cherTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = 4*std::sqrt(n)*testinghelpers::getEpsilon(); @@ -77,13 +74,13 @@ TEST_P(cherTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_her(storage, uploa, conjx, n, alpha, incx, lda_inc, thresh, datatype); + test_her( storage, uploa, conjx, n, alpha, incx, lda_inc, thresh ); } class cherTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -91,7 +88,6 @@ class cherTestPrint { float alpha = std::get<4>(str.param); gtint_t incx = std::get<5>(str.param); gtint_t ld_inc = std::get<6>(str.param); - char datatype = std::get<7>(str.param); #ifdef TEST_BLAS std::string str_name = "cher_"; #elif TEST_CBLAS @@ -107,7 +103,6 @@ class cherTestPrint { std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha)))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -127,8 +122,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(1.0), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::cherTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her/her.h b/gtestsuite/testsuite/level2/her/her.h index ea7d3008c7..a21d907008 100644 --- a/gtestsuite/testsuite/level2/her/her.h +++ b/gtestsuite/testsuite/level2/her/her.h @@ -123,4 +123,4 @@ static void her( char storage, char uploa, char conj_x, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/her.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her/test_her.h b/gtestsuite/testsuite/level2/her/test_her.h index ad8a351eb1..6e18e04810 100644 --- a/gtestsuite/testsuite/level2/her/test_her.h +++ b/gtestsuite/testsuite/level2/her/test_her.h @@ -43,16 +43,16 @@ template void test_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha, - gtint_t incx, gtint_t lda_inc, double thresh, char datatype ) { - + gtint_t incx, gtint_t lda_inc, double thresh ) +{ // Compute the leading dimensions of a. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, n, incx, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); mktrim( storage, uploa, n, a.data(), lda ); @@ -73,4 +73,4 @@ void test_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha, // check component-wise error. //---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her/zher_generic.cpp b/gtestsuite/testsuite/level2/her/zher_generic.cpp index 902820d3ca..eb5d6b40e4 100644 --- a/gtestsuite/testsuite/level2/her/zher_generic.cpp +++ b/gtestsuite/testsuite/level2/her/zher_generic.cpp @@ -42,12 +42,11 @@ class zherTest : gtint_t, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zherTest, RandomData) { +TEST_P(zherTest, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -68,8 +67,6 @@ TEST_P(zherTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = 4*std::sqrt(n)*testinghelpers::getEpsilon(); @@ -77,13 +74,13 @@ TEST_P(zherTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_her(storage, uploa, conjx, n, alpha, incx, lda_inc, thresh, datatype); + test_her( storage, uploa, conjx, n, alpha, incx, lda_inc, thresh ); } class zherTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -91,7 +88,6 @@ class zherTestPrint { double alpha = std::get<4>(str.param); gtint_t incx = std::get<5>(str.param); gtint_t ld_inc = std::get<6>(str.param); - char datatype = std::get<7>(str.param); #ifdef TEST_BLAS std::string str_name = "zher_"; #elif TEST_CBLAS @@ -107,7 +103,6 @@ class zherTestPrint { std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha)))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -127,8 +122,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(1.0), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::zherTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp index 7c7f16bf72..472d30c745 100644 --- a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp +++ b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp @@ -44,12 +44,11 @@ class cher2Test : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(cher2Test, RandomData) { +TEST_P(cher2Test, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(cher2Test, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 4*n*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(cher2Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_her2(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_her2( storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh ); } class cher2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -99,7 +96,6 @@ class cher2TestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "cher2_"; #elif TEST_CBLAS @@ -118,7 +114,6 @@ class cher2TestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{1.0, -2.0}), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::cher2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her2/her2.h b/gtestsuite/testsuite/level2/her2/her2.h index 759b2d90d2..d68d7e4f7d 100644 --- 
a/gtestsuite/testsuite/level2/her2/her2.h +++ b/gtestsuite/testsuite/level2/her2/her2.h @@ -128,4 +128,4 @@ static void her2( char storage, char uploa, char conj_x, char conj_y, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/her2.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her2/test_her2.h b/gtestsuite/testsuite/level2/her2/test_her2.h index 10814b90db..f896e89cac 100644 --- a/gtestsuite/testsuite/level2/her2/test_her2.h +++ b/gtestsuite/testsuite/level2/her2/test_her2.h @@ -43,18 +43,17 @@ template void test_her2( char storage, char uploa, char conjx, char conjy, gtint_t n, - T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh, - char datatype ) { - + T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh ) +{ // Compute the leading dimensions of a. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-2, 5, n, incy, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); + std::vector y = testinghelpers::get_random_vector( -2, 5, n, incy ); mkherm( storage, uploa, n, a.data(), lda ); mktrim( storage, uploa, n, a.data(), lda ); @@ -77,4 +76,4 @@ void test_her2( char storage, char uploa, char conjx, char conjy, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp index c7bc0bcd9a..f09e8fb104 100644 --- a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp +++ b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp @@ -44,12 +44,11 @@ class zher2Test : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zher2Test, RandomData) { +TEST_P(zher2Test, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(zher2Test, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 6*std::sqrt(n)*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(zher2Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_her2(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_her2( storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh ); } class zher2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -99,7 +96,6 @@ class zher2TestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc 
= std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "zher2_"; #elif TEST_CBLAS @@ -118,7 +114,6 @@ class zher2TestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{1.0, -2.0}), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::zher2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp index a8ca008deb..d768de0734 100644 --- a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp +++ b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp @@ -45,12 +45,11 @@ class dsymvTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dsymvTest, RandomData) { +TEST_P(dsymvTest, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(dsymvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(dsymvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_symv(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_symv( storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh ); } class dsymvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conja = std::get<2>(str.param); @@ -103,7 +100,6 @@ class dsymvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "dsymv_"; #elif TEST_CBLAS @@ -123,7 +119,6 @@ class dsymvTestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -146,8 +141,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( 2.0, -1.0 ), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dsymvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp index 
498a7b89c9..520befd98f 100644 --- a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp +++ b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp @@ -45,12 +45,11 @@ class ssymvTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ssymvTest, RandomData) { +TEST_P(ssymvTest, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -77,8 +76,6 @@ TEST_P(ssymvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*testinghelpers::getEpsilon(); @@ -86,13 +83,13 @@ TEST_P(ssymvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_symv(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); + test_symv( storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh ); } class ssymvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conja = std::get<2>(str.param); @@ -103,7 +100,6 @@ class ssymvTestPrint { gtint_t incx = std::get<7>(str.param); gtint_t incy = std::get<8>(str.param); gtint_t ld_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "ssymv_"; #elif TEST_CBLAS @@ -123,7 +119,6 @@ class ssymvTestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + 
std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -146,8 +141,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( 2.0, -1.0 ), // beta ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::ssymvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/symv/symv.h b/gtestsuite/testsuite/level2/symv/symv.h index 2d77b25de4..78a7aaf0a1 100644 --- a/gtestsuite/testsuite/level2/symv/symv.h +++ b/gtestsuite/testsuite/level2/symv/symv.h @@ -130,4 +130,4 @@ static void symv( char storage, char uploa, char conja, char conjx, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/symv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/symv/test_symv.h b/gtestsuite/testsuite/level2/symv/test_symv.h index 22c556d346..a808060d52 100644 --- a/gtestsuite/testsuite/level2/symv/test_symv.h +++ b/gtestsuite/testsuite/level2/symv/test_symv.h @@ -43,18 +43,17 @@ template void test_symv( char storage, char uploa, char conja, char conjx, gtint_t n, - T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy, - double thresh, char datatype ) { - + T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy, double thresh ) +{ // Compute the leading dimensions of a. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-2, 5, n, incy, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); + std::vector y = testinghelpers::get_random_vector( -2, 5, n, incy ); mksymm( storage, uploa, n, a.data(), lda ); mktrim( storage, uploa, n, a.data(), lda ); @@ -77,4 +76,4 @@ void test_symv( char storage, char uploa, char conja, char conjx, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp index d80e990298..e2aef734ed 100644 --- a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp +++ b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp @@ -42,12 +42,11 @@ class dsyrTest : gtint_t, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dsyrTest, RandomData) { +TEST_P(dsyrTest, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -68,8 +67,6 @@ TEST_P(dsyrTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = 2*n*testinghelpers::getEpsilon(); @@ -77,13 +74,13 @@ TEST_P(dsyrTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr(storage, uploa, conjx, n, alpha, incx, lda_inc, thresh, datatype); + test_syr( storage, uploa, conjx, n, alpha, incx, lda_inc, thresh ); } class dsyrTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -91,7 +88,6 @@ class dsyrTestPrint { double alpha = std::get<4>(str.param); gtint_t incx = std::get<5>(str.param); gtint_t ld_inc = std::get<6>(str.param); - char datatype = std::get<7>(str.param); #ifdef TEST_BLAS std::string str_name = "dsyr_"; #elif TEST_CBLAS @@ -107,7 +103,6 @@ class dsyrTestPrint { std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha)))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -127,8 +122,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(1.0), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dsyrTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp index 9e44b518f6..66d4d1ce0e 100644 --- a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp +++ b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp @@ -42,12 +42,11 @@ class ssyrTest : gtint_t, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ssyrTest, RandomData) { +TEST_P(ssyrTest, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -68,8 +67,6 @@ TEST_P(ssyrTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<6>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<7>(GetParam()); // Set the threshold for the errors: double thresh = 2*n*testinghelpers::getEpsilon(); @@ -77,13 +74,13 @@ TEST_P(ssyrTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr(storage, uploa, conjx, n, alpha, incx, lda_inc, thresh, datatype); + test_syr( storage, uploa, conjx, n, alpha, incx, lda_inc, thresh ); } class ssyrTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -91,7 +88,6 @@ class ssyrTestPrint { float alpha = std::get<4>(str.param); gtint_t incx = std::get<5>(str.param); gtint_t ld_inc = std::get<6>(str.param); - char datatype = std::get<7>(str.param); #ifdef TEST_BLAS std::string str_name = "ssyr_"; #elif TEST_CBLAS @@ -107,7 +103,6 @@ class ssyrTestPrint { std::string alpha_str = ( alpha > 0) ? 
std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha)))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -127,8 +122,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(1.0), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::ssyrTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr/syr.h b/gtestsuite/testsuite/level2/syr/syr.h index e16d5c5322..dad1b9f278 100644 --- a/gtestsuite/testsuite/level2/syr/syr.h +++ b/gtestsuite/testsuite/level2/syr/syr.h @@ -125,4 +125,4 @@ static void syr( char storage, char uploa, char conj_x, gtint_t n, T* alpha, #else throw std::runtime_error("Error in testsuite/level2/syr.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr/test_syr.h b/gtestsuite/testsuite/level2/syr/test_syr.h index d8cc9e9ada..3227cc2a4a 100644 --- a/gtestsuite/testsuite/level2/syr/test_syr.h +++ b/gtestsuite/testsuite/level2/syr/test_syr.h @@ -43,16 +43,16 @@ template void test_syr( char storage, char uploa, char conjx, gtint_t n, T alpha, - gtint_t incx, gtint_t lda_inc, double thresh, char datatype ) { - + gtint_t incx, gtint_t lda_inc, double thresh ) +{ // Compute the leading dimensions for matrix size calculation. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, n, incx, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); mktrim( storage, uploa, n, a.data(), lda ); @@ -73,4 +73,4 @@ void test_syr( char storage, char uploa, char conjx, gtint_t n, T alpha, // check component-wise error. //---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp index 896323648c..07266866f7 100644 --- a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp +++ b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp @@ -44,12 +44,11 @@ class dsyr2Test : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dsyr2Test, RandomData) { +TEST_P(dsyr2Test, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(dsyr2Test, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 3*n*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(dsyr2Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr2(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_syr2( storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh ); } class dsyr2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -99,7 +96,6 @@ class dsyr2TestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "dsyr2_"; #elif TEST_CBLAS @@ -117,7 +113,6 @@ class dsyr2TestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -139,8 +134,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(1.0, -2.0), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dsyr2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp index ced6dfdd89..d0ccfb3e79 
100644 --- a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp +++ b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp @@ -44,12 +44,11 @@ class ssyr2Test : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ssyr2Test, RandomData) { +TEST_P(ssyr2Test, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -74,8 +73,6 @@ TEST_P(ssyr2Test, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = 3*n*testinghelpers::getEpsilon(); @@ -83,13 +80,13 @@ TEST_P(ssyr2Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr2(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype); + test_syr2( storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh ); } class ssyr2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char conjx = std::get<2>(str.param); @@ -99,7 +96,6 @@ class ssyr2TestPrint { gtint_t incx = std::get<6>(str.param); gtint_t incy = std::get<7>(str.param); gtint_t ld_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "ssyr2_"; #elif TEST_CBLAS @@ -117,7 +113,6 @@ class ssyr2TestPrint { str_name = str_name + "_" + incx_str; str_name = str_name + "_" + incy_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = 
str_name + "_" + datatype; return str_name; } }; @@ -139,8 +134,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(1.0, -2.0), // alpha ::testing::Values(gtint_t(1)), // stride size for x ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::ssyr2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr2/syr2.h b/gtestsuite/testsuite/level2/syr2/syr2.h index dd51b5497b..622bd0edd8 100644 --- a/gtestsuite/testsuite/level2/syr2/syr2.h +++ b/gtestsuite/testsuite/level2/syr2/syr2.h @@ -128,4 +128,4 @@ static void syr2( char storage, char uploa, char conj_x, char conj_y, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/syr2.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/syr2/test_syr2.h b/gtestsuite/testsuite/level2/syr2/test_syr2.h index 92b8b64baa..9389b67172 100644 --- a/gtestsuite/testsuite/level2/syr2/test_syr2.h +++ b/gtestsuite/testsuite/level2/syr2/test_syr2.h @@ -43,18 +43,17 @@ template void test_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n, - T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh, - char datatype ) { - + T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh ) +{ // Compute the leading dimensions for matrix size calculation. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 5, storage, 'n', n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-3, 3, n, incx, datatype); - std::vector y = testinghelpers::get_random_vector(-3, 3, n, incy, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); + std::vector y = testinghelpers::get_random_vector( -3, 3, n, incy ); mksymm( storage, uploa, n, a.data(), lda ); mktrim( storage, uploa, n, a.data(), lda ); @@ -77,4 +76,4 @@ void test_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp index 61f048c70d..a77120e69f 100644 --- a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp @@ -43,12 +43,11 @@ class ctrmvTest : gtint_t, scomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ctrmvTest, RandomData) { +TEST_P(ctrmvTest, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(ctrmvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(ctrmvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trmv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class ctrmvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class ctrmvTestPrint { scomplex alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "ctrmv_"; #elif TEST_CBLAS @@ -113,7 +109,6 @@ class ctrmvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(9)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of a ), ::ctrmvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp index 869cc69744..cd3e123a9d 100644 --- a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp @@ -43,12 +43,11 @@ class dtrmvTest : gtint_t, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dtrmvTest, RandomData) { +TEST_P(dtrmvTest, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(dtrmvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 20*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(dtrmvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trmv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class dtrmvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class dtrmvTestPrint { double alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "dtrmv_"; #elif TEST_CBLAS @@ -112,7 +108,6 @@ class dtrmvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -137,8 +132,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dtrmvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp index 18bbd93b77..5560dc6094 100644 --- a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp @@ -43,12 +43,11 @@ class strmvTest : gtint_t, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(strmvTest, RandomData) { +TEST_P(strmvTest, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(strmvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(strmvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trmv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class strmvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class strmvTestPrint { float alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "strmv_"; #elif TEST_CBLAS @@ -112,7 +108,6 @@ class strmvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -137,8 +132,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of a ), ::strmvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trmv/test_trmv.h b/gtestsuite/testsuite/level2/trmv/test_trmv.h index 82d8b0d6a3..80cf4d4f5f 100644 --- a/gtestsuite/testsuite/level2/trmv/test_trmv.h +++ b/gtestsuite/testsuite/level2/trmv/test_trmv.h @@ -43,16 +43,16 @@ template void test_trmv( char storage, char uploa, char transa, char diaga, gtint_t n, - T alpha, gtint_t lda_inc, gtint_t incx, double thresh, char datatype ) { - + T alpha, gtint_t lda_inc, gtint_t incx, double thresh ) +{ // Compute the leading dimensions for matrix size calculation. - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, transa, n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, n, n, lda ); + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); mktrim( storage, uploa, n, a.data(), lda ); @@ -72,4 +72,4 @@ void test_trmv( char storage, char uploa, char transa, char diaga, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( n, x.data(), x_ref.data(), incx, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trmv/trmv.h b/gtestsuite/testsuite/level2/trmv/trmv.h index 8ee3750a62..38f10dbea8 100644 --- a/gtestsuite/testsuite/level2/trmv/trmv.h +++ b/gtestsuite/testsuite/level2/trmv/trmv.h @@ -157,4 +157,4 @@ static void trmv( char storage, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level2/trmv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp index 759202433d..a3868e61dd 100644 --- a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp @@ -43,12 +43,11 @@ class ztrmvTest : gtint_t, dcomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ztrmvTest, RandomData) { +TEST_P(ztrmvTest, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(ztrmvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. 
// If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(ztrmvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trmv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class ztrmvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class ztrmvTestPrint { dcomplex alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "ztrmv_"; #elif TEST_CBLAS @@ -113,7 +109,6 @@ class ztrmvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::ztrmvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp index 45421b8f97..09e9c05a4d 100644 --- a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp @@ -43,12 +43,11 @@ class ctrsvTest : gtint_t, scomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ctrsvTest, RandomData) { +TEST_P(ctrsvTest, RandomData) +{ using T = scomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(ctrsvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 5*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(ctrsvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trsv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class ctrsvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class ctrsvTestPrint { scomplex alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "ctrsv_"; #elif TEST_CBLAS @@ -113,7 +109,6 @@ class ctrsvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::ctrsvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp index 2a4e1c6cac..ac74f828e9 100644 --- a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp @@ -43,12 +43,11 @@ class dtrsvTest : gtint_t, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dtrsvTest, RandomData) { +TEST_P(dtrsvTest, RandomData) +{ using T = double; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(dtrsvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 100*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(dtrsvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trsv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class dtrsvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class dtrsvTestPrint { double alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "dtrsv_"; #elif TEST_CBLAS @@ -112,7 +108,6 @@ class dtrsvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -137,8 +132,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::dtrsvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp index edd0197070..ed7d26d713 100644 --- a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp @@ -43,12 +43,11 @@ class strsvTest : gtint_t, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(strsvTest, RandomData) { +TEST_P(strsvTest, RandomData) +{ using T = float; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(strsvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. // If increment are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 20*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(strsvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trsv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class strsvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class strsvTestPrint { float alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "strsv_"; #elif TEST_CBLAS @@ -112,7 +108,6 @@ class strsvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -137,8 +132,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(7)) // increment to the leading dim of a ), ::strsvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trsv/test_trsv.h b/gtestsuite/testsuite/level2/trsv/test_trsv.h index 320459c862..096cd5ee0a 100644 --- a/gtestsuite/testsuite/level2/trsv/test_trsv.h +++ b/gtestsuite/testsuite/level2/trsv/test_trsv.h @@ -43,16 +43,16 @@ template void test_trsv( char storage, char uploa, char transa, char diaga, gtint_t n, - T alpha, gtint_t lda_inc, gtint_t incx, double thresh, char datatype ) { - + T alpha, gtint_t lda_inc, gtint_t incx, double thresh ) +{ // Compute the leading dimensions for matrix size calculation. - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, n, n, lda_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, n, n, lda_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(1, 5, storage, transa, n, n, lda, datatype); - std::vector x = testinghelpers::get_random_vector(1, 3, n, incx, datatype); + std::vector a = testinghelpers::get_random_matrix( 1, 5, storage, transa, n, n, lda ); + std::vector x = testinghelpers::get_random_vector( 1, 3, n, incx ); mktrim( storage, uploa, n, a.data(), lda ); @@ -72,4 +72,4 @@ void test_trsv( char storage, char uploa, char transa, char diaga, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( n, x.data(), x_ref.data(), incx, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trsv/trsv.h b/gtestsuite/testsuite/level2/trsv/trsv.h index 65ca33112a..522ae319fb 100644 --- a/gtestsuite/testsuite/level2/trsv/trsv.h +++ b/gtestsuite/testsuite/level2/trsv/trsv.h @@ -157,4 +157,4 @@ static void trsv( char storage, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level2/trsv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp index e3232f0229..97f1c3440d 100644 --- a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp @@ -43,12 +43,11 @@ class ztrsvTest : gtint_t, dcomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ztrsvTest, RandomData) { +TEST_P(ztrsvTest, RandomData) +{ using T = dcomplex; - //---------------------------------------------------------- // Initialize values from the parameters passed through // test suite instantiation (INSTANTIATE_TEST_SUITE_P). @@ -71,8 +70,6 @@ TEST_P(ztrsvTest, RandomData) { // If increment is zero, then the array size matches the matrix size. 
// If increment are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<7>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<8>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*testinghelpers::getEpsilon(); @@ -80,13 +77,13 @@ TEST_P(ztrsvTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsv(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype); + test_trsv( storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh ); } class ztrsvTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uploa = std::get<1>(str.param); char transa = std::get<2>(str.param); @@ -95,7 +92,6 @@ class ztrsvTestPrint { dcomplex alpha = std::get<5>(str.param); gtint_t incx = std::get<6>(str.param); gtint_t ld_inc = std::get<7>(str.param); - char datatype = std::get<8>(str.param); #ifdef TEST_BLAS std::string str_name = "ztrsv_"; #elif TEST_CBLAS @@ -113,7 +109,6 @@ class ztrsvTestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_" + std::to_string(ld_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,8 +133,7 @@ INSTANTIATE_TEST_SUITE_P( #endif ), // alpha ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of a ), ::ztrsvTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp index fa6b10006a..9efea8b5dc 100644 --- a/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp @@ -46,10 +46,10 @@ class CGemmTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(CGemmTest, RandomData) { +TEST_P(CGemmTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(CGemmTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*n*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(CGemmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemm(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemm( storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class 
CGemmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char tsa = std::get<1>(str.param); char tsb = std::get<2>(str.param); @@ -104,7 +102,6 @@ class CGemmTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "cgemm_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class CGemmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -150,8 +146,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{1.0,2.0}), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::CGemmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp index 5a7bcbd910..447a96c459 100644 --- a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp @@ -46,10 +46,10 @@ class DGemmTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(DGemmTest, RandomData) { +TEST_P(DGemmTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(DGemmTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); 
gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*n*k*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(DGemmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemm(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemm( storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class DGemmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char tsa = std::get<1>(str.param); char tsb = std::get<2>(str.param); @@ -104,7 +102,6 @@ class DGemmTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "dgemm_"; #elif TEST_CBLAS @@ -124,7 +121,6 @@ class DGemmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -148,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0, 1.0), // beta ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::DGemmTestPrint() - ); + ); \ No 
newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/gemm.h b/gtestsuite/testsuite/level3/gemm/gemm.h index 907f078848..13f8bf6198 100644 --- a/gtestsuite/testsuite/level3/gemm/gemm.h +++ b/gtestsuite/testsuite/level3/gemm/gemm.h @@ -164,4 +164,4 @@ static void gemm( char storage, char transa, char transb, gtint_t m, gtint_t n, #else throw std::runtime_error("Error in testsuite/level3/gemm.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp index f1f7bec8cf..f7683ea7eb 100644 --- a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp @@ -46,10 +46,10 @@ class SGemmTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(SGemmTest, RandomData) { +TEST_P(SGemmTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(SGemmTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*n*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(SGemmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemm(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemm( storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class SGemmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = 
std::get<0>(str.param); char tsa = std::get<1>(str.param); char tsb = std::get<2>(str.param); @@ -104,7 +102,6 @@ class SGemmTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "sgemm_"; #elif TEST_CBLAS @@ -124,7 +121,6 @@ class SGemmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -148,8 +144,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0, 1.0), // beta ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(7)) // increment to the leading dim of c ), ::SGemmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/test_gemm.h b/gtestsuite/testsuite/level3/gemm/test_gemm.h index 3396ba2ce6..df88bb50b0 100644 --- a/gtestsuite/testsuite/level3/gemm/test_gemm.h +++ b/gtestsuite/testsuite/level3/gemm/test_gemm.h @@ -40,23 +40,22 @@ #include #include - template void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, - gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, - T alpha, T beta, double thresh, char datatype ) { - + gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, T alpha, + T beta, double thresh ) +{ // Compute the leading dimensions of a, b, and c. 
- gtint_t lda = testinghelpers::get_leading_dimension(storage, trnsa, m, k, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, trnsb, k, n, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, trnsa, m, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, trnsb, k, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random numbers //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, trnsa, m, k, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, trnsb, k, n, ldb, datatype); - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, 'n', m, n, ldc, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, trnsa, m, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, trnsb, k, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', m, n, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -77,4 +76,4 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp index 0f4bb4783d..dd61ce69cf 100644 --- a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp @@ -46,10 +46,10 @@ class ZGemmTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ZGemmTest, RandomData) { +TEST_P(ZGemmTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(ZGemmTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*n*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(ZGemmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemm(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemm( storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class ZGemmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char tsa = std::get<1>(str.param); char tsb = std::get<2>(str.param); @@ -104,7 +102,6 @@ class ZGemmTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS 
std::string str_name = "zgemm_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class ZGemmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -150,8 +146,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{1.0,2.0}), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::ZGemmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp index f15fc50619..39bc5a5472 100644 --- a/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp @@ -46,12 +46,12 @@ class cgemmtTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cgemmtTest); -TEST_P(cgemmtTest, RandomData) { +TEST_P(cgemmtTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -79,8 +79,6 @@ TEST_P(cgemmtTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*k*testinghelpers::getEpsilon(); @@ -88,13 +86,13 @@ TEST_P(cgemmtTest, RandomData) { //---------------------------------------------------------- // 
Call test body using these parameters //---------------------------------------------------------- - test_gemmt(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemmt( storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class cgemmtTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -106,7 +104,6 @@ class cgemmtTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "cgemmt_"; #elif TEST_CBLAS @@ -128,10 +125,10 @@ class cgemmtTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; + // Disable tests for BLIS_TYPED case due to compiler errors. #ifndef TEST_BLIS_TYPED // Black box testing. 
@@ -153,9 +150,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{1.0,2.0}), // beta ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::cgemmtTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp index b27b6c66b9..71d23f2e2b 100644 --- a/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp @@ -46,12 +46,12 @@ class dgemmtTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dgemmtTest); -TEST_P(dgemmtTest, RandomData) { +TEST_P(dgemmtTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -79,8 +79,6 @@ TEST_P(dgemmtTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*k*testinghelpers::getEpsilon(); @@ -88,13 +86,13 @@ TEST_P(dgemmtTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemmt(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemmt( storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); 
} class dgemmtTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char tsa = std::get<1>(str.param); char tsb = std::get<2>(str.param); @@ -106,7 +104,6 @@ class dgemmtTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "dgemmt_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class dgemmtTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -151,9 +147,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(3.0), // beta ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::dgemmtTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemmt/gemmt.h b/gtestsuite/testsuite/level3/gemmt/gemmt.h index 217cd5bcd0..062657bd81 100644 --- a/gtestsuite/testsuite/level3/gemmt/gemmt.h +++ b/gtestsuite/testsuite/level3/gemmt/gemmt.h @@ -154,6 +154,7 @@ static void typed_gemmt(char storage, char uplo, char trnsa, char trnsb, throw std::runtime_error("Error in testsuite/level3/gemmt.h: Invalid typename in typed_gemmt()."); } #endif + template static void gemmt( char storage, char uplo, char transa, char transb, gtint_t n, gtint_t k, T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc ) @@ -172,4 +173,4 @@ static void gemmt( char 
storage, char uplo, char transa, char transb, gtint_t n, #else throw std::runtime_error("Error in testsuite/level3/gemmt.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp index c9686e84bb..4ac56998e3 100644 --- a/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp @@ -46,12 +46,12 @@ class sgemmtTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sgemmtTest); -TEST_P(sgemmtTest, RandomData) { +TEST_P(sgemmtTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -79,8 +79,6 @@ TEST_P(sgemmtTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*n*k*testinghelpers::getEpsilon(); @@ -88,13 +86,13 @@ TEST_P(sgemmtTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemmt(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemmt( storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class sgemmtTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char tsa = std::get<1>(str.param); char tsb = std::get<2>(str.param); @@ -106,7 +104,6 @@ class sgemmtTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = 
std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "sgemmt_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class sgemmtTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -152,9 +148,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(3.0), // beta ::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::sgemmtTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h index 9087c9fa81..5b88894647 100644 --- a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h +++ b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h @@ -42,20 +42,20 @@ template void test_gemmt( char storage, char uplo, char trnsa, char trnsb, gtint_t n, - gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, - T alpha, T beta, double thresh, char datatype ) { - + gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, T alpha, + T beta, double thresh ) +{ // Compute the leading dimensions of a, b, and c. 
- gtint_t lda = testinghelpers::get_leading_dimension(storage, trnsa, n, k, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, trnsb, k, n, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', n, n, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, trnsa, n, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, trnsb, k, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', n, n, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random numbers //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, trnsa, n, k, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, trnsb, k, n, ldb, datatype); - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, 'n', n, n, ldc, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, trnsa, n, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, trnsb, k, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', n, n, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -76,4 +76,4 @@ void test_gemmt( char storage, char uplo, char trnsa, char trnsb, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, n, n, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp index d5ddd84276..898e50de91 100644 --- a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp @@ -46,12 +46,12 @@ class zgemmtTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zgemmtTest); -TEST_P(zgemmtTest, RandomData) { +TEST_P(zgemmtTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -79,8 +79,6 @@ TEST_P(zgemmtTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = std::max(n,k)*testinghelpers::getEpsilon(); @@ -88,13 +86,13 @@ TEST_P(zgemmtTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_gemmt(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_gemmt( storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class zgemmtTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -106,7 +104,6 @@ class zgemmtTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = 
std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "zgemmt_"; #elif TEST_CBLAS @@ -128,7 +125,6 @@ class zgemmtTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -154,9 +150,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{1.0,2.0}), // beta ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(9)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of c ), ::zgemmtTestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp index 4a1221c4b4..181956d507 100644 --- a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp +++ b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp @@ -47,10 +47,10 @@ class chemmTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(chemmTest, RandomData) { +TEST_P(chemmTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -81,8 +81,6 @@ TEST_P(chemmTest, RandomData) { gtint_t lda_inc = std::get<9>(GetParam()); gtint_t ldb_inc = std::get<10>(GetParam()); gtint_t ldc_inc = std::get<11>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<12>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*n*testinghelpers::getEpsilon(); @@ -90,13 +88,13 @@ TEST_P(chemmTest, RandomData) { 
//---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_hemm(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_hemm( storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class chemmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uplo = std::get<2>(str.param); @@ -109,7 +107,6 @@ class chemmTestPrint { gtint_t lda_inc = std::get<9>(str.param); gtint_t ldb_inc = std::get<10>(str.param); gtint_t ldc_inc = std::get<11>(str.param); - char datatype = std::get<12>(str.param); #ifdef TEST_BLAS std::string str_name = "chemm_"; #elif TEST_CBLAS @@ -130,7 +127,6 @@ class chemmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -155,8 +151,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{-3.0, 2.0}), // beta ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::chemmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/hemm/hemm.h b/gtestsuite/testsuite/level3/hemm/hemm.h index 1cc0ca1473..2fae4c3c36 100644 --- a/gtestsuite/testsuite/level3/hemm/hemm.h +++ b/gtestsuite/testsuite/level3/hemm/hemm.h @@ -164,4 +164,4 @@ static void hemm( char 
storage, char side, char uplo, char conja, char transb, g #else throw std::runtime_error("Error in testsuite/level3/hemm.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/hemm/test_hemm.h b/gtestsuite/testsuite/level3/hemm/test_hemm.h index bae4756f6b..b0b2d94847 100644 --- a/gtestsuite/testsuite/level3/hemm/test_hemm.h +++ b/gtestsuite/testsuite/level3/hemm/test_hemm.h @@ -42,17 +42,15 @@ template void test_hemm( char storage, char side, char uplo, char conja, char transb, - gtint_t m, gtint_t n, - gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, - T alpha, T beta, - double thresh, char datatype -) { + gtint_t m, gtint_t n, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, + T alpha, T beta, double thresh ) +{ // Set the dimension for row/col of A, depending on the value of side. gtint_t k = ((side == 'l')||(side == 'L'))? m : n; // Compute the leading dimensions of a, b, and c. - gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', k, k, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', k, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. @@ -60,9 +58,9 @@ void test_hemm( char storage, char side, char uplo, char conja, char transb, // Since matrix A, stored in a, is symmetric and we only use the upper or lower // part in the computation of hemm and zero-out the rest to ensure // that code operates as expected. 
- std::vector a = testinghelpers::get_random_matrix(-5, 2, storage, uplo, k, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, transb, m, n, ldb, datatype); - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, 'n', m, n, ldc, datatype); + std::vector a = testinghelpers::get_random_matrix( -5, 2, storage, uplo, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, transb, m, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', m, n, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -82,4 +80,4 @@ void test_hemm( char storage, char side, char uplo, char conja, char transb, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp index 4ebc75ef2c..fcc2b0c73e 100644 --- a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp +++ b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp @@ -47,10 +47,10 @@ class zhemmTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zhemmTest, RandomData) { +TEST_P(zhemmTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -81,8 +81,6 @@ TEST_P(zhemmTest, RandomData) { gtint_t lda_inc = std::get<9>(GetParam()); gtint_t ldb_inc = std::get<10>(GetParam()); gtint_t ldc_inc = std::get<11>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<12>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*n*testinghelpers::getEpsilon(); @@ -90,13 +88,13 @@ TEST_P(zhemmTest, RandomData) { //---------------------------------------------------------- // Call test body using 
these parameters //---------------------------------------------------------- - test_hemm(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_hemm( storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class zhemmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uplo = std::get<2>(str.param); @@ -109,7 +107,6 @@ class zhemmTestPrint { gtint_t lda_inc = std::get<9>(str.param); gtint_t ldb_inc = std::get<10>(str.param); gtint_t ldc_inc = std::get<11>(str.param); - char datatype = std::get<12>(str.param); #ifdef TEST_BLAS std::string str_name = "zhemm_"; #elif TEST_CBLAS @@ -130,7 +127,6 @@ class zhemmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -155,8 +151,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{4.0, -1.0}), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(6)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(6)) // increment to the leading dim of c ), ::zhemmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp index b33db3a187..1e6d848ac8 100644 --- a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp +++ b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp @@ -46,10 +46,10 @@ class cher2kTest : float, gtint_t, gtint_t, - gtint_t, - char>> 
{}; + gtint_t>> {}; -TEST_P(cher2kTest, RandomData) { +TEST_P(cher2kTest, RandomData) +{ using T = scomplex; using RT = typename testinghelpers::type_info::real_type; //---------------------------------------------------------- @@ -78,8 +78,6 @@ TEST_P(cher2kTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 2*m*k*testinghelpers::getEpsilon(); @@ -87,13 +85,13 @@ TEST_P(cher2kTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_her2k(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_her2k( storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class cher2kTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -105,7 +103,6 @@ class cher2kTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "cher2k_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class cher2kTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -150,8 +146,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-3.0, 2.0), // beta ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a ::testing::Values(gtint_t(0), 
gtint_t(2)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::cher2kTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/her2k/her2k.h b/gtestsuite/testsuite/level3/her2k/her2k.h index 76ea95f3b4..90d548aa0c 100644 --- a/gtestsuite/testsuite/level3/her2k/her2k.h +++ b/gtestsuite/testsuite/level3/her2k/her2k.h @@ -155,4 +155,4 @@ static void her2k( char storage, char uplo, char transa, char transb, gtint_t m, #else throw std::runtime_error("Error in testsuite/level3/her2k.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/her2k/test_her2k.h b/gtestsuite/testsuite/level3/her2k/test_her2k.h index 60c1f1c2f0..e05845b451 100644 --- a/gtestsuite/testsuite/level3/her2k/test_her2k.h +++ b/gtestsuite/testsuite/level3/her2k/test_her2k.h @@ -42,25 +42,23 @@ template::real_type> void test_her2k( char storage, char uplo, char transa, char transb, - gtint_t m, gtint_t k, - gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, - T alpha, RT beta, - double thresh, char datatype -) { + gtint_t m, gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, + T alpha, RT beta, double thresh ) +{ // Compute the leading dimensions of a, b, and c. 
- gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, k, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, m, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, transb, m, k, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, m, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random numbers //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, transa, m, k, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, transb, m, k, ldb, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, m, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, transb, m, k, ldb ); // Since matrix C, stored in c, is symmetric and we only use the upper or lower // part in the computation of her2k and zero-out the rest to ensure // that code operates as expected. - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, uplo, m, ldc, datatype); + std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, uplo, m, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -81,4 +79,4 @@ void test_her2k( char storage, char uplo, char transa, char transb, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp index 95301a291b..316fc730b7 100644 --- a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp +++ b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp @@ -46,10 +46,10 @@ class zher2kTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zher2kTest, RandomData) { +TEST_P(zher2kTest, RandomData) +{ using T = dcomplex; using RT = typename testinghelpers::type_info::real_type; //---------------------------------------------------------- @@ -78,8 +78,6 @@ TEST_P(zher2kTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 2*m*k*testinghelpers::getEpsilon(); @@ -87,13 +85,13 @@ TEST_P(zher2kTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_her2k(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_her2k( storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class zher2kTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -105,7 +103,6 @@ class zher2kTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); 
#ifdef TEST_BLAS std::string str_name = "zher2k_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class zher2kTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -150,8 +146,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(4.0, -1.0), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::zher2kTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp index 13252de9cd..4db35fbeeb 100644 --- a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp +++ b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp @@ -44,10 +44,10 @@ class cherkTest : float, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(cherkTest, RandomData) { +TEST_P(cherkTest, RandomData) +{ using T = scomplex; using RT = typename testinghelpers::type_info::real_type; //---------------------------------------------------------- @@ -73,8 +73,6 @@ TEST_P(cherkTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); gtint_t ldc_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -82,13 +80,13 @@ TEST_P(cherkTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_herk(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype); + test_herk( storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh ); } class cherkTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -98,7 +96,6 @@ class cherkTestPrint { float beta = std::get<6>(str.param); gtint_t lda_inc = std::get<7>(str.param); gtint_t ldc_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "cherk_"; #elif TEST_CBLAS @@ -117,7 +114,6 @@ class cherkTestPrint { str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -139,8 +135,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-2.0, 3.0), // alpha ::testing::Values(4.0, -1.0), // beta ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::cherkTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/herk/herk.h 
b/gtestsuite/testsuite/level3/herk/herk.h index 6aab4355dc..fd6990ff07 100644 --- a/gtestsuite/testsuite/level3/herk/herk.h +++ b/gtestsuite/testsuite/level3/herk/herk.h @@ -144,4 +144,4 @@ static void herk( char storage, char uplo, char transa, gtint_t m, gtint_t k, #else throw std::runtime_error("Error in testsuite/level3/herk.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/herk/test_herk.h b/gtestsuite/testsuite/level3/herk/test_herk.h index 355b514ec4..42704dff7c 100644 --- a/gtestsuite/testsuite/level3/herk/test_herk.h +++ b/gtestsuite/testsuite/level3/herk/test_herk.h @@ -41,25 +41,22 @@ #include template::real_type> -void test_herk( char storage, char uplo, char transa, - gtint_t m, gtint_t k, - gtint_t lda_inc, gtint_t ldc_inc, - RT alpha, RT beta, - double thresh, char datatype -) { +void test_herk( char storage, char uplo, char transa, gtint_t m, gtint_t k, + gtint_t lda_inc, gtint_t ldc_inc, RT alpha, RT beta, double thresh ) +{ // Compute the leading dimensions of a, b, and c. - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, m, k, lda_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, m, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-5, 2, storage, transa, m, k, lda, datatype); + std::vector a = testinghelpers::get_random_matrix( -5, 2, storage, transa, m, k, lda ); // Since matrix C, stored in c, is symmetric, we only use the upper or lower // part in the computation of herk and zero-out the rest to ensure // that code operates as expected. 
- std::vector c = testinghelpers::get_random_matrix(-8, 12, storage, uplo, m, ldc, datatype); + std::vector c = testinghelpers::get_random_matrix( -8, 12, storage, uplo, m, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -79,4 +76,4 @@ void test_herk( char storage, char uplo, char transa, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp index 3bbe6cf334..620841669c 100644 --- a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp +++ b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp @@ -44,10 +44,10 @@ class zherkTest : double, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zherkTest, RandomData) { +TEST_P(zherkTest, RandomData) +{ using T = dcomplex; using RT = typename testinghelpers::type_info::real_type; //---------------------------------------------------------- @@ -73,8 +73,6 @@ TEST_P(zherkTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); gtint_t ldc_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -82,13 +80,13 @@ TEST_P(zherkTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_herk(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype); + test_herk( storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh ); } class zherkTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -98,7 +96,6 @@ class zherkTestPrint { double beta = std::get<6>(str.param); gtint_t lda_inc = std::get<7>(str.param); gtint_t ldc_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "zherk_"; #elif TEST_CBLAS @@ -117,7 +114,6 @@ class zherkTestPrint { str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -131,16 +127,15 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLAS ,'r' #endif - ), - ::testing::Values('u','l'), // storage format - ::testing::Values('n','c'), // u:upper, l:lower - ::testing::Range(gtint_t(10), gtint_t(31), 10), // transa + ), // storage format + ::testing::Values('u','l'), // u:upper, l:lower + ::testing::Values('n','c'), // transa ::testing::Range(gtint_t(10), gtint_t(31), 10), // m - ::testing::Values(2.0, -1.0), // n - ::testing::Values(-3.0, 2.0), // alpha - ::testing::Values(gtint_t(0), gtint_t(4)), // beta - ::testing::Values(gtint_t(0), gtint_t(2)), 
// increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // increment to the leading dim of b - ), // i : integer, f : float datatype type tested + ::testing::Range(gtint_t(10), gtint_t(31), 10), // n + ::testing::Values(2.0, -1.0), // alpha + ::testing::Values(-3.0, 2.0), // beta + ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c + ), ::zherkTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp index 96c53c63df..8a16fd0583 100644 --- a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp @@ -47,10 +47,10 @@ class csymmTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(csymmTest, RandomData) { +TEST_P(csymmTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -81,8 +81,6 @@ TEST_P(csymmTest, RandomData) { gtint_t lda_inc = std::get<9>(GetParam()); gtint_t ldb_inc = std::get<10>(GetParam()); gtint_t ldc_inc = std::get<11>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<12>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -90,13 +88,13 @@ TEST_P(csymmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_symm(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_symm( storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class csymmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + 
testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uplo = std::get<2>(str.param); @@ -109,7 +107,6 @@ class csymmTestPrint { gtint_t lda_inc = std::get<9>(str.param); gtint_t ldb_inc = std::get<10>(str.param); gtint_t ldc_inc = std::get<11>(str.param); - char datatype = std::get<12>(str.param); #ifdef TEST_BLAS std::string str_name = "csymm_"; #elif TEST_CBLAS @@ -131,7 +128,6 @@ class csymmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -156,8 +152,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of c ), ::csymmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp index 9217152a22..4a4c9710a3 100644 --- a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp @@ -47,10 +47,10 @@ class dsymmTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dsymmTest, RandomData) { +TEST_P(dsymmTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -81,8 +81,6 @@ TEST_P(dsymmTest, RandomData) { gtint_t lda_inc = std::get<9>(GetParam()); gtint_t ldb_inc = std::get<10>(GetParam()); gtint_t ldc_inc = 
std::get<11>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<12>(GetParam()); // Set the threshold for the errors: double thresh = 30*m*n*testinghelpers::getEpsilon(); @@ -90,13 +88,13 @@ TEST_P(dsymmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_symm(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_symm( storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class dsymmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uplo = std::get<2>(str.param); @@ -109,7 +107,6 @@ class dsymmTestPrint { gtint_t lda_inc = std::get<9>(str.param); gtint_t ldb_inc = std::get<10>(str.param); gtint_t ldc_inc = std::get<11>(str.param); - char datatype = std::get<12>(str.param); #ifdef TEST_BLAS std::string str_name = "dsymm_"; #elif TEST_CBLAS @@ -129,7 +126,6 @@ class dsymmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -154,8 +150,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0, 1.0), // beta ::testing::Values(gtint_t(0), gtint_t(6)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::dsymmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp index 1fca984ee7..9670c88391 100644 --- a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp @@ -47,10 +47,10 @@ class ssymmTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ssymmTest, RandomData) { +TEST_P(ssymmTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -81,8 +81,6 @@ TEST_P(ssymmTest, RandomData) { gtint_t lda_inc = std::get<9>(GetParam()); gtint_t ldb_inc = std::get<10>(GetParam()); gtint_t ldc_inc = std::get<11>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<12>(GetParam()); // Set the threshold for the errors: double thresh = 8*m*n*testinghelpers::getEpsilon(); @@ -90,13 +88,13 @@ TEST_P(ssymmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_symm(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_symm( storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class ssymmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uplo = std::get<2>(str.param); @@ -109,7 +107,6 @@ class ssymmTestPrint { gtint_t lda_inc = std::get<9>(str.param); gtint_t ldb_inc = std::get<10>(str.param); gtint_t ldc_inc = std::get<11>(str.param); - char datatype = std::get<12>(str.param); #ifdef TEST_BLAS std::string str_name = "ssymm_"; #elif TEST_CBLAS @@ -129,7 +126,6 @@ class ssymmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + 
"_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -154,8 +150,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0, 1.0), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(9)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of c ), ::ssymmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/symm/symm.h b/gtestsuite/testsuite/level3/symm/symm.h index cc97c9304f..6f6037472b 100644 --- a/gtestsuite/testsuite/level3/symm/symm.h +++ b/gtestsuite/testsuite/level3/symm/symm.h @@ -172,4 +172,4 @@ static void symm( char storage, char side, char uplo, char conja, char transb, g #else throw std::runtime_error("Error in testsuite/level3/symm.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/symm/test_symm.h b/gtestsuite/testsuite/level3/symm/test_symm.h index 4274067b72..b3ebc37953 100644 --- a/gtestsuite/testsuite/level3/symm/test_symm.h +++ b/gtestsuite/testsuite/level3/symm/test_symm.h @@ -42,18 +42,15 @@ template void test_symm( char storage, char side, char uplo, char conja, char transb, - gtint_t m, gtint_t n, - gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, - T alpha, T beta, - double thresh, char datatype -) { - + gtint_t m, gtint_t n, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, + T alpha, T beta, double thresh ) +{ // Set the dimension for row/col of A, depending on the value of side. gtint_t k = ((side == 'l')||(side == 'L'))? m : n; // Compute the leading dimensions of a, b, and c. 
- gtint_t lda = testinghelpers::get_leading_dimension(storage, conja, k, k, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, n, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, conja, k, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, transb, m, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. @@ -61,9 +58,9 @@ void test_symm( char storage, char side, char uplo, char conja, char transb, // Since matrix A, stored in a, is symmetric and we only use the upper or lower // part in the computation of hemm and zero-out the rest to ensure // that code operates as expected. - std::vector a = testinghelpers::get_random_matrix(-5, 2, storage, uplo, k, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, transb, m, n, ldb, datatype); - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, 'n', m, n, ldc, datatype); + std::vector a = testinghelpers::get_random_matrix( -5, 2, storage, uplo, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, transb, m, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', m, n, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -84,4 +81,4 @@ void test_symm( char storage, char side, char uplo, char conja, char transb, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp index 9585a8915b..53683dffb7 100644 --- a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp @@ -47,10 +47,10 @@ class zsymmTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zsymmTest, RandomData) { +TEST_P(zsymmTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -81,8 +81,6 @@ TEST_P(zsymmTest, RandomData) { gtint_t lda_inc = std::get<9>(GetParam()); gtint_t ldb_inc = std::get<10>(GetParam()); gtint_t ldc_inc = std::get<11>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<12>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -90,13 +88,13 @@ TEST_P(zsymmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_symm(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_symm( storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class zsymmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uplo = std::get<2>(str.param); @@ -109,7 +107,6 @@ class zsymmTestPrint { gtint_t lda_inc = std::get<9>(str.param); gtint_t ldb_inc = std::get<10>(str.param); gtint_t ldc_inc = std::get<11>(str.param); - char datatype = std::get<12>(str.param); #ifdef 
TEST_BLAS std::string str_name = "zsymm_"; #elif TEST_CBLAS @@ -131,7 +128,6 @@ class zsymmTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -156,8 +152,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{-3.0, 2.0}, dcomplex{4.0, -1.0}), // beta ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::zsymmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp index 6b359496a3..6e7cb8db09 100644 --- a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp @@ -46,10 +46,10 @@ class csyr2kTest : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(csyr2kTest, RandomData) { +TEST_P(csyr2kTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(csyr2kTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(csyr2kTest, RandomData) { //---------------------------------------------------------- // Call test body using these 
parameters //---------------------------------------------------------- - test_syr2k(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syr2k( storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class csyr2kTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -104,7 +102,6 @@ class csyr2kTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "csyr2k_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class csyr2kTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,18 +136,17 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLAS ,'r' #endif - ), - ::testing::Values('u','l'), // storage format - ::testing::Values('n'), // u:upper, l:lower + ), // storage format + ::testing::Values('u','l'), // u:upper, l:lower ::testing::Values('n'), // transa - ::testing::Range(gtint_t(10), gtint_t(31), 10), // transb + ::testing::Values('n'), // transb ::testing::Range(gtint_t(10), gtint_t(31), 10), // m - ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // n - ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}), // alpha - ::testing::Values(gtint_t(0), gtint_t(2)), // beta - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // increment to the leading dim of c - ), // i : integer, f : dcomplex datatype type tested + 
::testing::Range(gtint_t(10), gtint_t(31), 10), // n + ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // alpha + ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}), // beta + ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a + ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c + ), ::csyr2kTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp index 39110773f3..a38fa2c512 100644 --- a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp @@ -46,10 +46,10 @@ class dsyr2kTest : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dsyr2kTest, RandomData) { +TEST_P(dsyr2kTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(dsyr2kTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(dsyr2kTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr2k(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syr2k( storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class dsyr2kTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> 
str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -104,7 +102,6 @@ class dsyr2kTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "dsyr2k_"; #elif TEST_CBLAS @@ -124,7 +121,6 @@ class dsyr2kTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,18 +134,17 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLAS ,'r' #endif - ), - ::testing::Values('u','l'), // storage format - ::testing::Values('n'), // u:upper, l:lower + ), // storage format + ::testing::Values('u','l'), // u:upper, l:lower ::testing::Values('n'), // transa - ::testing::Range(gtint_t(10), gtint_t(31), 10), // transb + ::testing::Values('n'), // transb ::testing::Range(gtint_t(10), gtint_t(31), 10), // m - ::testing::Values( 1.0, -2.0), // n - ::testing::Values(-1.0, 1.0), // alpha - ::testing::Values(gtint_t(0), gtint_t(4)), // beta - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // increment to the leading dim of c - ), // i : integer, f : dcomplex datatype type tested + ::testing::Range(gtint_t(10), gtint_t(31), 10), // n + ::testing::Values( 1.0, -2.0), // alpha + ::testing::Values(-1.0, 1.0), // beta + ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a + ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b + ::testing::Values(gtint_t(0), gtint_t(7)) // increment to the leading dim of c + ), ::dsyr2kTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp index ad6f883606..bcec08f487 100644 --- a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp @@ -46,10 +46,10 @@ class ssyr2kTest : float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ssyr2kTest, RandomData) { +TEST_P(ssyr2kTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(ssyr2kTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = 10*m*k*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(ssyr2kTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr2k(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syr2k( storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class ssyr2kTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -104,7 +102,6 @@ class ssyr2kTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef TEST_BLAS std::string str_name = "ssyr2k_"; #elif TEST_CBLAS @@ -124,7 +121,6 @@ class ssyr2kTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = 
str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,18 +134,17 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLAS ,'r' #endif - ), - ::testing::Values('u','l'), // storage format - ::testing::Values('n'), // u:upper, l:lower - ::testing::Values('n'), // transa - ::testing::Range(gtint_t(10), gtint_t(31), 10), // transb - ::testing::Range(gtint_t(10), gtint_t(31), 10), // m - ::testing::Values( 1.0, -2.0), // n - ::testing::Values(-1.0, 1.0), // alpha - ::testing::Values(gtint_t(0), gtint_t(7)), // beta - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // increment to the leading dim of c - ), // i : integer, f : dcomplex datatype type tested + ), // storage format + ::testing::Values('u','l'), // u:upper, l:lower + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m + ::testing::Range(gtint_t(10), gtint_t(31), 10), // n + ::testing::Values( 1.0, -2.0), // alpha + ::testing::Values(-1.0, 1.0), // beta + ::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a + ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b + ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of c + ), ::ssyr2kTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h index 9a7fb82b6f..aebd1e2cc4 100644 --- a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h +++ b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h @@ -41,26 +41,24 @@ #include template -void test_syr2k( char storage, char uplo, char transa, char transb, - gtint_t m, gtint_t k, - gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, - T alpha, T 
beta, - double thresh, char datatype -) { +void test_syr2k( char storage, char uplo, char transa, char transb, gtint_t m, + gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, T alpha, + T beta, double thresh ) +{ // Compute the leading dimensions of a, b, and c. - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, k, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, m, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, transb, m, k, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, m, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, transa, m, k, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, transb, m, k, ldb, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, m, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, transb, m, k, ldb ); // Since matrix C, stored in c, is symmetric and we only use the upper or lower // part in the computation of her2k and zero-out the rest to ensure // that code operates as expected. - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, uplo, m, ldc, datatype); + std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, uplo, m, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -81,4 +79,4 @@ void test_syr2k( char storage, char uplo, char transa, char transb, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp index 9b0d018768..0caf572134 100644 --- a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp @@ -46,10 +46,10 @@ class zsyr2kTest : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zsyr2kTest, RandomData) { +TEST_P(zsyr2kTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -77,8 +77,6 @@ TEST_P(zsyr2kTest, RandomData) { gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); gtint_t ldc_inc = std::get<10>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<11>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -86,13 +84,13 @@ TEST_P(zsyr2kTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syr2k(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syr2k( storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); } class zsyr2kTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -104,7 +102,6 @@ class zsyr2kTestPrint { gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); gtint_t ldc_inc = std::get<10>(str.param); - char datatype = std::get<11>(str.param); #ifdef 
TEST_BLAS std::string str_name = "zsyr2k_"; #elif TEST_CBLAS @@ -126,7 +123,6 @@ class zsyr2kTestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -150,8 +146,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{-3.0, 2.0}, dcomplex{4.0, -1.0}), // beta ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of b - ::testing::Values(gtint_t(0), gtint_t(6)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(6)) // increment to the leading dim of c ), ::zsyr2kTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp index 092235019e..63a60703e5 100644 --- a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp @@ -44,10 +44,10 @@ class csyrkTest : scomplex, scomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(csyrkTest, RandomData) { +TEST_P(csyrkTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -72,8 +72,6 @@ TEST_P(csyrkTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); gtint_t ldc_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -81,13 +79,13 @@ TEST_P(csyrkTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syrk(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syrk( storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh ); } class csyrkTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -97,7 +95,6 @@ class csyrkTestPrint { scomplex beta = std::get<6>(str.param); gtint_t lda_inc = std::get<7>(str.param); gtint_t ldc_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "csyrk_"; #elif TEST_CBLAS @@ -118,7 +115,6 @@ class csyrkTestPrint { str_name = str_name + "_a" + beta_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}), // alpha ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}), // beta ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::csyrkTestPrint() - ); + ); \ No newline at end of file diff 
--git a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp index af5d263e5c..1e01e15f13 100644 --- a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp @@ -44,10 +44,10 @@ class dsyrkTest : double, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dsyrkTest, RandomData) { +TEST_P(dsyrkTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -72,8 +72,6 @@ TEST_P(dsyrkTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<7>(GetParam()); gtint_t ldc_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -81,13 +79,13 @@ TEST_P(dsyrkTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syrk(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syrk( storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh ); } class dsyrkTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -97,7 +95,6 @@ class dsyrkTestPrint { double beta = std::get<6>(str.param); gtint_t lda_inc = std::get<7>(str.param); gtint_t ldc_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "dsyrk_"; #elif TEST_CBLAS @@ -116,7 +113,6 @@ class dsyrkTestPrint { str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(lda_inc); 
str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -138,8 +134,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( 1.0, -2.0), // alpha ::testing::Values(-1.0, 1.0), // beta ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(9)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of c ), ::dsyrkTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp index a413c6f15c..d959f444ab 100644 --- a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp @@ -44,10 +44,10 @@ class ssyrkTest : float, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ssyrkTest, RandomData) { +TEST_P(ssyrkTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -72,8 +72,6 @@ TEST_P(ssyrkTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<7>(GetParam()); gtint_t ldc_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -81,13 +79,13 @@ TEST_P(ssyrkTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syrk(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syrk( storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh ); } class ssyrkTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -97,7 +95,6 @@ class ssyrkTestPrint { float beta = std::get<6>(str.param); gtint_t lda_inc = std::get<7>(str.param); gtint_t ldc_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS std::string str_name = "ssyrk_"; #elif TEST_CBLAS @@ -116,7 +113,6 @@ class ssyrkTestPrint { str_name = str_name + "_b" + beta_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -130,16 +126,15 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLAS ,'r' #endif - ), - ::testing::Values('u','l'), // storage format - ::testing::Values('n','t','c'), // u:upper, l:lower - ::testing::Range(gtint_t(10), gtint_t(31), 10), // transa + ), // storage format + ::testing::Values('u','l'), // u:upper, l:lower + ::testing::Values('n','t','c'), // transa ::testing::Range(gtint_t(10), gtint_t(31), 10), // m - ::testing::Values( 1.0, -2.0), // k - ::testing::Values(-1.0, 1.0), // alpha - ::testing::Values(gtint_t(0), gtint_t(3)), // beta - ::testing::Values(gtint_t(0), 
gtint_t(1)), // increment to the leading dim of a - ::testing::Values(ELEMENT_TYPE) // increment to the leading dim of c - ), // i : integer, f : dcomplex datatype type tested + ::testing::Range(gtint_t(10), gtint_t(31), 10), // k + ::testing::Values( 1.0, -2.0), // alpha + ::testing::Values(-1.0, 1.0), // beta + ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c + ), ::ssyrkTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syrk/syrk.h b/gtestsuite/testsuite/level3/syrk/syrk.h index ecbea4725e..27628ac7e3 100644 --- a/gtestsuite/testsuite/level3/syrk/syrk.h +++ b/gtestsuite/testsuite/level3/syrk/syrk.h @@ -153,4 +153,4 @@ static void syrk( char storage, char uplo, char transa, gtint_t m, gtint_t k, #else throw std::runtime_error("Error in testsuite/level3/syrk.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syrk/test_syrk.h b/gtestsuite/testsuite/level3/syrk/test_syrk.h index 9c8585e64a..fc75b61df7 100644 --- a/gtestsuite/testsuite/level3/syrk/test_syrk.h +++ b/gtestsuite/testsuite/level3/syrk/test_syrk.h @@ -41,24 +41,21 @@ #include template -void test_syrk( char storage, char uplo, char transa, - gtint_t m, gtint_t k, - gtint_t lda_inc, gtint_t ldc_inc, - T alpha, T beta, - double thresh, char datatype -) { +void test_syrk( char storage, char uplo, char transa, gtint_t m, gtint_t k, + gtint_t lda_inc, gtint_t ldc_inc, T alpha, T beta, double thresh ) +{ // Compute the leading dimensions of a, b, and c. 
- gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, m, k, lda_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, m, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random integer numbers. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, m, k, lda, datatype ); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, m, k, lda ); // Since matrix C, stored in c, is symmetric, we only use the upper or lower // part in the computation of syrk and zero-out the rest to ensure // that code operates as expected. - std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, uplo, m, ldc, datatype ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, uplo, m, ldc ); // Create a copy of c so that we can check reference results. std::vector c_ref(c); @@ -76,4 +73,4 @@ void test_syrk( char storage, char uplo, char transa, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp index 7bb7d9cedf..712f0d52eb 100644 --- a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp @@ -44,10 +44,10 @@ class zsyrkTest : dcomplex, dcomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(zsyrkTest, RandomData) { +TEST_P(zsyrkTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -72,8 +72,6 @@ TEST_P(zsyrkTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<7>(GetParam()); gtint_t ldc_inc = std::get<8>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<9>(GetParam()); // Set the threshold for the errors: double thresh = m*k*testinghelpers::getEpsilon(); @@ -81,13 +79,13 @@ TEST_P(zsyrkTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_syrk(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype); + test_syrk( storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh ); } class zsyrkTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char uplo = std::get<1>(str.param); char tsa = std::get<2>(str.param); @@ -97,7 +95,6 @@ class zsyrkTestPrint { dcomplex beta = std::get<6>(str.param); gtint_t lda_inc = std::get<7>(str.param); gtint_t ldc_inc = std::get<8>(str.param); - char datatype = std::get<9>(str.param); #ifdef TEST_BLAS 
std::string str_name = "zsyrk_"; #elif TEST_CBLAS @@ -118,7 +115,6 @@ class zsyrkTestPrint { str_name = str_name + "_a" + beta_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -140,8 +136,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}), // alpha ::testing::Values(dcomplex{-3.0, 2.0}, dcomplex{4.0, -1.0}), // beta ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : dcomplex datatype type tested + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::zsyrkTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp index a875f77282..603a1287c7 100644 --- a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp @@ -45,10 +45,10 @@ class ctrmmTest : gtint_t, scomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ctrmmTest, RandomData) { +TEST_P(ctrmmTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(ctrmmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(ctrmmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class ctrmmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class ctrmmTestPrint { scomplex alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "ctrmm_"; #elif TEST_CBLAS @@ -120,7 +117,6 @@ class ctrmmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -143,8 +139,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(scomplex{2.0,-1.0}), // alpha ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::ctrmmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp index 94fb07ba3c..6b65bdb6dc 100644 --- a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp @@ -45,10 +45,10 @@ class dtrmmTest : gtint_t, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dtrmmTest, RandomData) { +TEST_P(dtrmmTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(dtrmmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(dtrmmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class dtrmmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class dtrmmTestPrint { double alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "dtrmm_"; #elif TEST_CBLAS @@ -119,7 +116,6 @@ class dtrmmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + 
std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -142,8 +138,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values( 1.0, -2.0), // alpha ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::dtrmmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp index df2287c90a..5b4718e269 100644 --- a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp @@ -45,10 +45,10 @@ class strmmTest : gtint_t, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(strmmTest, RandomData) { +TEST_P(strmmTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(strmmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = 20*m*n*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(strmmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class strmmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class strmmTestPrint { float alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "strmm_"; #elif TEST_CBLAS @@ -119,7 +116,6 @@ class strmmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -142,8 +138,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values( 1.0, -2.0), // alpha ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::strmmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/trmm/test_trmm.h b/gtestsuite/testsuite/level3/trmm/test_trmm.h index 1993127bae..96998bd6cb 100644 --- a/gtestsuite/testsuite/level3/trmm/test_trmm.h +++ b/gtestsuite/testsuite/level3/trmm/test_trmm.h @@ -42,20 +42,19 @@ #include template -void test_trmm( char storage, char side, char uploa, char transa, - char diaga, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, - gtint_t ldb_inc, double thresh, char datatype ) { - +void test_trmm( char storage, char side, char uploa, char transa, char diaga, + gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, gtint_t ldb_inc, double thresh ) +{ gtint_t mn; testinghelpers::set_dim_with_side( side, m, n, &mn ); - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, mn, mn, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldb_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, mn, mn, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldb_inc ); //---------------------------------------------------------- // Initialize matrics with random values. //---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, transa, mn, mn, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, 'n', m, n, ldb, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, mn, mn, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, 'n', m, n, ldb ); // Create a copy of v so that we can check reference results. std::vector b_ref(b); @@ -75,4 +74,4 @@ void test_trmm( char storage, char side, char uploa, char transa, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, b.data(), b_ref.data(), ldb, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm/trmm.h b/gtestsuite/testsuite/level3/trmm/trmm.h index 267aa41e7e..51daceccdf 100644 --- a/gtestsuite/testsuite/level3/trmm/trmm.h +++ b/gtestsuite/testsuite/level3/trmm/trmm.h @@ -167,4 +167,4 @@ static void trmm( char storage, char side, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level3/trmm.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp index 823f9fdcf3..e127a0d33c 100644 --- a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp @@ -45,10 +45,10 @@ class ztrmmTest : gtint_t, dcomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ztrmmTest, RandomData) { +TEST_P(ztrmmTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(ztrmmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(ztrmmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trmm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class ztrmmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class ztrmmTestPrint { dcomplex alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "ztrmm_"; #elif TEST_CBLAS @@ -120,7 +117,6 @@ class ztrmmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -143,8 +139,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(dcomplex{1.0,2.0}), // alpha ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(1)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of b ), ::ztrmmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp index a10d9866ef..ccb9770dbe 100644 --- a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp @@ -48,12 +48,12 @@ class ctrmm3Test : scomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ctrmm3Test); -TEST_P(ctrmm3Test, RandomData) { +TEST_P(ctrmm3Test, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -86,8 +86,6 @@ TEST_P(ctrmm3Test, RandomData) { gtint_t lda_inc = std::get<10>(GetParam()); gtint_t ldb_inc = std::get<11>(GetParam()); gtint_t ldc_inc = std::get<12>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<13>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -95,13 +93,13 @@ TEST_P(ctrmm3Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype ); + test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh ); } class ctrmm3TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -115,7 +113,6 @@ class ctrmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - char datatype = std::get<13>(str.param); std::string str_name = "blis_ctrmm3"; str_name = str_name + "_" + sfm+sfm+sfm; 
str_name = str_name + "_" + side + uploa + transa + transb; @@ -131,7 +128,6 @@ class ctrmm3TestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -154,9 +150,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(scomplex{-1.0,1.0}), // beta ::testing::Values(gtint_t(0)), // increment to the leading dim of a ::testing::Values(gtint_t(0)), // increment to the leading dim of b - ::testing::Values(gtint_t(0)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of c ), ::ctrmm3TestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp index 222d70604e..c86ae0ddd1 100644 --- a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp @@ -48,12 +48,12 @@ class dtrmm3Test : double, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dtrmm3Test); -TEST_P(dtrmm3Test, RandomData) { +TEST_P(dtrmm3Test, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -86,8 +86,6 @@ TEST_P(dtrmm3Test, RandomData) { gtint_t lda_inc = std::get<10>(GetParam()); gtint_t ldb_inc = std::get<11>(GetParam()); gtint_t ldc_inc = std::get<12>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<13>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -95,13 +93,13 @@ TEST_P(dtrmm3Test, RandomData) { //---------------------------------------------------------- // Call test body using these 
parameters //---------------------------------------------------------- - test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype ); + test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh ); } class dtrmm3TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -115,7 +113,6 @@ class dtrmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - char datatype = std::get<13>(str.param); std::string str_name = "blis_dtrmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; @@ -129,7 +126,6 @@ class dtrmm3TestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -152,9 +148,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0, 2.0), // beta ::testing::Values(gtint_t(0)), // increment to the leading dim of a ::testing::Values(gtint_t(0)), // increment to the leading dim of b - ::testing::Values(gtint_t(0)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of c ), ::dtrmm3TestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp index df6e4e9bee..9cc27d2e6d 100644 --- a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp @@ -48,12 +48,12 @@ class strmm3Test : 
float, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(strmm3Test); -TEST_P(strmm3Test, RandomData) { +TEST_P(strmm3Test, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -86,8 +86,6 @@ TEST_P(strmm3Test, RandomData) { gtint_t lda_inc = std::get<10>(GetParam()); gtint_t ldb_inc = std::get<11>(GetParam()); gtint_t ldc_inc = std::get<12>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<13>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -95,13 +93,13 @@ TEST_P(strmm3Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype ); + test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh ); } class strmm3TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -115,7 +113,6 @@ class strmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - char datatype = std::get<13>(str.param); std::string str_name = "blis_strmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; @@ -129,7 +126,6 @@ class strmm3TestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return 
str_name; } }; @@ -152,9 +148,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-1.0, 2.0), // beta ::testing::Values(gtint_t(0)), // increment to the leading dim of a ::testing::Values(gtint_t(0)), // increment to the leading dim of b - ::testing::Values(gtint_t(0)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of c ), ::strmm3TestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h index 779f2fef50..208d64a1e0 100644 --- a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h +++ b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h @@ -43,21 +43,21 @@ template void test_trmm3( char storage, char side, char uploa, char transa, char diaga, - char transb, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, gtint_t ldb_inc, - T beta, gtint_t ldc_inc, double thresh, char datatype ) { - + char transb, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, gtint_t ldb_inc, + T beta, gtint_t ldc_inc, double thresh ) +{ gtint_t mn; testinghelpers::set_dim_with_side( side, m, n, &mn ); - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, mn, mn, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, n, ldb_inc); - gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, mn, mn, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, transb, m, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldc_inc ); //---------------------------------------------------------- // Initialize matrics with random values. 
//---------------------------------------------------------- - std::vector a = testinghelpers::get_random_matrix(-2, 8, storage, transa, mn, mn, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(-5, 2, storage, transb, m, n, ldb, datatype); - std::vector c = testinghelpers::get_random_matrix(-3, 5, storage, 'n', m, n, ldc, datatype); + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, mn, mn, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, transb, m, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', m, n, ldc ); // Create a copy of v so that we can check reference results. std::vector c_ref(c); @@ -78,4 +78,4 @@ void test_trmm3( char storage, char side, char uploa, char transa, char diaga, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldb, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm3/trmm3.h b/gtestsuite/testsuite/level3/trmm3/trmm3.h index 2bd52db11a..77be6ce392 100644 --- a/gtestsuite/testsuite/level3/trmm3/trmm3.h +++ b/gtestsuite/testsuite/level3/trmm3/trmm3.h @@ -136,4 +136,4 @@ static void trmm3( char storage, char side, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level3/trmm3.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp index f32c5caab8..2818daf7be 100644 --- a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp @@ -48,12 +48,12 @@ class ztrmm3Test : dcomplex, gtint_t, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ztrmm3Test); -TEST_P(ztrmm3Test, RandomData) { +TEST_P(ztrmm3Test, RandomData) +{ using T = 
dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -86,8 +86,6 @@ TEST_P(ztrmm3Test, RandomData) { gtint_t lda_inc = std::get<10>(GetParam()); gtint_t ldb_inc = std::get<11>(GetParam()); gtint_t ldc_inc = std::get<12>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<13>(GetParam()); // Set the threshold for the errors: double thresh = m*n*testinghelpers::getEpsilon(); @@ -95,13 +93,13 @@ TEST_P(ztrmm3Test, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype ); + test_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh ); } class ztrmm3TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -115,7 +113,6 @@ class ztrmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - char datatype = std::get<13>(str.param); std::string str_name = "blis_ztrmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; @@ -131,7 +128,6 @@ class ztrmm3TestPrint { str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -154,9 +150,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{2.0,-1.0}), // beta ::testing::Values(gtint_t(0)), // increment to the leading dim of a 
::testing::Values(gtint_t(0)), // increment to the leading dim of b - ::testing::Values(gtint_t(0)), // increment to the leading dim of c - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0)) // increment to the leading dim of c ), ::ztrmm3TestPrint() ); -#endif +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp index d4644da077..a33621091b 100644 --- a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp @@ -45,10 +45,10 @@ class ctrsmTest : gtint_t, scomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ctrsmTest, RandomData) { +TEST_P(ctrsmTest, RandomData) +{ using T = scomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(ctrsmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = std::max(m, n)*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(ctrsmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class ctrsmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class ctrsmTestPrint { scomplex alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "ctrsm_"; #elif TEST_CBLAS @@ -120,7 +117,6 @@ class ctrsmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -143,8 +139,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(31), 10), // n ::testing::Values(scomplex{2.0,-1.0}), // alpha ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::ctrsmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp index 9995ca3c6c..0dde0d4545 100644 --- a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp @@ -45,10 +45,10 @@ class dtrsmTest : gtint_t, double, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(dtrsmTest, RandomData) { +TEST_P(dtrsmTest, RandomData) +{ using T = double; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(dtrsmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = std::max(m, n)*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(dtrsmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class dtrsmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class dtrsmTestPrint { double alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "dtrsm_"; #elif TEST_CBLAS @@ -119,7 +116,6 @@ class dtrsmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name 
+ "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -142,8 +138,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(11), 10), // n ::testing::Values( 1.0, -2.0), // alpha ::testing::Values(gtint_t(0), gtint_t(5)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::dtrsmTestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp index aa69d719ac..23922a08ba 100644 --- a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp @@ -45,10 +45,10 @@ class strsmTest : gtint_t, float, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(strsmTest, RandomData) { +TEST_P(strsmTest, RandomData) +{ using T = float; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(strsmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = std::max(m, n)*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(strsmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class strsmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class strsmTestPrint { float alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "strsm_"; #elif TEST_CBLAS @@ -119,7 +116,6 @@ class strsmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -142,8 +138,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(11), 10), // n ::testing::Values( 1.0, -2.0), // alpha ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(4)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::strsmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/level3/trsm/test_trsm.h b/gtestsuite/testsuite/level3/trsm/test_trsm.h index 7145a92156..dc1e2a6cfa 100644 --- a/gtestsuite/testsuite/level3/trsm/test_trsm.h +++ b/gtestsuite/testsuite/level3/trsm/test_trsm.h @@ -42,22 +42,21 @@ #include template -void test_trsm( char storage, char side, char uploa, char transa, - char diaga, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, - gtint_t ldb_inc, double thresh, char datatype ) { - +void test_trsm( char storage, char side, char uploa, char transa, char diaga, + gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, gtint_t ldb_inc, double thresh ) +{ gtint_t mn; testinghelpers::set_dim_with_side( side, m, n, &mn ); - gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, mn, mn, lda_inc); - gtint_t ldb = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldb_inc); + gtint_t lda = testinghelpers::get_leading_dimension( storage, transa, mn, mn, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldb_inc ); //---------------------------------------------------------- // Initialize matrics with random values. //---------------------------------------------------------- gtint_t lower = (diaga = 'n')||(diaga = 'N') ? 3 : 0; gtint_t upper = (diaga = 'n')||(diaga = 'N') ? 10 : 1; - std::vector a = testinghelpers::get_random_matrix(lower, upper, storage, transa, mn, mn, lda, datatype); - std::vector b = testinghelpers::get_random_matrix(3, 10, storage, 'n', m, n, ldb, datatype); + std::vector a = testinghelpers::get_random_matrix( lower, upper, storage, transa, mn, mn, lda ); + std::vector b = testinghelpers::get_random_matrix( 3, 10, storage, 'n', m, n, ldb ); // Making A diagonally dominant so that the condition number is good and // the algorithm doesn't diverge. @@ -77,10 +76,11 @@ void test_trsm( char storage, char side, char uploa, char transa, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_trsm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), lda, b_ref.data(), ldb ); + testinghelpers::ref_trsm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), + lda, b_ref.data(), ldb ); //---------------------------------------------------------- // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, b.data(), b_ref.data(), ldb, thresh ); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trsm/trsm.h b/gtestsuite/testsuite/level3/trsm/trsm.h index bb7f0469e2..8d26f1303b 100644 --- a/gtestsuite/testsuite/level3/trsm/trsm.h +++ b/gtestsuite/testsuite/level3/trsm/trsm.h @@ -167,4 +167,4 @@ static void trsm( char storage, char side, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level3/trsm.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp index 1987251fc2..0b5530e05d 100644 --- a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp @@ -45,10 +45,10 @@ class ztrsmTest : gtint_t, dcomplex, gtint_t, - gtint_t, - char>> {}; + gtint_t>> {}; -TEST_P(ztrsmTest, RandomData) { +TEST_P(ztrsmTest, RandomData) +{ using T = dcomplex; //---------------------------------------------------------- // Initialize values from the parameters passed through @@ -76,8 +76,6 @@ TEST_P(ztrsmTest, RandomData) { // If increments are nonnegative, the array size is bigger than the matrix size. 
gtint_t lda_inc = std::get<8>(GetParam()); gtint_t ldb_inc = std::get<9>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<10>(GetParam()); // Set the threshold for the errors: double thresh = std::max(m, n)*testinghelpers::getEpsilon(); @@ -85,13 +83,13 @@ TEST_P(ztrsmTest, RandomData) { //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype ); + test_trsm( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh ); } class ztrsmTestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { char sfm = std::get<0>(str.param); char side = std::get<1>(str.param); char uploa = std::get<2>(str.param); @@ -102,7 +100,6 @@ class ztrsmTestPrint { dcomplex alpha = std::get<7>(str.param); gtint_t lda_inc = std::get<8>(str.param); gtint_t ldb_inc = std::get<9>(str.param); - char datatype = std::get<10>(str.param); #ifdef TEST_BLAS std::string str_name = "ztrsm_"; #elif TEST_CBLAS @@ -120,7 +117,6 @@ class ztrsmTestPrint { str_name = str_name + "_a" + alpha_str; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); - str_name = str_name + "_" + datatype; return str_name; } }; @@ -143,8 +139,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Range(gtint_t(10), gtint_t(11), 10), // n ::testing::Values(dcomplex{1.0,2.0}), // alpha ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of b - ::testing::Values(ELEMENT_TYPE) // i : integer, f : float datatype type tested + ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::ztrsmTestPrint() - ); + ); \ No newline at end of file diff --git 
a/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp index a020075f2c..898f4fee5c 100644 --- a/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp @@ -36,7 +36,7 @@ #include "test_nrm2.h" class CNrm2Test : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; TEST_P( CNrm2Test, RandomData ) { @@ -49,8 +49,6 @@ TEST_P( CNrm2Test, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = std::sqrt(n)*testinghelpers::getEpsilon(); @@ -58,17 +56,16 @@ TEST_P( CNrm2Test, RandomData ) //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_nrm2(n, incx, thresh, datatype); + test_nrm2( n, incx, thresh ); } // Prints the test case combination class CNrm2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "scnrm2_"; #elif TEST_CBLAS @@ -79,7 +76,6 @@ class CNrm2TestPrint { str_name = str_name + "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -94,8 +90,7 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLIS_TYPED , gtint_t(-1), gtint_t(-2) #endif - ), // stride size for x - ::testing::Values('i') // i : integer, f : float datatype type tested + ) // stride size for x ), ::CNrm2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp index 245b5f49ac..2ea60db522 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp @@ -36,7 +36,7 @@ #include "test_nrm2.h" class dnrm2Test : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; TEST_P( dnrm2Test, RandomData ) { @@ -49,8 +49,6 @@ TEST_P( dnrm2Test, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = std::sqrt(n)*testinghelpers::getEpsilon(); @@ -58,17 +56,16 @@ TEST_P( dnrm2Test, RandomData ) //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_nrm2(n, incx, thresh, datatype); + test_nrm2( n, incx, thresh ); } // Prints the test case combination class dnrm2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "dnrm2_"; #elif TEST_CBLAS @@ -79,7 +76,6 @@ class dnrm2TestPrint { str_name = str_name + "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -94,8 +90,7 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLIS_TYPED ,gtint_t(-1), gtint_t(-2) #endif - ), // stride size for x - ::testing::Values('i') // i : integer, f : float datatype type tested + ) // stride size for x ), ::dnrm2TestPrint() ); diff --git a/gtestsuite/testsuite/util/nrm2/nrm2.h b/gtestsuite/testsuite/util/nrm2/nrm2.h index 9d54d51f65..6e9de9e547 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/nrm2.h @@ -103,4 +103,4 @@ static Treal nrm2(gtint_t n, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/axpyv.h: No interfaces are set to be tested."); #endif -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp index 5bd2bb46e6..7080a144ea 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp @@ -74,4 +74,4 @@ TEST(dnrm2, largeDouble) { norm = nrm2(n, y.data(), 1); EXPECT_EQ(5e300, norm); -} +} \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp index e23bc0d90c..6dcd793253 100644 --- a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp @@ -36,7 +36,7 @@ #include "test_nrm2.h" class snrm2Test : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; TEST_P( snrm2Test, RandomData ) { @@ -49,8 +49,6 @@ TEST_P( snrm2Test, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = 
2*n*testinghelpers::getEpsilon(); @@ -58,17 +56,16 @@ TEST_P( snrm2Test, RandomData ) //---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_nrm2(n, incx, thresh, datatype); + test_nrm2( n, incx, thresh ); } // Prints the test case combination class snrm2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "snrm2_"; #elif TEST_CBLAS @@ -79,7 +76,6 @@ class snrm2TestPrint { str_name = str_name + "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -94,8 +90,7 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLIS_TYPED ,gtint_t(-1), gtint_t(-2) #endif - ), // stride size for x - ::testing::Values('i') // i : integer, f : float datatype type tested + ) // stride size for x ), ::snrm2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h index 2c9de86dc4..6964382aee 100644 --- a/gtestsuite/testsuite/util/nrm2/test_nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h @@ -39,14 +39,15 @@ #include "inc/check_error.h" template -void test_nrm2( gtint_t n, gtint_t incx, double thresh, char datatype ) +void test_nrm2( gtint_t n, gtint_t incx, double thresh ) { //---------------------------------------------------------- // Initialize vectors with random numbers. 
//---------------------------------------------------------- - std::vector x( testinghelpers::buff_dim(n, incx) ); - testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), datatype ); - + //std::vector x( testinghelpers::buff_dim( n, incx ) ); + //testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data() ); + std::vector x = testinghelpers::get_random_vector( -10, -10, n, incx ); + //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- @@ -57,11 +58,10 @@ void test_nrm2( gtint_t n, gtint_t incx, double thresh, char datatype ) //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - real norm = nrm2(n, x.data(), incx); + real norm = nrm2( n, x.data(), incx ); //---------------------------------------------------------- // Compute error. //---------------------------------------------------------- computediff( norm, norm_ref, thresh ); -} - +} \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp index 55c1b9be07..cec642871f 100644 --- a/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp @@ -36,7 +36,7 @@ #include "test_nrm2.h" class znrm2Test : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; TEST_P( znrm2Test, RandomData ) { @@ -49,8 +49,6 @@ TEST_P( znrm2Test, RandomData ) gtint_t n = std::get<0>(GetParam()); // stride size for x: gtint_t incx = std::get<1>(GetParam()); - // specifies the datatype for randomgenerators - char datatype = std::get<2>(GetParam()); // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); @@ -58,17 +56,16 @@ TEST_P( znrm2Test, RandomData ) 
//---------------------------------------------------------- // Call test body using these parameters //---------------------------------------------------------- - test_nrm2(n, incx, thresh, datatype); + test_nrm2( n, incx, thresh ); } // Prints the test case combination class znrm2TestPrint { public: std::string operator()( - testing::TestParamInfo> str) const { + testing::TestParamInfo> str) const { gtint_t n = std::get<0>(str.param); gtint_t incx = std::get<1>(str.param); - char datatype = std::get<2>(str.param); #ifdef TEST_BLAS std::string str_name = "dznrm2_"; #elif TEST_CBLAS @@ -79,7 +76,6 @@ class znrm2TestPrint { str_name = str_name + "_" + std::to_string(n); std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; - str_name = str_name + "_" + datatype; return str_name; } }; @@ -94,8 +90,7 @@ INSTANTIATE_TEST_SUITE_P( #ifndef TEST_BLIS_TYPED ,gtint_t(-1), gtint_t(-2) #endif - ), // stride size for x - ::testing::Values('i') // i : integer, f : float datatype type tested + ) // stride size for x ), ::znrm2TestPrint() - ); + ); \ No newline at end of file From cdba2db827ac44b3f049f22ee0239efc85b05ff6 Mon Sep 17 00:00:00 2001 From: Chandrashekara K R Date: Wed, 7 Jun 2023 11:19:39 +0530 Subject: [PATCH 110/226] BLIS: Added address sanitizer flag for blis library on windows. Description: Added cmake option to test address related issues using address sanitizer (-fsanitize=address) on windows. When the user enables the ENABLE_ASAN_TESTS option, cmake will add related compiler and linker flags along with dependent libraries. 
AMD-Internal: [CPUPL-2984] Change-Id: I6d2a0cfe84fe122fc6c40e3023d8c79211d5fa71 --- CMakeLists.txt | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4051c79c9b..8ffa646e9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,8 @@ option (ENABLE_AOCL_DYNAMIC "Enable Dynamic Multi-threading" OFF) option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" OFF) option(RENAME_BLIS_ARCH_TYPE "Rename BLIS_ARCH_TYPE env var renamed to supplied value" BLIS_ARCH_TYPE) option(RENAME_BLIS_MODEL_TYPE "Rename BLIS_MODEL_TYPE env var renamed to supplied value" BLIS_MODEL_TYPE) +option(ENABLE_ASAN_TESTS "Enable Address Sanitiser tests" OFF) +SET(ASAN_DEPENDENCY_LIB_DIR "C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/Llvm/x64/lib/clang/15.0.1/lib/windows" CACHE STRING "ASAN Dependent library folder name") if (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") set(REF_KERNEL_MIRRORING_PY "${CMAKE_SOURCE_DIR}/build/blis_ref_kernel_mirror.py") @@ -615,6 +617,19 @@ file (STRINGS "version" BLIS_VERSION) set(BLIS_VERSION_STRING ${BLIS_VERSION}) string(TIMESTAMP BUILD_DATE "%Y%m%d") add_definitions(-DBLIS_VERSION_STRING="AOCL-BLIS ${BLIS_VERSION_STRING} Build ${BUILD_DATE}") +if (ENABLE_ASAN_TESTS) + if(ENABLE_OPENMP) + set(STATIC_LIB_OPTIONS "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic-x86_64.lib" "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic_runtime_thunk-x86_64.lib" "${OpenMP_libomp_LIBRARY}") + else () + set(STATIC_LIB_OPTIONS "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic-x86_64.lib" "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic_runtime_thunk-x86_64.lib") + endif() +else() + if(ENABLE_OPENMP) + set(STATIC_LIB_OPTIONS "${OpenMP_libomp_LIBRARY}") + else (ENABLE_OPENMP) + set(STATIC_LIB_OPTIONS "") + endif() +endif () # Set object libraries created in kernels directory to be added into BLIS library. 
set(OBJECT_LIBRARIES @@ -659,10 +674,17 @@ if(NOT BUILD_SHARED_LIBS) ${headers} ${OBJECT_LIBRARIES} ) - if(ENABLE_OPENMP) - set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${OpenMP_libomp_LIBRARY}") - else() - set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}") + set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${STATIC_LIB_OPTIONS}") +endif() + +# Enabling the address sanitizer tests. +if (ENABLE_ASAN_TESTS) + target_compile_options("${PROJECT_NAME}" PRIVATE -fsanitize=address /Od) + if(BUILD_SHARED_LIBS) + # /MD will be used implicitly + target_link_directories("${PROJECT_NAME}" PRIVATE ${ASAN_DEPENDENCY_LIB_DIR}) + target_link_libraries("${PROJECT_NAME}" PRIVATE clang_rt.asan_dynamic-x86_64 clang_rt.asan_dynamic_runtime_thunk-x86_64) + target_link_options("${PROJECT_NAME}" PRIVATE /wholearchive:clang_rt.asan_dynamic_runtime_thunk-x86_64.lib) endif() endif() From fb6f1380b2f9d53bb5e29c3d862213e225f2e2fe Mon Sep 17 00:00:00 2001 From: jagar Date: Mon, 24 Apr 2023 12:27:27 +0530 Subject: [PATCH 111/226] Gtestsuite:Added util functions - Functions to print matrix and vector elements. - Functions to convert matrix to symmetric, hermitian triangular matrix and set diagonal elements in matrix. 
AMD-Internal: [CPUPL-2732] Change-Id: I1ffa5289329cbb8a9581bf545bdd157801cf5baa --- .../inc/common/testing_basics.h | 100 +++++- .../src/common/testing_basics.cpp | 304 +++++++++++++++++- .../testinghelpers/src/level3/ref_trmm3.cpp | 2 +- gtestsuite/testsuite/inc/utils.h | 211 ------------ gtestsuite/testsuite/level2/hemv/test_hemv.h | 5 +- gtestsuite/testsuite/level2/her/test_her.h | 3 +- gtestsuite/testsuite/level2/her2/test_her2.h | 5 +- gtestsuite/testsuite/level2/symv/test_symv.h | 5 +- gtestsuite/testsuite/level2/syr/test_syr.h | 3 +- gtestsuite/testsuite/level2/syr2/test_syr2.h | 5 +- gtestsuite/testsuite/level2/trmv/test_trmv.h | 3 +- gtestsuite/testsuite/level2/trsv/test_trsv.h | 3 +- gtestsuite/testsuite/level3/trmm/test_trmm.h | 3 +- .../testsuite/level3/trmm3/test_trmm3.h | 3 +- gtestsuite/testsuite/level3/trsm/test_trsm.h | 3 +- 15 files changed, 410 insertions(+), 248 deletions(-) delete mode 100644 gtestsuite/testsuite/inc/utils.h diff --git a/gtestsuite/testinghelpers/inc/common/testing_basics.h b/gtestsuite/testinghelpers/inc/common/testing_basics.h index 3176562a72..df2c77059e 100644 --- a/gtestsuite/testinghelpers/inc/common/testing_basics.h +++ b/gtestsuite/testinghelpers/inc/common/testing_basics.h @@ -195,8 +195,8 @@ bool chkconj( char trans ); * @param uplo specifies whether matrix is upper or lower triangular stored in memory. * @return boolean of the triangular form of the matrix. */ -bool chkupper( char uplo ); -bool chklower( char uplo ); +bool is_upper_triangular( char uplo ); +bool is_lower_triangular( char uplo ); /** * @brief Returns the boolean form of a matrix unit/non-unit diagonal form. 
@@ -275,4 +275,100 @@ void set_dims( char trans, gtint_t m, gtint_t n, gtint_t* mt, gtint_t* nt ); */ void set_dim_with_side( char side, gtint_t m, gtint_t n, gtint_t* dim ); +/** + * ========================================================================== + * MKHERM + * Make an n x n matrix A explicitly Hermitian by copying the conjugate + * of the triangle specified by uploa to the opposite triangle. Imaginary + * components of diagonal elements are explicitly set to zero. + * It is assumed that the diagonal offset of A is zero. + * ========================================================================== + * @param[in] storage specifies the storage format of matrix in memory. + * @param[in] uplo specifies upper or lower triangular part of A is used. + * @param[in] n specifies the number of rows & columns of square matrix. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] ld specifies leading dimension for a given matrix. + */ +template +void make_herm( char storage, char uplo, gtint_t n, T* a, gtint_t ld ); + +/** + * ========================================================================== + * MKSYMM + * Make an n x n matrix A explicitly symmetric by copying the triangle + * specified by uploa to the opposite triangle. + * It is assumed that the diagonal offset of A is zero. + * ========================================================================== + * @param[in] storage specifies the storage format of matrix in memory. + * @param[in] uplo specifies upper or lower triangular part of A is used. + * @param[in] n specifies the number of rows & columns of square matrix. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] ld specifies leading dimension for a given matrix. 
+ */ +template +void make_symm( char storage, char uplo, gtint_t n, T* a, gtint_t ld ); + +/** + * ========================================================================== + * MKTRIM + * Make an n x n matrix A explicitly triangular by preserving the triangle + * specified by uploa and zeroing the elements in the opposite triangle. + * It is assumed that the diagonal offset of A is zero + * ========================================================================== + * @param[in] storage specifies the storage format of matrix in memory. + * @param[in] uplo specifies upper or lower triangular part of A is used. + * @param[in] n specifies the number of rows & columns of square matrix. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] ld specifies leading dimension for a given matrix. + */ +template +void make_triangular( char storage, char uplo, gtint_t n, T* a, gtint_t ld ); + +/** + * ========================================================================== + * MKDIAG + * Make an m x n matrix A, which adds a scalar value to + * every element along an arbitrary diagonal of a matrix. + * It is assumed that the diagonal offset of A is zero + * ========================================================================== + * @param[in] storage specifies the storage format of matrix in memory. + * @param[in] m specifies the number of rows of a given matrix. + * @param[in] n specifies the number of columns of a given matrix. + * @param[in] alpha specifies the value to set diagonal elements. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] ld specifies leading dimension for a given matrix. + */ +template +void make_diag( char storage, gtint_t m, gtint_t n, T alpha, T *a, gtint_t ld ); + +/** + * print scalar value + * @param[in] x specifies the value. + * @param[in] spec specifies the format specifer. 
+ */ +template +void print_scalar( T x, const char *spec ); + +/** + * print vector of length n + * @param[in] n specifies the length of the given vector. + * @param[in] x specifies pointer which points to the first element of x. + * @param[in] incx specifies storage spacing between elements of x. + * @param[in] spec specifies the format specifier. + */ +template +void print_vector( gtint_t n, T *x, gtint_t incx, const char *spec ); + +/** + * print matrix of size m x n + * @param[in] storage specifies the storage format of matrix in memory. + * @param[in] m specifies the number of rows of given matrix. + * @param[in] n specifies the number of columns of given matrix. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] ld specifies leading dimension for a given matrix. + * @param[in] spec specifies the format specifier. + */ +template +void print_matrix( char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec ); + } //end of namespace testinghelpers diff --git a/gtestsuite/testinghelpers/src/common/testing_basics.cpp b/gtestsuite/testinghelpers/src/common/testing_basics.cpp index 2d07072716..53c1050795 100644 --- a/gtestsuite/testinghelpers/src/common/testing_basics.cpp +++ b/gtestsuite/testinghelpers/src/common/testing_basics.cpp @@ -34,6 +34,7 @@ #include "common/testing_basics.h" #include "common/type_info.h" +#include "common/complex_helpers.h" namespace testinghelpers { @@ -91,7 +92,6 @@ void char_to_cblas_order( char order, CBLAS_ORDER *cblas_order ) { if ( order == 'c' || order == 'C' ) *cblas_order = CblasColMajor; else if ( order == 'r' || order == 'R' ) *cblas_order = CblasRowMajor; - } void char_to_cblas_trans( char trans, CBLAS_TRANSPOSE *cblas_trans ) @@ -160,16 +160,16 @@ gtint_t get_leading_dimension( char storage, char trans, gtint_t m, gtint_t n, g if( (storage == 'c') || (storage == 'C') ) //column-major order { if ((trans == 'n')||(trans == 'N')) - lda = 
std::max(gtint_t(1),m) + inc; + lda = (std::max)(gtint_t(1),m) + inc; else - lda = std::max(gtint_t(1),n) + inc; + lda = (std::max)(gtint_t(1),n) + inc; } else //row-major order { if ((trans == 'n')||(trans == 'N')) - lda = std::max(gtint_t(1),n) + inc; + lda = (std::max)(gtint_t(1),n) + inc; else - lda = std::max(gtint_t(1),m) + inc; + lda = (std::max)(gtint_t(1),m) + inc; } return lda; } @@ -249,14 +249,14 @@ bool chkconj( char conjx ) ( ( conj & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } -bool chkupper( char uplo ) +bool is_upper_triangular( char uplo ) { uplo_t uploa; char_to_blis_uplo( uplo, &uploa ); return ( bool ) ( uploa == BLIS_UPPER ); } -bool chklower( char uplo ) +bool is_lower_triangular( char uplo ) { uplo_t uploa; char_to_blis_uplo( uplo, &uploa ); @@ -326,4 +326,292 @@ void set_dim_with_side( char side, gtint_t m, gtint_t n, gtint_t* dim ) else *dim = n; } -} //end of namespace testinghelpers +template +static void set_imag_zero(T &x){ + x = {x.real, 0.0}; +} + +/** + * ========================================================================== + * MKHERM + * Make an n x n matrix A explicitly Hermitian by copying the conjugate + * of the triangle specified by uploa to the opposite triangle. Imaginary + * components of diagonal elements are explicitly set to zero. + * It is assumed that the diagonal offset of A is zero. 
+ * ========================================================================== + */ +template +void make_herm( char storage, char uplo, gtint_t n, T* a, gtint_t ld ) +{ + gtint_t rs,cs; + rs=cs=1; + /* a = n x n */ + if( (storage == 'c') || (storage == 'C') ) + cs = ld ; + else + rs = ld ; + + bool uploa = testinghelpers::is_upper_triangular( uplo ); + + if( uploa ) { + gtint_t i, j; + for ( j = 0; j < ( n-1) ; j++ ) + { + for ( i = (j+1) ; i < n ; i++ ) + { + a[i*rs + j*cs] = testinghelpers::conj(a[i*cs + j*rs]); + } + } + } + else + { + gtint_t i, j; + for ( j = 1; j < n ; j++ ) + { + for ( i = 0 ; i < j ; i++ ) + { + a[i*rs + j*cs] = testinghelpers::conj(a[i*cs + j*rs]); + } + } + } + if constexpr (testinghelpers::type_info::is_complex) { + gtint_t i; + for ( i = 0; i < n ; i++ ) + { + set_imag_zero(a[i*rs + i*cs]); + } + } +} +template void make_herm( char, char, gtint_t, float *, gtint_t ); +template void make_herm( char, char, gtint_t, double *, gtint_t ); +template void make_herm( char, char, gtint_t, scomplex *, gtint_t ); +template void make_herm( char, char, gtint_t, dcomplex *, gtint_t ); + +/** + * ========================================================================== + * MKSYMM + * Make an n x n matrix A explicitly symmetric by copying the triangle + * specified by uploa to the opposite triangle. + * It is assumed that the diagonal offset of A is zero. + * ========================================================================== + */ +template +void make_symm( char storage, char uplo, gtint_t n, T* a, gtint_t ld ) +{ + gtint_t rs,cs; + rs=cs=1; + /* a = n x n */ + if( (storage == 'c') || (storage == 'C') ) + cs = ld ; + else + rs = ld ; + + bool uploa = testinghelpers::is_upper_triangular( uplo ); + + /* Toggle uplo so that it refers to the unstored triangle. 
*/ + if( uploa ) { + gtint_t i, j; + for ( j = 0; j < ( n-1) ; j++ ) + { + for ( i = (j+1) ; i < n ; i++ ) + { + a[i*rs + j*cs] = a[i*cs + j*rs]; + } + } + } + else + { + gtint_t i, j; + for ( j = 1; j < n ; j++ ) + { + for ( i = 0 ; i < j ; i++ ) + { + a[i*rs + j*cs] = a[i*cs + j*rs]; + } + } + } +} +template void make_symm( char, char, gtint_t, float *, gtint_t ); +template void make_symm( char, char, gtint_t, double *, gtint_t ); +template void make_symm( char, char, gtint_t, scomplex *, gtint_t ); +template void make_symm( char, char, gtint_t, dcomplex *, gtint_t ); + +/** + * ========================================================================== + * MKTRIM + * Make an n x n matrix A explicitly triangular by preserving the triangle + * specified by uploa and zeroing the elements in the opposite triangle. + * It is assumed that the diagonal offset of A is zero + * ========================================================================== + */ +template +void make_triangular( char storage, char uplo, gtint_t n, T* a, gtint_t ld ) +{ + gtint_t rs,cs; + rs=cs=1; + /* a = n x n */ + if( (storage == 'c') || (storage == 'C') ) + cs = ld ; + else + rs = ld ; + + if ( n < 0 ) + return; + + bool uploa = testinghelpers::is_upper_triangular( uplo ); + T zero; + testinghelpers::initzero(zero); + + /* Toggle uplo so that it refers to the unstored triangle. 
*/ + if( !uploa ) { + gtint_t i, j; + for ( j = 1; j < n ; j++ ) + { + for ( i = 0 ; i < j ; i++ ) + { + a[i*rs + j*cs] = zero; + } + } + } + else + { + gtint_t i, j; + for ( j = 0; j < ( n-1) ; j++ ) + { + for ( i = (j+1) ; i < n ; i++ ) + { + a[i*rs + j*cs] = zero; + } + } + } +} +template void make_triangular( char, char, gtint_t, float *, gtint_t ); +template void make_triangular( char, char, gtint_t, double *, gtint_t ); +template void make_triangular( char, char, gtint_t, scomplex *, gtint_t ); +template void make_triangular( char, char, gtint_t, dcomplex *, gtint_t ); + +/** + * ========================================================================== + * MKDIAG + * Make an m x n matrix A, which adds a scalar value to + * every element along an arbitrary diagonal of a matrix. + * It is assumed that the diagonal offset of A is zero + * ========================================================================== + */ +template +void make_diag( char storage, gtint_t m, gtint_t n, T alpha, T *a, gtint_t ld ) +{ + gtint_t rs,cs; + rs=cs=1; + + if( (storage == 'c') || (storage == 'C') ) + cs = ld ; + else + rs = ld ; + + /* a = mn x mn */ + gtint_t mn = (std::min)( n , m ); + + gtint_t i; + gtint_t inca = rs + cs ; + T *ap = a; + gtint_t ia = 0; + for ( i = 0; i < mn; i++ ) + { + ap[ia] = (alpha + ap[ia]); + ia = ia + inca; + } +} +template void make_diag( char, gtint_t, gtint_t, float, float *, gtint_t ); +template void make_diag( char, gtint_t, gtint_t, double, double *, gtint_t ); +template void make_diag( char, gtint_t, gtint_t, scomplex, scomplex *, gtint_t ); +template void make_diag( char, gtint_t, gtint_t, dcomplex, dcomplex *, gtint_t ); + +/** + * print scalar value + * @param[in] x specifies the value. + * @param[in] spec specifies the format specifer. 
+ */ +template +void print_scalar( T x, const char *spec ) { + if constexpr (testinghelpers::type_info::is_real) + printf(spec, x); + else { + printf( spec, x.real ); + if(x.imag < 0) printf( "-" ); + else printf( "+" ); + printf( spec, abs(x.imag) ); + printf( " " ); + } +} +template void print_scalar( float x, const char * ); +template void print_scalar( double x, const char * ); +template void print_scalar( scomplex x, const char * ); +template void print_scalar( dcomplex x, const char * ); + +/** + * print vector of length n + * @param[in] n specifies the length of the given vector. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] incx specifies storage spacing between elements of a. + * @param[in] spec specifies the format specifer. + */ +template +void print_vector( gtint_t n, T *x, gtint_t incx, const char *spec ) +{ + gtint_t i, idx; + T val; + + for ( i = 0; i < n; i++ ) + { + idx = (incx > 0) ? (i * incx) : ( - ( n - i - 1 ) * incx ); + val = x[idx]; + print_scalar(val,spec); + printf( " " ); + } + printf( "\n\n" ); +} +template void print_vector( gtint_t, float *, gtint_t, const char * ); +template void print_vector( gtint_t, double *, gtint_t, const char * ); +template void print_vector( gtint_t, scomplex *, gtint_t, const char * ); +template void print_vector( gtint_t, dcomplex *, gtint_t, const char * ); + +/** + * print matrix of size m x n + * @param[in] storage specifies the storage format of matrix in memory. + * @param[in] m specifies the number of rows of given matrix. + * @param[in] n specifies the number of columns of given matrix. + * @param[in] a specifies pointer which points to the first element of a. + * @param[in] ld specifies leading dimension for a given matrix. + * @param[in] spec specifies the format specifer. 
+ */ +template +void print_matrix( char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec ) +{ + gtint_t rs,cs; + rs=cs=1; + T val; + if( (storage == 'c') || (storage == 'C') ) + cs = ld ; + else + rs = ld ; + + gtint_t i, j; + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) + { + val = a[i*rs + j*cs]; + print_scalar(val,spec); + printf( " " ); + } + printf( "\n" ); + } + printf( "\n" ); +} +template void print_matrix( char, gtint_t, gtint_t, float *, gtint_t, const char * ); +template void print_matrix( char, gtint_t, gtint_t, double *, gtint_t, const char * ); +template void print_matrix( char, gtint_t, gtint_t, scomplex *, gtint_t, const char * ); +template void print_matrix( char, gtint_t, gtint_t, dcomplex *, gtint_t, const char * ); + +} //end of namespace testinghelpers \ No newline at end of file diff --git a/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp b/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp index 8f409773f7..2633b63b43 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp @@ -64,7 +64,7 @@ void ref_trmm3( char storage, char side, char uploa, char trnsa, char diaga, //* Test the input parameters. bool lside = ( testinghelpers::chksideleft( side ) ); - bool upper = ( testinghelpers::chkupper( uploa ) ); + bool upper = ( testinghelpers::is_upper_triangular( uploa ) ); bool unitdg = ( testinghelpers::chkunitdiag( diaga ) ); bool transa = ( testinghelpers::chktrans( trnsa ) ); bool transb = ( testinghelpers::chktrans( trnsb ) ); diff --git a/gtestsuite/testsuite/inc/utils.h b/gtestsuite/testsuite/inc/utils.h deleted file mode 100644 index ded4e98f92..0000000000 --- a/gtestsuite/testsuite/inc/utils.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#pragma once - -#pragma once -#include "blis.h" -#include "common/testing_helpers.h" - -/* - * ========================================================================== - * MKHERM - * Make an m x m matrix A explicitly Hermitian by copying the conjugate - * of the triangle specified by uploa to the opposite triangle. Imaginary - * components of diagonal elements are explicitly set to zero. - * It is assumed that the diagonal offset of A is zero. 
- * ========================================================================== - */ -template -static void mkherm( char storage, char uplo, gtint_t n, T* ap, gtint_t lda ) -{ - uplo_t uploa; - - // Map parameter characters to BLIS constants. - testinghelpers::char_to_blis_uplo ( uplo, &uploa ); - - dim_t rsa,csa; - rsa=csa=1; - /* a = n x n */ - if( (storage == 'c') || (storage == 'C') ) - csa = lda ; - else - rsa = lda ; - - if constexpr (std::is_same::value) - bli_smkherm( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_dmkherm( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_cmkherm( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_zmkherm( uploa, n, ap, rsa, csa ); - else - - throw std::runtime_error("Error in utils.h: Invalid typename in mkherm()."); -} - -/* - * ========================================================================== - * MKSYMM - * Make an m x m matrix A explicitly symmetric by copying the triangle - * specified by uploa to the opposite triangle. - * It is assumed that the diagonal offset of A is zero. - * ========================================================================== - */ - -template -static void mksymm( char storage, char uplo, gtint_t n, T* ap, gtint_t lda ) -{ - uplo_t uploa; - - // Map parameter characters to BLIS constants. 
- testinghelpers::char_to_blis_uplo ( uplo, &uploa ); - - dim_t rsa,csa; - rsa=csa=1; - /* a = n x n */ - if( (storage == 'c') || (storage == 'C') ) - csa = lda ; - else - rsa = lda ; - - if constexpr (std::is_same::value) - bli_smksymm( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_dmksymm( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_cmksymm( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_zmksymm( uploa, n, ap, rsa, csa ); - else - - throw std::runtime_error("Error in utils.h: Invalid typename in mksymm()."); -} - -/* - * ========================================================================== - * MKTRIM - * Make an m x m matrix A explicitly triangular by preserving the triangle - * specified by uploa and zeroing the elements in the opposite triangle. - * It is assumed that the diagonal offset of A is zero - * ========================================================================== - */ -template -static void mktrim( char storage, char uplo, gtint_t n, T* ap, gtint_t lda ) -{ - uplo_t uploa; - - // Map parameter characters to BLIS constants. 
- testinghelpers::char_to_blis_uplo ( uplo, &uploa ); - - dim_t rsa,csa; - rsa=csa=1; - /* a = n x n */ - if( (storage == 'c') || (storage == 'C') ) - csa = lda ; - else - rsa = lda ; - - if constexpr (std::is_same::value) - bli_smktrim( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_dmktrim( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_cmktrim( uploa, n, ap, rsa, csa ); - else if constexpr (std::is_same::value) - bli_zmktrim( uploa, n, ap, rsa, csa ); - else - - throw std::runtime_error("Error in utils.h: Invalid typename in mktrim()."); -} - -template -static void print( T x, const char *spec ) { - if constexpr (testinghelpers::type_info::is_real) - printf(spec, x); - else { - printf( spec, x.real ); - if(x.imag < 0) printf( " -" ); - else printf( " +" ); - printf( spec, abs(x.imag) ); - printf( " " ); - } -} - -template -void printmat( const char *mat, char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec ) -{ - dim_t i, j; - dim_t rs,cs; - rs=cs=1; - T val; - if( (storage == 'c') || (storage == 'C') ) - cs = ld ; - else - rs = ld ; - - std::cout <<"matrix : " << mat << std::endl; - - for ( i = 0; i < m; i++ ) - { - for ( j = 0; j < n; j++ ) - { - val = a[i*rs + j*cs]; - print(val,spec); - printf( " " ); - } - printf( "\n" ); - } - printf( "\n" ); -} - -template -void printvec( const char *vec, gtint_t n, T *x, gtint_t incx, const char *spec ) -{ - dim_t i, idx; - T val; - - std::cout <<"vector : " << vec << std::endl; - - for ( i = 0; i < n; i++ ) - { - idx = (incx > 0) ? 
(i * incx) : ( - ( n - i - 1 ) * incx ); - val = x[idx]; - print(val,spec); - printf( " " ); - } - printf( "\n\n" ); -} - diff --git a/gtestsuite/testsuite/level2/hemv/test_hemv.h b/gtestsuite/testsuite/level2/hemv/test_hemv.h index 4985c4644e..a5018701af 100644 --- a/gtestsuite/testsuite/level2/hemv/test_hemv.h +++ b/gtestsuite/testsuite/level2/hemv/test_hemv.h @@ -37,7 +37,6 @@ #include "hemv.h" #include "level2/ref_hemv.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -55,8 +54,8 @@ void test_hemv( char storage, char uploa, char conja, char conjx, gtint_t n, std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); std::vector y = testinghelpers::get_random_vector( -3, 3, n, incy ); - mkherm( storage, uploa, n, a.data(), lda ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_herm( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. std::vector y_ref(y); diff --git a/gtestsuite/testsuite/level2/her/test_her.h b/gtestsuite/testsuite/level2/her/test_her.h index 6e18e04810..b0975b2ad1 100644 --- a/gtestsuite/testsuite/level2/her/test_her.h +++ b/gtestsuite/testsuite/level2/her/test_her.h @@ -37,7 +37,6 @@ #include "her.h" #include "level2/ref_her.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -54,7 +53,7 @@ void test_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha, std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. 
std::vector a_ref(a); diff --git a/gtestsuite/testsuite/level2/her2/test_her2.h b/gtestsuite/testsuite/level2/her2/test_her2.h index f896e89cac..487454ae9d 100644 --- a/gtestsuite/testsuite/level2/her2/test_her2.h +++ b/gtestsuite/testsuite/level2/her2/test_her2.h @@ -37,7 +37,6 @@ #include "her2.h" #include "level2/ref_her2.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -55,8 +54,8 @@ void test_her2( char storage, char uploa, char conjx, char conjy, gtint_t n, std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); std::vector y = testinghelpers::get_random_vector( -2, 5, n, incy ); - mkherm( storage, uploa, n, a.data(), lda ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_herm( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. std::vector a_ref(a); diff --git a/gtestsuite/testsuite/level2/symv/test_symv.h b/gtestsuite/testsuite/level2/symv/test_symv.h index a808060d52..789caecbae 100644 --- a/gtestsuite/testsuite/level2/symv/test_symv.h +++ b/gtestsuite/testsuite/level2/symv/test_symv.h @@ -37,7 +37,6 @@ #include "symv.h" #include "level2/ref_symv.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -55,8 +54,8 @@ void test_symv( char storage, char uploa, char conja, char conjx, gtint_t n, std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); std::vector y = testinghelpers::get_random_vector( -2, 5, n, incy ); - mksymm( storage, uploa, n, a.data(), lda ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_symm( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. 
std::vector y_ref(y); diff --git a/gtestsuite/testsuite/level2/syr/test_syr.h b/gtestsuite/testsuite/level2/syr/test_syr.h index 3227cc2a4a..3a62dd371a 100644 --- a/gtestsuite/testsuite/level2/syr/test_syr.h +++ b/gtestsuite/testsuite/level2/syr/test_syr.h @@ -37,7 +37,6 @@ #include "syr.h" #include "level2/ref_syr.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -54,7 +53,7 @@ void test_syr( char storage, char uploa, char conjx, gtint_t n, T alpha, std::vector a = testinghelpers::get_random_matrix( -2, 5, storage, 'n', n, n, lda ); std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. std::vector a_ref(a); diff --git a/gtestsuite/testsuite/level2/syr2/test_syr2.h b/gtestsuite/testsuite/level2/syr2/test_syr2.h index 9389b67172..5f4e81f7b6 100644 --- a/gtestsuite/testsuite/level2/syr2/test_syr2.h +++ b/gtestsuite/testsuite/level2/syr2/test_syr2.h @@ -37,7 +37,6 @@ #include "syr2.h" #include "level2/ref_syr2.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -55,8 +54,8 @@ void test_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n, std::vector x = testinghelpers::get_random_vector( -3, 3, n, incx ); std::vector y = testinghelpers::get_random_vector( -3, 3, n, incy ); - mksymm( storage, uploa, n, a.data(), lda ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_symm( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. 
std::vector a_ref(a); diff --git a/gtestsuite/testsuite/level2/trmv/test_trmv.h b/gtestsuite/testsuite/level2/trmv/test_trmv.h index 80cf4d4f5f..2ac5c70145 100644 --- a/gtestsuite/testsuite/level2/trmv/test_trmv.h +++ b/gtestsuite/testsuite/level2/trmv/test_trmv.h @@ -37,7 +37,6 @@ #include "trmv.h" #include "level2/ref_trmv.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -54,7 +53,7 @@ void test_trmv( char storage, char uploa, char transa, char diaga, gtint_t n, std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, transa, n, n, lda ); std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. std::vector x_ref(x); diff --git a/gtestsuite/testsuite/level2/trsv/test_trsv.h b/gtestsuite/testsuite/level2/trsv/test_trsv.h index 096cd5ee0a..c5f8cd61cd 100644 --- a/gtestsuite/testsuite/level2/trsv/test_trsv.h +++ b/gtestsuite/testsuite/level2/trsv/test_trsv.h @@ -37,7 +37,6 @@ #include "trsv.h" #include "level2/ref_trsv.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -54,7 +53,7 @@ void test_trsv( char storage, char uploa, char transa, char diaga, gtint_t n, std::vector a = testinghelpers::get_random_matrix( 1, 5, storage, transa, n, n, lda ); std::vector x = testinghelpers::get_random_vector( 1, 3, n, incx ); - mktrim( storage, uploa, n, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, n, a.data(), lda ); // Create a copy of c so that we can check reference results. 
std::vector x_ref(x); diff --git a/gtestsuite/testsuite/level3/trmm/test_trmm.h b/gtestsuite/testsuite/level3/trmm/test_trmm.h index 96998bd6cb..11e74f286c 100644 --- a/gtestsuite/testsuite/level3/trmm/test_trmm.h +++ b/gtestsuite/testsuite/level3/trmm/test_trmm.h @@ -37,7 +37,6 @@ #include "trmm.h" #include "level3/ref_trmm.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -59,7 +58,7 @@ void test_trmm( char storage, char side, char uploa, char transa, char diaga, // Create a copy of v so that we can check reference results. std::vector b_ref(b); - mktrim( storage, uploa, mn, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, mn, a.data(), lda ); //---------------------------------------------------------- // Call BLIS function //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h index 208d64a1e0..84d6d1c0bd 100644 --- a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h +++ b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h @@ -37,7 +37,6 @@ #include "trmm3.h" #include "level3/ref_trmm3.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -62,7 +61,7 @@ void test_trmm3( char storage, char side, char uploa, char transa, char diaga, // Create a copy of v so that we can check reference results. 
std::vector c_ref(c); - mktrim( storage, uploa, mn, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, mn, a.data(), lda ); //---------------------------------------------------------- // Call BLIS function //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/trsm/test_trsm.h b/gtestsuite/testsuite/level3/trsm/test_trsm.h index dc1e2a6cfa..698a382823 100644 --- a/gtestsuite/testsuite/level3/trsm/test_trsm.h +++ b/gtestsuite/testsuite/level3/trsm/test_trsm.h @@ -37,7 +37,6 @@ #include "trsm.h" #include "level3/ref_trsm.h" #include "inc/check_error.h" -#include "inc/utils.h" #include #include @@ -67,7 +66,7 @@ void test_trsm( char storage, char side, char uploa, char transa, char diaga, // Create a copy of v so that we can check reference results. std::vector b_ref(b); - mktrim( storage, uploa, mn, a.data(), lda ); + testinghelpers::make_triangular( storage, uploa, mn, a.data(), lda ); //---------------------------------------------------------- // Call BLIS function //---------------------------------------------------------- From ffbb0e83e5554f53f1d203af45508f0996e72e3b Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Fri, 23 Jun 2023 15:12:45 +0530 Subject: [PATCH 112/226] ZGEMM optimization for cases when m = 1 or n = 1 - When n = 1 and A matrix is transposed ZGEMV row major variant is invoked. - When m = 1 and B matrix is not transposed ZGEMV row major variant is invoked. - This redirection happens before parallel ZGEMM check. This is done to avoid the unnecessary condition check. Any parallelization check is expected to happen in the invoked ZGEMV interface.
AMD-Internal: [CPUPL-2773] Change-Id: I6b7b31db712edc682c089475d12e98730a960138 --- frame/compat/bla_gemm_amd.c | 121 +++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 44 deletions(-) diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index afbecd2a58..5c199e3712 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -917,6 +917,83 @@ void zgemm_blis_impl bli_obj_set_conjtrans( blis_transa, &ao ); bli_obj_set_conjtrans( blis_transb, &bo ); + /* Call GEMV when m == 1 or n == 1 with the context set + to an uninitialized void pointer i.e. ((void *)0)*/ + if (n0 == 1) + { + if (bli_is_notrans(blis_transa)) + { + bli_zgemv_unf_var2 + ( + blis_transa, + bli_extract_conj(blis_transb), + m0, k0, + (dcomplex *)alpha, + (dcomplex *)a, rs_a, cs_a, + (dcomplex *)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, + (dcomplex *)beta, + c, rs_c, + ((void *)0) + ); + } + else + { + bli_zgemv_unf_var1 + ( + blis_transa, + bli_extract_conj(blis_transb), + k0, m0, + (dcomplex *)alpha, + (dcomplex *)a, rs_a, cs_a, + (dcomplex *)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, + (dcomplex *)beta, + c, rs_c, + ((void *)0) + ); + } + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + bli_finalize_auto(); + return; + } + else if (m0 == 1) + { + if (bli_is_notrans(blis_transb)) + { + bli_zgemv_unf_var1 + ( + blis_transb, + bli_extract_conj(blis_transa), + n0, k0, + (dcomplex *)alpha, + (dcomplex *)b, cs_b, rs_b, + (dcomplex *)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, + (dcomplex *)beta, + c, cs_c, + ((void *)0) + ); + } + else + { + bli_zgemv_unf_var2 + ( + blis_transb, + bli_extract_conj(blis_transa), + k0, n0, + (dcomplex *)alpha, + (dcomplex *)b, cs_b, rs_b, + (dcomplex *)a, bli_is_notrans(blis_transa) ? 
cs_a : rs_a, + (dcomplex *)beta, + c, cs_c, + ((void *)0) + ); + } + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + bli_finalize_auto(); + return; + } + // default instance performance tuning is done in zgemm. // Single instance tuning is done based on env set. //dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 ); @@ -970,50 +1047,6 @@ void zgemm_blis_impl return; } - /* Call Gemv when m/n=1 */ - if (n0 == 1) - { - if (bli_is_notrans(blis_transa)) - { - bli_zgemv_unf_var2( - BLIS_NO_TRANSPOSE, - bli_extract_conj(blis_transb), - m0, k0, - (dcomplex *)alpha, - (dcomplex *)a, rs_a, cs_a, - (dcomplex *)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, - (dcomplex *)beta, - c, rs_c, - ((void *)0)); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - } - else if (m0 == 1) - { - if (bli_is_trans(blis_transb)) - { - bli_zgemv_unf_var2( - blis_transb, - bli_extract_conj(blis_transa), - k0, n0, - (dcomplex *)alpha, - (dcomplex *)b, cs_b, rs_b, - (dcomplex *)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, - (dcomplex *)beta, - c, cs_c, - ((void *)0)); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - } - #ifdef BLIS_ENABLE_SMALL_MATRIX if (((!is_parallel) && (((m0*k0) <= 16384) || ((n0*k0) <= 16384))) || From 660cd6d1b27f4b06afa62c3670f97b55fa8a307e Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Tue, 4 Jul 2023 16:26:15 +0530 Subject: [PATCH 113/226] Adding nrm2 target for benchmarking on Windows. Modifying blis/bench/CMakeLists.txt to include nrm2 target and produce the corresponding executable. 
AMD-Internal: [CPUPL-3625] Change-Id: I7945416142e07ac99510ed9500a2c620053c7e13 --- bench/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index 00d01fdd21..4c6fed1140 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -61,6 +61,13 @@ if(ENABLE_OPENMP) endif() target_link_libraries(BenchGer optimized "${LIB_NAME}.lib") +add_executable(BenchNrm2 bench_nrm2.c) +target_link_libraries(BenchNrm2 debug "${LIB_NAME}.lib") +if(ENABLE_OPENMP) + target_link_libraries(BenchNrm2 OpenMP::OpenMP_CXX) +endif() +target_link_libraries(BenchNrm2 optimized "${LIB_NAME}.lib") + add_executable(BenchScalv bench_scalv.c) target_link_libraries(BenchScalv debug "${LIB_NAME}.lib") if(ENABLE_OPENMP) From 7b78d9328260c5ff6272913f50736fea88be5e9a Mon Sep 17 00:00:00 2001 From: Chandrashekara K R Date: Wed, 5 Jul 2023 14:38:34 +0530 Subject: [PATCH 114/226] Removing omp library linking to static multithreaded library build. Description: We have seen the library dependency issue when we are linking the libomp.lib or libiomp5md.lib while building the library for static multithreaded scenario. So we are removing the linking of openmp library for static multithreaded blis library build. So that user can link any openmp library(libomp.lib or libiomp5md.lib) while building their applications by linking static multithreaded blis library. 
AMD-Internal: [SWLCSG-2196] Change-Id: I96722f3587ee555af12de664957c211c56fcf03d --- CMakeLists.txt | 14 +----- test/CMakeLists.txt | 96 ++++++++++++++++++++-------------------- testsuite/CMakeLists.txt | 4 +- 3 files changed, 52 insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ffa646e9b..6143056a82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,7 @@ option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functi option(RENAME_BLIS_ARCH_TYPE "Rename BLIS_ARCH_TYPE env var renamed to supplied value" BLIS_ARCH_TYPE) option(RENAME_BLIS_MODEL_TYPE "Rename BLIS_MODEL_TYPE env var renamed to supplied value" BLIS_MODEL_TYPE) option(ENABLE_ASAN_TESTS "Enable Address Sanitiser tests" OFF) -SET(ASAN_DEPENDENCY_LIB_DIR "C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/Llvm/x64/lib/clang/15.0.1/lib/windows" CACHE STRING "ASAN Dependent library folder name") +SET(ASAN_DEPENDENCY_LIB_DIR "" CACHE STRING "ASAN Dependent library folder name") if (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") set(REF_KERNEL_MIRRORING_PY "${CMAKE_SOURCE_DIR}/build/blis_ref_kernel_mirror.py") @@ -618,17 +618,7 @@ set(BLIS_VERSION_STRING ${BLIS_VERSION}) string(TIMESTAMP BUILD_DATE "%Y%m%d") add_definitions(-DBLIS_VERSION_STRING="AOCL-BLIS ${BLIS_VERSION_STRING} Build ${BUILD_DATE}") if (ENABLE_ASAN_TESTS) - if(ENABLE_OPENMP) - set(STATIC_LIB_OPTIONS "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic-x86_64.lib" "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic_runtime_thunk-x86_64.lib" "${OpenMP_libomp_LIBRARY}") - else () - set(STATIC_LIB_OPTIONS "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic-x86_64.lib" "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic_runtime_thunk-x86_64.lib") - endif() -else() - if(ENABLE_OPENMP) - set(STATIC_LIB_OPTIONS "${OpenMP_libomp_LIBRARY}") - else (ENABLE_OPENMP) - set(STATIC_LIB_OPTIONS "") - endif() + set(STATIC_LIB_OPTIONS "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic-x86_64.lib" 
"${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic_runtime_thunk-x86_64.lib") endif () # Set object libraries created in kernels directory to be added into BLIS library. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 77b746ba94..d116e942d0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -4,169 +4,169 @@ add_definitions(-DBLAS="AOCL") add_executable(TestAminv test_aminv.c) target_link_libraries(TestAminv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestAminv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestAminv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestAminv optimized "${LIB_NAME}.lib") add_executable(TestAxpyv test_axpyv.c) target_link_libraries(TestAxpyv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestAxpyv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestAxpyv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestAxpyv optimized "${LIB_NAME}.lib") add_executable(TestAxpbyv test_axpbyv.c) target_link_libraries(TestAxpbyv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestAxpbyv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestAxpbyv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestAxpbyv optimized "${LIB_NAME}.lib") add_executable(TestCopyv test_copyv.c) target_link_libraries(TestCopyv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestCopyv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestCopyv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestCopyv optimized "${LIB_NAME}.lib") add_executable(TestCabs1 test_cabs1.c) target_link_libraries(TestCabs1 debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestCabs1 "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestCabs1 OpenMP::OpenMP_CXX) endif() target_link_libraries(TestCabs1 optimized 
"${LIB_NAME}.lib") add_executable(TestDotv test_dotv.c) target_link_libraries(TestDotv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestDotv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestDotv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestDotv optimized "${LIB_NAME}.lib") add_executable(TestGemm test_gemm.c) target_link_libraries(TestGemm debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestGemm "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestGemm OpenMP::OpenMP_CXX) endif() target_link_libraries(TestGemm optimized "${LIB_NAME}.lib") add_executable(TestGemmBatch test_gemm_batch.c) target_link_libraries(TestGemmBatch debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestGemmBatch "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestGemmBatch OpenMP::OpenMP_CXX) endif() target_link_libraries(TestGemmBatch optimized "${LIB_NAME}.lib") add_executable(TestGemm3m test_gemm3m.c) target_link_libraries(TestGemm3m debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestGemm3m "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestGemm3m OpenMP::OpenMP_CXX) endif() target_link_libraries(TestGemm3m optimized "${LIB_NAME}.lib") add_executable(TestGemmt test_gemmt.c) target_link_libraries(TestGemmt debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestGemmt "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestGemmt OpenMP::OpenMP_CXX) endif() target_link_libraries(TestGemmt optimized "${LIB_NAME}.lib") add_executable(TestGemv test_gemv.c) target_link_libraries(TestGemv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestGemv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestGemv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestGemv optimized "${LIB_NAME}.lib") 
add_executable(TestGer test_ger.c) target_link_libraries(TestGer debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestGer "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestGer OpenMP::OpenMP_CXX) endif() target_link_libraries(TestGer optimized "${LIB_NAME}.lib") add_executable(TestHemm test_hemm.c) target_link_libraries(TestHemm debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestHemm "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestHemm OpenMP::OpenMP_CXX) endif() target_link_libraries(TestHemm optimized "${LIB_NAME}.lib") add_executable(TestHemv test_hemv.c) target_link_libraries(TestHemv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestHemv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestHemv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestHemv optimized "${LIB_NAME}.lib") add_executable(TestHer test_her.c) target_link_libraries(TestHer debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestHer "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestHer OpenMP::OpenMP_CXX) endif() target_link_libraries(TestHer optimized "${LIB_NAME}.lib") add_executable(TestHer2 test_her2.c) target_link_libraries(TestHer2 debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestHer2 "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestHer2 OpenMP::OpenMP_CXX) endif() target_link_libraries(TestHer2 optimized "${LIB_NAME}.lib") add_executable(TestHer2k test_her2k.c) target_link_libraries(TestHer2k debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestHer2k "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestHer2k OpenMP::OpenMP_CXX) endif() target_link_libraries(TestHer2k optimized "${LIB_NAME}.lib") add_executable(TestHerk test_herk.c) target_link_libraries(TestHerk debug 
"${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestHerk "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestHerk OpenMP::OpenMP_CXX) endif() target_link_libraries(TestHerk optimized "${LIB_NAME}.lib") add_executable(TestScalv test_scalv.c) target_link_libraries(TestScalv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestScalv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestScalv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestScalv optimized "${LIB_NAME}.lib") add_executable(TestSwapv test_swapv.c) target_link_libraries(TestSwapv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestSwapv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestSwapv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestSwapv optimized "${LIB_NAME}.lib") add_executable(TestTrmm test_trmm.c) target_link_libraries(TestTrmm debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestTrmm "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestTrmm OpenMP::OpenMP_CXX) endif() target_link_libraries(TestTrmm optimized "${LIB_NAME}.lib") add_executable(TestTrmv test_trmv.c) target_link_libraries(TestTrmv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestTrmv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestTrmv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestTrmv optimized "${LIB_NAME}.lib") add_executable(TestTrsm test_trsm.c) target_link_libraries(TestTrsm debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(TestTrsm "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestTrsm OpenMP::OpenMP_CXX) endif() target_link_libraries(TestTrsm optimized "${LIB_NAME}.lib") add_executable(TestTrsv test_trsv.c) target_link_libraries(TestTrsv debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - 
target_link_libraries(TestTrsv "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(TestTrsv OpenMP::OpenMP_CXX) endif() target_link_libraries(TestTrsv optimized "${LIB_NAME}.lib") diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt index b997b8a8d9..85866926dd 100644 --- a/testsuite/CMakeLists.txt +++ b/testsuite/CMakeLists.txt @@ -7,8 +7,8 @@ add_executable(test_libblis "") add_subdirectory(src) target_link_libraries(test_libblis debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP AND BUILD_SHARED_LIBS) - target_link_libraries(test_libblis "${OMP_LIB}") +if(ENABLE_OPENMP) + target_link_libraries(test_libblis OpenMP::OpenMP_CXX) endif() target_link_libraries(test_libblis optimized "${LIB_NAME}.lib") From 79e174ff0aaa86ea9cdb2e74b7f6ca71465e6191 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Tue, 18 Jul 2023 12:44:44 +0530 Subject: [PATCH 115/226] Level-3 triangular routines now use different block sizes and kernels. Details: - Eliminated the need for override function in SUP for GEMMT/SYRK. - New set of block sizes, kernels and kernel preferences are added to cntx data structure for level-3 triangular routines. - Added supporting functions to set and get the above parameters from cntx. - Modified GEMMT/SYRK SUP code to use these new block sizes/kernels. In case they are not set, use the default block sizes/kernels of Level-3 SUP. 
AMD-Internal: [CPUPL-3649] Change-Id: Iee11bd4c4f1d8fbbb749c296258d1b8121c009a0 --- config/amdzen/bli_family_amdzen.h | 2 - config/zen4/bli_cntx_init_zen4.c | 94 +++++----- config/zen4/bli_family_zen4.h | 2 - frame/3/bli_l3_sup.c | 35 +--- frame/3/bli_l3_sup_int_amd.c | 12 +- frame/3/gemmt/bli_gemmt_sup_var1n2m.c | 159 ++++++++++------- frame/base/bli_cntx.c | 239 +++++++++++++++++++++++++- frame/base/bli_cntx.h | 91 ++++++++++ frame/include/bli_type_defs.h | 3 + ref_kernels/bli_cntx_ref.c | 75 +++++++- 10 files changed, 562 insertions(+), 150 deletions(-) diff --git a/config/amdzen/bli_family_amdzen.h b/config/amdzen/bli_family_amdzen.h index 7e4d460d13..5a1fbc68d9 100644 --- a/config/amdzen/bli_family_amdzen.h +++ b/config/amdzen/bli_family_amdzen.h @@ -72,8 +72,6 @@ */ BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx); -BLIS_EXPORT_BLIS void bli_zen4_override_gemmt_blkszs (cntx_t* cntx); - /* * Restore the block sizes to default values needed for zen4 context. * diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index 8dda84ccce..066bf34df6 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -320,54 +320,10 @@ void bli_cntx_init_zen4( cntx_t* cntx ) BLIS_MR, &blkszs[ BLIS_MR ], cntx ); -} - -/* - * Override the block sizes in the context to the block sizes used - * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default - * GEMM kernels are AVX512 based and uses different block sizes. - * - * This function should be called in TRSM path before performing - * any packing operations. 
- * - * Also the context must be restored to default values by calling - * bli_zen4_restore_default_blkszs() before exiting TRSM Path - */ -void bli_zen4_override_trsm_blkszs (cntx_t* cntx) -{ - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 4080 ); - - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. - bli_cntx_set_blkszs - ( - BLIS_NAT, 5, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); -} - - -// Since the output of syrk/gemmt is a triangular matrix, -// near-to-square shaped kernel performs better than -// skewed/rectangular shaped kernel. -// Hence we are overriding blocksizes and kernel -// function pointers for gemmt/syrk with avx2 specific ones -void bli_zen4_override_gemmt_blkszs (cntx_t* cntx) -{ - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + // Initialize level-3 sup blocksize objects for operations dealing with + //triangular objects with architecture-specific values. + // bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, 9, 9, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); @@ -376,18 +332,19 @@ void bli_zen4_override_gemmt_blkszs (cntx_t* cntx) bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
- bli_cntx_set_l3_sup_blkszs + bli_cntx_set_l3_sup_tri_blkszs ( - 4, + 5, // level-3 BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); - bli_cntx_set_l3_sup_kers + bli_cntx_set_l3_sup_tri_kers ( 24, BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, @@ -418,6 +375,43 @@ void bli_zen4_override_gemmt_blkszs (cntx_t* cntx) ); } +/* + * Override the block sizes in the context to the block sizes used + * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default + * GEMM kernels are AVX512 based and uses different block sizes. + * + * This function should be called in TRSM path before performing + * any packing operations. + * + * Also the context must be restored to default values by calling + * bli_zen4_restore_default_blkszs() before exiting TRSM Path + */ +void bli_zen4_override_trsm_blkszs (cntx_t* cntx) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 4080 ); + + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); +} + + /* * Restore the block sizes to default values needed for zen4 context. 
* diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index a1666ea9d3..55d3e62d3c 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -73,8 +73,6 @@ */ BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx); -BLIS_EXPORT_BLIS void bli_zen4_override_gemmt_blkszs (cntx_t* cntx); - /* * Restore the block sizes to default values needed for zen4 context. * diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 5ee53bf951..afd74d2ee4 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -250,8 +250,6 @@ err_t bli_gemmtsup // that function assumes the context pointer is valid. if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - cntx_t cntx_gemmt = *cntx; - thresh_func_ft func_fp; func_fp = bli_cntx_get_l3_thresh_func(BLIS_GEMMT, cntx); @@ -268,19 +266,6 @@ err_t bli_gemmtsup if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } -#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) - - if((bli_arch_query_id() == BLIS_ARCH_ZEN4)) - { - if( bli_obj_dt(a) != BLIS_SCOMPLEX ) - { - // override the existing blocksizes with AVX-2 specific ones. - // Since gemmt has a triangular matrix as output, near-to-square - // shaped kernel perform better than skewed/rectangular shaped kernel. - bli_zen4_override_gemmt_blkszs(&cntx_gemmt); - } - } -#endif #ifdef AOCL_DYNAMIC // If dynamic-threading is enabled, calculate optimum number // of threads and update in rntm @@ -322,7 +307,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", b, beta, c, - &cntx_gemmt, + cntx, rntm ); @@ -398,8 +383,6 @@ err_t bli_syrksup // that function assumes the context pointer is valid. 
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - cntx_t cntx_syrk = *cntx; - thresh_func_ft func_fp = bli_cntx_get_l3_thresh_func(BLIS_SYRK, cntx); if( !func_fp( a, &at_local, c, cntx)) { @@ -413,20 +396,6 @@ err_t bli_syrksup if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } -#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) - - if((bli_arch_query_id() == BLIS_ARCH_ZEN4)) - { - if( bli_obj_dt(a) != BLIS_SCOMPLEX ) - { - // override the existing blocksizes with AVX-2 specific ones. - // Since gemmt has a triangular matrix as output, near-to-square - // shaped kernel perform better than skewed/rectangular shaped kernel. - bli_zen4_override_gemmt_blkszs(&cntx_syrk); - } - } -#endif - #ifdef AOCL_DYNAMIC // Will change this name later to BLIS_SMART_THREAD // If dynamic-threading is enabled, calculate optimum // number of threads. @@ -467,7 +436,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", &at_local, beta, c, - &cntx_syrk, + cntx, rntm ); diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c index 029c383dc1..69b691674c 100644 --- a/frame/3/bli_l3_sup_int_amd.c +++ b/frame/3/bli_l3_sup_int_amd.c @@ -288,16 +288,22 @@ err_t bli_gemmtsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); + bool row_pref = bli_cntx_l3_sup_tri_ker_prefers_rows_dt( dt, stor_id, cntx ); + if( (0 == MR) || (0 == NR) ) + { + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + } const bool is_primary = ( row_pref ? 
is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); const dim_t m = bli_obj_length( c ); const dim_t n = m; const dim_t k = bli_obj_width( a ); - const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const bool auto_factor = bli_rntm_auto_factor( rntm ); const dim_t n_threads = bli_rntm_num_threads( rntm ); bool use_bp = TRUE; diff --git a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c index 982dc6e035..0b5176a6ab 100644 --- a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c +++ b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c @@ -364,11 +364,31 @@ void PASTEMACT(ch,opname,uplo,varname) \ stor_id = bli_stor3_trans( stor_id ); \ \ /* Query the context for various blocksizes. */ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ + /* Query the maximum blocksize for MR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t MRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_MR, cntx ); \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC0 ) || ( 0 == MC0 ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t MRE = MRM - MR; \ \ dim_t KC; \ if ( packa && packb ) \ @@ -412,11 +432,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \ const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ -\ - /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const dim_t MRE = MRM - MR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = rs_c; \ @@ -436,11 +451,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_c = cs_c * NR; \ const inc_t irstep_b = cs_b * NR; \ */ \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ ctype* restrict a_00 = a; \ ctype* restrict b_00 = b; \ @@ -838,11 +848,31 @@ void PASTEMACT(ch,opname,uplo,varname) \ stor_id = bli_stor3_trans( stor_id ); \ \ /* Query the context for various blocksizes. 
*/ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the maximum blocksize for MR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t MRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_MR, cntx ); \ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC0 ) || ( 0 == MC0 ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t MRE = MRM - MR; \ \ dim_t KC; \ if ( packa && packb ) \ @@ -886,11 +916,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. 
*/ \ const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ -\ - /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const dim_t MRE = MRM - MR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = rs_c; \ @@ -910,11 +935,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_c = cs_c * NR; \ const inc_t irstep_b = cs_b * NR; \ */ \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ ctype* restrict a_00 = a; \ ctype* restrict b_00 = b; \ @@ -1487,11 +1507,31 @@ void PASTEMACT(ch,opname,uplo,varname) \ } \ \ /* Query the context for various blocksizes. */ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ + /* Query the maximum blocksize for NR, which implies a maximum blocksize + extension for the final iteration. 
*/ \ + dim_t NRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC ) || ( 0 == MC ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t NRE = NRM - NR; \ \ dim_t KC; \ if ( packa && packb ) \ @@ -1530,11 +1570,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ } \ -\ - /* Query the maximum blocksize for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - const dim_t NRE = NRM - NR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ @@ -1558,10 +1593,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_a = rs_a * MR; \ */ \ \ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ /* storage-scheme of ct should be same as that of C. @@ -2143,11 +2174,32 @@ void PASTEMACT(ch,opname,uplo,varname) \ } \ \ /* Query the context for various blocksizes. 
*/ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the maximum blocksize for NR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t NRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC ) || ( 0 == MC ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t NRE = NRM - NR; \ \ dim_t KC; \ if ( packa && packb ) \ @@ -2193,11 +2245,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ } \ -\ - /* Query the maximum blocksize for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - const dim_t NRE = NRM - NR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ @@ -2221,10 +2268,6 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_a = rs_a * MR; \ */ \ \ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ /* Storage scheme of ct should be same as that of C. diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 3a8a2f0d70..774a31ead7 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved. 
+ Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1086,6 +1086,110 @@ void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ) // ----------------------------------------------------------------------------- +void bli_cntx_set_l3_sup_tri_blkszs( dim_t n_bs, ... ) +{ + // This function can be called from the bli_cntx_init_*() function for + // a particular architecture if the kernel developer wishes to use + // non-default l3 sup blocksizes for operations that deal with triangular + // matrices. It should be called after bli_cntx_init_defaults() so that + // the context begins with default blocksizes across all datatypes. + + /* Example prototypes: + + void bli_cntx_set_blkszs + ( + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, + bszid_t bs1_id, blksz_t* blksz1, + bszid_t bs2_id, blksz_t* blksz2, + ... + cntx_t* cntx + ); + */ + + va_list args; + dim_t i; + + // Allocate some temporary local arrays. + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif + bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif + blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_bs ); + + // Process n_bs tuples. + for ( i = 0; i < n_bs; ++i ) + { + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process, + // - the address of the blksz_t object. + bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); + blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); + + // Store the values in our temporary arrays. + bszids[ i ] = bs_id; + blkszs[ i ] = blksz; + } + + // The last argument should be the context pointer. 
+ cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the addresses of: + // - the blocksize object array + blksz_t* cntx_l3_sup_tri_blkszs = bli_cntx_l3_sup_tri_blkszs_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. Notice that the blksz_t* pointers were saved, rather than + // the objects themselves, but we copy the contents of the objects + // when copying into the context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_bs; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. + bszid_t bs_id = bszids[ i ]; + blksz_t* blksz = blkszs[ i ]; + + blksz_t* cntx_l3_sup_tri_blksz = &cntx_l3_sup_tri_blkszs[ bs_id ]; + + // Copy the blksz_t object contents into the appropriate + // location within the context's blksz_t array. + //cntx_l3_sup_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy( blksz, cntx_l3_sup_blksz ); + bli_blksz_copy_if_pos( blksz, cntx_l3_sup_tri_blksz ); + } + + // Free the temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif + bli_free_intl( blkszs ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif + bli_free_intl( bszids ); +} + +// ----------------------------------------------------------------------------- + void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) { // This function can be called from the bli_cntx_init_*() function for @@ -1262,6 +1366,139 @@ void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) bli_free_intl( ukr_prefs ); } +// ----------------------------------------------------------------------------- + +void bli_cntx_set_l3_sup_tri_kers( dim_t n_ukrs, ... 
) +{ + // This function can be called from the bli_cntx_init_*() function for + // a particular architecture if the kernel developer wishes to use + // non-default level-3 microkernels for small/unpacked matrices for operations + // that deal with triangular matrices. It should be called after + // bli_cntx_init_defaults() so that the context begins with default sup + // micro/millikernels across all datatypes. + + /* Example prototypes: + + void bli_cntx_set_l3_sup_kers + ( + dim_t n_ukrs, + stor3_t stor_id0, num_t dt0, void* ukr0_fp, bool pref0, + stor3_t stor_id1, num_t dt1, void* ukr1_fp, bool pref1, + stor3_t stor_id2, num_t dt2, void* ukr2_fp, bool pref2, + ... + cntx_t* cntx + ); + */ + + va_list args; + dim_t i; + + // Allocate some temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_ukrs ); + + // Process n_ukrs tuples. + for ( i = 0; i < n_ukrs; ++i ) + { + // Here, we query the variable argument list for: + // - the stor3_t storage case being assigned to the kernel we're + // about to process, + // - the datatype of the kernel, + // - the kernel function pointer, and + // - the kernel function storage preference + // that we need to store to the context. 
+ const stor3_t st3_id = ( stor3_t )va_arg( args, stor3_t ); + const num_t ukr_dt = ( num_t )va_arg( args, num_t ); + void* ukr_fp = ( void* )va_arg( args, void* ); + const bool ukr_pref = ( bool )va_arg( args, int ); + + // Store the values in our temporary arrays. + st3_ids[ i ] = st3_id; + ukr_dts[ i ] = ukr_dt; + ukr_fps[ i ] = ukr_fp; + ukr_prefs[ i ] = ukr_pref; + } + + // The last argument should be the context pointer. + cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the addresses of: + // - the l3 small/unpacked ukernel func_t array + // - the l3 small/unpacked ukernel preferences array + func_t* cntx_l3_sup_tri_kers = bli_cntx_l3_sup_tri_kers_buf( cntx ); + mbool_t* cntx_l3_sup_tri_kers_prefs = bli_cntx_l3_sup_tri_kers_prefs_buf( cntx ); + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_ukrs; ++i ) + { + // Read the current stor3_t id, ukernel datatype, ukernel function + // pointer, and ukernel preference. + const stor3_t st3_id = st3_ids[ i ]; + const num_t ukr_dt = ukr_dts[ i ]; + void* ukr_fp = ukr_fps[ i ]; + const bool ukr_pref = ukr_prefs[ i ]; + + // Index to the func_t and mbool_t for the current stor3_t id + // being processed. + func_t* ukrs = &cntx_l3_sup_tri_kers[ st3_id ]; + mbool_t* prefs = &cntx_l3_sup_tri_kers_prefs[ st3_id ]; + + // Store the ukernel function pointer and preference values into + // the stor3_t location in the context. + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); + } + + // Free the temporary local arrays. 
+ #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + bli_free_intl( st3_ids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + bli_free_intl( ukr_dts ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + bli_free_intl( ukr_fps ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_sup_tri_kers(): " ); + #endif + bli_free_intl( ukr_prefs ); +} + // ----------------------------------------------------------------------------- void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... ) { diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 1c47b9e583..e76c544ae6 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -112,14 +112,26 @@ BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } +BLIS_INLINE blksz_t* bli_cntx_l3_sup_tri_blkszs_buf( cntx_t* cntx ) +{ + return cntx->l3_sup_tri_blkszs; +} BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } +BLIS_INLINE func_t* bli_cntx_l3_sup_tri_kers_buf( cntx_t* cntx ) +{ + return cntx->l3_sup_tri_kers; +} BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } +BLIS_INLINE mbool_t* bli_cntx_l3_sup_tri_kers_prefs_buf( cntx_t* cntx ) +{ + return cntx->l3_sup_tri_kers_prefs; +} BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; @@ -331,7 +343,14 @@ BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) // Return the address of the blksz_t identified by bs_id. return blksz; } +BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_tri_blksz( bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blkszs = bli_cntx_l3_sup_tri_blkszs_buf( cntx ); + blksz_t* blksz = &blkszs[ bs_id ]; + // Return the address of the blksz_t identified by bs_id. 
+ return blksz; +} BLIS_INLINE blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_trsm_blkszs_buf( cntx ); @@ -351,6 +370,15 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cnt return bs_dt; } +BLIS_INLINE dim_t bli_cntx_get_l3_sup_tri_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blksz = bli_cntx_get_l3_sup_tri_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_def( dt, blksz ); + + // Return the main (default) blocksize value for the datatype given. + return bs_dt; +} + BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); @@ -360,6 +388,20 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cnt return bs_dt; } +BLIS_INLINE dim_t bli_cntx_get_l3_sup_tri_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blksz = bli_cntx_get_l3_sup_tri_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_max( dt, blksz ); + + if( bs_dt <= 0) + { + blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); + bs_dt = bli_blksz_get_max( dt, blksz ); + } + // Return the auxiliary (maximum) blocksize value for the datatype given. 
+ return bs_dt; +} + // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) @@ -370,6 +412,14 @@ BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) return func; } +BLIS_INLINE func_t* bli_cntx_get_l3_sup_tri_kers( stor3_t stor_id, cntx_t* cntx ) +{ + func_t* funcs = bli_cntx_l3_sup_tri_kers_buf( cntx ); + func_t* func = &funcs[ stor_id ]; + + return func; +} + BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); @@ -377,6 +427,13 @@ BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* return bli_func_get_dt( dt, func ); } +BLIS_INLINE void* bli_cntx_get_l3_sup_tri_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) +{ + func_t* func = bli_cntx_get_l3_sup_tri_kers( stor_id, cntx ); + + return bli_func_get_dt( dt, func ); +} + // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) @@ -387,6 +444,14 @@ BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cnt return mbool; } +BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_tri_ker_prefs( stor3_t stor_id, cntx_t* cntx ) +{ + mbool_t* mbools = bli_cntx_l3_sup_tri_kers_prefs_buf( cntx ); + mbool_t* mbool = &mbools[ stor_id ]; + + return mbool; +} + BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); @@ -394,6 +459,13 @@ BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cn return ( bool )bli_mbool_get_dt( dt, mbool ); } +BLIS_INLINE bool bli_cntx_get_l3_sup_tri_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) +{ + mbool_t* mbool = bli_cntx_get_l3_sup_tri_ker_prefs( stor_id, cntx ); + + return ( bool 
)bli_mbool_get_dt( dt, mbool ); +} + // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) @@ -634,6 +706,15 @@ BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, ( prefs == TRUE ); } +BLIS_INLINE bool bli_cntx_l3_sup_tri_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) +{ + const bool prefs = bli_cntx_get_l3_sup_tri_ker_prefs_dt( dt, stor_id, cntx ); + + // A ukernel preference of TRUE means the ukernel prefers row storage. + return ( bool ) + ( prefs == TRUE ); +} + BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); @@ -643,6 +724,14 @@ BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, ( prefs == FALSE ); } +BLIS_INLINE bool bli_cntx_l3_sup_tri_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) +{ + const bool prefs = bli_cntx_get_l3_sup_tri_ker_prefs_dt( dt, stor_id, cntx ); + + // A ukernel preference of FALSE means the ukernel prefers column storage. + return ( bool ) + ( prefs == FALSE ); +} BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); @@ -834,7 +923,9 @@ BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_tri_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_tri_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 4a4dc0ec4e..22a96c215f 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1481,8 +1481,11 @@ typedef struct cntx_s blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t l3_sup_tri_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; + func_t l3_sup_tri_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; + mbool_t l3_sup_tri_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 00acdfd08d..fedd7bc5c2 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -529,6 +529,79 @@ void GENBARNAME(cntx_init) bli_mbool_init( &mbools[ BLIS_XXX ], TRUE, TRUE, TRUE, TRUE ); + // -- Set level-3 small/unpacked micro-kernels, preferences and blocksizes + // for matrices dealing with triangular matrices------------- + +// -- Set blocksizes ------------------------------------------------------- + + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 0, 0, 0 ); + + // Initialize the context with the default blocksize objects and their + // multiples. + bli_cntx_set_l3_sup_tri_blkszs + ( + 5, + // level-3 + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + funcs = bli_cntx_l3_sup_tri_kers_buf( cntx ); + mbools = bli_cntx_l3_sup_tri_kers_prefs_buf( cntx ); + +#if 0 + // Adhere to the small/unpacked ukernel mappings: + // - rv -> rrr, rcr + // - rg -> rrc, rcc + // - cv -> ccr, ccc + // - cg -> crr, crc + gen_sup_func_init( &funcs[ BLIS_RRR ], + &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); + gen_sup_func_init( &funcs[ BLIS_RRC ], + &funcs[ BLIS_RCC ], gemmsup_rg_ukr_name ); + gen_sup_func_init( &funcs[ BLIS_CCR ], + &funcs[ BLIS_CCC ], gemmsup_cv_ukr_name ); + gen_sup_func_init( &funcs[ BLIS_CRR ], + &funcs[ BLIS_CRC ], gemmsup_cg_ukr_name ); +#endif + gen_func_init( &funcs[ BLIS_RRR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_RRC ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_RCC ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_CRR ], gemmsup_rv_ukr_name ); + 
gen_func_init( &funcs[ BLIS_CRC ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_CCR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_CCC ], gemmsup_rv_ukr_name ); + + // Register the general-stride/generic ukernel to the "catch-all" slot + // associated with the BLIS_XXX enum value. This slot will be queried if + // *any* operand is stored with general stride. + gen_func_init( &funcs[ BLIS_XXX ], gemmsup_gx_ukr_name ); + + + // Set the l3 sup ukernel storage preferences. + // s d c z + bli_mbool_init( &mbools[ BLIS_RRR ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_RRC ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_RCR ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_RCC ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_CRR ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_CRC ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_CCR ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_CCC ], TRUE, TRUE, TRUE, TRUE ); + + bli_mbool_init( &mbools[ BLIS_XXX ], TRUE, TRUE, TRUE, TRUE ); + + // -- Set level-1f kernels ------------------------------------------------- funcs = bli_cntx_l1f_kers_buf( cntx ); From 954c97f858c02eacc926f4520cb3dd5b447da654 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Wed, 7 Jun 2023 09:05:10 +0000 Subject: [PATCH 116/226] Added NT in DTL logs for GEMMT, TRSM and NRM2 - Number of threads and gflops are added in the DTL logs for GEMMT, TRSM and NRM2 AMD-Internal: [CPUPL-2144] Change-Id: If68887a5150bd0feda351180f379996497a1e678 --- aocl_dtl/aocldtl_blis.c | 78 +++++++++++++++++++++++++++++++++++-- aocl_dtl/aocldtl_blis.h | 32 ++++++++++++++- frame/compat/bla_gemmt.c | 6 ++- frame/compat/bla_nrm2.c | 3 +- frame/compat/bla_trsm_amd.c | 55 +++++++++++++++++++++++++- 5 files changed, 164 insertions(+), 10 deletions(-) diff --git a/aocl_dtl/aocldtl_blis.c b/aocl_dtl/aocldtl_blis.c index c4de2bfcda..078da3b5db 100755 --- a/aocl_dtl/aocldtl_blis.c +++ 
b/aocl_dtl/aocldtl_blis.c @@ -3,7 +3,7 @@ * * Description : BLIS library specific debug helpes. * - * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ @@ -111,6 +111,25 @@ void AOCL_DTL_log_gemm_stats(int8 loglevel, DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); } +void AOCL_DTL_log_gemmt_stats(int8 loglevel, + const f77_int n, + const f77_int k) +{ + char buffer[256]; + + double flops = n * n * k; + + // Execution time is in micro seconds. + Double execution_time = AOCL_DTL_get_time_spent(); + + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + + DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); +} + void AOCL_DTL_log_trsm_sizes(int8 loglevel, char dt_type, f77_char side, @@ -131,17 +150,47 @@ void AOCL_DTL_log_trsm_sizes(int8 loglevel, double alpha_real = 0.0; double alpha_imag = 0.0; + DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag); //{S, D, C, Z} side, uplo, transa, diaga, m, n, lda, ldb, alpha_real, alpha_imag - sprintf(buffer, "%c %c %c %c %c %ld %ld %ld %ld %lf %lf\n", dt_type, + sprintf(buffer, "%c %c %c %c %c %ld %ld %ld %ld %lf %lf", dt_type, side, uploa, transa, diaga, (dim_t)m, (dim_t)n, (dim_t)lda, (dim_t)ldb, alpha_real, alpha_imag); + AOCL_DTL_START_PERF_TIMER(); DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer); } +void AOCL_DTL_log_trsm_stats(int8 loglevel, + f77_char side, + const f77_int m, + const f77_int n) +{ + char buffer[256]; + + double flops = 0.0; + if (side == 'L' || side =='l') + { + flops = 1.0 * m * n * m; + } + else + { + flops = 1.0 * m * n * n; + } + + // Execution time is in micro seconds. 
+ Double execution_time = AOCL_DTL_get_time_spent(); + + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + + DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); +} + void AOCL_DTL_log_gemmt_sizes(int8 loglevel, char dt_type, char uplo, @@ -165,18 +214,20 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel, double beta_real = 0.0; double beta_imag = 0.0; + DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag); DTL_get_complex_parts(dt_type, beta, &beta_real, &beta_imag); // {S,D,C,Z} {triangC : l or u} {n k lda ldb ldc transa transb alpha_real alpha_imaginary // beta_real, beta_imaginary} - sprintf(buffer, "%c %c %ld %ld %lu %lu %lu %c %c %lf %lf %lf %lf\n", + sprintf(buffer, "%c %c %ld %ld %lu %lu %lu %c %c %lf %lf %lf %lf", dt_type, uplo, (dim_t)n, (dim_t)k, (dim_t)lda, (dim_t)ldb, (dim_t)ldc, transa, transb, alpha_real, alpha_imag, beta_real, beta_imag); + AOCL_DTL_START_PERF_TIMER(); DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer); } @@ -639,12 +690,31 @@ void AOCL_DTL_log_nrm2_sizes(int8 loglevel, { char buffer[256]; // {S, D, C, Z} {n, incx} - sprintf(buffer, "%c %ld %ld\n", + sprintf(buffer, "%c %ld %ld", dt_type, (dim_t)n, (dim_t)incx); + AOCL_DTL_START_PERF_TIMER(); DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer); } +void AOCL_DTL_log_nrm2_stats(int8 loglevel, + const f77_int n) +{ + char buffer[256]; + + double flops = 2.0 * n; + + // Execution time is in micro seconds. 
+ Double execution_time = AOCL_DTL_get_time_spent(); + + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + + DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); +} + //Level-2 void AOCL_DTL_log_syr2_sizes(int8 loglevel, char dt_type, diff --git a/aocl_dtl/aocldtl_blis.h b/aocl_dtl/aocldtl_blis.h index 7b352f9d43..924dcc7445 100755 --- a/aocl_dtl/aocldtl_blis.h +++ b/aocl_dtl/aocldtl_blis.h @@ -3,7 +3,7 @@ * * Description : BLIS library specific debug helpes. * - * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ @@ -37,6 +37,11 @@ void AOCL_DTL_log_gemm_stats(int8 loglevel, const f77_int n, const f77_int k); +void AOCL_DTL_log_trsm_stats(int8 loglevel, + f77_char side, + const f77_int m, + const f77_int n); + void AOCL_DTL_log_trsm_sizes(int8 loglevel, char dt, f77_char side, @@ -68,6 +73,10 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel, const char* function_name, int line); +void AOCL_DTL_log_gemmt_stats(int8 loglevel, + const f77_int n, + const f77_int k); + void AOCL_DTL_log_hemm_sizes(int8 loglevel, char dt_type, const f77_char side, @@ -243,6 +252,9 @@ void AOCL_DTL_log_nrm2_sizes( int8 loglevel, const char* function_name, int line); +void AOCL_DTL_log_nrm2_stats(int8 loglevel, + const f77_int n); + void AOCL_DTL_log_amax_sizes ( int8 loglevel, char dt_type, const f77_int n, @@ -393,11 +405,19 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, if (gbIsLoggingEnabled) \ AOCL_DTL_log_gemm_stats(loglevel, m, n, k); +#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, n, k) \ + if (gbIsLoggingEnabled) \ + AOCL_DTL_log_gemmt_stats(loglevel, n, k); + #define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) \ if (gbIsLoggingEnabled) \ 
AOCL_DTL_log_trsm_sizes(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb, \ __FILE__, __FUNCTION__, __LINE__); +#define AOCL_DTL_LOG_TRSM_STATS(loglevel, side, m, n) \ + if (gbIsLoggingEnabled) \ + AOCL_DTL_log_trsm_stats(loglevel, side, m, n); + #define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc) \ if (gbIsLoggingEnabled) \ AOCL_DTL_log_gemmt_sizes(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc, \ @@ -460,6 +480,10 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, if (gbIsLoggingEnabled) \ AOCL_DTL_log_nrm2_sizes(loglevel, dt_type, n, incx, __FILE__,__FUNCTION__,__LINE__); +#define AOCL_DTL_LOG_NRM2_STATS(loglevel, n) \ + if (gbIsLoggingEnabled) \ + AOCL_DTL_log_nrm2_stats(loglevel, n); + #define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy) \ if (gbIsLoggingEnabled) \ AOCL_DTL_log_hemv_sizes(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy, \ @@ -535,8 +559,12 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, #define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) +#define AOCL_DTL_LOG_TRSM_STATS(loglevel, side, m, n) + #define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc) +#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, n, k) + #define AOCL_DTL_LOG_HEMM_INPUTS(loglevel, dt_type, side, uplo, m, n, alpha, lda, ldb, beta, ldc) #define AOCL_DTL_LOG_HERK_INPUTS(loglevel, dt_type, uploc, transa, m, k, alpha, lda, beta, ldc) @@ -561,6 +589,8 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, #define AOCL_DTL_LOG_NRM2_INPUTS(loglevel, dt_type, n, incx) +#define AOCL_DTL_LOG_NRM2_STATS(loglevel, n) + #define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy) #define AOCL_DTL_LOG_HER2_INPUTS(loglevel, dt_type, uploa, m, alpha, incx, incy, lda) diff --git a/frame/compat/bla_gemmt.c b/frame/compat/bla_gemmt.c index 
24a6d1324e..e2a25321ec 100644 --- a/frame/compat/bla_gemmt.c +++ b/frame/compat/bla_gemmt.c @@ -89,6 +89,7 @@ void PASTEF77S(ch,blasname) \ if ( *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -112,8 +113,6 @@ void PASTEF77S(ch,blasname) \ rs_c = 1; \ cs_c = *ldc; \ \ - if(!( n )) \ - return; \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ @@ -131,6 +130,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -203,6 +203,7 @@ void PASTEF77S(ch,blasname) \ if ( *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -262,6 +263,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ diff --git a/frame/compat/bla_nrm2.c b/frame/compat/bla_nrm2.c index db866a83ca..e17baf282c 100755 --- a/frame/compat/bla_nrm2.c +++ b/frame/compat/bla_nrm2.c @@ -49,7 +49,7 @@ ftype_r PASTEF772S(chr,chx,blasname) \ ) \ { \ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ - AOCL_DTL_LOG_NRM2_INPUTS(AOCL_DTL_LEVEL_TRACE_1,*MKSTR(chr),*n, *incx);\ + AOCL_DTL_LOG_NRM2_INPUTS(AOCL_DTL_LEVEL_TRACE_1,*MKSTR(chx),*n, *incx);\ dim_t n0; \ ftype_x* x0; \ inc_t incx0; \ @@ -75,6 +75,7 @@ ftype_r PASTEF772S(chr,chx,blasname) \ NULL \ ); \ \ + AOCL_DTL_LOG_NRM2_STATS(AOCL_DTL_LEVEL_TRACE_1, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c index a4585235a1..382989b621 100644 --- a/frame/compat/bla_trsm_amd.c +++ b/frame/compat/bla_trsm_amd.c @@ -89,6 +89,7 @@ void PASTEF77S(ch,blasname) \ /* Quick return if possible. */ \ if ( *m == 0 || *n == 0 ) \ { \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -123,7 +124,8 @@ void PASTEF77S(ch,blasname) \ (ftype*) b, rs_b, cs_b, \ NULL, NULL \ ); \ - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ return; \ @@ -145,6 +147,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -216,6 +219,7 @@ void PASTEF77S(ch,blasname) \ /* Quick return if possible. */ \ if ( *m == 0 || *n == 0 ) \ { \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -251,6 +255,7 @@ void PASTEF77S(ch,blasname) \ (ftype*) b, rs_b, cs_b, \ NULL, NULL \ ); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ @@ -299,6 +304,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)b, rs_b, \ NULL \ ); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -315,6 +321,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)b, rs_b, \ NULL \ ); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -340,6 +347,7 @@ void PASTEF77S(ch,blasname) \ PASTEMAC(ch,invscals)( a_conj, b[indx] ); \ } \ }\ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -363,6 +371,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)a, cs_a, rs_a, \ (ftype*)b, cs_b, \ NULL); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -381,6 +390,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)a, cs_a, rs_a, \ (ftype*)b, cs_b, \ NULL); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -406,6 +416,7 @@ void PASTEF77S(ch,blasname) \ PASTEMAC(ch,invscals)( a_conj, b[indx*cs_b] ); \ }\ } \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -442,6 +453,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -511,6 +523,7 @@ void strsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -546,6 +559,7 @@ void strsm_blis_impl (float*) b, rs_b, cs_b, NULL, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. */ bli_finalize_auto(); @@ -569,6 +583,7 @@ void strsm_blis_impl (float*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -585,6 +600,7 @@ void strsm_blis_impl (float*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -609,6 +625,7 @@ void strsm_blis_impl b[indx] = ( inva * b[indx] ); } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -635,6 +652,7 @@ void strsm_blis_impl (float*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -656,6 +674,7 @@ void strsm_blis_impl (float*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -680,6 +699,7 @@ void strsm_blis_impl b[indx*cs_b] = (inva * b[indx*cs_b] ); } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -731,6 +751,7 @@ void strsm_blis_impl ); if (status == BLIS_SUCCESS) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -750,6 +771,7 @@ void strsm_blis_impl NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. */ bli_finalize_auto(); @@ -817,6 +839,7 @@ void dtrsm_blis_impl /* Quick return if possible. 
*/ if ( *m == 0 || *n == 0 ) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -852,6 +875,7 @@ void dtrsm_blis_impl (double*) b, rs_b, cs_b, NULL, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. */ bli_finalize_auto(); @@ -875,6 +899,7 @@ void dtrsm_blis_impl (double*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -891,6 +916,7 @@ void dtrsm_blis_impl (double*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -915,6 +941,7 @@ void dtrsm_blis_impl b[indx] = ( inva * b[indx] ); } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -941,6 +968,7 @@ void dtrsm_blis_impl (double*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -962,6 +990,7 @@ void dtrsm_blis_impl (double*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -986,6 +1015,7 @@ void dtrsm_blis_impl b[indx*cs_b] = (inva * b[indx*cs_b] ); } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1107,6 +1137,7 @@ void dtrsm_blis_impl } if (status == BLIS_SUCCESS) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1124,7 +1155,7 @@ void dtrsm_blis_impl NULL, NULL ); - + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1193,6 +1224,7 @@ void ztrsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1228,6 +1260,7 @@ void ztrsm_blis_impl (dcomplex*) b, rs_b, cs_b, NULL, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. */ bli_finalize_auto(); @@ -1251,6 +1284,7 @@ void ztrsm_blis_impl (dcomplex*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1267,6 +1301,7 @@ void ztrsm_blis_impl (dcomplex*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1321,6 +1356,7 @@ void ztrsm_blis_impl } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1347,6 +1383,7 @@ void ztrsm_blis_impl (dcomplex*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1368,6 +1405,7 @@ void ztrsm_blis_impl (dcomplex*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1421,6 +1459,7 @@ void ztrsm_blis_impl } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; @@ -1475,6 +1514,7 @@ void ztrsm_blis_impl ); if (status == BLIS_SUCCESS) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1494,6 +1534,7 @@ void ztrsm_blis_impl NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1562,6 +1603,7 @@ void ctrsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1597,6 +1639,7 @@ void ctrsm_blis_impl (scomplex*) b, rs_b, cs_b, NULL, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. */ bli_finalize_auto(); @@ -1620,6 +1663,7 @@ void ctrsm_blis_impl (scomplex*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1636,6 +1680,7 @@ void ctrsm_blis_impl (scomplex*)b, rs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1689,6 +1734,7 @@ void ctrsm_blis_impl } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; @@ -1716,6 +1762,7 @@ void ctrsm_blis_impl (scomplex*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1737,6 +1784,7 @@ void ctrsm_blis_impl (scomplex*)b, cs_b, NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1790,6 +1838,7 @@ void ctrsm_blis_impl } } + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1842,6 +1891,7 @@ void ctrsm_blis_impl ); if (status == BLIS_SUCCESS) { + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1861,6 +1911,7 @@ void ctrsm_blis_impl NULL ); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. 
*/ bli_finalize_auto(); From fa77d0415a594b1c3c5888a73c10e88b7faf2150 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Thu, 13 Apr 2023 15:59:13 +0530 Subject: [PATCH 117/226] Updating nrm2 GTestSuite testing - Adding default template parameter for the type of the returned value from nrm2. - Bugfix on NaN/Inf comparator for scalars. - Tuning sizes of vector x to exercise the different paths for vectorized and scalar code. - Adding wrong parameters and extreme value testing. - Adding tests for overflow and underflow using max and min representable numbers for vectorized and scalar code. AMD-Internal: [CPUPL-2732] Change-Id: Ice8ee65095ecaa7b30ebd5f90ed2a890178533db --- gtestsuite/testinghelpers/inc/util/ref_nrm2.h | 4 +- .../testinghelpers/src/util/ref_nrm2.cpp | 6 +- gtestsuite/testsuite/inc/check_error.h | 3 +- .../testsuite/util/nrm2/cnrm2_generic.cpp | 96 -------- .../util/nrm2/dnrm2_extreme_values.cpp | 191 ++++++++++++++++ .../testsuite/util/nrm2/dnrm2_generic.cpp | 31 ++- .../util/nrm2/dznrm2_extreme_values.cpp | 190 ++++++++++++++++ .../testsuite/util/nrm2/dznrm2_generic.cpp | 81 +++++++ gtestsuite/testsuite/util/nrm2/nrm2.h | 37 +-- .../testsuite/util/nrm2/nrm2_corner_cases.cpp | 73 ++++++ .../testsuite/util/nrm2/nrm2_extreme_vals.cpp | 77 ------- .../util/nrm2/nrm2_invalid_inputs.cpp | 27 +++ .../util/nrm2/nrm2_underflow_overflow.cpp | 84 +++++++ .../util/nrm2/scnrm2_extreme_values.cpp | 211 +++++++++++++++++ .../testsuite/util/nrm2/scnrm2_generic.cpp | 82 +++++++ .../util/nrm2/snrm2_extreme_values.cpp | 215 ++++++++++++++++++ .../testsuite/util/nrm2/snrm2_generic.cpp | 33 ++- gtestsuite/testsuite/util/nrm2/test_nrm2.h | 63 ++++- .../testsuite/util/nrm2/znrm2_generic.cpp | 96 -------- 19 files changed, 1292 insertions(+), 308 deletions(-) delete mode 100644 gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp create mode 100644 
gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp delete mode 100644 gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp create mode 100644 gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp delete mode 100644 gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp diff --git a/gtestsuite/testinghelpers/inc/util/ref_nrm2.h b/gtestsuite/testinghelpers/inc/util/ref_nrm2.h index 44c506b715..3163d46556 100644 --- a/gtestsuite/testinghelpers/inc/util/ref_nrm2.h +++ b/gtestsuite/testinghelpers/inc/util/ref_nrm2.h @@ -46,7 +46,7 @@ namespace testinghelpers { -template -T ref_nrm2(gtint_t n, Tf* x, gtint_t incx); +template ::real_type> +RT ref_nrm2(gtint_t n, T* x, gtint_t incx); } //end of namespace testinghelpers diff --git a/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp b/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp index d61b5735dd..75021e412e 100644 --- a/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp +++ b/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp @@ -46,10 +46,10 @@ namespace testinghelpers { -template -Treal ref_nrm2(gtint_t n, T* x, gtint_t incx) { +template +RT ref_nrm2(gtint_t n, T* x, gtint_t incx) { - typedef Treal (*Fptr_ref_cblas_nrm2)( f77_int, const T *, f77_int ); + typedef RT (*Fptr_ref_cblas_nrm2)( f77_int, const T *, f77_int ); Fptr_ref_cblas_nrm2 ref_cblas_nrm2; // Call C function diff --git a/gtestsuite/testsuite/inc/check_error.h b/gtestsuite/testsuite/inc/check_error.h index 25e3c16204..4f6d848855 100644 --- a/gtestsuite/testsuite/inc/check_error.h +++ b/gtestsuite/testsuite/inc/check_error.h @@ 
-309,7 +309,8 @@ void computediff( T blis_sol, T ref_sol, bool nan_inf_check = false ) template void computediff( T blis_sol, T ref_sol, double thresh, bool nan_inf_check = false ) { - ComparisonHelper comp_helper(SCALAR, thresh); + ComparisonHelper comp_helper(SCALAR, thresh); + comp_helper.nan_inf_check = nan_inf_check; ASSERT_PRED_FORMAT3(NumericalComparison, blis_sol, ref_sol, comp_helper); } diff --git a/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp deleted file mode 100644 index 898f4fee5c..0000000000 --- a/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include -#include "test_nrm2.h" - -class CNrm2Test : - public ::testing::TestWithParam> {}; - -TEST_P( CNrm2Test, RandomData ) -{ - using T = scomplex; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). - //---------------------------------------------------------- - // vector length: - gtint_t n = std::get<0>(GetParam()); - // stride size for x: - gtint_t incx = std::get<1>(GetParam()); - - // Set the threshold for the errors: - double thresh = std::sqrt(n)*testinghelpers::getEpsilon(); - - //---------------------------------------------------------- - // Call test body using these parameters - //---------------------------------------------------------- - test_nrm2( n, incx, thresh ); -} - -// Prints the test case combination -class CNrm2TestPrint { -public: - std::string operator()( - testing::TestParamInfo> str) const { - gtint_t n = std::get<0>(str.param); - gtint_t incx = std::get<1>(str.param); -#ifdef TEST_BLAS - std::string str_name = "scnrm2_"; -#elif TEST_CBLAS - std::string str_name = "cblas_scnrm2"; -#else //#elif TEST_BLIS_TYPED - std::string str_name = "bli_cnormfv"; -#endif - str_name = str_name + "_" + std::to_string(n); - std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); - str_name = str_name + "_" + incx_str; - return str_name; - } -}; - -// Black box testing. -INSTANTIATE_TEST_SUITE_P( - Blackbox, - CNrm2Test, - ::testing::Combine( - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1), gtint_t(2) -#ifndef TEST_BLIS_TYPED - , gtint_t(-1), gtint_t(-2) -#endif - ) // stride size for x - ), - ::CNrm2TestPrint() - ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp new file mode 100644 index 0000000000..469385f1a1 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp @@ -0,0 +1,191 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <gtest/gtest.h> +#include "test_nrm2.h" + +class dnrm2_EVT : + public ::testing::TestWithParam<std::tuple<gtint_t, gtint_t, gtint_t, double, gtint_t, double>> {}; + +TEST_P( dnrm2_EVT, EVT ) +{ + using T = double; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // vector length: + gtint_t n = std::get<0>(GetParam()); + // stride size for x: + gtint_t incx = std::get<1>(GetParam()); + // index with extreme value iexval. + gtint_t i = std::get<2>(GetParam()); + T iexval = std::get<3>(GetParam()); + // index with extreme value jexval. + gtint_t j = std::get<4>(GetParam()); + T jexval = std::get<5>(GetParam()); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_nrm2(n, incx, i, iexval, j, jexval); +} + +// Prints the test case combination +class dnrm2_TestPrint { +public: + std::string operator()( + testing::TestParamInfo<std::tuple<gtint_t, gtint_t, gtint_t, double, gtint_t, double>> str) const { + // vector length: + gtint_t n = std::get<0>(str.param); + // stride size for x: + gtint_t incx = std::get<1>(str.param); + // index with extreme value iexval. + gtint_t i = std::get<2>(str.param); + double iexval = std::get<3>(str.param); + // index with extreme value jexval.
+ gtint_t j = std::get<4>(str.param); + double jexval = std::get<5>(str.param); +#ifdef TEST_BLAS + std::string str_name = "dnrm2_"; +#elif TEST_CBLAS + std::string str_name = "cblas_dnrm2"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "bli_dnormfv"; +#endif + str_name = str_name + "_" + std::to_string(n); + std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name = str_name + "_" + incx_str; + str_name = str_name + "_i" + std::to_string(i); + std::string iexval_str = getValueString(iexval); + str_name = str_name + "_" + iexval_str; + str_name = str_name + "_j" + std::to_string(j); + std::string jexval_str = getValueString(jexval); + str_name = str_name + "_" + jexval_str; + return str_name; + } +}; + +static double NaN = std::numeric_limits<double>::quiet_NaN(); +static double Inf = std::numeric_limits<double>::infinity(); + +/** + * dnrm2 implementation is composed of two parts: + * - vectorized path for n>4 + * - for-loop for multiples of 8 (F8) + * - for-loop for multiples of 4 (F4) + * - scalar path for n<=4 (S) + */ + +// Test for scalar path. +// Testing for jexval=1.0, means that we test only one NaN/Inf value. +// for jexval also being an extreme value, we test all combinations +// of having first a NaN and then an Inf and so on.
+INSTANTIATE_TEST_SUITE_P( + scalar, + dnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(3)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(0), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(2), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::dnrm2_TestPrint() + ); + +INSTANTIATE_TEST_SUITE_P( + vector_F8, + dnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(8)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(3), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(6), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::dnrm2_TestPrint() + ); + +// To test the second for-loop (F4), we use n = 12 +// and ensure that the extreme values are on or after index 8. +INSTANTIATE_TEST_SUITE_P( + vector_F4, + dnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(12)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(9), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(11), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::dnrm2_TestPrint() + ); + +// Now let's check the combination of a vectorized path and +// the scalar path, by putting an extreme value in each +// to check that the checks are integrated correctly. 
+INSTANTIATE_TEST_SUITE_P( + vector_scalar, + dnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(10)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(5), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(8), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::dnrm2_TestPrint() + ); + diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp index 2ea60db522..419c8499d7 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp @@ -80,17 +80,36 @@ class dnrm2TestPrint { } }; -// Black box testing. +/** + * dnrm2 implementation is composed by two parts: + * - vectorized path for n>4 + * - for-loop for multiples of 8 (F8) + * - for-loop for multiples of 4 (F4) + * - scalar path for n<=4 (S) +*/ INSTANTIATE_TEST_SUITE_P( - Blackbox, + AT, dnrm2Test, ::testing::Combine( - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
- ::testing::Values(gtint_t(1), gtint_t(2) + // m size of vector + ::testing::Values(gtint_t(1), // trivial case n=1 + gtint_t(3), // will only go through S + gtint_t(8), // 1*8 - will only go through F8 + gtint_t(24), // 3*8 - will go through F8 + gtint_t(34), // 4*8 + 2 - will go through F8 & S + gtint_t(52), // 6*8 + 4 - will go through F8 & F4 + gtint_t(71), // 8*8 + 4 + 3 - will go through F8 & F4 & S + gtint_t(89), // a few bigger numbers + gtint_t(122), + gtint_t(185), + gtint_t(217) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(4) #ifndef TEST_BLIS_TYPED - ,gtint_t(-1), gtint_t(-2) + , gtint_t(-1), gtint_t(-5) #endif ) // stride size for x ), ::dnrm2TestPrint() - ); + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp new file mode 100644 index 0000000000..4615a18356 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp @@ -0,0 +1,190 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "test_nrm2.h" + +class dznrm2_EVT : + public ::testing::TestWithParam>{}; + +TEST_P( dznrm2_EVT, EVT ) +{ + using T = dcomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // vector length: + gtint_t n = std::get<0>(GetParam()); + // stride size for x: + gtint_t incx = std::get<1>(GetParam()); + // index with extreme value iexval. + gtint_t i = std::get<2>(GetParam()); + T iexval = std::get<3>(GetParam()); + // index with extreme value jexval. 
+ gtint_t j = std::get<4>(GetParam()); + T jexval = std::get<5>(GetParam()); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_nrm2(n, incx, i, iexval, j, jexval); +} + +// Prints the test case combination +class dznrm2_TestPrint{ +public: + std::string operator()( + testing::TestParamInfo> str) const { + // vector length: + gtint_t n = std::get<0>(str.param); + // stride size for x: + gtint_t incx = std::get<1>(str.param); + // index with extreme value iexval. + gtint_t i = std::get<2>(str.param); + dcomplex iexval = std::get<3>(str.param); + // index with extreme value jexval. + gtint_t j = std::get<4>(str.param); + dcomplex jexval = std::get<5>(str.param); +#ifdef TEST_BLAS + std::string str_name = "dznrm2_"; +#elif TEST_CBLAS + std::string str_name = "cblas_dznrm2"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "bli_znormfv"; +#endif + str_name = str_name + "_" + std::to_string(n); + std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name = str_name + "_" + incx_str; + str_name = str_name + "_i" + std::to_string(i); + std::string iexval_str = "_Re_" + getValueString(iexval.real) + "_Im_" + getValueString(iexval.imag); + str_name = str_name + iexval_str; + str_name = str_name + "_j" + std::to_string(j); + std::string jexval_str = "_Re_" + getValueString(jexval.real) + "_Im_" + getValueString(jexval.imag); + str_name = str_name + jexval_str; + return str_name; + } +}; + +static double NaN = std::numeric_limits::quiet_NaN(); +static double Inf = std::numeric_limits::infinity(); +/** + * dznrm2 implementation is composed by two parts: + * - vectorized path for n>2 + * - for-loop for multiples of 4 (F4) + * - for-loop for multiples of 2 (F2) + * - scalar path for n<=2 (S) +*/ + +// Test for scalar path. 
+// Testing for jexval=(1.0, 2.0), means that we test only one NaN/Inf value. +// for jexval also being an extreme value, we test all combinations +// of having first a NaN and then an Inf and so on. +INSTANTIATE_TEST_SUITE_P( + scalar, + dznrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(2)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(0), + // iexval + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), + ::testing::Values(1), + ::testing::Values(dcomplex{1.0, 2.0}, dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}) + ), + ::dznrm2_TestPrint() + ); + +INSTANTIATE_TEST_SUITE_P( + vector_F4, + dznrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(4)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(1), + // iexval + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), + ::testing::Values(3), + ::testing::Values(dcomplex{1.0, 2.0}, dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}) + ), + ::dznrm2_TestPrint() + ); + +// To test the second for-loop (F2), we use n = 6 +// and ensure that the extreme values are on or after index 4. 
+INSTANTIATE_TEST_SUITE_P( + vector_F2, + dznrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(6)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(4), + // iexval + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), + ::testing::Values(5), + ::testing::Values(dcomplex{1.0, 2.0}, dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}) + ), + ::dznrm2_TestPrint() + ); + +// Now let's check the combination of a vectorized path and +// the scalar path, by putting an extreme value in each +// to check that the checks are integrated correctly. +INSTANTIATE_TEST_SUITE_P( + vector_scalar, + dznrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(7)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(2), + // iexval + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), + ::testing::Values(6), + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}) + ), + ::dznrm2_TestPrint() + ); + diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp new file mode 100644 index 0000000000..e6477ff427 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp @@ -0,0 +1,81 @@ +#include +#include "test_nrm2.h" + +class dznrm2Test : + public ::testing::TestWithParam> {}; + +TEST_P( dznrm2Test, RandomData ) +{ + using T = dcomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // vector length: + gtint_t n = std::get<0>(GetParam()); + // stride size for x: + gtint_t incx = std::get<1>(GetParam()); + + // Set the threshold for the errors: + double thresh = 3*testinghelpers::getEpsilon(); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_nrm2(n, incx, thresh); +} + +// Prints the test case combination +class dznrm2TestPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + gtint_t n = std::get<0>(str.param); + gtint_t incx = std::get<1>(str.param); +#ifdef TEST_BLAS + std::string str_name = "dznrm2_"; +#elif TEST_CBLAS + std::string str_name = "cblas_dznrm2"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "bli_znormfv"; +#endif + str_name = str_name + "_" + std::to_string(n); + std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name = str_name + "_" + incx_str; + return str_name; + } +}; + +/** + * dznrm2 implementation is composed of two parts: + * - vectorized path for n>2 + * - for-loop for multiples of 4 (F4) + * - for-loop for multiples of 2 (F2) + * - scalar path for n<=2 (S) +*/ +INSTANTIATE_TEST_SUITE_P( + AT, + dznrm2Test, + ::testing::Combine( + // n: size of vector + ::testing::Values(gtint_t(1), // trivial case n=1 + gtint_t(2), // will only go through S + gtint_t(4), // 1*4 - will only go through F4 + gtint_t(12), // 3*4 - will go through F4 + gtint_t(17), // 4*4 + 1 - will go through F4 & S + gtint_t(22), // 5*4 + 2 - will go through F4 & F2 + gtint_t(35), // 8*4 + 2 + 1 - will go through F4 & F2 & S + gtint_t(78), // a few bigger numbers + gtint_t(112), + gtint_t(187), + gtint_t(213) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3) +#ifndef TEST_BLIS_TYPED + , gtint_t(-1), gtint_t(-7) +#endif + ) + ), + ::dznrm2TestPrint() + 
); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/nrm2.h b/gtestsuite/testsuite/util/nrm2/nrm2.h index 6e9de9e547..537cf27f43 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/nrm2.h @@ -38,17 +38,24 @@ #include "common/testing_helpers.h" /** - * @brief Overload bli_*normfv() functions using typed_nrm2. - * Will be used in testing and especially in TYPED_TESTs. - * Computes the Euclidean norm of x. + * @brief Computes the Euclidean norm of x. + * + * Euclidean norm of a vector x is defined as nrm2 = sqrt(x'*x). + * In case a vector element is NaN, nrm2 must be NaN. + * In case a vector element is inf, and there is no element which is NaN, nrm2 must be inf. + * If n <= 0, nrm2 returns zero. + * If incx = 0, nrm2 returns sqrt(n*abs(x[0])**2). + * * @param[in] n vector length * @param[in] x pointer which points to the first element of x * @param[in] incx increment of x * @return the Euclidean norm of x + * + * */ -template -static Treal nrm2_(gtint_t n, T* x, gtint_t incx){ +template::real_type> +static RT nrm2_(gtint_t n, T* x, gtint_t incx){ if constexpr (std::is_same::value) return snrm2_( &n, x, &incx ); else if constexpr (std::is_same::value) @@ -61,8 +68,8 @@ static Treal nrm2_(gtint_t n, T* x, gtint_t incx){ throw std::runtime_error("Error in testsuite/level1/nrm2.h: Invalid typename in nrm2_()."); } -template -static Treal cblas_nrm2(gtint_t n, T* x, gtint_t incx){ +template::real_type> +static RT cblas_nrm2(gtint_t n, T* x, gtint_t incx){ if constexpr (std::is_same::value) return cblas_snrm2( n, x, incx ); else if constexpr (std::is_same::value) @@ -75,9 +82,9 @@ static Treal cblas_nrm2(gtint_t n, T* x, gtint_t incx){ throw std::runtime_error("Error in testsuite/level1/nrm2.h: Invalid typename in cblas_nrm2()."); } -template -static Treal typed_nrm2(gtint_t n, T* x, gtint_t incx){ - Treal nrm; +template::real_type> +static RT typed_nrm2(gtint_t n, T* x, gtint_t incx){ + RT nrm; if constexpr 
(std::is_same::value) bli_snormfv(n, x, incx, &nrm); else if constexpr (std::is_same::value) @@ -91,15 +98,15 @@ static Treal typed_nrm2(gtint_t n, T* x, gtint_t incx){ return nrm; } -template -static Treal nrm2(gtint_t n, T* x, gtint_t incx) +template::real_type> +static RT nrm2(gtint_t n, T* x, gtint_t incx) { #ifdef TEST_BLAS - return nrm2_(n, x, incx); + return nrm2_(n, x, incx); #elif TEST_CBLAS - return cblas_nrm2(n, x, incx); + return cblas_nrm2(n, x, incx); #elif TEST_BLIS_TYPED - return typed_nrm2(n, x, incx); + return typed_nrm2(n, x, incx); #else throw std::runtime_error("Error in testsuite/level1/axpyv.h: No interfaces are set to be tested."); #endif diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp new file mode 100644 index 0000000000..ac8f104697 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp @@ -0,0 +1,73 @@ +#include +#include "test_nrm2.h" + +/** + * Testing edge input parameters. + * + * zero n should return 0. + * zero incx should return sqrt(n*abs(x[0])**2). +*/ + +// Early return. +template +class nrm2_ERS : public ::testing::Test {}; +typedef ::testing::Types TypeParam; +TYPED_TEST_SUITE(nrm2_ERS, TypeParam); + +TYPED_TEST(nrm2_ERS, zero_n) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 0; + gtint_t incx = 1; + // initialize norm to ensure that it is set to zero from nrm2 and it does not simply return. + RT blis_norm = 19.0; + // using nullptr since x should not be accessed anyway. + // If "x" is accessed before return then nrm2 would segfault. + blis_norm = nrm2(n, nullptr, incx); + RT ref_norm = testinghelpers::ref_nrm2(n, nullptr, incx); + computediff(blis_norm, ref_norm); +} + +// Edge case where it actually does not return early. +// Since there are 2 different paths, vectorized and scalar, +// we break this into 2 tests, once for each case. 
+template +class nrm2_EIC : public ::testing::Test {}; +typedef ::testing::Types TypeParam; +TYPED_TEST_SUITE(nrm2_EIC, TypeParam); + +TYPED_TEST(nrm2_EIC, zero_incx_scalar) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 2; + gtint_t incx = 0; + std::vector x(n); + for (auto &xi : x) + testinghelpers::initone(xi); + // For incx=0, nrm2 iterates through the first element n-times. + // So, we initialize x[0] with a different value than the rest + // of the elements. + x[0] = T{2.0}*x[0]; + RT blis_norm = 19.0; + blis_norm = nrm2(n, x.data(), incx); + RT ref_norm = testinghelpers::ref_nrm2(n, x.data(), incx); + computediff(blis_norm, ref_norm); +} + +TYPED_TEST(nrm2_EIC, zero_incx_vectorized) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 64; + gtint_t incx = 0; + std::vector x(n); + for (auto &xi : x) + testinghelpers::initone(xi); + // For incx=0, nrm2 iterates through the first element n-times. + // So, we initialize x[0] with a different value than the rest + // of the elements. + x[0] = T{2.0}*x[0]; + RT blis_norm = 19.0; + blis_norm = nrm2(n, x.data(), incx); + RT ref_norm = testinghelpers::ref_nrm2(n, x.data(), incx); + computediff(blis_norm, ref_norm); +} \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp deleted file mode 100644 index 7080a144ea..0000000000 --- a/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include -#include "test_nrm2.h" - -template -class xnrm2 : public ::testing::Test {}; -typedef ::testing::Types TypeParam; -TYPED_TEST_SUITE(xnrm2, TypeParam); - -TYPED_TEST(xnrm2, zeroFP) { - using T = TypeParam; - T x = T(0); - - T norm = nrm2(1, &x, 1); - EXPECT_EQ(0, norm); -} - -TYPED_TEST(xnrm2, minFP) { - using T = TypeParam; - T x = std::numeric_limits::min(); - - T norm = nrm2(1, &x, 1); - EXPECT_EQ(x, norm); -} - -TYPED_TEST(xnrm2, maxFP) { - using T = TypeParam; - T x = std::numeric_limits::max(); - - T norm = nrm2(1, &x, 1); - EXPECT_EQ(x, norm); -} - -TEST(dnrm2, largeDouble) { - using T = double; - gtint_t n = 2; - std::vector x{3e300, 4e300}, y{-4e300, -3e300}; - - T norm = nrm2(n, x.data(), 1); - EXPECT_EQ(5e300, norm); - - norm = nrm2(n, y.data(), 1); - EXPECT_EQ(5e300, norm); -} \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp new file mode 100644 index 0000000000..a4a8abf6af --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp @@ -0,0 +1,27 @@ +#include +#include "test_nrm2.h" +#include "common/wrong_inputs_helpers.h" + +/** + * Testing invalid/incorrect input parameters. + * + * That is only negative n for this API. Zero incx and zero n is allowed. +*/ +template +class nrm2_IIT : public ::testing::Test {}; +typedef ::testing::Types TypeParam; +TYPED_TEST_SUITE(nrm2_IIT, TypeParam); + +// Adding namespace to get default parameters from testinghelpers/common/wrong_input_helpers.h. +using namespace testinghelpers::IIT; + +TYPED_TEST(nrm2_IIT, negative_n) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + T x = T{-3.7}; + // initialize blis norm with garbage. 
+ RT blis_norm = -4.2; + blis_norm = nrm2(-2, &x, INC); + + computediff(blis_norm, 0.0); +} \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp new file mode 100644 index 0000000000..7ab2f99c91 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp @@ -0,0 +1,84 @@ +#include +#include "test_nrm2.h" + +template +class OUT_nrm2 : public ::testing::Test {}; +typedef ::testing::Types TypeParam; +TYPED_TEST_SUITE(OUT_nrm2, TypeParam); + +// Testing for max representable number to see if overflow is handled correctly. +TYPED_TEST(OUT_nrm2, maxFP_scalar) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + + RT maxval = std::numeric_limits::max(); + T x = T{maxval}; + + RT norm = nrm2(1, &x, 1); + computediff(maxval, norm); +} +TYPED_TEST(OUT_nrm2, maxFP_vectorized) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 64; + std::vector x(n, T{0}); + RT maxval = std::numeric_limits::max(); + x[17] = T{maxval}; + RT norm = nrm2(n, x.data(), 1); + computediff(maxval, norm); +} + +// Testing for min representable number to see if underflow is handled correctly. +TYPED_TEST(OUT_nrm2, minFP_scalar) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + + RT minval = std::numeric_limits::min(); + T x = T{minval}; + RT norm = nrm2(1, &x, 1); + computediff(minval, norm); +} +TYPED_TEST(OUT_nrm2, minFP_vectorized) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 64; + std::vector x(n, T{0}); + RT minval = std::numeric_limits::min(); + x[17] = T{minval}; + RT norm = nrm2(n, x.data(), 1); + computediff(minval, norm); +} + +// Since there are 2 different paths, vectorized and scalar, +// we break this into 2 tests, once for each case. 
+TYPED_TEST(OUT_nrm2, zeroFP_scalar) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + T x = T{0}; + + RT norm = nrm2(1, &x, 1); + computediff(0, norm); +} +TYPED_TEST(OUT_nrm2, zeroFP_vectorized) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 64; + std::vector x(n, T{0}); + + RT norm = nrm2(n, x.data(), 1); + computediff(0, norm); +} + +// Specific test case used by an ISV. +// Checks for overflow. +TEST(dnrm2, largeDouble) { + using T = double; + gtint_t n = 2; + std::vector x{3e300, 4e300}, y{-4e300, -3e300}; + + T norm = nrm2(n, x.data(), 1); + computediff(5e300, norm); + + norm = nrm2(n, y.data(), 1); + computediff(5e300, norm); +} diff --git a/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp new file mode 100644 index 0000000000..fa1d7abc97 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp @@ -0,0 +1,211 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "test_nrm2.h" + +class scnrm2_EVT : + public ::testing::TestWithParam>{}; + +TEST_P( scnrm2_EVT, EVT ) +{ + using T = scomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // vector length: + gtint_t n = std::get<0>(GetParam()); + // stride size for x: + gtint_t incx = std::get<1>(GetParam()); + // index with extreme value iexval. + gtint_t i = std::get<2>(GetParam()); + T iexval = std::get<3>(GetParam()); + // index with extreme value jexval. 
+ gtint_t j = std::get<4>(GetParam()); + T jexval = std::get<5>(GetParam()); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_nrm2(n, incx, i, iexval, j, jexval); +} + +// Prints the test case combination +class scnrm2_TestPrint{ +public: + std::string operator()( + testing::TestParamInfo> str) const { + // vector length: + gtint_t n = std::get<0>(str.param); + // stride size for x: + gtint_t incx = std::get<1>(str.param); + // index with extreme value iexval. + gtint_t i = std::get<2>(str.param); + scomplex iexval = std::get<3>(str.param); + // index with extreme value jexval. + gtint_t j = std::get<4>(str.param); + scomplex jexval = std::get<5>(str.param); +#ifdef TEST_BLAS + std::string str_name = "scnrm2_"; +#elif TEST_CBLAS + std::string str_name = "cblas_scnrm2"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "bli_cnormfv"; +#endif + str_name = str_name + "_" + std::to_string(n); + std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name = str_name + "_" + incx_str; + str_name = str_name + "_i" + std::to_string(i); + std::string iexval_str = "_Re_" + getValueString(iexval.real) + "_Im_" + getValueString(iexval.imag); + str_name = str_name + iexval_str; + str_name = str_name + "_j" + std::to_string(j); + std::string jexval_str = "_Re_" + getValueString(jexval.real) + "_Im_" + getValueString(jexval.imag); + str_name = str_name + jexval_str; + return str_name; + } +}; + +static float NaN = std::numeric_limits::quiet_NaN(); +static float Inf = std::numeric_limits::infinity(); +/** + * scnrm2 implementation is composed by two parts: + * - vectorized path for n>=64 + * - for-loop for multiples of 16 (F16) + * - for-loop for multiples of 12 (F12) + * - for-loop for multiples of 8 (F8) + * - scalar path for n<64 (S) +*/ + +// Test for scalar path. 
+// Testing for jexval=(1.0, 2.0), means that we test only one NaN/Inf value. +// for jexval also being an extreme value, we test all combinations +// of having first a NaN and then an Inf and so on. +INSTANTIATE_TEST_SUITE_P( + scalar, + scnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(2)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(0), + // iexval + ::testing::Values(scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}, scomplex{NaN, Inf}, scomplex{Inf, NaN}), + ::testing::Values(1), + ::testing::Values(scomplex{1.0, 2.0}, scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}) + ), + ::scnrm2_TestPrint() + ); + +INSTANTIATE_TEST_SUITE_P( + vector_F16, + scnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(64)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(10), + // iexval + ::testing::Values(scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}, scomplex{NaN, Inf}, scomplex{Inf, NaN}), + ::testing::Values(30), + ::testing::Values(scomplex{1.0, 2.0}, scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}) + ), + ::scnrm2_TestPrint() + ); + +// To test the second for-loop (F12), we use n = 76 = 4*16+12 +// and ensure that the extreme values are on or after index 64. 
+INSTANTIATE_TEST_SUITE_P( + vector_F12, + scnrm2_EVT, + ::testing::Combine( + // n: size of vector + ::testing::Values(gtint_t(76)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(68), + // iexval + ::testing::Values(scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}, scomplex{NaN, Inf}, scomplex{Inf, NaN}), + ::testing::Values(70), + ::testing::Values(scomplex{1.0, 2.0}, scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}) + ), + ::scnrm2_TestPrint() + ); + +// To test the third for-loop (F8), we use n = 72 = 4*16+8 +// and ensure that the extreme values are on or after index 64. +INSTANTIATE_TEST_SUITE_P( + vector_F8, + scnrm2_EVT, + ::testing::Combine( + // n: size of vector + ::testing::Values(gtint_t(72)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(66), + // iexval + ::testing::Values(scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}, scomplex{NaN, Inf}, scomplex{Inf, NaN}), + ::testing::Values(70), + ::testing::Values(scomplex{1.0, 2.0}, scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}) + ), + ::scnrm2_TestPrint() + ); + +// Now let's check the combination of a vectorized path and +// the scalar path, by putting an extreme value in each +// to check that the checks are integrated correctly. 
+INSTANTIATE_TEST_SUITE_P( + vector_scalar, + scnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(79)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(25), + // iexval + ::testing::Values(scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}, scomplex{NaN, Inf}, scomplex{Inf, NaN}), + ::testing::Values(68), + ::testing::Values(scomplex{NaN, 1.0}, scomplex{Inf, 9.0}, scomplex{-1.0, -Inf}, scomplex{2.0, NaN}) + ), + ::scnrm2_TestPrint() + ); + diff --git a/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp new file mode 100644 index 0000000000..0204a8335a --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp @@ -0,0 +1,82 @@ +#include +#include "test_nrm2.h" + +class scnrm2Test : + public ::testing::TestWithParam> {}; + +TEST_P( scnrm2Test, RandomData ) +{ + using T = scomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // vector length: + gtint_t n = std::get<0>(GetParam()); + // stride size for x: + gtint_t incx = std::get<1>(GetParam()); + + // Set the threshold for the errors: + double thresh = std::sqrt(n)*testinghelpers::getEpsilon(); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_nrm2(n, incx, thresh); +} + +// Prints the test case combination +class scnrm2TestPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + gtint_t n = std::get<0>(str.param); + gtint_t incx = std::get<1>(str.param); +#ifdef TEST_BLAS + std::string str_name = "scnrm2_"; +#elif TEST_CBLAS + std::string str_name = "cblas_scnrm2"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "bli_cnormfv"; +#endif + str_name = str_name + "_" + std::to_string(n); + std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name = str_name + "_" + incx_str; + return str_name; + } +}; + +/** + * scnrm2 implementation is composed by two parts: + * - vectorized path for n>=64 + * - for-loop for multiples of 16 (F16) + * - for-loop for multiples of 12 (F12) + * - for-loop for multiples of 8 (F8) + * - scalar path for n<64 (S) +*/ +INSTANTIATE_TEST_SUITE_P( + AT, + scnrm2Test, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(1), // trivial case n=1 + gtint_t(35), // will only go through S + gtint_t(64), // 4*16 - will only go through F16 + gtint_t(67), // 4*16 + 3 - will go through F16 & S + gtint_t(72), // 4*16 + 8 - will go through F16 & F8 + gtint_t(75), // 4*16 + 8 + 3 - will go through F16 & F8 & S + gtint_t(76), // 4*16 + 12 - will go through F16 & F12 + gtint_t(78), // 4*16 + 12 + 2 - will go through F16 & F12 & S + gtint_t(112), // a few bigger numbers + gtint_t(187), + gtint_t(213) + ), + // stride size for x + 
::testing::Values(gtint_t(1), gtint_t(3) +#ifndef TEST_BLIS_TYPED + , gtint_t(-1), gtint_t(-7) +#endif + ) + ), + ::scnrm2TestPrint() + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp new file mode 100644 index 0000000000..8de5e6aac2 --- /dev/null +++ b/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp @@ -0,0 +1,215 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "test_nrm2.h" + +class snrm2_EVT : + public ::testing::TestWithParam> {}; + +TEST_P( snrm2_EVT, EVT ) +{ + using T = float; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // vector length: + gtint_t n = std::get<0>(GetParam()); + // stride size for x: + gtint_t incx = std::get<1>(GetParam()); + // index with extreme value iexval. + gtint_t i = std::get<2>(GetParam()); + T iexval = std::get<3>(GetParam()); + // index with extreme value jexval. + gtint_t j = std::get<4>(GetParam()); + T jexval = std::get<5>(GetParam()); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_nrm2(n, incx, i, iexval, j, jexval); +} + +// Prints the test case combination +class snrm2_TestPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + // vector length: + gtint_t n = std::get<0>(str.param); + // stride size for x: + gtint_t incx = std::get<1>(str.param); + // index with extreme value iexval. + gtint_t i = std::get<2>(str.param); + float iexval = std::get<3>(str.param); + // index with extreme value jexval. 
+ gtint_t j = std::get<4>(str.param); + float jexval = std::get<5>(str.param); +#ifdef TEST_BLAS + std::string str_name = "snrm2_"; +#elif TEST_CBLAS + std::string str_name = "cblas_snrm2"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "bli_snormfv"; +#endif + str_name = str_name + "_" + std::to_string(n); + std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name = str_name + "_" + incx_str; + str_name = str_name + "_i" + std::to_string(i); + std::string iexval_str = getValueString(iexval); + str_name = str_name + "_" + iexval_str; + str_name = str_name + "_j" + std::to_string(j); + std::string jexval_str = getValueString(jexval); + str_name = str_name + "_" + jexval_str; + return str_name; + } +}; + +static float NaN = std::numeric_limits::quiet_NaN(); +static float Inf = std::numeric_limits::infinity(); + +/** + * Note: snrm2 scalar ONLY implementation is used, but we write the test + * using values that worked for the vectorized path for the future. + * + * scnrm2 implementation is composed by two parts: + * - vectorized path for n>=64 + * - for-loop for multiples of 32 (F32) + * - for-loop for multiples of 24 (F24) + * - for-loop for multiples of 16 (F16) + * - scalar path for n<64 (S) +*/ + +// Test for scalar path. +// Testing for jexval=1.0, means that we test only one NaN/Inf value. +// for jexval also being an extreme value, we test all combinations +// of having first a NaN and then an Inf and so on. 
+INSTANTIATE_TEST_SUITE_P( + scalar, + snrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(3)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(0), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(2), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::snrm2_TestPrint() + ); + +INSTANTIATE_TEST_SUITE_P( + vector_F32, + snrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(64)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(13), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(26), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::snrm2_TestPrint() + ); + +// To test the second for-loop (F24), we use n = 88 = 2*32+24 +// and ensure that the extreme values are on or after index 64. +INSTANTIATE_TEST_SUITE_P( + vector_F24, + snrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(88)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(70), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(80), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::snrm2_TestPrint() + ); + +// To test the second for-loop (F16), we use n = 80 = 2*32+16 +// and ensure that the extreme values are on or after index 64. 
+INSTANTIATE_TEST_SUITE_P( + vector_F16, + snrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(80)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(70), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(75), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::snrm2_TestPrint() + ); + +// Now let's check the combination of a vectorized path and +// the scalar path, by putting an extreme value in each +// to check that the checks are integrated correctly. +INSTANTIATE_TEST_SUITE_P( + vector_scalar, + snrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(68)), + // stride size for x + ::testing::Values(gtint_t(1)), + // i : index of x that has value iexval + ::testing::Values(5), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(65), + ::testing::Values(NaN, Inf, -Inf) + ), + ::snrm2_TestPrint() + ); + diff --git a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp index 6dcd793253..289e387c16 100644 --- a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp @@ -80,15 +80,38 @@ class snrm2TestPrint { } }; -// Black box testing. +/** + * Note: snrm2 scalar ONLY implementation is used, but we write the test + * using values that worked for the vectorized path for the future. + * + * scnrm2 implementation is composed by two parts: + * - vectorized path for n>=64 + * - for-loop for multiples of 32 (F32) + * - for-loop for multiples of 24 (F24) + * - for-loop for multiples of 16 (F16) + * - scalar path for n<64 (S) +*/ INSTANTIATE_TEST_SUITE_P( - Blackbox, + AT, snrm2Test, ::testing::Combine( - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
- ::testing::Values(gtint_t(1), gtint_t(2) + // m size of vector + ::testing::Values(gtint_t(1), // trivial case n=1 + gtint_t(35), // will only go through S + gtint_t(64), // 2*32 - will only go through F32 + gtint_t(76), // 2*32 + 12 - will go through F32 & S + gtint_t(80), // 2*32 + 16 - will go through F32 & F16 + gtint_t(85), // 2*32 + 16 + 5 - will go through F32 & F16 & S + gtint_t(88), // 2*32 + 24 - will go through F32 & F24 + gtint_t(91), // 2*32 + 24 + 3 - will go through F32 & F24 & S + gtint_t(124), // a few bigger numbers + gtint_t(167), + gtint_t(259) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3) #ifndef TEST_BLIS_TYPED - ,gtint_t(-1), gtint_t(-2) + , gtint_t(-1), gtint_t(-5) #endif ) // stride size for x ), diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h index 6964382aee..9ed6e47adc 100644 --- a/gtestsuite/testsuite/util/nrm2/test_nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h @@ -35,33 +35,82 @@ #pragma once #include "nrm2.h" +#include #include "util/ref_nrm2.h" #include "inc/check_error.h" +// Used for generic tests with random values in x. template void test_nrm2( gtint_t n, gtint_t incx, double thresh ) { + // Get real type from T. + using RT = typename testinghelpers::type_info::real_type; //---------------------------------------------------------- // Initialize vectors with random numbers. //---------------------------------------------------------- - //std::vector x( testinghelpers::buff_dim( n, incx ) ); - //testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data() ); std::vector x = testinghelpers::get_random_vector( -10, -10, n, incx ); //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- - // Create a copy of y so that we can check reference results. 
- using real = typename testinghelpers::type_info::real_type; - real norm_ref = testinghelpers::ref_nrm2( n, x.data(), incx ); + RT norm_ref = testinghelpers::ref_nrm2( n, x.data(), incx ); //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - real norm = nrm2( n, x.data(), incx ); + RT norm = nrm2(n, x.data(), incx); //---------------------------------------------------------- // Compute error. //---------------------------------------------------------- - computediff( norm, norm_ref, thresh ); + computediff( norm, norm_ref, thresh ); +} + +// Test body used for extreme value testing, where we want to test +// cases where two extreme values are present. +// i is the index with corresponding extreme value iexval. +// j is the index with corresponding extreme value jexval. +template +void test_nrm2( gtint_t n, gtint_t incx, gtint_t i, T iexval, gtint_t j = 0, T jexval = T{1.0}) +{ + // Get real type from T. + using RT = typename testinghelpers::type_info::real_type; + //---------------------------------------------------------- + // Initialize vectors with random numbers. + //---------------------------------------------------------- + std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, ELEMENT_TYPE); + // Initialize ith element of vector x to iexval. + x[i*incx] = iexval; + // Initialize jth element of vector x to jexval. + x[j*incx] = jexval; + //---------------------------------------------------------- + // Call reference implementation to get ref results. + //---------------------------------------------------------- + RT norm_ref = testinghelpers::ref_nrm2( n, x.data(), incx ); + + //---------------------------------------------------------- + // Call BLIS function. + //---------------------------------------------------------- + RT norm = nrm2(n, x.data(), incx); + + //---------------------------------------------------------- + // Compute error. 
+ //---------------------------------------------------------- + // Compare using NaN/Inf checks. + computediff( norm, norm_ref, true ); +} + +// Helper function that returns a string with the correct NaN/Inf printing +// so that we can print the test names correctly from using parametrized testing. +template +std::string getValueString(T exval) +{ + std::string exval_str; + if(std::isnan(exval)) + exval_str = "nan"; + else if(std::isinf(exval)) + exval_str = (exval > 0) ? "inf" : "minus_inf"; + else + exval_str = ( exval > 0) ? std::to_string(int(exval)) : "minus_" + std::to_string(int(std::abs(exval))); + return exval_str; } \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp deleted file mode 100644 index cec642871f..0000000000 --- a/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include -#include "test_nrm2.h" - -class znrm2Test : - public ::testing::TestWithParam> {}; - -TEST_P( znrm2Test, RandomData ) -{ - using T = dcomplex; - //---------------------------------------------------------- - // Initialize values from the parameters passed through - // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
- //---------------------------------------------------------- - // vector length: - gtint_t n = std::get<0>(GetParam()); - // stride size for x: - gtint_t incx = std::get<1>(GetParam()); - - // Set the threshold for the errors: - double thresh = testinghelpers::getEpsilon(); - - //---------------------------------------------------------- - // Call test body using these parameters - //---------------------------------------------------------- - test_nrm2( n, incx, thresh ); -} - -// Prints the test case combination -class znrm2TestPrint { -public: - std::string operator()( - testing::TestParamInfo> str) const { - gtint_t n = std::get<0>(str.param); - gtint_t incx = std::get<1>(str.param); -#ifdef TEST_BLAS - std::string str_name = "dznrm2_"; -#elif TEST_CBLAS - std::string str_name = "cblas_dznrm2"; -#else //#elif TEST_BLIS_TYPED - std::string str_name = "bli_znormfv"; -#endif - str_name = str_name + "_" + std::to_string(n); - std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); - str_name = str_name + "_" + incx_str; - return str_name; - } -}; - -// Black box testing. -INSTANTIATE_TEST_SUITE_P( - Blackbox, - znrm2Test, - ::testing::Combine( - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1), gtint_t(2) -#ifndef TEST_BLIS_TYPED - ,gtint_t(-1), gtint_t(-2) -#endif - ) // stride size for x - ), - ::znrm2TestPrint() - ); \ No newline at end of file From 9607f207da79e95c1f583e024da46a00df4f33af Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Tue, 30 May 2023 07:25:20 +0000 Subject: [PATCH 118/226] AOCL Dynamic tuning for DAXPYV - Existing logic is not picking the ideal number of threads for some problem sizes. - Problem size and their corresponding ideal number of threads are retuned for daxpy in aocl dynamic. 
AMD-Internal: [CPUPL-3484] Change-Id: Ice874ceef0a1815383f74f1a4b9677677b276af7 --- frame/base/bli_rntm.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 98131623d8..e8c29b0cc5 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -1741,16 +1741,18 @@ static void aocl_daxpyv_dynamic case BLIS_ARCH_ZEN2: case BLIS_ARCH_ZEN3: - if ( n_elem <= 100 ) + if ( n_elem <= 4000 ) *nt_ideal = 1; - else if (n_elem <= 10000) - *nt_ideal = 2; - else if (n_elem <= 250000) + else if (n_elem <= 11000) + *nt_ideal = 4; + else if (n_elem <= 300000) *nt_ideal = 8; else if (n_elem <= 750000) *nt_ideal = 16; - else if (n_elem <= 2000000) + else if (n_elem <= 2600000) *nt_ideal = 32; + else if (n_elem <= 4000000) + *nt_ideal = 64; else // For sizes in this range, AOCL dynamic does not make any change *nt_ideal = -1; From b126c9943b3648462d2ca03df38c7a7beaefeaf0 Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Fri, 14 Jul 2023 14:59:20 +0530 Subject: [PATCH 119/226] ZSCALV kernel optimization - ZSCALV kernel now uses fmaddsub intrinsics instead of mul followed by addsub intrinsics. - Removed the negative incx handling checks from the BLAS impl layer as BLAS expects early return for incx <= 0. - Moved all exceptions in the kernel to the BLAS impl layer.
AMD-Internal: [SWLCSG-2224] Change-Id: I03b968d21ca5128cb78ddcef5acfd5e579b22674 --- frame/compat/bla_scal_amd.c | 149 +++++++++++----------------- kernels/zen/1/bli_scalv_zen_int10.c | 109 ++++++++++---------- 2 files changed, 112 insertions(+), 146 deletions(-) diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index 041c1b6a87..5de72e7bce 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -620,98 +620,63 @@ void zscal_blis_impl dcomplex* x, const f77_int* incx ) { - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx); - dim_t n0; - dcomplex *x0; - inc_t incx0; - - // When n is zero or the alpha pointer passed is null, return early - if ((*n == 0) || (alpha == NULL)) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Convert/typecast negative values of n to zero. */ - if (*n < 0) - n0 = (dim_t)0; - else - n0 = (dim_t)(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if (*incx < 0) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. 
*/ - - x0 = (x) + (n0 - 1) * (-*incx); - incx0 = (inc_t)(*incx); - } - else - { - x0 = (x); - incx0 = (inc_t)(*incx); - } - - /* If the incx is zero, return early. */ - if (bli_zero_dim1(incx0)) - return; - - // Definition of function pointer - zscalv_ker_ft scalv_fun_ptr; - - cntx_t* cntx = NULL; - - // Query the architecture ID - arch_t id = bli_arch_query_id(); - - // Pick the kernel based on the architecture ID - switch (id) - { - case BLIS_ARCH_ZEN4: - case BLIS_ARCH_ZEN: - case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN3: - - // AVX2 Kernel - scalv_fun_ptr = bli_zscalv_zen_int; - break; - - default: - - // Query the context - cntx = bli_gks_query_cntx(); - - // Query the function pointer using the context - scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx); - } - - /* The expectation is that the condition to return early for vector dimension is zero - or the real part of alpha is 1 and imaginary part 0 is inside the compute kernel called */ - - // Call the function based on the function pointer assigned above - scalv_fun_ptr - ( - BLIS_NO_CONJUGATE, - n0, - (dcomplex*) alpha, - x0, incx0, - cntx - ); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx); + + dim_t n0 = (dim_t)(*n); + dcomplex *x0 = x; + inc_t incx0 = (inc_t)(*incx); + + /* + When n is zero or the alpha pointer passed is null + or the incx is zero or alpha is 1, return early. 
+ */ + if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha)) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + // Definition of function pointer + zscalv_ker_ft scalv_fun_ptr; + + cntx_t* cntx = NULL; + + // Query the architecture ID + arch_t id = bli_arch_query_id(); + + // Pick the kernel based on the architecture ID + switch (id) + { + case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: + + // AVX2 Kernel + scalv_fun_ptr = bli_zscalv_zen_int; + break; + + default: + + // Query the context + cntx = bli_gks_query_cntx(); + + // Query the function pointer using the context + scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx); + } + + // Call the function based on the function pointer assigned above + scalv_fun_ptr + ( + BLIS_NO_CONJUGATE, + n0, + (dcomplex*) alpha, + x0, incx0, + cntx + ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) } #ifdef BLIS_ENABLE_BLAS void zscal_ diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index 2d96d756c1..0837b8ad38 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -817,10 +817,20 @@ void bli_zscalv_zen_int cntx_t* restrict cntx ) { - // If the vector dimension is zero, or if alpha is unit, return early. - if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha)) - return; + /* + Undefined behaviour + ------------------- + 1. This layer is not BLAS complaint and the kernel results in + undefined behaviour when n <= 0 and incx <= 1. The expectation + is that the application/higher-layer invoking this layer should + the arg checks. 
+ */ + // if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha)) + // return; + + // To Do: This call to SETV needs to be removed for BLAS compliance + // Currently removing this is resulting in ZHERK failures if (PASTEMAC(z, eq0)(*alpha)) { // Expert interface of setv is invoked when alpha is zero @@ -834,8 +844,7 @@ void bli_zscalv_zen_int zero, x, incx, cntx, - NULL - ); + NULL); return; } @@ -896,33 +905,29 @@ void bli_zscalv_zen_int x_vec_ymm[2] = _mm256_loadu_pd(x0 + 2 * n_elem_per_reg); x_vec_ymm[3] = _mm256_loadu_pd(x0 + 3 * n_elem_per_reg); - temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_real_ymm); - temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm); - temp_ymm[2] = _mm256_mul_pd(x_vec_ymm[1], alpha_real_ymm); - temp_ymm[3] = _mm256_mul_pd(x_vec_ymm[1], alpha_imag_ymm); - temp_ymm[4] = _mm256_mul_pd(x_vec_ymm[2], alpha_real_ymm); - temp_ymm[5] = _mm256_mul_pd(x_vec_ymm[2], alpha_imag_ymm); - temp_ymm[6] = _mm256_mul_pd(x_vec_ymm[3], alpha_real_ymm); - temp_ymm[7] = _mm256_mul_pd(x_vec_ymm[3], alpha_imag_ymm); + temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm); + temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[1], alpha_imag_ymm); + temp_ymm[2] = _mm256_mul_pd(x_vec_ymm[2], alpha_imag_ymm); + temp_ymm[3] = _mm256_mul_pd(x_vec_ymm[3], alpha_imag_ymm); - temp_ymm[1] = _mm256_permute_pd(temp_ymm[1], 0b0101); - temp_ymm[3] = _mm256_permute_pd(temp_ymm[3], 0b0101); - temp_ymm[5] = _mm256_permute_pd(temp_ymm[5], 0b0101); - temp_ymm[7] = _mm256_permute_pd(temp_ymm[7], 0b0101); + temp_ymm[4] = _mm256_permute_pd(temp_ymm[0], 0b0101); + temp_ymm[5] = _mm256_permute_pd(temp_ymm[1], 0b0101); + temp_ymm[6] = _mm256_permute_pd(temp_ymm[2], 0b0101); + temp_ymm[7] = _mm256_permute_pd(temp_ymm[3], 0b0101); /* - a[i+63:i] := b[i+63:i] - c[i+63:i] for odd indices - a[i+63:i] := b[i+63:i] + c[i+63:i] for even indices + a[i+63:i] := alpha_real * b[i+63:i] - c[i+63:i] for odd indices + a[i+63:i] := alpha_real * b[i+63:i] + c[i+63:i] for even indices */ - temp_ymm[0] = 
_mm256_addsub_pd(temp_ymm[0], temp_ymm[1]); - temp_ymm[2] = _mm256_addsub_pd(temp_ymm[2], temp_ymm[3]); - temp_ymm[4] = _mm256_addsub_pd(temp_ymm[4], temp_ymm[5]); - temp_ymm[6] = _mm256_addsub_pd(temp_ymm[6], temp_ymm[7]); + temp_ymm[0] = _mm256_fmaddsub_pd(x_vec_ymm[0], alpha_real_ymm, temp_ymm[4]); + temp_ymm[1] = _mm256_fmaddsub_pd(x_vec_ymm[1], alpha_real_ymm, temp_ymm[5]); + temp_ymm[2] = _mm256_fmaddsub_pd(x_vec_ymm[2], alpha_real_ymm, temp_ymm[6]); + temp_ymm[3] = _mm256_fmaddsub_pd(x_vec_ymm[3], alpha_real_ymm, temp_ymm[7]); _mm256_storeu_pd(x0, temp_ymm[0]); - _mm256_storeu_pd(x0 + n_elem_per_reg, temp_ymm[2]); - _mm256_storeu_pd(x0 + 2 * n_elem_per_reg, temp_ymm[4]); - _mm256_storeu_pd(x0 + 3 * n_elem_per_reg, temp_ymm[6]); + _mm256_storeu_pd(x0 + n_elem_per_reg, temp_ymm[1]); + _mm256_storeu_pd(x0 + 2 * n_elem_per_reg, temp_ymm[2]); + _mm256_storeu_pd(x0 + 3 * n_elem_per_reg, temp_ymm[3]); x0 += 4 * n_elem_per_reg; } @@ -932,19 +937,17 @@ void bli_zscalv_zen_int x_vec_ymm[0] = _mm256_loadu_pd(x0); x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg); - temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_real_ymm); - temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm); - temp_ymm[2] = _mm256_mul_pd(x_vec_ymm[1], alpha_real_ymm); - temp_ymm[3] = _mm256_mul_pd(x_vec_ymm[1], alpha_imag_ymm); + temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm); + temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[1], alpha_imag_ymm); - temp_ymm[1] = _mm256_permute_pd(temp_ymm[1], 0b0101); - temp_ymm[3] = _mm256_permute_pd(temp_ymm[3], 0b0101); + temp_ymm[2] = _mm256_permute_pd(temp_ymm[0], 0b0101); + temp_ymm[3] = _mm256_permute_pd(temp_ymm[1], 0b0101); - temp_ymm[0] = _mm256_addsub_pd(temp_ymm[0], temp_ymm[1]); - temp_ymm[2] = _mm256_addsub_pd(temp_ymm[2], temp_ymm[3]); + temp_ymm[0] = _mm256_fmaddsub_pd(x_vec_ymm[0], alpha_real_ymm, temp_ymm[2]); + temp_ymm[1] = _mm256_fmaddsub_pd(x_vec_ymm[1], alpha_real_ymm, temp_ymm[3]); _mm256_storeu_pd(x0, temp_ymm[0]); - 
_mm256_storeu_pd(x0 + n_elem_per_reg, temp_ymm[2]); + _mm256_storeu_pd(x0 + n_elem_per_reg, temp_ymm[1]); x0 += 2 * n_elem_per_reg; } @@ -953,44 +956,42 @@ void bli_zscalv_zen_int { x_vec_ymm[0] = _mm256_loadu_pd(x0); - temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_real_ymm); - temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm); + temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm); - temp_ymm[1] = _mm256_permute_pd(temp_ymm[1], 0b0101); + temp_ymm[1] = _mm256_permute_pd(temp_ymm[0], 0b0101); - temp_ymm[0] = _mm256_addsub_pd(temp_ymm[0], temp_ymm[1]); + temp_ymm[0] = _mm256_fmaddsub_pd(x_vec_ymm[0], alpha_real_ymm, temp_ymm[1]); _mm256_storeu_pd(x0, temp_ymm[0]); x0 += n_elem_per_reg; } - } - // Issue vzeroupper instruction to clear upper lanes of ymm registers. - // This avoids a performance penalty caused by false dependencies when - // transitioning from AVX to SSE instructions (which may occur later, - // especially if BLIS is compiled with -mfpmath=sse). - _mm256_zeroupper(); + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur later, + // especially if BLIS is compiled with -mfpmath=sse). 
+ _mm256_zeroupper(); + } /* In double complex data type the computation of unit stride elements can still be vectorized using SSE*/ - __m128d temp_ymm[2], alpha_real_ymm, alpha_imag_ymm, x_vec_ymm; + __m128d temp_xmm[2], alpha_real_xmm, alpha_imag_xmm, x_vec_xmm; - alpha_real_ymm = _mm_set1_pd(real); - alpha_imag_ymm = _mm_set1_pd(imag); + alpha_real_xmm = _mm_set1_pd(real); + alpha_imag_xmm = _mm_set1_pd(imag); for (; i < n; i++) { - x_vec_ymm = _mm_loadu_pd(x0); + x_vec_xmm = _mm_loadu_pd(x0); - temp_ymm[0] = _mm_mul_pd(x_vec_ymm, alpha_real_ymm); - temp_ymm[1] = _mm_mul_pd(x_vec_ymm, alpha_imag_ymm); + temp_xmm[0] = _mm_permute_pd(x_vec_xmm, 0b01); - temp_ymm[1] = _mm_permute_pd(temp_ymm[1], 0b01); + temp_xmm[1] = _mm_mul_pd(temp_xmm[0], alpha_imag_xmm); - temp_ymm[0] = _mm_addsub_pd(temp_ymm[0], temp_ymm[1]); + temp_xmm[0] = _mm_fmaddsub_pd(x_vec_xmm, alpha_real_xmm, temp_xmm[1]); - _mm_storeu_pd(x0, temp_ymm[0]); + _mm_storeu_pd(x0, temp_xmm[0]); x0 += 2 * incx; } From c97471dce07b044ee9b1b4578f19ffb8a0debc9b Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Mon, 17 Jul 2023 12:24:18 +0530 Subject: [PATCH 120/226] Added AVX512 ZDSCALV kernel - Added AVX512-based kernel for ZDSCAL. This will be dispatched from the BLAS layer for machines that have AVX512 flags. - In AVX2 kernel for ZDSCALV, vectorized fringe compute using SSE instructions. - Removed the negative incx handling checks from the blis_impl layer of ZDSCAL as BLAS expects early return for incx <= 0.
AMD-Internal: [CPUPL-3648] Change-Id: I820808e3158036502b78b703f5f7faa799e5f7d9 --- frame/compat/bla_scal_amd.c | 51 ++------ kernels/zen/1/bli_scalv_zen_int10.c | 75 ++++++----- kernels/zen4/1/bli_scalv_zen_int_avx512.c | 147 ++++++++++++++++++++++ kernels/zen4/bli_kernels_zen4.h | 5 +- 4 files changed, 200 insertions(+), 78 deletions(-) diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index 5de72e7bce..3041f3bbe6 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -438,50 +438,20 @@ void zdscal_blis_impl { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx ); - dim_t n_elem; - dcomplex* x0; - inc_t incx0; + dim_t n_elem = (dim_t)(*n); + dcomplex* x0 = x; + inc_t incx0 = (inc_t)(*incx); /* Initialize BLIS. */ //bli_init_auto(); - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n_elem = ( dim_t )0; - else n_elem = ( dim_t )(*n); - /* - Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception - Return early when alpha pointer is NULL - BLIS exception + When n is zero or the alpha pointer passed is null + or the incx is zero or alpha is 1, return early. */ - if (*n <= 0 || alpha == NULL || bli_deq1(*alpha) || incx <= 0) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. 
By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (x) + (n_elem-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else + if ((n_elem <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha)) { - x0 = (x); - incx0 = ( inc_t )(*incx); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; } dcomplex alpha_cast; @@ -500,6 +470,11 @@ void zdscal_blis_impl switch (arch_id_local) { case BLIS_ARCH_ZEN4: +#if defined(BLIS_KERNELS_ZEN4) + // AVX512 Kernel + scalv_ker_ptr = bli_zdscalv_zen_int_avx512; + break; +#endif case BLIS_ARCH_ZEN: case BLIS_ARCH_ZEN2: case BLIS_ARCH_ZEN3: diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index 0837b8ad38..e760367060 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -609,8 +609,8 @@ void bli_zdscalv_zen_int10 for ( ; ( i + 29 ) < n; i += 30 ) { - xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4 * n_elem_per_reg ); @@ -641,8 +641,8 @@ void bli_zdscalv_zen_int10 xv[13] = _mm256_mul_pd( alphav, xv[13] ); xv[14] = _mm256_mul_pd( alphav, xv[14] ); - _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] ); - _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] ); + _mm256_storeu_pd( x0, xv[0] ); + _mm256_storeu_pd( (x0 + n_elem_per_reg), xv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] ); _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), xv[4] ); @@ -662,8 +662,8 @@ void bli_zdscalv_zen_int10
for ( ; ( i + 23 ) < n; i += 24 ) { - xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4 * n_elem_per_reg ); @@ -688,8 +688,8 @@ void bli_zdscalv_zen_int10 xv[10] = _mm256_mul_pd( alphav, xv[10] ); xv[11] = _mm256_mul_pd( alphav, xv[11] ); - _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] ); - _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] ); + _mm256_storeu_pd( x0, xv[0] ); + _mm256_storeu_pd( (x0 + n_elem_per_reg), xv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] ); _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), xv[4] ); @@ -706,8 +706,8 @@ void bli_zdscalv_zen_int10 for ( ; ( i + 15 ) < n; i += 16 ) { - xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4 * n_elem_per_reg ); @@ -724,8 +724,8 @@ void bli_zdscalv_zen_int10 xv[6] = _mm256_mul_pd( alphav, xv[6] ); xv[7] = _mm256_mul_pd( alphav, xv[7] ); - _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] ); - _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] ); + _mm256_storeu_pd( x0, xv[0] ); + _mm256_storeu_pd( (x0 + n_elem_per_reg), xv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] ); _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), xv[4] ); @@ -738,8 +738,8 @@ void bli_zdscalv_zen_int10 for ( ; ( i + 7 ) < n; i += 8 ) { - xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] 
= _mm256_loadu_pd( x0 + n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg ); @@ -748,8 +748,8 @@ void bli_zdscalv_zen_int10 xv[2] = _mm256_mul_pd( alphav, xv[2] ); xv[3] = _mm256_mul_pd( alphav, xv[3] ); - _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] ); - _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] ); + _mm256_storeu_pd( x0, xv[0] ); + _mm256_storeu_pd( (x0 + n_elem_per_reg), xv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] ); @@ -758,35 +758,27 @@ void bli_zdscalv_zen_int10 for ( ; ( i + 3 ) < n; i += 4 ) { - xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); xv[0] = _mm256_mul_pd( alphav, xv[0] ); xv[1] = _mm256_mul_pd( alphav, xv[1] ); - _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] ); - _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] ); + _mm256_storeu_pd( x0, xv[0] ); + _mm256_storeu_pd( (x0 + n_elem_per_reg), xv[1] ); x0 += 2 * n_elem_per_reg; } for ( ; ( i + 1 ) < n; i += 2 ) { - xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg ); + xv[0] = _mm256_loadu_pd( x0 ); xv[0] = _mm256_mul_pd( alphav, xv[0] ); - _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] ); + _mm256_storeu_pd( x0, xv[0] ); - x0 += 1 * n_elem_per_reg; - } - - for ( ; i < n; i++ ) - { - ( *x0 ) *= alphac; - ( *( x0 + 1 ) ) *= alphac; - - x0 += 2 * incx; + x0 += n_elem_per_reg; } // Issue vzeroupper instruction to clear upper lanes of ymm registers. @@ -796,15 +788,22 @@ void bli_zdscalv_zen_int10 // -mfpmath=sse). 
_mm256_zeroupper(); } - else + + /* In double complex data type the computation of + unit stride elements can still be vectorized using SSE*/ + __m128d alpha_reg, x_vec; + + alpha_reg = _mm_set1_pd((*alpha).real); + + for (; i < n; ++i) { - for ( ; i < n; ++i ) - { - ( *x0 ) *= alphac; - ( *( x0 + 1 ) ) *= alphac; + x_vec = _mm_loadu_pd(x0); - x0 += 2 * incx; - } + x_vec = _mm_mul_pd(x_vec, alpha_reg); + + _mm_storeu_pd(x0, x_vec); + + x0 += 2 * incx; } } diff --git a/kernels/zen4/1/bli_scalv_zen_int_avx512.c b/kernels/zen4/1/bli_scalv_zen_int_avx512.c index 2dd355b268..0ba20116ca 100644 --- a/kernels/zen4/1/bli_scalv_zen_int_avx512.c +++ b/kernels/zen4/1/bli_scalv_zen_int_avx512.c @@ -417,3 +417,150 @@ void bli_dscalv_zen_int_avx512 x0 += incx; } } + +/* + Functionality + ------------- + + This function scales a double complex vector by an element of the + type double. + + x := conjalpha(alpha) * x + + Function Signature + ------------------- + + * 'conjalpha' - Variable specified if alpha needs to be conjugated + * 'n' - Length of the array passed + * 'alpha' - Pointer to the element by which the vector is to be scaled + * 'x' - Double complex pointer pointing to an array + * 'incx' - Stride to point to the next element in the array + * 'cntx' - BLIS context object + + Exception + ---------- + + None + + Deviation from BLAS + -------------------- + + None + + Undefined behaviour + ------------------- + + 1. The kernel results in undefined behaviour when n <= 0 or incx <= 0. The expectation + is that these are standard BLAS exceptions and should be handled in a higher layer.
+*/ +void bli_zdscalv_zen_int_avx512 + ( + conj_t conjalpha, + dim_t n, + dcomplex* restrict alpha, + dcomplex* restrict x, inc_t incx, + cntx_t* restrict cntx + ) +{ + /* + This kernel only performs the computation + when alpha is double from the BLAS layer + alpha is passed as double complex to adhere + to function pointer definition in BLIS + */ + const double alphac = (*alpha).real; + + dim_t i = 0; + + double *restrict x0 = (double *)x; + + if (incx == 1) + { + __m512d alphav, xv[4]; + const dim_t n_elem_per_reg = 8; // number of elements per register + + alphav = _mm512_set1_pd(alphac); + + for (; (i + 15) < n; i += 16) + { + xv[0] = _mm512_loadu_pd(x0); + xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg); + xv[2] = _mm512_loadu_pd(x0 + 2 * n_elem_per_reg); + xv[3] = _mm512_loadu_pd(x0 + 3 * n_elem_per_reg); + + xv[0] = _mm512_mul_pd(alphav, xv[0]); + xv[1] = _mm512_mul_pd(alphav, xv[1]); + xv[2] = _mm512_mul_pd(alphav, xv[2]); + xv[3] = _mm512_mul_pd(alphav, xv[3]); + + _mm512_storeu_pd(x0, xv[0]); + _mm512_storeu_pd(x0 + n_elem_per_reg, xv[1]); + _mm512_storeu_pd(x0 + 2 * n_elem_per_reg, xv[2]); + _mm512_storeu_pd(x0 + 3 * n_elem_per_reg, xv[3]); + + x0 += 4 * n_elem_per_reg; + } + + for (; (i + 7) < n; i += 8) + { + xv[0] = _mm512_loadu_pd(x0); + xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg); + + xv[0] = _mm512_mul_pd(alphav, xv[0]); + xv[1] = _mm512_mul_pd(alphav, xv[1]); + + _mm512_storeu_pd(x0, xv[0]); + _mm512_storeu_pd(x0 + n_elem_per_reg, xv[1]); + + x0 += 2 * n_elem_per_reg; + } + + for (; (i + 3) < n; i += 4) + { + xv[0] = _mm512_loadu_pd(x0); + + xv[0] = _mm512_mul_pd(alphav, xv[0]); + + _mm512_storeu_pd(x0, xv[0]); + + x0 += n_elem_per_reg; + } + + for (; (i + 1) < n; i += 2) + { + __m256d xv = _mm256_loadu_pd(x0); + + __m256d alphav = _mm256_set1_pd(alphac); + + xv = _mm256_mul_pd(alphav, xv); + + _mm256_storeu_pd(x0, xv); + + x0 += 4; + } + + // Issue vzeroupper instruction to clear upper lanes of ymm registers. 
+ // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur as soon + // as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + } + + /* In double complex data type the computation of + unit stride elements can still be vectorized using SSE*/ + __m128d alpha_reg, x_vec; + + alpha_reg = _mm_set1_pd((*alpha).real); + + for (; i < n; ++i) + { + x_vec = _mm_loadu_pd(x0); + + x_vec = _mm_mul_pd(x_vec, alpha_reg); + + _mm_storeu_pd(x0, x_vec); + + x0 += 2 * incx; + } +} diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h index 701e2ecb49..d8ec5e6d7d 100644 --- a/kernels/zen4/bli_kernels_zen4.h +++ b/kernels/zen4/bli_kernels_zen4.h @@ -39,8 +39,9 @@ AMAXV_KER_PROT( float, s, amaxv_zen_int_avx512 ) AMAXV_KER_PROT( double, d, amaxv_zen_int_avx512 ) // scalv (AVX512 intrinsics) -SCALV_KER_PROT( float, s, scalv_zen_int_avx512 ) -SCALV_KER_PROT( double, d, scalv_zen_int_avx512 ) +SCALV_KER_PROT( float, s, scalv_zen_int_avx512 ) +SCALV_KER_PROT( double, d, scalv_zen_int_avx512 ) +SCALV_KER_PROT( dcomplex, z, dscalv_zen_int_avx512) // ZDSCAL kernel // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int_avx512 ) From 758ec3b5caa7df396691d0cad9616c068460d6ca Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Mon, 10 Jul 2023 16:17:04 +0530 Subject: [PATCH 121/226] ZGEMM optimizations for cases with k = 1 - Implemented bli_zgemm_4x4_avx2_k1_nn( ... ) kernel to replace bli_zgemm_4x6_avx2_k1_nn( ... ) kernel in the BLAS layer of ZGEMM. The kernel is built for handling the GEMM computation with inputs having k = 1, and the transpose values for A and B as N. - The kernel dimension has been changed from 4x6 to 4x4, due to the following reasons : - The 1xNR block of B in the n-loop can be reused over multiple MRx1 blocks of A in the m-loop during computation. Similar analogy exists for the fringe cases. 
- Every 1xNR block of B was scaled with alpha and stored in registers before traversing in the m-dimension. Similar change was done for fringe cases in n-dimension. - These registers should not be modified during compute, hence the kernel dimension was changed from 4x6 to 4x4. - The check for early exit(with regards to BLAS mandate) has been removed, since it is already present in the BLAS layer. - The check for parallel ZGEMM has been moved post the redirection to this kernel, since the kernel is single-threaded. - The bli_kernels_zen.h file was updated with the new kernel signature. AMD-Internal: [CPUPL-3622] Change-Id: Iaf03b00d5075dd74cc412290d77a401986ba0bea --- frame/compat/bla_gemm_amd.c | 104 +- kernels/zen/3/bli_zgemm_avx2_k1.c | 2590 +++++++++++------------------ kernels/zen/bli_kernels_zen.h | 2 +- 3 files changed, 1013 insertions(+), 1683 deletions(-) diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index 5c199e3712..0cb0afa405 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -893,30 +893,6 @@ void zgemm_blis_impl const inc_t rs_c = 1; const inc_t cs_c = *ldc; - const num_t dt = BLIS_DCOMPLEX; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t co = BLIS_OBJECT_INITIALIZER; - - dim_t m0_a, n0_a; - dim_t m0_b, n0_b; - - bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); - bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); - - bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); - bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao ); - - bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao ); - bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo ); - bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co ); - - bli_obj_set_conjtrans( blis_transa, &ao ); - bli_obj_set_conjtrans( blis_transb, &bo ); - /* Call GEMV when 
m == 1 or n == 1 with the context set to an uninitialized void pointer i.e. ((void *)0)*/ if (n0 == 1) @@ -999,12 +975,37 @@ void zgemm_blis_impl //dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 ); //dim_t nt = bli_thread_get_num_threads(); // get number of threads - bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked. // This function is invoked on all architectures including 'generic'. // Non-AVX2+FMA3 platforms will use the kernels derived from the context. if (bli_cpuid_is_avx2fma3_supported() == FALSE) { + // This code is duplicated below, however we don't want to move it out of + // this IF block as we want to avoid object initialization until required. + // Also this is temporary fix which will be replaced later. + const num_t dt = BLIS_DCOMPLEX; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); + + bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); + bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao ); + + bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo ); + bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co ); + + bli_obj_set_conjtrans( blis_transa, &ao ); + bli_obj_set_conjtrans( blis_transb, &bo ); // Will call parallelized zgemm code - sup & native PASTEMAC(gemm, BLIS_OAPI_EX_SUF) @@ -1026,20 +1027,23 @@ void zgemm_blis_impl } /* - Invoking the API for input sizes with k=1. - - For single thread, the API has no constraints before invoking. - - For multiple threads, the constraint is that m and n should individually be less than 128. 
+ Invoking the API for input sizes with k = 1. + - The API is single-threaded. + - The input constraints are that k should be 1, and transa and transb + should be N and N respectively. */ - if((k0 == 1) && ((!is_parallel) || ((is_parallel) && (m0 < 128) && (n0 < 128))) - && bli_is_notrans(blis_transa) - && bli_is_notrans(blis_transb)) + if( ( k0 == 1 ) && bli_is_notrans( blis_transa ) && bli_is_notrans( blis_transb ) ) { - bli_zgemm_4x6_avx2_k1_nn( m0, n0, k0, - (dcomplex*)alpha, - (dcomplex*)a, *lda, - (dcomplex*)b, *ldb, - (dcomplex*)beta, - c, *ldc); + bli_zgemm_4x4_avx2_k1_nn + ( + m0, n0, k0, + (dcomplex*)alpha, + (dcomplex*)a, *lda, + (dcomplex*)b, *ldb, + (dcomplex*)beta, + c, *ldc + ); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS */ @@ -1047,6 +1051,32 @@ void zgemm_blis_impl return; } + const num_t dt = BLIS_DCOMPLEX; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); + + bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); + bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao ); + + bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo ); + bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co ); + + bli_obj_set_conjtrans( blis_transa, &ao ); + bli_obj_set_conjtrans( blis_transb, &bo ); + + bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked. 
+ #ifdef BLIS_ENABLE_SMALL_MATRIX if (((!is_parallel) && (((m0*k0) <= 16384) || ((n0*k0) <= 16384))) || diff --git a/kernels/zen/3/bli_zgemm_avx2_k1.c b/kernels/zen/3/bli_zgemm_avx2_k1.c index a6a92f9a54..f264741a2e 100644 --- a/kernels/zen/3/bli_zgemm_avx2_k1.c +++ b/kernels/zen/3/bli_zgemm_avx2_k1.c @@ -33,81 +33,64 @@ */ #include -#include #include "blis.h" #include "immintrin.h" #define Z_MR 4 -#define Z_NR 6 - -// Macros for the main loop for M -#define SCALE_ALPHA_REAL_M_LOOP(rin_0,rin_1,r_bcast,real_val) \ - r_bcast = _mm256_broadcast_sd((double const *)(&real_val)); \ - rin_0 = _mm256_mul_pd(rin_0,r_bcast); \ - rin_1 = _mm256_mul_pd(rin_1,r_bcast); \ - -#define SCALE_ALPHA_IMAG_M_LOOP(rout_0,rout_1,rin_0,rin_1,r_bcast,r_perm,imag_val) \ - r_perm = _mm256_permute4x64_pd(rin_0,0b10110001); \ - r_bcast = _mm256_set_pd(1.0,-1.0,1.0,-1.0); \ - r_perm = _mm256_mul_pd(r_bcast, r_perm); \ - r_bcast = _mm256_broadcast_sd((double const *)(&imag_val)); \ - rout_0 = _mm256_fmadd_pd(r_perm,r_bcast,rout_0); \ - r_perm = _mm256_permute4x64_pd(rin_1,0b10110001); \ - r_bcast = _mm256_set_pd(1.0,-1.0,1.0,-1.0); \ - r_perm = _mm256_mul_pd(r_bcast, r_perm); \ - r_bcast = _mm256_broadcast_sd((double const *)(&imag_val)); \ - rout_1 = _mm256_fmadd_pd(r_perm,r_bcast,rout_1); \ - -#define NEG_PERM_M_LOOP(r0,r1,r2) \ - r0 = _mm256_permute4x64_pd(r0,0b10110001); \ - r1 = _mm256_permute4x64_pd(r1,0b10110001); \ - r2 = _mm256_set_pd(1.0,-1.0,1.0,-1.0); \ - r0 = _mm256_mul_pd(r2, r0); \ - r1 = _mm256_mul_pd(r2, r1); \ - -#define FMA_M_LOOP(rin_0,rin_1,rout_0,rout_1,rbc,loc) \ - rbc = _mm256_broadcast_sd(loc); \ - rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0); \ - rout_1 = _mm256_fmadd_pd(rbc, rin_1, rout_1); \ - -#define SCALE_BETA_REAL_M_LOOP(rin_0,rin_1,rout_0,rout_1,rbc) \ - rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0); \ - rout_1 = _mm256_fmadd_pd(rbc, rin_1, rout_1); \ - -#define SCALE_BETA_IMAG_M_LOOP(rin_0,rin_1,rout_0,rout_1,rbc,rn) \ - NEG_PERM_M_LOOP(rin_0,rin_1,rn); \ - 
rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0); \ - rout_1 = _mm256_fmadd_pd(rbc, rin_1, rout_1); \ - -// Macros for fringe cases with M -#define SCALE_ALPHA_REAL_M_FRINGE(rin_0,r_bcast,real_val) \ - r_bcast = _mm256_broadcast_sd((double const *)(&real_val)); \ - rin_0 = _mm256_mul_pd(rin_0,r_bcast); \ - -#define SCALE_ALPHA_IMAG_M_FRINGE(rout_0,rin_0,r_bcast,r_perm,imag_val) \ - r_perm = _mm256_permute4x64_pd(rin_0,0b10110001); \ - r_bcast = _mm256_set_pd(1.0,-1.0,1.0,-1.0); \ - r_perm = _mm256_mul_pd(r_bcast, r_perm); \ - r_bcast = _mm256_broadcast_sd((double const *)(&imag_val)); \ - rout_0 = _mm256_fmadd_pd(r_perm,r_bcast,rout_0); \ - -#define NEG_PERM_M_FRINGE(r0,r2) \ - r0 = _mm256_permute4x64_pd(r0,0b10110001); \ - r2 = _mm256_set_pd(1.0,-1.0,1.0,-1.0); \ - r0 = _mm256_mul_pd(r2, r0); \ - -#define FMA_M_FRINGE(r_in,r_out,r_bc,loc) \ - r_bc = _mm256_broadcast_sd(loc); \ - r_out = _mm256_fmadd_pd(r_bc, r_in, r_out); \ - -#define SCALE_BETA_REAL_M_FRINGE(rin_0,rout_0,rbc) \ - rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0); \ - -#define SCALE_BETA_IMAG_M_FRINGE(rin_0,rout_0,rbc,rn) \ - NEG_PERM_M_FRINGE(rin_0,rn); \ - rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0); \ - -void bli_zgemm_4x6_avx2_k1_nn +#define Z_NR 4 + +// Macro to be used for beta scaling with 2 loads from C(main loop of m) +#define BETA_SCALING_C_MAIN(reg_0, reg_1, loc) \ +\ + /* Here, a_vec_0 and a_vec_1 are used to load columns of + length Z_MR from C, with bdcst_0 and bdcst_1 already + having the real and imaginary parts of beta broadcasted + onto them. reg_0 and reg_1 are the intermediate registers + containing the result of alpha*A*B on them. The beta scaling + and final accumalation is done on these registers for + storing the corresponding column of C. 
*/ \ +\ + a_vec_0 = _mm256_loadu_pd((double const*)(loc)); \ + a_vec_1 = _mm256_loadu_pd((double const*)(loc + 2)); \ +\ + reg_0 = _mm256_fmadd_pd(a_vec_0, bdcst_0, reg_0); \ + reg_1 = _mm256_fmadd_pd(a_vec_1, bdcst_0, reg_1); \ +\ + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); \ + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); \ +\ + a_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_1); \ + a_vec_1 = _mm256_mul_pd(a_vec_1, bdcst_1); \ +\ + reg_0 = _mm256_addsub_pd(reg_0, a_vec_0); \ + reg_1 = _mm256_addsub_pd(reg_1, a_vec_1); + +// Macro to be used for beta scaling with 1 load from C(fringe case with m_rem == 1) +#define BETA_SCALING_C_FRINGE(reg_0, loc) \ +\ + /* Here, a_vec_0 is used to load a column of length 2 + from C, with bdcst_0 and bdcst_1 already having the real + and imaginary parts of beta broadcasted onto them. reg_0 + is the intermediate register containing the result of + alpha*A*B on it. The beta scaling and final accumalation + is done on these registers for storing the corresponding + column of C. */ \ +\ + a_vec_0 = _mm256_loadu_pd((double const*)(loc)); \ +\ + reg_0 = _mm256_fmadd_pd(a_vec_0, bdcst_0, reg_0); \ +\ + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); \ +\ + a_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_1); \ +\ + reg_0 = _mm256_addsub_pd(reg_0, a_vec_0); + +/* The following API implements the ZGEMM operation specifically for inputs A and B + with k == 1. It expects the inputs and output to support the column-major storage + scheme, without any requirement to conjugate/transpose any of the operands. */ + +void bli_zgemm_4x4_avx2_k1_nn ( dim_t m, dim_t n, @@ -119,1711 +102,1028 @@ void bli_zgemm_4x6_avx2_k1_nn dcomplex* c, const inc_t ldc ) { + // Setting the required variables for choosing the right path + // to execute the required computation. 
+ dim_t m_iter = ( m / Z_MR ); + dim_t n_iter = ( n / Z_NR ); - double alpha_real, beta_real; - double alpha_imag, beta_imag; - - alpha_real = alpha->real; - beta_real = beta->real; - alpha_imag = alpha->imag; - beta_imag = beta->imag; - - /* If m or n is zero, return immediately. */ - if ( bli_zero_dim2( m, n ) ) return; - /* If alpha alone is zero, scale by beta and return. */ - if (bli_zeq0(*(alpha))) - { - bli_zscalm( - BLIS_NO_CONJUGATE, - 0, - BLIS_NONUNIT_DIAG, - BLIS_DENSE, - m, n, - beta, - c, 1, ldc - ); - return; - } - - dim_t m_remainder = (m % Z_MR); - dim_t n_remainder = (n % Z_NR); - - //scratch registers - __m256d ymm0, ymm1, ymm2, ymm3; - __m256d ymm4, ymm5, ymm6, ymm7; - __m256d ymm8, ymm9, ymm10, ymm11; - __m256d ymm12, ymm13, ymm14, ymm15; - __m128d xmm5; - - //gcc12 throws a unitialized warning, - //To avoid that these variable are set to zero. - ymm0 = _mm256_setzero_pd(); - /* Form C = alpha*A*B + beta*c */ - // Main loop along N dimension - for(dim_t j = 0;j < (n-Z_NR+1);j=j+Z_NR) + dim_t m_remainder = ( m % Z_MR ); + dim_t n_remainder = ( n % Z_NR ); + + // Setting the alpha and beta scaling components(real and imaginary). + double alpha_real = alpha->real; + double alpha_imag = alpha->imag; + + double beta_real = beta->real; + double beta_imag = beta->imag; + + // Using the predefined enumerated constants to classify beta scaling + // into one of the below categories. + int beta_mul_type = BLIS_MUL_DEFAULT; + + // Setting the appropriate type for beta scaling + // based on any of the special cases. + if( beta_imag == 0.0 ) { - dcomplex* temp_b = b + j*ldb; - dcomplex* temp_a = a; - dcomplex* temp_c = c + j*ldc; + if( beta_real == 0.0 ) beta_mul_type = BLIS_MUL_ZERO; + else if( beta_real == 1.0 ) beta_mul_type = BLIS_MUL_ONE; + } + + // Implementing the GEMM operation, which is as follows : + // C := beta*C + alpha*A*B. + + // The code structure deals with fringe cases first, followed by the main loop + // both in the n and m direction. 
- //Main loop along M dimension - for(dim_t i = 0;i < (m-Z_MR+1);i=i+Z_MR) + // Local pointers for B and C, to be used along the n-loop + dcomplex* temp_b = b; + dcomplex* temp_c = c; + + if( ( n_remainder & 0x1 ) == 1 ) // In case of n_remainder being 1 or 3 + { + // Setting the panel addresses for A, B and C, to be used along m-loop + dcomplex *temp_ai = a; + dcomplex *temp_bj = temp_b; + dcomplex *temp_cij = temp_c; + + /* Multiple blocks of Z_MR x 1(main loop for m) and/or m_remainder x 1 block(s) + of A use the same 1 x 1 block of B in order to compute the associated Z_MR x 1 + and/or m_remainder x 1 block of C. This reusability has been exploited, wherein + the associated 1 x 1 block of B is scaled with alpha, and stored in + registers beforehand, to be reused in the main loop or fringe case of m. */ + + // Intermediate registers used for alpha scaling the block of B and storing. + __m256d a_vec_0, a_vec_1; + __m256d b_vec_0; + __m256d b_real_0; + __m256d b_imag_0; + __m256d bdcst_0, bdcst_1; + + /* Broadcasting real and imaginary components of elements from B + and unpacking them to set them in registers in the form : + { Real_part, Imag_part, Real_part, Imag_part }. + + A total of Z_NR registers are used to store the alpha-scaled B + for reuse. */ + + b_real_0 = _mm256_broadcast_sd((double const *)(temp_bj)); + b_imag_0 = _mm256_broadcast_sd((double const *)(temp_bj) + 1); + b_vec_0 = _mm256_unpacklo_pd(b_real_0, b_imag_0); + + // Broadcast elements from alpha, and exhibit the compute for complex scaling. + a_vec_0 = _mm256_broadcast_sd((double const *)(&alpha_real)); + a_vec_1 = _mm256_broadcast_sd((double const *)(&alpha_imag)); + + bdcst_0 = _mm256_unpacklo_pd(b_imag_0, b_real_0); + bdcst_0 = _mm256_mul_pd(a_vec_1, bdcst_0); + b_vec_0 = _mm256_fmaddsub_pd(a_vec_0, b_vec_0, bdcst_0); + + // Fringe cases in the m-direction. 
+ dim_t m_rem = m_remainder; + if ( ( m_rem & 0x1 ) == 1 ) { - ymm3 = _mm256_setzero_pd(); - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm8 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - ymm10 = _mm256_setzero_pd(); - ymm11 = _mm256_setzero_pd(); - ymm12 = _mm256_setzero_pd(); - - /* - a. Perform alpha*A*B using temp_a, temp_b and alpha_real, alpha_imag - where alpha_real and/or alpha_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_b where - computing all Z_MR rows of temp_a. - c. Same approach is used in remaining fringe cases. - */ - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - //R(a[2][0]) I(a[2][0]) R(a[3][0]) I(a[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_a + 2)); - - ymm13 = ymm0; - ymm14 = ymm1; - _mm_prefetch((char*)(temp_a + 32), _MM_HINT_T0); - - SCALE_ALPHA_REAL_M_LOOP(ymm0,ymm1,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm15,ymm2,alpha_imag); - - ymm13 = _mm256_setzero_pd(); - ymm14 = _mm256_setzero_pd(); - - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - - For ymm1 : - R(a[2][0]) = alpha_real*R(a[2][0])-alpha_imag*I(a[2][0]) - I(a[2][0]) = alpha_real*I(a[2][0])+alpha_imag*R[2][0] - R(a[3][0]) = alpha_real*R(a[3][0])-alpha_imag*I(a[3][0]) - I(a[3][0]) = alpha_real*I(a[3][0])+alpha_imag*(R[3][0]) - */ - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - //ymm4+=R(b[0][0])*R(a[2][0]) R(b[0][0])*I(a[2][0]) - // 
R(b[0][0])*R(a[3][0]) R(b[0][0])*I(a[3][0]) - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - //ymm6+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm2,(double const *)(temp_b+ldb)); - //ymm7+=R(b[0][2])*R(a[0][0]) R(b[0][2])*I(a[0][0]) - // R(b[0][2])*R(a[1][0]) R(b[0][2])*I(a[1][0]) - //ymm8+=R(b[0][2])*R(a[0][0]) R(b[0][2])*I(a[0][0]) - // R(b[0][2])*R(a[1][0]) R(b[0][2])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm2,(double const *)(temp_b+ldb*2)); - //ymm9+=R(b[0][3])*R(a[0][0]) R(b[0][3])*I(a[0][0]) - // R(b[0][3])*R(a[1][0]) R(b[0][3])*I(a[1][0]) - //ymm10+=R(b[0][3])*R(a[0][0]) R(b[0][3])*I(a[0][0]) - // R(b[0][3])*R(a[1][0]) R(b[0][3])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm2,(double const *)(temp_b+ldb*3)); - //ymm11+=R(b[0][4])*R(a[0][0]) R(b[0][4])*I(a[0][0]) - // R(b[0][4])*R(a[1][0]) R(b[0][4])*I(a[1][0]) - //ymm12+=R(b[0][4])*R(a[0][0]) R(b[0][4])*I(a[0][0]) - // R(b[0][4])*R(a[1][0]) R(b[0][4])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm11,ymm12,ymm2,(double const *)(temp_b+ldb*4)); - //ymm11+=R(b[0][5])*R(a[0][0]) R(b[0][5])*I(a[0][0]) - // R(b[0][5])*R(a[1][0]) R(b[0][5])*I(a[1][0]) - //ymm12+=R(b[0][5])*R(a[0][0]) R(b[0][5])*I(a[0][0]) - // R(b[0][5])*R(a[1][0]) R(b[0][5])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm2,(double const *)(temp_b+ldb*5)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 and ymm1 in accordance to the requirement - NEG_PERM_M_LOOP(ymm0,ymm1,ymm2); - //ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - //ymm4+=R(b[0][0])*R(a[2][0]) I(b[0][0])*I(a[2][0]) - // I(b[0][0])*R(a[3][0]) I(b[0][0])*I(a[3][0]) - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)+1); - //ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // 
I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - //ymm6+=R(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm2,(double const *)(temp_b+ldb)+1); - //ymm7+=I(b[0][2])*R(a[0][0]) I(b[0][2])*I(a[0][0]) - // I(b[0][2])*R(a[1][0]) I(b[0][2])*I(a[1][0]) - //ymm8+=I(b[0][2])*R(a[0][0]) I(b[0][2])*I(a[0][0]) - // I(b[0][2])*R(a[1][0]) I(b[0][2])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm2,(double const *)(temp_b+ldb*2)+1); - //ymm9+=I(b[0][3])*R(a[0][0]) I(b[0][3])*I(a[0][0]) - // I(b[0][3])*R(a[1][0]) I(b[0][3])*I(a[1][0]) - //ymm10+=I(b[0][3])*R(a[0][0]) I(b[0][3])*I(a[0][0]) - // I(b[0][3])*R(a[1][0]) I(b[0][3])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm2,(double const *)(temp_b+ldb*3)+1); - //ymm11+=I(b[0][4])*R(a[0][0]) I(b[0][4])*I(a[0][0]) - // I(b[0][4])*R(a[1][0]) I(b[0][4])*I(a[1][0]) - //ymm12+=I(b[0][4])*R(a[0][0]) I(b[0][4])*I(a[0][0]) - // I(b[0][4])*R(a[1][0]) I(b[0][4])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm11,ymm12,ymm2,(double const *)(temp_b+ldb*4)+1); - //ymm13+=I(b[0][5])*R(a[0][0]) I(b[0][5])*I(a[0][0]) - // I(b[0][5])*R(a[1][0]) I(b[0][5])*I(a[1][0]) - //ymm14+=I(b[0][5])*R(a[0][0]) I(b[0][5])*I(a[0][0]) - // I(b[0][5])*R(a[1][0]) I(b[0][5])*I(a[1][0]) - FMA_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm2,(double const *)(temp_b+ldb*5)+1); - - /* - a. Perform beta*C using temp_c, beta_real, - where beta_real is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. 
- */ - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - //ymm4+=beta_real*R(c[2][0]) beta_real*I(c[2][0]) - // beta_real*R(c[3][0]) beta_real*I(c[3][0]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //R(c[2][1]) I(c[2][1]) R(c[3][1]) I(c[3][1]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc + 2)); - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - // beta_real*R(c[1][1]) beta_real*I(c[1][1]) - //ymm6+=beta_real*R(c[2][1]) beta_real*I(c[2][1]) - // beta_real*R(c[3][1]) beta_real*I(c[3][1]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm15); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //R(c[2][2]) I(c[2][2]) R(c[3][2]) I(c[3][2]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*2 + 2)); - //ymm7+=beta_real*R(c[0][2]) beta_real*I(c[0][2]) - // beta_real*R(c[1][2]) beta_real*I(c[1][2]) - //ymm8+=beta_real*R(c[2][2]) beta_real*I(c[2][2]) - //beta_real*R(c[3][2]) beta_real*I(c[3][2]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm15); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //R(c[2][3]) I(c[2][3]) R(c[3][3]) I(c[3][3]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*3 + 2)); - //ymm9+=beta_real*R(c[0][3]) beta_real*I(c[0][3]) - // beta_real*R(c[1][3]) beta_real*I(c[1][3]) - //ymm10+=beta_real*R(c[2][3]) beta_real*I(c[2][3]) - // beta_real*R(c[3][3]) beta_real*I(c[3][3]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm15); - - //R(c[0][4]) I(c[0][4]) 
R(c[1][4]) I(c[1][4]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*4)); - //R(c[2][4]) I(c[2][4]) R(c[3][4]) I(c[3][4]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*4 + 2)); - //ymm11+=beta_real*R(c[0][4]) beta_real*I(c[0][4]) - // beta_real*R(c[1][4]) beta_real*I(c[1][4]) - //ymm12+=beta_real*R(c[2][4]) beta_real*I(c[2][4]) - // beta_real*R(c[3][4]) beta_real*I(c[3][4]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm11,ymm12,ymm15); - - //R(c[0][5]) I(c[0][5]) R(c[1][5]) I(c[1][5]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*5)); - //R(c[2][5]) I(c[2][5]) R(c[3][5]) I(c[3][5]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*5 + 2)); - //ymm13+=beta_real*R(c[0][5]) beta_real*I(c[0][5]) - // beta_real*R(c[1][5]) beta_real*I(c[1][5]) - //ymm14+=beta_real*R(c[2][5]) beta_real*I(c[2][5]) - // beta_real*R(c[3][5]) beta_real*I(c[3][5]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm15); - } + // Scratch registers. + __m256d b_scaled_0, b_perm_0, a_real, a_imag; - /* - a. Perform beta*C using temp_c, beta_imag, - where beta_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. 
- */ - - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - //ymm4+=beta_imag*(-I(c[2][0])) beta_imag*R(c[2][0]) - // beta_imag*(-I(c[3][0])) beta_imag*R(c[3][0]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15,ymm2); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //R(c[2][1]) I(c[2][1]) R(c[3][1]) I(c[3][1]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc + 2)); - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - // beta_imag*(-I(c[1][1])) beta_imag*R(c[1][1]) - //ymm6+=beta_imag*(-I(c[2][1])) beta_imag*R(c[2][1]) - // beta_imag*(-I(c[3][1])) beta_imag*R(c[3][1]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm15,ymm2); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //R(c[2][2]) I(c[2][2]) R(c[3][2]) I(c[3][2]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*2 + 2)); - //ymm7+=beta_imag*(-I(c[0][2])) beta_imag*R(c[0][2]) - // beta_imag*(-I(c[1][2])) beta_imag*R(c[1][2]) - //ymm8+=beta_imag*(-I(c[2][2])) beta_imag*R(c[2][2]) - // beta_imag*(-I(c[3][2])) beta_imag*R(c[3][2]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm15,ymm2); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //R(c[2][3]) I(c[2][3]) R(c[3][3]) I(c[3][3]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*3 + 2)); - //ymm9+=beta_imag*(-I(c[0][3])) beta_imag*R(c[0][3]) - // beta_imag*(-I(c[1][3])) beta_imag*R(c[1][3]) - //ymm10+=beta_imag*(-I(c[2][3])) beta_imag*R(c[2][3]) - // beta_imag*(-I(c[3][3])) beta_imag*R(c[3][3]) - 
SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm15,ymm2); - - //R(c[0][4]) I(c[0][4]) R(c[1][4]) I(c[1][4]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*4)); - //R(c[2][4]) I(c[2][4]) R(c[3][4]) I(c[3][4]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*4 + 2)); - //ymm11+=beta_imag*(-I(c[0][4])) beta_imag*R(c[0][4]) - // beta_imag*(-I(c[1][4])) beta_imag*R(c[1][4]) - //ymm12+=beta_imag*(-I(c[2][4])) beta_imag*R(c[2][4]) - // beta_imag*(-I(c[3][4])) beta_imag*R(c[3][4]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm11,ymm12,ymm15,ymm2); - - //R(c[0][5]) I(c[0][5]) R(c[1][5]) I(c[1][5]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*5)); - //R(c[2][5]) I(c[2][5]) R(c[3][5]) I(c[3][5]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*5 + 2)); - //ymm13+=beta_imag*(-I(c[0][5])) beta_imag*R(c[0][5]) - // beta_imag*(-I(c[1][5])) beta_imag*R(c[1][5]) - //ymm14+=beta_imag*(-I(c[2][5])) beta_imag*R(c[2][5]) - // beta_imag*(-I(c[3][5])) beta_imag*R(c[3][5]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm15,ymm2); - } - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading + __m128d b_element_0, c_element_0; + __m128d beta_real_reg, beta_imag_reg, c_perm_0; - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ + b_scaled_0 = _mm256_setzero_pd(); + b_perm_0 = _mm256_setzero_pd(); - _mm256_storeu_pd((double *)(temp_c), ymm3); - _mm256_storeu_pd((double *)(temp_c + 2), ymm4); + /* Here, only a single element from A is of concern. + Also, we already have alpha-scaled B available in + b_vec_0 and b_vec_1. 
Thus, we could scale these + registers with the element from A using AVX2 ISA */ - _mm256_storeu_pd((double *)(temp_c + ldc), ymm5); - _mm256_storeu_pd((double *)(temp_c + ldc + 2), ymm6); + // Broadcasting real and imaginary components from A. - _mm256_storeu_pd((double *)(temp_c + ldc*2), ymm7); - _mm256_storeu_pd((double *)(temp_c + ldc*2 + 2), ymm8); + a_real = _mm256_broadcast_sd((double const *)(temp_ai)); + a_imag = _mm256_broadcast_sd((double const *)(temp_ai) + 1); - _mm256_storeu_pd((double *)(temp_c + ldc*3), ymm9); - _mm256_storeu_pd((double *)(temp_c + ldc*3 + 2), ymm10); + // Obtaining the alpha-scaled B matrix + b_scaled_0 = b_vec_0; + b_perm_0 = _mm256_permute_pd(b_scaled_0, 0x5); - _mm256_storeu_pd((double *)(temp_c + ldc*4), ymm11); - _mm256_storeu_pd((double *)(temp_c + ldc*4 + 2), ymm12); + b_perm_0 = _mm256_mul_pd(b_perm_0, a_imag); + b_scaled_0 = _mm256_fmaddsub_pd(b_scaled_0, a_real, b_perm_0); - _mm256_storeu_pd((double *)(temp_c + ldc*5), ymm13); - _mm256_storeu_pd((double *)(temp_c + ldc*5 + 2), ymm14); + c_element_0 = _mm256_castpd256_pd128(b_scaled_0); - temp_c+=Z_MR; - temp_a+=Z_MR; - } + // Clearing out the upper lanes of 256 bit registers to avoid + // the transition penalty + _mm256_zeroupper(); - // Fringe cases for M - dim_t m_rem=m_remainder; - if(m_rem>=2) - { - ymm3 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - ymm11 = _mm256_setzero_pd(); - - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); - - ymm13 = _mm256_setzero_pd(); - - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = 
alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - */ - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)); - //ymm7+=R(b[0][2])*R(a[0][0]) R(b[0][2])*I(a[0][0]) - // R(b[0][2])*R(a[1][0]) R(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)); - //ymm9+=R(b[0][3])*R(a[0][0]) R(b[0][3])*I(a[0][0]) - // R(b[0][3])*R(a[1][0]) R(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)); - //ymm11+=R(b[0][4])*R(a[0][0]) R(b[0][4])*I(a[0][0]) - // R(b[0][4])*R(a[1][0]) R(b[0][4])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm11,ymm2,(double const *)(temp_b+ldb*4)); - //ymm13+=R(b[0][5])*R(a[0][0]) R(b[0][5])*I(a[0][0]) - // R(b[0][5])*R(a[1][0]) R(b[0][5])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm13,ymm2,(double const *)(temp_b+ldb*5)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); - - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); - //ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)+1); - //ymm7+=I(b[0][2])*R(a[0][0]) I(b[0][2])*I(a[0][0]) - // I(b[0][2])*R(a[1][0]) I(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)+1); - //ymm9+=I(b[0][3])*R(a[0][0]) I(b[0][3])*I(a[0][0]) - // I(b[0][3])*R(a[1][0]) I(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)+1); - 
//ymm11+=I(b[0][4])*R(a[0][0]) I(b[0][4])*I(a[0][0]) - // I(b[0][4])*R(a[1][0]) I(b[0][4])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm11,ymm2,(double const *)(temp_b+ldb*4)+1); - //ymm13+=I(b[0][5])*R(a[0][0]) I(b[0][5])*I(a[0][0]) - // I(b[0][5])*R(a[1][0]) I(b[0][5])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm13,ymm2,(double const *)(temp_b+ldb*5)+1); - - - if(beta_real != 0.0) + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - // beta_real*R(c[1][1]) beta_real*I(c[1][1]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm5,ymm15); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //ymm7+=beta_real*R(c[0][2]) beta_real*I(c[0][2]) - // beta_real*R(c[1][2]) beta_real*I(c[1][2]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm7,ymm15); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //ymm9+=beta_real*R(c[0][3]) beta_real*I(c[0][3]) - // beta_real*R(c[1][3]) beta_real*I(c[1][3]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm9,ymm15); - - //R(c[0][4]) I(c[0][4]) R(c[1][4]) I(c[1][4]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*4)); - //ymm11+=beta_real*R(c[0][4]) beta_real*I(c[0][4]) - // beta_real*R(c[1][4]) beta_real*I(c[1][4]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm11,ymm15); - - //R(c[0][5]) I(c[0][5]) R(c[1][5]) I(c[1][5]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*5)); - //ymm13+=beta_real*R(c[0][5]) beta_real*I(c[0][5]) - // beta_real*R(c[1][5]) beta_real*I(c[1][5]) - 
SCALE_BETA_REAL_M_FRINGE(ymm0,ymm13,ymm15); - } + case BLIS_MUL_ZERO : + break; + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. + b_element_0 = _mm_loadu_pd((double const*)(temp_cij)); + c_element_0 = _mm_add_pd(c_element_0, b_element_0); + break; - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - // beta_imag*(-I(c[1][1])) beta_imag*R(c[1][1]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm5,ymm15,ymm2); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //ymm7+=beta_imag*(-I(c[0][2])) beta_imag*R(c[0][2]) - // beta_imag*(-I(c[1][2])) beta_imag*R(c[1][2]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm7,ymm15,ymm2); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //ymm9+=beta_imag*(-I(c[0][3])) beta_imag*R(c[0][3]) - // beta_imag*(-I(c[1][3])) beta_imag*R(c[1][3]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm9,ymm15,ymm2); - - //R(c[0][4]) I(c[0][4]) R(c[1][4]) I(c[1][4]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*4)); - //ymm11+=beta_imag*(-I(c[0][4])) beta_imag*R(c[0][4]) - // beta_imag*(-I(c[1][4])) beta_imag*R(c[1][4]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm11,ymm15,ymm2); - - //R(c[0][5]) I(c[0][5]) R(c[1][5]) I(c[1][5]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*5)); - //ymm13+=beta_imag*(-I(c[0][5])) beta_imag*R(c[0][5]) - // beta_imag*(-I(c[1][5])) beta_imag*R(c[1][5]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2); - } + default : + // Broadcast beta real and 
imaginary part and scale with C. + beta_real_reg = _mm_loaddup_pd((double const*)beta); + beta_imag_reg = _mm_loaddup_pd((double const*)beta + 1); + + // Load C onto registers + b_element_0 = _mm_loadu_pd((double const*)(temp_cij)); - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading + // Shuffle for the compute with imgarinary part scaling + c_perm_0 = _mm_shuffle_pd(b_element_0, b_element_0, 0x01); - The results are accumalated in accordance to the non zero scalar values. - */ + c_perm_0 = _mm_mul_pd(beta_imag_reg, c_perm_0); - _mm256_storeu_pd((double *)(temp_c), ymm3); - _mm256_storeu_pd((double *)(temp_c + ldc), ymm5); - _mm256_storeu_pd((double *)(temp_c + ldc*2), ymm7); - _mm256_storeu_pd((double *)(temp_c + ldc*3), ymm9); - _mm256_storeu_pd((double *)(temp_c + ldc*4), ymm11); - _mm256_storeu_pd((double *)(temp_c + ldc*5), ymm13); + b_element_0 = _mm_mul_pd(beta_real_reg, b_element_0); + // Compute beta-scaled C + b_element_0 = _mm_addsub_pd(b_element_0, c_perm_0); + // Add to intermediate reg storing alpha*A*B + c_element_0 = _mm_add_pd(b_element_0, c_element_0); + } - temp_c+=2; - temp_a+=2; + // Storing the result in C. + _mm_storeu_pd((double *)(temp_cij), c_element_0); - m_rem -= 2; + // We need to restore the upper lanes of the registers b_vec_0, b_vec_1, + // b_vec_2 and b_vec_3 + // They need to contain the alpha scaled B, to be reused in the main loop for m + b_element_0 = _mm256_castpd256_pd128(b_vec_0); + b_vec_0 = _mm256_insertf128_pd(b_vec_0, b_element_0, 0x01); + + // Adjusting the addresses of A and C for the next block. 
+ temp_cij += 1; + temp_ai += 1; + + m_rem -= 1; } - if(m_rem==1) + if( m_rem == 2 ) { + // Scratch registers. + __m256d c_vec_0; - xmm5 = _mm_setzero_pd(); - ymm3 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - ymm11 = _mm256_setzero_pd(); - - xmm5 = _mm_loadu_pd((double const*)(temp_a));//R(a[0][0]) I(a[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(a[0][0]) I(a[0][0]) - - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); - - ymm13 = _mm256_setzero_pd(); - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)); - //ymm7+=R(b[0][2])*R(a[0][0]) R(b[0][2])*I(a[0][0]) - // R(b[0][2])*R(a[1][0]) R(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)); - //ymm9+=R(b[0][3])*R(a[0][0]) R(b[0][3])*I(a[0][0]) - // R(b[0][3])*R(a[1][0]) R(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)); - //ymm11+=R(b[0][4])*R(a[0][0]) R(b[0][4])*I(a[0][0]) - // R(b[0][4])*R(a[1][0]) R(b[0][4])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm11,ymm2,(double const *)(temp_b+ldb*4)); - //ymm13+=R(b[0][5])*R(a[0][0]) R(b[0][5])*I(a[0][0]) - // R(b[0][5])*R(a[1][0]) R(b[0][5])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm13,ymm2,(double const *)(temp_b+ldb*5)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); - - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); - 
//ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)+1); - //ymm7+=I(b[0][2])*R(a[0][0]) I(b[0][2])*I(a[0][0]) - // I(b[0][2])*R(a[1][0]) I(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)+1); - //ymm9+=I(b[0][3])*R(a[0][0]) I(b[0][3])*I(a[0][0]) - // I(b[0][3])*R(a[1][0]) I(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)+1); - //ymm11+=I(b[0][4])*R(a[0][0]) I(b[0][4])*I(a[0][0]) - // I(b[0][4])*R(a[1][0]) I(b[0][4])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm11,ymm2,(double const *)(temp_b+ldb*4)+1); - //ymm13+=I(b[0][5])*R(a[0][0]) I(b[0][5])*I(a[0][0]) - // I(b[0][5])*R(a[1][0]) I(b[0][5])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm13,ymm2,(double const *)(temp_b+ldb*5)+1); - - if(beta_real != 0.0) + a_vec_0 = _mm256_setzero_pd(); + a_vec_1 = _mm256_setzero_pd(); + bdcst_0 = _mm256_setzero_pd(); + bdcst_1 = _mm256_setzero_pd(); + c_vec_0 = _mm256_setzero_pd(); + + // Loading a vector from A with 2 elements. + a_vec_0 = _mm256_loadu_pd((double const *)(temp_ai)); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + + // Scaling with imaginary components of elements from B. + bdcst_0 = _mm256_unpackhi_pd(b_vec_0, b_vec_0); + c_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_0); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + + // Scaling with real components of elements from B. + bdcst_0 = _mm256_unpacklo_pd(b_vec_0, b_vec_0); + c_vec_0 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_0); + + // Scaling with beta, according to its type. 
+ switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - xmm5 = _mm_loadu_pd((double const*)(temp_c));//R(c[0][0]) I(c[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][0]) I(c[0][0]) - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc));//R(c[0][1]) I(c[0][1]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][1]) I(c[0][1]) - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm5,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 2));//R(c[0][2]) I(c[0][2]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][2]) I(c[0][2]) - //ymm7+=beta_real*R(c[0][2]) beta_real*I(c[0][2]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm7,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 3));//R(c[0][3]) I(c[0][3]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][3]) I(c[0][3]) - //ymm9+=beta_real*R(c[0][3]) beta_real*I(c[0][3]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm9,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 4));//R(c[0][4]) I(c[0][4]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][4]) I(c[0][4]) - //ymm11+=beta_real*R(c[0][4]) beta_real*I(c[0][4]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm11,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 5));//R(c[0][5]) I(c[0][5]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][5]) I(c[0][5]) - //ymm13+=beta_real*R(c[0][5]) beta_real*I(c[0][5]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm13,ymm15); + case BLIS_MUL_ZERO : + break; + + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij)); + c_vec_0 = _mm256_add_pd(c_vec_0, a_vec_0); + break; + + default : + // Broadcast beta and redirect to the beta scaling macro. 
+ bdcst_0 = _mm256_broadcast_sd((double const*)(&beta_real)); + bdcst_1 = _mm256_broadcast_sd((double const*)(&beta_imag)); + + BETA_SCALING_C_FRINGE(c_vec_0, temp_cij); } - if(beta_imag != 0.0) + // Storing the result in C. + _mm256_storeu_pd((double *)(temp_cij), c_vec_0); + + // Adjusting the addresses of A and C for the next block. + temp_cij += 2; + temp_ai += 2; + + m_rem -= 2; + } + + // Main loop along M dimension. + for( dim_t i = 0; i < m_iter; i++ ) + { + // Scratch registers + __m256d c_vec_0, c_vec_1; + + a_vec_0 = _mm256_setzero_pd(); + a_vec_1 = _mm256_setzero_pd(); + bdcst_0 = _mm256_setzero_pd(); + bdcst_1 = _mm256_setzero_pd(); + c_vec_0 = _mm256_setzero_pd(); + c_vec_1 = _mm256_setzero_pd(); + + // Prefetching the block of C to be used for computation. + _mm_prefetch((char const*)(temp_cij), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc*2), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc*3), _MM_HINT_T0); + + // Loading vectors from A with Z_MR elements in total. + a_vec_0 = _mm256_loadu_pd((double const *)(temp_ai)); + a_vec_1 = _mm256_loadu_pd((double const *)(temp_ai + 2)); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); + + // Scaling with imaginary components of elements from B. + bdcst_0 = _mm256_unpackhi_pd(b_vec_0, b_vec_0); + c_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_1 = _mm256_mul_pd(a_vec_1, bdcst_0); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); + + // Scaling with real components of elements from B. + bdcst_0 = _mm256_unpacklo_pd(b_vec_0, b_vec_0); + c_vec_0 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_0); + c_vec_1 = _mm256_fmaddsub_pd(a_vec_1, bdcst_0, c_vec_1); + + // Scaling with beta, according to its type. 
+ switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - xmm5 = _mm_loadu_pd((double const*)(temp_c));//R(c[0][0]) I(c[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][0]) I(c[0][0]) - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc));//R(c[0][1]) I(c[0][1]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][1]) I(c[0][1]) - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm5,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 2));//R(c[0][2]) I(c[0][2]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][2]) I(c[0][2]) - //ymm7+=beta_imag*(-I(c[0][2])) beta_imag*R(c[0][2]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm7,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 3));//R(c[0][3]) I(c[0][3]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][3]) I(c[0][3]) - //ymm9+=beta_imag*(-I(c[0][3])) beta_imag*R(c[0][3]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm9,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 4));//R(c[0][4]) I(c[0][4]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][4]) I(c[0][4]) - //ymm11+=beta_imag*(-I(c[0][4])) beta_imag*R(c[0][4]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm11,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 5));//R(c[0][5]) I(c[0][5]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][5]) I(c[0][5]) - //ymm13+=beta_imag*(-I(c[0][5])) beta_imag*R(c[0][5]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2); - } + case BLIS_MUL_ZERO : + break; - xmm5 = _mm256_extractf128_pd(ymm3, 0); - _mm_storeu_pd((double *)(temp_c), xmm5); + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. 
+ a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + 2)); + c_vec_0 = _mm256_add_pd(c_vec_0, a_vec_0); + c_vec_1 = _mm256_add_pd(c_vec_1, a_vec_1); + break; - xmm5 = _mm256_extractf128_pd(ymm5, 0); - _mm_storeu_pd((double *)(temp_c + ldc), xmm5); + default : + // Broadcast beta and redirect to the beta scaling macro. + bdcst_0 = _mm256_broadcast_sd((double const*)(&beta_real)); + bdcst_1 = _mm256_broadcast_sd((double const*)(&beta_imag)); - xmm5 = _mm256_extractf128_pd(ymm7, 0); - _mm_storeu_pd((double *)(temp_c + ldc*2), xmm5); + BETA_SCALING_C_MAIN(c_vec_0, c_vec_1, temp_cij); - xmm5 = _mm256_extractf128_pd(ymm9, 0); - _mm_storeu_pd((double *)(temp_c + ldc*3), xmm5); + } - xmm5 = _mm256_extractf128_pd(ymm11, 0); - _mm_storeu_pd((double *)(temp_c + ldc*4), xmm5); + // Storing the result in C. + _mm256_storeu_pd((double *)(temp_cij), c_vec_0); + _mm256_storeu_pd((double *)(temp_cij + 2), c_vec_1); - xmm5 = _mm256_extractf128_pd(ymm13, 0); - _mm_storeu_pd((double *)(temp_c + ldc*5), xmm5); + // Adjusting the addresses of A and C for the next iteration. + temp_cij += Z_MR; + temp_ai += Z_MR; } + temp_b += ldb; + temp_c += ldc; + + n_remainder -= 1; } - //Fringe case for N - if(n_remainder>=4) + if( n_remainder == 2 ) { - dcomplex* temp_b = b + (n - n_remainder)*ldb; - dcomplex* temp_a = a; - dcomplex* temp_c = c + (n - n_remainder)*ldc; - - //Main loop for M - for(dim_t i = 0;i < (m-Z_MR+1);i=i+Z_MR) + // Setting the panel addresses for A B, and C, to be used along m-loop + dcomplex *temp_ai = a; + dcomplex *temp_bj = temp_b; + dcomplex *temp_cij = temp_c; + + /* Multiple blocks of Z_MR x 1(main loop for m) and/or m_remainder x 1 block(s) + of A use the same 1 x 2 block of B in order to compute the associated Z_MR x 2 + and/or m_remainder x 2 block of C. 
This reusability has been exploited, wherein + the associated 1 x 2 block of B is scaled with alpha, and stored in + registers beforehand, to be reused in the main loop or fringe case of m. */ + + // Intermediate registers used for alpha scaling the block of B and storing. + __m256d a_vec_0, a_vec_1; + __m256d b_vec_0, b_vec_1; + __m256d b_real_0, b_real_1; + __m256d b_imag_0, b_imag_1; + __m256d bdcst_0, bdcst_1; + + /* Broadcasting real and imaginary components of elements from B + and unpacking them to set them in registers in the form : + { Real_part, Imag_part, Real_part, Imag_part }. + + A total of Z_NR registers are used to store the alpha-scaled B + for reuse. */ + + b_real_0 = _mm256_broadcast_sd((double const *)(temp_bj)); + b_imag_0 = _mm256_broadcast_sd((double const *)(temp_bj) + 1); + b_vec_0 = _mm256_unpacklo_pd(b_real_0, b_imag_0); + + b_real_1 = _mm256_broadcast_sd((double const *)(temp_bj + ldb)); + b_imag_1 = _mm256_broadcast_sd((double const *)(temp_bj + ldb) + 1); + b_vec_1 = _mm256_unpacklo_pd(b_real_1, b_imag_1); + + // Broadcast elements from alpha, and exhibit the compute for complex scaling. + a_vec_0 = _mm256_broadcast_sd((double const *)(&alpha_real)); + a_vec_1 = _mm256_broadcast_sd((double const *)(&alpha_imag)); + + bdcst_0 = _mm256_unpacklo_pd(b_imag_0, b_real_0); + bdcst_1 = _mm256_unpacklo_pd(b_imag_1, b_real_1); + bdcst_0 = _mm256_mul_pd(a_vec_1, bdcst_0); + bdcst_1 = _mm256_mul_pd(a_vec_1, bdcst_1); + b_vec_0 = _mm256_fmaddsub_pd(a_vec_0, b_vec_0, bdcst_0); + b_vec_1 = _mm256_fmaddsub_pd(a_vec_0, b_vec_1, bdcst_1); + + // Fringe cases in the m-direction. + dim_t m_rem = m_remainder; + if ( ( m_rem & 0x1 ) == 1 ) { - ymm3 = _mm256_setzero_pd(); - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm8 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - ymm10 = _mm256_setzero_pd(); - - /* - a. 
Perform alpha*A*B using temp_a, temp_b and alpha_real, alpha_imag - where alpha_real and/or alpha_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_b where - computing all Z_MR rows of temp_a. - c. Same approach is used in remaining fringe cases. - */ - - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - //R(a[2][0]) I(a[2][0]) R(a[3][0]) I(a[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_a + 2)); - - ymm13 = ymm0; - ymm14 = ymm1; - SCALE_ALPHA_REAL_M_LOOP(ymm0,ymm1,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm15,ymm2,alpha_imag); - - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - - For ymm1 : - R(a[2][0]) = alpha_real*R(a[2][0])-alpha_imag*I(a[2][0]) - I(a[2][0]) = alpha_real*I(a[2][0])+alpha_imag*R[2][0] - R(a[3][0]) = alpha_real*R(a[3][0])-alpha_imag*I(a[3][0]) - I(a[3][0]) = alpha_real*I(a[3][0])+alpha_imag*(R[3][0]) - */ - - //Calculating using real part of complex number in B matrix - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)); - FMA_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm2,(double const *)(temp_b+ldb)); - FMA_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm2,(double const *)(temp_b+ldb*2)); - FMA_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm2,(double const *)(temp_b+ldb*3)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 and ymm1 in accordance to the requirement - NEG_PERM_M_LOOP(ymm0,ymm1,ymm2); - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)+1); - FMA_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm2,(double const *)(temp_b+ldb)+1); - FMA_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm2,(double const *)(temp_b+ldb*2)+1); - 
FMA_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm2,(double const *)(temp_b+ldb*3)+1); - - /* - a. Perform beta*C using temp_c, beta_real, - where beta_real is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. - */ - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - //ymm4+=beta_real*R(c[2][0]) beta_real*I(c[2][0]) - // beta_real*R(c[3][0]) beta_real*I(c[3][0]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //R(c[2][1]) I(c[2][1]) R(c[3][1]) I(c[3][1]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc + 2)); - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - // beta_real*R(c[1][1]) beta_real*I(c[1][1]) - //ymm6+=beta_real*R(c[2][1]) beta_real*I(c[2][1]) - // beta_real*R(c[3][1]) beta_real*I(c[3][1]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm15); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //R(c[2][2]) I(c[2][2]) R(c[3][2]) I(c[3][2]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*2 + 2)); - //ymm7+=beta_real*R(c[0][2]) beta_real*I(c[0][2]) - // beta_real*R(c[1][2]) beta_real*I(c[1][2]) - //ymm8+=beta_real*R(c[2][2]) beta_real*I(c[2][2]) - // beta_real*R(c[3][2]) beta_real*I(c[3][2]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm15); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const 
*)(temp_c + ldc*3)); - //R(c[2][3]) I(c[2][3]) R(c[3][3]) I(c[3][3]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*3 + 2)); - //ymm9+=beta_real*R(c[0][3]) beta_real*I(c[0][3]) - // beta_real*R(c[1][3]) beta_real*I(c[1][3]) - //ymm10+=beta_real*R(c[2][3]) beta_real*I(c[2][3]) - // beta_real*R(c[3][3]) beta_real*I(c[3][3]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm15); - } - /* - a. Perform beta*C using temp_c, beta_imag, - where beta_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. - */ - - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - //ymm4+=beta_imag*(-I(c[2][0])) beta_imag*R(c[2][0]) - // beta_imag*(-I(c[3][0])) beta_imag*R(c[3][0]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15,ymm2); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //R(c[2][1]) I(c[2][1]) R(c[3][1]) I(c[3][1]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc + 2)); - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - // beta_imag*(-I(c[1][1])) beta_imag*R(c[1][1]) - //ymm6+=beta_imag*(-I(c[2][1])) beta_imag*R(c[2][1]) - // beta_imag*(-I(c[3][1])) beta_imag*R(c[3][1]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm15,ymm2); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //R(c[2][2]) I(c[2][2]) R(c[3][2]) I(c[3][2]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 
ldc*2 + 2)); - //ymm7+=beta_imag*(-I(c[0][2])) beta_imag*R(c[0][2]) - // beta_imag*(-I(c[1][2])) beta_imag*R(c[1][2]) - //ymm8+=beta_imag*(-I(c[2][2])) beta_imag*R(c[2][2]) - // beta_imag*(-I(c[3][2])) beta_imag*R(c[3][2]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm7,ymm8,ymm15,ymm2); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //R(c[2][3]) I(c[2][3]) R(c[3][3]) I(c[3][3]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc*3 + 2)); - //ymm9+=beta_imag*(-I(c[0][3])) beta_imag*R(c[0][3]) - // beta_imag*(-I(c[1][3])) beta_imag*R(c[1][3]) - //ymm10+=beta_imag*(-I(c[2][3])) beta_imag*R(c[2][3]) - // beta_imag*(-I(c[3][3])) beta_imag*R(c[3][3]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm9,ymm10,ymm15,ymm2); - } - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading + // Scratch registers. + __m256d b_scaled_0, b_perm_0, a_real, a_imag; - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ + __m128d b_element_0, b_element_1, c_element_0, c_element_1; + __m128d beta_real_reg, beta_imag_reg, c_perm_0, c_perm_1; - _mm256_storeu_pd((double *)(temp_c), ymm3); - _mm256_storeu_pd((double *)(temp_c + 2), ymm4); + b_scaled_0 = _mm256_setzero_pd(); + b_perm_0 = _mm256_setzero_pd(); - _mm256_storeu_pd((double *)(temp_c + ldc), ymm5); - _mm256_storeu_pd((double *)(temp_c + ldc + 2), ymm6); + /* Here, only a single element from A is of concern. + Also, we already have alpha-scaled B available in + b_vec_0 and b_vec_1. 
Thus, we could scale these + registers with the element from A using AVX2 ISA */ - _mm256_storeu_pd((double *)(temp_c + ldc*2), ymm7); - _mm256_storeu_pd((double *)(temp_c + ldc*2 + 2), ymm8); + // Broadcasting real and imaginary components from A. - _mm256_storeu_pd((double *)(temp_c + ldc*3), ymm9); - _mm256_storeu_pd((double *)(temp_c + ldc*3 + 2), ymm10); + a_real = _mm256_broadcast_sd((double const *)(temp_ai)); + a_imag = _mm256_broadcast_sd((double const *)(temp_ai) + 1); - temp_c+=Z_MR; - temp_a+=Z_MR; - } + // Obtaining the alpha-scaled B matrix - // Fringe cases for M - dim_t m_rem=m_remainder; - if(m_rem>=2) - { - ymm3 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - - - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - */ - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)); - //ymm7+=R(b[0][2])*R(a[0][0]) R(b[0][2])*I(a[0][0]) - // R(b[0][2])*R(a[1][0]) R(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)); - //ymm9+=R(b[0][3])*R(a[0][0]) R(b[0][3])*I(a[0][0]) - // R(b[0][3])*R(a[1][0]) R(b[0][3])*I(a[1][0]) - 
FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); - - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); - //ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)+1); - //ymm7+=I(b[0][2])*R(a[0][0]) I(b[0][2])*I(a[0][0]) - // I(b[0][2])*R(a[1][0]) I(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)+1); - //ymm9+=I(b[0][3])*R(a[0][0]) I(b[0][3])*I(a[0][0]) - // I(b[0][3])*R(a[1][0]) I(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)+1); - - - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - // beta_real*R(c[1][1]) beta_real*I(c[1][1]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm5,ymm15); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //ymm7+=beta_real*R(c[0][2]) beta_real*I(c[0][2]) - // beta_real*R(c[1][2]) beta_real*I(c[1][2]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm7,ymm15); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //ymm9+=beta_real*R(c[0][3]) beta_real*I(c[0][3]) - // beta_real*R(c[1][3]) beta_real*I(c[1][3]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm9,ymm15); - } + b_scaled_0 = 
_mm256_permute2f128_pd(b_vec_0, b_vec_1, 0x20); + b_perm_0 = _mm256_permute_pd(b_scaled_0, 0x5); + + b_perm_0 = _mm256_mul_pd(b_perm_0, a_imag); + b_scaled_0 = _mm256_fmaddsub_pd(b_scaled_0, a_real, b_perm_0); - if(beta_imag != 0.0) + c_element_0 = _mm256_castpd256_pd128(b_scaled_0); + c_element_1 = _mm256_extractf128_pd(b_scaled_0, 0x01); + + // Clearing out the upper lanes of 256 bit registers to avoid + // the transition penalty + _mm256_zeroupper(); + + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - // beta_imag*(-I(c[1][1])) beta_imag*R(c[1][1]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm5,ymm15,ymm2); - - //R(c[0][2]) I(c[0][2]) R(c[1][2]) I(c[1][2]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*2)); - //ymm7+=beta_imag*(-I(c[0][2])) beta_imag*R(c[0][2]) - // beta_imag*(-I(c[1][2])) beta_imag*R(c[1][2]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm7,ymm15,ymm2); - - //R(c[0][3]) I(c[0][3]) R(c[1][3]) I(c[1][3]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc*3)); - //ymm9+=beta_imag*(-I(c[0][3])) beta_imag*R(c[0][3]) - // beta_imag*(-I(c[1][3])) beta_imag*R(c[1][3]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm9,ymm15,ymm2); - } + case BLIS_MUL_ZERO : + break; - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for 
scaling C using permutation - and selective negation, after loading + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. + b_element_0 = _mm_loadu_pd((double const*)(temp_cij)); + c_element_0 = _mm_add_pd(c_element_0, b_element_0); - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ + b_element_1 = _mm_loadu_pd((double const*)(temp_cij + ldc)); + c_element_1 = _mm_add_pd(c_element_1, b_element_1); + break; - _mm256_storeu_pd((double *)(temp_c), ymm3); - _mm256_storeu_pd((double *)(temp_c + ldc), ymm5); - _mm256_storeu_pd((double *)(temp_c + ldc*2), ymm7); - _mm256_storeu_pd((double *)(temp_c + ldc*3), ymm9); + default : + // Broadcast beta real and imaginary part and scale with C. + beta_real_reg = _mm_loaddup_pd((double const*)beta); + beta_imag_reg = _mm_loaddup_pd((double const*)beta + 1); - temp_c+=2; - temp_a+=2; + // Load C onto registers + b_element_0 = _mm_loadu_pd((double const*)(temp_cij)); + b_element_1 = _mm_loadu_pd((double const*)(temp_cij + ldc)); - m_rem -= 2; - } + // Shuffle for the compute with imaginary part scaling + c_perm_0 = _mm_shuffle_pd(b_element_0, b_element_0, 0x01); + c_perm_1 = _mm_shuffle_pd(b_element_1, b_element_1, 0x01); - if(m_rem==1) - { + c_perm_0 = _mm_mul_pd(beta_imag_reg, c_perm_0); + c_perm_1 = _mm_mul_pd(beta_imag_reg, c_perm_1); - xmm5 = _mm_setzero_pd(); - ymm3 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - - xmm5 = _mm_loadu_pd((double const*)(temp_a));//R(a[0][0]) I(a[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(a[0][0]) I(a[0][0]) - - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) -
FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)); - //ymm7+=R(b[0][2])*R(a[0][0]) R(b[0][2])*I(a[0][0]) - // R(b[0][2])*R(a[1][0]) R(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)); - //ymm9+=R(b[0][3])*R(a[0][0]) R(b[0][3])*I(a[0][0]) - // R(b[0][3])*R(a[1][0]) R(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); - - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); - //ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)+1); - //ymm7+=I(b[0][2])*R(a[0][0]) I(b[0][2])*I(a[0][0]) - // I(b[0][2])*R(a[1][0]) I(b[0][2])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm7,ymm2,(double const *)(temp_b+ldb*2)+1); - //ymm9+=I(b[0][3])*R(a[0][0]) I(b[0][3])*I(a[0][0]) - // I(b[0][3])*R(a[1][0]) I(b[0][3])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm9,ymm2,(double const *)(temp_b+ldb*3)+1); - - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - xmm5 = _mm_loadu_pd((double const*)(temp_c));//R(c[0][0]) I(c[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][0]) I(c[0][0]) - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc));//R(c[0][1]) I(c[0][1]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][1]) I(c[0][1]) - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm5,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 
2));//R(c[0][2]) I(c[0][2]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][2]) I(c[0][2]) - //ymm7+=beta_real*R(c[0][2]) beta_real*I(c[0][2]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm7,ymm15); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 3));//R(c[0][3]) I(c[0][3]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][3]) I(c[0][3]) - //ymm9+=beta_real*R(c[0][3]) beta_real*I(c[0][3]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm9,ymm15); - } + b_element_0 = _mm_mul_pd(beta_real_reg, b_element_0); + b_element_1 = _mm_mul_pd(beta_real_reg, b_element_1); - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - xmm5 = _mm_loadu_pd((double const*)(temp_c));//R(c[0][0]) I(c[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][0]) I(c[0][0]) - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc));//R(c[0][1]) I(c[0][1]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][1]) I(c[0][1]) - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm5,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 2));//R(c[0][2]) I(c[0][2]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][2]) I(c[0][2]) - //ymm7+=beta_imag*(-I(c[0][2])) beta_imag*R(c[0][2]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm7,ymm15,ymm2); - - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc * 3));//R(c[0][3]) I(c[0][3]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][3]) I(c[0][3]) - //ymm9+=beta_imag*(-I(c[0][3])) beta_imag*R(c[0][3]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm9,ymm15,ymm2); - } + // Compute beta-scaled C + b_element_0 = _mm_addsub_pd(b_element_0, c_perm_0); + b_element_1 = _mm_addsub_pd(b_element_1, c_perm_1); - xmm5 = _mm256_extractf128_pd(ymm3, 0); - _mm_storeu_pd((double *)(temp_c), xmm5); + // Add to intermediate reg storing alpha*A*B + c_element_0 = _mm_add_pd(b_element_0, c_element_0); + c_element_1 = 
_mm_add_pd(b_element_1, c_element_1); + } - xmm5 = _mm256_extractf128_pd(ymm5, 0); - _mm_storeu_pd((double *)(temp_c + ldc), xmm5); + // Storing the result in C. + _mm_storeu_pd((double *)(temp_cij), c_element_0); + _mm_storeu_pd((double *)(temp_cij + ldc), c_element_1); - xmm5 = _mm256_extractf128_pd(ymm7, 0); - _mm_storeu_pd((double *)(temp_c + ldc*2), xmm5); + // We need to restore the upper lanes of the registers b_vec_0 and + // b_vec_1. + // They need to contain the alpha scaled B, to be reused in the main loop for m + b_element_0 = _mm256_castpd256_pd128(b_vec_0); + b_element_1 = _mm256_extractf128_pd(b_vec_1, 0x00); + b_vec_0 = _mm256_insertf128_pd(b_vec_0, b_element_0, 0x01); + b_vec_1 = _mm256_insertf128_pd(b_vec_1, b_element_1, 0x01); - xmm5 = _mm256_extractf128_pd(ymm9, 0); - _mm_storeu_pd((double *)(temp_c + ldc*3), xmm5); + // Adjusting the addresses of A and C for the next block. + temp_cij += 1; + temp_ai += 1; + m_rem -= 1; } - n_remainder -= 4; - } - if(n_remainder>=2) - { - dcomplex* temp_b = b + (n - n_remainder)*ldb; - dcomplex* temp_a = a; - dcomplex* temp_c = c + (n - n_remainder)*ldc; - for(dim_t i = 0;i < (m-Z_MR+1);i=i+Z_MR) + if( m_rem == 2 ) { - ymm3 = _mm256_setzero_pd(); - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); - - /* - a. Perform alpha*A*B using temp_a, temp_b and alpha_real, alpha_imag - where alpha_real and/or alpha_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_b where - computing all Z_MR rows of temp_a. - c. Same approach is used in remaining fringe cases.
- */ - - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - //R(a[2][0]) I(a[2][0]) R(a[3][0]) I(a[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_a + 2)); - - ymm13 = ymm0; - ymm14 = ymm1; - SCALE_ALPHA_REAL_M_LOOP(ymm0,ymm1,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm15,ymm2,alpha_imag); - - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - - For ymm1 : - R(a[2][0]) = alpha_real*R(a[2][0])-alpha_imag*I(a[2][0]) - I(a[2][0]) = alpha_real*I(a[2][0])+alpha_imag*R[2][0] - R(a[3][0]) = alpha_real*R(a[3][0])-alpha_imag*I(a[3][0]) - I(a[3][0]) = alpha_real*I(a[3][0])+alpha_imag*(R[3][0]) - */ - - //Calculating using real part of complex number in B matrix - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)); - FMA_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm2,(double const *)(temp_b+ldb)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 and ymm1 in accordance to the requirement - NEG_PERM_M_LOOP(ymm0,ymm1,ymm2); - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)+1); - FMA_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm2,(double const *)(temp_b+ldb)+1); - - /* - a. Perform beta*C using temp_c, beta_real, - where beta_real is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. 
- */ - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - //ymm4+=beta_real*R(c[2][0]) beta_real*I(c[2][0]) - // beta_real*R(c[3][0]) beta_real*I(c[3][0]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //R(c[2][1]) I(c[2][1]) R(c[3][1]) I(c[3][1]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc + 2)); - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - // beta_real*R(c[1][1]) beta_real*I(c[1][1]) - //ymm6+=beta_real*R(c[2][1]) beta_real*I(c[2][1]) - // beta_real*R(c[3][1]) beta_real*I(c[3][1]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm15); - } + // Scratch registers. + __m256d c_vec_0, c_vec_2; - /* - a. Perform beta*C using temp_c, beta_imag, - where beta_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. 
- */ - - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - //ymm4+=beta_imag*(-I(c[2][0])) beta_imag*R(c[2][0]) - // beta_imag*(-I(c[3][0])) beta_imag*R(c[3][0]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15,ymm2); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //R(c[2][1]) I(c[2][1]) R(c[3][1]) I(c[3][1]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + ldc + 2)); - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - // beta_imag*(-I(c[1][1])) beta_imag*R(c[1][1]) - //ymm6+=beta_imag*(-I(c[2][1])) beta_imag*R(c[2][1]) - // beta_imag*(-I(c[3][1])) beta_imag*R(c[3][1]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm5,ymm6,ymm15,ymm2); - } - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading - - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ - - _mm256_storeu_pd((double *)(temp_c), ymm3); - _mm256_storeu_pd((double *)(temp_c + 2), ymm4); - - _mm256_storeu_pd((double *)(temp_c + ldc), ymm5); - _mm256_storeu_pd((double *)(temp_c + ldc + 2), ymm6); - - temp_c+=Z_MR; - temp_a+=Z_MR; - } + a_vec_0 = _mm256_setzero_pd(); + a_vec_1 = _mm256_setzero_pd(); + bdcst_0 = _mm256_setzero_pd(); + bdcst_1 = _mm256_setzero_pd(); + c_vec_0 = _mm256_setzero_pd(); + c_vec_2 = 
_mm256_setzero_pd(); - dim_t m_rem=m_remainder; - if(m_rem>=2) - { - ymm3 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - - - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - */ - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); - - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); - //ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)+1); - - - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const 
*)(temp_c + ldc)); - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - // beta_real*R(c[1][1]) beta_real*I(c[1][1]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm5,ymm15); - } + // Loading a vector from A with 2 elements. + a_vec_0 = _mm256_loadu_pd((double const *)(temp_ai)); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + + // Scaling with imaginary components of elements from B. + bdcst_0 = _mm256_unpackhi_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpackhi_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_2 = _mm256_mul_pd(a_vec_0, bdcst_1); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); - if(beta_imag != 0.0) + // Scaling with real components of elements from B. + bdcst_0 = _mm256_unpacklo_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpacklo_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_0); + c_vec_2 = _mm256_fmaddsub_pd(a_vec_0, bdcst_1, c_vec_2); + + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); - - //R(c[0][1]) I(c[0][1]) R(c[1][1]) I(c[1][1]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c + ldc)); - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - // beta_imag*(-I(c[1][1])) beta_imag*R(c[1][1]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm5,ymm15,ymm2); - } + case BLIS_MUL_ZERO : + break; + + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. 
+ a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij)); + c_vec_0 = _mm256_add_pd(c_vec_0, a_vec_0); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc)); + c_vec_2 = _mm256_add_pd(c_vec_2, a_vec_0); + break; - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading + default : + // Broadcast beta and redirect to the beta scaling macro. + bdcst_0 = _mm256_broadcast_sd((double const*)(&beta_real)); + bdcst_1 = _mm256_broadcast_sd((double const*)(&beta_imag)); - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ + BETA_SCALING_C_FRINGE(c_vec_0, temp_cij); + BETA_SCALING_C_FRINGE(c_vec_2, temp_cij + ldc); - _mm256_storeu_pd((double *)(temp_c), ymm3); - _mm256_storeu_pd((double *)(temp_c + ldc), ymm5); + } + + // Storing the result in C. + _mm256_storeu_pd((double *)(temp_cij), c_vec_0); + _mm256_storeu_pd((double *)(temp_cij + ldc), c_vec_2); - temp_c+=2; - temp_a+=2; + // Adjusting the addresses of A and C for the next block. + temp_cij += 2; + temp_ai += 2; m_rem -= 2; } - if(m_rem==1) + // Main loop along M dimension. 
+ for( dim_t i = 0; i < m_iter; i++ ) { - - xmm5 = _mm_setzero_pd(); - ymm3 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - - xmm5 = _mm_loadu_pd((double const*)(temp_a));//R(a[0][0]) I(a[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(a[0][0]) I(a[0][0]) - - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); - - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); - //ymm5+=R(b[0][1])*R(a[0][0]) R(b[0][1])*I(a[0][0]) - // R(b[0][1])*R(a[1][0]) R(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); - - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); - //ymm5+=I(b[0][1])*R(a[0][0]) I(b[0][1])*I(a[0][0]) - // I(b[0][1])*R(a[1][0]) I(b[0][1])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm5,ymm2,(double const *)(temp_b+ldb)+1); - - if(beta_real != 0.0) + // Scratch registers + __m256d c_vec_0, c_vec_1, c_vec_2, c_vec_3; + + a_vec_0 = _mm256_setzero_pd(); + a_vec_1 = _mm256_setzero_pd(); + bdcst_0 = _mm256_setzero_pd(); + bdcst_1 = _mm256_setzero_pd(); + c_vec_0 = _mm256_setzero_pd(); + c_vec_1 = _mm256_setzero_pd(); + c_vec_2 = _mm256_setzero_pd(); + c_vec_3 = _mm256_setzero_pd(); + + // Prefetching the block of C to be used for computation. + _mm_prefetch((char const*)(temp_cij), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc*2), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc*3), _MM_HINT_T0); + + // Loading vectors from A with Z_MR elements in total. 
+ a_vec_0 = _mm256_loadu_pd((double const *)(temp_ai)); + a_vec_1 = _mm256_loadu_pd((double const *)(temp_ai + 2)); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); + + // Scaling with imaginary components of elements from B. + bdcst_0 = _mm256_unpackhi_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpackhi_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_1 = _mm256_mul_pd(a_vec_1, bdcst_0); + c_vec_2 = _mm256_mul_pd(a_vec_0, bdcst_1); + c_vec_3 = _mm256_mul_pd(a_vec_1, bdcst_1); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); + + // Scaling with real components of elements from B. + bdcst_0 = _mm256_unpacklo_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpacklo_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_0); + c_vec_1 = _mm256_fmaddsub_pd(a_vec_1, bdcst_0, c_vec_1); + c_vec_2 = _mm256_fmaddsub_pd(a_vec_0, bdcst_1, c_vec_2); + c_vec_3 = _mm256_fmaddsub_pd(a_vec_1, bdcst_1, c_vec_3); + + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - xmm5 = _mm_loadu_pd((double const*)(temp_c));//R(c[0][0]) I(c[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][0]) I(c[0][0]) - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); + case BLIS_MUL_ZERO : + break; + + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. 
+ a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + 2)); + c_vec_0 = _mm256_add_pd(c_vec_0, a_vec_0); + c_vec_1 = _mm256_add_pd(c_vec_1, a_vec_1); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + ldc + 2)); + c_vec_2 = _mm256_add_pd(c_vec_2, a_vec_0); + c_vec_3 = _mm256_add_pd(c_vec_3, a_vec_1); + break; + + default : + // Broadcast beta and redirect to the beta scaling macro. + bdcst_0 = _mm256_broadcast_sd((double const*)(&beta_real)); + bdcst_1 = _mm256_broadcast_sd((double const*)(&beta_imag)); + + BETA_SCALING_C_MAIN(c_vec_0, c_vec_1, temp_cij); + BETA_SCALING_C_MAIN(c_vec_2, c_vec_3, temp_cij + ldc); - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc));//R(c[0][1]) I(c[0][1]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][1]) I(c[0][1]) - //ymm5+=beta_real*R(c[0][1]) beta_real*I(c[0][1]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm5,ymm15); } - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); + // Storing the result in C. + _mm256_storeu_pd((double *)(temp_cij), c_vec_0); + _mm256_storeu_pd((double *)(temp_cij + 2), c_vec_1); - xmm5 = _mm_loadu_pd((double const*)(temp_c));//R(c[0][0]) I(c[0][0]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][0]) I(c[0][0]) - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); + _mm256_storeu_pd((double *)(temp_cij + ldc), c_vec_2); + _mm256_storeu_pd((double *)(temp_cij + ldc + 2), c_vec_3); - xmm5 = _mm_loadu_pd((double const*)(temp_c + ldc));//R(c[0][1]) I(c[0][1]) - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0);//R(c[0][1]) I(c[0][1]) - //ymm5+=beta_imag*(-I(c[0][1])) beta_imag*R(c[0][1]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm5,ymm15,ymm2); - } + // Adjusting the addresses of A and C for the next iteration. 
+ temp_cij += Z_MR; + temp_ai += Z_MR; - xmm5 = _mm256_extractf128_pd(ymm3, 0); - _mm_storeu_pd((double *)(temp_c), xmm5); + } - xmm5 = _mm256_extractf128_pd(ymm5, 0); - _mm_storeu_pd((double *)(temp_c + ldc), xmm5); + temp_b += ldb*2; + temp_c += ldc*2; - } n_remainder -= 2; } - if(n_remainder==1) - { - dcomplex* temp_b = b + (n - n_remainder)*ldb; - dcomplex* temp_a = a; - dcomplex* temp_c = c + (n - n_remainder)*ldc; - // Main loop for M - for(dim_t i = 0;i < (m-Z_MR+1);i=i+Z_MR) + // Main loop along N dimension + for( dim_t j = 0; j < n_iter; j++ ) + { + dcomplex* temp_bj = temp_b + j * ldb * Z_NR; + dcomplex* temp_ai = a; + dcomplex* temp_cij = temp_c + j * ldc * Z_NR; + + /* Multiple blocks of Z_MR x 1(main loop for m) and/or m_remainder x 1 block(s) + of A use the same 1 x Z_NR block of B in order to compute the associated + Z_MR x Z_NR and/or m_remainder x Z_NR block(s) of C. This reusability has been + exploited, wherein the associated 1 x Z_NR block of B is scaled with alpha, + and stored in registers beforehand, to be reused in the main loop or fringe + case of m. */ + + // Intermediate registers used for alpha scaling the block of B and storing. + __m256d a_vec_0, a_vec_1; + __m256d b_vec_0, b_vec_1, b_vec_2, b_vec_3; + __m256d b_real_0, b_real_1, b_real_2, b_real_3; + __m256d b_imag_0, b_imag_1, b_imag_2, b_imag_3; + __m256d bdcst_0, bdcst_1; + + /* Broadcasting real and imaginary components of elements from B + and unpacking them to set them in registers in the form : + { Real_part, Imag_part, Real_part, Imag_part }. + + A total of Z_NR registers are used to store the alpha-scaled B + for reuse. 
*/ + + b_real_0 = _mm256_broadcast_sd((double const *)(temp_bj)); + b_imag_0 = _mm256_broadcast_sd((double const *)(temp_bj) + 1); + b_vec_0 = _mm256_unpacklo_pd(b_real_0, b_imag_0); + + b_real_1 = _mm256_broadcast_sd((double const *)(temp_bj + ldb)); + b_imag_1 = _mm256_broadcast_sd((double const *)(temp_bj + ldb) + 1); + b_vec_1 = _mm256_unpacklo_pd(b_real_1, b_imag_1); + + b_real_2 = _mm256_broadcast_sd((double const *)(temp_bj + ldb*2)); + b_imag_2 = _mm256_broadcast_sd((double const *)(temp_bj + ldb*2) + 1); + b_vec_2 = _mm256_unpacklo_pd(b_real_2, b_imag_2); + + b_real_3 = _mm256_broadcast_sd((double const *)(temp_bj + ldb*3)); + b_imag_3 = _mm256_broadcast_sd((double const *)(temp_bj + ldb*3) + 1); + b_vec_3 = _mm256_unpacklo_pd(b_real_3, b_imag_3); + + // Broadcast elements from alpha, and exhibit the compute for complex scaling. + a_vec_0 = _mm256_broadcast_sd((double const *)(&alpha_real)); + a_vec_1 = _mm256_broadcast_sd((double const *)(&alpha_imag)); + + bdcst_0 = _mm256_unpacklo_pd(b_imag_0, b_real_0); + bdcst_1 = _mm256_unpacklo_pd(b_imag_1, b_real_1); + bdcst_0 = _mm256_mul_pd(a_vec_1, bdcst_0); + bdcst_1 = _mm256_mul_pd(a_vec_1, bdcst_1); + b_vec_0 = _mm256_fmaddsub_pd(a_vec_0, b_vec_0, bdcst_0); + b_vec_1 = _mm256_fmaddsub_pd(a_vec_0, b_vec_1, bdcst_1); + + bdcst_0 = _mm256_unpacklo_pd(b_imag_2, b_real_2); + bdcst_1 = _mm256_unpacklo_pd(b_imag_3, b_real_3); + bdcst_0 = _mm256_mul_pd(a_vec_1, bdcst_0); + bdcst_1 = _mm256_mul_pd(a_vec_1, bdcst_1); + b_vec_2 = _mm256_fmaddsub_pd(a_vec_0, b_vec_2, bdcst_0); + b_vec_3 = _mm256_fmaddsub_pd(a_vec_0, b_vec_3, bdcst_1); + + // Fringe cases in the m-direction. + dim_t m_rem = m_remainder; + if ( ( m_rem & 0x1 ) == 1 ) { - ymm3 = _mm256_setzero_pd(); - ymm4 = _mm256_setzero_pd(); - - - /* - a. Perform alpha*A*B using temp_a, temp_b and alpha_real, aplha_vali - where alpha_real and/or alpha_imag is not zero. - b. 
This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_b where - computing all Z_MR rows of temp_a. - c. Same approach is used in remaining fringe cases. - */ - - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); - //R(a[2][0]) I(a[2][0]) R(a[3][0]) I(a[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_a + 2)); - - ymm13 = ymm0; - ymm14 = ymm1; - SCALE_ALPHA_REAL_M_LOOP(ymm0,ymm1,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_LOOP(ymm0,ymm1,ymm13,ymm14,ymm15,ymm2,alpha_imag); - - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - - For ymm1 : - R(a[2][0]) = alpha_real*R(a[2][0])-alpha_imag*I(a[2][0]) - I(a[2][0]) = alpha_real*I(a[2][0])+alpha_imag*R[2][0] - R(a[3][0]) = alpha_real*R(a[3][0])-alpha_imag*I(a[3][0]) - I(a[3][0]) = alpha_real*I(a[3][0])+alpha_imag*(R[3][0]) - */ - - //Calculating using real part of complex number in B matrix - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)); - - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 and ymm1 in accordance to the requirement - NEG_PERM_M_LOOP(ymm0,ymm1,ymm2); - FMA_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm2,(double const *)(temp_b)+1); - - /* - a. Perform beta*C using temp_c, beta_real, - where beta_real is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. 
- */ - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) I(c[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - //ymm4+=beta_real*R(c[2][0]) beta_real*I(c[2][0]) - // beta_real*R(c[3][0]) beta_real*I(c[3][0]) - SCALE_BETA_REAL_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15); - } + // Scratch registers. + __m256d b_scaled_0, b_perm_0, a_real, a_imag; - /* - a. Perform beta*C using temp_c, beta_imag, - where beta_imag is not zero. - b. This loop operates with 4x6 block size - along n dimension for every Z_NR columns of temp_c where - computing all Z_MR rows of temp_c. - c. Accumulated alpha*A*B into registers will be added to beta*C - d. Same approach is used in remaining fringe cases. - */ - - if(beta_imag != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); - - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - ymm1 = _mm256_loadu_pd((double const *)(temp_c + 2)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - //ymm4+=beta_imag*(-I(c[2][0])) beta_imag*R(c[2][0]) - // beta_imag*(-I(c[3][0])) beta_imag*R(c[3][0]) - SCALE_BETA_IMAG_M_LOOP(ymm0,ymm1,ymm3,ymm4,ymm15,ymm2); - } - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading - - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ - - //R(c[0][0]) I(c[0][0]) R(c[1][0]) 
I(c[1][0]) - _mm256_storeu_pd((double *)(temp_c), ymm3); - //R(c[2][0]) I(c[2][0]) R(c[3][0]) I(c[3][0]) - _mm256_storeu_pd((double *)(temp_c + 2), ymm4); - - temp_c+=Z_MR; - temp_a+=Z_MR; - } + __m128d b_element_0, b_element_1; + __m128d c_element_0, c_element_1, c_element_2, c_element_3; + __m128d beta_real_reg, beta_imag_reg, c_perm_0, c_perm_1; - // Fringe cases for M - dim_t m_rem=m_remainder; - if(m_rem>=2) - { - ymm3 = _mm256_setzero_pd(); + b_scaled_0 = _mm256_setzero_pd(); + b_perm_0 = _mm256_setzero_pd(); + /* Here, only a single element from A is of concern. + Also, we already have alpha-scaled B available in + b_vec_0 and b_vec_1. Thus, we could scale these + registers with the element from A using AVX2 ISA */ - //R(a[0][0]) I(a[0][0]) R(a[1][0]) I(a[1][0]) - ymm0 = _mm256_loadu_pd((double const *)(temp_a)); + // Broadcasting real and imaginary components from A. - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); + a_real = _mm256_broadcast_sd((double const *)(temp_ai)); + a_imag = _mm256_broadcast_sd((double const *)(temp_ai) + 1); - /* - The result after scaling with alpha_real and/or alpha_imag is as follows: - For ymm0 : - R(a[0][0]) = alpha_real*R(a[0][0])-alpha_imag*I(a[0][0]) - I(a[0][0]) = alpha_real*I(a[0][0])+alpha_imag*R[0][0] - R(a[1][0]) = alpha_real*R(a[1][0])-alpha_imag*I(a[1][0]) - I(a[1][0]) = alpha_real*I(a[1][0])+alpha_imag*(R[1][0]) - */ + // Obtaining the alpha-scaled B matrix - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); + b_scaled_0 = _mm256_permute2f128_pd(b_vec_0, b_vec_1, 0x20); + b_perm_0 = _mm256_permute_pd(b_scaled_0, 0x5); - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); 
+ b_perm_0 = _mm256_mul_pd(b_perm_0, a_imag); + b_scaled_0 = _mm256_fmaddsub_pd(b_scaled_0, a_real, b_perm_0); - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); + c_element_0 = _mm256_castpd256_pd128(b_scaled_0); + c_element_1 = _mm256_extractf128_pd(b_scaled_0, 0x01); + b_scaled_0 = _mm256_permute2f128_pd(b_vec_2, b_vec_3, 0x20); + b_perm_0 = _mm256_permute_pd(b_scaled_0, 0x5); - if(beta_real != 0.0) - { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); + b_perm_0 = _mm256_mul_pd(b_perm_0, a_imag); + b_scaled_0 = _mm256_fmaddsub_pd(b_scaled_0, a_real, b_perm_0); - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - // beta_real*R(c[1][0]) beta_real*I(c[1][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); - } + c_element_2 = _mm256_castpd256_pd128(b_scaled_0); + c_element_3 = _mm256_extractf128_pd(b_scaled_0, 0x01); + + // Clearing out the upper lanes of 256 bit registers to avoid + // the transition penalty + _mm256_zeroupper(); - if(beta_imag != 0.0) + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); + case BLIS_MUL_ZERO : + break; - ymm0 = _mm256_loadu_pd((double const *)(temp_c)); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - // beta_imag*(-I(c[1][0])) beta_imag*R(c[1][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); - } - /* - The scaling has been done sequentially as follows: - - If alpha_real is not 0, it is used for scaling A - - If alpha_imag is not 0, it is used for scaling A using permutation - and selective negation, after loading - - If beta_real is not 0, is is used for scaling C - - If beta_imag is not 0, it is used for scaling C using permutation - and selective negation, after loading + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. 
+ b_element_0 = _mm_loadu_pd((double const*)(temp_cij)); + c_element_0 = _mm_add_pd(c_element_0, b_element_0); - The results are accumalated in accordance to the non zero scalar values, - and similar approach is followed in fringe cases - */ + b_element_1 = _mm_loadu_pd((double const*)(temp_cij + ldc)); + c_element_1 = _mm_add_pd(c_element_1, b_element_1); - _mm256_storeu_pd((double *)(temp_c), ymm3); + b_element_0 = _mm_loadu_pd((double const*)(temp_cij + ldc*2)); + c_element_2 = _mm_add_pd(c_element_2, b_element_0); - temp_c+=2; - temp_a+=2; + b_element_1 = _mm_loadu_pd((double const*)(temp_cij + ldc*3)); + c_element_3 = _mm_add_pd(c_element_3, b_element_1); + break; - m_rem -= 2; - } + default : + // Broadcast beta real and imaginary part and scale with C. + beta_real_reg = _mm_loaddup_pd((double const*)beta); + beta_imag_reg = _mm_loaddup_pd((double const*)beta + 1); - if(m_rem==1) - { + // Load C onto registers + b_element_0 = _mm_loadu_pd((double const*)(temp_cij)); + b_element_1 = _mm_loadu_pd((double const*)(temp_cij + ldc)); - xmm5 = _mm_setzero_pd(); - ymm3 = _mm256_setzero_pd(); + // Shuffle for the compute with imgarinary part scaling + c_perm_0 = _mm_shuffle_pd(b_element_0, b_element_0, 0x01); + c_perm_1 = _mm_shuffle_pd(b_element_1, b_element_1, 0x01); - xmm5 = _mm_loadu_pd((double const*)(temp_a)); - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0); + c_perm_0 = _mm_mul_pd(beta_imag_reg, c_perm_0); + c_perm_1 = _mm_mul_pd(beta_imag_reg, c_perm_1); - ymm13 = ymm0; - SCALE_ALPHA_REAL_M_FRINGE(ymm0,ymm15,alpha_real); - SCALE_ALPHA_IMAG_M_FRINGE(ymm0,ymm13,ymm15,ymm2,alpha_imag); + b_element_0 = _mm_mul_pd(beta_real_reg, b_element_0); + b_element_1 = _mm_mul_pd(beta_real_reg, b_element_1); - //Calculating using real part of complex number in B matrix - //ymm3+=R(b[0][0])*R(a[0][0]) R(b[0][0])*I(a[0][0]) - // R(b[0][0])*R(a[1][0]) R(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)); + // Compute beta-scaled C + b_element_0 = 
_mm_addsub_pd(b_element_0, c_perm_0); + b_element_1 = _mm_addsub_pd(b_element_1, c_perm_1); - //Calculating using imaginary part of complex numbers in B matrix - //Shuffling ymm0 in accordance to the requirement - NEG_PERM_M_FRINGE(ymm0,ymm2); + // Add to intermediate reg storing alpha*A*B + c_element_0 = _mm_add_pd(b_element_0, c_element_0); + c_element_1 = _mm_add_pd(b_element_1, c_element_1); - // ymm3+=I(b[0][0])*R(a[0][0]) I(b[0][0])*I(a[0][0]) - // I(b[0][0])*R(a[1][0]) I(b[0][0])*I(a[1][0]) - FMA_M_FRINGE(ymm0,ymm3,ymm2,(double const *)(temp_b)+1); + // Load C onto registers + b_element_0 = _mm_loadu_pd((double const*)(temp_cij + ldc*2)); + b_element_1 = _mm_loadu_pd((double const*)(temp_cij + ldc*3)); + + // Shuffle for the compute with imgarinary part scaling + c_perm_0 = _mm_shuffle_pd(b_element_0, b_element_0, 0x01); + c_perm_1 = _mm_shuffle_pd(b_element_1, b_element_1, 0x01); + + c_perm_0 = _mm_mul_pd(beta_imag_reg, c_perm_0); + c_perm_1 = _mm_mul_pd(beta_imag_reg, c_perm_1); + + b_element_0 = _mm_mul_pd(beta_real_reg, b_element_0); + b_element_1 = _mm_mul_pd(beta_real_reg, b_element_1); + + // Compute beta-scaled C + b_element_0 = _mm_addsub_pd(b_element_0, c_perm_0); + b_element_1 = _mm_addsub_pd(b_element_1, c_perm_1); + + // Add to intermediate reg storing alpha*A*B + c_element_2 = _mm_add_pd(b_element_0, c_element_2); + c_element_3 = _mm_add_pd(b_element_1, c_element_3); + } + + // Storing the result in C. 
+ _mm_storeu_pd((double *)(temp_cij), c_element_0); + _mm_storeu_pd((double *)(temp_cij + ldc), c_element_1); + _mm_storeu_pd((double *)(temp_cij + ldc*2), c_element_2); + _mm_storeu_pd((double *)(temp_cij + ldc*3), c_element_3); + + // We need to restore the upper lanes of the registers b_vec_0, b_vec_1, + // b_vec_2 and b_vec_3 + // They need to contain the alpha scaled B, to be reused in the main loop for m + b_element_0 = _mm256_castpd256_pd128(b_vec_0); + b_element_1 = _mm256_castpd256_pd128(b_vec_1); + b_vec_0 = _mm256_insertf128_pd(b_vec_0, b_element_0, 0x01); + b_vec_1 = _mm256_insertf128_pd(b_vec_1, b_element_1, 0x01); + + b_element_0 = _mm256_castpd256_pd128(b_vec_2); + b_element_1 = _mm256_castpd256_pd128(b_vec_3); + b_vec_2 = _mm256_insertf128_pd(b_vec_2, b_element_0, 0x01); + b_vec_3 = _mm256_insertf128_pd(b_vec_3, b_element_1, 0x01); + + // Adjusting the addresses of A and C for the next block. + temp_cij += 1; + temp_ai += 1; + + m_rem -= 1; + } - if(beta_real != 0.0) + if( m_rem >= 2 ) + { + // Scratch registers. + __m256d c_vec_0, c_vec_2, c_vec_4, c_vec_6; + + a_vec_0 = _mm256_setzero_pd(); + a_vec_1 = _mm256_setzero_pd(); + bdcst_0 = _mm256_setzero_pd(); + bdcst_1 = _mm256_setzero_pd(); + c_vec_0 = _mm256_setzero_pd(); + c_vec_2 = _mm256_setzero_pd(); + c_vec_4 = _mm256_setzero_pd(); + c_vec_6 = _mm256_setzero_pd(); + + // Loading a vector from A with 2 elements. + a_vec_0 = _mm256_loadu_pd((double const *)(temp_ai)); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + + // Scaling with imaginary components of elements from B. 
+ bdcst_0 = _mm256_unpackhi_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpackhi_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_2 = _mm256_mul_pd(a_vec_0, bdcst_1); + + bdcst_0 = _mm256_unpackhi_pd(b_vec_2, b_vec_2); + bdcst_1 = _mm256_unpackhi_pd(b_vec_3, b_vec_3); + c_vec_4 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_6 = _mm256_mul_pd(a_vec_0, bdcst_1); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + + // Scaling with real components of elements from B. + bdcst_0 = _mm256_unpacklo_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpacklo_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_0); + c_vec_2 = _mm256_fmaddsub_pd(a_vec_0, bdcst_1, c_vec_2); + + bdcst_0 = _mm256_unpacklo_pd(b_vec_2, b_vec_2); + bdcst_1 = _mm256_unpacklo_pd(b_vec_3, b_vec_3); + c_vec_4 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_4); + c_vec_6 = _mm256_fmaddsub_pd(a_vec_0, bdcst_1, c_vec_6); + + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_real)); + case BLIS_MUL_ZERO : + break; + + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij)); + c_vec_0 = _mm256_add_pd(c_vec_0, a_vec_0); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc)); + c_vec_2 = _mm256_add_pd(c_vec_2, a_vec_0); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc*2)); + c_vec_4 = _mm256_add_pd(c_vec_4, a_vec_0); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc*3)); + c_vec_6 = _mm256_add_pd(c_vec_6, a_vec_0); + break; + + default : + // Broadcast beta and redirect to the beta scaling macro. 
+ bdcst_0 = _mm256_broadcast_sd((double const*)(&beta_real)); + bdcst_1 = _mm256_broadcast_sd((double const*)(&beta_imag)); + + BETA_SCALING_C_FRINGE(c_vec_0, temp_cij); + BETA_SCALING_C_FRINGE(c_vec_2, temp_cij + ldc); + BETA_SCALING_C_FRINGE(c_vec_4, temp_cij + ldc*2); + BETA_SCALING_C_FRINGE(c_vec_6, temp_cij + ldc*3); - xmm5 = _mm_loadu_pd((double const*)(temp_c)); - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0); - //ymm3+=beta_real*R(c[0][0]) beta_real*I(c[0][0]) - SCALE_BETA_REAL_M_FRINGE(ymm0,ymm3,ymm15); } - if(beta_imag != 0.0) + // Storing the result in C. + _mm256_storeu_pd((double *)(temp_cij), c_vec_0); + _mm256_storeu_pd((double *)(temp_cij + ldc), c_vec_2); + _mm256_storeu_pd((double *)(temp_cij + ldc*2), c_vec_4); + _mm256_storeu_pd((double *)(temp_cij + ldc*3), c_vec_6); + + // Adjusting the addresses of A and C for the next block. + temp_cij += 2; + temp_ai += 2; + + m_rem -= 2; + } + + // Main loop along M dimension. + for( dim_t i = 0; i < m_iter; i++ ) + { + // Scratch registers. + __m256d c_vec_0, c_vec_1, c_vec_2, c_vec_3; + __m256d c_vec_4, c_vec_5, c_vec_6, c_vec_7; + + a_vec_0 = _mm256_setzero_pd(); + a_vec_1 = _mm256_setzero_pd(); + bdcst_0 = _mm256_setzero_pd(); + bdcst_1 = _mm256_setzero_pd(); + c_vec_0 = _mm256_setzero_pd(); + c_vec_1 = _mm256_setzero_pd(); + c_vec_2 = _mm256_setzero_pd(); + c_vec_3 = _mm256_setzero_pd(); + c_vec_4 = _mm256_setzero_pd(); + c_vec_5 = _mm256_setzero_pd(); + c_vec_6 = _mm256_setzero_pd(); + c_vec_7 = _mm256_setzero_pd(); + + _mm_prefetch((char const*)(temp_cij), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc*2), _MM_HINT_T0); + _mm_prefetch((char const*)(temp_cij + ldc*3), _MM_HINT_T0); + + // Loading vectors from A with Z_MR elements in total. 
+ a_vec_0 = _mm256_loadu_pd((double const *)(temp_ai)); + a_vec_1 = _mm256_loadu_pd((double const *)(temp_ai + 2)); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); + + // Scaling with imaginary components of elements from B. + bdcst_0 = _mm256_unpackhi_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpackhi_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_1 = _mm256_mul_pd(a_vec_1, bdcst_0); + c_vec_2 = _mm256_mul_pd(a_vec_0, bdcst_1); + c_vec_3 = _mm256_mul_pd(a_vec_1, bdcst_1); + + bdcst_0 = _mm256_unpackhi_pd(b_vec_2, b_vec_2); + bdcst_1 = _mm256_unpackhi_pd(b_vec_3, b_vec_3); + c_vec_4 = _mm256_mul_pd(a_vec_0, bdcst_0); + c_vec_5 = _mm256_mul_pd(a_vec_1, bdcst_0); + c_vec_6 = _mm256_mul_pd(a_vec_0, bdcst_1); + c_vec_7 = _mm256_mul_pd(a_vec_1, bdcst_1); + + a_vec_0 = _mm256_permute_pd(a_vec_0, 0x5); + a_vec_1 = _mm256_permute_pd(a_vec_1, 0x5); + + // Scaling with real components of elements from B. + bdcst_0 = _mm256_unpacklo_pd(b_vec_0, b_vec_0); + bdcst_1 = _mm256_unpacklo_pd(b_vec_1, b_vec_1); + c_vec_0 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_0); + c_vec_1 = _mm256_fmaddsub_pd(a_vec_1, bdcst_0, c_vec_1); + c_vec_2 = _mm256_fmaddsub_pd(a_vec_0, bdcst_1, c_vec_2); + c_vec_3 = _mm256_fmaddsub_pd(a_vec_1, bdcst_1, c_vec_3); + + bdcst_0 = _mm256_unpacklo_pd(b_vec_2, b_vec_2); + bdcst_1 = _mm256_unpacklo_pd(b_vec_3, b_vec_3); + c_vec_4 = _mm256_fmaddsub_pd(a_vec_0, bdcst_0, c_vec_4); + c_vec_5 = _mm256_fmaddsub_pd(a_vec_1, bdcst_0, c_vec_5); + c_vec_6 = _mm256_fmaddsub_pd(a_vec_0, bdcst_1, c_vec_6); + c_vec_7 = _mm256_fmaddsub_pd(a_vec_1, bdcst_1, c_vec_7); + + // Scaling with beta, according to its type. + switch( beta_mul_type ) { - ymm15 = _mm256_broadcast_sd((double const *)(&beta_imag)); + case BLIS_MUL_ZERO : + break; + + case BLIS_MUL_ONE : + // Load C and add with the corresponding scratch register. 
+ a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + 2)); + c_vec_0 = _mm256_add_pd(c_vec_0, a_vec_0); + c_vec_1 = _mm256_add_pd(c_vec_1, a_vec_1); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + ldc + 2)); + c_vec_2 = _mm256_add_pd(c_vec_2, a_vec_0); + c_vec_3 = _mm256_add_pd(c_vec_3, a_vec_1); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc*2)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + ldc*2 + 2)); + c_vec_4 = _mm256_add_pd(c_vec_4, a_vec_0); + c_vec_5 = _mm256_add_pd(c_vec_5, a_vec_1); + + a_vec_0 = _mm256_loadu_pd((double const*)(temp_cij + ldc*3)); + a_vec_1 = _mm256_loadu_pd((double const*)(temp_cij + ldc*3 + 2)); + c_vec_6 = _mm256_add_pd(c_vec_6, a_vec_0); + c_vec_7 = _mm256_add_pd(c_vec_7, a_vec_1); + break; + + default : + // Broadcast beta and redirect to the beta scaling macro. + bdcst_0 = _mm256_broadcast_sd((double const*)(&beta_real)); + bdcst_1 = _mm256_broadcast_sd((double const*)(&beta_imag)); + + BETA_SCALING_C_MAIN(c_vec_0, c_vec_1, temp_cij); + BETA_SCALING_C_MAIN(c_vec_2, c_vec_3, temp_cij + ldc); + BETA_SCALING_C_MAIN(c_vec_4, c_vec_5, temp_cij + ldc*2); + BETA_SCALING_C_MAIN(c_vec_6, c_vec_7, temp_cij + ldc*3); - xmm5 = _mm_loadu_pd((double const*)(temp_c)); - ymm0 = _mm256_insertf128_pd(ymm0,xmm5,0); - //ymm3+=beta_imag*(-I(c[0][0])) beta_imag*R(c[0][0]) - SCALE_BETA_IMAG_M_FRINGE(ymm0,ymm3,ymm15,ymm2); } - xmm5 = _mm256_extractf128_pd(ymm3, 0); - _mm_storeu_pd((double *)(temp_c), xmm5); + // Storing the result in C. 
+ _mm256_storeu_pd((double *)(temp_cij), c_vec_0); + _mm256_storeu_pd((double *)(temp_cij + 2), c_vec_1); + + _mm256_storeu_pd((double *)(temp_cij + ldc), c_vec_2); + _mm256_storeu_pd((double *)(temp_cij + ldc + 2), c_vec_3); + + _mm256_storeu_pd((double *)(temp_cij + ldc*2), c_vec_4); + _mm256_storeu_pd((double *)(temp_cij + ldc*2 + 2), c_vec_5); + + _mm256_storeu_pd((double *)(temp_cij + ldc*3), c_vec_6); + _mm256_storeu_pd((double *)(temp_cij + ldc*3 + 2), c_vec_7); + // Adjusting the addresses of A and C for the next iteration. + temp_cij += Z_MR; + temp_ai += Z_MR; } } -} +} \ No newline at end of file diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index e6a2f33f92..b5af321a96 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -315,7 +315,7 @@ void bli_dgemm_8x6_avx2_k1_nn double* c, const inc_t ldc ); -void bli_zgemm_4x6_avx2_k1_nn +void bli_zgemm_4x4_avx2_k1_nn ( dim_t m, dim_t n, From 5bdf5e2aaa5a36282cd5d7829c7e9ee4e5a8c569 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Tue, 18 Jul 2023 01:25:14 -0500 Subject: [PATCH 122/226] Optimized AVX2 DGEMM SUP and small edge kernels. - Re-designed the new edge kernels that use masked load-store instructions for handling corner cases. - Mask load-store instruction macros are added. vmovdqu, VMOVDQU for setting up the mask. vmaskmovpd, VMASKMOVPD for masked load-store - The following edge kernels are added for 6x8m dgemm sup.
n-left edge kernels - bli_dgemmsup_rv_haswell_asm_6x7m - bli_dgemmsup_rv_haswell_asm_6x5m - bli_dgemmsup_rv_haswell_asm_6x3m m-left edge kernels - bli_dgemmsup_rv_haswell_asm_5x7 - bli_dgemmsup_rv_haswell_asm_4x7 - bli_dgemmsup_rv_haswell_asm_3x7 - bli_dgemmsup_rv_haswell_asm_2x7 - bli_dgemmsup_rv_haswell_asm_1x7 - bli_dgemmsup_rv_haswell_asm_5x5 - bli_dgemmsup_rv_haswell_asm_4x5 - bli_dgemmsup_rv_haswell_asm_3x5 - bli_dgemmsup_rv_haswell_asm_2x5 - bli_dgemmsup_rv_haswell_asm_1x5 - bli_dgemmsup_rv_haswell_asm_5x3 - bli_dgemmsup_rv_haswell_asm_4x3 - bli_dgemmsup_rv_haswell_asm_3x3 - bli_dgemmsup_rv_haswell_asm_2x3 - bli_dgemmsup_rv_haswell_asm_1x3 - For 16x3 dgemm_small, m_left computation is handled with masked load-store instructions to avoid the overhead of conditional checks for edge cases. - It improves performance by reducing branching overhead and by being more cache friendly. AMD-Internal: [CPUPL-3574] Change-Id: I976d6a9209d2a1a02b2830d03d21d200a5aad173 --- frame/include/bli_x86_asm_macros.h | 5 +- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 2393 +++++++++++++-- kernels/haswell/3/sup/d6x8/CMakeLists.txt | 3 + .../d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c | 2137 ++++++++++++++ .../d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c | 2519 ++++++++++++++++ .../d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c | 2602 +++++++++++++++++ kernels/haswell/bli_kernels_haswell.h | 20 +- kernels/zen/3/bli_gemm_small.c | 1710 ++++++----- 8 files changed, 10450 insertions(+), 939 deletions(-) create mode 100644 kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c create mode 100644 kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c create mode 100644 kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index 84bc76c21d..112fe64736 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -776,6 +776,7 @@ #define VMOVHPD(...)
INSTR_(vmovhpd, __VA_ARGS__) #define VMOVDQA(_0, _1) INSTR_(vmovdqa, _0, _1) #define VMOVDQA32(_0, _1) INSTR_(vmovdqa32, _0, _1) +#define VMOVDQU(_0, _1) INSTR_(vmovdqu, _0, _1) #define VMOVDQA64(_0, _1) INSTR_(vmovdqa64, _0, _1) #define VBROADCASTSS(_0, _1) INSTR_(vbroadcastss, _0, _1) #define VBROADCASTSD(_0, _1) INSTR_(vbroadcastsd, _0, _1) @@ -809,6 +810,7 @@ #define vmovhpd(...) VMOVHPD(__VA_ARGS__) #define vmovdqa(_0, _1) VMOVDQA(_0, _1) #define vmovdqa32(_0, _1) VMOVDQA32(_0, _1) +#define vmovdqu(_0, _1) VMOVDQU(_0, _1) #define vmovdqa64(_0, _1) VMOVDQA64(_0, _1) #define vbroadcastss(_0, _1) VBROADCASTSS(_0, _1) #define vbroadcastsd(_0, _1) VBROADCASTSD(_0, _1) @@ -911,6 +913,7 @@ #define VCOMISS(_0, _1) INSTR_(vcomiss, _0, _1) #define VCOMISD(_0, _1) INSTR_(vcomisd, _0, _1) +#define VMASKMOVPD(_0, _1, _2) INSTR_(vmaskmovpd, _0, _1, _2) #define VFMADD132SS(_0, _1, _2) INSTR_(vfmadd132ss, _0, _1, _2) #define VFMADD213SS(_0, _1, _2) INSTR_(vfmadd213ss, _0, _1, _2) #define VFMADD231SS(_0, _1, _2) INSTR_(vfmadd231ss, _0, _1, _2) @@ -1236,7 +1239,7 @@ #define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3) #define vblendmps(_0, _1, _2) VBLENDMSD(_0, _1, _2) #define vblendmpd(_0, _1, _2) VBLENDMPD(_0, _1, _2) - +#define vmaskmovpd(_0, _1, _2) VMASKMOVPD(_0, _1, _2) // Prefetches #define PREFETCH(_0, _1) INSTR_(prefetcht##_0, _1) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index 05c240d2d1..cdd6989820 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -38,6 +38,360 @@ #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" +static const int64_t mask_3[4] = {-1, -1, -1, 0}; +static const int64_t mask_1[4] = {-1, 0, 0, 0}; + +static void bli_dgemmsup_rv_haswell_asm_6x7m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t 
rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ); + +static void bli_dgemmsup_rv_haswell_asm_6x5m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ); + +static void bli_dgemmsup_rv_haswell_asm_6x3m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ); + +#define C_TRANSPOSE_6x7_TILE(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + /*Broadcasting Beta into ymm15 vector register*/\ + vbroadcastsd(mem(rbx), ymm15)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*R1, R2, R3, R4 holds final result*/ \ + vfmadd231pd(mem(rcx ), ymm15, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), ymm15, ymm(R4))\ + /*Storing it back to C matrix.*/ \ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + /*Moving to operate on last 2 rows of 
6 rows.*/ \ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm3)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*0, 1, 2, 3 holds final result*/ \ + vfmadd231pd(mem(rdx ), xmm15, xmm0)\ + vfmadd231pd(mem(rdx, rsi, 1), xmm15, xmm1)\ + vfmadd231pd(mem(rdx, rsi, 2), xmm15, xmm2)\ + vfmadd231pd(mem(rdx, rax, 1), xmm15, xmm3)\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2))\ + vmovupd(xmm3, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R8), ymm(R7), ymm0)\ + vunpckhpd(ymm(R8), ymm(R7), ymm1)\ + vunpcklpd(ymm(R10), ymm(R9), ymm2)\ + vunpckhpd(ymm(R10), ymm(R9), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm5)\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm7)\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm9)\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm5)\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm7)\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm9)\ +\ + vmovupd(ymm5, mem(rcx ))\ + vmovupd(ymm7, mem(rcx, rsi, 1))\ + vmovupd(ymm9, mem(rcx, rsi, 2))\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R12), ymm(R11), ymm0)\ + vunpckhpd(ymm(R12), ymm(R11), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm4)\ +\ + vfmadd231pd(mem(rdx ), xmm15, xmm0)\ + vfmadd231pd(mem(rdx, rsi, 1), xmm15, xmm1)\ + vfmadd231pd(mem(rdx, rsi, 2), xmm15, xmm2)\ +\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_6x7_TILE_BZ(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + 
vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + /*Storing transposed 4x4 tile back to C matrix*/\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm3)\ +\ + /*Storing transposed 2x4 tile back to C matrix*/\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2))\ + vmovupd(xmm3, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R8), ymm(R7), ymm0)\ + vunpckhpd(ymm(R8), ymm(R7), ymm1)\ + vunpcklpd(ymm(R10), ymm(R9), ymm2)\ + vunpckhpd(ymm(R10), ymm(R9), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm5)\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm7)\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm9)\ +\ + /*Storing transposed 4x3 tile back to C matrix*/\ + vmovupd(ymm5, mem(rcx ))\ + vmovupd(ymm7, mem(rcx, rsi, 1))\ + vmovupd(ymm9, mem(rcx, rsi, 2))\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R12), ymm(R11), ymm0)\ + vunpckhpd(ymm(R12), ymm(R11), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm4)\ +\ + /*Storing transposed 2x3 tile back to C matrix*/\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_6x5_TILE(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + /*Broadcasting 
Beta into ymm15 vector register*/\ + vbroadcastsd(mem(rbx), ymm15)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*R1, R2, R3, R4 holds final result*/ \ + vfmadd231pd(mem(rcx ), ymm15, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), ymm15, ymm(R4))\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm3)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*0, 1, 2, 3 holds final result*/ \ + vfmadd231pd(mem(rdx ), xmm15, xmm0)\ + vfmadd231pd(mem(rdx, rsi, 1), xmm15, xmm1)\ + vfmadd231pd(mem(rdx, rsi, 2), xmm15, xmm2)\ + vfmadd231pd(mem(rdx, rax, 1), xmm15, xmm3)\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2))\ + vmovupd(xmm3, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x1 tile*/ \ + vunpcklpd(ymm(R8), ymm(R7), ymm0)\ + vunpcklpd(ymm(R10), ymm(R9), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm5)\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm5)\ + vmovupd(ymm5, mem(rcx ))\ +\ + /*Transposing 2x1 tile*/ \ + vunpcklpd(ymm(R12), ymm(R11), ymm0)\ + vfmadd231pd(mem(rdx ), xmm15, xmm0)\ +\ + vmovupd(xmm0, mem(rdx )) + +#define C_TRANSPOSE_6x5_TILE_BZ(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + /*Storing 
transposed 4x4 tile back to C matrix*/\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm3)\ +\ + /*Storing transposed 4x2 tile back to C matrix*/\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2))\ + vmovupd(xmm3, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x1 tile*/ \ + vunpcklpd(ymm(R8), ymm(R7), ymm0)\ + vunpcklpd(ymm(R10), ymm(R9), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm5)\ +\ + /*Storing transposed 4x1 tile back to C matrix*/\ + vmovupd(ymm5, mem(rcx ))\ +\ + /*Transposing 2x1 tile*/ \ + vunpcklpd(ymm(R12), ymm(R11), ymm0)\ +\ + /*Storing transposed 2x1 tile back to C matrix*/\ + vmovupd(xmm0, mem(rdx )) + +#define C_TRANSPOSE_6x3_TILE(R1, R2, R3, R4, R5, R6) \ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*R1, R2, R3 holds final result*/ \ + vfmadd231pd(mem(rcx ), ymm3, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm(R3))\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*0, 1, 2 holds final 
result*/ \ + vfmadd231pd(mem(rdx ), xmm3, xmm0)\ + vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)\ + vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_6x3_TILE_BZ(R1, R2, R3, R4, R5, R6) \ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm4)\ +\ + vmovupd(xmm0, mem(rdx ))\ + vmovupd(xmm1, mem(rdx, rsi, 1))\ + vmovupd(xmm2, mem(rdx, rsi, 2)) + /* rrr: -------- ------ -------- @@ -108,93 +462,114 @@ void bli_dgemmsup_rv_haswell_asm_6x8m double* restrict bj = b; double* restrict ai = a; - if ( 6 <= n_left ) - { - const dim_t nr_cur = 6; - - bli_dgemmsup_rv_haswell_asm_6x6m - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; - } - if ( 4 <= n_left ) - { - const dim_t nr_cur = 4; - - bli_dgemmsup_rv_haswell_asm_6x4m - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; - } - if ( 2 <= n_left ) - { - const dim_t nr_cur = 2; - - bli_dgemmsup_rv_haswell_asm_6x2m - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; - } - if ( 1 == n_left 
) + switch(n_left) { -#if 0 - const dim_t nr_cur = 1; - - bli_dgemmsup_r_haswell_ref - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); -#else - dim_t ps_a0 = bli_auxinfo_ps_a( data ); - - if ( ps_a0 == 6 * rs_a0 ) + case 7: { - // Since A is not packed, we can use one gemv. - bli_dgemv_ex + bli_dgemmsup_rv_haswell_asm_6x7m ( - BLIS_NO_TRANSPOSE, conjb, m0, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, - beta, cij, rs_c0, cntx, NULL + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx ); + break; } - else + case 6: { - const dim_t mr = 6; - - // Since A is packed into row panels, we must use a loop over - // gemv. - dim_t m_iter = ( m0 + mr - 1 ) / mr; - dim_t m_left = m0 % mr; - - double* restrict ai_ii = ai; - double* restrict cij_ii = cij; + bli_dgemmsup_rv_haswell_asm_6x6m + ( + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 5: + { + bli_dgemmsup_rv_haswell_asm_6x5m + ( + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 4: + { + bli_dgemmsup_rv_haswell_asm_6x4m + ( + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 3: + { + bli_dgemmsup_rv_haswell_asm_6x3m + ( + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 2: + { + bli_dgemmsup_rv_haswell_asm_6x2m + ( + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 1: + { + dim_t ps_a0 = bli_auxinfo_ps_a( data ); - for ( dim_t ii = 0; ii < m_iter; ii += 1 ) + if ( ps_a0 == 6 * rs_a0 ) { - dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) - ? 
mr : m_left ); - + // Since A is not packed, we can use one gemv. bli_dgemv_ex - ( - BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, - alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, - beta, cij_ii, rs_c0, cntx, NULL - ); - cij_ii += mr*rs_c0; ai_ii += ps_a0; + ( + BLIS_NO_TRANSPOSE, conjb, m0, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, + beta, cij, rs_c0, cntx, NULL + ); + } + else + { + const dim_t mr = 6; + + // Since A is packed into row panels, we must use a loop over + // gemv. + dim_t m_iter = ( m0 + mr - 1 ) / mr; + dim_t m_left = m0 % mr; + + double* restrict ai_ii = ai; + double* restrict cij_ii = cij; + + for ( dim_t ii = 0; ii < m_iter; ii += 1 ) + { + dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) + ? mr : m_left ); + + bli_dgemv_ex + ( + BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, + alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, + beta, cij_ii, rs_c0, cntx, NULL + ); + cij_ii += mr*rs_c0; ai_ii += ps_a0; + } } + break; + } + default: + { + break; } -#endif } return; } @@ -916,53 +1291,6 @@ void bli_dgemmsup_rv_haswell_asm_6x8m double* restrict ai = a + m_iter * ps_a; double* restrict bj = b; -#if 0 - // We add special handling for slightly inflated MR blocksizes - // at edge cases, up to a maximum of 9. 
- if ( 6 < m_left ) - { - dgemmsup_ker_ft ker_fp1 = NULL; - dgemmsup_ker_ft ker_fp2 = NULL; - dim_t mr1, mr2; - - if ( m_left == 7 ) - { - mr1 = 4; mr2 = 3; - ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8; - ker_fp2 = bli_dgemmsup_rv_haswell_asm_3x8; - } - else if ( m_left == 8 ) - { - mr1 = 4; mr2 = 4; - ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8; - ker_fp2 = bli_dgemmsup_rv_haswell_asm_4x8; - } - else // if ( m_left == 9 ) - { - mr1 = 4; mr2 = 5; - ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8; - ker_fp2 = bli_dgemmsup_rv_haswell_asm_5x8; - } - - ker_fp1 - ( - conja, conjb, mr1, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += mr1*rs_c0; ai += mr1*rs_a0; - - ker_fp2 - ( - conja, conjb, mr2, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - - return; - } -#endif - dgemmsup_ker_ft ker_fps[6] = { NULL, @@ -8129,7 +8457,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U ) } -void bli_dgemmsup_rv_haswell_asm_6x6m +static void bli_dgemmsup_rv_haswell_asm_6x7m ( conj_t conja, conj_t conjb, @@ -8146,6 +8474,31 @@ void bli_dgemmsup_rv_haswell_asm_6x6m ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 7 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 3 elements, +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. +// //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); @@ -8168,13 +8521,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6m uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a8 = ps_a * sizeof( double ); - if ( m_iter == 0 ) goto consider_edge_cases; + int64_t const *mask_vec = mask_3; - // ------------------------------------------------------------------------- + if ( m_iter == 0 ) goto consider_edge_cases_7; + // ------------------------------------------------------------------------- begin_asm() - //vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a @@ -8185,43 +8540,19 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - //mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b - //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) - //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - - // NOTE: We cannot pre-load elements of a or b - // because it could eventually, in the last - // unrolled iter or the cleanup loop, result - // in reading beyond the bounds allocated mem - // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - - // During preamble and loops: - // r12 = rcx = c - // r14 = rax = a - // read rbx from var(b) near beginning of loop - // r11 = m dim index ii - mov(var(m_iter), r11) // ii = m_iter; - label(.DLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] - - + label(.DLOOP6X7I) // LOOP OVER ii = [ m_iter ... 1 0 ] -#if 0 - vzeroall() // zero all xmm/ymm registers. -#else - // skylake can execute 3 vxorpd ipc with - // a latency of 1 cycle, while vzeroall - // has a latency of 12 cycles. - vxorpd(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower - vxorpd(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us + vxorpd(ymm4, ymm4, ymm4) + vmovapd( ymm4, ymm3) vmovapd( ymm4, ymm5) vmovapd( ymm4, ymm6) vmovapd( ymm4, ymm7) @@ -8232,14 +8563,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vmovapd( ymm4, ymm12) vmovapd( ymm4, ymm13) vmovapd( ymm4, ymm14) - vmovapd( ymm4, ymm15) -#endif mov(var(b), rbx) // load address of b. - //mov(r12, rcx) // reset rcx to current utile of c. - mov(r14, rax) // reset rax to current upanel of a. - - + mov(r14, rax) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case @@ -8247,12 +8573,12 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c @@ -8261,12 +8587,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + lea(mem(rdx, rsi, 2), rcx) // rcx = c + 5*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 6*cs_c label(.DPOSTPFETCH) // done prefetching c @@ -8283,15 +8611,1737 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- + label(.DLOOPKITER) // MAIN LOOP + // ---------------------------------- iteration 0 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, 5*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 1, 5*8)) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements based on mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + // 
---------------------------------- iteration 2 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 2, 5*8)) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements based on mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + // ---------------------------------- iteration 3 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, rcx, 1, 5*8)) + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements based on mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + dec(rsi) 
// i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 1 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements based on mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + label(.DPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + + vmulpd(ymm0, ymm7, ymm7) // scale by alpha + vmulpd(ymm0, ymm8, ymm8) + + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm0, ymm14, ymm14) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + label(.DROWSTORED) + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + //Stores 4 elements + vmovupd(ymm3, mem(rcx, 0*32)) + //Stores 3 elements based on mask_3 mask vector + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------5 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm13) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm14) + + vmovupd(ymm13, mem(rcx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------6 + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORED) + C_TRANSPOSE_6x7_TILE(3, 5, 7, 9, 11, 13, 4, 6, 8, 10, 12, 14) + jmp(.RESETPARAM) + + label(.DBETAZERO) + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------5 + + vmovupd(ymm13, mem(rcx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rcx, 1*32)) + + //-----------------------6 + + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_6x7_TILE_BZ(3, 5, 7, 9, 11, 13, 4, 6, 8, 10, 12, 14) + jmp(.RESETPARAM) + + label(.RESETPARAM) + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask + jmp(.DDONE) + + label(.DDONE) + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a8), rax) // load ps_a8 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8 + + dec(r11) // ii -= 1; + jne(.DLOOP6X7I) // iterate again if ii != 0. 
+ + + label(.DRETURN) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [mask_vec] "m" (mask_vec), + [rs_c] "m" (rs_c), + [n0] "m" (n0), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", + "memory" + ) + + consider_edge_cases_7: + // Handle edge cases in the m dimension, if they exist. + if ( m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + double* restrict cij = c + i_edge*rs_c; + double* restrict ai = a + m_iter * ps_a; + double* restrict bj = b; + + dgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_dgemmsup_rv_haswell_asm_1x7, + bli_dgemmsup_rv_haswell_asm_2x7, + bli_dgemmsup_rv_haswell_asm_3x7, + bli_dgemmsup_rv_haswell_asm_4x7, + bli_dgemmsup_rv_haswell_asm_5x7 + }; + + dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + + return; // NOTE(review): this early return skips the AOCL_DTL_TRACE_EXIT call below whenever m_left != 0 + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); +} + +static void bli_dgemmsup_rv_haswell_asm_6x5m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, 
+ cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5); + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 5 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 1 element, +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded and other 3 elements will be set to 0 in destination vector. +// + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + int64_t const *mask_vec = mask_1; + + if ( m_iter == 0 ) goto consider_edge_cases_5; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask + + mov(var(a), r14) // load address of a. 
+ mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + mov(var(m_iter), r11) // ii = m_iter; + + label(.DLOOP6X5I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + vxorpd(ymm4, ymm4, ymm4) + vmovapd( ymm4, ymm3) + vmovapd( ymm4, ymm5) + vmovapd( ymm4, ymm6) + vmovapd( ymm4, ymm7) + vmovapd( ymm4, ymm8) + vmovapd( ymm4, ymm9) + vmovapd( ymm4, ymm10) + vmovapd( ymm4, ymm11) + vmovapd( ymm4, ymm12) + vmovapd( ymm4, ymm13) + vmovapd( ymm4, ymm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) 
// prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + mov(var(ps_a8), rdx) // load ps_a8 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. +#else + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; +#endif + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + label(.DLOOPKITER) // MAIN LOOP + // ---------------------------------- iteration 0 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, 5*8)) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 1, 5*8)) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + 
vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + // ---------------------------------- iteration 2 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 2, 5*8)) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + // ---------------------------------- iteration 3 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, rcx, 1, 5*8)) + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + 
vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 1 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + + vbroadcastsd(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + label(.DPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + + vmulpd(ymm0, ymm7, ymm7) // scale by alpha + vmulpd(ymm0, ymm8, ymm8) + + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm0, ymm14, ymm14) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + label(.DROWSTORED) + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + //Stores 4 elements + vmovupd(ymm3, mem(rcx, 0*32)) + //Stores 1 element as per mask_1 mask vector + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------5 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm13) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm14) + + vmovupd(ymm13, mem(rcx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------6 + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORED) + + C_TRANSPOSE_6x5_TILE(3, 5, 7, 9, 11, 13, 4, 6, 8, 10, 12, 14) + jmp(.RESETPARAM) + + label(.DBETAZERO) + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+ jz(.DCOLSTORBZ) // jump to column storage case + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------5 + + vmovupd(ymm13, mem(rcx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rcx, 1*32)) + + //-----------------------6 + + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_6x5_TILE_BZ(3, 5, 7, 9, 11, 13, 4, 6, 8, 10, 12, 14) + jmp(.RESETPARAM) + + label(.RESETPARAM) + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask + jmp(.DDONE) + + label(.DDONE) + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a8), rax) // load ps_a8 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8 + + dec(r11) // ii -= 1; + jne(.DLOOP6X5I) // iterate again if ii != 0. 
+ + + label(.DRETURN) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [mask_vec] "m" (mask_vec), + [rs_c] "m" (rs_c), + [n0] "m" (n0), + [cs_c] "m" (cs_c)/*, + [a_next] "m" (a_next), + [b_next] "m" (b_next)*/ + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", + "memory" + ) + + consider_edge_cases_5: + // Handle edge cases in the m dimension, if they exist. 
+ if ( m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + double* restrict cij = c + i_edge*rs_c; + double* restrict ai = a + m_iter * ps_a; + double* restrict bj = b; + + dgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_dgemmsup_rv_haswell_asm_1x5, + bli_dgemmsup_rv_haswell_asm_2x5, + bli_dgemmsup_rv_haswell_asm_3x5, + bli_dgemmsup_rv_haswell_asm_4x5, + bli_dgemmsup_rv_haswell_asm_5x5 + }; + + dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + + return; + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); +} + +static void bli_dgemmsup_rv_haswell_asm_6x3m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// + +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. +// + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. 
+ uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + int64_t const *mask_vec = mask_3; + + if ( m_iter == 0 ) goto consider_edge_cases_nleft_3; + + begin_asm() + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + mov(var(m_iter), r11) // ii = m_iter; + + label(.DLOOP6X3I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + vxorpd(ymm4, ymm4, ymm4) + vmovapd( ymm4, ymm6) + vmovapd( ymm4, ymm8) + vmovapd( ymm4, ymm10) + vmovapd( ymm4, ymm12) + vmovapd( ymm4, ymm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 2*8)) // prefetch c + 5*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + mov(var(ps_a8), rdx) // load ps_a8 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. +#else + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; +#endif + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, 5*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 1, 5*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + + // ---------------------------------- iteration 2 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 2, 5*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, 
ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + + // ---------------------------------- iteration 3 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, rcx, 1, 5*8)) + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 1 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + label(.DPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm0, ymm14, ymm14) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + label(.DROWSTORED) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm6) + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm8) + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm10) + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm12) + vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm14) + vmaskmovpd(ymm14, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. 
+ + label(.DCOLSTORED) + + C_TRANSPOSE_6x3_TILE(4, 6, 8, 10, 12, 14) + jmp(.RESETPARAM) + + label(.DBETAZERO) + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + label(.DROWSTORBZ) + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm14, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORBZ) + + C_TRANSPOSE_6x3_TILE_BZ(4, 6, 8, 10, 12, 14) + jmp(.RESETPARAM) + + label(.RESETPARAM) + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask + jmp(.DDONE) // jump to end. + + label(.DDONE) + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a8), rax) // load ps_a8 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8 + + dec(r11) // ii -= 1; + jne(.DLOOP6X3I) // iterate again if ii != 0. 
+ + label(.DRETURN) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [n0] "m" (n0), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c)/*, + [a_next] "m" (a_next), + [b_next] "m" (b_next)*/ + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", + "memory" + ) + + consider_edge_cases_nleft_3: + if ( m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + double* restrict cij = c + i_edge*rs_c; + double* restrict ai = a + m_iter * ps_a; + double* restrict bj = b; + + dgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_dgemmsup_rv_haswell_asm_1x3, + bli_dgemmsup_rv_haswell_asm_2x3, + bli_dgemmsup_rv_haswell_asm_3x3, + bli_dgemmsup_rv_haswell_asm_4x3, + bli_dgemmsup_rv_haswell_asm_5x3 + }; + + dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + + return; + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + + + +void bli_dgemmsup_rv_haswell_asm_6x6m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + 
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + + begin_asm() + + //vzeroall() // zero all xmm/ymm registers. + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + //mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). 
+ + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.DLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + + +#if 0 + vzeroall() // zero all xmm/ymm registers. +#else + // skylake can execute 3 vxorpd ipc with + // a latency of 1 cycle, while vzeroall + // has a latency of 12 cycles. + vxorpd(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower + vxorpd(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us + vmovapd( ymm4, ymm5) + vmovapd( ymm4, ymm6) + vmovapd( ymm4, ymm7) + vmovapd( ymm4, ymm8) + vmovapd( ymm4, ymm9) + vmovapd( ymm4, ymm10) + vmovapd( ymm4, ymm11) + vmovapd( ymm4, ymm12) + vmovapd( ymm4, ymm13) + vmovapd( ymm4, ymm14) + vmovapd( ymm4, ymm15) +#endif + + mov(var(b), rbx) // load address of b. + //mov(r12, rcx) // reset rcx to current utile of c. + mov(r14, rax) // reset rax to current upanel of a. + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + mov(var(ps_a8), rdx) // load ps_a8 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. +#else + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.DLOOPKITER) // MAIN LOOP @@ -8912,6 +10962,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } + void bli_dgemmsup_rv_haswell_asm_6x4m ( conj_t conja, @@ -10206,5 +12257,3 @@ void bli_dgemmsup_rv_haswell_asm_6x2m } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } - - diff --git a/kernels/haswell/3/sup/d6x8/CMakeLists.txt b/kernels/haswell/3/sup/d6x8/CMakeLists.txt index c74dff9372..24edd62ba5 100644 --- a/kernels/haswell/3/sup/d6x8/CMakeLists.txt +++ b/kernels/haswell/3/sup/d6x8/CMakeLists.txt @@ -8,8 +8,11 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx2.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx8.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx2.c +${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx3.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx4.c +${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx5.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx6.c +${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx7.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx8.c ) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c new file mode 100644 index 0000000000..795ca5772b --- /dev/null +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c @@ -0,0 +1,2137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + + +#define C_TRANSPOSE_5x3_TILE(R1, R2, R3, R4, R5)\ + /*Transposing 4x3 tile: accumulator rows R1..R4 become columns; only columns 0..2 belong to C*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm3, ymm1, ymm(R4))\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), ymm3, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm(R3))\ + /*ymm(R4) is the unused 4th column: mem(rcx, rax, 1) lies outside the 3-column tile and must not be read*/ \ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ +\ + /*Transposing 1x3 tile: 5th row (R5), gathered from and scattered to C through rdx*/ \ + vmovlpd(mem(rdx ), xmm0, xmm0)\ + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vfmadd213pd(ymm(R5), ymm3, ymm0)\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rdx ))\ + vmovhpd(xmm0, mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_5x3_TILE_BZ(R1, R2, R3, R4, R5)\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm3, ymm1, ymm(R4))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ +\ + /*Transposing 1x3 tile*/ \ + vextractf128(imm(1), ymm(R5), xmm1)\ + vmovlpd(xmm(R5), mem(rdx ))\ + vmovhpd(xmm(R5), mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2)) + + +#define C_TRANSPOSE_4x3_TILE(R1, R2, R3, R4)\ + /*Transposing 4x3 tile*/ \ +
vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), ymm3, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm(R3))\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2)) + +#define C_TRANSPOSE_4x3_TILE_BZ(R1, R2, R3, R4)\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2)) + +#define C_TRANSPOSE_3x3_TILE(R1, R2, R3)\ + /*Transposing 3x3 tile (ymm10 only fills the unused 4th row); rows 0-1 of each column stay in xmm(R1..R3), row 2 goes to xmm12-xmm14*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(10), ymm(R3), ymm2)\ + vunpckhpd(ymm(10), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ +\ + vextractf128(imm(0x1), ymm(R1), xmm12)\ + vextractf128(imm(0x1), ymm(R2), xmm13)\ + vextractf128(imm(0x1), ymm(R3), xmm14)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), xmm3, xmm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm(R3))\ + vmovupd(xmm(R1), mem(rcx ))\ + vmovupd(xmm(R2), mem(rcx, rsi, 1))\ + vmovupd(xmm(R3), mem(rcx, rsi, 2))\ +\ + /*Transposing 1x3 tile: row 2, held in the low lanes of xmm12-xmm14, addressed through rdx*/ \ + vfmadd231sd(mem(rdx ), xmm3, xmm12)\ + vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)\ + vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)\ + vmovsd(xmm12, mem(rdx
))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_3x3_TILE_BZ(R1, R2, R3)\ + /*Transposing 3x3 tile (ymm10 only fills the unused 4th row); rows 0-1 of each column stay in xmm(R1..R3), row 2 goes to xmm12-xmm14*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(10), ymm(R3), ymm2)\ + vunpckhpd(ymm(10), ymm(R3), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ +\ + vextractf128(imm(0x1), ymm(R1), xmm12)\ + vextractf128(imm(0x1), ymm(R2), xmm13)\ + vextractf128(imm(0x1), ymm(R3), xmm14)\ +\ + vmovupd(xmm(R1), mem(rcx ))\ + vmovupd(xmm(R2), mem(rcx, rsi, 1))\ + vmovupd(xmm(R3), mem(rcx, rsi, 2))\ +\ + /*Transposing 1x3 tile: row 2, held in the low lanes of xmm12-xmm14, addressed through rdx*/ \ + vmovlpd(xmm(12), mem(rdx ))\ + vmovlpd(xmm(13), mem(rdx, rsi, 1))\ + vmovlpd(xmm(14), mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_2x3_TILE(R1, R2)\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ + vfmadd231pd(mem(rcx ), xmm3, xmm0)\ + vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)\ + vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2)) + + +#define C_TRANSPOSE_2x3_TILE_BZ(R1, R2)\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ +\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2)) + +#define C_TRANSPOSE_1x3_TILE(R1)\ + vmovlpd(mem(rcx ), xmm0, xmm0)\ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ + vfmadd213pd(ymm(R1), ymm3, ymm0)\ +\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rcx ))\ + vmovhpd(xmm0, mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2)) + +#define C_TRANSPOSE_1x3_TILE_BZ(R1)\ +
vextractf128(imm(1), ymm(R1), xmm1)\ + vmovlpd(xmm(R1), mem(rcx ))\ + vmovhpd(xmm(R1), mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2)) + +static const int64_t mask_3[4] = {-1, -1, -1, 0}; + +void bli_dgemmsup_rv_haswell_asm_5x3 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// 5x3 kernel: accumulates A*B over k0 in ymm4/6/8/10/12, scales by alpha, then writes C (adding beta*C unless beta == 0). Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. +// + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) // load mask_3 into ymm15 + mov(var(a), rax) // load address of a.
+ mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + lea(mem(r9, r9, 2), r15) // r15 = 3*cs_a + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + +
+ mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 5*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 5*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2,
ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + prefetch(0, mem(rdx, r15, 1, 5*8)) // prefetch a_prefetch + 3*cs_a + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0.
+ + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm12, ymm12) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------1 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------2 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------3 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------4 + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) + vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_5x3_TILE(4, 6, 8, 10, 12) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_5x3_TILE_BZ(4, 6, 8, 10, 12) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_4x3 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// 4x3 kernel: accumulates A*B over k0 in ymm4/6/8/10, scales by alpha, then writes C (adding beta*C unless beta == 0). Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for
4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. +// + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) // load mask_3 into ymm15 + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c (one row past this 4-row tile) + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop.
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 3*cs_a; + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15,
ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm10, ymm10) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c (not used by the 4x3 store paths) + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
+ vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------1 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------2 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------3 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_4x3_TILE(4, 6, 8, 10) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_4x3_TILE_BZ(4, 6, 8, 10) + jmp(.DDONE) // jump to end.
+ + label(.DDONE) + vzeroupper() + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_3x3 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register.
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. +// + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 3*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 3*cs_a; + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + 
vfmadd231pd(ymm1, ymm2, ymm8) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm1, ymm2, ymm8) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------1 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------2 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_3x3_TILE(4, 6, 8) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_3x3_TILE_BZ(4, 6, 8) + jmp(.DDONE) // jump to end. 
+ + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_2x3 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// The kernel uses mask_3, which is set to -1, -1, -1, 0, so that the +// 3 elements will be loaded and the 4th element will be set to 0 in the destination vector. +// + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) // load address of the mask (mask_3) + vmovdqu(mem(rdx), ymm15) // load the mask into ymm15; it is the mask operand of every vmaskmovpd below + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 2*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 2*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 3*cs_a; + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. 
+ je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + add(rdi, rcx) + //-----------------------1 + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_2x3_TILE(4, 6) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORBZ) + + C_TRANSPOSE_2x3_TILE_BZ(4, 6) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_1x3 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// The kernel uses mask_3, which is set to -1, -1, -1, 0, so that the +// 3 elements will be loaded and the 4th element will be set to 0 in the destination vector. +// + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) // load address of the mask (mask_3) + vmovdqu(mem(rdx), ymm15) // load the mask into ymm15; it is the mask operand of every vmaskmovpd below + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 1*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 1*8)) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // 
---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 3*cs_a; + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_1x3_TILE(4) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_1x3_TILE_BZ(4) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c new file mode 100644 index 0000000000..ac12db75c1 --- /dev/null +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c @@ -0,0 +1,2519 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + +//3, 5, 7, 9, 11, 13, 4, 6, 8, 10, 12, 14 +#define C_TRANSPOSE_5x5_TILE(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + /*Broadcasting Beta into ymm15 vector register*/\ + vbroadcastsd(mem(rbx), ymm15)\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), ymm15, ymm(R4))\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 1x4 tile*/ \ + vmovlpd(mem(rdx ), xmm0, xmm0)\ + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)\ + vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + /*Transposing 4x1 tile*/ \ + vfmadd213pd(ymm(R5), ymm15, ymm0)\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rdx ))\ + vmovhpd(xmm0, mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2))\ + vmovhpd(xmm1, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + vunpcklpd(ymm(R7), ymm(R6), ymm0)\ + vunpcklpd(ymm(R9), ymm(R8), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R6))\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R6))\ + vmovupd(ymm(R6), mem(rcx ))\ +\ + /*Transposing 1x1 tile*/ \ + vmovlpd(mem(rdx ), xmm0, xmm0)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vfmadd213pd(ymm(R10), ymm15, ymm0)\ + vmovlpd(xmm0, mem(rdx )) + +#define C_TRANSPOSE_5x5_TILE_BZ(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10) \ + 
/*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 1x4 tile*/ \ + vextractf128(imm(1), ymm(R5), xmm1)\ + vmovlpd(xmm(R5), mem(rdx ))\ + vmovhpd(xmm(R5), mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2))\ + vmovhpd(xmm1, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 1x4 tile*/ \ + vunpcklpd(ymm(R7), ymm(R6), ymm0)\ + vunpcklpd(ymm(R9), ymm(R8), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R6))\ +\ + vmovupd(ymm(R6), mem(rcx ))\ +\ + /*Transposing 1x1 tile*/ \ + vmovlpd(xmm(R10), mem(rdx )) + + +#define C_TRANSPOSE_4x5_TILE(R1, R2, R3, R4, R5, R6, R7, R8) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + vbroadcastsd(mem(rbx), ymm15)\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), ymm15, ymm(R4))\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x1 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpcklpd(ymm(R8), 
ymm(R7), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R5))\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R5))\ + vmovupd(ymm(R5), mem(rcx )) + +#define C_TRANSPOSE_4x5_TILE_BZ(R1, R2, R3, R4, R5, R6, R7, R8) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x1 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpcklpd(ymm(R8), ymm(R7), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R5))\ +\ + vmovupd(ymm(R5), mem(rcx )) + +//3, 5, 7, 4, 6, 8 +#define C_TRANSPOSE_3x5_TILE(R1, R2, R3, R4, R5, R6) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm10, ymm(R3), ymm2)\ + vunpckhpd(ymm10, ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm10)\ +\ + vextractf128(imm(0x1), ymm(R1), xmm12)\ + vextractf128(imm(0x1), ymm(R2), xmm13)\ + vextractf128(imm(0x1), ymm(R3), xmm14)\ + vextractf128(imm(0x1), ymm10, xmm15)\ +\ + vbroadcastsd(mem(rbx), ymm11)\ +\ + vfmadd231pd(mem(rcx ), xmm11, xmm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), xmm11, xmm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), xmm11, xmm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), xmm11, xmm10)\ + vmovupd(xmm(R1), mem(rcx ))\ + vmovupd(xmm(R2), mem(rcx, rsi, 1))\ + vmovupd(xmm(R3), mem(rcx, rsi, 2))\ + vmovupd(xmm10, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 1x4 
tile*/ \ + vfmadd231sd(mem(rdx ), xmm11, xmm12)\ + vfmadd231sd(mem(rdx, rsi, 1), xmm11, xmm13)\ + vfmadd231sd(mem(rdx, rsi, 2), xmm11, xmm14)\ + vfmadd231sd(mem(rdx, rax, 1), xmm11, xmm15)\ + vmovsd(xmm12, mem(rdx ))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2))\ + vmovsd(xmm15, mem(rdx, rax, 1))\ + \ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 2x1 tile*/ \ + vunpcklpd(ymm(R5), ymm(R4), ymm0)\ + vunpcklpd(ymm11, ymm(R6), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R4))\ +\ + vextractf128(imm(0x1), ymm(R4), xmm12)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), xmm3, xmm(R4))\ + vmovupd(xmm(R4), mem(rcx ))\ +\ + /*Transposing 1x1 tile*/ \ + vfmadd231sd(mem(rdx ), xmm3, xmm12)\ + vmovsd(xmm12, mem(rdx )) + +#define C_TRANSPOSE_3x5_TILE_BZ(R1, R2, R3, R4, R5, R6) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm10, ymm(R3), ymm2)\ + vunpckhpd(ymm10, ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm10)\ +\ + vextractf128(imm(0x1), ymm(R1), xmm12)\ + vextractf128(imm(0x1), ymm(R2), xmm13)\ + vextractf128(imm(0x1), ymm(R3), xmm14)\ + vextractf128(imm(0x1), ymm10, xmm15)\ +\ + vmovupd(xmm(R1), mem(rcx ))\ + vmovupd(xmm(R2), mem(rcx, rsi, 1))\ + vmovupd(xmm(R3), mem(rcx, rsi, 2))\ + vmovupd(xmm10, mem(rcx, rax, 1))\ +\ + /*Transposing 1x4 tile*/ \ + lea(mem(rcx, rsi, 4), rcx)\ + vmovsd(xmm12, mem(rdx ))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2))\ + vmovsd(xmm15, mem(rdx, rax, 1))\ + \ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 2x1 tile*/ \ + vunpcklpd(ymm(R5), ymm(R4), ymm0)\ + vunpcklpd(ymm11, ymm(R6), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R4))\ +\ + vextractf128(imm(0x1), ymm(R4), xmm12)\ +\ + vmovupd(xmm(R4), mem(rcx ))\ +\ + /*Transposing 1x1 tile*/ \ + vmovsd(xmm12, 
mem(rdx )) + +//3, 5, 4, 6 +#define C_TRANSPOSE_2x5_TILE(R1, R2, R3, R4) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm7)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ + vfmadd231pd(mem(rcx ), xmm3, xmm0)\ + vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)\ + vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)\ + vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm7)\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2))\ + vmovupd(xmm7, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x1 tile*/ \ + vunpcklpd(ymm(R4), ymm(R3), ymm0)\ +\ + vfmadd231pd(mem(rcx ), xmm3, xmm0)\ + vmovupd(xmm0, mem(rcx )) + +#define C_TRANSPOSE_2x5_TILE_BZ(R1, R2, R3, R4) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm7)\ +\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2))\ + vmovupd(xmm7, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x1 tile*/ \ + vunpcklpd(ymm(R4), ymm(R3), ymm0)\ +\ + vmovupd(xmm0, mem(rcx )) + +#define C_TRANSPOSE_1x5_TILE(R1, R2) \ + /*Transposing 1x4 tile*/ \ + vmovlpd(mem(rcx ), xmm0, xmm0)\ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)\ + vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vbroadcastsd(mem(rbx), ymm15)\ + vfmadd213pd(ymm(R1), ymm15, ymm0)\ +\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rcx ))\ + vmovhpd(xmm0, mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2))\ + vmovhpd(xmm1, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + vmovlpd(mem(rcx ), xmm0, xmm0)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vfmadd213pd(ymm(R2), ymm15, ymm0)\ +\ + /*Transposing 1x1 tile*/ \ + vextractf128(imm(1), ymm0, 
xmm1)\
+	vmovlpd(xmm0, mem(rcx        ))
+
+#define C_TRANSPOSE_1x5_TILE_BZ(R1, R2) \
+	vextractf128(imm(1), ymm(R1), xmm1)\
+	vmovlpd(xmm(R1), mem(rcx        ))\
+	vmovhpd(xmm(R1), mem(rcx, rsi, 1))\
+	vmovlpd(xmm1, mem(rcx, rsi, 2))\
+	vmovhpd(xmm1, mem(rcx, rax, 1))\
+\
+	lea(mem(rcx, rsi, 4), rcx)\
+\
+	vextractf128(imm(1), ymm(R2), xmm1)\
+	vmovlpd(xmm(R2), mem(rcx        ))
+
+static const int64_t mask_1[4] = {-1, 0, 0, 0}; // enables only the lowest of the 4 lanes
+
+void bli_dgemmsup_rv_haswell_asm_5x5
+     (
+       conj_t              conja,   // not referenced in this kernel body
+       conj_t              conjb,   // not referenced in this kernel body
+       dim_t               m0,      // not referenced in this kernel body
+       dim_t               n0,      // not referenced in this kernel body
+       dim_t               k0,      // split into k_iter and k_left below
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,    // not referenced in this kernel body
+       cntx_t*    restrict cntx     // not referenced in this kernel body
+     )
+{
+	uint64_t k_iter = k0 / 4; // passes through the 4x-unrolled main loop
+	uint64_t k_left = k0 % 4; // leftover passes handled by the edge loop
+
+	uint64_t rs_a   = rs_a0;
+	uint64_t cs_a   = cs_a0;
+	uint64_t rs_b   = rs_b0;
+	uint64_t cs_b   = cs_b0;  // handed to the asm block, whose loads of it are commented out
+	uint64_t rs_c   = rs_c0;
+	uint64_t cs_c   = cs_c0;
+
+// Sets up the mask used to load the remainder elements (in load direction).
+// The int64_t array of size 4 holds the mask for the 4 elements of an AVX2 vector register.
+//
+//  Low end           High end
+//  ________________________
+//  |     |     |     |     |
+//  |  1  |  2  |  3  |  4  |      ----> Source vector
+//  |_____|_____|_____|_____|
+//
+//  ________________________
+//  |     |     |     |     |
+//  | -1  |  0  |  0  |  0  |      ----> Mask vector( mask_1 )
+//  |_____|_____|_____|_____|
+//
+//  ________________________
+//  |     |     |     |     |
+//  |  1  |  0  |  0  |  0  |      ----> Destination vector
+//  |_____|_____|_____|_____|
+//
+// Since there are 5 elements to load, the kernel uses one regular load
+// that brings 4 elements into a vector register and, for the remaining 1 element,
+// a masked load with mask_1, which is set to -1, 0, 0, 0 so that only that
+// 1 element is loaded and the other 3 elements are set to 0 in the destination vector.
+// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+	jz(.DCOLPFETCH)                    // jump to column storage case
+	label(.DROWPFETCH)                 // row-stored prefetching on c
+
+	lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
+	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
+	prefetch(0, mem(rcx,         4*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(rcx, rdi, 1, 4*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(rcx, rdi, 2, 4*8)) // prefetch c + 2*rs_c
+	prefetch(0, mem(rdx,         4*8)) // prefetch c + 3*rs_c
+	prefetch(0, mem(rdx, rdi, 1, 4*8)) // prefetch c + 4*rs_c
+
+	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
+	label(.DCOLPFETCH)                 // column-stored prefetching c
+
+	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
+	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
+	prefetch(0, mem(rcx,         4*8)) // prefetch c + 0*cs_c
+	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
+	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 4*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rsi, 4, 4*8)) // prefetch c + 4*cs_c (rdx*2 would be 6*cs_c, past the 5th column)
+
+	label(.DPOSTPFETCH)                // done prefetching c
+
+
+#if 1
+	lea(mem(rax, r9,  8), rdx)         // rdx = a + 8*cs_a;
+	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a; (prefetch pointer for a)
+#endif
+
+
+
+
+	mov(var(k_iter), rsi)              // i = k_iter;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
+	                                   // contains the k_left loop.
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + 
vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_5x5_TILE(3, 5, 7, 9, 11, 4, 6, 8, 10, 12) + + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + C_TRANSPOSE_5x5_TILE_BZ(3, 5, 7, 9, 11, 4, 6, 8, 10, 12) + label(.DDONE) + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm7", "ymm9", "ymm11", "ymm15", + "memory" + ) +} + + +void bli_dgemmsup_rv_haswell_asm_4x5 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = 
rs_a0;
+	uint64_t cs_a   = cs_a0;
+	uint64_t rs_b   = rs_b0;
+	uint64_t cs_b   = cs_b0;  // handed to the asm block, whose loads of it are commented out
+	uint64_t rs_c   = rs_c0;
+	uint64_t cs_c   = cs_c0;
+
+// Sets up the mask used to load the remainder elements (in load direction).
+// The int64_t array of size 4 holds the mask for the 4 elements of an AVX2 vector register.
+//
+//  Low end           High end
+//  ________________________
+//  |     |     |     |     |
+//  |  1  |  2  |  3  |  4  |      ----> Source vector
+//  |_____|_____|_____|_____|
+//
+//  ________________________
+//  |     |     |     |     |
+//  | -1  |  0  |  0  |  0  |      ----> Mask vector( mask_1 )
+//  |_____|_____|_____|_____|
+//
+//  ________________________
+//  |     |     |     |     |
+//  |  1  |  0  |  0  |  0  |      ----> Destination vector
+//  |_____|_____|_____|_____|
+//
+// Since there are 5 elements to load, the kernel uses one regular load
+// that brings 4 elements into a vector register and, for the remaining 1 element,
+// a masked load with mask_1, which is set to -1, 0, 0, 0 so that only that
+// 1 element is loaded and the other 3 elements are set to 0 in the destination vector.
+//
+	int64_t const *mask_vec = mask_1;
+	// -------------------------------------------------------------------------
+
+	begin_asm()
+
+	vzeroall()                         // zero all xmm/ymm registers.
+	mov(var(mask_vec), rdx)            // rdx = address of mask_1
+	vmovdqu(mem(rdx), ymm15)           // load the mask into ymm15
+	mov(var(a), rax)                   // load address of a.
+	mov(var(rs_a), r8)                 // load rs_a
+	mov(var(cs_a), r9)                 // load cs_a
+	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
+	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
+
+	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
+
+	mov(var(b), rbx)                   // load address of b.
+	mov(var(rs_b), r10)                // load rs_b
+	//mov(var(cs_b), r11)                // load cs_b
+	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
+	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
+
+	// NOTE: We cannot pre-load elements of a or b
+	// because it could eventually, in the last
+	// unrolled iter or the cleanup loop, result
+	// in reading beyond the bounds of allocated mem
+	// (the likely result: a segmentation fault).
+
+	mov(var(c), rcx)                   // load address of c
+	mov(var(rs_c), rdi)                // load rs_c
+	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
+
+
+
+	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
+	jz(.DCOLPFETCH)                    // jump to column storage case
+	label(.DROWPFETCH)                 // row-stored prefetching on c
+
+	lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
+	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
+	prefetch(0, mem(rcx,         4*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(rcx, rdi, 1, 4*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(rcx, rdi, 2, 4*8)) // prefetch c + 2*rs_c
+	prefetch(0, mem(rdx,         4*8)) // prefetch c + 3*rs_c
+
+	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
+	label(.DCOLPFETCH)                 // column-stored prefetching c
+
+	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
+	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
+	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*cs_c
+	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
+	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 3*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rsi, 4, 3*8)) // prefetch c + 4*cs_c (rdx*2 would be 6*cs_c, past the 5th column)
+
+	label(.DPOSTPFETCH)                // done prefetching c
+
+
+#if 1
+	lea(mem(rax, r9,  8), rdx)         // rdx = a + 8*cs_a;
+	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a; (prefetch pointer for a)
+#endif
+
+
+
+
+	mov(var(k_iter), rsi)              // i = k_iter;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
+	                                   // contains the k_left loop.
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + 
vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + //-----------------------4 + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_4x5_TILE(3, 5, 7, 9, 4, 6, 8, 10) + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + //-----------------------4 + + + jmp(.DDONE) // jump to end. 
+ + + + label(.DCOLSTORBZ) + C_TRANSPOSE_4x5_TILE_BZ(3, 5, 7, 9, 4, 6, 8, 10) + label(.DDONE) + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm7", "ymm9", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_3x5 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+//
+//  Low end           High end
+//  ________________________
+//  |     |     |     |     |
+//  |  1  |  2  |  3  |  4  |      ----> Source vector
+//  |_____|_____|_____|_____|
+//
+//  ________________________
+//  |     |     |     |     |
+//  | -1  |  0  |  0  |  0  |      ----> Mask vector( mask_1 )
+//  |_____|_____|_____|_____|
+//
+//  ________________________
+//  |     |     |     |     |
+//  |  1  |  0  |  0  |  0  |      ----> Destination vector
+//  |_____|_____|_____|_____|
+//
+// Since there are 5 elements to load, the kernel uses one regular load
+// that brings 4 elements into a vector register and, for the remaining 1 element,
+// a masked load with mask_1, which is set to -1, 0, 0, 0 so that only that
+// 1 element is loaded and the other 3 elements are set to 0 in the destination vector.
+//
+	int64_t const *mask_vec = mask_1;
+	// -------------------------------------------------------------------------
+
+	begin_asm()
+
+	vzeroall()                         // zero all xmm/ymm registers.
+	mov(var(mask_vec), rdx)            // rdx = address of mask_1
+	vmovdqu(mem(rdx), ymm15)           // load the mask into ymm15
+	mov(var(a), rax)                   // load address of a.
+	mov(var(rs_a), r8)                 // load rs_a
+	mov(var(cs_a), r9)                 // load cs_a
+	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
+	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)
+
+	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a
+
+	mov(var(b), rbx)                   // load address of b.
+	mov(var(rs_b), r10)                // load rs_b
+	//mov(var(cs_b), r11)                // load cs_b
+	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
+	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
+
+	// NOTE: We cannot pre-load elements of a or b
+	// because it could eventually, in the last
+	// unrolled iter or the cleanup loop, result
+	// in reading beyond the bounds of allocated mem
+	// (the likely result: a segmentation fault).
+
+	mov(var(c), rcx)                   // load address of c
+	mov(var(rs_c), rdi)                // load rs_c
+	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
+
+
+
+	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
+	jz(.DCOLPFETCH)                    // jump to column storage case
+	label(.DROWPFETCH)                 // row-stored prefetching on c
+
+	lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
+	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c; (not consumed: the 3 row prefetches use rcx/rdi only)
+	prefetch(0, mem(rcx,         4*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(rcx, rdi, 1, 4*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(rcx, rdi, 2, 4*8)) // prefetch c + 2*rs_c
+
+	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
+	label(.DCOLPFETCH)                 // column-stored prefetching c
+
+	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
+	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
+	prefetch(0, mem(rcx,         2*8)) // prefetch c + 0*cs_c
+	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
+	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 2*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rsi, 4, 2*8)) // prefetch c + 4*cs_c (rdx*2 would be 6*cs_c, past the 5th column)
+
+	label(.DPOSTPFETCH)                // done prefetching c
+
+
+#if 1
+	lea(mem(rax, r9,  8), rdx)         // rdx = a + 8*cs_a;
+	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a; (prefetch pointer for a)
+#endif
+
+
+
+
+	mov(var(k_iter), rsi)              // i = k_iter;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
+	                                   // contains the k_left loop.
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; 
+#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_3x5_TILE(3, 5, 7, 4, 6, 8) + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + add(rdi, rcx) + + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + add(rdi, rcx) + + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + C_TRANSPOSE_3x5_TILE_BZ(3, 5, 7, 4, 6, 8) + label(.DDONE) + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm7", "ymm11", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_2x5 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 5 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 1 element, +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded and other 3 elements will be set to 0 in destination vector. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) // load mask_1 into ymm15 + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // rdx = c + 2*rs_c; + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 4*8)) // prefetch c + 1*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rcx, rdx, 1, 1*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rcx, rsi, 4, 1*8)) // prefetch c + 4*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // rdx = a + 8*cs_a; + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), 
ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_2x5_TILE(3, 5, 4, 6) + + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + add(rdi, rcx) + + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + C_TRANSPOSE_2x5_TILE_BZ(3, 5, 4, 6) + label(.DDONE) + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_1x5 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t 
rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 5 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 1 element, +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded and other 3 elements will be set to 0 in destination vector. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) // load mask_1 into ymm15 + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. 
+ mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // rdx = c + 2*rs_c; + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rcx, rdx, 1, 0*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rcx, rsi, 4, 0*8)) // prefetch c + 4*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // rdx = a + 8*cs_a; + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_1x5_TILE(3, 4) + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. 
+ + + + label(.DCOLSTORBZ) + C_TRANSPOSE_1x5_TILE_BZ(3, 4) + label(.DDONE) + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm15", + "memory" + ) +} diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c new file mode 100644 index 0000000000..8c14eba4af --- /dev/null +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c @@ -0,0 +1,2602 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + +//3, 5, 7, 9, 11, 13, 4, 6, 8, 10, 12, 14 -- NOTE(review): 12 values listed but the macro below takes 10 arguments; confirm against the call site +#define C_TRANSPOSE_5x7_TILE(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + /*Broadcasting Beta into ymm15 vector register*/\ + vbroadcastsd(mem(rbx), ymm15)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*R1, R2, R3, R4 holds final result*/ \ + vfmadd231pd(mem(rcx ), ymm15, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), ymm15, ymm(R4))\ + /*Storing it back to C matrix.*/ \ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + /*Advance rcx by 4*rsi for the 4x3 tile stored below; the 1x4 tile that follows is addressed through rdx.*/ \ + lea(mem(rcx, rsi, 4), 
rcx)\ +\ + /*1x4 tile: gather four C values through rdx, multiply by ymm15 (Beta), add R5, scatter back*/ \ + vmovlpd(mem(rdx ), xmm0, xmm0)\ + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)\ + vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vfmadd213pd(ymm(R5), ymm15, ymm0)\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rdx ))\ + vmovhpd(xmm0, mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2))\ + vmovhpd(xmm1, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R7), ymm(R6), ymm0)\ + vunpckhpd(ymm(R7), ymm(R6), ymm1)\ + vunpcklpd(ymm(R9), ymm(R8), ymm2)\ + vunpckhpd(ymm(R9), ymm(R8), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R6))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R7))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R8))\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R6))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R7))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R8))\ + vmovupd(ymm(R6), mem(rcx ))\ + vmovupd(ymm(R7), mem(rcx, rsi, 1))\ + vmovupd(ymm(R8), mem(rcx, rsi, 2))\ +\ + vmovlpd(mem(rdx ), xmm0, xmm0)\ + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + /*1x3 tile: multiply the three C values gathered above by ymm15 (Beta), add R10, scatter back*/ \ + vfmadd213pd(ymm(R10), ymm15, ymm0)\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rdx ))\ + vmovhpd(xmm0, mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_5x7_TILE_BZ(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 
1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 1x4 tile*/ \ + vextractf128(imm(1), ymm(R5), xmm1)\ + vmovlpd(xmm(R5), mem(rdx ))\ + vmovhpd(xmm(R5), mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2))\ + vmovhpd(xmm1, mem(rdx, rax, 1))\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R7), ymm(R6), ymm0)\ + vunpckhpd(ymm(R7), ymm(R6), ymm1)\ + vunpcklpd(ymm(R9), ymm(R8), ymm2)\ + vunpckhpd(ymm(R9), ymm(R8), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R6))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R7))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R8))\ +\ + vmovupd(ymm(R6), mem(rcx ))\ + vmovupd(ymm(R7), mem(rcx, rsi, 1))\ + vmovupd(ymm(R8), mem(rcx, rsi, 2))\ +\ + /*Transposing 1x3 tile*/ \ + vextractf128(imm(1), ymm(R10), xmm1)\ + vmovlpd(xmm(R10), mem(rdx ))\ + vmovhpd(xmm(R10), mem(rdx, rsi, 1))\ + vmovlpd(xmm1, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_4x7_TILE(R1, R2, R3, R4, R5, R6, R7, R8) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + vbroadcastsd(mem(rbx), ymm15)\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), ymm15, ymm(R4))\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vunpcklpd(ymm(R8), ymm(R7), ymm2)\ + vunpckhpd(ymm(R8), ymm(R7), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R5))\ + 
vinsertf128(imm(0x1), xmm3, ymm1, ymm(R6))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R7))\ +\ + vfmadd231pd(mem(rcx ), ymm15, ymm(R5))\ + vfmadd231pd(mem(rcx, rsi, 1), ymm15, ymm(R6))\ + vfmadd231pd(mem(rcx, rsi, 2), ymm15, ymm(R7))\ + vmovupd(ymm(R5), mem(rcx ))\ + vmovupd(ymm(R6), mem(rcx, rsi, 1))\ + vmovupd(ymm(R7), mem(rcx, rsi, 2)) + +#define C_TRANSPOSE_4x7_TILE_BZ(R1, R2, R3, R4, R5, R6, R7, R8) \ + /*Transposing 4x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vunpckhpd(ymm(R4), ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm(R4))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ + vmovupd(ymm(R2), mem(rcx, rsi, 1))\ + vmovupd(ymm(R3), mem(rcx, rsi, 2))\ + vmovupd(ymm(R4), mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 4x3 tile*/ \ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ + vunpckhpd(ymm(R6), ymm(R5), ymm1)\ + vunpcklpd(ymm(R8), ymm(R7), ymm2)\ + vunpckhpd(ymm(R8), ymm(R7), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R5))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R6))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R7))\ +\ + vmovupd(ymm(R5), mem(rcx ))\ + vmovupd(ymm(R6), mem(rcx, rsi, 1))\ + vmovupd(ymm(R7), mem(rcx, rsi, 2)) + +//3, 5, 7, 4, 6, 8 +#define C_TRANSPOSE_3x7_TILE(R1, R2, R3, R4, R5, R6) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm10, ymm(R3), ymm2)\ + vunpckhpd(ymm10, ymm(R3), ymm15)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm10)\ +\ + /*Transposing 1x4 tile*/ \ + vextractf128(imm(0x1), ymm(R1), xmm12)\ + vextractf128(imm(0x1), ymm(R2), xmm13)\ + vextractf128(imm(0x1), ymm(R3), 
xmm14)\ + vextractf128(imm(0x1), ymm10, xmm15)\ +\ + vbroadcastsd(mem(rbx), ymm11)\ +\ + vfmadd231pd(mem(rcx ), xmm11, xmm(R1))\ + vfmadd231pd(mem(rcx, rsi, 1), xmm11, xmm(R2))\ + vfmadd231pd(mem(rcx, rsi, 2), xmm11, xmm(R3))\ + vfmadd231pd(mem(rcx, rax, 1), xmm11, xmm10)\ + vmovupd(xmm(R1), mem(rcx ))\ + vmovupd(xmm(R2), mem(rcx, rsi, 1))\ + vmovupd(xmm(R3), mem(rcx, rsi, 2))\ + vmovupd(xmm10, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + vfmadd231sd(mem(rdx ), xmm11, xmm12)\ + vfmadd231sd(mem(rdx, rsi, 1), xmm11, xmm13)\ + vfmadd231sd(mem(rdx, rsi, 2), xmm11, xmm14)\ + vfmadd231sd(mem(rdx, rax, 1), xmm11, xmm15)\ + vmovsd(xmm12, mem(rdx ))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2))\ + vmovsd(xmm15, mem(rdx, rax, 1))\ + \ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R5), ymm(R4), ymm0)\ + vunpckhpd(ymm(R5), ymm(R4), ymm1)\ + vunpcklpd(ymm11, ymm(R6), ymm2)\ + vunpckhpd(ymm11, ymm(R6), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R4))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R5))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R6))\ +\ + /*Transposing 1x3 tile*/ \ + vextractf128(imm(0x1), ymm(R4), xmm12)\ + vextractf128(imm(0x1), ymm(R5), xmm13)\ + vextractf128(imm(0x1), ymm(R6), xmm14)\ +\ + vfmadd231pd(mem(rcx ), xmm11, xmm(R4))\ + vfmadd231pd(mem(rcx, rsi, 1), xmm11, xmm(R5))\ + vfmadd231pd(mem(rcx, rsi, 2), xmm11, xmm(R6))\ + vmovupd(xmm(R4), mem(rcx ))\ + vmovupd(xmm(R5), mem(rcx, rsi, 1))\ + vmovupd(xmm(R6), mem(rcx, rsi, 2))\ +\ + vfmadd231sd(mem(rdx ), xmm11, xmm12)\ + vfmadd231sd(mem(rdx, rsi, 1), xmm11, xmm13)\ + vfmadd231sd(mem(rdx, rsi, 2), xmm11, xmm14)\ + vmovsd(xmm12, mem(rdx ))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2)) + +#define C_TRANSPOSE_3x7_TILE_BZ(R1, R2, R3, R4, R5, R6) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vunpcklpd(ymm10, ymm(R3), ymm2)\ + vunpckhpd(ymm10, ymm(R3), ymm15)\ + 
vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ + vinsertf128(imm(0x1), xmm15, ymm1, ymm(R2))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R3))\ + vperm2f128(imm(0x31), ymm15, ymm1, ymm10)\ +\ + /*Transposing 1x4 tile*/ \ + vextractf128(imm(0x1), ymm(R1), xmm12)\ + vextractf128(imm(0x1), ymm(R2), xmm13)\ + vextractf128(imm(0x1), ymm(R3), xmm14)\ + vextractf128(imm(0x1), ymm10, xmm15)\ +\ + vmovupd(xmm(R1), mem(rcx ))\ + vmovupd(xmm(R2), mem(rcx, rsi, 1))\ + vmovupd(xmm(R3), mem(rcx, rsi, 2))\ + vmovupd(xmm10, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ + vmovsd(xmm12, mem(rdx ))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2))\ + vmovsd(xmm15, mem(rdx, rax, 1))\ + \ + lea(mem(rdx, rsi, 4), rdx)\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R5), ymm(R4), ymm0)\ + vunpckhpd(ymm(R5), ymm(R4), ymm1)\ + vunpcklpd(ymm11, ymm(R6), ymm2)\ + vunpckhpd(ymm11, ymm(R6), ymm3)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R4))\ + vinsertf128(imm(0x1), xmm3, ymm1, ymm(R5))\ + vperm2f128(imm(0x31), ymm2, ymm0, ymm(R6))\ +\ + /*Transposing 1x3 tile*/ \ + vextractf128(imm(0x1), ymm(R4), xmm12)\ + vextractf128(imm(0x1), ymm(R5), xmm13)\ + vextractf128(imm(0x1), ymm(R6), xmm14)\ +\ + vmovupd(xmm(R4), mem(rcx ))\ + vmovupd(xmm(R5), mem(rcx, rsi, 1))\ + vmovupd(xmm(R6), mem(rcx, rsi, 2))\ +\ + vmovsd(xmm12, mem(rdx ))\ + vmovsd(xmm13, mem(rdx, rsi, 1))\ + vmovsd(xmm14, mem(rdx, rsi, 2)) + +//3, 5, 4, 6 +#define C_TRANSPOSE_2x7_TILE(R1, R2, R3, R4) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm7)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ + vfmadd231pd(mem(rcx ), xmm3, xmm0)\ + vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)\ + vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)\ + vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm7)\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2))\ + vmovupd(xmm7, mem(rcx, rax, 1))\ +\ + 
lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R4), ymm(R3), ymm0)\ + vunpckhpd(ymm(R4), ymm(R3), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ +\ + vfmadd231pd(mem(rcx ), xmm3, xmm0)\ + vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)\ + vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2)) + + +#define C_TRANSPOSE_2x7_TILE_BZ(R1, R2, R3, R4) \ + /*Transposing 2x4 tile*/ \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpckhpd(ymm(R2), ymm(R1), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ + vextractf128(imm(0x1), ymm1, xmm7)\ +\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2))\ + vmovupd(xmm7, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + /*Transposing 2x3 tile*/ \ + vunpcklpd(ymm(R4), ymm(R3), ymm0)\ + vunpckhpd(ymm(R4), ymm(R3), ymm1)\ + vextractf128(imm(0x1), ymm0, xmm2)\ +\ + vmovupd(xmm0, mem(rcx ))\ + vmovupd(xmm1, mem(rcx, rsi, 1))\ + vmovupd(xmm2, mem(rcx, rsi, 2)) + + +#define C_TRANSPOSE_1x7_TILE(R1, R2) \ + vmovlpd(mem(rcx ), xmm0, xmm0)\ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)\ + vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vbroadcastsd(mem(rbx), ymm15)\ + vfmadd213pd(ymm(R1), ymm15, ymm0)\ +\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rcx ))\ + vmovhpd(xmm0, mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2))\ + vmovhpd(xmm1, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ +\ + vmovlpd(mem(rcx ), xmm0, xmm0)\ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)\ + vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)\ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0)\ +\ + vfmadd213pd(ymm(R2), ymm15, ymm0)\ +\ + vextractf128(imm(1), ymm0, xmm1)\ + vmovlpd(xmm0, mem(rcx ))\ + vmovhpd(xmm0, mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2)) + + +#define C_TRANSPOSE_1x7_TILE_BZ(R1, R2) \ + vextractf128(imm(1), ymm(R1), xmm1)\ + 
vmovlpd(xmm(R1), mem(rcx ))\ + vmovhpd(xmm(R1), mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2))\ + vmovhpd(xmm1, mem(rcx, rax, 1))\ +\ + lea(mem(rcx, rsi, 4), rcx)\ + vextractf128(imm(1), ymm(R2), xmm1)\ + vmovlpd(xmm(R2), mem(rcx ))\ + vmovhpd(xmm(R2), mem(rcx, rsi, 1))\ + vmovlpd(xmm1, mem(rcx, rsi, 2)) + +static const int64_t mask_3[4] = {-1, -1, -1, 0}; + +void bli_dgemmsup_rv_haswell_asm_5x7 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 7 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 3 elements, +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. 
+ mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 6*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 6*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 6*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 6*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 6*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c + lea(mem(rdx, rsi, 2), rdx) // rdx = 5*cs_c; + prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + 
vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_5x7_TILE(3, 5, 7, 9, 11, 4, 6, 8, 10, 12) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------4 + + vmovupd(ymm11, mem(rcx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_5x7_TILE_BZ(3, 5, 7, 9, 11, 4, 6, 8, 10, 12) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm7", "ymm9", "ymm15", + "memory" + ) +} + + +void bli_dgemmsup_rv_haswell_asm_4x7 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 
4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 7 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 3 elements, +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. 
+ mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 6*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 6*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 6*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 6*8)) // prefetch c + 3*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c + lea(mem(rdx, rsi, 2), rdx) // rdx = 5*cs_c; + prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. 
+ je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, 
ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) + + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + //-----------------------4 + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_4x7_TILE(3, 5, 7, 9, 4, 6, 8, 10) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------3 + vmovupd(ymm9, mem(rcx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + //-----------------------4 + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_4x7_TILE_BZ(3, 5, 7, 9, 4, 6, 8, 10) + jmp(.DDONE) // jump to end. 
+ + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm7", "ymm11", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_3x7 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 7 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 3 elements, +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 6*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 6*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 6*8)) // prefetch c + 2*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c + lea(mem(rdx, rsi, 2), rdx) // rdx = 5*cs_c; + prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; 
+#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) + + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_3x7_TILE(3, 5, 7, 4, 6, 8) + jmp(.DDONE) // jump to end. + + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------2 + vmovupd(ymm7, mem(rcx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_3x7_TILE_BZ(3, 5, 7, 4, 6, 8) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_2x7 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array 
of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 7 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 3 elements, +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 6*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 6*8)) // prefetch c + 1*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c + lea(mem(rdx, rsi, 2), rdx) // rdx = 5*cs_c; + prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), 
ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm1, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm6) + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_2x7_TILE(3, 5, 4, 6) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + add(rdi, rcx) + //-----------------------1 + + vmovupd(ymm5, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_2x7_TILE_BZ(3, 5, 4, 6) + jmp(.DDONE) // jump to end. 
+ + label(.DDONE) + vzeroupper() + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_1x7 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// Since we have 7 elements to load, kernel will use one normal load +// that loads 4 elements into vector register and for remainder 3 elements, +// kernel is using mask_3 which is set to -1, -1, -1, 0 so that the +// 3 elements will be loaded and 4th element will be set to 0 in destination vector. + int64_t const *mask_vec = mask_3; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 6*8)) // prefetch c + 0*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c + lea(mem(rdx, rsi, 2), rdx) // rdx = 5*cs_c; + prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + + //Loads 4 element + vmovupd(mem(rbx, 0*32), ymm0) + //Loads 3 elements as per mask_3 mask vector + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm0, ymm2, ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm3, ymm3) // scale by alpha + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) + vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_1x7_TILE(3, 4) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + + jmp(.DDONE) // jump to end. 
+ + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_1x7_TILE_BZ(3, 4) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", + "ymm15", + "memory" + ) +} diff --git a/kernels/haswell/bli_kernels_haswell.h b/kernels/haswell/bli_kernels_haswell.h index d841d715f3..8c4e3c44ec 100644 --- a/kernels/haswell/bli_kernels_haswell.h +++ b/kernels/haswell/bli_kernels_haswell.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022, Advanced Micro Devices, Inc. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -219,6 +219,12 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x7 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x7 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x7 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x7 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x7 ) + GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) @@ -226,6 +232,12 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x5 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x5 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x5 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x5 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x5 ) + GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) @@ -233,6 +245,12 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x3 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x3 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x3 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x3 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x3 ) + GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index 477c710471..1d1c5105f0 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -2392,939 +2392,1119 @@ err_t bli_dgemm_small } m_remainder = M - row_idx; - - if (m_remainder >= 12) + if(m_remainder) { - m_remainder -= 12; - - for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + // Sets up the mask for loading relevant remainder elements in load direction + // int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. + // + // Low end High end * Low end High end + // ________________________ * ________________________ + // | | | | | * | | | | | + // | 1 | 2 | 3 | 4 | ----> Source vector * | 1 | 2 | 3 | 4 | ----> Source vector + // |_____|_____|_____|_____| * |_____|_____|_____|_____| + // * + // ________________________ * ________________________ + // | | | | | * | | | | | + // | -1 | -1 | -1 | 0 | ----> Mask vector( mask_3 ) | -1 | -1 | 0 | 0 | ----> Mask vector( mask_2 ) + // |_____|_____|_____|_____| * |_____|_____|_____|_____| + // * + // ________________________ * ________________________ + // | | | | | * | | | | | + // | 1 | 2 | 3 | 0 | ----> Destination vector * | 1 | 2 | 0 | 0 | ----> Destination vector + // |_____|_____|_____|_____| * |_____|_____|_____|_____| + // + // -1 sets all the bits to 1. 
+ // + dim_t m_rem = 0; + int64_t mask_4[4] = {0}; + mask_4[0] = -1; + mask_4[1] = -1; + mask_4[2] = -1; + mask_4[3] = -1; + + int64_t mask_3[4] = {0}; + mask_3[0] = -1; + mask_3[1] = -1; + mask_3[2] = -1; + mask_3[3] = 0; + + int64_t mask_2[4] = {0}; + mask_2[0] = -1; + mask_2[1] = -1; + mask_2[2] = 0; + mask_2[3] = 0; + + int64_t mask_1[4] = {0}; + mask_1[0] = -1; + mask_1[1] = 0; + mask_1[2] = 0; + mask_1[3] = 0; + + int64_t *mask_ptr[] = {mask_4, mask_1, mask_2, mask_3, mask_4}; + if(m_remainder > 12) { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; - - // clear scratch registers. - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); - ymm8 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - ymm10 = _mm256_setzero_pd(); - ymm12 = _mm256_setzero_pd(); - ymm13 = _mm256_setzero_pd(); - ymm14 = _mm256_setzero_pd(); - - for (k = 0; k < K; ++k) + // Handles edge cases where remainder elements are between 12-16(13, 14, 15). + // Here m_rem gives index in mask_ptr that points which mask to be used based + // on remainder elements which could be 1, 2, or 3 here. + m_rem = (m_remainder % 12); + __m256i maskVec = _mm256_loadu_si256( (__m256i *)mask_ptr[m_rem]); + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); - tB += tb_inc_row; - - //broadcasted matrix B elements are multiplied - //with matrix A columns. 
- ymm3 = _mm256_loadu_pd(tA); - // ymm4 += ymm0 * ymm3; - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); - // ymm8 += ymm1 * ymm3; - ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); - // ymm12 += ymm2 * ymm3; - ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); - - ymm3 = _mm256_loadu_pd(tA + 4); - // ymm5 += ymm0 * ymm3; - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - // ymm9 += ymm1 * ymm3; - ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); - // ymm13 += ymm2 * ymm3; - ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); - - ymm3 = _mm256_loadu_pd(tA + 8); - // ymm6 += ymm0 * ymm3; - ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); - // ymm10 += ymm1 * ymm3; - ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); - // ymm14 += ymm2 * ymm3; - ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); - - tA += lda; - } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); - - //multiply A*B by alpha. - ymm4 = _mm256_mul_pd(ymm4, ymm0); - ymm5 = _mm256_mul_pd(ymm5, ymm0); - ymm6 = _mm256_mul_pd(ymm6, ymm0); - ymm8 = _mm256_mul_pd(ymm8, ymm0); - ymm9 = _mm256_mul_pd(ymm9, ymm0); - ymm10 = _mm256_mul_pd(ymm10, ymm0); - ymm12 = _mm256_mul_pd(ymm12, ymm0); - ymm13 = _mm256_mul_pd(ymm13, ymm0); - ymm14 = _mm256_mul_pd(ymm14, ymm0); - - if(is_beta_non_zero) - { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - ymm2 = _mm256_loadu_pd(tC + 4); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - ymm2 = _mm256_loadu_pd(tC + 8); - ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); - - // multiply C by beta and accumulate. - double *ttC = tC +ldc; - ymm2 = _mm256_loadu_pd(ttC); - ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); - ymm2 = _mm256_loadu_pd(ttC + 4); - ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - ymm2 = _mm256_loadu_pd(ttC + 8); - ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); - - // multiply C by beta and accumulate. 
- ttC += ldc; - ymm2 = _mm256_loadu_pd(ttC); - ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - ymm2 = _mm256_loadu_pd(ttC + 4); - ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - ymm2 = _mm256_loadu_pd(ttC + 8); - ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - } - _mm256_storeu_pd(tC, ymm4); - _mm256_storeu_pd(tC + 4, ymm5); - _mm256_storeu_pd(tC + 8, ymm6); + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); - tC += ldc; + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; - _mm256_storeu_pd(tC, ymm8); - _mm256_storeu_pd(tC + 4, ymm9); - _mm256_storeu_pd(tC + 8, ymm10); + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); - tC += ldc; + ymm3 = _mm256_loadu_pd(tA + 4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); - _mm256_storeu_pd(tC, ymm12); - _mm256_storeu_pd(tC + 4, ymm13); - _mm256_storeu_pd(tC + 8, ymm14); - } - n_remainder = N - col_idx; - // if the N is not multiple of 3. - // handling edge case. 
- if (n_remainder == 2) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + ymm3 = _mm256_loadu_pd(tA + 8); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); + ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); - // clear scratch registers. - ymm8 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); - ymm10 = _mm256_setzero_pd(); - ymm12 = _mm256_setzero_pd(); - ymm13 = _mm256_setzero_pd(); - ymm14 = _mm256_setzero_pd(); + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 12, maskVec); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); + ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - tB += tb_inc_row; + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm7 = _mm256_mul_pd(ymm7, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm11 = _mm256_mul_pd(ymm11, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); - ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); + if(is_beta_non_zero) + { + // multiply C by beta and accumulate col 1. 
+ ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 12, maskVec); + ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); + + // multiply C by beta and accumulate, col 2. + double* ttC = tC + ldc; + ymm2 = _mm256_loadu_pd(ttC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(ttC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(ttC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC + 12, maskVec); + ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); + + // multiply C by beta and accumulate, col 3. + ttC += ldc; + ymm2 = _mm256_loadu_pd(ttC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(ttC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(ttC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. 
+ ymm2 = _mm256_maskload_pd(ttC + 12, maskVec); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); + } + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + _mm256_storeu_pd(tC + 8, ymm6); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 12, maskVec, ymm7); - ymm3 = _mm256_loadu_pd(tA + 4); - ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); - ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); + tC += ldc; - ymm3 = _mm256_loadu_pd(tA + 8); - ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); - ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); + _mm256_storeu_pd(tC, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 12, maskVec, ymm11); - tA += lda; + tC += ldc; + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 12, maskVec, ymm15); } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); - - //multiply A*B by alpha. - ymm8 = _mm256_mul_pd(ymm8, ymm0); - ymm9 = _mm256_mul_pd(ymm9, ymm0); - ymm10 = _mm256_mul_pd(ymm10, ymm0); - ymm12 = _mm256_mul_pd(ymm12, ymm0); - ymm13 = _mm256_mul_pd(ymm13, ymm0); - ymm14 = _mm256_mul_pd(ymm14, ymm0); - + n_remainder = N - col_idx; - if(is_beta_non_zero) + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC + 0); - ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); - ymm2 = _mm256_loadu_pd(tC + 4); - ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - ymm2 = _mm256_loadu_pd(tC + 8); - ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); - - double *ttC = tC + ldc; + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - // multiply C by beta and accumulate. 
- ymm2 = _mm256_loadu_pd(ttC); - ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - ymm2 = _mm256_loadu_pd(ttC + 4); - ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - ymm2 = _mm256_loadu_pd(ttC + 8); - ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + // clear scratch registers. + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); - } - _mm256_storeu_pd(tC + 0, ymm8); - _mm256_storeu_pd(tC + 4, ymm9); - _mm256_storeu_pd(tC + 8, ymm10); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; - tC += ldc; + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); - _mm256_storeu_pd(tC, ymm12); - _mm256_storeu_pd(tC + 4, ymm13); - _mm256_storeu_pd(tC + 8, ymm14); + ymm3 = _mm256_loadu_pd(tA + 4); + ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); - col_idx += 2; - } - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 1) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + ymm3 = _mm256_loadu_pd(tA + 8); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); - // clear scratch registers. - ymm12 = _mm256_setzero_pd(); - ymm13 = _mm256_setzero_pd(); - ymm14 = _mm256_setzero_pd(); + // Masked load the relevant remainder elements only + // using maskVec. 
+ ymm3 = _mm256_maskload_pd(tA + 12, maskVec); + ymm11 = _mm256_fmadd_pd(ymm0, ymm3, ymm11); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - tB += tb_inc_row; + tA += lda; - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - ymm3 = _mm256_loadu_pd(tA + 4); - ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + //multiply A*B by alpha. + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm11 = _mm256_mul_pd(ymm11, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); - ymm3 = _mm256_loadu_pd(tA + 8); - ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + if(is_beta_non_zero) + { + // multiply C by beta and accumulate, col 1. + ymm2 = _mm256_loadu_pd(tC + 0); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 12, maskVec); + ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); + + // multiply C by beta and accumulate, col 2. + double *ttC = tC + ldc; + + ymm2 = _mm256_loadu_pd(ttC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(ttC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(ttC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. 
+ ymm2 = _mm256_maskload_pd(ttC + 12, maskVec); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); + } + + _mm256_storeu_pd(tC + 0, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 12, maskVec, ymm11); - tA += lda; + tC += ldc; + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 12, maskVec, ymm15); + col_idx += 2; } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); - - //multiply A*B by alpha. - ymm12 = _mm256_mul_pd(ymm12, ymm0); - ymm13 = _mm256_mul_pd(ymm13, ymm0); - ymm14 = _mm256_mul_pd(ymm14, ymm0); - - - if(is_beta_non_zero) + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC + 0); - ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - ymm2 = _mm256_loadu_pd(tC + 4); - ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - ymm2 = _mm256_loadu_pd(tC + 8); - ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - } - _mm256_storeu_pd(tC + 0, ymm12); - _mm256_storeu_pd(tC + 4, ymm13); - _mm256_storeu_pd(tC + 8, ymm14); - } + // clear scratch registers. + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); - row_idx += 12; - } + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; - if (m_remainder >= 8) - { - m_remainder -= 8; + //broadcasted matrix B elements are multiplied + //with matrix A columns. 
+ ymm3 = _mm256_loadu_pd(tA); + ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); - for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + ymm3 = _mm256_loadu_pd(tA + 4); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); - // clear scratch registers. - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm8 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); + ymm3 = _mm256_loadu_pd(tA + 8); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); - tB += tb_inc_row; + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 12, maskVec); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); - ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); - ymm8 = _mm256_fmadd_pd(ymm2, ymm3, ymm8); + tA += lda; - ymm3 = _mm256_loadu_pd(tA + 4); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); - ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - tA += lda; - } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + //multiply A*B by alpha. + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); - //multiply A*B by alpha. 
- ymm4 = _mm256_mul_pd(ymm4, ymm0); - ymm5 = _mm256_mul_pd(ymm5, ymm0); - ymm6 = _mm256_mul_pd(ymm6, ymm0); - ymm7 = _mm256_mul_pd(ymm7, ymm0); - ymm8 = _mm256_mul_pd(ymm8, ymm0); - ymm9 = _mm256_mul_pd(ymm9, ymm0); + if(is_beta_non_zero) + { + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(tC + 0); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 12, maskVec); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); + } + + _mm256_storeu_pd(tC + 0, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 12, maskVec, ymm15); + } + } + else if(m_remainder > 8) + { + // Handles edge cases where remainder elements are between 9-12(9, 10, 11, 12). + // Here m_rem gives index in mask_ptr that points which mask to be used based + // on remainder elements which could be 1, 2, 3 or 4 here. + m_rem = (m_remainder % 8); + __m256i maskVec = _mm256_loadu_si256( (__m256i *)mask_ptr[m_rem]); - if(is_beta_non_zero) + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - ymm2 = _mm256_loadu_pd(tC + 4); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - - double* ttC = tC + ldc; - - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(ttC); - ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); - ymm2 = _mm256_loadu_pd(ttC + 4); - ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - ttC += ldc; + // clear scratch registers. 
+ ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(ttC); - ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); - ymm2 = _mm256_loadu_pd(ttC + 4); - ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - } + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; - _mm256_storeu_pd(tC, ymm4); - _mm256_storeu_pd(tC + 4, ymm5); + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); - tC += ldc; - _mm256_storeu_pd(tC, ymm6); - _mm256_storeu_pd(tC + 4, ymm7); + ymm3 = _mm256_loadu_pd(tA + 4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); - tC += ldc; - _mm256_storeu_pd(tC, ymm8); - _mm256_storeu_pd(tC + 4, ymm9); + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 8, maskVec); + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); + ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); - } - n_remainder = N - col_idx; - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 2) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + tA += lda; + } + // alpha, beta multiplication. 
+ ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); - // clear scratch registers. - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); + if(is_beta_non_zero) + { + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 8, maskVec); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); + + // multiply C by beta and accumulate. + double *ttC = tC +ldc; + ymm2 = _mm256_loadu_pd(ttC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + + ymm2 = _mm256_loadu_pd(ttC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC + 8, maskVec); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); + + // multiply C by beta and accumulate. + ttC += ldc; + ymm2 = _mm256_loadu_pd(ttC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + + ymm2 = _mm256_loadu_pd(ttC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. 
+ ymm2 = _mm256_maskload_pd(ttC + 8, maskVec); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + + } + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 8, maskVec, ymm6); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - tB += tb_inc_row; + tC += ldc; - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); - ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); + _mm256_storeu_pd(tC, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 8, maskVec, ymm10); - ymm3 = _mm256_loadu_pd(tA + 4); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + tC += ldc; - tA += lda; + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 8, maskVec, ymm14); } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); - - //multiply A*B by alpha. - ymm4 = _mm256_mul_pd(ymm4, ymm0); - ymm5 = _mm256_mul_pd(ymm5, ymm0); - ymm6 = _mm256_mul_pd(ymm6, ymm0); - ymm7 = _mm256_mul_pd(ymm7, ymm0); - - if(is_beta_non_zero) + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - ymm2 = _mm256_loadu_pd(tC + 4); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - - double* ttC = tC + ldc; - - // multiply C by beta and accumulate. 
- ymm2 = _mm256_loadu_pd(ttC); - ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); - ymm2 = _mm256_loadu_pd(ttC + 4); - ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); - } - _mm256_storeu_pd(tC, ymm4); - _mm256_storeu_pd(tC + 4, ymm5); - - tC += ldc; - _mm256_storeu_pd(tC, ymm6); - _mm256_storeu_pd(tC + 4, ymm7); + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - col_idx += 2; + // clear scratch registers. + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); - } - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 1) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - tB += tb_inc_row; + ymm3 = _mm256_loadu_pd(tA + 4); + ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + // Masked load the relevant remainder elements only + // using maskVec. 
+ ymm3 = _mm256_maskload_pd(tA + 8, maskVec); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); - ymm3 = _mm256_loadu_pd(tA + 4); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + tA += lda; - tA += lda; - } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - ymm4 = _mm256_mul_pd(ymm4, ymm0); - ymm5 = _mm256_mul_pd(ymm5, ymm0); + //multiply A*B by alpha. + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); - if(is_beta_non_zero) - { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - ymm2 = _mm256_loadu_pd(tC + 4); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - } - _mm256_storeu_pd(tC, ymm4); - _mm256_storeu_pd(tC + 4, ymm5); - } + if(is_beta_non_zero) + { + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(tC + 0); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); - row_idx += 8; - } + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 8, maskVec); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); - if (m_remainder >= 4) - { - m_remainder -= 4; + double *ttC = tC + ldc; - for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(ttC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - // clear scratch registers. 
- ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - ymm6 = _mm256_setzero_pd(); + ymm2 = _mm256_loadu_pd(ttC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC + 8, maskVec); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); - tB += tb_inc_row; + } + _mm256_storeu_pd(tC + 0, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 8, maskVec, ymm10); - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); - ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); - ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); + tC += ldc; - tA += lda; + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 8, maskVec, ymm14); + + col_idx += 2; } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - //multiply A*B by alpha. - ymm4 = _mm256_mul_pd(ymm4, ymm0); - ymm5 = _mm256_mul_pd(ymm5, ymm0); - ymm6 = _mm256_mul_pd(ymm6, ymm0); + // clear scratch registers. + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); - if(is_beta_non_zero) - { - // multiply C by beta and accumulate. 
- ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; - double* ttC = tC + ldc; + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(ttC); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + ymm3 = _mm256_loadu_pd(tA + 4); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); - ttC += ldc; + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 8, maskVec); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(ttC); - ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); - } - _mm256_storeu_pd(tC, ymm4); + tA += lda; - tC += ldc; - _mm256_storeu_pd(tC, ymm5); + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - tC += ldc; - _mm256_storeu_pd(tC, ymm6); - } - n_remainder = N - col_idx; - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 2) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + //multiply A*B by alpha. + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); - ymm4 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); - for (k = 0; k < K; ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - tB += tb_inc_row; + if(is_beta_non_zero) + { + // multiply C by beta and accumulate. 
+ ymm2 = _mm256_loadu_pd(tC + 0); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); - ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 8, maskVec); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); - tA += lda; + } + _mm256_storeu_pd(tC + 0, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 8, maskVec, ymm14); } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + } + else if(m_remainder > 4) + { + // Handles edge cases where remainder elements are between 5-8(5, 6, 7, 8). + // Here m_rem gives index in mask_ptr that points which mask to be used based + // on remainder elements which could be 1, 2, 3 or 4 here. + m_rem = (m_remainder % 4); + __m256i maskVec = _mm256_loadu_si256( (__m256i *)mask_ptr[m_rem]); + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - //multiply A*B by alpha. - ymm4 = _mm256_mul_pd(ymm4, ymm0); - ymm5 = _mm256_mul_pd(ymm5, ymm0); + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); - if(is_beta_non_zero) - { - // multiply C by beta and accumulate. 
- ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; - double* ttC = tC + ldc; + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(ttC); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - } - _mm256_storeu_pd(tC, ymm4); + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 4, maskVec); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); - tC += ldc; - _mm256_storeu_pd(tC, ymm5); + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); - col_idx += 2; + if(is_beta_non_zero) + { + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 4, maskVec); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + + // multiply C by beta and accumulate. 
+ double *ttC = tC +ldc; + ymm2 = _mm256_loadu_pd(ttC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC + 4, maskVec); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + + // multiply C by beta and accumulate. + ttC += ldc; + ymm2 = _mm256_loadu_pd(ttC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC + 4, maskVec); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + } + _mm256_storeu_pd(tC, ymm4); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 4, maskVec, ymm5); - } - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 1) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + tC += ldc; - ymm4 = _mm256_setzero_pd(); + _mm256_storeu_pd(tC, ymm8); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 4, maskVec, ymm9); - for (k = 0; k < K; ++k) + tC += ldc; + + _mm256_storeu_pd(tC, ymm12); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 4, maskVec, ymm13); + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - tB += tb_inc_row; + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + // clear scratch registers. 
+ ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); - tA += lda; - } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; - ymm4 = _mm256_mul_pd(ymm4, ymm0); + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); + + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 4, maskVec); + ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - if(is_beta_non_zero) - { - // multiply C by beta and accumulate. - ymm2 = _mm256_loadu_pd(tC); - ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + //multiply A*B by alpha. + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); - } - _mm256_storeu_pd(tC, ymm4); - } + if(is_beta_non_zero) + { + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(tC + 0); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 4, maskVec); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - row_idx += 4; - } - // M is not a multiple of 32. - // The handling of edge case where the remainder - // dimension is less than 8. 
The padding takes place - // to handle this case. - if ((m_remainder) && (lda > 3)) - { - double f_temp[8] = {0.0}; + double *ttC = tC + ldc; - for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(ttC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC + 4, maskVec); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - // clear scratch registers. - ymm5 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - ymm9 = _mm256_setzero_pd(); + } + _mm256_storeu_pd(tC + 0, ymm8); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 4, maskVec, ymm9); - for (k = 0; k < (K - 1); ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); - tB += tb_inc_row; + tC += ldc; - //broadcasted matrix B elements are multiplied - //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); - ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); + _mm256_storeu_pd(tC, ymm12); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 4, maskVec, ymm13); - tA += lda; + col_idx += 2; } - // alpha, beta multiplication. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); - tB += tb_inc_row; - - for (int i = 0; i < m_remainder; i++) + // if the N is not multiple of 3. + // handling edge case. 
+ if (n_remainder == 1) { - f_temp[i] = tA[i]; - } - ymm3 = _mm256_loadu_pd(f_temp); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); - ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); - - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - //multiply A*B by alpha. - ymm5 = _mm256_mul_pd(ymm5, ymm0); - ymm7 = _mm256_mul_pd(ymm7, ymm0); - ymm9 = _mm256_mul_pd(ymm9, ymm0); + // clear scratch registers. + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); - if(is_beta_non_zero) - { - for (int i = 0; i < m_remainder; i++) + for (k = 0; k < K; ++k) { - f_temp[i] = tC[i]; - } - ymm2 = _mm256_loadu_pd(f_temp); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); - double* ttC = tC + ldc; + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA + 4, maskVec); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); - for (int i = 0; i < m_remainder; i++) - { - f_temp[i] = ttC[i]; - } - ymm2 = _mm256_loadu_pd(f_temp); - ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); + tA += lda; - ttC += ldc; - for (int i = 0; i < m_remainder; i++) - { - f_temp[i] = ttC[i]; - } - ymm2 = _mm256_loadu_pd(f_temp); - ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - } - _mm256_storeu_pd(f_temp, ymm5); - for (int i = 0; i < m_remainder; i++) - { - tC[i] = f_temp[i]; } + // alpha, beta multiplication. 
+ ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - tC += ldc; - _mm256_storeu_pd(f_temp, ymm7); - for (int i = 0; i < m_remainder; i++) - { - tC[i] = f_temp[i]; - } + //multiply A*B by alpha. + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); - tC += ldc; - _mm256_storeu_pd(f_temp, ymm9); - for (int i = 0; i < m_remainder; i++) + if(is_beta_non_zero) { - tC[i] = f_temp[i]; - } + // multiply C by beta and accumulate. + ymm2 = _mm256_loadu_pd(tC + 0); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC + 4, maskVec); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + } + _mm256_storeu_pd(tC + 0, ymm12); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC + 4, maskVec, ymm13); + } } - n_remainder = N - col_idx; - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 2) + else { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; - - ymm5 = _mm256_setzero_pd(); - ymm7 = _mm256_setzero_pd(); - - for (k = 0; k < (K - 1); ++k) + __m256i maskVec = _mm256_loadu_si256( (__m256i *)mask_ptr[m_remainder]); + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - tB += tb_inc_row; + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - ymm3 = _mm256_loadu_pd(tA); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + // clear scratch registers. 
+ ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); - tA += lda; - } + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); - tB += tb_inc_row; + //broadcasted matrix B elements are multiplied + //with matrix A columns. - for (int i = 0; i < m_remainder; i++) - { - f_temp[i] = tA[i]; - } - ymm3 = _mm256_loadu_pd(f_temp); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); - ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA, maskVec); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - ymm5 = _mm256_mul_pd(ymm5, ymm0); - ymm7 = _mm256_mul_pd(ymm7, ymm0); + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); - if(is_beta_non_zero) - { - for (int i = 0; i < m_remainder; i++) + if(is_beta_non_zero) { - f_temp[i] = tC[i]; - } - ymm2 = _mm256_loadu_pd(f_temp); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. 
+ ymm2 = _mm256_maskload_pd(tC, maskVec); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - double* ttC = tC + ldc; + double* ttC = tC + ldc; - for (int i = 0; i < m_remainder; i++) - { - f_temp[i] = ttC[i]; + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC, maskVec); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + + ttC += ldc; + + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(ttC, maskVec); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); } - ymm2 = _mm256_loadu_pd(f_temp); - ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC, maskVec, ymm4); + + tC += ldc; + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC, maskVec, ymm5); + tC += ldc; + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC, maskVec, ymm6); } - _mm256_storeu_pd(f_temp, ymm5); - for (int i = 0; i < m_remainder; i++) + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) { - tC[i] = f_temp[i]; - } + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; - tC += ldc; - _mm256_storeu_pd(f_temp, ymm7); - for (int i = 0; i < m_remainder; i++) - { - tC[i] = f_temp[i]; - } - } - // if the N is not multiple of 3. - // handling edge case. - if (n_remainder == 1) - { - //pointer math to point to proper memory - tC = C + ldc * col_idx + row_idx; - tB = B + tb_inc_col * col_idx; - tA = A + row_idx; + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); - ymm5 = _mm256_setzero_pd(); + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. 
+ ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; - for (k = 0; k < (K - 1); ++k) - { - // The inner loop broadcasts the B matrix data and - // multiplies it with the A matrix. - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - tB += tb_inc_row; + //broadcasted matrix B elements are multiplied + //with matrix A columns. - ymm3 = _mm256_loadu_pd(tA); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA, maskVec); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); - tA += lda; - } + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); - ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); - tB += tb_inc_row; + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); - for (int i = 0; i < m_remainder; i++) - { - f_temp[i] = tA[i]; - } - ymm3 = _mm256_loadu_pd(f_temp); - ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + if(is_beta_non_zero) + { + // Masked load the relevant remaider elements of C matrix + // Scale by beta. + ymm2 = _mm256_maskload_pd(tC, maskVec); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - ymm0 = _mm256_broadcast_sd(alpha_cast); - ymm1 = _mm256_broadcast_sd(beta_cast); + double* ttC = tC + ldc; - // multiply C by beta and accumulate. - ymm5 = _mm256_mul_pd(ymm5, ymm0); + // Masked load the relevant remaider elements of C matrix + // Scale by beta. 
+ ymm2 = _mm256_maskload_pd(ttC, maskVec); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + } + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC, maskVec, ymm4); - if(is_beta_non_zero) - { + tC += ldc; + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC, maskVec, ymm5); - for (int i = 0; i < m_remainder; i++) - { - f_temp[i] = tC[i]; - } - ymm2 = _mm256_loadu_pd(f_temp); - ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - } - _mm256_storeu_pd(f_temp, ymm5); - for (int i = 0; i < m_remainder; i++) - { - tC[i] = f_temp[i]; - } - } - m_remainder = 0; - } + col_idx += 2; - if (m_remainder) - { - double result; - for (; row_idx < M; row_idx += 1) - { - for (col_idx = 0; col_idx < N; col_idx += 1) + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; - result = 0; + ymm4 = _mm256_setzero_pd(); + for (k = 0; k < K; ++k) { - result += (*tA) * (*tB); - tA += lda; + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + + // Masked load the relevant remainder elements only + // using maskVec. + ymm3 = _mm256_maskload_pd(tA, maskVec); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tA += lda; } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + ymm1 = _mm256_broadcast_sd(beta_cast); + + ymm4 = _mm256_mul_pd(ymm4, ymm0); - result *= (*alpha_cast); if(is_beta_non_zero) - (*tC) = (*tC) * (*beta_cast) + result; - else - (*tC) = result; + { + // Masked load the relevant remaider elements of C matrix + // Scale by beta. 
+ ymm2 = _mm256_maskload_pd(tC, maskVec); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + + } + // Masked store the relevant remaider elements of C matrix + _mm256_maskstore_pd(tC, maskVec, ymm4); } } } - // Return the buffer to pool + // Return the buffer to pool if ((required_packing_A == 1) && bli_mem_is_alloc( &local_mem_buf_A_s )) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_dgemm_small(): releasing mem pool block\n" ); From 3be43d264fdc03729d6dc001cb5a1dc8c98c11e3 Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Mon, 22 May 2023 15:56:00 +0530 Subject: [PATCH 123/226] Optimized xGEMV for non-unit stride Y vector - In variant 2 of GEMV, A matrix is in column major. Y vector has to be of unit stride if the operation is to be vectorized. - In cases when Y vector is non-unit stride, vectorization of the GEMV operation inside the kernel has been ensured by packing the input Y vector to a temporary buffer with unit stride. As part of the packing Y is scaled by beta to reduce the number of times Y vector is to be loaded. - After performing the GEMV operation, the results in the temporary buffer are copied to the original buffer and the temporary one is released. - In DGEMV var 2, moved problem decomposition for Zen architecture to the AXPYF kernel. - Removed flag check based kernel dispatch logic from DGEMV. Now, kernels will be picked from the context for non-avx machines. For avx machines, the kernel(s) to be dispatched is(are) assigned to the function pointer in the unf_var layer. 
AMD-Internal: [CPUPL-3485] Change-Id: I7b2efb00a9fa9abca65abca07ee80f38229bf654 --- frame/2/gemv/bli_gemv_unf_var2.c | 174 ++++++++-- frame/2/gemv/bli_gemv_unf_var2_amd.c | 496 ++++++++++++++++----------- kernels/zen/1f/bli_axpyf_zen_int_5.c | 19 +- kernels/zen/1f/bli_axpyf_zen_int_8.c | 90 ++++- kernels/zen/bli_kernels_zen.h | 1 + 5 files changed, 519 insertions(+), 261 deletions(-) diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index d6c21de6df..3001f7efaa 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -78,33 +78,121 @@ void PASTEMAC(ch,varname) \ \ conja = bli_extract_conj( transa ); \ \ - /* If beta is zero, use setv. Otherwise, scale by beta. */ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - /* y = 0; */ \ - PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - zero, \ - y, incy, \ - cntx, \ - NULL \ - ); \ - } \ - else \ - { \ - /* y = beta * y; */ \ - PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - beta, \ - y, incy, \ - cntx, \ - NULL \ - ); \ - } \ + /* + Memory pool declarations for packing vector Y. + */\ + mem_t mem_bufY;\ + rntm_t rntm;\ + ctype* y_buf = y;\ + inc_t buf_incy = incy;\ +\ + /* + Boolean to check if the y has been packed + and memory needs to be freed in the end + */\ + bool is_y_temp_buf_created = FALSE;\ +\ + /* + If alpha is equal to zero, y = beta * y + alpha * A * x + becomes y = beat * y in that case packing will be costly. + y is only scaled with SCALV and returned. 
+ */\ + if (incy > 1 && (!PASTEMAC(ch,eq0)( *alpha )))\ + {\ + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . + */\ + mem_bufY.pblk.buf = NULL;\ + mem_bufY.pblk.block_size = 0;\ + mem_bufY.buf_type = 0;\ + mem_bufY.size = 0;\ + mem_bufY.pool = NULL;\ +\ + /* + In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm + */\ +\ + bli_rntm_init_from_global(&rntm);\ + bli_rntm_set_num_threads_only(1, &rntm);\ + bli_membrk_rntm_set_membrk(&rntm);\ +\ + /* + Calculate the size required for n_elem double elements in vector Y. + */\ + size_t buffer_size = n_elem * sizeof(ctype);\ +\ + /* + Acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufY. + */\ + bli_membrk_acquire_m(&rntm,\ + buffer_size,\ + BLIS_BUFFER_FOR_B_PANEL,\ + &mem_bufY);\ +\ + /* + Continue packing Y if buffer memory is allocated + */\ + if ((bli_mem_is_alloc(&mem_bufY)))\ + {\ + y_buf = bli_mem_buffer(&mem_bufY);\ + buf_incy = 1;\ + PASTECH(ch,scal2v_ker_ft) scal2v_kr_ptr;\ + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx );\ +\ + /* + Invoke the SCAL2V function using the function pointer + */\ + scal2v_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + beta,\ + y, incy,\ + y_buf, buf_incy,\ + cntx\ + );\ +\ + /* + Set y is packed as the memory allocation was + successful and contents have been copied + */\ + is_y_temp_buf_created = TRUE;\ + }\ + }\ + else\ + {\ + /* + Invoke the SCALV function using the function pointer + */\ + PASTECH(ch,scalv_ker_ft) scalv_kr_ptr;\ + scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_SCALV_KER, cntx);\ +\ + scalv_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + beta,\ + y_buf, buf_incy,\ + cntx\ + );\ + }\ +\ + /* + If alpha is 
zero(0), we only need to scalv y and return + */\ + if (PASTEMAC(ch,eq0)( *alpha ))\ + {\ + /* + Return early for alpha is zero(0) + */\ + return;\ + }\ \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ @@ -118,7 +206,7 @@ void PASTEMAC(ch,varname) \ \ A1 = a + (0 )*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ - y1 = y + (0 )*incy; \ + y1 = y_buf + (0 )* buf_incy; \ \ /* y = y + alpha * A1 * x1; */ \ kfp_af \ @@ -130,10 +218,36 @@ void PASTEMAC(ch,varname) \ alpha, \ A1, rs_at, cs_at, \ x1, incx, \ - y1, incy, \ + y1, buf_incy, \ cntx \ ); \ } \ +\ + /* + Check if temp y buffer was used for compute + */\ + if (is_y_temp_buf_created)\ + {\ + /* + Store the result from unit strided y_buf to non-unit strided Y + Invoke the COPYV function using the function pointer + */\ + PASTECH(ch,copyv_ker_ft) copyv_kr_ptr;\ + copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_COPYV_KER, cntx);\ +\ + copyv_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + y_buf, buf_incy,\ + y, incy,\ + cntx\ + );\ +\ + /* Return the buffer to pool */\ + bli_membrk_release(&rntm, &mem_bufY);\ + }\ +\ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \ } diff --git a/frame/2/gemv/bli_gemv_unf_var2_amd.c b/frame/2/gemv/bli_gemv_unf_var2_amd.c index c4317dd4d1..6ed47cb0ac 100644 --- a/frame/2/gemv/bli_gemv_unf_var2_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c @@ -78,33 +78,121 @@ void PASTEMAC(ch,varname) \ \ conja = bli_extract_conj( transa ); \ \ - /* If beta is zero, use setv. Otherwise, scale by beta. */ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - /* y = 0; */ \ - PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - zero, \ - y, incy, \ - cntx, \ - NULL \ - ); \ - } \ - else \ - { \ - /* y = beta * y; */ \ - PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - beta, \ - y, incy, \ - cntx, \ - NULL \ - ); \ - } \ + /* + Memory pool declarations for packing vector Y. 
+ */\ + mem_t mem_bufY;\ + rntm_t rntm;\ + ctype* y_buf = y;\ + inc_t buf_incy = incy;\ +\ + /* + Boolean to check if the y has been packed + and memory needs to be freed in the end + */\ + bool is_y_temp_buf_created = FALSE;\ +\ + /* + If alpha is equal to zero, y is only scaled by beta and returned. + In this case, packing and unpacking y will be costly and it is + avoided. + */\ + if (incy > 1 && (!PASTEMAC(ch,eq0)( *alpha )))\ + {\ + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . + */\ + mem_bufY.pblk.buf = NULL;\ + mem_bufY.pblk.block_size = 0;\ + mem_bufY.buf_type = 0;\ + mem_bufY.size = 0;\ + mem_bufY.pool = NULL;\ +\ + /* + In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm + */\ +\ + bli_rntm_init_from_global(&rntm);\ + bli_rntm_set_num_threads_only(1, &rntm);\ + bli_membrk_rntm_set_membrk(&rntm);\ +\ + /* + Calculate the size required for n_elem double elements in vector Y. + */\ + size_t buffer_size = n_elem * sizeof(ctype);\ +\ + /* + Acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufY. 
+ */\ + bli_membrk_acquire_m(&rntm,\ + buffer_size,\ + BLIS_BUFFER_FOR_B_PANEL,\ + &mem_bufY);\ +\ + /* + Continue packing Y if buffer memory is allocated + */\ + if ((bli_mem_is_alloc(&mem_bufY)))\ + {\ + y_buf = bli_mem_buffer(&mem_bufY);\ + buf_incy = 1;\ + PASTECH(ch,scal2v_ker_ft) scal2v_kr_ptr;\ + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx );\ +\ + /* + Invoke the SCAL2V function using the function pointer + */\ + scal2v_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + beta,\ + y, incy,\ + y_buf, buf_incy,\ + cntx\ + );\ +\ + /* + Set y is packed as the memory allocation was + successful and contents have been copied + */\ + is_y_temp_buf_created = TRUE;\ + }\ + }\ + else\ + {\ + /* + Invoke the SCALV function using the function pointer + */\ + PASTECH(ch,scalv_ker_ft) scalv_kr_ptr;\ + scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_SCALV_KER, cntx);\ +\ + scalv_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + beta,\ + y_buf, buf_incy,\ + cntx\ + );\ + }\ +\ + /* + If alpha is zero(0), we only need to scalv y and return + */\ + if (PASTEMAC(ch,eq0)( *alpha ))\ + {\ + /* + Return early for alpha is zero(0) + */\ + return;\ + }\ \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ @@ -118,7 +206,7 @@ void PASTEMAC(ch,varname) \ \ A1 = a + (0 )*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ - y1 = y + (0 )*incy; \ + y1 = y_buf + (0 )* buf_incy; \ \ /* y = y + alpha * A1 * x1; */ \ kfp_af \ @@ -130,10 +218,36 @@ void PASTEMAC(ch,varname) \ alpha, \ A1, rs_at, cs_at, \ x1, incx, \ - y1, incy, \ + y1, buf_incy, \ cntx \ ); \ } \ +\ + /* + Check if temp y buffer was used for compute + */\ + if (is_y_temp_buf_created)\ + {\ + /* + Store the result from unit strided y_buf to non-unit strided Y + Invoke the COPYV function using the function pointer + */\ + PASTECH(ch,copyv_ker_ft) copyv_kr_ptr;\ + copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_COPYV_KER, cntx);\ +\ + copyv_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + y_buf, buf_incy,\ + y, incy,\ + cntx\ + );\ 
+\ + /* Return the buffer to pool */\ + bli_membrk_release(&rntm, &mem_bufY);\ + }\ +\ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \ } @@ -156,20 +270,16 @@ void bli_dgemv_unf_var2 double* A1; double* x1; dim_t i; - dim_t f; + dim_t f, b_fuse; dim_t n_elem, n_iter; inc_t rs_at, cs_at; conj_t conja; - //memory pool declarations for packing vector Y. + + // Memory pool declarations for packing vector Y. mem_t mem_bufY; rntm_t rntm; - double *y_buf = y; - inc_t buf_incy = incy; - - // For AMD these APIS are invoked skipping intermediate framework layers - // Hence we need to ensure that cntx is set here. - bli_init_once(); - if(cntx == NULL) cntx = bli_gks_query_cntx(); + double* y_temp = y; + inc_t temp_incy = incy; bli_set_dims_incs_with_trans( transa, m, n, rs_a, cs_a, @@ -177,94 +287,88 @@ void bli_dgemv_unf_var2 conja = bli_extract_conj( transa ); - // This function is invoked on all architectures including 'generic'. - // Non-AVX2+FMA3 platforms will use the kernels derived from the context. - if (bli_cpuid_is_avx2fma3_supported() == FALSE) + /* + Fatbinary config amdzen when run on non-AMD X86 will query for + the support of AVX512 or AVX2, if AVX512 - arch_id will be zen4 + or for AVX2 it will be zen3. + */ + arch_t id = bli_arch_query_id(); + + /* + Function pointer declaration for the functions + that will be used by this API + */ + daxpyf_ker_ft axpyf_kr_ptr; // DAXPYF + dscal2v_ker_ft scal2v_kr_ptr; // DSCAL2V + dscalv_ker_ft scalv_kr_ptr; // DSCALV + dcopyv_ker_ft copyv_kr_ptr; // DCOPYV + + /* + Boolean to check if the y has been packed + and memory needs to be freed in the end + */ + bool is_y_temp_buf_created = FALSE; + + switch (id) { - const num_t dt = PASTEMAC(d,type); - double* x1; - double* y1; - /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ - if ( PASTEMAC(d,eq0)( *beta ) ) - { - double* zero = PASTEMAC(d,0); - /* y = 0; */ - PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - } + case BLIS_ARCH_ZEN4: +#if defined(BLIS_KERNELS_ZEN4) + /* + Assign the AVX512 based kernel function pointers for + AXPYF, SCALV, COPYV and corresponding fusing + factor of DAXPYF kernel + */ - PASTECH(d,axpyf_ker_ft) kfp_af; + axpyf_kr_ptr = bli_daxpyf_zen_int_8; + b_fuse = 8; - /* Query the context for the kernel function pointer and fusing factor. */ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); - dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); + scalv_kr_ptr = bli_dscalv_zen_int_avx512; - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + copyv_kr_ptr = bli_dcopyv_zen_int; - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; + break; +#endif + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: - /* y = y + alpha * A1 * x1; */ - kfp_af - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } + /* + Assign the AVX2 based kernel function pointers for + AXPYF, SCALV, COPYV and corresponding fusing + factor of DAXPYF kernel + */ - /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ - /* y = beta * y; */ - /* beta=0 case is handled by scalv internally */ + axpyf_kr_ptr = bli_daxpyf_zen_int_8; + b_fuse = 8; - bli_dscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx - ); + scalv_kr_ptr = bli_dscalv_zen_int10; - if( bli_deq0( *alpha ) ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; + copyv_kr_ptr = bli_dcopyv_zen_int; + + break; + default: + // For non-Zen architectures, query the context if it is NULL + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + /* + Query the context for the kernel function pointers for + AXPYF, SCALV, COPYV and corresponding fusing + factor of AXPYF kernel + */ + axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx); + b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx); + + scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx); + + copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx); } - if (incy > 1) + /* + If alpha is equal to zero, y is only scaled by beta and returned. + In this case, packing and unpacking y will be costly and it is + avoided. 
+ */ + if ( (incy > 1) && (!bli_deq0( *alpha ))) { /* Initialize mem pool buffer to NULL and size to 0 @@ -301,110 +405,90 @@ void bli_dgemv_unf_var2 /*Continue packing Y if buffer memory is allocated*/ if ((bli_mem_is_alloc( &mem_bufY ))) { - y_buf = bli_mem_buffer(&mem_bufY); - - //pack Y vector with non-unit stride to a temp buffer y_buf with unit stride - for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) - { - *(y_buf + y_index) = *(y + (y_index * incy)) ; - } - // stride of vector y_buf =1 - buf_incy = 1; - } - } - - dim_t fuse_factor = 8; - dim_t f_temp = 0; + y_temp = bli_mem_buffer(&mem_bufY); - // Change the fuse factor based on - // Input size and available kernels - // This ensures that fusing is possible when the number of - // left over colums is less (better problem decomposition) - if (n < 5) fuse_factor = 4; - else if (n < 8) fuse_factor = 5; + // Stride of vector y_temp + temp_incy = 1; - for (i = 0; i < n_iter; i += f) - { - f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor); + // Query the context if it is NULL. 
This will be necessary for Zen architectures + if(cntx == NULL) cntx = bli_gks_query_cntx(); - A1 = a + (i)*cs_at; - x1 = x + (i)*incx; + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCAL2V_KER, cntx); - // Pick kernel based on problem size - switch (f) - { - case 8: + // Invoke the SCAL2V function using the function pointer + scal2v_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + y_temp, temp_incy, + cntx + ); - bli_daxpyf_zen_int_8( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y_buf, buf_incy, - cntx); + /* + Set y is packed as the memory allocation was successful + and contents have been scaled and copied to a temp buffer + */ + is_y_temp_buf_created = TRUE; + } + } + else + { + // Invoke the DSCALV function using the function pointer + scalv_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y_temp, temp_incy, + cntx + ); + } - break; - default: + if( bli_deq0( *alpha ) ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) + return; + } - if (f < 5) - { - bli_daxpyf_zen_int_16x4( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y_buf, buf_incy, - cntx); - } - else - { - bli_daxpyf_zen_int_5( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y_buf, buf_incy, - cntx); - } - } + for (i = 0; i < n_iter; i += f) + { + f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse); - // Calculate the next problem size - f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor); + A1 = a + (i * cs_at); + x1 = x + (i * incx); - // Change fuse factor based on the next problem size - if (f_temp < fuse_factor) - { - if (f_temp < 5) - { - fuse_factor = 4; - } - else - { - fuse_factor = 5; - } - } + axpyf_kr_ptr + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y_temp, temp_incy, + cntx + ); } - if ((incy > 1) && bli_mem_is_alloc( &mem_bufY )) + if (is_y_temp_buf_created) { - //store the result from unit strided y_buf to non-unit 
strided Y - for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) - { - *(y + (y_index * incy)) = *(y_buf + y_index) ; - } + // Store the result from unit strided y_buf to non-unit strided Y + // Invoke the COPYV function using the function pointer + copyv_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + y_temp, temp_incy, + y, incy, + cntx + ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" ); - #endif +#ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" ); +#endif // Return the buffer to pool bli_membrk_release(&rntm , &mem_bufY); } diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c index 8fea5f6498..6b23fe6c45 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -667,7 +667,7 @@ void bli_daxpyf_zen_int_5 // ----------------------------------------------------------------------------- -static void bli_daxpyf_zen_int_16x2 +void bli_daxpyf_zen_int_16x2 ( conj_t conja, conj_t conjx, @@ -1003,21 +1003,6 @@ void bli_daxpyf_zen_int_16x4 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - if (b_n & 2) - { - bli_daxpyf_zen_int_16x2( conja, - conjx, - m, 2, - alpha, a, inca, lda, - x, incx, - y, incy, - cntx - ); - b_n -= 2; - a += 2*lda; - x += 2 * incx; - } - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c index 27dafb28fc..f2fb671361 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_8.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2018, The University of Texas at Austin - Copyright (C) 2016 - 2022, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2023, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -296,17 +296,91 @@ void bli_daxpyf_zen_int_8 // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return; - // If b_n is not equal to the fusing factor, then perform the entire - // operation as a loop over axpyv. + /* + If b_n is not equal to the fusing factor, then perform the entire + operation as axpyv or perform the operation using axpyf kernels with + lower fuse factor. 
+ */ if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + if (b_n >= 5) + { + dim_t fuse_fac = 5; - for ( i = 0; i < b_n; ++i ) + bli_daxpyf_zen_int_5 + ( + conja, + conjx, + m, + fuse_fac, + alpha, + a, inca, lda, + x, incx, + y, incy, + cntx + ); + + a = a + (fuse_fac * lda); + x = x + (fuse_fac * incx); + + b_n -= fuse_fac; + } + + if (b_n == 4) { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; + dim_t fuse_fac = 4; + + bli_daxpyf_zen_int_16x4 + ( + conja, + conjx, + m, + fuse_fac, + alpha, + a, inca, lda, + x, incx, + y, incy, + cntx + ); + + a = a + (fuse_fac * lda); + x = x + (fuse_fac * incx); + + b_n -= fuse_fac; + } + + if (b_n >= 2) + { + dim_t fuse_fac = 2; + + bli_daxpyf_zen_int_16x2 + ( + conja, + conjx, + m, fuse_fac, + alpha, a, inca, lda, + x, incx, + y, incy, + cntx + ); + + a = a + (fuse_fac * lda); + x = x + (fuse_fac * incx); + + b_n -= fuse_fac; + + } + + if (b_n == 1) + { + // Query the context if it is NULL. 
This will be necessary for Zen architectures + if (cntx == NULL) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_AXPYV_KER, cntx); + + double* a1 = a; + double* chi1 = x; + double* y1 = y; double alpha_chi1; PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 ); diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index b5af321a96..8c338006ad 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -110,6 +110,7 @@ SETV_KER_PROT(double, d, setv_zen_int) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) +AXPYF_KER_PROT( double, d, axpyf_zen_int_16x2 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_6 ) From c445f192d597fda82a869b0c394ae51408a31d79 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 2 Aug 2023 08:40:52 -0400 Subject: [PATCH 124/226] BLIS: Missing clobbers (batch 6) More missing clobbers in skx and zen4 kernels, missed in previous commits. AMD-Internal: [CPUPL-3521] Change-Id: I838240f0539af4bf977a10d20302a40c34710858 --- kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 7 ++++- kernels/skx/3/bli_dgemm_skx_asm_16x14.c | 9 ++++-- kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 15 ++++++---- kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c | 9 ++++-- kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c | 16 +++++++---- kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c | 8 ++++-- kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c | 8 ++++-- .../zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c | 28 +++++++++---------- 8 files changed, 66 insertions(+), 34 deletions(-) diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index 3a20cd8618..5735a5911a 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -539,7 +540,11 @@ void bli_dgemm_skx_asm_16x12_l2( [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "r13", "r14", "r15", "k0", "k1", "k2", "xmm1", "xmm7", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", + "ymm22", "ymm23", "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", + "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c index c0ada1eb66..acf00e35d3 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2022, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -454,7 +454,12 @@ void bli_dgemm_skx_asm_16x14( [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "r13", "r14", "r15", "k0", "k1", "k2", "k3", "k4", "xmm1", + "xmm2", "ymm2", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "ymm16", + "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", + "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", + "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c index 40af496140..572045832d 100644 --- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -564,10 +565,14 @@ void bli_sgemm_skx_asm_32x12_l2( [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", - "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", - "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", - "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", - "zmm30", "zmm31", "memory" + "r13", "r14", "r15", "k0", "k1", "k2", "k3", "k4", "xmm1", "xmm7", + "ymm1", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", + "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", + "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", + "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", + "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", + "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) } diff --git a/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c b/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c index c20c0ab898..cab5ea0ce5 100644 --- a/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c +++ b/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -474,7 +474,12 @@ void bli_dgemm_zen4_asm_32x6( [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "r13", "r14", "r15", "k0", "k1", "k2", "k3", "k4", "xmm1", + "xmm2", "ymm2", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "ymm16", + "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", + "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", + "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", diff --git a/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c b/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c index 1f133dfc15..4a1c416a5d 100644 --- a/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c +++ b/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -703,10 +703,14 @@ void bli_dgemm_zen4_asm_8x24( [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", - "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", - "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", - "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", - "zmm30", "zmm31", "memory" + "r13", "k0", "k1", "k2", "k3", "xmm1", "xmm2", + "ymm2", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", + "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", "ymm26", "ymm27", + "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", + "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", + "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", + "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) } diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c b/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c index 139edc7ddb..001bcd910c 100644 --- a/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c +++ b/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -812,7 +812,11 @@ void bli_dgemmtrsm_l_zen4_asm_8x24 [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "r13", "r14", "r15", "k0", "k1", "k2", "k3", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "ymm16", + "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", + "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", + "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c b/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c index d1ea0109d7..b620410c89 100644 --- a/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c +++ b/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -817,7 +817,11 @@ void bli_dgemmtrsm_u_zen4_asm_8x24 [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "r13", "r14", "r15", "k0", "k1", "k2", "k3", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "ymm16", + "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", + "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", + "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", diff --git a/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c b/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c index a0db7fd504..117a6cb564 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c +++ b/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1707,7 +1707,7 @@ void bli_zgemmsup_cv_zen4_asm_12x3m "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) consider_edge_cases: @@ -2112,7 +2112,7 @@ void bli_zgemmsup_cv_zen4_asm_12x2m "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) consider_edge_cases: @@ -2501,7 +2501,7 @@ void bli_zgemmsup_cv_zen4_asm_12x1m "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) consider_edge_cases:; @@ -3056,7 +3056,7 @@ void bli_zgemmsup_cv_zen4_asm_8x3 "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) } @@ -3301,7 +3301,7 @@ void bli_zgemmsup_cv_zen4_asm_8x2 "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) } @@ -3538,7 +3538,7 @@ void bli_zgemmsup_cv_zen4_asm_8x1 "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) } @@ -3992,7 +3992,7 @@ void bli_zgemmsup_cv_zen4_asm_4x3 "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) } @@ -4216,7 +4216,7 @@ void bli_zgemmsup_cv_zen4_asm_4x2 "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) } @@ -4433,7 +4433,7 @@ void bli_zgemmsup_cv_zen4_asm_4x1 "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "memory" + "k3", "memory" ) } @@ -5151,11 +5151,11 @@ void 
bli_zgemmsup_cv_zen4_asm_2x3 [cs_c] "m" (cs_c) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al", + "xmm9", "xmm10", "xmm11", "xmm12", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", - "ymm12", "ymm13", "ymm14", "ymm15", - "memory" + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5679,10 +5679,10 @@ void bli_zgemmsup_cv_zen4_asm_2x1 [cs_c] "m" (cs_c) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al", + "xmm5", "xmm6", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", - "ymm12", "ymm13", "ymm14", "ymm15", - "memory" + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } From 03fa660792e2c3a3feb45a432a72d99e1f1b53fd Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Mon, 22 May 2023 11:03:28 +0530 Subject: [PATCH 125/226] Optimized xGEMV for non-unit stride X vector - In GEMV variant 1, the input matrix A is in row major. X vector has to be of unit stride if the operation is to be vectorized. - In cases when X vector is non-unit stride, vectorization of the GEMV operation inside the kernel has been ensured by packing the input X vector to a temporary buffer with unit stride. Currently, the packing is done using the SCAL2V. - In case of DGEMV, X vector is scaled by alpha as part of packing. In CGEMV and ZGEMV, alpha is passed as 1 while packing. - The temporary buffer created is released once the GEMV operation is complete. - In DGEMV variant 1, moved problem decomposition for Zen architecture to the DOTXF kernel. - Removed flag check based kernel dispatch logic from DGEMV. Now, kernels will be picked from the context for non-avx machines. For avx machines, the kernel(s) to be dispatched is(are) assigned to the function pointer in the unf_var layer. 
AMD-Internal: [CPUPL-3475] Change-Id: Icd9fd91eccd831f1fcb9fbf0037fcbbc2e34268e --- frame/2/gemv/bli_gemv_unf_var1.c | 96 ++++++- frame/2/gemv/bli_gemv_unf_var1_amd.c | 365 ++++++++++++++++----------- kernels/zen/1f/bli_dotxf_zen_int_8.c | 68 ++++- 3 files changed, 365 insertions(+), 164 deletions(-) diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index 8162613c18..d69416d21f 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 23, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -65,6 +65,17 @@ void PASTEMAC(ch,varname) \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ + \ + /* Memory pool declarations for packing vector X. */\ + mem_t mem_bufX;\ + rntm_t rntm;\ + ctype* x_buf = x;\ + inc_t buf_incx = incx;\ + /* + Boolean to check if x has been packed + and memory needs to be freed in the end + */\ + bool is_x_temp_buf_created = FALSE;\ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ @@ -73,6 +84,75 @@ void PASTEMAC(ch,varname) \ conja = bli_extract_conj( transa ); \ \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ +\ + if( incx > 1 )\ + {\ + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . 
+ */\ + mem_bufX.pblk.buf = NULL;\ + mem_bufX.pblk.block_size = 0;\ + mem_bufX.buf_type = 0;\ + mem_bufX.size = 0;\ + mem_bufX.pool = NULL;\ +\ + /* + In order to get the buffer from pool via rntm access to memory broker + is needed. Following are initializations for rntm + */\ +\ + bli_rntm_init_from_global(&rntm);\ + bli_rntm_set_num_threads_only(1, &rntm);\ + bli_membrk_rntm_set_membrk(&rntm);\ +\ + /* + Calculate the size required for n_elem elements of type ctype in vector X. + */\ + size_t buffer_size = n_elem * sizeof(ctype);\ +\ + /* + Acquire a buffer (n_elem * sizeof(ctype)) from the memory broker + and save the associated mem_t entry to mem_bufX. + */\ + bli_membrk_acquire_m(&rntm, buffer_size, BLIS_BUFFER_FOR_B_PANEL, &mem_bufX);\ +\ + /* + Continue packing X if buffer memory is allocated + */\ + if ((bli_mem_is_alloc(&mem_bufX)))\ + {\ + x_buf = bli_mem_buffer(&mem_bufX);\ + buf_incx = 1;\ + ctype* alpha_passed = PASTEMAC(ch,1);\ +\ + PASTECH(ch,scal2v_ker_ft) scal2v_kr_ptr;\ +\ + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_SCAL2V_KER, cntx);\ +\ + /* + Invoke the SCAL2V kernel using the function pointer + */\ + scal2v_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + alpha_passed,\ + x, incx,\ + x_buf, buf_incx,\ + cntx\ + );\ +\ + /* + Mark x as packed, since the memory allocation was + successful and contents have been copied + */\ + is_x_temp_buf_created = TRUE;\ + }\ + }\ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ @@ -83,7 +163,7 @@ void PASTEMAC(ch,varname) \ f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a + (i )*rs_at + (0 )*cs_at; \ - x1 = x + (0 )*incy; \ + x1 = x_buf + (0 )*buf_incx; \ y1 = y + (i )*incy; \ \ /* y1 = beta * y1 + alpha * A1 * x; */ \ @@ -95,13 +175,23 @@ void PASTEMAC(ch,varname) \ f, \ alpha, \ A1, cs_at, rs_at, \ - x1, incx, \ + x1, buf_incx, \ beta, \ y1, incy, \ cntx \ ); \ \ } \ + /* + Check if temp y buffer was used for compute + */\ + if (is_x_temp_buf_created)\ + {\ + /* + Return the buffer to pool + */\ + bli_membrk_release(&rntm, &mem_bufX);\ + }\ } INSERT_GENTFUNC_BASIC0( gemv_unf_var1 ) diff --git a/frame/2/gemv/bli_gemv_unf_var1_amd.c b/frame/2/gemv/bli_gemv_unf_var1_amd.c index 05f7fbb875..4422b7587a 100644 --- a/frame/2/gemv/bli_gemv_unf_var1_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c @@ -65,6 +65,17 @@ void PASTEMAC(ch,varname) \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ + \ + /* Memory pool declarations for packing vector X. */\ + mem_t mem_bufX;\ + rntm_t rntm;\ + ctype* x_temp = x;\ + inc_t temp_incx = incx;\ + /* + Boolean to check if the y has been packed + and memory needs to be freed in the end + */\ + bool is_x_temp_buf_created = FALSE;\ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ @@ -73,6 +84,75 @@ void PASTEMAC(ch,varname) \ conja = bli_extract_conj( transa ); \ \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ +\ + if( incx > 1 )\ + {\ + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . 
+ */\ + mem_bufX.pblk.buf = NULL;\ + mem_bufX.pblk.block_size = 0;\ + mem_bufX.buf_type = 0;\ + mem_bufX.size = 0;\ + mem_bufX.pool = NULL;\ +\ + /* + In order to get the buffer from pool via rntm access to memory broker + is needed. Following are initializations for rntm + */\ +\ + bli_rntm_init_from_global(&rntm);\ + bli_rntm_set_num_threads_only(1, &rntm);\ + bli_membrk_rntm_set_membrk(&rntm);\ +\ + /* + Calculate the size required for n_elem elements of type ctype in vector X. + */\ + size_t buffer_size = n_elem * sizeof(ctype);\ +\ + /* + Acquire a buffer (n_elem * sizeof(ctype)) from the memory broker + and save the associated mem_t entry to mem_bufX. + */\ + bli_membrk_acquire_m(&rntm, buffer_size, BLIS_BUFFER_FOR_B_PANEL, &mem_bufX);\ +\ + /* + Continue packing X if buffer memory is allocated + */\ + if ((bli_mem_is_alloc(&mem_bufX)))\ + {\ + x_temp = bli_mem_buffer(&mem_bufX);\ + temp_incx = 1;\ + ctype* alpha_passed = PASTEMAC(ch,1);\ +\ + PASTECH(ch,scal2v_ker_ft) scal2v_kr_ptr;\ +\ + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_SCAL2V_KER, cntx);\ +\ + /* + Invoke the SCAL2V kernel using the function pointer + */\ + scal2v_kr_ptr\ + (\ + BLIS_NO_CONJUGATE,\ + n_elem,\ + alpha_passed,\ + x, incx,\ + x_temp, temp_incx,\ + cntx\ + );\ +\ + /* + Mark x as packed, since the memory allocation was + successful and contents have been copied + */\ + is_x_temp_buf_created = TRUE;\ + }\ + }\ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ @@ -83,7 +163,7 @@ void PASTEMAC(ch,varname) \ f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a + (i )*rs_at + (0 )*cs_at; \ - x1 = x + (0 )*incy; \ + x1 = x_temp + (0 )*temp_incx; \ y1 = y + (i )*incy; \ \ /* y1 = beta * y1 + alpha * A1 * x; */ \ @@ -95,13 +175,23 @@ void PASTEMAC(ch,varname) \ f, \ alpha, \ A1, cs_at, rs_at, \ - x1, incx, \ + x1, temp_incx, \ beta, \ y1, incy, \ cntx \ ); \ \ } \ + /* + Check if temp y buffer was used for compute + */\ + if (is_x_temp_buf_created)\ + {\ + /* + Return the buffer to pool + */\ + bli_membrk_release(&rntm, &mem_bufX);\ + }\ } void bli_dgemv_unf_var1 @@ -122,20 +212,21 @@ void bli_dgemv_unf_var1 double *A1; double *y1; dim_t i; - dim_t f; + dim_t f, b_fuse; dim_t n_elem, n_iter; inc_t rs_at, cs_at; conj_t conja; - //memory pool declarations for packing vector X. - mem_t mem_bufX; - rntm_t rntm; - double *x_buf = x; - inc_t buf_incx = incx; - bli_init_once(); + // Copy the alpha to the temp alpha + double alpha_temp = *alpha; - if (cntx == NULL) - cntx = bli_gks_query_cntx(); + // Memory pool declarations for packing vector X. + mem_t mem_bufX; + rntm_t rntm; + double* x_temp = x; + inc_t temp_incx = incx; + + //bli_init_once(); bli_set_dims_incs_with_trans(transa, m, n, rs_a, cs_a, @@ -143,185 +234,151 @@ void bli_dgemv_unf_var1 conja = bli_extract_conj(transa); - // This function is invoked on all architectures including 'generic'. - // Non-AVX2+FMA3 platforms will use the kernels derived from the context. - if (bli_cpuid_is_avx2fma3_supported() == FALSE) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(d,type); - double* x1; - double* y1; - PASTECH(d,dotxf_ker_ft) kfp_df; - /* Query the context for the kernel function pointer and fusing factor. 
*/ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); - dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); + /* + Fatbinary config amdzen when run on non-AMD X86 + will query for the support of AVX512 or AVX2, if + AVX512 - arch_id will be zen4 or for AVX2 it will + be zen3 + */ + arch_t id = bli_arch_query_id(); - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + /* + Function pointer declaration for the functions + that will be used by this API + */ + ddotxf_ker_ft dotxf_kr_ptr; // DOTXF - A1 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (0 )*incy; - y1 = y + (i )*incy; + /* + Boolean to check if the y has been packed + and memory needs to be freed in the end + */ + bool is_x_temp_buf_created = FALSE; - /* y1 = beta * y1 + alpha * A1 * x; */ - kfp_df - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x1, incx, - beta, - y1, incy, - cntx - ); + switch (id) + { + case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: - } + /* + Assign the AVX2 based kernel function pointers for DOTXF, + SCAL2V and corresponding fusing factor of DOTXF kernel + */ - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; + dotxf_kr_ptr = bli_ddotxf_zen_int_8; + b_fuse = 8; + + break; + default: + // For non-Zen architectures, query the context if it is NULL + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + /* + Query the context for the kernel function pointers for + DOTXF, SCAL2V and corresponding fusing + factor of DOTXF kernel + */ + dotxf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_DOTXF_KER, cntx); + b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx); } + if (incx > 1) { - /* + /* Initialize mem pool buffer to NULL and size to 0 "buf" and "size" fields are assigned once memory is allocated from the pool in bli_membrk_acquire_m(). This will ensure bli_mem_is_alloc() will be passed on an allocated memory if created or a NULL . 
- */ + */ - mem_bufX.pblk.buf = NULL; - mem_bufX.pblk.block_size = 0; - mem_bufX.buf_type = 0; - mem_bufX.size = 0; - mem_bufX.pool = NULL; + mem_bufX.pblk.buf = NULL; + mem_bufX.pblk.block_size = 0; + mem_bufX.buf_type = 0; + mem_bufX.size = 0; + mem_bufX.pool = NULL; - /* In order to get the buffer from pool via rntm access to memory broker - is needed.Following are initializations for rntm */ + /* In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm */ - bli_rntm_init_from_global(&rntm); - bli_rntm_set_num_threads_only(1, &rntm); - bli_membrk_rntm_set_membrk(&rntm); + bli_rntm_init_from_global(&rntm); + bli_rntm_set_num_threads_only(1, &rntm); + bli_membrk_rntm_set_membrk(&rntm); - //calculate the size required for n_elem double elements in vector X. - size_t buffer_size = n_elem * sizeof(double); + //calculate the size required for n_elem double elements in vector X. + size_t buffer_size = n_elem * sizeof(double); #ifdef BLIS_ENABLE_MEM_TRACING - printf("bli_dgemv_unf_var1(): get mem pool block\n"); + printf("bli_dgemv_unf_var1(): get mem pool block\n"); #endif - /*acquire a Buffer(n_elem*size(double)) from the memory broker - and save the associated mem_t entry to mem_bufX.*/ - bli_membrk_acquire_m(&rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX); + /*acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufX.*/ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufX); - /*Continue packing X if buffer memory is allocated*/ - if ((bli_mem_is_alloc(&mem_bufX))) - { - x_buf = bli_mem_buffer(&mem_bufX); - - //pack X vector with non-unit stride to a temp buffer x_buf with unit stride - for (dim_t x_index = 0; x_index < n_elem; x_index++) - { - *(x_buf + x_index) = *(x + (x_index * incx)); - } - // stride of vector x_buf =1 - buf_incx = 1; - } - } + /*Continue packing X if buffer memory is allocated*/ + if 
((bli_mem_is_alloc(&mem_bufX))) + { + x_temp = bli_mem_buffer(&mem_bufX); + temp_incx = 1; - dim_t fuse_factor = 8; - dim_t f_temp =0; + // Query the context if it is NULL + if(cntx == NULL) cntx = bli_gks_query_cntx(); - if (n < 4) - { - fuse_factor = 2; - } else if (n < 8) - { - fuse_factor = 4; + dscal2v_ker_ft scal2v_kr_ptr; // SCAL2V + + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCAL2V_KER, cntx); + + /* + Invoke the SCAL2V function using the function + pointer and scale by alpha as we pack X vector + */ + scal2v_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + &alpha_temp, + x, incx, + x_temp, temp_incx, + cntx + ); + + is_x_temp_buf_created = TRUE; + + // Set alpha_temp to 1.0 since X has already been scaled by alpha + alpha_temp = *bli_d1; + } } for (i = 0; i < n_iter; i += f) { - f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor); + f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse); //A = a + i * row_increment + 0 * column_increment A1 = a + (i)*rs_at; y1 = y + (i)*incy; /* y1 = beta * y1 + alpha * A1 * x; */ - switch (f) - { - case 8: - - bli_ddotxf_zen_int_8( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x_buf, buf_incx, - beta, - y1, incy, - cntx); - - break; - default: - - if (f < 4) - { - bli_ddotxf_zen_int_2( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x_buf, buf_incx, - beta, - y1, incy, - cntx); - } - else - { - bli_ddotxf_zen_int_4( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x_buf, buf_incx, - beta, - y1, incy, - cntx); - } - } - - f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor); - - if (f_temp < fuse_factor) - { - switch (fuse_factor) - { - case 8: - fuse_factor = 4; - break; - case 4: - fuse_factor = 2; - break; - } - } + dotxf_kr_ptr + ( + conja, + conjx, + n_elem, + f, + &alpha_temp, + A1, cs_at, rs_at, + x_temp, temp_incx, + beta, + y1, incy, + cntx + ); } - if ((incx > 1) && bli_mem_is_alloc(&mem_bufX)) + if (is_x_temp_buf_created) { #ifdef 
BLIS_ENABLE_MEM_TRACING printf("bli_dgemv_unf_var1(): releasing mem pool block\n"); diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index 815e388f21..6f652db309 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2018, The University of Texas at Austin - Copyright (C) 2017 - 22, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2017 - 23, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -458,15 +458,69 @@ void bli_ddotxf_zen_int_8 return; } - // If b_n is not equal to the fusing factor, then perform the entire - // operation as a loop over dotxv. + /* + If b_n is not equal to the fusing factor, then perform the entire + operation as dotxv or perform the operation using dotxf kernels with + lower fuse factor. + */ if (b_n != fuse_fac) { - for (dim_t i = 0; i < b_n; ++i) + if (b_n >= 4) { - double *a1 = a + (0) * inca + (i)*lda; - double *x1 = x + (0) * incx; - double *psi1 = y + (i)*incy; + dim_t fuse = 4; + + bli_ddotxf_zen_int_4 + ( + conjat, + conjx, + m, + fuse, + alpha, + a, inca, lda, + x, incx, + beta, + y, incy, + cntx + ); + + // Increment the pointers + a = a + (fuse)*lda; + y = y + (fuse)*incy; + + // Decrement to point to the remaining compute left + b_n -= 4; + } + + if (b_n >= 2) + { + dim_t fuse = 2; + + bli_ddotxf_zen_int_2 + ( + conjat, + conjx, + m, + fuse, + alpha, + a, inca, lda, + x, incx, + beta, + y, incy, + cntx + ); + + // Increment the pointers + a = a + (fuse)*lda; + y = y + (fuse)*incy; + + b_n -= 2; + } + + if (b_n == 1) + { + double *a1 = a; + double *x1 = x; + double *psi1 = y; bli_ddotxv_zen_int( conjat, From 248d09c7229e7197b8eb4951771659d7d2f2d081 Mon Sep 17 00:00:00 2001 From: Chandrashekara K R Date: Tue, 8 Aug 2023 14:35:54 +0530 Subject: [PATCH 126/226] 
Version String Update AOCL-BLIS: Updated version string to AOCL-BLIS 4.1.1 Build Change-Id: Iced62a66d0859b3c7d4bcfe6f0e0527922e41cae --- so_version | 2 +- version | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/so_version b/so_version index 261e84c64a..8789ec07a4 100644 --- a/so_version +++ b/so_version @@ -1,2 +1,2 @@ 4 -0.1 +1.1 diff --git a/version b/version index 1454f6ed4b..627a3f43a6 100644 --- a/version +++ b/version @@ -1 +1 @@ -4.0.1 +4.1.1 From 278ca71706cf8cbe8139e08f8f187806af53432d Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Wed, 9 Aug 2023 13:14:48 +0530 Subject: [PATCH 127/226] Fixes for GEMV Functionality Issues - Added call to dsetv in dscalv. When DSCALV is invoked by DGEMV the SCAL function is expected to SET the vector to zero when alpha is 0. This change is done to ensure BLAS compatibility of DGEMV. - Fixed bug in DGEMV var 1. Reverted changes in DGEMV var 1 to remove packing and dispatch logic. - CMAKE now builds with _amd files for unf_var2 of GEMV. AMD-Internal: [CPUPL-3772] Change-Id: I0d60c9e1025a3a56419d6ae47ded509d50e5eade --- frame/2/gemv/CMakeLists.txt | 5 +- frame/2/gemv/bli_gemv_unf_var1_amd.c | 271 ++++++++++++---------- kernels/zen4/1/bli_scalv_zen_int_avx512.c | 23 ++ 3 files changed, 178 insertions(+), 121 deletions(-) diff --git a/frame/2/gemv/CMakeLists.txt b/frame/2/gemv/CMakeLists.txt index 633ec9431a..9768c9f6ff 100644 --- a/frame/2/gemv/CMakeLists.txt +++ b/frame/2/gemv/CMakeLists.txt @@ -1,10 +1,9 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## target_sources("${PROJECT_NAME}" PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unb_var1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var2.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_var_oapi.c ) @@ -17,10 +16,12 @@ if(${TARGET_ARCH} STREQUAL zen OR target_sources("${PROJECT_NAME}" PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var1_amd.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var2_amd.c ) else() target_sources("${PROJECT_NAME}" PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var1.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var2.c ) endif() diff --git a/frame/2/gemv/bli_gemv_unf_var1_amd.c b/frame/2/gemv/bli_gemv_unf_var1_amd.c index 4422b7587a..634fe29c86 100644 --- a/frame/2/gemv/bli_gemv_unf_var1_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c @@ -212,21 +212,20 @@ void bli_dgemv_unf_var1 double *A1; double *y1; dim_t i; - dim_t f, b_fuse; + dim_t f; dim_t n_elem, n_iter; inc_t rs_at, cs_at; conj_t conja; + //memory pool declarations for packing vector X. + mem_t mem_bufX; + rntm_t rntm; + double *x_buf = x; + inc_t buf_incx = incx; - // Copy the alpha to the temp alpha - double alpha_temp = *alpha; - - // Memory pool declarations for packing vector X. 
- mem_t mem_bufX; - rntm_t rntm; - double* x_temp = x; - inc_t temp_incx = incx; + bli_init_once(); - //bli_init_once(); + if (cntx == NULL) + cntx = bli_gks_query_cntx(); bli_set_dims_incs_with_trans(transa, m, n, rs_a, cs_a, @@ -234,151 +233,185 @@ void bli_dgemv_unf_var1 conja = bli_extract_conj(transa); - /* - Fatbinary config amdzen when run on non-AMD X86 - will query for the support of AVX512 or AVX2, if - AVX512 - arch_id will be zen4 or for AVX2 it will - be zen3 - */ - arch_t id = bli_arch_query_id(); - - /* - Function pointer declaration for the functions - that will be used by this API - */ - ddotxf_ker_ft dotxf_kr_ptr; // DOTXF - - /* - Boolean to check if the y has been packed - and memory needs to be freed in the end - */ - bool is_x_temp_buf_created = FALSE; - - switch (id) + // This function is invoked on all architectures including 'generic'. + // Non-AVX2+FMA3 platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx2fma3_supported() == FALSE) { - case BLIS_ARCH_ZEN4: - case BLIS_ARCH_ZEN: - case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN3: + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + const num_t dt = PASTEMAC(d,type); + double* x1; + double* y1; + PASTECH(d,dotxf_ker_ft) kfp_df; + /* Query the context for the kernel function pointer and fusing factor. 
*/ + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); + dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); - /* - Assign the AVX2 based kernel function pointers for DOTXF, - SCAL2V and corresponding fusing factor of DOTXF kernel - */ + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (0 )*incy; + y1 = y + (i )*incy; - dotxf_kr_ptr = bli_ddotxf_zen_int_8; - b_fuse = 8; + /* y1 = beta * y1 + alpha * A1 * x; */ + kfp_df + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, incx, + beta, + y1, incy, + cntx + ); - break; - default: - // For non-Zen architectures, query the context if it is NULL - if(cntx == NULL) cntx = bli_gks_query_cntx(); + } - /* - Query the context for the kernel function pointers for - DOTXF, SCAL2V and corresponding fusing - factor of DOTXF kernel - */ - dotxf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_DOTXF_KER, cntx); - b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; } - if (incx > 1) { - /* + /* Initialize mem pool buffer to NULL and size to 0 "buf" and "size" fields are assigned once memory is allocated from the pool in bli_membrk_acquire_m(). This will ensure bli_mem_is_alloc() will be passed on an allocated memory if created or a NULL . 
- */ + */ - mem_bufX.pblk.buf = NULL; - mem_bufX.pblk.block_size = 0; - mem_bufX.buf_type = 0; - mem_bufX.size = 0; - mem_bufX.pool = NULL; + mem_bufX.pblk.buf = NULL; + mem_bufX.pblk.block_size = 0; + mem_bufX.buf_type = 0; + mem_bufX.size = 0; + mem_bufX.pool = NULL; - /* In order to get the buffer from pool via rntm access to memory broker - is needed.Following are initializations for rntm */ + /* In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm */ - bli_rntm_init_from_global(&rntm); - bli_rntm_set_num_threads_only(1, &rntm); - bli_membrk_rntm_set_membrk(&rntm); + bli_rntm_init_from_global(&rntm); + bli_rntm_set_num_threads_only(1, &rntm); + bli_membrk_rntm_set_membrk(&rntm); - //calculate the size required for n_elem double elements in vector X. - size_t buffer_size = n_elem * sizeof(double); + //calculate the size required for n_elem double elements in vector X. + size_t buffer_size = n_elem * sizeof(double); #ifdef BLIS_ENABLE_MEM_TRACING - printf("bli_dgemv_unf_var1(): get mem pool block\n"); + printf("bli_dgemv_unf_var1(): get mem pool block\n"); #endif - /*acquire a Buffer(n_elem*size(double)) from the memory broker - and save the associated mem_t entry to mem_bufX.*/ - bli_membrk_acquire_m(&rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX); - - /*Continue packing X if buffer memory is allocated*/ - if ((bli_mem_is_alloc(&mem_bufX))) - { - x_temp = bli_mem_buffer(&mem_bufX); - temp_incx = 1; - - // Query the context if it is NULL - if(cntx == NULL) cntx = bli_gks_query_cntx(); + /*acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufX.*/ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufX); - dscal2v_ker_ft scal2v_kr_ptr; // SCAL2V - - scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCAL2V_KER, cntx); - - /* - Invoke the SCAL2V function using the function - pointer and scale by 
alpha as we pack X vector - */ - scal2v_kr_ptr - ( - BLIS_NO_CONJUGATE, - n_elem, - &alpha_temp, - x, incx, - x_temp, temp_incx, - cntx - ); + /*Continue packing X if buffer memory is allocated*/ + if ((bli_mem_is_alloc(&mem_bufX))) + { + x_buf = bli_mem_buffer(&mem_bufX); + + //pack X vector with non-unit stride to a temp buffer x_buf with unit stride + for (dim_t x_index = 0; x_index < n_elem; x_index++) + { + *(x_buf + x_index) = *(x + (x_index * incx)); + } + // stride of vector x_buf =1 + buf_incx = 1; + } + } - is_x_temp_buf_created = TRUE; + dim_t fuse_factor = 8; + dim_t f_temp =0; - // Set alpha_temp to 1.0 since X has already been scaled by alpha - alpha_temp = *bli_d1; - } + if (n < 4) + { + fuse_factor = 2; + } else if (n < 8) + { + fuse_factor = 4; } for (i = 0; i < n_iter; i += f) { - f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse); + f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor); //A = a + i * row_increment + 0 * column_increment A1 = a + (i)*rs_at; y1 = y + (i)*incy; /* y1 = beta * y1 + alpha * A1 * x; */ - dotxf_kr_ptr - ( - conja, - conjx, - n_elem, - f, - &alpha_temp, - A1, cs_at, rs_at, - x_temp, temp_incx, - beta, - y1, incy, - cntx - ); + switch (f) + { + case 8: + + bli_ddotxf_zen_int_8( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx); + + break; + default: + + if (f < 4) + { + bli_ddotxf_zen_int_2( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx); + } + else + { + bli_ddotxf_zen_int_4( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx); + } + } + + f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor); + + if (f_temp < fuse_factor) + { + switch (fuse_factor) + { + case 8: + fuse_factor = 4; + break; + case 4: + fuse_factor = 2; + break; + } + } } - if (is_x_temp_buf_created) + if ((incx > 1) && bli_mem_is_alloc(&mem_bufX)) { #ifdef 
BLIS_ENABLE_MEM_TRACING printf("bli_dgemv_unf_var1(): releasing mem pool block\n"); diff --git a/kernels/zen4/1/bli_scalv_zen_int_avx512.c b/kernels/zen4/1/bli_scalv_zen_int_avx512.c index 0ba20116ca..febd6aa8e9 100644 --- a/kernels/zen4/1/bli_scalv_zen_int_avx512.c +++ b/kernels/zen4/1/bli_scalv_zen_int_avx512.c @@ -269,6 +269,29 @@ void bli_dscalv_zen_int_avx512 cntx_t *restrict cntx ) { + // If the vector dimension is zero, or if alpha is unit, return early. + if (bli_zero_dim1(n) || PASTEMAC(d, eq1)(*alpha)) + return; + + // If alpha is zero, use setv. + if (PASTEMAC(d, eq0)(*alpha)) + { + double *zero = bli_d0; + if (cntx == NULL) cntx = bli_gks_query_cntx(); + dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SETV_KER, cntx); + + f + ( + BLIS_NO_CONJUGATE, + n, + zero, + x, incx, + cntx + ); + + return; + } + dim_t i = 0; double *restrict x0; From 0000cc88de49ff3e5af759f9c2c020450b4e6c36 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Fri, 4 Aug 2023 10:45:29 +0530 Subject: [PATCH 128/226] Removed local copy of cntx in TRSM - TRSM and GEMM has different blocksizes in zen4, in order to accommodate this, a local copy of cntx was created in TRSM. - Local copy of cntx has been removed and TRSM blocksizes are stored in cntx->trsmblkszs. - Functions to override and restore default blocksizes for TRSM are removed. Instead of overriding the default blocksizes, TRSM blocksizes are stored separately in cntx. - Pack buffers for TRSM have to be packed with TRSM blocksizes and GEMM pack buffers have to be packed with default blocksizes. To check if we are packing for TRSM, "family" argument is added in bli_packm_init_pack function. - BLIS_GEMM_FOR_TRSM_UKR has to be used for TRSM if it is set, if it is not set then BLIS_GEMM_UKR has to be used. This functionality has been added to all TRSM macro kernels. - Methods to retrieve TRSM blocksizes from cntx are added to bli_cntx.h. 
- Tests for micro kernels are modified to accommodate the change in signature of bli_packm_init_pack. AMD-Internal: [CPUPL-3781] Change-Id: Ia567215d6d1aa0f14eae5d3177f4a3dd63b4b20a --- config/amdzen/bli_family_amdzen.h | 24 ------- config/zen4/bli_cntx_init_zen4.c | 98 ++++++----------------------- config/zen4/bli_family_zen4.h | 24 ------- frame/1m/packm/bli_packm_cntl.c | 5 +- frame/1m/packm/bli_packm_cntl.h | 3 +- frame/1m/packm/bli_packm_init.c | 39 ++++++++++-- frame/1m/packm/bli_packm_init.h | 2 + frame/3/bli_l3_blocksize.c | 8 ++- frame/3/gemm/bli_gemm_cntl.c | 6 +- frame/3/trsm/bli_trsm_blk_var1.c | 6 +- frame/3/trsm/bli_trsm_cntl.c | 9 ++- frame/3/trsm/bli_trsm_front.c | 35 ++--------- frame/3/trsm/bli_trsm_ll_ker_var2.c | 24 ++----- frame/3/trsm/bli_trsm_lu_ker_var2.c | 24 ++----- frame/3/trsm/bli_trsm_rl_ker_var2.c | 24 ++----- frame/3/trsm/bli_trsm_ru_ker_var2.c | 24 ++----- frame/base/bli_cntx.c | 11 ++++ frame/base/bli_cntx.h | 45 ++++++++++--- frame/base/bli_gks.c | 30 +++++++++ frame/thread/bli_thread.c | 25 +++++++- testsuite/src/test_gemm_ukr.c | 6 +- testsuite/src/test_gemmtrsm_ukr.c | 67 +++++++------------- testsuite/src/test_trsm_ukr.c | 8 ++- 23 files changed, 239 insertions(+), 308 deletions(-) diff --git a/config/amdzen/bli_family_amdzen.h b/config/amdzen/bli_family_amdzen.h index 5a1fbc68d9..eda853f356 100644 --- a/config/amdzen/bli_family_amdzen.h +++ b/config/amdzen/bli_family_amdzen.h @@ -59,29 +59,5 @@ // BLIS), defining this macro as 1 yields better performance. #define AOCL_BLIS_MULTIINSTANCE 0 -/* - * Override the block sizes in the context to the block sizes used - * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default - * GEMM kernels are AVX512 based and uses different block sizes. - * - * This function should be called in TRSM path before performing - * any packing operations. 
- * - * Also the context must be restored to default values by calling - * bli_zen4_restore_default_blkszs() before exiting TRSM Path - */ -BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx); - -/* - * Restore the block sizes to default values needed for zen4 context. - * - * This function should be called to restore the block sizes to there - * default values if they where overriden by calling - * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the - * TRSM path. - * - */ -BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx); - #endif diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index 066bf34df6..f5b7be7de5 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -235,6 +235,25 @@ void bli_cntx_init_zen4( cntx_t* cntx ) BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native TRSMK execution. + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 12 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 60 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 512 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 2004 ); + + bli_cntx_set_trsm_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. 
s d c z @@ -373,81 +392,4 @@ void bli_cntx_init_zen4( cntx_t* cntx ) BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, cntx ); -} - -/* - * Override the block sizes in the context to the block sizes used - * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default - * GEMM kernels are AVX512 based and uses different block sizes. - * - * This function should be called in TRSM path before performing - * any packing operations. - * - * Also the context must be restored to default values by calling - * bli_zen4_restore_default_blkszs() before exiting TRSM Path - */ -void bli_zen4_override_trsm_blkszs (cntx_t* cntx) -{ - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 4080 ); - - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. - bli_cntx_set_blkszs - ( - BLIS_NAT, 5, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); -} - - -/* - * Restore the block sizes to default values needed for zen4 context. - * - * This function should be called to restore the block sizes to there - * default values if they where overriden by calling - * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the - * TRSM path. 
- * - */ -void bli_zen4_restore_default_blkszs (cntx_t* cntx) -{ - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - - if ( bli_init_model_query_id() == BLIS_MODEL_BERGAMO ) - { - BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs); - } - else // BLIS_MODEL_DEFAULT choice, also currently used for BLIS_MODEL_GENOA and BLIS_MODEL_GENOA_X - { - BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs); - } - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. - bli_cntx_set_blkszs - ( - BLIS_NAT, 7, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); -} +} \ No newline at end of file diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index 55d3e62d3c..0cd41b2b93 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -60,28 +60,4 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 -/* - * Override the block sizes in the context to the block sizes used - * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default - * GEMM kernels are AVX512 based and uses different block sizes. - * - * This function should be called in TRSM path before performing - * any packing operations. - * - * Also the context must be restored to default values by calling - * bli_zen4_restore_default_blkszs() before exiting TRSM Path - */ -BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx); - -/* - * Restore the block sizes to default values needed for zen4 context. - * - * This function should be called to restore the block sizes to there - * default values if they where overriden by calling - * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the - * TRSM path. 
- * - */ -BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx); - #endif diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index fc6ba8052c..4872d5a8cc 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,6 +38,7 @@ cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, + opid_t family, void_fp var_func, void_fp packm_var_func, bszid_t bmid_m, @@ -82,7 +83,7 @@ cntl_t* bli_packm_cntl_create_node cntl = bli_cntl_create_node ( rntm, - BLIS_NOID, + family, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 17aa196e8d..369ab0bc42 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -92,6 +92,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, + opid_t family, void_fp var_func, void_fp packm_var_func, bszid_t bmid_m, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 57c1175bfe..a23da8c342 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -180,6 +181,7 @@ siz_t bli_packm_init bli_packm_init_pack ( invert_diag, + bli_cntl_family( cntl ), schema, pack_ord_if_up, pack_ord_if_lo, @@ -198,6 +200,7 @@ siz_t bli_packm_init siz_t bli_packm_init_pack ( invdiag_t invert_diag, + opid_t family, pack_t schema, packord_t pack_ord_if_up, packord_t pack_ord_if_lo, @@ -215,10 +218,11 @@ siz_t bli_packm_init_pack trans_t transa = bli_obj_onlytrans_status( a ); dim_t m_a = bli_obj_length( a ); dim_t n_a = bli_obj_width( a ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); + + dim_t bmult_m_def = 0; + dim_t bmult_m_pack = 0; + dim_t bmult_n_def = 0; + dim_t bmult_n_pack = 0; dim_t m_p, n_p; dim_t m_p_pad, n_p_pad; @@ -227,6 +231,33 @@ siz_t bli_packm_init_pack inc_t rs_p, cs_p; inc_t is_p; + if( family == BLIS_TRSM ) + { + bmult_m_def = bli_cntx_get_trsm_blksz_def_dt( dt_tar, bmult_id_m, 
cntx ); + bmult_m_pack = bli_cntx_get_trsm_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + bmult_n_def = bli_cntx_get_trsm_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + bmult_n_pack = bli_cntx_get_trsm_blksz_max_dt( dt_tar, bmult_id_n, cntx ); + + // bmult_m_def will be zero when trsm block sizes are not set, use global + // block sizes in this case + if( bmult_m_def == 0 ) + { + bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + } + if( bmult_n_def == 0 ) + { + bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); + } + } + else + { + bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); + } // We begin by copying the fields of A. bli_obj_alias_to( a, p ); diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 9365a131ef..73ed04dd4e 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -43,6 +44,7 @@ siz_t bli_packm_init BLIS_EXPORT_BLIS siz_t bli_packm_init_pack ( invdiag_t invert_diag, + opid_t family, pack_t schema, packord_t pack_ord_if_up, packord_t pack_ord_if_lo, diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 580b1ca2a5..595b5410ab 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -330,7 +330,11 @@ dim_t PASTEMAC0(opname) \ because even when the triangle is on the right, packing of that matrix uses MR, since only left-side trsm micro-kernels are supported. */ \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + mnr = bli_cntx_get_trsm_blksz_def_dt( dt, BLIS_MR, cntx ); \ + if( mnr == 0 ) \ + { \ + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + } \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 1e50fd7c7e..6adaa81165 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -94,6 +94,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, + BLIS_GEMM, bli_gemm_packa, // pack the left-hand operand packa_fp, BLIS_MR, @@ -120,6 +121,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( rntm, + BLIS_GEMM, bli_gemm_packb, // pack the right-hand operand packb_fp, BLIS_KR, @@ -194,6 +196,7 @@ cntl_t* bli_gemmpb_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand + BLIS_GEMM, bli_packm_blk_var1, BLIS_KR, BLIS_MR, @@ -219,6 +222,7 @@ cntl_t* bli_gemmpb_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand + BLIS_GEMM, bli_packm_blk_var1, BLIS_NR, BLIS_KR, diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 51a444e840..16068cb0a5 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -152,8 +152,8 @@ void bli_trsm_blk_var1 { obj_t a11, c1; - // Determine the current algorithmic blocksize for GEMM. - b_alg = bli_determine_blocksize( BLIS_GEMM, direct, i, my_end, &ax1, + // Determine the current algorithmic blocksize for GEMM_FOR_TRSM. + b_alg = bli_determine_blocksize( BLIS_TRSM, direct, i, my_end, &ax1, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. 
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 4a7a4de8fd..e186c98545 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -68,6 +68,7 @@ cntl_t* bli_trsm_l_cntl_create packb_fp = bli_packm_blk_var1; const opid_t family = BLIS_TRSM; + opid_t pack_family = BLIS_TRSM; // // Create nodes for packing A and the macro-kernel (gemm branch). @@ -95,6 +96,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, + pack_family, bli_trsm_packa, // trsm operation's packm function for A. packa_fp, BLIS_MR, @@ -133,6 +135,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, + pack_family, bli_trsm_packa, // trsm operation's packm function for A. packa_fp, BLIS_MR, @@ -171,6 +174,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, + pack_family, bli_trsm_packb, packb_fp, BLIS_MR, @@ -220,6 +224,7 @@ cntl_t* bli_trsm_r_cntl_create void_fp packb_fp = bli_packm_blk_var1; const opid_t family = BLIS_TRSM; + opid_t pack_family = BLIS_TRSM; // Create two nodes for the macro-kernel. 
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node @@ -244,6 +249,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, + pack_family, bli_trsm_packa, packa_fp, BLIS_NR, @@ -270,6 +276,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, + pack_family, bli_trsm_packb, packb_fp, BLIS_MR, diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 07555301bb..080b9713f0 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -145,47 +145,20 @@ void bli_trsm_front rntm ); - // If TRSM and GEMM have different blocksizes and blocksizes - // are changed in global cntx object, when GEMM and TRSM are - // called in parallel, blocksizes in global cntx object will - // not be correct for GEMM - // to fix this - // create a local copy of cntx so that overriding the blocksizes does - // not impact the global cntx object. - cntx_t cntx_trsm = *cntx; - // A sort of hack for communicating the desired pack schemas for A and B // to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and // bli_l3_cntl_create_if()). This allows us to access the schemas from // the control tree, which hopefully reduces some confusion, particularly // in bli_packm_init(). - if ( bli_cntx_method( &cntx_trsm ) == BLIS_NAT ) + if ( bli_cntx_method( cntx ) == BLIS_NAT ) { -#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4) - /* Zen4 TRSM Fixme: - * - * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels - * for TRSM (Till we implement TRSM AVX-512 kernels) - * - * The AVX2 kernels use different block sizes then AVX512 kernels - * Here we override the default block sizes in the context with AVX2 - * specific block size used in GEMMTRSM kernerls. - * - * We need to revisit this when TRSM AVX-512 kernels are implemented. 
- */ - if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) && - ((bli_obj_dt(a) == BLIS_FLOAT) || (bli_obj_dt(a) == BLIS_DOUBLE)) ) - { - bli_zen4_override_trsm_blkszs(&cntx_trsm); - } -#endif bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); } else // if ( bli_cntx_method( cntx_trsm ) != BLIS_NAT ) { - pack_t schema_a = bli_cntx_schema_a_block( &cntx_trsm ); - pack_t schema_b = bli_cntx_schema_b_panel( &cntx_trsm ); + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); bli_obj_set_pack_schema( schema_a, &a_local ); bli_obj_set_pack_schema( schema_b, &b_local ); @@ -201,7 +174,7 @@ void bli_trsm_front &b_local, alpha, &c_local, - &cntx_trsm, + cntx, rntm, cntl ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 75d5241f55..279df5277a 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -173,26 +173,12 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Zen4 TRSM Fixme: - * - * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels - * for TRSM (Till we implement TRSM AVX-512 kernels) - * - * The AVX2 kernels for TRSM are enabled in the context, but they - * are compatible with only AVX2 version of GEMM kernels. - * - * Here we force the GEMM kernels to the AVX2 varients for float and double. - * For scomplex and dcomplex reference path is retained as is. - * - * We need to revisit this when TRSM AVX-512 kernels are implemented. 
- */ \ - bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + if( gemm_ukr == NULL || bli_cntx_method( cntx ) != BLIS_NAT ) \ { \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ - col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ } \ \ /* Temporary C buffer for edge cases. Note that the strides of this diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 27cea4dc3c..5a68106a78 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -173,26 +173,12 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Zen4 TRSM Fixme: - * - * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels - * for TRSM (Till we implement TRSM AVX-512 kernels) - * - * The AVX2 kernels for TRSM are enabled in the context, but they - * are compatible with only AVX2 version of GEMM kernels. - * - * Here we force the GEMM kernels to the AVX2 varients for float and double. - * For scomplex and dcomplex reference path is retained as is. - * - * We need to revisit this when TRSM AVX-512 kernels are implemented. 
- */ \ - bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + if( gemm_ukr == NULL || bli_cntx_method( cntx ) != BLIS_NAT ) \ { \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ - col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ } \ \ /* Temporary C buffer for edge cases. Note that the strides of this diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 7c57438a2d..507785c6cc 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -179,26 +179,12 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Zen4 TRSM Fixme: - * - * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels - * for TRSM (Till we implement TRSM AVX-512 kernels) - * - * The AVX2 kernels for TRSM are enabled in the context, but they - * are compatible with only AVX2 version of GEMM kernels. - * - * Here we force the GEMM kernels to the AVX2 varients for float and double. - * For scomplex and dcomplex reference path is retained as is. - * - * We need to revisit this when TRSM AVX-512 kernels are implemented. 
- */ \ - bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + if( gemm_ukr == NULL || bli_cntx_method( cntx ) != BLIS_NAT ) \ { \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ - col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ } \ \ /* Temporary C buffer for edge cases. Note that the strides of this diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 766a6b95c1..53ad570f60 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -178,26 +178,12 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Zen4 TRSM Fixme: - * - * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels - * for TRSM (Till we implement TRSM AVX-512 kernels) - * - * The AVX2 kernels for TRSM are enabled in the context, but they - * are compatible with only AVX2 version of GEMM kernels. - * - * Here we force the GEMM kernels to the AVX2 varients for float and double. - * For scomplex and dcomplex reference path is retained as is. - * - * We need to revisit this when TRSM AVX-512 kernels are implemented. 
- */ \ - bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + if( gemm_ukr == NULL || bli_cntx_method( cntx ) != BLIS_NAT ) \ { \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ - col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ } \ \ /* Temporary C buffer for edge cases. Note that the strides of this diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 774a31ead7..b3cdb343d0 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -424,11 +424,14 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) // to the blocksize multiple. blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); blksz_t* cntx_bmult = bli_cntx_get_bmult( bs_id, cntx ); + blksz_t* cntx_trsm_blksz = bli_cntx_get_trsm_blksz( bs_id, cntx ); // Copy the real domain values of the blksz_t object into the // the complex domain slots of the same object. bli_blksz_copy_dt( BLIS_FLOAT, cntx_blksz, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_copy_dt( BLIS_DOUBLE, cntx_blksz, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, cntx_blksz, BLIS_SCOMPLEX, cntx_trsm_blksz); + bli_blksz_copy_dt( BLIS_DOUBLE, cntx_blksz, BLIS_DCOMPLEX, cntx_trsm_blksz); // If the default blocksize scalar is non-unit, we need to scale // the complex domain default blocksizes. @@ -438,6 +441,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) // blocksize object. 
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_trsm_blksz); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_trsm_blksz); // Perform rounding to ensure the newly scaled values are still // multiples of their register blocksize multiples. But only @@ -451,6 +456,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) // Round the newly-scaled blocksizes down to their multiple. bli_blksz_reduce_def_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_reduce_def_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_trsm_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_trsm_blksz ); } } @@ -462,6 +469,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) // blocksize object. bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_trsm_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_trsm_blksz ); // Perform rounding to ensure the newly scaled values are still // multiples of their register blocksize multiples. But only @@ -475,6 +484,8 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) // Round the newly-scaled blocksizes down to their multiple. 
bli_blksz_reduce_max_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_reduce_max_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_trsm_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_trsm_blksz ); } } } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index e76c544ae6..8de023a2b2 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -212,6 +212,16 @@ BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) return blksz; } +BLIS_INLINE blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blkszs = bli_cntx_trsm_blkszs_buf( cntx ); + blksz_t* blksz = &blkszs[ bs_id ]; + + // Return the address of the blksz_t identified by bs_id. + return blksz; + +} + BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); @@ -221,6 +231,15 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cn return bs_dt; } +BLIS_INLINE dim_t bli_cntx_get_trsm_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blksz = bli_cntx_get_trsm_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_def( dt, blksz ); + + // Return the main (default) blocksize value for the datatype given. + return bs_dt; +} + BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); @@ -230,6 +249,15 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cn return bs_dt; } +BLIS_INLINE dim_t bli_cntx_get_trsm_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blksz = bli_cntx_get_trsm_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_max( dt, blksz ); + + // Return the auxiliary (maximum) blocksize value for the datatype given. 
+ return bs_dt; +} + BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); @@ -246,6 +274,14 @@ BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) return bmult; } +BLIS_INLINE blksz_t* bli_cntx_get_trsm_bmult( bszid_t bs_id, cntx_t* cntx ) +{ + bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); + blksz_t* restrict bmult = bli_cntx_get_trsm_blksz( bm_id, cntx ); + + return bmult; +} + BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); @@ -351,15 +387,6 @@ BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_tri_blksz( bszid_t bs_id, cntx_t* cntx // Return the address of the blksz_t identified by bs_id. return blksz; } -BLIS_INLINE blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_trsm_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - // Return the address of the blksz_t identified by bs_id. - return blksz; - -} BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 56eb556977..d871c5c025 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -447,6 +447,36 @@ void bli_gks_register_cntx e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val ); #endif + + + // Verify that cache blocksizes are whole multiples of register blocksizes for TRSM. + mc = bli_cntx_get_trsm_blksz( BLIS_MC, gks_id_nat ); + nc = bli_cntx_get_trsm_blksz( BLIS_NC, gks_id_nat ); + kc = bli_cntx_get_trsm_blksz( BLIS_KC, gks_id_nat ); + mr = bli_cntx_get_trsm_blksz( BLIS_MR, gks_id_nat ); + nr = bli_cntx_get_trsm_blksz( BLIS_NR, gks_id_nat ); + kr = bli_cntx_get_trsm_blksz( BLIS_KR, gks_id_nat ); + + // If trsm blocksizes are not set then skip check. 
+ for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + { + dim_t mr_dt = bli_blksz_get_def( dt, mr ); + dim_t nr_dt = bli_blksz_get_def( dt, nr ); + dim_t kr_dt = bli_blksz_get_def( dt, kr ); + + if( mr_dt == 0 || nr_dt == 0 || kr_dt == 0 ) + { + return; + } + } + + e_val = bli_check_valid_mc_mod_mult( mc, mr ); bli_check_error_code( e_val ); + e_val = bli_check_valid_nc_mod_mult( nc, nr ); bli_check_error_code( e_val ); + e_val = bli_check_valid_kc_mod_mult( kc, kr ); bli_check_error_code( e_val ); +#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS + e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val ); + e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val ); +#endif } // ----------------------------------------------------------------------------- diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 60c2b37f34..b5e9cfed73 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -674,6 +674,7 @@ siz_t bli_thread_range_mdim { bszid_t bszid = bli_cntl_bszid( cntl ); opid_t family = bli_cntl_family( cntl ); + blksz_t* bmult; // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -681,11 +682,20 @@ siz_t bli_thread_range_mdim // packing A and B. 
if ( family == BLIS_TRSM ) { + bmult = bli_cntx_get_trsm_bmult( bszid, cntx); + // if trsm blockszs are not set then use global blockszs + if (bli_blksz_get_def( bli_obj_dt( a ) , bmult ) == 0) + { + bmult = bli_cntx_get_bmult( bszid, cntx ); + } if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; else bszid = BLIS_NR; } + else + { + bmult = bli_cntx_get_bmult( bszid, cntx ); + } - blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); obj_t* x; bool use_weighted; @@ -734,6 +744,7 @@ siz_t bli_thread_range_ndim { bszid_t bszid = bli_cntl_bszid( cntl ); opid_t family = bli_cntl_family( cntl ); + blksz_t* bmult; // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -741,11 +752,21 @@ siz_t bli_thread_range_ndim // packing A and B. if ( family == BLIS_TRSM ) { + bmult = bli_cntx_get_trsm_bmult( bszid, cntx); + + // if trsm blockszs are not set then use global blockszs + if (bli_blksz_get_def( bli_obj_dt( a ) , bmult ) == 0) + { + bmult = bli_cntx_get_bmult( bszid, cntx ); + } if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; else bszid = BLIS_NR; } + else + { + bmult = bli_cntx_get_bmult( bszid, cntx ); + } - blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); obj_t* x; bool use_weighted; diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 48996f28e7..8e3858ee83 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -284,10 +284,10 @@ void libblis_test_gemm_ukr_experiment // allocated so we can re-store it to the object afterward. 
void* buf_ap = bli_obj_buffer( &ap ); void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_GEMM, BLIS_PACKED_ROW_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_GEMM, BLIS_PACKED_COL_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, BLIS_KR, BLIS_NR, &b, &bp, cntx ); bli_obj_set_buffer( buf_ap, &ap ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 7ce7034453..34e45645d3 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -209,38 +209,6 @@ void libblis_test_gemmtrsm_ukr_experiment // Query a context. cntx = bli_gks_query_cntx(); - // If TRSM and GEMM have different blocksizes and blocksizes - // are changed in global cntx object, when GEMM and TRSM are - // called in parallel, blocksizes in global cntx object will - // not be correct - // to fix this a local copy of cntx is created, so that - // overriding the blocksizes does not impact the global cntx - // object. - // This is a temporary fix, a better fix is to create a - // separate blocksz_trsm array in cntx. - cntx_t cntx_trsm = *cntx; - -#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4) - /* Zen4 TRSM Fixme: - * - * TRSM and GEMM used different values of MR and NR, we need to ensure that - * Values used for packing are as per the MR and NR values expected by the kernels - * For now this issue exists only for zen4 hence override the values here if - * the family is BLIS_TRSM and architecture is zen4 - * - * We need to override the values here as well as the packing and compute - * kernels are invoked directly from here (instead of BLIS/BLAS call.) - * - * We need to revisit this when TRSM AVX-512 kernels are implemented. 
- */ - if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) && - ((dc_str[0] == 's') || (dc_str[0] == 'd') || - (dc_str[0] == 'S') || (dc_str[0] == 'D')) ) - { - bli_zen4_override_trsm_blkszs(&cntx_trsm); - } -#endif - // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -248,14 +216,25 @@ void libblis_test_gemmtrsm_ukr_experiment k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); - m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx_trsm ); - n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, &cntx_trsm ); + m = bli_cntx_get_trsm_blksz_def_dt( datatype, BLIS_MR, cntx ); + n = bli_cntx_get_trsm_blksz_def_dt( datatype, BLIS_NR, cntx ); // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, // respectively. - ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, &cntx_trsm ); - ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, &cntx_trsm); + ldap = bli_cntx_get_trsm_blksz_max_dt( datatype, BLIS_MR, cntx ); + ldbp = bli_cntx_get_trsm_blksz_max_dt( datatype, BLIS_NR, cntx); + // if trsm block sizes are not set use global block sizes + if( m == 0 || n == 0) + { + m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); + n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); + + // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, + // respectively. + ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); + ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx); + } // Store the register blocksizes so that the driver can retrieve the // values later when printing results. @@ -372,12 +351,12 @@ void libblis_test_gemmtrsm_ukr_experiment // allocated so we can re-store it to the object afterward. 
void* buf_ap = bli_obj_buffer( &ap ); void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_TRSM, BLIS_PACKED_ROW_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, &cntx_trsm ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, + BLIS_MR, BLIS_KR, &a, &ap, cntx ); + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_TRSM, BLIS_PACKED_COL_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, &cntx_trsm ); + BLIS_KR, BLIS_NR, &b, &bp, cntx ); bli_obj_set_buffer( buf_ap, &ap ); bli_obj_set_buffer( buf_bp, &bp ); @@ -391,8 +370,8 @@ void libblis_test_gemmtrsm_ukr_experiment bli_obj_set_uplo( uploa, &ap ); // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, &cntx_trsm, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, &cntx_trsm, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); // Create subpartitions from the a and b panels. bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, @@ -415,13 +394,13 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Re-pack (restore) the contents of b to bp. //bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, &cntx_trsm, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); libblis_test_gemmtrsm_ukr_impl( iface, side, &alpha, &a1xp, &a11p, &bx1p, &b11p, &c11, - &cntx_trsm ); + cntx ); time_min = bli_clock_min_diff( time_min, time ); } diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 6366e5fc3c..890c0598cc 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -285,10 +285,12 @@ void libblis_test_trsm_ukr_experiment // allocated so we can re-store it to the object afterward. void* buf_ap = bli_obj_buffer( &ap ); void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + // trsm_ukr are derived from gemm kernels therefore packing is done with + // gemm blocksizes + bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_GEMM, BLIS_PACKED_ROW_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_GEMM, BLIS_PACKED_COL_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, BLIS_KR, BLIS_NR, &b, &bp, cntx ); bli_obj_set_buffer( buf_ap, &ap ); From a6641dec0b3e7c7eadc98c47bacd8f0620a8adfe Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Mon, 10 Jul 2023 19:07:23 +0530 Subject: [PATCH 129/226] Updating GTestSuite CMake system to enable testing BLIS libraries on Windows. - Renaming ELEMENT_TYPE to BLIS_ELEMENT_TYPE, since the first is defined on a Windows header. - Updating refCBLAS object to have different implementation depending on the platform. - Removing dlfcn.h from all reference headers since it's linux specific and adding it conditionally on a higher level. - Changes on all CMakeLists.txt files to enable building on Windows. 
AMD-Internal: [CPUPL-2732] Change-Id: I6e35656a3779b35dc815a2409cf84c22dd27f3e7 --- gtestsuite/CMakeLists.txt | 305 +++++++++++------- gtestsuite/README.md | 92 +++++- gtestsuite/testinghelpers/CMakeLists.txt | 17 +- .../inc/common/data_generators.h | 6 +- .../testinghelpers/inc/common/refCBLAS.h | 34 +- .../src/common/complex_helpers.cpp | 8 +- .../testinghelpers/src/common/refCBLAS.cpp | 59 +++- .../testinghelpers/src/level1/ref_addv.cpp | 9 +- .../testinghelpers/src/level1/ref_amaxv.cpp | 9 +- .../testinghelpers/src/level1/ref_axpbyv.cpp | 25 +- .../testinghelpers/src/level1/ref_axpyv.cpp | 9 +- .../testinghelpers/src/level1/ref_copyv.cpp | 9 +- .../testinghelpers/src/level1/ref_dotv.cpp | 9 +- .../testinghelpers/src/level1/ref_dotxv.cpp | 1 - .../testinghelpers/src/level1/ref_scal2v.cpp | 9 +- .../testinghelpers/src/level1/ref_scalv.cpp | 9 +- .../testinghelpers/src/level1/ref_xpbyv.cpp | 17 +- .../testinghelpers/src/level2/ref_gemv.cpp | 9 +- .../testinghelpers/src/level2/ref_ger.cpp | 13 +- .../testinghelpers/src/level2/ref_hemv.cpp | 5 +- .../testinghelpers/src/level2/ref_her.cpp | 5 +- .../testinghelpers/src/level2/ref_her2.cpp | 5 +- .../testinghelpers/src/level2/ref_symv.cpp | 5 +- .../testinghelpers/src/level2/ref_syr.cpp | 5 +- .../testinghelpers/src/level2/ref_syr2.cpp | 5 +- .../testinghelpers/src/level2/ref_trmv.cpp | 9 +- .../testinghelpers/src/level2/ref_trsv.cpp | 9 +- .../testinghelpers/src/level3/ref_gemm.cpp | 9 +- .../testinghelpers/src/level3/ref_gemmt.cpp | 9 +- .../testinghelpers/src/level3/ref_hemm.cpp | 5 +- .../testinghelpers/src/level3/ref_her2k.cpp | 5 +- .../testinghelpers/src/level3/ref_herk.cpp | 5 +- .../testinghelpers/src/level3/ref_symm.cpp | 9 +- .../testinghelpers/src/level3/ref_syr2k.cpp | 9 +- .../testinghelpers/src/level3/ref_syrk.cpp | 9 +- .../testinghelpers/src/level3/ref_trmm.cpp | 9 +- .../testinghelpers/src/level3/ref_trmm3.cpp | 1 - .../testinghelpers/src/level3/ref_trsm.cpp | 9 +- 
.../testinghelpers/src/util/ref_nrm2.cpp | 9 +- gtestsuite/testsuite/CMakeLists.txt | 38 ++- gtestsuite/testsuite/level1/addv/test_addv.h | 2 +- .../testsuite/level1/amaxv/test_amaxv.h | 6 +- .../level1/axpbyv/zaxpbyv_generic.cpp | 2 +- .../testsuite/level1/axpyv/saxpyv_generic.cpp | 4 +- gtestsuite/testsuite/level1/dotv/dotv.h | 28 +- .../testsuite/level1/dotxv/test_dotxv.h | 6 +- .../level1/scalv/scalv_extreme_cases.cpp | 4 +- gtestsuite/testsuite/level1/setv/test_setv.h | 2 +- gtestsuite/testsuite/level1/subv/test_subv.h | 2 +- .../testsuite/level2/gemv/cgemv_generic.cpp | 4 +- .../testsuite/level2/gemv/dgemv_generic.cpp | 4 +- .../testsuite/level2/gemv/sgemv_generic.cpp | 4 +- gtestsuite/testsuite/level2/gemv/test_gemv.h | 4 +- .../testsuite/level2/gemv/zgemv_generic.cpp | 4 +- .../testsuite/level2/ger/cger_generic.cpp | 4 +- .../testsuite/level2/ger/dger_generic.cpp | 4 +- .../testsuite/level2/ger/sger_generic.cpp | 4 +- gtestsuite/testsuite/level2/ger/test_ger.h | 4 +- .../testsuite/level2/ger/zger_generic.cpp | 4 +- .../testsuite/level2/hemv/chemv_generic.cpp | 2 +- .../testsuite/level2/hemv/zhemv_generic.cpp | 2 +- .../testsuite/level2/her/cher_generic.cpp | 2 +- .../testsuite/level2/her/zher_generic.cpp | 2 +- .../testsuite/level2/her2/cher2_generic.cpp | 2 +- .../testsuite/level2/her2/zher2_generic.cpp | 2 +- .../testsuite/level2/symv/dsymv_generic.cpp | 2 +- .../testsuite/level2/symv/ssymv_generic.cpp | 2 +- .../testsuite/level2/syr/dsyr_generic.cpp | 2 +- .../testsuite/level2/syr/ssyr_generic.cpp | 2 +- .../testsuite/level2/syr2/dsyr2_generic.cpp | 2 +- .../testsuite/level2/syr2/ssyr2_generic.cpp | 2 +- .../testsuite/level2/trmv/ctrmv_generic.cpp | 2 +- .../testsuite/level2/trmv/dtrmv_generic.cpp | 2 +- .../testsuite/level2/trmv/strmv_generic.cpp | 2 +- .../testsuite/level2/trmv/ztrmv_generic.cpp | 2 +- .../testsuite/level2/trsv/ctrsv_generic.cpp | 2 +- .../testsuite/level2/trsv/dtrsv_generic.cpp | 2 +- .../testsuite/level2/trsv/strsv_generic.cpp | 2 +- 
.../testsuite/level2/trsv/ztrsv_generic.cpp | 2 +- .../testsuite/level3/gemm/dgemm_generic.cpp | 2 +- .../testsuite/level3/gemm/sgemm_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemm/test_gemm.h | 2 +- .../testsuite/level3/gemm/zgemm_generic.cpp | 2 +- .../testsuite/level3/gemmt/test_gemmt.h | 2 +- .../testsuite/level3/gemmt/zgemmt_generic.cpp | 2 +- .../testsuite/level3/hemm/chemm_generic.cpp | 2 +- gtestsuite/testsuite/level3/hemm/test_hemm.h | 2 +- .../testsuite/level3/hemm/zhemm_generic.cpp | 2 +- .../testsuite/level3/her2k/cher2k_generic.cpp | 2 +- .../testsuite/level3/her2k/test_her2k.h | 2 +- .../testsuite/level3/her2k/zher2k_generic.cpp | 2 +- .../testsuite/level3/herk/cherk_generic.cpp | 2 +- .../testsuite/level3/herk/zherk_generic.cpp | 2 +- .../testsuite/level3/symm/csymm_generic.cpp | 2 +- .../testsuite/level3/symm/dsymm_generic.cpp | 2 +- .../testsuite/level3/symm/ssymm_generic.cpp | 2 +- gtestsuite/testsuite/level3/symm/test_symm.h | 2 +- .../testsuite/level3/symm/zsymm_generic.cpp | 2 +- .../testsuite/level3/syr2k/csyr2k_generic.cpp | 2 +- .../testsuite/level3/syr2k/dsyr2k_generic.cpp | 2 +- .../testsuite/level3/syr2k/ssyr2k_generic.cpp | 2 +- .../testsuite/level3/syr2k/test_syr2k.h | 2 +- .../testsuite/level3/syr2k/zsyr2k_generic.cpp | 2 +- .../testsuite/level3/syrk/csyrk_generic.cpp | 2 +- .../testsuite/level3/syrk/dsyrk_generic.cpp | 2 +- .../testsuite/level3/syrk/ssyrk_generic.cpp | 2 +- .../testsuite/level3/syrk/zsyrk_generic.cpp | 2 +- .../testsuite/level3/trmm/ctrmm_generic.cpp | 2 +- .../testsuite/level3/trmm/dtrmm_generic.cpp | 2 +- .../testsuite/level3/trmm/strmm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm/test_trmm.h | 2 +- .../testsuite/level3/trmm/ztrmm_generic.cpp | 2 +- .../testsuite/level3/trmm3/ctrmm3_generic.cpp | 2 +- .../testsuite/level3/trmm3/dtrmm3_generic.cpp | 2 +- .../testsuite/level3/trmm3/strmm3_generic.cpp | 2 +- .../testsuite/level3/trmm3/test_trmm3.h | 2 +- .../testsuite/level3/trmm3/ztrmm3_generic.cpp | 2 +- 
.../testsuite/level3/trsm/ctrsm_generic.cpp | 4 +- .../testsuite/level3/trsm/dtrsm_generic.cpp | 4 +- .../testsuite/level3/trsm/strsm_generic.cpp | 4 +- gtestsuite/testsuite/level3/trsm/test_trsm.h | 2 +- .../testsuite/level3/trsm/ztrsm_generic.cpp | 4 +- gtestsuite/testsuite/util/nrm2/test_nrm2.h | 2 +- 123 files changed, 623 insertions(+), 432 deletions(-) diff --git a/gtestsuite/CMakeLists.txt b/gtestsuite/CMakeLists.txt index 9f1b132a7d..f044ceaec2 100644 --- a/gtestsuite/CMakeLists.txt +++ b/gtestsuite/CMakeLists.txt @@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ]=] -cmake_minimum_required(VERSION 3.14.0) +cmake_minimum_required(VERSION 3.20.0) set(CMAKE_CXX_COMPILER ${CXX_COMPILER}) set(CMAKE_CXX_STANDARD 17) @@ -38,28 +38,6 @@ project(BLIS_GtestSuite) enable_testing() -# Set the path to the BLIS installation. -if(NOT(BLIS_PATH)) - message(FATAL_ERROR "Need to provide a BLIS installation path during CMake invocation. Please use \ - $ cmake .. -DBLIS_PATH=/home/username/blis_installation") -endif() -# Set the path to BLIS include directory. -set(BLIS_INCLUDE ${BLIS_PATH}/include/blis) - -# Set OpenMP as the default option -set(ENABLE_THREADING "openmp" CACHE STRING "Setting OpenMP as the threading library") -# Set the possible values of theading libraries for cmake-gui -set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "pthreads" "no") - -# Set static BLIS as the default library we build against. -set(BLIS_LINKING_TYPE "static" CACHE STRING "Linking to a static BLIS library") -# Set the possible values of BLIS linking type for cmake-gui -set_property(CACHE BLIS_LINKING_TYPE PROPERTY STRINGS "static" "shared") - -option(ENABLE_ASAN "Run tests using Address Sanatizer" OFF) - -option(ENABLE_COVERAGE "Run tests for Code Coderage" OFF) - # Set variable if the platform is Linux based. 
if(UNIX AND NOT APPLE) set(LINUX TRUE) @@ -70,94 +48,93 @@ if(APPLE) message(FATAL_ERROR "Build system does not support Apple platform.") endif() -# Use INT_SIZE to set the int type used for testing. -set(INT_SIZE "32" CACHE STRING "Library used to compute reference results.") -# Set the possible values of reference CBLAS for cmake-gui -set_property(CACHE INT_SIZE PROPERTY STRINGS "32" "64") -if( NOT ((INT_SIZE STREQUAL "32") OR (INT_SIZE STREQUAL "64")) ) - message(FATAL_ERROR "INT_SIZE option ${INT_SIZE} is not supported. Must be 32 or 64.") +# Set the path to the BLIS installation. +if(LINUX) + if(NOT(BLIS_PATH)) + message(FATAL_ERROR "Need to provide a BLIS installation path during CMake invocation. Please use \ + $ cmake .. -DBLIS_PATH=/home/username/blis_installation") + endif() + # Set the path to BLIS include directory. + set(BLIS_INCLUDE ${BLIS_PATH}/include/blis) +else() + if(NOT(BLIS_LIB_PATH)) + message(FATAL_ERROR "Need to provide a path to BLIS library during CMake invocation. Please use \ + $ cmake .. -DBLIS_LIB_PATH=/home/username/blis_installation/path_to_library") + endif() + # Set the path to BLIS include directory. + if(NOT(BLIS_INCLUDE)) + message(FATAL_ERROR "Need to provide a path to BLIS headers during CMake invocation. Please use \ + $ cmake .. -DBLIS_INCLUDE=/home/username/blis_installation/path_to_headers") + endif() endif() # Use REF_BLAS to set the library that will be used for reference results. 
set(REF_CBLAS CACHE STRING "Library used to compute reference results.") # Set the possible values of reference CBLAS for cmake-gui -set_property(CACHE REF_CBLAS PROPERTY STRINGS "OpenBLAS" "Netlib" "MKL") +if(LINUX) + set_property(CACHE REF_CBLAS PROPERTY STRINGS "OpenBLAS" "Netlib" "MKL") +else() + set_property(CACHE REF_CBLAS PROPERTY STRINGS "OpenBLAS" "MKL") +endif() +# Set OpenMP as the default option +set(ENABLE_THREADING "openmp" CACHE STRING "Setting OpenMP as the threading library") +# Set the possible values of theading libraries for cmake-gui +if(LINUX) + set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "pthreads" "no") +else() + set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "no") +endif() -if(REF_LIB) - set(REFLIB_PATH ${REF_LIB}/..) - find_library(reflib NAMES openblas cblas mkl_intel_lp64 mkl_intel_ilp64 PATHS ${REFLIB_PATH}) - if(${reflib} STREQUAL reflib-NOTFOUND) - message(FATAL_ERROR "Reference Library not found : " ${REF_LIB}) - else() - message(STATUS "Found Reference Library : " ${reflib}) + +# Set the possibe values of OpenMP runtimes +if(WIN32) + # Set LLVM OpenMP library as the default option + set(OpenMP_LIBRARY "LLVM" CACHE STRING "Using LLVM OpenMP library") + set_property(CACHE OpenMP_LIBRARY PROPERTY STRINGS "LLVM" "Intel") +endif() + +# If MKL is used as a reference set up the threading library options. +if(REF_CBLAS STREQUAL "MKL") + # MKL threading option is set up as BLIS threading option by default. + set(MKL_ENABLE_THREADING ${ENABLE_THREADING} CACHE STRING "Setting MKL threading the same as BLIS threading") +endif() + +# Set up OpenMP flags correctly if it's required. 
+if( (ENABLE_THREADING STREQUAL "openmp") OR (MKL_ENABLE_THREADING STREQUAL "openmp") ) + if(WIN32) + set(OpenMP_libomp_LIBRARY "C:/Program Files/LLVM/lib/libomp.lib" CACHE STRING "openmp library path") endif() - message( "Setting REF_LIB to ${REF_LIB}") -else() - if(REF_CBLAS STREQUAL "OpenBLAS") - if(NOT(OPENBLAS_PATH)) - message(FATAL_ERROR "Need to provide an OpenBLAS installation path \ - during CMake invokation when OpenBLAS is used for reference results. Please use \ - $ cmake .. -DOPENBLAS_PATH=/home/username/openblas_installation") - endif() - find_library(reflib NAMES openblas PATHS ${OPENBLAS_PATH}) - if(${reflib} STREQUAL reflib-NOTFOUND) - message(FATAL_ERROR "OpenBLAS Reference Library not found : " ${OPENBLAS_PATH}) - else() - message(STATUS "Found OpenBLAS Reference Library : " ${reflib}) - endif() - set(REF_LIB ${reflib}) - elseif(REF_CBLAS STREQUAL "Netlib") - if(NOT(NETLIB_PATH)) - message(FATAL_ERROR "Need to provide a Netlib installation path \ - during CMake invokation when Netlib is used for reference results. Please use \ - $ cmake .. 
-DNETLIB_PATH=/home/username/netlib_installation") - endif() - if(INT_SIZE STREQUAL "32") - find_library(netlib NAMES cblas PATHS ${NETLIB_PATH}) - else() - find_library(netlib NAMES cblas64 PATHS ${NETLIB_PATH}) - endif() - if(${netlib} STREQUAL netlib-NOTFOUND) - message(FATAL_ERROR "Netlib Reference Library not found : " ${NETLIB_PATH}) - else() - message(STATUS "Found Netlib Reference Library : " ${netlib}) - endif() - set(REF_LIB ${netlib}) - elseif(REF_CBLAS STREQUAL "MKL") - set(MKL_PATH $ENV{MKLROOT}/lib/intel64 - CACHE STRING "The path to MKL.") - if(INT_SIZE STREQUAL "32") - find_library(mkllib NAMES mkl_intel_lp64 PATHS ${MKL_PATH}) - else() - find_library(mkllib NAMES mkl_intel_ilp64 PATHS ${MKL_PATH}) - endif() - if(${mkllib} STREQUAL mkllib-NOTFOUND) - message(FATAL_ERROR "MKL Reference Library not found : " ${MKL_PATH}) - else() - message(STATUS "Found MKL Reference Library : " ${mkllib}) - endif() - set(REF_LIB ${mkllib}) - find_library(mklcore NAMES mkl_core PATHS ${MKL_PATH}) - if(${mklcore} STREQUAL mklcore-NOTFOUND) - message(FATAL_ERROR "MKL_CORE Library not found : " ${MKL_PATH}) - else() - message(STATUS "Found MKL_CORE Library : " ${mklcore}) - endif() - set(MKL_CORE_PATH ${mklcore}) - find_library(mklthread NAMES mkl_gnu_thread PATHS ${MKL_PATH}) - if(${mklthread} STREQUAL mklthread-NOTFOUND) - message(FATAL_ERROR "MKL_GNU_THREAD Library not found : " ${MKL_PATH}) - else() - message(STATUS "Found MKL_GNU_THREAD Library : " ${mklthread}) - endif() - set(MKL_GNU_THREAD_PATH ${mklthread}) + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") else() - message(FATAL_ERROR "Need to set up a reference library. 
Please use on of the following options \ - during CMake invokation: -DREF_CBLAS=Netlib or -DREF_CBLAS=OpenBLAS or -DREF_CBLAS=MKL") + message (FATAL_ERROR "Openmp Not Found, please provide an OpenMP library using -DOpenMP_libomp_LIBRARY=path_to_omp_lib.") endif() endif() +# Set static BLIS as the default library we build against. +set(BLIS_LINKING_TYPE "static" CACHE STRING "Linking to a static BLIS library") +# Set the possible values of BLIS linking type for cmake-gui +set_property(CACHE BLIS_LINKING_TYPE PROPERTY STRINGS "static" "shared") + +# Set common libraries. +if(LINUX) + set(COMMON_LIBS pthread m dl) + option(ENABLE_ASAN "Run tests using Address Sanatizer" OFF) + option(ENABLE_COVERAGE "Run tests for Code Coderage" OFF) +endif() + +# Use INT_SIZE to set the int type used for testing. +set(INT_SIZE "32" CACHE STRING "Integer size used in testing suite.") +# Set the possible values of reference CBLAS for cmake-gui +set_property(CACHE INT_SIZE PROPERTY STRINGS "32" "64") +if( NOT ((INT_SIZE STREQUAL "32") OR (INT_SIZE STREQUAL "64")) ) + message(FATAL_ERROR "INT_SIZE option ${INT_SIZE} is not supported. Must be 32 or 64.") +endif() + # Use TEST_INTERFACE to set which interface, supported by BLIS is meant to be tested. set(TEST_INTERFACE "BLAS" CACHE STRING "Interface that is being tested.") # Set the possible values of interfaces for cmake-gui @@ -167,17 +144,93 @@ if( NOT ((TEST_INTERFACE STREQUAL "BLAS") OR (TEST_INTERFACE STREQUAL "CBLAS") O during CMake invokation: -DTEST_INTERFACE=BLAS or -DTEST_INTERFACE=CBLAS or -DTEST_INTERFACE=BLIS_TYPED") endif() -# Use ELEMENT_TYPE to set whether the elements of any matrix/vector tested are integers or floating point values. -set(ELEMENT_TYPE "f" CACHE STRING "Type of elements of matrices/vectors") +# Use BLIS_ELEMENT_TYPE to set whether the elements of any matrix/vector tested are integers or floating point values. 
+set(BLIS_ELEMENT_TYPE "f" CACHE STRING "Type of elements of matrices/vectors") # Set the possible values of element types for cmake-gui -set_property(CACHE ELEMENT_TYPE PROPERTY STRINGS "f" "i") -if( NOT ((ELEMENT_TYPE STREQUAL "f") OR (ELEMENT_TYPE STREQUAL "i")) ) - message(FATAL_ERROR "ELEMENT_TYPE option ${ELEMENT_TYPE} is not supported. Please use on of the following options \ - during CMake invokation: -DELEMENT_TYPE=f or -DELEMENT_TYPE=i") +set_property(CACHE BLIS_ELEMENT_TYPE PROPERTY STRINGS "f" "i") +if( NOT ((BLIS_ELEMENT_TYPE STREQUAL "f") OR (BLIS_ELEMENT_TYPE STREQUAL "i")) ) + message(FATAL_ERROR "BLIS_ELEMENT_TYPE option ${BLIS_ELEMENT_TYPE} is not supported. Please use on of the following options \ + during CMake invokation: -DBLIS_ELEMENT_TYPE=f or -DBLIS_ELEMENT_TYPE=i") endif() -# Set common libraries. -set(COMMON_LIBS pthread m dl) +if(LINUX) + if(REF_LIB) + set(REFLIB_PATH ${REF_LIB}/..) + find_library(reflib NAMES openblas cblas mkl_rt PATHS ${REFLIB_PATH}) + if(${reflib} STREQUAL reflib-NOTFOUND) + message(FATAL_ERROR "Reference Library not found : " ${REF_LIB}) + else() + message(STATUS "Found Reference Library : " ${reflib}) + endif() + else() + if(REF_CBLAS STREQUAL "OpenBLAS") + if(NOT(OPENBLAS_PATH)) + message(FATAL_ERROR "Need to provide an OpenBLAS installation path \ + during CMake invokation when OpenBLAS is used for reference results. Please use \ + $ cmake .. -DOPENBLAS_PATH=/home/username/openblas_installation") + endif() + find_library(reflib NAMES openblas PATHS ${OPENBLAS_PATH}) + if(${reflib} STREQUAL reflib-NOTFOUND) + message(FATAL_ERROR "OpenBLAS Reference Library not found : " ${OPENBLAS_PATH}) + else() + message(STATUS "Found OpenBLAS Reference Library : " ${reflib}) + endif() + set(REF_LIB ${reflib}) + elseif(REF_CBLAS STREQUAL "Netlib") + if(NOT(NETLIB_PATH)) + message(FATAL_ERROR "Need to provide a Netlib installation path \ + during CMake invokation when Netlib is used for reference results. 
Please use \ + $ cmake .. -DNETLIB_PATH=/home/username/netlib_installation") + endif() + if(INT_SIZE STREQUAL "32") + find_library(netlib NAMES cblas PATHS ${NETLIB_PATH}) + else() + find_library(netlib NAMES cblas64 PATHS ${NETLIB_PATH}) + endif() + if(${netlib} STREQUAL netlib-NOTFOUND) + message(FATAL_ERROR "Netlib Reference Library not found : " ${NETLIB_PATH}) + else() + message(STATUS "Found Netlib Reference Library : " ${netlib}) + endif() + set(REF_LIB ${netlib}) + elseif(REF_CBLAS STREQUAL "MKL") + set(MKL_PATH $ENV{MKLROOT}/lib/intel64 + CACHE STRING "The path to MKL.") + find_library(mkllib NAMES mkl_rt PATHS ${MKL_PATH}) + if(${mkllib} STREQUAL mkllib-NOTFOUND) + message(FATAL_ERROR "MKL Reference Library not found : " ${MKL_PATH}) + else() + message(STATUS "Found MKL Reference Library : " ${mkllib}) + endif() + set(REF_LIB ${mkllib}) + else() + message(FATAL_ERROR "Need to set up a reference library. Please use on of the following options \ + during CMake invokation: -DREF_CBLAS=Netlib or -DREF_CBLAS=OpenBLAS or -DREF_CBLAS=MKL") + endif() + endif() +else() #WIN32 + if( NOT ((REF_CBLAS STREQUAL "OpenBLAS") OR (REF_CBLAS STREQUAL "MKL")) ) + message(FATAL_ERROR "REF_CBLAS option ${REF_CBLAS} is not supported. Please use on of the following options \ + during CMake invokation: -DREF_CBLAS=OpenBLAS or -DREF_CBLAS=MKL") + endif() + if(REF_CBLAS STREQUAL "OpenBLAS") + if(NOT(OPENBLAS_PATH)) + message(FATAL_ERROR "Need to provide an OpenBLAS installation path \ + during CMake invokation when OpenBLAS is used for reference results. Please use \ + $ cmake .. 
-DOPENBLAS_PATH=/home/username/openblas_installation") + endif() + set(REF_LIB "${OPENBLAS_PATH}/libopenblas.dll" CACHE STRING "Reference OpenBLAS Library") + message(STATUS "Found OpenBLAS Reference Library : " ${REF_LIB}) + elseif(REF_CBLAS STREQUAL "MKL") + if(NOT(MKL_PATH)) + message(FATAL_ERROR "Need to provide an MKL_PATH installation path \ + during CMake invokation when MKL] is used for reference results. Please use \ + $ cmake .. -DMKL_PATH=/home/username/path_to_mkl_rt") + endif() + set(REF_LIB "${MKL_PATH}/mkl_rt.2.dll" CACHE STRING "Reference MKL Library") + message(STATUS "Found MKL Reference Library : " ${REF_LIB}) + endif() +endif() # Set compiler options and BLIS library for Linux. if(LINUX) @@ -193,23 +246,18 @@ if(LINUX) set(CMAKE_CXX_FLAGS "-O0 --coverage") endif() - # Set GNU OpenMP library as the default option - set(OpenMP_LIBRARY "GNU" CACHE STRING "Using GNU OpenMP library") - # Set the possibe values of OpenMP runtimes - set_property(CACHE OpenMP_LIBRARY PROPERTY STRINGS "GNU" "Intel") - if(ENABLE_THREADING STREQUAL "no") if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.a" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.a") else() - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.so" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.so") endif() find_library(libblis NAMES blis PATHS ${BLIS_PATH}/lib) else() if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.a" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.a") else() - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.so" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.so") endif() find_library(libblis NAMES blis-mt PATHS ${BLIS_PATH}/lib) endif() @@ -218,24 +266,31 @@ if(LINUX) else() message(STATUS "Found BLIS Library : " ${Blis_LIBRARY}) endif() -endif() - -# Set BLIS library for Windows. 
-if(WIN32) +else() + add_definitions(-DBOOST_THREAD_USE_LIB) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + cmake_policy(SET CMP0091 NEW) + if(BLIS_LINKING_TYPE STREQUAL "shared") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") + else() + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + endif() + if(ENABLE_THREADING STREQUAL "no") if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_PATH}/bin/AOCL-LibBlis-Win.a" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win.lib") else() - set(Blis_LIBRARY "${BLIS_PATH}/bin/AOCL-LibBlis-Win-dll.lib" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-dll.lib") + set(BLIS_DLL "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-dll.dll") endif() else() if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_PATH}/bin/AOCL-LibBlis-Win-MT.a" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-MT.lib") else() - set(Blis_LIBRARY "${BLIS_PATH}/bin/AOCL-LibBlis-Win-MT-dll.lib" CACHE STRING "blis library path") + set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-MT-dll.lib") + set(BLIS_DLL "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-MT-dll.dll") endif() endif() - endif() add_subdirectory(testinghelpers) diff --git a/gtestsuite/README.md b/gtestsuite/README.md index 0cb220fbfe..b9d3bc44b3 100644 --- a/gtestsuite/README.md +++ b/gtestsuite/README.md @@ -41,7 +41,6 @@ First create and `build` directory using $ mkdir build $ cd build ``` - ## Configure BLIS GTestSuite with OpenBLAS as reference ```console $ cmake .. -DBLIS_PATH=/path_to_blis_installation -DREF_CBLAS=OpenBLAS -DOPENBLAS_PATH=/path_to_openblas_lib @@ -66,22 +65,29 @@ There are multiple configuration options to chose from when invoking CMake. Thos ## Compiler Options * `-DCMAKE_CXX_COMPILER=path_to_preferred_compiler` can be used to specify the compiler. * For example, to compile with Clang, use `-DCMAKE_CXX_COMPILER=clang++`. 
-## Threading Options (Linux Only) +## Threading Options * For single threaded BLIS, use `-DENABLE_THREADING=no`. -* For multithreaded BLIS that uses pthreads, use `-DENABLE_THREADING=pthreads`. +* For multithreaded BLIS that uses pthreads, use `-DENABLE_THREADING=pthreads` (Linux only). * For multithreaded BLIS that uses OpenMP, use `-DENABLE_THREADING=openmp`. [**Default**] - * In addition, to use Intel OpenMP runtime, use `-DOpenMP_LIBRARY=Intel`. - * For GNU OpenMP runtime, use `-DOpenMP_LIBRARY=GNU`. [**Default**] -## BLIS Linking Type (Linux Only) + * GNU OpenMP runtime is used by default on Linux. + * LLVM OpenMP runtime is used by default on Windows, except if MKL is used as a reference, in which case Intel OpenMP runtime is used. +## Threading Options for MKL (if used as reference) +In general, the variable `MKL_ENABLE_THREADING` gets its value from `ENABLE_THREADING` defined above, but can be overwritten, especially if we want to test single-threaded BLIS with multi-threaded MKL. +* For threaded MKL version, use `-DMKL_ENABLE_THREADING=openmp`. +For threaded MKL the following OpenMP runtimes are used: +* GNU is used by default on Linux. +* Intel is used by default on Windows. + +## BLIS Linking Type * To link static BLIS, use `-DBLIS_LINKING_TYPE=static`. [**Default**] * To link shared BLIS, use `-DBLIS_LINKING_TYPE=shared`. ## Integer Size * For testing a 32-bit integer BLIS library, use `-DINT_SIZE=32`. [**Default**"] * For testing a 64-bit integer BLIS library, use `-DINT_SIZE=64`. -## Address Sanitizer +## Address Sanitizer (Linux Only) * To build using address sanitizer, configure using `-DENABLE_ASAN=ON`. [**OFF by default**] * An installation to BLIS which was build with ASAN flags[CFLAGS="-O0 -g -fsanitize=address"] needs to be provided. -## Code Coverage[Only GCC Compiler] +## Code Coverage (Only GCC Compiler) * BLIS : Configure BLIS Library with code coverage flags[CFLAGS="-O0 -fprofile-arcs -ftest-coverage"], compile and install. 
* Gtestsuite : To build for code coverage, configure cmake with `-DENABLE_COVERAGE=ON`. [**OFF by default**] and then compile and run the executable. * CodeCoverage : in gtestsuite folder, run the below mentioned steps or bash script - to generate html LCOV-code coverage report. @@ -155,6 +161,11 @@ You can also find more details in [CMake Documentation](https://cmake.org/cmake/ ## Using the Executables As we mentioned earlier, all cpp files of each API directory are compiled into one executable. This executable can be run separately which can be very useful while developing or debugging. +When MKL is used as a reference, the following environment variables need to be set before calling the executables, depending on the configuration. +* MKL_INTERFACE_LAYER=LP64 or MKL_INTERFACE_LAYER=ILP64 depending on whether 32 or 64 bit integers are used, respectively. +* MKL_THREADING_LAYER=SEQUENTIAL for sequential MKL. +* MKL_THREADING_LAYER=INTEL or MKL_THREADING_LAYER=GNU depending on whether we execute on Windows or on Linux, respectively. + ### To run all addv tests use: ```console $ ./testsuite.level1.addv @@ -318,3 +329,68 @@ To overcome this issue and generate tests which fullfill the requirements for th * Add the lda_inc parameter: lda += lda_inc To test an m-by-n matrix A (column-major), stored in an array a, use lda_inc = 0 as a parameter to the test generator. To test for the case where A is a submatrix of k-by-n matrix B, use lda_inc = k-m. + +# BLIS GTestSuite on Windows +Building and running GTestSuite on Windows is somewhat similar to Linux. In this section we focus on what is different, so please read the previous sections so that you have the complete picture. + +The instructions are given for building and running through the terminal, but using cmake-gui is also possible. An x64 Native Tools Command Prompt can be used so that the environment is set and the necessary compilers are available.
+ +## Build System Generators +In the descriptions above we assumed that Make will be used to build the libraries. The same instructions can be modified so that Ninja is used instead in a straightforward manner. On Windows, where Make cannot be used, you can invoke CMake in one of the two ways below. Beware that the Windows environment needs to be set correctly, otherwise there might be some libraries missing. + +### Generate using Ninja +```console +$ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl [more variables that we'll explain later] +``` +We set the compilers to clang-cl, so that it's the same as the default option for BLIS library builds. + +### Generate using Visual Studio +```console +$ cmake .. -G "Visual Studio 17 2022" -TClangCl [more variables that we'll explain later] +``` +-TClangCl sets the toolset to be used for the generation. To see what VS generators you have available, type +```console +$ cmake --help +``` + +## Configuring CMake +On Windows currently the BLIS repo does not have a CMake target for the library and the headers, so to configure properly we need to replace the variable BLIS_PATH that was used on Linux with the variables BLIS_LIB_PATH and BLIS_INCLUDE. +So, we can invoke cmake using +```console +$ cmake .. -G "Visual Studio 17 2022" -TClangCl -DBLIS_LIB_PATH=/path_to_blis_libraries -DBLIS_INCLUDE=/path_to_blis_headers -DREF_CBLAS=OpenBLAS -DOPENBLAS_PATH=/path_to_openblas_dll +``` +## Additional CMake Configuration Options +The configuration is similar to Linux. In this section we only mention the specifics for Windows. +### BLIS Linking Type +* `-DBLIS_LINKING_TYPE=static` implies that AOCL-LibBlis-Win.lib (or AOCL-LibBlis-Win-MT.lib) will be tested. +* `-DBLIS_LINKING_TYPE=shared` implies that AOCL-LibBlis-Win-dll.lib (or AOCL-LibBlis-Win-MT-dll.lib) will be tested. Windows needs to find the corresponding dlls (AOCL-LibBlis-Win-dll.dll or AOCL-LibBlis-Win-MT-dll.dll) to be able to run the tests.
The CMake system prepends the Environment's PATH to the path provided during configuration, so that `ctest` can find the dll. To run the executables separately, you need to copy the dll manually, or specify the PATH. +### Threading Options +The path to the OpenMP runtime needs to be passed using `-DOpenMP_libomp_LIBRARY=/path_to_openmp_runtime`. + +## Building the Tests +The building process is similar to Linux with the main difference that a testinghelpers.lib is built. +### Building with Ninja +To build with Ninja, replace the word `make` with `ninja`. +### Building with Visual Studio +To build everything use +```console +$ cmake --build . --config Release +``` +To build a specific target use +```console +$ cmake --build . --config Release --target testsuite.level1 +``` + +## To run tests with Visual Studio +The process is similar to Linux, apart from the modification below. For parallel builds etc. you can add the options after `Release`. +```console +$ ctest -C Release +``` + +## Using the executables using Visual Studio +Visual Studio is a multiconfig generator. That means that it can build for `Release`, `Debug`, etc. simultaneously. For that reason VS will create a directory named Release, where it puts all the executables. So, to run all addv tests, use +```console +$ cd Release +$ testsuite.level1.addv.exe +``` +Then, you can use filters in the same way if you need to.
\ No newline at end of file diff --git a/gtestsuite/testinghelpers/CMakeLists.txt b/gtestsuite/testinghelpers/CMakeLists.txt index b7ef2cc3b4..264631e679 100644 --- a/gtestsuite/testinghelpers/CMakeLists.txt +++ b/gtestsuite/testinghelpers/CMakeLists.txt @@ -34,7 +34,7 @@ file(GLOB_RECURSE SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "src/*/*.cpp") add_library(testinghelpers STATIC ${SOURCES}) target_compile_definitions(testinghelpers PUBLIC REFERENCE_BLAS="${REF_LIB}") if(REF_CBLAS STREQUAL "MKL") - target_compile_definitions(testinghelpers PUBLIC MKL_CORE="${MKL_CORE_PATH}" MKL_GNU_THREAD="${MKL_GNU_THREAD_PATH}" REF_IS_MKL) + target_compile_definitions(testinghelpers PUBLIC REF_IS_MKL) elseif(REF_CBLAS STREQUAL "Netlib") target_compile_definitions(testinghelpers PUBLIC REF_IS_NETLIB) elseif(REF_CBLAS STREQUAL "OpenBLAS") @@ -52,6 +52,17 @@ if(INT_SIZE STREQUAL "32") else() target_compile_definitions(testinghelpers PUBLIC INT_SIZE=64) endif() -target_compile_definitions(testinghelpers PUBLIC ELEMENT_TYPE='${ELEMENT_TYPE}') +target_compile_definitions(testinghelpers PUBLIC BLIS_ELEMENT_TYPE='${BLIS_ELEMENT_TYPE}') target_include_directories(testinghelpers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc ${BLIS_INCLUDE}) -target_link_libraries(testinghelpers pthread) +if(LINUX) + target_link_libraries(testinghelpers pthread) +else() + find_package(Threads) + if ("${CMAKE_VERSION}" VERSION_LESS "3.1.0") + set(threads_spec ${CMAKE_THREAD_LIBS_INIT}) + else() + set(threads_spec Threads::Threads) + endif() + target_link_libraries(testinghelpers PUBLIC ${threads_spec}) + set_target_properties(testinghelpers PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() \ No newline at end of file diff --git a/gtestsuite/testinghelpers/inc/common/data_generators.h b/gtestsuite/testinghelpers/inc/common/data_generators.h index 9656cc219c..48d61f3a67 100644 --- a/gtestsuite/testinghelpers/inc/common/data_generators.h +++ b/gtestsuite/testinghelpers/inc/common/data_generators.h @@ -77,14 +77,14 
@@ void randomgenerators(int from, int to, char storage, char uplo, gtint_t m, template std::vector get_random_matrix(int from, int to, char storage, char trans, gtint_t m, gtint_t n, - gtint_t lda, char datatype = ELEMENT_TYPE ); + gtint_t lda, char datatype = BLIS_ELEMENT_TYPE ); template std::vector get_random_matrix(int from, int to, char storage, char uplo, gtint_t k, - gtint_t lda, char datatype = ELEMENT_TYPE ); + gtint_t lda, char datatype = BLIS_ELEMENT_TYPE ); template -std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx,char datatype = ELEMENT_TYPE); +std::vector get_random_vector(int from, int to, gtint_t n, gtint_t incx,char datatype = BLIS_ELEMENT_TYPE); template std::vector get_vector( gtint_t n, gtint_t incx, T value ); diff --git a/gtestsuite/testinghelpers/inc/common/refCBLAS.h b/gtestsuite/testinghelpers/inc/common/refCBLAS.h index fbdfe76e6d..f483a76e60 100644 --- a/gtestsuite/testinghelpers/inc/common/refCBLAS.h +++ b/gtestsuite/testinghelpers/inc/common/refCBLAS.h @@ -32,24 +32,46 @@ */ +#ifdef WIN32 +#include +#include +#elif defined __linux__ #include +#endif #include +/** + * This is a helper class that we use to load the symbols + * from the reference library dynamically so that we get + * the reference solution. + * Since dynamic loading can be time consuming this class works + * in the following manner. + * - We have a thread local instance of this object. That means + * that for each executable there is a global variable called + * refCBLASModule. + * - The constructor of refCBLASModule (which is called automatically) + * loads the library either with a call to dlopen (Linux) or with + * a call to LoadLibrary (Windows). + * - Similarly the destructor unloads the library. + * - The member function loadSymbol() is used to return the pointer + * to that symbol in the library, either with a call to dlsym (Linux) + * or with a call to GetProcAddress (Windows). 
+ * This means that the library is only loaded once per executable + * due to having the global variable refCBLASModule and unloaded once + * at the end. Multiple calls to loadSymbol are used to access the + * corresponding API used for reference. +*/ namespace testinghelpers { class refCBLAS { private: -#ifdef REF_IS_MKL - void *MKLCoreModule = nullptr; - void *MKLGNUThreadModule = nullptr; -#endif void *refCBLASModule = nullptr; public: refCBLAS(); ~refCBLAS(); - void* get(); + void* loadSymbol(const char*); }; } //end of testinghelpers namespace -extern thread_local testinghelpers::refCBLAS refCBLASModule; +extern thread_local testinghelpers::refCBLAS refCBLASModule; \ No newline at end of file diff --git a/gtestsuite/testinghelpers/src/common/complex_helpers.cpp b/gtestsuite/testinghelpers/src/common/complex_helpers.cpp index 90158270dc..3f8b9a27fe 100644 --- a/gtestsuite/testinghelpers/src/common/complex_helpers.cpp +++ b/gtestsuite/testinghelpers/src/common/complex_helpers.cpp @@ -89,18 +89,18 @@ dcomplex operator*(const dcomplex x, const dcomplex y) bool operator== (const scomplex x, const scomplex y) { - return {(x.real==y.real) && (x.imag==y.imag)}; + return ((x.real==y.real) && (x.imag==y.imag)); } bool operator== (const dcomplex x, const dcomplex y) { - return {(x.real==y.real) && (x.imag==y.imag)}; + return ((x.real==y.real) && (x.imag==y.imag)); } bool operator!= (const scomplex x, const scomplex y) { - return {!((x.real==y.real) && (x.imag==y.imag))}; + return (!((x.real==y.real) && (x.imag==y.imag))); } bool operator!= (const dcomplex x, const dcomplex y) { - return {!((x.real==y.real) && (x.imag==y.imag))}; + return (!((x.real==y.real) && (x.imag==y.imag))); } diff --git a/gtestsuite/testinghelpers/src/common/refCBLAS.cpp b/gtestsuite/testinghelpers/src/common/refCBLAS.cpp index 533fd2e356..12499648e1 100644 --- a/gtestsuite/testinghelpers/src/common/refCBLAS.cpp +++ b/gtestsuite/testinghelpers/src/common/refCBLAS.cpp @@ -38,40 +38,71 @@ #endif 
#include "common/refCBLAS.h" +/** + * This is a helper class that we use to load the symbols + * from the reference library dynamically so that we get + * the reference solution. + * Since dynamic loading can be time consuming this class works + * in the following manner. + * - We have a thread local instance of this object. That means + * that for each executable there is a global variable called + * refCBLASModule. + * - The constructor of refCBLASModule (which is called automatically) + * loads the library either with a call to dlopen (Linux) or with + * a call to LoadLibrary (Windows). + * - Similarly the destructor unloads the library. + * - The member function loadSymbol() is used to return the pointer + * to that symbol in the library, either with a call to dlsym (Linux) + * or with a call to GetProcAddress (Windows). + * This means that the library is only loaded once per executable + * due to having the global variable refCBLASModule and unloaded once + * at the end. Multiple calls to loadSymbol are used to access the + * corresponding API used for reference. +*/ + namespace testinghelpers { refCBLAS::refCBLAS() { + std::cout << "refCBLAS constructor\n"; if (!refCBLASModule) { -#ifdef REF_IS_MKL - // Dummy call to force linker, link OpenMP library if MKL is used. 
- omp_get_num_threads(); - MKLCoreModule = dlopen(MKL_CORE, RTLD_GLOBAL | RTLD_LAZY); - MKLGNUThreadModule = dlopen(MKL_GNU_THREAD, RTLD_GLOBAL | RTLD_LAZY); -#endif -#ifdef ENABLE_ASAN - refCBLASModule = dlopen(REFERENCE_BLAS, RTLD_LOCAL | RTLD_LAZY); +#ifdef WIN32 + refCBLASModule = LoadLibraryEx(REFERENCE_BLAS, NULL, LOAD_LIBRARY_SAFE_CURRENT_DIRS); #else - refCBLASModule = dlopen(REFERENCE_BLAS, RTLD_DEEPBIND | RTLD_LAZY); + #ifdef ENABLE_ASAN + refCBLASModule = dlopen(REFERENCE_BLAS, RTLD_LOCAL | RTLD_LAZY); + #else + refCBLASModule = dlopen(REFERENCE_BLAS, RTLD_DEEPBIND | RTLD_LAZY); + #endif #endif } if (refCBLASModule == nullptr) { +#ifndef WIN32 std::cout<(GetProcAddress((HMODULE)refCBLASModule, symbol)); +#else + return dlsym(refCBLASModule, symbol); +#endif } -void* refCBLAS::get() { return refCBLASModule; } } //end of testinghelpers namespace thread_local testinghelpers::refCBLAS refCBLASModule; diff --git a/gtestsuite/testinghelpers/src/level1/ref_addv.cpp b/gtestsuite/testinghelpers/src/level1/ref_addv.cpp index 90351b0ec2..87f4c217d7 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_addv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_addv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_addv.h" namespace testinghelpers { @@ -50,19 +49,19 @@ void ref_addv( char conj_x, gtint_t n, const T* x, gtint_t incx, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get( ), "cblas_saxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_saxpy"); } else if (typeid(T) == typeid(double)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_daxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_daxpy"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_caxpy"); 
+ ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_caxpy"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_zaxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_zaxpy"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp b/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp index 4ad4610eb9..33007e0fd3 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_amaxv.h" namespace testinghelpers { @@ -49,19 +48,19 @@ gtint_t ref_amaxv( gtint_t n, const T* x, gtint_t incx ) { /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)dlsym(refCBLASModule.get( ), "cblas_isamax"); + ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)refCBLASModule.loadSymbol("cblas_isamax"); } else if (typeid(T) == typeid(double)) { - ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)dlsym(refCBLASModule.get(), "cblas_idamax"); + ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)refCBLASModule.loadSymbol("cblas_idamax"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)dlsym(refCBLASModule.get(), "cblas_icamax"); + ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)refCBLASModule.loadSymbol("cblas_icamax"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)dlsym(refCBLASModule.get(), "cblas_izamax"); + ref_cblas_amaxv = (Fptr_ref_cblas_amaxv)refCBLASModule.loadSymbol("cblas_izamax"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp index 2f6f64ec60..373d31e0e1 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp @@ -33,7 +33,6 @@ */ 
#include "blis.h" -#include #include "level1/ref_axpbyv.h" namespace testinghelpers { @@ -51,19 +50,19 @@ void ref_axpbyv( char conj_x, gtint_t n, T alpha, const T* x, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_sscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_sscal"); } else if (typeid(T) == typeid(double)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_dscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_dscal"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_cscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_cscal"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_zscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_zscal"); } else { @@ -81,19 +80,19 @@ void ref_axpbyv( char conj_x, gtint_t n, T alpha, const T* x, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_saxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_saxpy"); } else if (typeid(T) == typeid(double)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_daxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_daxpy"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_caxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_caxpy"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_axpby = 
(Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_zaxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_zaxpy"); } else { @@ -130,19 +129,19 @@ void ref_axpbyv( char conj_x, gtint_t n, T alpha, const T* x, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_saxpby"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_saxpby"); } else if (typeid(T) == typeid(double)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_daxpby"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_daxpby"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_caxpby"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_caxpby"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_zaxpby"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_zaxpby"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp index c5541ca86a..750ac04172 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_axpyv.h" namespace testinghelpers { @@ -50,19 +49,19 @@ void ref_axpyv( char conj_x, gtint_t n, T alpha, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_saxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_saxpy"); } else if (typeid(T) == typeid(double)) { - ref_cblas_axpy = 
(Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_daxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_daxpy"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_caxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_caxpy"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_zaxpy"); + ref_cblas_axpy = (Fptr_ref_cblas_axpy)refCBLASModule.loadSymbol("cblas_zaxpy"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp index 90b70a2bab..4539ab551c 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_copyv.h" namespace testinghelpers { @@ -49,19 +48,19 @@ void ref_copyv( char conj_x, gtint_t n, const T* xp, gtint_t incx, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_copyv = (Fptr_ref_cblas_copyv)dlsym(refCBLASModule.get(), "cblas_scopy"); + ref_cblas_copyv = (Fptr_ref_cblas_copyv)refCBLASModule.loadSymbol("cblas_scopy"); } else if (typeid(T) == typeid(double)) { - ref_cblas_copyv = (Fptr_ref_cblas_copyv)dlsym(refCBLASModule.get(), "cblas_dcopy"); + ref_cblas_copyv = (Fptr_ref_cblas_copyv)refCBLASModule.loadSymbol("cblas_dcopy"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_copyv = (Fptr_ref_cblas_copyv)dlsym(refCBLASModule.get(), "cblas_ccopy"); + ref_cblas_copyv = (Fptr_ref_cblas_copyv)refCBLASModule.loadSymbol("cblas_ccopy"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_copyv = (Fptr_ref_cblas_copyv)dlsym(refCBLASModule.get(), "cblas_zcopy"); + ref_cblas_copyv = (Fptr_ref_cblas_copyv)refCBLASModule.loadSymbol("cblas_zcopy"); } else { diff --git 
a/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp b/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp index d7a098a1bf..35c4b5ec5c 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_dotv.h" namespace testinghelpers { @@ -49,11 +48,11 @@ void ref_dotv(gtint_t len, const T* xp, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_dot = (Fptr_ref_cblas_dot)dlsym(refCBLASModule.get(), "cblas_sdot"); + ref_cblas_dot = (Fptr_ref_cblas_dot)refCBLASModule.loadSymbol("cblas_sdot"); } else if (typeid(T) == typeid(double)) { - ref_cblas_dot = (Fptr_ref_cblas_dot)dlsym(refCBLASModule.get(), "cblas_ddot"); + ref_cblas_dot = (Fptr_ref_cblas_dot)refCBLASModule.loadSymbol("cblas_ddot"); } else { @@ -97,11 +96,11 @@ void ref_dotv( char conj_x, char conj_y, gtint_t len, const T* xp, gtint_t incx, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_dot = (Fptr_ref_cblas_dot)dlsym(refCBLASModule.get(), "cblas_cdotu_sub"); + ref_cblas_dot = (Fptr_ref_cblas_dot)refCBLASModule.loadSymbol("cblas_cdotu_sub"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_dot = (Fptr_ref_cblas_dot)dlsym(refCBLASModule.get(), "cblas_zdotu_sub"); + ref_cblas_dot = (Fptr_ref_cblas_dot)refCBLASModule.loadSymbol("cblas_zdotu_sub"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp b/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp index b3cdf476ab..1d08c4d438 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_dotxv.h" namespace testinghelpers { diff --git a/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp 
b/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp index 8d9e59a86d..34ea17dc1c 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_scalv.h" namespace testinghelpers { @@ -49,19 +48,19 @@ void ref_scal2v(char conjx, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_ /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_sscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_sscal"); } else if (typeid(T) == typeid(double)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_dscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_dscal"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_cscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_cscal"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_zscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_zscal"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp b/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp index c4ad195c36..5b74b91b25 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_scalv.h" namespace testinghelpers { @@ -49,19 +48,19 @@ void ref_scalv(char conjalpha, gtint_t n, T alpha, T* x, gtint_t incx) /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), 
"cblas_sscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_sscal"); } else if (typeid(T) == typeid(double)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_dscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_dscal"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_cscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_cscal"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_zscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_zscal"); } else { diff --git a/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp index 27773a08cb..d8f30dea64 100644 --- a/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp +++ b/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level1/ref_xpbyv.h" namespace testinghelpers { @@ -50,19 +49,19 @@ void ref_xpbyv( char conj_x, gtint_t n, const T* x, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_sscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_sscal"); } else if (typeid(T) == typeid(double)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_dscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_dscal"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_cscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_cscal"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), 
"cblas_zscal"); + ref_cblas_scal = (Fptr_ref_cblas_scal)refCBLASModule.loadSymbol("cblas_zscal"); } else { @@ -80,19 +79,19 @@ void ref_xpbyv( char conj_x, gtint_t n, const T* x, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_saxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_saxpy"); } else if (typeid(T) == typeid(double)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_daxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_daxpy"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_caxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_caxpy"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_zaxpy"); + ref_cblas_axpby = (Fptr_ref_cblas_axpby)refCBLASModule.loadSymbol("cblas_zaxpy"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp b/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp index 99168283f5..fac8e661db 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_gemv.h" /* @@ -73,19 +72,19 @@ void ref_gemv( char storage, char trans, char conjx, gtint_t m, gtint_t n, T alp /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get( ), "cblas_sgemv"); + ref_cblas_gemv = (Fptr_ref_cblas_gemv)refCBLASModule.loadSymbol("cblas_sgemv"); } else if (typeid(T) == typeid(double)) { - ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_dgemv"); + ref_cblas_gemv 
= (Fptr_ref_cblas_gemv)refCBLASModule.loadSymbol("cblas_dgemv"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_cgemv"); + ref_cblas_gemv = (Fptr_ref_cblas_gemv)refCBLASModule.loadSymbol("cblas_cgemv"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_zgemv"); + ref_cblas_gemv = (Fptr_ref_cblas_gemv)refCBLASModule.loadSymbol("cblas_zgemv"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_ger.cpp b/gtestsuite/testinghelpers/src/level2/ref_ger.cpp index ade3ee35e1..60857cce5c 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_ger.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_ger.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_ger.h" /* @@ -68,25 +67,25 @@ void ref_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get( ), "cblas_sger"); + ref_cblas_ger = (Fptr_ref_cblas_ger)refCBLASModule.loadSymbol("cblas_sger"); } else if (typeid(T) == typeid(double)) { - ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_dger"); + ref_cblas_ger = (Fptr_ref_cblas_ger)refCBLASModule.loadSymbol("cblas_dger"); } else if (typeid(T) == typeid(scomplex)) { if( cfy ) - ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_cgerc"); + ref_cblas_ger = (Fptr_ref_cblas_ger)refCBLASModule.loadSymbol("cblas_cgerc"); else - ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_cgeru"); + ref_cblas_ger = (Fptr_ref_cblas_ger)refCBLASModule.loadSymbol("cblas_cgeru"); } else if (typeid(T) == typeid(dcomplex)) { if( cfy ) - ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_zgerc"); + ref_cblas_ger = 
(Fptr_ref_cblas_ger)refCBLASModule.loadSymbol("cblas_zgerc"); else - ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_zgeru"); + ref_cblas_ger = (Fptr_ref_cblas_ger)refCBLASModule.loadSymbol("cblas_zgeru"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp b/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp index 93571be74f..13e7996ab2 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_hemv.h" /* @@ -68,11 +67,11 @@ void ref_hemv( char storage, char uploa, char conja, char conjx, gtint_t n, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_hemv = (Fptr_ref_cblas_hemv)dlsym(refCBLASModule.get(), "cblas_chemv"); + ref_cblas_hemv = (Fptr_ref_cblas_hemv)refCBLASModule.loadSymbol("cblas_chemv"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_hemv = (Fptr_ref_cblas_hemv)dlsym(refCBLASModule.get(), "cblas_zhemv"); + ref_cblas_hemv = (Fptr_ref_cblas_hemv)refCBLASModule.loadSymbol("cblas_zhemv"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_her.cpp b/gtestsuite/testinghelpers/src/level2/ref_her.cpp index 3be456e7bc..b9a078b7f1 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_her.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_her.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_her.h" /* @@ -65,11 +64,11 @@ void ref_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_her = (Fptr_ref_cblas_her)dlsym(refCBLASModule.get(), "cblas_cher"); + ref_cblas_her = (Fptr_ref_cblas_her)refCBLASModule.loadSymbol("cblas_cher"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_her = 
(Fptr_ref_cblas_her)dlsym(refCBLASModule.get(), "cblas_zher"); + ref_cblas_her = (Fptr_ref_cblas_her)refCBLASModule.loadSymbol("cblas_zher"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_her2.cpp b/gtestsuite/testinghelpers/src/level2/ref_her2.cpp index 266909547e..fe078008ce 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_her2.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_her2.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_her2.h" /* @@ -66,11 +65,11 @@ void ref_her2( char storage, char uploa, char conjx, char conjy, gtint_t n, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_her2 = (Fptr_ref_cblas_her2)dlsym(refCBLASModule.get(), "cblas_cher2"); + ref_cblas_her2 = (Fptr_ref_cblas_her2)refCBLASModule.loadSymbol("cblas_cher2"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_her2 = (Fptr_ref_cblas_her2)dlsym(refCBLASModule.get(), "cblas_zher2"); + ref_cblas_her2 = (Fptr_ref_cblas_her2)refCBLASModule.loadSymbol("cblas_zher2"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_symv.cpp b/gtestsuite/testinghelpers/src/level2/ref_symv.cpp index 5c4afb668f..ae976d2580 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_symv.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_symv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_symv.h" /* @@ -67,11 +66,11 @@ void ref_symv( char storage, char uploa, char conja, char conjx, gtint_t n, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_symv = (Fptr_ref_cblas_symv)dlsym(refCBLASModule.get(), "cblas_ssymv"); + ref_cblas_symv = (Fptr_ref_cblas_symv)refCBLASModule.loadSymbol("cblas_ssymv"); } else if (typeid(T) == typeid(double)) { - ref_cblas_symv = (Fptr_ref_cblas_symv)dlsym(refCBLASModule.get(), "cblas_dsymv"); + ref_cblas_symv = 
(Fptr_ref_cblas_symv)refCBLASModule.loadSymbol("cblas_dsymv"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_syr.cpp b/gtestsuite/testinghelpers/src/level2/ref_syr.cpp index b9d0f69103..c5648cc23f 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_syr.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_syr.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_syr.h" /* @@ -66,11 +65,11 @@ void ref_syr( char storage, char uploa, char conjx, gtint_t n, T alpha, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_syr = (Fptr_ref_cblas_syr)dlsym(refCBLASModule.get(), "cblas_ssyr"); + ref_cblas_syr = (Fptr_ref_cblas_syr)refCBLASModule.loadSymbol("cblas_ssyr"); } else if (typeid(T) == typeid(double)) { - ref_cblas_syr = (Fptr_ref_cblas_syr)dlsym(refCBLASModule.get(), "cblas_dsyr"); + ref_cblas_syr = (Fptr_ref_cblas_syr)refCBLASModule.loadSymbol("cblas_dsyr"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp b/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp index 2fdc09362c..fe593d1c41 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_syr2.h" /* @@ -66,11 +65,11 @@ void ref_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_syr2 = (Fptr_ref_cblas_syr2)dlsym(refCBLASModule.get(), "cblas_ssyr2"); + ref_cblas_syr2 = (Fptr_ref_cblas_syr2)refCBLASModule.loadSymbol("cblas_ssyr2"); } else if (typeid(T) == typeid(double)) { - ref_cblas_syr2 = (Fptr_ref_cblas_syr2)dlsym(refCBLASModule.get(), "cblas_dsyr2"); + ref_cblas_syr2 = (Fptr_ref_cblas_syr2)refCBLASModule.loadSymbol("cblas_dsyr2"); } else { diff --git 
a/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp b/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp index 62beea0520..1e18b35e15 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_trmv.h" /* @@ -74,19 +73,19 @@ void ref_trmv( char storage, char uploa, char transa, char diaga, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_trmv = (Fptr_ref_cblas_trmv)dlsym(refCBLASModule.get(), "cblas_strmv"); + ref_cblas_trmv = (Fptr_ref_cblas_trmv)refCBLASModule.loadSymbol("cblas_strmv"); } else if (typeid(T) == typeid(double)) { - ref_cblas_trmv = (Fptr_ref_cblas_trmv)dlsym(refCBLASModule.get(), "cblas_dtrmv"); + ref_cblas_trmv = (Fptr_ref_cblas_trmv)refCBLASModule.loadSymbol("cblas_dtrmv"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_trmv = (Fptr_ref_cblas_trmv)dlsym(refCBLASModule.get(), "cblas_ctrmv"); + ref_cblas_trmv = (Fptr_ref_cblas_trmv)refCBLASModule.loadSymbol("cblas_ctrmv"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_trmv = (Fptr_ref_cblas_trmv)dlsym(refCBLASModule.get(), "cblas_ztrmv"); + ref_cblas_trmv = (Fptr_ref_cblas_trmv)refCBLASModule.loadSymbol("cblas_ztrmv"); } else { diff --git a/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp b/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp index 455ed8455d..5d92a3c3e4 100644 --- a/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp +++ b/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level2/ref_trsv.h" /* @@ -73,19 +72,19 @@ void ref_trsv( char storage, char uploa, char transa, char diaga, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_trsv = (Fptr_ref_cblas_trsv)dlsym(refCBLASModule.get(), "cblas_strsv"); + ref_cblas_trsv 
= (Fptr_ref_cblas_trsv)refCBLASModule.loadSymbol("cblas_strsv"); } else if (typeid(T) == typeid(double)) { - ref_cblas_trsv = (Fptr_ref_cblas_trsv)dlsym(refCBLASModule.get(), "cblas_dtrsv"); + ref_cblas_trsv = (Fptr_ref_cblas_trsv)refCBLASModule.loadSymbol("cblas_dtrsv"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_trsv = (Fptr_ref_cblas_trsv)dlsym(refCBLASModule.get(), "cblas_ctrsv"); + ref_cblas_trsv = (Fptr_ref_cblas_trsv)refCBLASModule.loadSymbol("cblas_ctrsv"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_trsv = (Fptr_ref_cblas_trsv)dlsym(refCBLASModule.get(), "cblas_ztrsv"); + ref_cblas_trsv = (Fptr_ref_cblas_trsv)refCBLASModule.loadSymbol("cblas_ztrsv"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp index 6a5987a363..52589ff233 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_gemm.h" /* @@ -83,19 +82,19 @@ void ref_gemm(char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, gtint_ /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_gemm = (Fptr_ref_cblas_gemm)dlsym(refCBLASModule.get( ), "cblas_sgemm"); + ref_cblas_gemm = (Fptr_ref_cblas_gemm)refCBLASModule.loadSymbol("cblas_sgemm"); } else if (typeid(T) == typeid(double)) { - ref_cblas_gemm = (Fptr_ref_cblas_gemm)dlsym(refCBLASModule.get(), "cblas_dgemm"); + ref_cblas_gemm = (Fptr_ref_cblas_gemm)refCBLASModule.loadSymbol("cblas_dgemm"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_gemm = (Fptr_ref_cblas_gemm)dlsym(refCBLASModule.get(), "cblas_cgemm"); + ref_cblas_gemm = (Fptr_ref_cblas_gemm)refCBLASModule.loadSymbol("cblas_cgemm"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_gemm = (Fptr_ref_cblas_gemm)dlsym(refCBLASModule.get(), "cblas_zgemm"); + ref_cblas_gemm 
= (Fptr_ref_cblas_gemm)refCBLASModule.loadSymbol("cblas_zgemm"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp index 4c232e643b..8d260aefb6 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_gemm.h" #include "level3/ref_gemmt.h" @@ -135,19 +134,19 @@ void ref_gemmt ( /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get( ), "cblas_sgemmt"); + ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)refCBLASModule.loadSymbol("cblas_sgemmt"); } else if (typeid(T) == typeid(double)) { - ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get(), "cblas_dgemmt"); + ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)refCBLASModule.loadSymbol("cblas_dgemmt"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get(), "cblas_cgemmt"); + ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)refCBLASModule.loadSymbol("cblas_cgemmt"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get(), "cblas_zgemmt"); + ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)refCBLASModule.loadSymbol("cblas_zgemmt"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp b/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp index 8bbafc0afe..45dce9ca43 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_hemm.h" namespace testinghelpers { @@ -66,11 +65,11 @@ void ref_hemm ( /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_hemm = 
(Fptr_ref_cblas_hemm)dlsym(refCBLASModule.get(), "cblas_chemm"); + ref_cblas_hemm = (Fptr_ref_cblas_hemm)refCBLASModule.loadSymbol("cblas_chemm"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_hemm = (Fptr_ref_cblas_hemm)dlsym(refCBLASModule.get(), "cblas_zhemm"); + ref_cblas_hemm = (Fptr_ref_cblas_hemm)refCBLASModule.loadSymbol("cblas_zhemm"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp b/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp index f7303ed998..25030d7d42 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_her2k.h" namespace testinghelpers { @@ -65,11 +64,11 @@ void ref_her2k( /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_her2k = (Fptr_ref_cblas_her2k)dlsym(refCBLASModule.get(), "cblas_cher2k"); + ref_cblas_her2k = (Fptr_ref_cblas_her2k)refCBLASModule.loadSymbol("cblas_cher2k"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_her2k = (Fptr_ref_cblas_her2k)dlsym(refCBLASModule.get(), "cblas_zher2k"); + ref_cblas_her2k = (Fptr_ref_cblas_her2k)refCBLASModule.loadSymbol("cblas_zher2k"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_herk.cpp b/gtestsuite/testinghelpers/src/level3/ref_herk.cpp index 1f6c48bdce..6516833d88 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_herk.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_herk.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_herk.h" namespace testinghelpers { @@ -64,11 +63,11 @@ void ref_herk( /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(scomplex)) { - ref_cblas_herk = (Fptr_ref_cblas_herk)dlsym(refCBLASModule.get(), "cblas_cherk"); + ref_cblas_herk = (Fptr_ref_cblas_herk)refCBLASModule.loadSymbol("cblas_cherk"); } else 
if (typeid(T) == typeid(dcomplex)) { - ref_cblas_herk = (Fptr_ref_cblas_herk)dlsym(refCBLASModule.get(), "cblas_zherk"); + ref_cblas_herk = (Fptr_ref_cblas_herk)refCBLASModule.loadSymbol("cblas_zherk"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_symm.cpp b/gtestsuite/testinghelpers/src/level3/ref_symm.cpp index a784b804c1..fa13613327 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_symm.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_symm.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_symm.h" namespace testinghelpers { @@ -66,19 +65,19 @@ void ref_symm ( /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_symm = (Fptr_ref_cblas_symm)dlsym(refCBLASModule.get( ), "cblas_ssymm"); + ref_cblas_symm = (Fptr_ref_cblas_symm)refCBLASModule.loadSymbol("cblas_ssymm"); } else if (typeid(T) == typeid(double)) { - ref_cblas_symm = (Fptr_ref_cblas_symm)dlsym(refCBLASModule.get(), "cblas_dsymm"); + ref_cblas_symm = (Fptr_ref_cblas_symm)refCBLASModule.loadSymbol("cblas_dsymm"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_symm = (Fptr_ref_cblas_symm)dlsym(refCBLASModule.get(), "cblas_csymm"); + ref_cblas_symm = (Fptr_ref_cblas_symm)refCBLASModule.loadSymbol("cblas_csymm"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_symm = (Fptr_ref_cblas_symm)dlsym(refCBLASModule.get(), "cblas_zsymm"); + ref_cblas_symm = (Fptr_ref_cblas_symm)refCBLASModule.loadSymbol("cblas_zsymm"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp b/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp index 49cb1cf5af..41ae007f6a 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_syr2k.h" namespace testinghelpers { @@ -66,19 +65,19 @@ void ref_syr2k( /* Check the typename T passed to this function 
template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)dlsym(refCBLASModule.get( ), "cblas_ssyr2k"); + ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)refCBLASModule.loadSymbol("cblas_ssyr2k"); } else if (typeid(T) == typeid(double)) { - ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)dlsym(refCBLASModule.get(), "cblas_dsyr2k"); + ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)refCBLASModule.loadSymbol("cblas_dsyr2k"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)dlsym(refCBLASModule.get(), "cblas_csyr2k"); + ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)refCBLASModule.loadSymbol("cblas_csyr2k"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)dlsym(refCBLASModule.get(), "cblas_zsyr2k"); + ref_cblas_syr2k = (Fptr_ref_cblas_syr2k)refCBLASModule.loadSymbol("cblas_zsyr2k"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp b/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp index a834b3b0d7..6a1d009cb4 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_syrk.h" namespace testinghelpers { @@ -65,19 +64,19 @@ void ref_syrk( /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_syrk = (Fptr_ref_cblas_syrk)dlsym(refCBLASModule.get( ), "cblas_ssyrk"); + ref_cblas_syrk = (Fptr_ref_cblas_syrk)refCBLASModule.loadSymbol("cblas_ssyrk"); } else if (typeid(T) == typeid(double)) { - ref_cblas_syrk = (Fptr_ref_cblas_syrk)dlsym(refCBLASModule.get(), "cblas_dsyrk"); + ref_cblas_syrk = (Fptr_ref_cblas_syrk)refCBLASModule.loadSymbol("cblas_dsyrk"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_syrk = (Fptr_ref_cblas_syrk)dlsym(refCBLASModule.get(), "cblas_csyrk"); + ref_cblas_syrk = 
(Fptr_ref_cblas_syrk)refCBLASModule.loadSymbol("cblas_csyrk"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_syrk = (Fptr_ref_cblas_syrk)dlsym(refCBLASModule.get(), "cblas_zsyrk"); + ref_cblas_syrk = (Fptr_ref_cblas_syrk)refCBLASModule.loadSymbol("cblas_zsyrk"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp b/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp index ebf08be5ca..0faa1e52fb 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_trmm.h" /* @@ -75,19 +74,19 @@ void ref_trmm( char storage, char side, char uploa, char transa, char diaga, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_trmm = (Fptr_ref_cblas_trmm)dlsym(refCBLASModule.get( ), "cblas_strmm"); + ref_cblas_trmm = (Fptr_ref_cblas_trmm)refCBLASModule.loadSymbol("cblas_strmm"); } else if (typeid(T) == typeid(double)) { - ref_cblas_trmm = (Fptr_ref_cblas_trmm)dlsym(refCBLASModule.get(), "cblas_dtrmm"); + ref_cblas_trmm = (Fptr_ref_cblas_trmm)refCBLASModule.loadSymbol("cblas_dtrmm"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_trmm = (Fptr_ref_cblas_trmm)dlsym(refCBLASModule.get(), "cblas_ctrmm"); + ref_cblas_trmm = (Fptr_ref_cblas_trmm)refCBLASModule.loadSymbol("cblas_ctrmm"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_trmm = (Fptr_ref_cblas_trmm)dlsym(refCBLASModule.get(), "cblas_ztrmm"); + ref_cblas_trmm = (Fptr_ref_cblas_trmm)refCBLASModule.loadSymbol("cblas_ztrmm"); } else { diff --git a/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp b/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp index 2633b63b43..cb6e1283d2 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_trmm3.h" /* 
diff --git a/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp b/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp index c5d326a5eb..6f56c069e1 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "level3/ref_trsm.h" /* @@ -76,19 +75,19 @@ void ref_trsm( char storage, char side, char uploa, char transa, char diaga, /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_trsm = (Fptr_ref_cblas_trsm)dlsym(refCBLASModule.get( ), "cblas_strsm"); + ref_cblas_trsm = (Fptr_ref_cblas_trsm)refCBLASModule.loadSymbol("cblas_strsm"); } else if (typeid(T) == typeid(double)) { - ref_cblas_trsm = (Fptr_ref_cblas_trsm)dlsym(refCBLASModule.get(), "cblas_dtrsm"); + ref_cblas_trsm = (Fptr_ref_cblas_trsm)refCBLASModule.loadSymbol("cblas_dtrsm"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_trsm = (Fptr_ref_cblas_trsm)dlsym(refCBLASModule.get(), "cblas_ctrsm"); + ref_cblas_trsm = (Fptr_ref_cblas_trsm)refCBLASModule.loadSymbol("cblas_ctrsm"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_trsm = (Fptr_ref_cblas_trsm)dlsym(refCBLASModule.get(), "cblas_ztrsm"); + ref_cblas_trsm = (Fptr_ref_cblas_trsm)refCBLASModule.loadSymbol("cblas_ztrsm"); } else { diff --git a/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp b/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp index 75021e412e..95bc2e1e93 100644 --- a/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp +++ b/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "util/ref_nrm2.h" /* @@ -56,19 +55,19 @@ RT ref_nrm2(gtint_t n, T* x, gtint_t incx) { /* Check the typename T passed to this function template and call respective function.*/ if (typeid(T) == typeid(float)) { - ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)dlsym(refCBLASModule.get( ), "cblas_snrm2"); + ref_cblas_nrm2 = 
(Fptr_ref_cblas_nrm2)refCBLASModule.loadSymbol("cblas_snrm2"); } else if (typeid(T) == typeid(double)) { - ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)dlsym(refCBLASModule.get(), "cblas_dnrm2"); + ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)refCBLASModule.loadSymbol("cblas_dnrm2"); } else if (typeid(T) == typeid(scomplex)) { - ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)dlsym(refCBLASModule.get(), "cblas_scnrm2"); + ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)refCBLASModule.loadSymbol("cblas_scnrm2"); } else if (typeid(T) == typeid(dcomplex)) { - ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)dlsym(refCBLASModule.get(), "cblas_dznrm2"); + ref_cblas_nrm2 = (Fptr_ref_cblas_nrm2)refCBLASModule.loadSymbol("cblas_dznrm2"); } else { diff --git a/gtestsuite/testsuite/CMakeLists.txt b/gtestsuite/testsuite/CMakeLists.txt index 135403e0f8..f03b7463b0 100644 --- a/gtestsuite/testsuite/CMakeLists.txt +++ b/gtestsuite/testsuite/CMakeLists.txt @@ -43,6 +43,26 @@ set(BUILD_GTEST ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) include(GoogleTest) +# Set corresponding environment variables when we compare against MKL. +if(REF_CBLAS STREQUAL "MKL") + # Since we test with MKL as reference we need to ensure that the correct interface is being picked up from mkl_rt library. + if(INT_SIZE STREQUAL "32") + set(MKL_ENV ${MKL_ENV};MKL_INTERFACE_LAYER=LP64) + else() + set(MKL_ENV ${MKL_ENV};MKL_INTERFACE_LAYER=ILP64) + endif() + # Chose which threading library to use with MKL depending on the option. + if(MKL_ENABLE_THREADING STREQUAL "no") + set(MKL_ENV ${MKL_ENV};MKL_THREADING_LAYER=SEQUENTIAL) + else() + if(WIN32) + set(MKL_ENV ${MKL_ENV};MKL_THREADING_LAYER=INTEL) + else() # if the system is Linux + set(MKL_ENV ${MKL_ENV};MKL_THREADING_LAYER=GNU) + endif() + endif() +endif() + # Return the list of the subdirectories in the directory curdir. 
MACRO(SUBDIRLIST result curdir) FILE(GLOB children RELATIVE ${curdir} ${curdir}/*) @@ -70,14 +90,8 @@ foreach(dir ${DIRS}) target_include_directories(${target_name}.${dir}.${subdir} PUBLIC ${BLIS_INCLUDE} ${CMAKE_SOURCE_DIR}/testinghelpers/inc ${CMAKE_SOURCE_DIR}/testsuite/) target_link_libraries(${target_name}.${dir}.${subdir} gtest gtest_main testinghelpers ${Blis_LIBRARY} ${COMMON_LIBS}) # if we test serial BLIS, but MKL is used as a reference we still need to set up OpenMP. - if( (ENABLE_THREADING STREQUAL "openmp") OR (REF_CBLAS STREQUAL "MKL")) - if(LINUX) - if(OpenMP_LIBRARY STREQUAL "GNU") - target_link_libraries(${target_name}.${dir}.${subdir} -fopenmp) - else() - target_link_libraries(${target_name}.${dir}.${subdir} iomp5) - endif() - endif() + if( (ENABLE_THREADING STREQUAL "openmp") OR (MKL_ENABLE_THREADING STREQUAL "openmp")) + target_link_libraries(${target_name}.${dir}.${subdir} OpenMP::OpenMP_CXX) endif() if(ENABLE_ASAN) target_link_libraries(${target_name}.${dir}.${subdir} -fsanitize=address) @@ -92,8 +106,14 @@ foreach(dir ${DIRS}) else() # BLIS_TYPED option target_compile_definitions(${target_name}.${dir}.${subdir} PUBLIC TEST_BLIS_TYPED) endif() - target_compile_definitions(${target_name}.${dir}.${subdir} PUBLIC ELEMENT_TYPE='${ELEMENT_TYPE}') + target_compile_definitions(${target_name}.${dir}.${subdir} PUBLIC BLIS_ELEMENT_TYPE='${BLIS_ELEMENT_TYPE}') add_test(NAME ${target_name}.${dir}.${subdir} COMMAND ${target_name}.${dir}.${subdir}) + if(REF_CBLAS STREQUAL "MKL") + set_property(TEST ${target_name}.${dir}.${subdir} PROPERTY ENVIRONMENT ${MKL_ENV}) + endif() + if(BLIS_LINKING_TYPE STREQUAL "shared") + set_property(TEST ${target_name}.${dir}.${subdir} PROPERTY ENVIRONMENT_MODIFICATION "PATH=path_list_prepend:${BLIS_LIB_PATH}") + endif() add_dependencies(${target_name}.${dir} ${target_name}.${dir}.${subdir}) endif() endforeach() diff --git a/gtestsuite/testsuite/level1/addv/test_addv.h b/gtestsuite/testsuite/level1/addv/test_addv.h index 
a535e404f3..cf9cfd86b4 100644 --- a/gtestsuite/testsuite/level1/addv/test_addv.h +++ b/gtestsuite/testsuite/level1/addv/test_addv.h @@ -61,7 +61,7 @@ void test_addv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - addv( conjx, n, x.data(), incx, y.data(), incy ); + addv( conjx, n, x.data(), incx, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. diff --git a/gtestsuite/testsuite/level1/amaxv/test_amaxv.h b/gtestsuite/testsuite/level1/amaxv/test_amaxv.h index 0d2ea890dc..a02464e8ee 100644 --- a/gtestsuite/testsuite/level1/amaxv/test_amaxv.h +++ b/gtestsuite/testsuite/level1/amaxv/test_amaxv.h @@ -58,10 +58,10 @@ void test_amaxv( gtint_t n, gtint_t incx, double thresh ) //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - gtint_t idx = amaxv( n, x.data(), incx ); + gtint_t idx = amaxv( n, x.data(), incx ); //---------------------------------------------------------- // Compute component-wise error. 
//---------------------------------------------------------- - computediff( idx, idx_ref ); -} \ No newline at end of file + EXPECT_EQ( idx, idx_ref ); +} diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp index 5447f57aff..d43e5a70f0 100644 --- a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp @@ -64,7 +64,7 @@ TEST_P( zaxpbyvGenericTest, RandomData ) T beta = std::get<5>(GetParam()); // Set the threshold for the errors: - double thresh = 2*testinghelpers::getEpsilon(); + double thresh = 20*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call generic test body using those parameters diff --git a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp index ff8cc67b64..437518c498 100644 --- a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp @@ -142,8 +142,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( ::testing::Values('n'), // n: use x, not conj(x) (since it is real) ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector - ::testing::Values(gtint_t(2), gtint_t(-2)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x - ::testing::Values(gtint_t(3), gtint_t(-3)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y + ::testing::Values(gtint_t(2)), // stride size for x + ::testing::Values(gtint_t(3)), // stride size for y ::testing::Values(float(4.0)) // alpha ), ::saxpyvGenericTestPrint() diff --git a/gtestsuite/testsuite/level1/dotv/dotv.h b/gtestsuite/testsuite/level1/dotv/dotv.h index c65f229695..2120b40ea8 100644 --- a/gtestsuite/testsuite/level1/dotv/dotv.h +++ b/gtestsuite/testsuite/level1/dotv/dotv.h @@ -54,16 +54,24 @@ template static void dotv_(gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy, T* rho) { - if constexpr 
(std::is_same::value) - *rho = sdot_( &n, x, &incx, y, &incy ); - else if constexpr (std::is_same::value) - *rho = ddot_( &n, x, &incx, y, &incy ); - else if constexpr (std::is_same::value) - *rho = cdotu_( &n, x, &incx, y, &incy ); - else if constexpr (std::is_same::value) - *rho = zdotu_( &n, x, &incx, y, &incy ); - else - throw std::runtime_error("Error in testsuite/level1/dotv.h: Invalid typename in dotv_()."); + if constexpr (std::is_same::value) + *rho = sdot_(&n, x, &incx, y, &incy); + else if constexpr (std::is_same::value) + *rho = ddot_( &n, x, &incx, y, &incy ); + else if constexpr (std::is_same::value) + #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL + *rho = cdotu_(&n, x, &incx, y, &incy); + #else + cdotu_(rho, &n, x, &incx, y, &incy); + #endif + else if constexpr (std::is_same::value) + #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL + *rho = zdotu_(&n, x, &incx, y, &incy); + #else + zdotu_(rho, &n, x, &incx, y, &incy); + #endif + else + throw std::runtime_error("Error in testsuite/level1/dotv.h: Invalid typename in dotv_()."); } template diff --git a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h index 1fe5b50614..6562e3dc46 100644 --- a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h +++ b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h @@ -58,15 +58,15 @@ static void test_dotxv( gtint_t n, char conjx, char conjy, T alpha, // Create a copy of y so that we can check reference results. std::vector y_ref(y); T rho_ref; - testinghelpers::initone(rho_ref); + testinghelpers::initone(rho_ref); testinghelpers::ref_dotxv( conjx, conjy, n, alpha, x.data(), incx, y.data(), incy, beta, &rho_ref ); //---------------------------------------------------------- // Call BLIS function. 
//---------------------------------------------------------- T rho; - testinghelpers::initone(rho); - dotxv( conjx, conjy, n, &alpha, x.data(), incx, y.data(), incy, &beta, &rho ); + testinghelpers::initone(rho); + dotxv( conjx, conjy, n, &alpha, x.data(), incx, y.data(), incy, &beta, &rho ); //---------------------------------------------------------- // Compute error. diff --git a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp index 660d0450d8..3e5cf70b1e 100644 --- a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp +++ b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp @@ -46,7 +46,7 @@ TYPED_TEST(xscalv, zero_alpha_x_fp) gtint_t n = 10, incx = 1; std::vector x(n); // Initialize x with random numbers. - testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), ELEMENT_TYPE ); + testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), BLIS_ELEMENT_TYPE ); std::vector x_ref(x); T alpha = T{0}; @@ -70,7 +70,7 @@ TYPED_TEST(xscalv, zero_alpha_x_inf) gtint_t n = 10, incx = 1; std::vector x(n); // Initialize x with random numbers. - testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), ELEMENT_TYPE ); + testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), BLIS_ELEMENT_TYPE ); x[3] = 1.0/0.0; std::vector x_ref(x); T alpha = T{0}; diff --git a/gtestsuite/testsuite/level1/setv/test_setv.h b/gtestsuite/testsuite/level1/setv/test_setv.h index 1fa4d3f6ab..e5521aafe8 100644 --- a/gtestsuite/testsuite/level1/setv/test_setv.h +++ b/gtestsuite/testsuite/level1/setv/test_setv.h @@ -61,7 +61,7 @@ void test_setv( char conjalpha, gtint_t n, T alpha, gtint_t incx ) //---------------------------------------------------------- // Call BLIS function. 
//---------------------------------------------------------- - setv( conjalpha, n, &alpha, x.data(), incx ); + setv( conjalpha, n, &alpha, x.data(), incx ); //---------------------------------------------------------- // Compute component-wise error. diff --git a/gtestsuite/testsuite/level1/subv/test_subv.h b/gtestsuite/testsuite/level1/subv/test_subv.h index db9c64bbaf..b61b1c50eb 100644 --- a/gtestsuite/testsuite/level1/subv/test_subv.h +++ b/gtestsuite/testsuite/level1/subv/test_subv.h @@ -61,7 +61,7 @@ void test_subv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh //---------------------------------------------------------- // Call BLIS function. //---------------------------------------------------------- - subv( conjx, n, x.data(), incx, y.data(), incy ); + subv( conjx, n, x.data(), incx, y.data(), incy ); //---------------------------------------------------------- // Compute component-wise error. diff --git a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp index 03b7762d79..5403ca19fc 100644 --- a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp @@ -78,7 +78,7 @@ TEST_P(cgemvTest, RandomData) gtint_t lda_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class cgemvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_cgemv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_cgemv"; + std::string str_name = "bli_cgemv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + transa+conjx; diff --git a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp index 7357097204..79249202d1 
100644 --- a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp @@ -78,7 +78,7 @@ TEST_P(dgemvTest, RandomData) gtint_t lda_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class dgemvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dgemv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dgemv"; + std::string str_name = "bli_dgemv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + transa+conjx; diff --git a/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp index ec0d19bd4a..ec726ff56b 100644 --- a/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp @@ -78,7 +78,7 @@ TEST_P(sgemvTest, RandomData) gtint_t lda_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class sgemvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_sgemv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_sgemv"; + std::string str_name = "bli_sgemv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + transa+conjx; diff --git a/gtestsuite/testsuite/level2/gemv/test_gemv.h b/gtestsuite/testsuite/level2/gemv/test_gemv.h index a3b3ccf653..7175b07fc2 100644 --- a/gtestsuite/testsuite/level2/gemv/test_gemv.h +++ b/gtestsuite/testsuite/level2/gemv/test_gemv.h @@ -64,13 +64,13 @@ void test_gemv( char storage, char trnsa, char conjx, 
gtint_t m, gtint_t n, //---------------------------------------------------------- // Call BLIS function //---------------------------------------------------------- - gemv( storage, trnsa, conjx, m, n, &alpha, a.data(), lda, + gemv( storage, trnsa, conjx, m, n, &alpha, a.data(), lda, x.data(), incx, &beta, y.data(), incy ); //---------------------------------------------------------- // Call reference implementation. //---------------------------------------------------------- - testinghelpers::ref_gemv( storage, trnsa, conjx, m, n, alpha, a.data(), + testinghelpers::ref_gemv( storage, trnsa, conjx, m, n, alpha, a.data(), lda, x.data(), incx, beta, y_ref.data(), incy ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp index 94700a36b7..44903e9347 100644 --- a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp @@ -78,7 +78,7 @@ TEST_P(zgemvTest, RandomData) gtint_t lda_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class zgemvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zgemv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zgemv"; + std::string str_name = "bli_zgemv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + transa+conjx; diff --git a/gtestsuite/testsuite/level2/ger/cger_generic.cpp b/gtestsuite/testsuite/level2/ger/cger_generic.cpp index 29646c656e..b3bad3620e 100644 --- a/gtestsuite/testsuite/level2/ger/cger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/cger_generic.cpp @@ -75,7 +75,7 @@ TEST_P(cgerTest, RandomData) gtint_t lda_inc = 
std::get<8>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -101,7 +101,7 @@ class cgerTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_cger"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_cger"; + std::string str_name = "bli_cger"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + conjx+conjy; diff --git a/gtestsuite/testsuite/level2/ger/dger_generic.cpp b/gtestsuite/testsuite/level2/ger/dger_generic.cpp index b8142cb685..d25e5bd16f 100644 --- a/gtestsuite/testsuite/level2/ger/dger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/dger_generic.cpp @@ -75,7 +75,7 @@ TEST_P(dgerTest, RandomData) gtint_t lda_inc = std::get<8>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -101,7 +101,7 @@ class dgerTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dger"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dger"; + std::string str_name = "bli_dger"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + conjx+conjy; diff --git a/gtestsuite/testsuite/level2/ger/sger_generic.cpp b/gtestsuite/testsuite/level2/ger/sger_generic.cpp index 0dc66d658b..7298224040 100644 --- a/gtestsuite/testsuite/level2/ger/sger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/sger_generic.cpp @@ -75,7 +75,7 @@ TEST_P(sgerTest, RandomData) gtint_t lda_inc = std::get<8>(GetParam()); // Set the threshold for the errors: - double thresh = 4*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 4*(std::max)(m,n)*testinghelpers::getEpsilon(); 
//---------------------------------------------------------- // Call test body using these parameters @@ -101,7 +101,7 @@ class sgerTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_sger"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_sger"; + std::string str_name = "bli_sger"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + conjx+conjy; diff --git a/gtestsuite/testsuite/level2/ger/test_ger.h b/gtestsuite/testsuite/level2/ger/test_ger.h index fd43d8fb49..13ef4f7596 100644 --- a/gtestsuite/testsuite/level2/ger/test_ger.h +++ b/gtestsuite/testsuite/level2/ger/test_ger.h @@ -60,13 +60,13 @@ void test_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, //---------------------------------------------------------- // Call BLIS function //---------------------------------------------------------- - ger( storage, conjx, conjy, m, n, &alpha, x.data(), incx, + ger( storage, conjx, conjy, m, n, &alpha, x.data(), incx, y.data(), incy, a.data(), lda ); //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_ger( storage, conjx, conjy, m, n, alpha, + testinghelpers::ref_ger( storage, conjx, conjy, m, n, alpha, x.data(), incx, y.data(), incy, a_ref.data(), lda ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level2/ger/zger_generic.cpp b/gtestsuite/testsuite/level2/ger/zger_generic.cpp index 293d7c5f88..b5fd790703 100644 --- a/gtestsuite/testsuite/level2/ger/zger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/zger_generic.cpp @@ -75,7 +75,7 @@ TEST_P(zgerTest, RandomData) gtint_t lda_inc = std::get<8>(GetParam()); // Set the threshold for the errors: - double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon(); + double thresh = 2*(std::max)(m,n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -101,7 +101,7 @@ class zgerTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zger"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zger"; + std::string str_name = "bli_zger"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + conjx+conjy; diff --git a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp index b59ee251fd..33aebd8125 100644 --- a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp +++ b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp @@ -105,7 +105,7 @@ class chemvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_chemv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_chemv"; + std::string str_name = "bli_chemv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conja+conjx; diff --git a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp index 7ee8c9b21f..8e116b186e 100644 --- a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp +++ 
b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp @@ -105,7 +105,7 @@ class zhemvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zhemv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zhemv"; + std::string str_name = "bli_zhemv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conja+conjx; diff --git a/gtestsuite/testsuite/level2/her/cher_generic.cpp b/gtestsuite/testsuite/level2/her/cher_generic.cpp index fc1797ec34..9ad83a597f 100644 --- a/gtestsuite/testsuite/level2/her/cher_generic.cpp +++ b/gtestsuite/testsuite/level2/her/cher_generic.cpp @@ -93,7 +93,7 @@ class cherTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_cher"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_cher"; + std::string str_name = "bli_cher"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx; diff --git a/gtestsuite/testsuite/level2/her/zher_generic.cpp b/gtestsuite/testsuite/level2/her/zher_generic.cpp index eb5d6b40e4..198e0a3bdb 100644 --- a/gtestsuite/testsuite/level2/her/zher_generic.cpp +++ b/gtestsuite/testsuite/level2/her/zher_generic.cpp @@ -93,7 +93,7 @@ class zherTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zher"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zher"; + std::string str_name = "bli_zher"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx; diff --git a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp index 472d30c745..4df3e6dda3 100644 --- a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp +++ b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp @@ -101,7 +101,7 @@ class cher2TestPrint { #elif TEST_CBLAS std::string str_name = "cblas_cher2"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_cher2"; + std::string str_name = "bli_cher2"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx+conjy; diff --git 
a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp index f09e8fb104..19723abd6f 100644 --- a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp +++ b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp @@ -101,7 +101,7 @@ class zher2TestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zher2"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zher2"; + std::string str_name = "bli_zher2"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx+conjy; diff --git a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp index d768de0734..0e959e759b 100644 --- a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp +++ b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp @@ -105,7 +105,7 @@ class dsymvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dsymv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dsymv"; + std::string str_name = "bli_dsymv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conja+conjx; diff --git a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp index 520befd98f..11ac8d71e8 100644 --- a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp +++ b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp @@ -105,7 +105,7 @@ class ssymvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ssymv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ssymv"; + std::string str_name = "bli_ssymv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conja+conjx; diff --git a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp index e2aef734ed..784fa63ca6 100644 --- a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp +++ b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp @@ -93,7 +93,7 @@ class dsyrTestPrint { #elif TEST_CBLAS std::string 
str_name = "cblas_dsyr"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dsyr"; + std::string str_name = "bli_dsyr"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx; diff --git a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp index 66d4d1ce0e..3fb8a17570 100644 --- a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp +++ b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp @@ -93,7 +93,7 @@ class ssyrTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ssyr"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ssyr"; + std::string str_name = "bli_ssyr"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx; diff --git a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp index 07266866f7..cbbf06ea84 100644 --- a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp +++ b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp @@ -101,7 +101,7 @@ class dsyr2TestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dsyr2"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dsyr2"; + std::string str_name = "bli_dsyr2"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx+conjy; diff --git a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp index d0ccfb3e79..261921746e 100644 --- a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp +++ b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp @@ -101,7 +101,7 @@ class ssyr2TestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ssyr2"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ssyr2"; + std::string str_name = "bli_ssyr2"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+conjx+conjy; diff --git a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp index 
a77120e69f..0c24ba588a 100644 --- a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp @@ -97,7 +97,7 @@ class ctrmvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ctrmv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ctrmv"; + std::string str_name = "bli_ctrmv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp index cd3e123a9d..c825d93be5 100644 --- a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp @@ -97,7 +97,7 @@ class dtrmvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dtrmv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dtrmv"; + std::string str_name = "bli_dtrmv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp index 5560dc6094..bd4caad329 100644 --- a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp @@ -97,7 +97,7 @@ class strmvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_strmv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_strmv"; + std::string str_name = "bli_strmv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp index a3868e61dd..4e76623824 100644 --- a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp @@ -97,7 +97,7 @@ class ztrmvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ztrmv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ztrmv"; + std::string str_name = 
"bli_ztrmv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp index 09e9c05a4d..1652a74e49 100644 --- a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp @@ -97,7 +97,7 @@ class ctrsvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ctrsv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ctrsv"; + std::string str_name = "bli_ctrsv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp index ac74f828e9..fb4a8af541 100644 --- a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp @@ -97,7 +97,7 @@ class dtrsvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dtrsv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dtrsv"; + std::string str_name = "bli_dtrsv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp index ed7d26d713..7dcf457134 100644 --- a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp @@ -97,7 +97,7 @@ class strsvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_strsv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_strsv"; + std::string str_name = "bli_strsv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp index 97f1c3440d..1cc4fbf34b 100644 --- a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp +++ 
b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp @@ -97,7 +97,7 @@ class ztrsvTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ztrsv"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ztrsv"; + std::string str_name = "bli_ztrsv"; #endif str_name = str_name + "_" + sfm; str_name = str_name + "_" + uploa+transa; diff --git a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp index 447a96c459..4c76593b7a 100644 --- a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp @@ -107,7 +107,7 @@ class DGemmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dgemm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dgemm"; + std::string str_name = "bli_dgemm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + tsa + tsb; diff --git a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp index f7683ea7eb..4d1eb7f4c9 100644 --- a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp @@ -107,7 +107,7 @@ class SGemmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_sgemm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_sgemm"; + std::string str_name = "bli_sgemm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + tsa + tsb; diff --git a/gtestsuite/testsuite/level3/gemm/test_gemm.h b/gtestsuite/testsuite/level3/gemm/test_gemm.h index df88bb50b0..efb00df866 100644 --- a/gtestsuite/testsuite/level3/gemm/test_gemm.h +++ b/gtestsuite/testsuite/level3/gemm/test_gemm.h @@ -69,7 +69,7 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_gemm( storage, trnsa, trnsb, m, n, k, alpha, + testinghelpers::ref_gemm( storage, trnsa, trnsb, m, n, k, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp index dd61ce69cf..068c8398b8 100644 --- a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp @@ -107,7 +107,7 @@ class ZGemmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zgemm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zgemm"; + std::string str_name = "bli_zgemm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + tsa + tsb; diff --git a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h index 5b88894647..af67f55565 100644 --- a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h +++ b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h @@ -69,7 +69,7 @@ void test_gemmt( char storage, char uplo, char trnsa, char trnsb, gtint_t n, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_gemmt( storage, uplo, trnsa, trnsb, n, k, alpha, + testinghelpers::ref_gemmt( storage, uplo, trnsa, trnsb, n, k, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp index 898e50de91..de5ec8ba70 100644 --- a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp @@ -81,7 +81,7 @@ TEST_P(zgemmtTest, RandomData) gtint_t ldc_inc = std::get<10>(GetParam()); // Set the threshold for the errors: - double thresh = std::max(n,k)*testinghelpers::getEpsilon(); + double thresh = (std::max)(n,k)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters diff --git a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp index 181956d507..314a320032 100644 --- a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp +++ b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp @@ -112,7 +112,7 @@ class chemmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_chemm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_chemm"; + std::string str_name = "bli_chemm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uplo; diff --git a/gtestsuite/testsuite/level3/hemm/test_hemm.h b/gtestsuite/testsuite/level3/hemm/test_hemm.h index b0b2d94847..7b1cbf4d15 100644 --- a/gtestsuite/testsuite/level3/hemm/test_hemm.h +++ b/gtestsuite/testsuite/level3/hemm/test_hemm.h @@ -73,7 +73,7 @@ void test_hemm( char storage, char side, char uplo, char conja, char transb, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_hemm( storage, side, uplo, conja, transb, m, n, alpha, + testinghelpers::ref_hemm( storage, side, uplo, conja, transb, m, n, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp index fcc2b0c73e..4ab063bb91 100644 --- a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp +++ b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp @@ -112,7 +112,7 @@ class zhemmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zhemm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zhemm"; + std::string str_name = "bli_zhemm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uplo; diff --git a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp index 1e6d848ac8..c256096221 100644 --- a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp +++ b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp @@ -108,7 +108,7 @@ class cher2kTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_cher2k"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_cher2k"; + std::string str_name = "bli_cher2k"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/her2k/test_her2k.h b/gtestsuite/testsuite/level3/her2k/test_her2k.h index e05845b451..345fe5d890 100644 --- a/gtestsuite/testsuite/level3/her2k/test_her2k.h +++ b/gtestsuite/testsuite/level3/her2k/test_her2k.h @@ -72,7 +72,7 @@ void test_her2k( char storage, char uplo, char transa, char transb, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_her2k( storage, uplo, transa, transb, m, k, &alpha, + testinghelpers::ref_her2k( storage, uplo, transa, transb, m, k, &alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp index 316fc730b7..9f24bc78fe 100644 --- a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp +++ b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp @@ -108,7 +108,7 @@ class zher2kTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zher2k"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zher2k"; + std::string str_name = "bli_zher2k"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp index 4db35fbeeb..2480b1d6de 100644 --- a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp +++ b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp @@ -101,7 +101,7 @@ class cherkTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_cherk"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_cherk"; + std::string str_name = "bli_cherk"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp index 620841669c..2947549b15 100644 --- a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp +++ b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp @@ -101,7 +101,7 @@ class zherkTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zherk"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zherk"; + std::string str_name = "bli_zherk"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + 
uplo; diff --git a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp index 8a16fd0583..f1e7ff6e28 100644 --- a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp @@ -112,7 +112,7 @@ class csymmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_csymm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_csymm"; + std::string str_name = "bli_csymm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uplo; diff --git a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp index 4a4c9710a3..5c83a66237 100644 --- a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp @@ -112,7 +112,7 @@ class dsymmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dsymm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dsymm"; + std::string str_name = "bli_dsymm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uplo; diff --git a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp index 9670c88391..64a1532922 100644 --- a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp @@ -112,7 +112,7 @@ class ssymmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ssymm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ssymm"; + std::string str_name = "bli_ssymm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uplo; diff --git a/gtestsuite/testsuite/level3/symm/test_symm.h b/gtestsuite/testsuite/level3/symm/test_symm.h index b3ebc37953..0bfcd3fd1b 100644 --- a/gtestsuite/testsuite/level3/symm/test_symm.h +++ b/gtestsuite/testsuite/level3/symm/test_symm.h @@ -74,7 +74,7 @@ void test_symm( char storage, char side, char 
uplo, char conja, char transb, //---------------------------------------------------------- // Call reference implementation. //---------------------------------------------------------- - testinghelpers::ref_symm( storage, side, uplo, conja, transb, m, n, alpha, + testinghelpers::ref_symm( storage, side, uplo, conja, transb, m, n, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp index 53683dffb7..3840ab4aca 100644 --- a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp @@ -112,7 +112,7 @@ class zsymmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zsymm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zsymm"; + std::string str_name = "bli_zsymm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uplo; diff --git a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp index 6e7cb8db09..28e562764f 100644 --- a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp @@ -107,7 +107,7 @@ class csyr2kTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_csyr2k"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_csyr2k"; + std::string str_name = "bli_csyr2k"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp index a38fa2c512..8ab791c5b6 100644 --- a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp @@ -107,7 +107,7 @@ class dsyr2kTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dsyr2k"; #else //#elif TEST_BLIS_TYPED - 
std::string str_name = "blis_dsyr2k"; + std::string str_name = "bli_dsyr2k"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp index bcec08f487..fe4941e84d 100644 --- a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp @@ -107,7 +107,7 @@ class ssyr2kTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ssyr2k"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ssyr2k"; + std::string str_name = "bli_ssyr2k"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h index aebd1e2cc4..218a893698 100644 --- a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h +++ b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h @@ -72,7 +72,7 @@ void test_syr2k( char storage, char uplo, char transa, char transb, gtint_t m, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_syr2k( storage, uplo, transa, transb, m, k, alpha, + testinghelpers::ref_syr2k( storage, uplo, transa, transb, m, k, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp index 0caf572134..e929c13601 100644 --- a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp @@ -107,7 +107,7 @@ class zsyr2kTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zsyr2k"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zsyr2k"; + std::string str_name = "bli_zsyr2k"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp index 63a60703e5..2aa7b2063f 100644 --- a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp @@ -100,7 +100,7 @@ class csyrkTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_csyrk"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_csyrk"; + std::string str_name = "bli_csyrk"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp index 1e01e15f13..b4c8b61be3 100644 --- a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp @@ -100,7 +100,7 @@ class dsyrkTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dsyrk"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dsyrk"; + std::string str_name = "bli_dsyrk"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; 
diff --git a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp index d959f444ab..1b99dc65fe 100644 --- a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp @@ -100,7 +100,7 @@ class ssyrkTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ssyrk"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ssyrk"; + std::string str_name = "bli_ssyrk"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp index 712f0d52eb..a76a24533c 100644 --- a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp @@ -100,7 +100,7 @@ class zsyrkTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_zsyrk"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_zsyrk"; + std::string str_name = "bli_zsyrk"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + uplo; diff --git a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp index 603a1287c7..11014e542a 100644 --- a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp @@ -105,7 +105,7 @@ class ctrmmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ctrmm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ctrmm"; + std::string str_name = "bli_ctrmm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp index 6b65bdb6dc..ec3608bf45 100644 --- a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp @@ -105,7 +105,7 @@ class dtrmmTestPrint { #elif 
TEST_CBLAS std::string str_name = "cblas_dtrmm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dtrmm"; + std::string str_name = "bli_dtrmm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp index 5b4718e269..2090b39611 100644 --- a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp @@ -105,7 +105,7 @@ class strmmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_strmm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_strmm"; + std::string str_name = "bli_strmm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trmm/test_trmm.h b/gtestsuite/testsuite/level3/trmm/test_trmm.h index 11e74f286c..91b169d99c 100644 --- a/gtestsuite/testsuite/level3/trmm/test_trmm.h +++ b/gtestsuite/testsuite/level3/trmm/test_trmm.h @@ -67,7 +67,7 @@ void test_trmm( char storage, char side, char uploa, char transa, char diaga, //---------------------------------------------------------- // Call reference implementation. //---------------------------------------------------------- - testinghelpers::ref_trmm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), lda, b_ref.data(), ldb ); + testinghelpers::ref_trmm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), lda, b_ref.data(), ldb ); //---------------------------------------------------------- // check component-wise error. 
diff --git a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp index e127a0d33c..bbeb07d100 100644 --- a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp @@ -105,7 +105,7 @@ class ztrmmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ztrmm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ztrmm"; + std::string str_name = "bli_ztrmm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp index ccb9770dbe..9dcafcb32b 100644 --- a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp @@ -113,7 +113,7 @@ class ctrmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - std::string str_name = "blis_ctrmm3"; + std::string str_name = "bli_ctrmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; str_name = str_name + "_d" + diaga; diff --git a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp index c86ae0ddd1..6cb677e988 100644 --- a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp @@ -113,7 +113,7 @@ class dtrmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - std::string str_name = "blis_dtrmm3"; + std::string str_name = "bli_dtrmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; str_name = str_name + "_d" + diaga; diff --git a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp 
b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp index 9cc27d2e6d..4752556df8 100644 --- a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp @@ -113,7 +113,7 @@ class strmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - std::string str_name = "blis_strmm3"; + std::string str_name = "bli_strmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; str_name = str_name + "_d" + diaga; diff --git a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h index 84d6d1c0bd..e82f25dd0c 100644 --- a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h +++ b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h @@ -70,7 +70,7 @@ void test_trmm3( char storage, char side, char uploa, char transa, char diaga, //---------------------------------------------------------- // Call reference implementation. 
//---------------------------------------------------------- - testinghelpers::ref_trmm3( storage, side, uploa, transa, diaga, transb, + testinghelpers::ref_trmm3( storage, side, uploa, transa, diaga, transb, m, n, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp index 2818daf7be..9ab008b974 100644 --- a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp @@ -113,7 +113,7 @@ class ztrmm3TestPrint { gtint_t lda_inc = std::get<10>(str.param); gtint_t ldb_inc = std::get<11>(str.param); gtint_t ldc_inc = std::get<12>(str.param); - std::string str_name = "blis_ztrmm3"; + std::string str_name = "bli_ztrmm3"; str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa + transb; str_name = str_name + "_d" + diaga; diff --git a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp index a33621091b..d001651df4 100644 --- a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp @@ -78,7 +78,7 @@ TEST_P(ctrsmTest, RandomData) gtint_t ldb_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = std::max(m, n)*testinghelpers::getEpsilon(); + double thresh = (std::max)(m, n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class ctrsmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ctrsm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ctrsm"; + std::string str_name = "bli_ctrsm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp 
b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp index 0dde0d4545..a0c64ddb6c 100644 --- a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp @@ -78,7 +78,7 @@ TEST_P(dtrsmTest, RandomData) gtint_t ldb_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = std::max(m, n)*testinghelpers::getEpsilon(); + double thresh = (std::max)(m, n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class dtrsmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_dtrsm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_dtrsm"; + std::string str_name = "bli_dtrsm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp index 23922a08ba..a1e43aa20f 100644 --- a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp @@ -78,7 +78,7 @@ TEST_P(strsmTest, RandomData) gtint_t ldb_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = std::max(m, n)*testinghelpers::getEpsilon(); + double thresh = (std::max)(m, n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class strsmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_strsm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_strsm"; + std::string str_name = "bli_strsm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/level3/trsm/test_trsm.h b/gtestsuite/testsuite/level3/trsm/test_trsm.h index 698a382823..e36e29374d 100644 --- a/gtestsuite/testsuite/level3/trsm/test_trsm.h +++ 
b/gtestsuite/testsuite/level3/trsm/test_trsm.h @@ -75,7 +75,7 @@ void test_trsm( char storage, char side, char uploa, char transa, char diaga, //---------------------------------------------------------- // Call reference implementation. //---------------------------------------------------------- - testinghelpers::ref_trsm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), + testinghelpers::ref_trsm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), lda, b_ref.data(), ldb ); //---------------------------------------------------------- diff --git a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp index 0b5530e05d..8b7d0cab4d 100644 --- a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp @@ -78,7 +78,7 @@ TEST_P(ztrsmTest, RandomData) gtint_t ldb_inc = std::get<9>(GetParam()); // Set the threshold for the errors: - double thresh = std::max(m, n)*testinghelpers::getEpsilon(); + double thresh = (std::max)(m, n)*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters @@ -105,7 +105,7 @@ class ztrsmTestPrint { #elif TEST_CBLAS std::string str_name = "cblas_ztrsm"; #else //#elif TEST_BLIS_TYPED - std::string str_name = "blis_ztrsm"; + std::string str_name = "bli_ztrsm"; #endif str_name = str_name + "_" + sfm+sfm+sfm; str_name = str_name + "_" + side + uploa + transa; diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h index 9ed6e47adc..270269d571 100644 --- a/gtestsuite/testsuite/util/nrm2/test_nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h @@ -78,7 +78,7 @@ void test_nrm2( gtint_t n, gtint_t incx, gtint_t i, T iexval, gtint_t j = 0, T j //---------------------------------------------------------- // Initialize vectors with random numbers. 
//---------------------------------------------------------- - std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx, ELEMENT_TYPE); + std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx); // Initialize ith element of vector x to iexval. x[i*incx] = iexval; // Initialize jth element of vector x to jexval. From e5e9127a682c50d5dcce1142e1125000e9e41dc4 Mon Sep 17 00:00:00 2001 From: mkadavil Date: Wed, 6 Sep 2023 16:00:34 +0530 Subject: [PATCH 130/226] Fixes for aocl_gemm addon compilation issues Certain functions were updated recently and now takes extra arguments for error handling. Usage of the same are now updated in aocl_gemm. Change-Id: I7daca4fd1f284d57034d564f0a08cc6410ccfd5c --- .../threading/lpgemm_thread_decor_openmp.c | 3 +- bench/bench_aocl_gemm/bench_lpgemm.c | 31 ++++++++++--------- bench/bench_aocl_gemm/bench_lpgemm_utils.c | 10 +++--- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c index 1d0f7515b0..36be28e570 100644 --- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c +++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c @@ -680,10 +680,11 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \ \ thrcomm_t static_lpgemm_comms[BLIS_LPGEMM_NUM_STATIC_COMMS]; \ thrcomm_t* cur_lpgemm_comms = static_lpgemm_comms; \ + err_t bli_errors = BLIS_SUCCESS; \ \ if ( jc_ways > BLIS_LPGEMM_NUM_STATIC_COMMS ) \ { \ - cur_lpgemm_comms = bli_malloc_intl( jc_ways * sizeof( thrcomm_t ) ); \ + cur_lpgemm_comms = bli_malloc_intl( jc_ways * sizeof( thrcomm_t ), &bli_errors ); \ } \ for ( dim_t i = 0; i < jc_ways; ++i ) \ { \ diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 7dd049b159..75883cf7fe 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -107,7 +107,8 @@ 
GEN_FILL_ARRAY_FUNC(int32_t) void fill_array_bfloat16( void* arr, dim_t size ) { - float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size ); + err_t bli_errors = BLIS_SUCCESS; + float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size, &bli_errors ); for ( dim_t i = 0; i < size; ++i ) { c_float[i] = 2.0; @@ -1038,14 +1039,15 @@ void mat_mul_bench_main_ ## BLAS_SFX \ } \ \ /* Get 64 byte aligned memory.*/ \ - A_type* a = ( A_type* ) bli_malloc_user( sizeof( A_type ) * m * k ); \ + err_t bli_errors = BLIS_SUCCESS; \ + A_type* a = ( A_type* ) bli_malloc_user( sizeof( A_type ) * m * k, &bli_errors ); \ \ - B_type* b = ( B_type* ) bli_malloc_user( sizeof( B_type ) * n * k ); \ + B_type* b = ( B_type* ) bli_malloc_user( sizeof( B_type ) * n * k, &bli_errors ); \ \ - C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n ); \ + C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ memset( ( void* ) c, 0, sizeof( C_type ) * m * n ); \ \ - C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n ); \ + C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ memset( ( void* ) c_ref, 0, sizeof( C_type ) * m * n ); \ \ GEN_FUNC_NAME(fill_array_,A_type)( a, ( m * k ) ); \ @@ -1101,7 +1103,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ siz_t b_reorder_buf_siz_req = \ GEN_FUNC_NAME(aocl_get_reorder_buf_size_,REORDER_SFX)( 'B', k, n ); \ \ - B_type* b_reorder = ( B_type* ) bli_malloc_user( b_reorder_buf_siz_req ); \ + B_type* b_reorder = ( B_type* ) bli_malloc_user( b_reorder_buf_siz_req, &bli_errors ); \ GEN_FUNC_NAME(aocl_reorder_,REORDER_SFX)( 'B', b, b_reorder, k, n, stride_b ); \ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ @@ -1192,27 +1194,28 @@ void mat_mul_bench_main_ ## BLAS_SFX \ n_repeats = global_n_repeat; \ } \ \ + err_t bli_errors = BLIS_SUCCESS; \ /* Get 64 byte aligned memory.*/ \ - bfloat16* a = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * m * k 
); \ - float *a_float = bli_malloc_user( m * k * sizeof( float )); \ + bfloat16* a = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * m * k, &bli_errors ); \ + float *a_float = bli_malloc_user( m * k * sizeof( float ), &bli_errors); \ for ( int32_t i = 0; i < m*k; ++i ) \ { \ a_float[i] = ( float ) ( i % 5 ); \ } \ convert_float_arr_to_bf16( a_float, a, m * k ); \ \ - bfloat16* b = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * n * k ); \ - float *b_float = bli_malloc_user( k * n * sizeof( float )); \ + bfloat16* b = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * n * k, &bli_errors ); \ + float *b_float = bli_malloc_user( k * n * sizeof( float ), &bli_errors); \ for ( int32_t i = 0; i < k*n; ++i ) \ { \ b_float[i] = ( float ) ( i % 5 );\ } \ convert_float_arr_to_bf16( b_float, b, k * n ); \ \ - C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n ); \ + C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ memset( ( void* ) c, 0, sizeof( C_type ) * m * n ); \ \ - C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n ); \ + C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ memset( ( void* ) c_ref, 0, sizeof( C_type ) * m * n ); \ \ if ( bench_mode == 'a' ) \ @@ -1265,8 +1268,8 @@ void mat_mul_bench_main_ ## BLAS_SFX \ siz_t b_reorder_buf_siz_req = \ aocl_get_reorder_buf_size_bf16bf16f32of32( 'B', k, n ); \ \ - bfloat16* b_reorder = ( bfloat16* ) bli_malloc_user( b_reorder_buf_siz_req ); \ - aocl_reorder_bf16bf16f32of32( 'B', b, b_reorder, k, n, stride_b ); \ + bfloat16* b_reorder = ( bfloat16* ) bli_malloc_user( b_reorder_buf_siz_req, &bli_errors ); \ + aocl_reorder_bf16bf16f32of32( 'B', b, b_reorder, k, n, stride_b ); \ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ ( \ diff --git a/bench/bench_aocl_gemm/bench_lpgemm_utils.c b/bench/bench_aocl_gemm/bench_lpgemm_utils.c index dbbdce6703..a95e214b4a 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm_utils.c 
+++ b/bench/bench_aocl_gemm/bench_lpgemm_utils.c @@ -261,10 +261,11 @@ void gelu_bench_main_ ## GELU_SFX \ n_repeats = global_n_repeat; \ } \ \ - V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \ + err_t bli_errors = BLIS_SUCCESS; \ + V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \ GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \ \ - V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \ + V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \ GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \ \ GEN_FUNC_NAME(gelu_bench_driver_,GELU_SFX)(n_repeats,n,x,incx); \ @@ -292,10 +293,11 @@ void softmax_bench_main_ ## SOFTMAX_SFX \ n_repeats = global_n_repeat; \ } \ \ - V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \ + err_t bli_errors = BLIS_SUCCESS; \ + V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \ GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \ \ - V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \ + V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \ GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \ \ GEN_FUNC_NAME(softmax_bench_driver_,SOFTMAX_SFX)(n_repeats,n,x,incx); \ From 32104c400c7112ce33e3f12e904efbb02b3b90eb Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Mon, 31 Jul 2023 22:32:12 +0530 Subject: [PATCH 131/226] GTestSuite : Designing test cases for ZGEMM - Designed test cases for unit testing of ZGEMM compute kernel for handling inputs when k == 1. The design uses value-parameterized testing for checking accuracy, and verifying the mandate in case of exception values on the inputs/output. - The design uses type-parameterized testing for verifying BLAS standard for invalid input cases, and also for early return scenarios. 
- Added the function template set_ev_mat( ... ) as part of testinghelpers. This function is used as a helper for inducing exception values onto indices specified as arguments to the test_gemm( ... ) interface. - Abstracted the function definition of getValueString( ... ) from the NRM2 testing interface to testinghelpers(renamed as get_value_string( ... ) for naming consistency), in order to use it as a helper function across all APIs in case of exception value testing. AMD-Internal: [CPUPL-3823] Change-Id: I0fea21f9c8759bbbdc88ba0a016202753e28f2a7 --- .../inc/common/data_generators.h | 20 +- .../inc/common/testing_basics.h | 9 + .../src/common/data_generators.cpp | 35 ++ .../src/common/testing_basics.cpp | 72 ++++ .../testsuite/level3/gemm/IIT_ERS_test.cpp | 264 +++++++++++++ gtestsuite/testsuite/level3/gemm/test_gemm.h | 59 +++ .../level3/gemm/zgemm_evt_testing.cpp | 356 ++++++++++++++++++ .../testsuite/level3/gemm/zgemm_generic.cpp | 49 ++- .../util/nrm2/dnrm2_extreme_values.cpp | 4 +- .../util/nrm2/dznrm2_extreme_values.cpp | 4 +- .../util/nrm2/scnrm2_extreme_values.cpp | 4 +- .../util/nrm2/snrm2_extreme_values.cpp | 4 +- gtestsuite/testsuite/util/nrm2/test_nrm2.h | 15 - 13 files changed, 857 insertions(+), 38 deletions(-) create mode 100644 gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp create mode 100644 gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp diff --git a/gtestsuite/testinghelpers/inc/common/data_generators.h b/gtestsuite/testinghelpers/inc/common/data_generators.h index 48d61f3a67..f40eeba018 100644 --- a/gtestsuite/testinghelpers/inc/common/data_generators.h +++ b/gtestsuite/testinghelpers/inc/common/data_generators.h @@ -62,17 +62,17 @@ void randomgenerators(int from, int to, T* alpha, char fp); * if fp=='f' the elements will have random float values. 
*/ template -void randomgenerators(int from, int to, gtint_t n, gtint_t incx, T* x, char fp); +void randomgenerators(int from, int to, gtint_t n, gtint_t incx, T* x, char fp = BLIS_ELEMENT_TYPE); template -void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a, gtint_t lda, char fp); +void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a, gtint_t lda, char fp = BLIS_ELEMENT_TYPE); template -void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda, char fp); +void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda, char fp = BLIS_ELEMENT_TYPE); template void randomgenerators(int from, int to, char storage, char uplo, gtint_t m, - T* a, gtint_t lda, char fp ); + T* a, gtint_t lda, char fp = BLIS_ELEMENT_TYPE ); } //end of namespace datagenerators template @@ -92,4 +92,16 @@ std::vector get_vector( gtint_t n, gtint_t incx, T value ); template std::vector get_matrix( char storage, char trans, gtint_t m, gtint_t n, gtint_t lda, T value ); +template +void set_vector( gtint_t n, gtint_t incx, T* x, T value ); + +template +void set_matrix( char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda, T value ); + +// Function template to set the exception value exval on matrix m, at indices (i, j) +// In case of transposition, this function internally swaps the indices, and thus they can be +// passed without swapping on the instantiator. 
+template +void set_ev_mat( char storage, char trns, gtint_t ld, gtint_t i, gtint_t j, T exval, T* m ); + } //end of namespace testinghelpers diff --git a/gtestsuite/testinghelpers/inc/common/testing_basics.h b/gtestsuite/testinghelpers/inc/common/testing_basics.h index df2c77059e..46bcfeb4a8 100644 --- a/gtestsuite/testinghelpers/inc/common/testing_basics.h +++ b/gtestsuite/testinghelpers/inc/common/testing_basics.h @@ -371,4 +371,13 @@ void print_vector( const char *vec, gtint_t n, T *x, gtint_t incx, const char *s template void print_matrix( const char *mat, char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec ); +/** + * @brief returns a string with the correct NaN/Inf for printing + * + * @tparam T float, double, scomplex, dcomplex. + * @param exval exception value for setting the string. + */ +template +std::string get_value_string( T exval ); + } //end of namespace testinghelpers diff --git a/gtestsuite/testinghelpers/src/common/data_generators.cpp b/gtestsuite/testinghelpers/src/common/data_generators.cpp index afe5650e9d..8ed6416836 100644 --- a/gtestsuite/testinghelpers/src/common/data_generators.cpp +++ b/gtestsuite/testinghelpers/src/common/data_generators.cpp @@ -441,6 +441,26 @@ std::vector get_matrix( char storage, char trans, gtint_t m, gtint_t n, gtint return a; } +template +void set_ev_mat( char storage, char trns, gtint_t ld, gtint_t i, gtint_t j, T exval, T* m ) +{ + // Setting the exception values on the indices passed as arguments + if ( storage == 'c' || storage == 'C' ) + { + if ( trns == 'n' || trns == 'N' ) + m[i + j*ld] = exval; + else + m[j + i*ld] = exval; + } + else + { + if ( trns == 'n' || trns == 'N' ) + m[i*ld + j] = exval; + else + m[j*ld + i] = exval; + } +} + } //end of namespace testinghelpers // Explicit template instantiations @@ -493,3 +513,18 @@ template std::vector testinghelpers::get_matrix( char, char, gtint_t, gti template std::vector testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, 
double ); template std::vector testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, scomplex ); template std::vector testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, dcomplex ); + +template void testinghelpers::set_vector( gtint_t, gtint_t, float*, float ); +template void testinghelpers::set_vector( gtint_t, gtint_t, double*, double ); +template void testinghelpers::set_vector( gtint_t, gtint_t, scomplex*, scomplex ); +template void testinghelpers::set_vector( gtint_t, gtint_t, dcomplex*, dcomplex ); + +template void testinghelpers::set_matrix( char, gtint_t, gtint_t, float*, char, gtint_t, float ); +template void testinghelpers::set_matrix( char, gtint_t, gtint_t, double*, char, gtint_t, double ); +template void testinghelpers::set_matrix( char, gtint_t, gtint_t, scomplex*, char, gtint_t, scomplex ); +template void testinghelpers::set_matrix( char, gtint_t, gtint_t, dcomplex*, char, gtint_t, dcomplex ); + +template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, float, float* ); +template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, double, double* ); +template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, scomplex, scomplex* ); +template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, dcomplex, dcomplex* ); \ No newline at end of file diff --git a/gtestsuite/testinghelpers/src/common/testing_basics.cpp b/gtestsuite/testinghelpers/src/common/testing_basics.cpp index 53c1050795..c7f8d50dac 100644 --- a/gtestsuite/testinghelpers/src/common/testing_basics.cpp +++ b/gtestsuite/testinghelpers/src/common/testing_basics.cpp @@ -614,4 +614,76 @@ template void print_matrix( char, gtint_t, gtint_t, double *, gtint_t, c template void print_matrix( char, gtint_t, gtint_t, scomplex *, gtint_t, const char * ); template void print_matrix( char, gtint_t, gtint_t, dcomplex *, gtint_t, const char * ); + +/* + Helper function that returns a string 
based on the value that is passed + The return values are as follows : + If datatype is real : "nan", "inf"/"minus_inf", "value", where "value" + is the string version of the value that is passed, if it is not nan/inf/-inf. + + If the datatype is complex : The string is concatenated with both the real and + imaginary components values, based on analysis done separately to each of them + (similar to real datatype). +*/ +template +std::string get_value_string(T exval) +{ + std::string exval_str; + if constexpr (testinghelpers::type_info::is_real) + { + if(std::isnan(exval)) + exval_str = "nan"; + else if(std::isinf(exval)) + exval_str = (exval >= 0) ? "inf" : "minus_inf"; + else + exval_str = ( exval >= 0) ? std::to_string(int(exval)) : "minus_" + std::to_string(int(std::abs(exval))); + } + else + { + if(std::isnan(exval.real)) + { + exval_str = "nan"; + if(std::isinf(exval.imag)) + exval_str = exval_str + "pi" + ((exval.imag >= 0) ? "inf" : "minus_inf"); + else + exval_str = exval_str + "pi" + ((exval.imag >= 0)? std::to_string(int(exval.imag)) : "m" + std::to_string(int(std::abs(exval.imag)))); + } + else if(std::isnan(exval.imag)) + { + if(std::isinf(exval.real)) + exval_str = ((exval.real >= 0) ? "inf" : "minus_inf"); + else + exval_str = ((exval.real >= 0)? std::to_string(int(exval.real)) : "m" + std::to_string(int(std::abs(exval.real)))); + exval_str = exval_str + "pinan"; + } + else if(std::isinf(exval.real)) + { + exval_str = ((exval.real >= 0) ? "inf" : "minus_inf"); + if(std::isnan(exval.imag)) + exval_str = exval_str + "pinan"; + else + exval_str = exval_str + "pi" + ((exval.imag >= 0)? std::to_string(int(exval.imag)) : "m" + std::to_string(int(std::abs(exval.imag)))); + } + else if(std::isinf(exval.imag)) + { + if(std::isnan(exval.real)) + exval_str = "nan"; + else + exval_str = ((exval.real >= 0)? std::to_string(int(exval.real)) : "m" + std::to_string(int(std::abs(exval.real)))); + + exval_str = exval_str + ((exval.imag >= 0) ? 
"inf" : "minus_inf"); + } + else + { + exval_str = ((exval.real >= 0)? std::to_string(int(exval.real)) : "m" + std::to_string(int(std::abs(exval.real)))); + exval_str = exval_str + "pi" + ((exval.imag >= 0)? std::to_string(int(exval.imag)) : "m" + std::to_string(int(std::abs(exval.imag)))); + } + } + return exval_str; +} +template std::string testinghelpers::get_value_string( float ); +template std::string testinghelpers::get_value_string( double ); +template std::string testinghelpers::get_value_string( scomplex ); +template std::string testinghelpers::get_value_string( dcomplex ); + } //end of namespace testinghelpers \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp b/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp new file mode 100644 index 0000000000..debe86a5dc --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp @@ -0,0 +1,264 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "common/testing_helpers.h" +#include "gemm.h" +#include "inc/check_error.h" +#include "common/wrong_inputs_helpers.h" + +template +class Gemm_IIT_ERS_Test : public ::testing::Test {}; +typedef ::testing::Types TypeParam; // The supported datatypes from BLAS calls for GEMM +TYPED_TEST_SUITE(Gemm_IIT_ERS_Test, TypeParam); // Defining individual testsuites based on the datatype support. + +// Adding namespace to get default parameters(valid case) from testinghelpers/common/wrong_inputs_helpers.h. +using namespace testinghelpers::IIT; + +#ifdef TEST_BLAS + +/* + Incorrect Input Testing(IIT) + + BLAS exceptions get triggered in the following cases(for GEMM): + 1. When TRANSA != 'N' && TRANSA != 'T' && TRANSA != 'C' (info = 1) + 2. When TRANSB != 'N' && TRANSB != 'T' && TRANSB != 'C' (info = 2) + 3. When m < 0 (info = 3) + 4. When n < 0 (info = 4) + 5. When k < 0 (info = 5) + 6. When lda < max(1, thresh) (info = 8), thresh set based on TRANSA value + 7. When ldb < max(1, thresh) (info = 10), thresh set based on TRANSB value + 8.
When ldc < max(1, m) (info = 13) + +*/ + +// When info == 1 +TYPED_TEST(Gemm_IIT_ERS_Test, invalid_transa) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for TRANS value for A. + gemm( STORAGE, 'p', TRANS, M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 2 +TYPED_TEST(Gemm_IIT_ERS_Test, invalid_transb) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for TRANS value for B. + gemm( STORAGE, TRANS, 'p', M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 3 +TYPED_TEST(Gemm_IIT_ERS_Test, m_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for m. + gemm( STORAGE, TRANS, TRANS, -1, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold).
+ computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 4 +TYPED_TEST(Gemm_IIT_ERS_Test, n_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for n. + gemm( STORAGE, TRANS, TRANS, M, -1, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 5 +TYPED_TEST(Gemm_IIT_ERS_Test, k_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for k. + gemm( STORAGE, TRANS, TRANS, M, N, -1, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 8 +TYPED_TEST(Gemm_IIT_ERS_Test, invalid_lda) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for lda. + gemm( STORAGE, TRANS, TRANS, M, N, K, nullptr, nullptr, LDA - 1, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). 
+ computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 10 +TYPED_TEST(Gemm_IIT_ERS_Test, invalid_ldb) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for ldb. + gemm( STORAGE, TRANS, TRANS, M, N, K, nullptr, nullptr, LDA, nullptr, LDB - 1, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 13 +TYPED_TEST(Gemm_IIT_ERS_Test, invalid_ldc) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with a invalid value for ldc. + gemm( STORAGE, TRANS, TRANS, M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC - 1 ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +/* + Early Return Scenarios(ERS) : + + The GEMM API is expected to return early in the following cases: + + 1. When m == 0. + 2. When n == 0. + 3. When (alpha == 0 or k == 0) and beta == 1. + +*/ + +// When m is 0 +TYPED_TEST(Gemm_IIT_ERS_Test, m_eq_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + gemm( STORAGE, TRANS, TRANS, 0, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). 
+ computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When n is 0 +TYPED_TEST(Gemm_IIT_ERS_Test, n_eq_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + gemm( STORAGE, TRANS, TRANS, M, 0, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When alpha is 0 and beta is 1 +TYPED_TEST(Gemm_IIT_ERS_Test, alpha_zero_beta_one) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + T alpha, beta; + + testinghelpers::initzero( alpha ); + testinghelpers::initone( beta ); + + gemm( STORAGE, TRANS, TRANS, M, N, K, &alpha, nullptr, LDA, nullptr, LDB, &beta, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When k is 0 and beta is 1 +TYPED_TEST(Gemm_IIT_ERS_Test, k_zero_beta_one) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + T alpha, beta; + + testinghelpers::initone( alpha ); + testinghelpers::initone( beta ); + + gemm( STORAGE, TRANS, TRANS, M, N, 0, &alpha, nullptr, LDA, nullptr, LDB, &beta, nullptr, LDC ); + // Use bitwise comparison (no threshold). 
+ computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + + +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/test_gemm.h b/gtestsuite/testsuite/level3/gemm/test_gemm.h index efb00df866..862d47b168 100644 --- a/gtestsuite/testsuite/level3/gemm/test_gemm.h +++ b/gtestsuite/testsuite/level3/gemm/test_gemm.h @@ -76,4 +76,63 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); +} + +// Test body used for exception value testing, by inducing an exception value +// in the index that is passed for each of the matrices. +/* + (ai, aj) is the index with corresponding exception value aexval in matrix A. + The index is with respect to the assumption that the matrix is column stored, + without any transpose. In case of the row-storage and/or transpose, the index + is translated from its assumption accordingly. + Ex : (2, 3) with storage 'c' and transpose 'n' becomes (3, 2) if storage becomes + 'r' or transpose becomes 't'. +*/ +// (bi, bj) is the index with corresponding exception value bexval in matrix B. +// (ci, cj) is the index with corresponding exception value cexval in matrix C. +template +void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, + gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, T alpha, + T beta, gtint_t ai, gtint_t aj, T aexval, gtint_t bi, gtint_t bj, T bexval, + gtint_t ci, gtint_t cj, T cexval, double thresh ) +{ + // Compute the leading dimensions of a, b, and c. 
+ gtint_t lda = testinghelpers::get_leading_dimension( storage, trnsa, m, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, trnsb, k, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldc_inc ); + + //---------------------------------------------------------- + // Initialize matrices with random numbers + //---------------------------------------------------------- + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, trnsa, m, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, trnsb, k, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', m, n, ldc ); + + // Inducing exception values onto the matrices based on the indices passed as arguments. + // Assumption is that the indices are with respect to the matrices in column storage without + // any transpose. In case of difference in storage scheme or transposition, the row and column + // indices are appropriately swapped. + testinghelpers::set_ev_mat( storage, trnsa, lda, ai, aj, aexval, a.data() ); + testinghelpers::set_ev_mat( storage, trnsb, ldb, bi, bj, bexval, b.data() ); + testinghelpers::set_ev_mat( storage, 'n', ldc, ci, cj, cexval, c.data() ); + + // Create a copy of c so that we can check reference results. + std::vector c_ref(c); + + //---------------------------------------------------------- + // Call BLIS function + //---------------------------------------------------------- + gemm( storage, trnsa, trnsb, m, n, k, &alpha, a.data(), lda, + b.data(), ldb, &beta, c.data(), ldc ); + + //---------------------------------------------------------- + // Call reference implementation. + //---------------------------------------------------------- + testinghelpers::ref_gemm( storage, trnsa, trnsb, m, n, k, alpha, + a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); + + //---------------------------------------------------------- + // check component-wise error. 
+ //---------------------------------------------------------- + computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh, true ); } \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp new file mode 100644 index 0000000000..4f328a60be --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp @@ -0,0 +1,356 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* + The following file contains both the exception value testing(EVT) and the + positive accuracy testing of the bli_zgemm_4x4_avx2_k1_nn( ... ) computational + kernel. This kernel is invoked from the BLAS layer, and inputs are given + in a manner so as to avoid the other code-paths and test only the required + kernel. + +*/ + +#include +#include "test_gemm.h" + +class ZGemmEVTTest : + public ::testing::TestWithParam> {}; + +TEST_P(ZGemmEVTTest, Unit_Tester) +{ + using T = dcomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). 
+ //---------------------------------------------------------- + // matrix storage format(row major, column major) + char storage = std::get<0>(GetParam()); + // denotes whether matrix a is n,c,t,h + char transa = std::get<1>(GetParam()); + // denotes whether matrix b is n,c,t,h + char transb = std::get<2>(GetParam()); + // matrix size m + gtint_t m = std::get<3>(GetParam()); + // matrix size n + gtint_t n = std::get<4>(GetParam()); + // matrix size k + gtint_t k = std::get<5>(GetParam()); + + gtint_t ai, aj, bi, bj, ci, cj; + T aex, bex, cex; + ai = std::get<6>(GetParam()); + aj = std::get<7>(GetParam()); + aex = std::get<8>(GetParam()); + + bi = std::get<9>(GetParam()); + bj = std::get<10>(GetParam()); + bex = std::get<11>(GetParam()); + + ci = std::get<12>(GetParam()); + cj = std::get<13>(GetParam()); + cex = std::get<14>(GetParam()); + + // specifies alpha value + T alpha = std::get<15>(GetParam()); + // specifies beta value + T beta = std::get<16>(GetParam()); + // lda, ldb, ldc increments. + // If increments are zero, then the array size matches the matrix size. + // If increments are nonnegative, the array size is bigger than the matrix size. 
+ gtint_t lda_inc = std::get<17>(GetParam()); + gtint_t ldb_inc = std::get<18>(GetParam()); + gtint_t ldc_inc = std::get<19>(GetParam()); + + // Set the threshold for the errors: + double thresh = 10*m*n*testinghelpers::getEpsilon(); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_gemm( storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, + alpha, beta, ai, aj, aex, bi, bj, bex, ci, cj, cex, thresh ); +} + +// Helper classes for printing the test case parameters based on the instantiator +// These are mainly used to help with debugging, in case of failures + +// Utility to print the test-case in case of exception value on matrices +class ZGemmEVMatPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + char sfm = std::get<0>(str.param); + char tsa = std::get<1>(str.param); + char tsb = std::get<2>(str.param); + gtint_t m = std::get<3>(str.param); + gtint_t n = std::get<4>(str.param); + gtint_t k = std::get<5>(str.param); + gtint_t ai, aj, bi, bj, ci, cj; + dcomplex aex, bex, cex; + ai = std::get<6>(str.param); + aj = std::get<7>(str.param); + aex = std::get<8>(str.param); + + bi = std::get<9>(str.param); + bj = std::get<10>(str.param); + bex = std::get<11>(str.param); + + ci = std::get<12>(str.param); + cj = std::get<13>(str.param); + cex = std::get<14>(str.param); + + dcomplex alpha = std::get<15>(str.param); + dcomplex beta = std::get<16>(str.param); + gtint_t lda_inc = std::get<17>(str.param); + gtint_t ldb_inc = std::get<18>(str.param); + gtint_t ldc_inc = std::get<19>(str.param); + +#ifdef TEST_BLAS + std::string str_name = "zgemm_"; +#elif TEST_CBLAS + std::string str_name = "cblas_zgemm"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "blis_zgemm"; +#endif + str_name = str_name + "_" + sfm+sfm+sfm; + str_name = str_name + "_" + tsa + tsb; + str_name = str_name + "_" + std::to_string(m); 
+ str_name = str_name + "_" + std::to_string(n); + str_name = str_name + "_" + std::to_string(k); + str_name = str_name + "_A" + std::to_string(ai) + std::to_string(aj); + str_name = str_name + "_" + testinghelpers::get_value_string(aex); + str_name = str_name + "_B" + std::to_string(bi) + std::to_string(bj); + str_name = str_name + "_" + testinghelpers::get_value_string(bex); + str_name = str_name + "_C" + std::to_string(ci) + std::to_string(cj); + str_name = str_name + "_" + testinghelpers::get_value_string(cex); + str_name = str_name + "_a" + testinghelpers::get_value_string(alpha); + str_name = str_name + "_b" + testinghelpers::get_value_string(beta); + str_name = str_name + "_" + std::to_string(lda_inc); + str_name = str_name + "_" + std::to_string(ldb_inc); + str_name = str_name + "_" + std::to_string(ldc_inc); + return str_name; + } +}; + +// Utility to print the test-case in case of exception value on alpha and beta +class ZGemmEVAlphaBetaPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + char sfm = std::get<0>(str.param); + char tsa = std::get<1>(str.param); + char tsb = std::get<2>(str.param); + gtint_t m = std::get<3>(str.param); + gtint_t n = std::get<4>(str.param); + gtint_t k = std::get<5>(str.param); + + dcomplex alpha = std::get<15>(str.param); + dcomplex beta = std::get<16>(str.param); + gtint_t lda_inc = std::get<17>(str.param); + gtint_t ldb_inc = std::get<18>(str.param); + gtint_t ldc_inc = std::get<19>(str.param); + +#ifdef TEST_BLAS + std::string str_name = "zgemm_"; +#elif TEST_CBLAS + std::string str_name = "cblas_zgemm"; +#else //#elif TEST_BLIS_TYPED + std::string str_name = "blis_zgemm"; +#endif + str_name = str_name + "_" + sfm+sfm+sfm; + str_name = str_name + "_" + tsa + tsb; + str_name = str_name + "_" + std::to_string(m); + str_name = str_name + "_" + std::to_string(n); + str_name = str_name + "_" + std::to_string(k); + str_name = str_name + "_a" + testinghelpers::get_value_string(alpha); + str_name = 
str_name + "_b" + testinghelpers::get_value_string(beta); + str_name = str_name + "_" + std::to_string(lda_inc); + str_name = str_name + "_" + std::to_string(ldb_inc); + str_name = str_name + "_" + std::to_string(ldc_inc); + return str_name; + } +}; + +static double NaN = std::numeric_limits::quiet_NaN(); +static double Inf = std::numeric_limits::infinity(); + +// Exception value testing(on matrices) + +/* + For the bli_zgemm_4x4_avx2_k1_nn kernel, the main and fringe dimensions are as follows: + For m : Main = { 4 }, fringe = { 2, 1 } + For n : Main = { 4 }, fringe = { 2, 1 } + + Without any changes to the BLAS layer in BLIS, the fringe case of 1 cannot be touched + separately, since if m/n is 1, the inputs are redirected to ZGEMV. + +*/ + +// Testing for the main loop case for m and n +// The kernel uses 2 loads and 4 broadcasts. The exception values +// are induced at one index individually for each of the loads. +// They are also induced in the broadcast direction at two places. +INSTANTIATE_TEST_SUITE_P( + bli_zgemm_4x4_avx2_k1_nn_evt_mat_main, + ZGemmEVTTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Values(gtint_t(4)), // m + ::testing::Values(gtint_t(4)), // n + ::testing::Values(gtint_t(1)), // k + ::testing::Values(gtint_t(1), gtint_t(3)), // ai + ::testing::Values(gtint_t(0)), // aj + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // aexval + ::testing::Values(gtint_t(0)), // bi + ::testing::Values(gtint_t(0), gtint_t(2)), // bj + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // bexval + ::testing::Values(gtint_t(0), gtint_t(2)), // ci + ::testing::Values(gtint_t(1), gtint_t(3)), // cj + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // cexval + 
::testing::Values(dcomplex{-2.2, 3.3}), // alpha + ::testing::Values(dcomplex{1.2, -2.3}), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::ZGemmEVMatPrint() + ); + +// Testing the fringe cases +// Fringe case minimum size is 2 along both m and n. +// Involves only one load(AVX2 or (AVX2+SSE)). Thus, +// the exception values are induced at the first and second indices of the +// column vector A and row vector B. +INSTANTIATE_TEST_SUITE_P( + bli_zgemm_4x4_avx2_k1_nn_evt_mat_fringe, + ZGemmEVTTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Values(gtint_t(2), gtint_t(3)), // m + ::testing::Values(gtint_t(2), gtint_t(3)), // n + ::testing::Values(gtint_t(1)), // k + ::testing::Values(gtint_t(0), gtint_t(1)), // ai + ::testing::Values(gtint_t(0)), // aj + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // aexval + ::testing::Values(gtint_t(0)), // bi + ::testing::Values(gtint_t(0), gtint_t(1)), // bj + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // bexval + ::testing::Values(gtint_t(0), gtint_t(1)), // ci + ::testing::Values(gtint_t(0), gtint_t(1)), // cj + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // cexval + ::testing::Values(dcomplex{-2.2, 3.3}), // alpha + ::testing::Values(dcomplex{1.2, -2.3}), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::ZGemmEVMatPrint() + ); + +// Exception value testing(on alpha 
and beta) +// Alpha and beta are set to exception values +INSTANTIATE_TEST_SUITE_P( + bli_zgemm_4x4_avx2_k1_nn_evt_alphabeta, + ZGemmEVTTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Values(gtint_t(2), gtint_t(3), gtint_t(4)), // m + ::testing::Values(gtint_t(2), gtint_t(3), gtint_t(4)), // n + ::testing::Values(gtint_t(1)), // k + ::testing::Values(gtint_t(0)), // ai + ::testing::Values(gtint_t(0)), // aj + ::testing::Values(dcomplex{0.0, 0.0}), + ::testing::Values(gtint_t(0)), // bi + ::testing::Values(gtint_t(0)), // bj + ::testing::Values(dcomplex{0.0, 0.0}), + ::testing::Values(gtint_t(0)), // ci + ::testing::Values(gtint_t(0)), // cj + ::testing::Values(dcomplex{0.0, 0.0}), + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // alpha + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, + dcomplex{3.4, NaN}, dcomplex{NaN, -Inf}), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::ZGemmEVAlphaBetaPrint() + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp index 068c8398b8..94bb6fb914 100644 --- a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp @@ -35,7 +35,7 @@ #include #include "test_gemm.h" -class ZGemmTest : +class ZGemmAccTest : public ::testing::TestWithParam> {}; -TEST_P(ZGemmTest, RandomData) +TEST_P(ZGemmAccTest, Unit_Tester) { using T = dcomplex; //---------------------------------------------------------- @@ -87,7 +87,7 @@ TEST_P(ZGemmTest, RandomData) test_gemm( storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, 
alpha, beta, thresh ); } -class ZGemmTestPrint { +class ZGemmAccPrint { public: std::string operator()( testing::TestParamInfo> str) const { @@ -114,12 +114,8 @@ class ZGemmTestPrint { str_name = str_name + "_" + std::to_string(m); str_name = str_name + "_" + std::to_string(n); str_name = str_name + "_" + std::to_string(k); - std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); - alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); - std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); - beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); - str_name = str_name + "_a" + alpha_str; - str_name = str_name + "_b" + beta_str; + str_name = str_name + "_a" + testinghelpers::get_value_string(alpha);; + str_name = str_name + "_b" + testinghelpers::get_value_string(beta);; str_name = str_name + "_" + std::to_string(lda_inc); str_name = str_name + "_" + std::to_string(ldb_inc); str_name = str_name + "_" + std::to_string(ldc_inc); @@ -127,10 +123,41 @@ class ZGemmTestPrint { } }; +// Unit testing for bli_zgemm_4x4_avx2_k1_nn kernel +/* From the BLAS layer(post parameter checking), the inputs will be redirected to this kernel + if m != 1, n !=1 and k == 1 */ + +INSTANTIATE_TEST_SUITE_P( + bli_zgemm_4x4_avx2_k1_nn, + ZGemmAccTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Range(gtint_t(2), gtint_t(8), 1), // m + ::testing::Range(gtint_t(2), gtint_t(8), 1), // n + ::testing::Values(gtint_t(1)), // k + ::testing::Values(dcomplex{1.0, 0.0}, dcomplex{-1.0, 0.0}, + dcomplex{0.0, 1.0}, dcomplex{2.1, -1.9}, + dcomplex{0.0, 0.0}), // 
alpha + ::testing::Values(dcomplex{1.0, 0.0}, dcomplex{-1.0, 0.0}, + dcomplex{0.0, 1.0}, dcomplex{2.1, -1.9}, + dcomplex{0.0, 0.0}), // beta + ::testing::Values(gtint_t(0), gtint_t(3)), // increment to the leading dim of a + ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b + ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c + ), + ::ZGemmAccPrint() + ); + // Black box testing. INSTANTIATE_TEST_SUITE_P( Blackbox, - ZGemmTest, + ZGemmAccTest, ::testing::Combine( ::testing::Values('c' #ifndef TEST_BLAS @@ -148,5 +175,5 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)), // increment to the leading dim of b ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), - ::ZGemmTestPrint() + ::ZGemmAccPrint() ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp index 469385f1a1..04e9d1fc37 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp @@ -88,10 +88,10 @@ class dnrm2_TestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_i" + std::to_string(i); - std::string iexval_str = getValueString(iexval); + std::string iexval_str = testinghelpers::get_value_string(iexval); str_name = str_name + "_" + iexval_str; str_name = str_name + "_j" + std::to_string(j); - std::string jexval_str = getValueString(jexval); + std::string jexval_str = testinghelpers::get_value_string(jexval); str_name = str_name + "_" + jexval_str; return str_name; } diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp index 4615a18356..3d61719eea 100644 --- a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp @@ -88,10 +88,10 @@ class dznrm2_TestPrint{ std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_i" + std::to_string(i); - std::string iexval_str = "_Re_" + getValueString(iexval.real) + "_Im_" + getValueString(iexval.imag); + std::string iexval_str = "_Re_" + testinghelpers::get_value_string(iexval.real) + "_Im_" + testinghelpers::get_value_string(iexval.imag); str_name = str_name + iexval_str; str_name = str_name + "_j" + std::to_string(j); - std::string jexval_str = "_Re_" + getValueString(jexval.real) + "_Im_" + getValueString(jexval.imag); + std::string jexval_str = "_Re_" + testinghelpers::get_value_string(jexval.real) + "_Im_" + testinghelpers::get_value_string(jexval.imag); str_name = str_name + jexval_str; return str_name; } diff --git a/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp index fa1d7abc97..52ba4f8647 100644 --- a/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/scnrm2_extreme_values.cpp @@ -88,10 +88,10 @@ class 
scnrm2_TestPrint{ std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_i" + std::to_string(i); - std::string iexval_str = "_Re_" + getValueString(iexval.real) + "_Im_" + getValueString(iexval.imag); + std::string iexval_str = "_Re_" + testinghelpers::get_value_string(iexval.real) + "_Im_" + testinghelpers::get_value_string(iexval.imag); str_name = str_name + iexval_str; str_name = str_name + "_j" + std::to_string(j); - std::string jexval_str = "_Re_" + getValueString(jexval.real) + "_Im_" + getValueString(jexval.imag); + std::string jexval_str = "_Re_" + testinghelpers::get_value_string(jexval.real) + "_Im_" + testinghelpers::get_value_string(jexval.imag); str_name = str_name + jexval_str; return str_name; } diff --git a/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp index 8de5e6aac2..5bfa83a346 100644 --- a/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/snrm2_extreme_values.cpp @@ -88,10 +88,10 @@ class snrm2_TestPrint { std::string incx_str = ( incx > 0) ? 
std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name = str_name + "_" + incx_str; str_name = str_name + "_i" + std::to_string(i); - std::string iexval_str = getValueString(iexval); + std::string iexval_str = testinghelpers::get_value_string(iexval); str_name = str_name + "_" + iexval_str; str_name = str_name + "_j" + std::to_string(j); - std::string jexval_str = getValueString(jexval); + std::string jexval_str = testinghelpers::get_value_string(jexval); str_name = str_name + "_" + jexval_str; return str_name; } diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h index 270269d571..a0ec2f3b35 100644 --- a/gtestsuite/testsuite/util/nrm2/test_nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h @@ -98,19 +98,4 @@ void test_nrm2( gtint_t n, gtint_t incx, gtint_t i, T iexval, gtint_t j = 0, T j //---------------------------------------------------------- // Compare using NaN/Inf checks. computediff( norm, norm_ref, true ); -} - -// Helper function that returns a string with the correct NaN/Inf printing -// so that we can print the test names correctly from using parametrized testing. -template -std::string getValueString(T exval) -{ - std::string exval_str; - if(std::isnan(exval)) - exval_str = "nan"; - else if(std::isinf(exval)) - exval_str = (exval > 0) ? "inf" : "minus_inf"; - else - exval_str = ( exval > 0) ? 
std::to_string(int(exval)) : "minus_" + std::to_string(int(std::abs(exval))); - return exval_str; } \ No newline at end of file From 9d19effec5e8e953d2945785d678643ad1911f7e Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 8 Sep 2023 17:36:11 +0530 Subject: [PATCH 132/226] Fix for non-x86 builds: cpuid query functions Ensure functions bli_cpuid_query_id() and bli_cpuid_query_model_id() are defined for all architectures in bli_cpuid.c AMD-Internal: [CPUPL-3838] Change-Id: I7b0582a4d63d9f28076761749cf5c24d87316f3e --- frame/base/bli_cpuid.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index 91bd6d8e6d..4f64e0c756 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -892,6 +892,12 @@ arch_t bli_cpuid_query_id( void ) return BLIS_ARCH_GENERIC; } +model_t bli_cpuid_query_model_id( arch_t arch_id ) +{ + // Set default for architectures where separate models haven't been defined. + return BLIS_MODEL_DEFAULT; +} + bool bli_cpuid_is_thunderx2 ( uint32_t family, @@ -967,6 +973,22 @@ bool bli_cpuid_is_cortexa9 return TRUE; } +#else + +// Define basic versions of these functions for architectures not explicitly +// handled above. + +arch_t bli_cpuid_query_id( void ) +{ + return BLIS_ARCH_GENERIC; +} + +model_t bli_cpuid_query_model_id( arch_t arch_id ) +{ + // Set default for architectures where separate models haven't been defined. + return BLIS_MODEL_DEFAULT; +} + #endif // ----------------------------------------------------------------------------- From 09e34fd2bd338832148ae2e92094926b717c49ef Mon Sep 17 00:00:00 2001 From: orequest Date: Wed, 30 Aug 2023 09:13:00 +0530 Subject: [PATCH 133/226] Added optimised CGEMM function pointers in zen4 cntx 1. Two CGEMM function pointers are added for different storage schemes 1. bli_cgemmsup_rv_zen_asm_3x8m 2. bli_cgemmsup_rv_zen_asm_3x8n 2. 
In previous commit: (Level-3 triangular routines now use different block sizes and kernels Commit Id: 79e174ff0aaa86ea9cdb2e74b7f6ca71465e6191) 1. bli_cntx_set_l3_sup_tri_kers cntx function was created 2. Function holds optimised function pointers for GEMMT/SYRK API's 3. It avoids over riding default block sizes which improves the performance 4. This function did not include optimised CGEMM function pointers leading to regression as reference kernels were invoked 3. With this commit, 2 optimized CGEMM function pointers are added in bli_cntx_set_l3_sup_tri_kers 1. This fixes the regression as optimized CGEMM functions are invoked AMD-Internal: [CPUPL-3831] [CPUPL-3830] Change-Id: Ie8b41a5e62439de2a65e7df0b07d63ee2383e51e --- config/zen4/bli_cntx_init_zen4.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index f5b7be7de5..8ef336d43b 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -365,7 +365,7 @@ void bli_cntx_init_zen4( cntx_t* cntx ) bli_cntx_set_l3_sup_tri_kers ( - 24, + 30, BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, @@ -382,6 +382,12 @@ void bli_cntx_init_zen4( cntx_t* cntx ) BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, 
BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, @@ -392,4 +398,4 @@ void bli_cntx_init_zen4( cntx_t* cntx ) BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, cntx ); -} \ No newline at end of file +} From 0d16d952dc4c6396dd7d3dd25c2b727f67c4debb Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Mon, 28 Aug 2023 15:29:58 -0400 Subject: [PATCH 134/226] BLIS: DTL enhancements Several improvements to BLIS DTL functionality - For APIs that report performance statistics, test for time=0.0 before dividing by time when calculating GFLOPS. - Call AOCL_DTL_TRACE_EXIT in the parameter checking functions inlined from ./frame/compat/check/bla_*_check.h - Correct flop count for complex routines. AMD-Internal: [CPUPL-3736] Change-Id: Icc515d88810dd79e66e22ea8c47d84649ca9f768 --- aocl_dtl/aocldtl_blis.c | 72 ++++++++++++---- aocl_dtl/aocldtl_blis.h | 28 +++--- frame/compat/bla_gemm.c | 22 ++--- frame/compat/bla_gemm_amd.c | 60 ++++++------- frame/compat/bla_gemmt.c | 8 +- frame/compat/bla_nrm2.c | 2 +- frame/compat/bla_trsm_amd.c | 117 +++++++++++++------------- frame/compat/check/bla_gemm3m_check.h | 5 ++ frame/compat/check/bla_gemm_check.h | 6 ++ frame/compat/check/bla_gemmt_check.h | 6 ++ frame/compat/check/bla_gemv_check.h | 5 ++ frame/compat/check/bla_ger_check.h | 5 ++ frame/compat/check/bla_hemm_check.h | 5 ++ frame/compat/check/bla_hemv_check.h | 5 ++ frame/compat/check/bla_her2_check.h | 5 ++ frame/compat/check/bla_her2k_check.h | 5 ++ frame/compat/check/bla_her_check.h | 5 ++ frame/compat/check/bla_herk_check.h | 5 ++ frame/compat/check/bla_syr2k_check.h | 5 ++ frame/compat/check/bla_syrk_check.h | 5 ++ frame/compat/check/bla_trmm_check.h | 5 ++ frame/compat/check/bla_trmv_check.h | 5 ++ frame/compat/check/bla_trsm_check.h | 59 ++++++++++++- 23 files changed, 313 insertions(+), 132 deletions(-) diff --git a/aocl_dtl/aocldtl_blis.c b/aocl_dtl/aocldtl_blis.c index 078da3b5db..b9d74242a8 
100755 --- a/aocl_dtl/aocldtl_blis.c +++ b/aocl_dtl/aocldtl_blis.c @@ -92,6 +92,7 @@ void AOCL_DTL_log_gemm_sizes(int8 loglevel, } void AOCL_DTL_log_gemm_stats(int8 loglevel, + char dt_type, const f77_int m, const f77_int n, const f77_int k) @@ -99,33 +100,52 @@ void AOCL_DTL_log_gemm_stats(int8 loglevel, char buffer[256]; double flops = 2.0 * m * n * k; + if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z') + { + flops = 4.0 * flops; + } // Execution time is in micro seconds. Double execution_time = AOCL_DTL_get_time_spent(); - sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", - AOCL_get_requested_threads_count(), - execution_time/1000.0, - flops/(execution_time * 1e3)); + if (execution_time != 0.0) + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + else + sprintf(buffer, " nt=%ld %.3f ms", + AOCL_get_requested_threads_count(), + execution_time/1000.0); DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); } void AOCL_DTL_log_gemmt_stats(int8 loglevel, + char dt_type, const f77_int n, const f77_int k) { char buffer[256]; double flops = n * n * k; + if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z') + { + flops = 4.0 * flops; + } // Execution time is in micro seconds. 
Double execution_time = AOCL_DTL_get_time_spent(); - sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", - AOCL_get_requested_threads_count(), - execution_time/1000.0, - flops/(execution_time * 1e3)); + if (execution_time != 0.0) + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + else + sprintf(buffer, " nt=%ld %.3f ms", + AOCL_get_requested_threads_count(), + execution_time/1000.0); DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); } @@ -164,6 +184,7 @@ void AOCL_DTL_log_trsm_sizes(int8 loglevel, } void AOCL_DTL_log_trsm_stats(int8 loglevel, + char dt_type, f77_char side, const f77_int m, const f77_int n) @@ -179,14 +200,23 @@ void AOCL_DTL_log_trsm_stats(int8 loglevel, { flops = 1.0 * m * n * n; } + if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z') + { + flops = 4.0 * flops; + } // Execution time is in micro seconds. Double execution_time = AOCL_DTL_get_time_spent(); - sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", - AOCL_get_requested_threads_count(), - execution_time/1000.0, - flops/(execution_time * 1e3)); + if (execution_time != 0.0) + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + else + sprintf(buffer, " nt=%ld %.3f ms", + AOCL_get_requested_threads_count(), + execution_time/1000.0); DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); } @@ -698,19 +728,29 @@ void AOCL_DTL_log_nrm2_sizes(int8 loglevel, } void AOCL_DTL_log_nrm2_stats(int8 loglevel, + char dt_type, const f77_int n) { char buffer[256]; double flops = 2.0 * n; + if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z') + { + flops = 2.0 * flops; + } // Execution time is in micro seconds. 
Double execution_time = AOCL_DTL_get_time_spent(); - sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", - AOCL_get_requested_threads_count(), - execution_time/1000.0, - flops/(execution_time * 1e3)); + if (execution_time != 0.0) + sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS", + AOCL_get_requested_threads_count(), + execution_time/1000.0, + flops/(execution_time * 1e3)); + else + sprintf(buffer, " nt=%ld %.3f ms", + AOCL_get_requested_threads_count(), + execution_time/1000.0); DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer); } diff --git a/aocl_dtl/aocldtl_blis.h b/aocl_dtl/aocldtl_blis.h index 924dcc7445..e01d80efd3 100755 --- a/aocl_dtl/aocldtl_blis.h +++ b/aocl_dtl/aocldtl_blis.h @@ -33,11 +33,13 @@ void AOCL_DTL_log_gemm_sizes(int8 loglevel, int line); void AOCL_DTL_log_gemm_stats(int8 loglevel, + char dt_type, const f77_int m, const f77_int n, const f77_int k); void AOCL_DTL_log_trsm_stats(int8 loglevel, + char dt_type, f77_char side, const f77_int m, const f77_int n); @@ -74,6 +76,7 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel, int line); void AOCL_DTL_log_gemmt_stats(int8 loglevel, + char dt_type, const f77_int n, const f77_int k); @@ -253,6 +256,7 @@ void AOCL_DTL_log_nrm2_sizes( int8 loglevel, int line); void AOCL_DTL_log_nrm2_stats(int8 loglevel, + char dt_type, const f77_int n); void AOCL_DTL_log_amax_sizes ( int8 loglevel, @@ -401,22 +405,22 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, AOCL_DTL_log_gemm_sizes(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc, \ __FILE__, __FUNCTION__, __LINE__); -#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k) \ +#define AOCL_DTL_LOG_GEMM_STATS(loglevel, dt_type, m, n, k) \ if (gbIsLoggingEnabled) \ - AOCL_DTL_log_gemm_stats(loglevel, m, n, k); + AOCL_DTL_log_gemm_stats(loglevel, dt_type, m, n, k); -#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, n, k) \ +#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, dt_type, n, k) \ if (gbIsLoggingEnabled) \ - AOCL_DTL_log_gemmt_stats(loglevel, n, k); + 
AOCL_DTL_log_gemmt_stats(loglevel, dt_type, n, k); #define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) \ if (gbIsLoggingEnabled) \ AOCL_DTL_log_trsm_sizes(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb, \ __FILE__, __FUNCTION__, __LINE__); -#define AOCL_DTL_LOG_TRSM_STATS(loglevel, side, m, n) \ +#define AOCL_DTL_LOG_TRSM_STATS(loglevel, dt_type, side, m, n) \ if (gbIsLoggingEnabled) \ - AOCL_DTL_log_trsm_stats(loglevel, side, m, n); + AOCL_DTL_log_trsm_stats(loglevel, dt_type, side, m, n); #define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc) \ if (gbIsLoggingEnabled) \ @@ -480,9 +484,9 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, if (gbIsLoggingEnabled) \ AOCL_DTL_log_nrm2_sizes(loglevel, dt_type, n, incx, __FILE__,__FUNCTION__,__LINE__); -#define AOCL_DTL_LOG_NRM2_STATS(loglevel, n) \ +#define AOCL_DTL_LOG_NRM2_STATS(loglevel, dt_type, n) \ if (gbIsLoggingEnabled) \ - AOCL_DTL_log_nrm2_stats(loglevel, n); + AOCL_DTL_log_nrm2_stats(loglevel, dt_type, n); #define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy) \ if (gbIsLoggingEnabled) \ @@ -555,15 +559,15 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, #define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc) -#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k) +#define AOCL_DTL_LOG_GEMM_STATS(loglevel, dt_type, m, n, k) #define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) -#define AOCL_DTL_LOG_TRSM_STATS(loglevel, side, m, n) +#define AOCL_DTL_LOG_TRSM_STATS(loglevel, dt_type, side, m, n) #define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc) -#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, n, k) +#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, dt_type, n, k) #define AOCL_DTL_LOG_HEMM_INPUTS(loglevel, dt_type, side, uplo, m, n, alpha, lda, 
ldb, beta, ldc) @@ -589,7 +593,7 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel, #define AOCL_DTL_LOG_NRM2_INPUTS(loglevel, dt_type, n, incx) -#define AOCL_DTL_LOG_NRM2_STATS(loglevel, n) +#define AOCL_DTL_LOG_NRM2_STATS(loglevel, dt_type, n) #define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy) diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c index e7576096cd..b3d322370e 100644 --- a/frame/compat/bla_gemm.c +++ b/frame/compat/bla_gemm.c @@ -91,7 +91,7 @@ void PASTEF77S(ch,blasname) \ if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -117,7 +117,7 @@ void PASTEF77S(ch,blasname) \ NULL, NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -158,7 +158,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);\ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k);\ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -235,7 +235,7 @@ void PASTEF77S(ch,blasname) \ if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ @@ -261,7 +261,7 @@ void PASTEF77S(ch,blasname) \ NULL, NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -315,7 +315,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ } \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -351,7 +351,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ } \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -390,7 +390,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -463,7 +463,7 @@ void dzgemm_ if ( *m == 0 || *n == 0 || (( PASTEMAC(z,eq0)( *alpha ) || *k == 0) && PASTEMAC(z,eq1)( *beta ) )) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -489,7 +489,7 @@ void dzgemm_ NULL, NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -542,7 +542,7 @@ void dzgemm_ bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index 0cb0afa405..decd7e1aa5 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -91,7 +91,7 @@ void PASTEF77S(ch,blasname) \ if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -117,7 +117,7 @@ void PASTEF77S(ch,blasname) \ NULL, NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -158,7 +158,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);\ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k);\ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -236,7 +236,7 @@ void PASTEF77S(ch,blasname) \ if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ @@ -262,7 +262,7 @@ void PASTEF77S(ch,blasname) \ NULL, NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -316,7 +316,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ } \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -352,7 +352,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ } \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -391,7 +391,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *m, *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -461,7 +461,7 @@ void dgemm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 || ((*alpha == 0.0 || *k == 0) && *beta == 1.0)) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -487,7 +487,7 @@ void dgemm_blis_impl NULL, NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -555,7 +555,7 @@ void dgemm_blis_impl NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -571,7 +571,7 @@ void dgemm_blis_impl (double*)beta, c, *ldc ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS */ bli_finalize_auto(); @@ -609,7 +609,7 @@ void dgemm_blis_impl ); } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS */ bli_finalize_auto(); @@ -645,7 +645,7 @@ void dgemm_blis_impl ((void*)0) ); } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS */ bli_finalize_auto(); @@ -698,7 +698,7 @@ void dgemm_blis_impl NULL, NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ @@ -741,7 +741,7 @@ void dgemm_blis_impl if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -754,7 +754,7 @@ void dgemm_blis_impl err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL); if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS */ bli_finalize_auto(); @@ -776,7 +776,7 @@ void dgemm_blis_impl /* NULL */ /* ); */ - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -843,7 +843,7 @@ void zgemm_blis_impl if ( *m == 0 || *n == 0 || (( PASTEMAC(z,eq0)( *alpha ) || *k == 0) && PASTEMAC(z,eq1)( *beta ) )) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -869,7 +869,7 @@ void zgemm_blis_impl NULL, NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -928,7 +928,7 @@ void zgemm_blis_impl ); } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); bli_finalize_auto(); return; } @@ -965,7 +965,7 @@ void zgemm_blis_impl ); } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); bli_finalize_auto(); return; } @@ -1019,7 +1019,7 @@ void zgemm_blis_impl NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1044,7 +1044,7 @@ void zgemm_blis_impl c, *ldc ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS */ bli_finalize_auto(); @@ -1108,7 +1108,7 @@ void zgemm_blis_impl if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1120,7 +1120,7 @@ void zgemm_blis_impl err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL); if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1130,7 +1130,7 @@ void zgemm_blis_impl // fall back on native path when zgemm is not handled in sup path. bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1200,7 +1200,7 @@ void dzgemm_blis_impl if ( *m == 0 || *n == 0 || (( PASTEMAC(z,eq0)( *alpha ) || *k == 0) && PASTEMAC(z,eq1)( *beta ) )) { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1226,7 +1226,7 @@ void dzgemm_blis_impl NULL, NULL ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1278,7 +1278,7 @@ void dzgemm_blis_impl // fall back on native path when zgemm is not handled in sup path. bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); /* Finalize BLIS. */ bli_finalize_auto(); diff --git a/frame/compat/bla_gemmt.c b/frame/compat/bla_gemmt.c index e2a25321ec..815cab7372 100644 --- a/frame/compat/bla_gemmt.c +++ b/frame/compat/bla_gemmt.c @@ -89,7 +89,7 @@ void PASTEF77S(ch,blasname) \ if ( *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ - AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -130,7 +130,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -203,7 +203,7 @@ void PASTEF77S(ch,blasname) \ if ( *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \ && PASTEMAC(ch,eq1)( *beta ) )) \ { \ - AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -263,7 +263,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *n, *k); \ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *k); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ diff --git a/frame/compat/bla_nrm2.c b/frame/compat/bla_nrm2.c index e17baf282c..89a17f7f83 100755 --- a/frame/compat/bla_nrm2.c +++ b/frame/compat/bla_nrm2.c @@ -75,7 +75,7 @@ ftype_r PASTEF772S(chr,chx,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_NRM2_STATS(AOCL_DTL_LEVEL_TRACE_1, *n); \ + AOCL_DTL_LOG_NRM2_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(chx), *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c index 382989b621..0e627f7832 100644 --- a/frame/compat/bla_trsm_amd.c +++ b/frame/compat/bla_trsm_amd.c @@ -58,7 +58,10 @@ void PASTEF77S(ch,blasname) \ ftype* b, const f77_int* ldb \ ) \ { \ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \ + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), \ + *side, *uploa,*transa, *diaga, *m, *n, \ + (void*)alpha,*lda, *ldb); \ \ side_t blis_side; \ uplo_t blis_uploa; \ @@ -89,7 +92,7 @@ void PASTEF77S(ch,blasname) \ /* Quick return if possible. */ \ if ( *m == 0 || *n == 0 ) \ { \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -124,7 +127,7 @@ void PASTEF77S(ch,blasname) \ (ftype*) b, rs_b, cs_b, \ NULL, NULL \ ); \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -147,7 +150,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ @@ -219,7 +222,7 @@ void PASTEF77S(ch,blasname) \ /* Quick return if possible. */ \ if ( *m == 0 || *n == 0 ) \ { \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -255,7 +258,7 @@ void PASTEF77S(ch,blasname) \ (ftype*) b, rs_b, cs_b, \ NULL, NULL \ ); \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -304,7 +307,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)b, rs_b, \ NULL \ ); \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -321,7 +324,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)b, rs_b, \ NULL \ ); \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -347,7 +350,7 @@ void PASTEF77S(ch,blasname) \ PASTEMAC(ch,invscals)( a_conj, b[indx] ); \ } \ }\ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -371,7 +374,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)a, cs_a, rs_a, \ (ftype*)b, cs_b, \ NULL); \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -390,7 +393,7 @@ void PASTEF77S(ch,blasname) \ (ftype*)a, cs_a, rs_a, \ (ftype*)b, cs_b, \ 
NULL); \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -416,7 +419,7 @@ void PASTEF77S(ch,blasname) \ PASTEMAC(ch,invscals)( a_conj, b[indx*cs_b] ); \ }\ } \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ return; \ } \ @@ -453,7 +456,7 @@ void PASTEF77S(ch,blasname) \ NULL \ ); \ \ - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); \ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *m, *n); \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ @@ -491,7 +494,7 @@ void strsm_blis_impl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 's', + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *uploa,*transa, *diaga, *m, *n, (void*)alpha,*lda, *ldb); @@ -523,7 +526,7 @@ void strsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -559,7 +562,7 @@ void strsm_blis_impl (float*) b, rs_b, cs_b, NULL, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -583,7 +586,7 @@ void strsm_blis_impl (float*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -600,7 +603,7 @@ void strsm_blis_impl (float*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -625,7 +628,7 @@ void strsm_blis_impl b[indx] = ( inva * b[indx] ); } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -652,7 +655,7 @@ void strsm_blis_impl (float*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -674,7 +677,7 @@ void strsm_blis_impl (float*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -699,7 +702,7 @@ void strsm_blis_impl b[indx*cs_b] = (inva * b[indx*cs_b] ); } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -751,7 +754,7 @@ void strsm_blis_impl ); if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -771,7 +774,7 @@ void strsm_blis_impl NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. */ bli_finalize_auto(); @@ -807,7 +810,7 @@ void dtrsm_blis_impl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'd', + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *uploa,*transa, *diaga, *m, *n, (void*)alpha,*lda, *ldb); @@ -839,7 +842,7 @@ void dtrsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -875,7 +878,7 @@ void dtrsm_blis_impl (double*) b, rs_b, cs_b, NULL, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -899,7 +902,7 @@ void dtrsm_blis_impl (double*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -916,7 +919,7 @@ void dtrsm_blis_impl (double*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -941,7 +944,7 @@ void dtrsm_blis_impl b[indx] = ( inva * b[indx] ); } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -968,7 +971,7 @@ void dtrsm_blis_impl (double*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -990,7 +993,7 @@ void dtrsm_blis_impl (double*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1015,7 +1018,7 @@ void dtrsm_blis_impl b[indx*cs_b] = (inva * b[indx*cs_b] ); } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1137,7 +1140,7 @@ void dtrsm_blis_impl } if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1155,7 +1158,7 @@ void dtrsm_blis_impl NULL, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. */ bli_finalize_auto(); @@ -1192,7 +1195,7 @@ void ztrsm_blis_impl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'z', + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *uploa,*transa, *diaga, *m, *n, (void*)alpha,*lda, *ldb); @@ -1224,7 +1227,7 @@ void ztrsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1260,7 +1263,7 @@ void ztrsm_blis_impl (dcomplex*) b, rs_b, cs_b, NULL, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1284,7 +1287,7 @@ void ztrsm_blis_impl (dcomplex*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1301,7 +1304,7 @@ void ztrsm_blis_impl (dcomplex*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1356,7 +1359,7 @@ void ztrsm_blis_impl } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1383,7 +1386,7 @@ void ztrsm_blis_impl (dcomplex*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1405,7 +1408,7 @@ void ztrsm_blis_impl (dcomplex*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1459,7 +1462,7 @@ void ztrsm_blis_impl } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; @@ -1514,7 +1517,7 @@ void ztrsm_blis_impl ); if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1534,7 +1537,7 @@ void ztrsm_blis_impl NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. */ bli_finalize_auto(); @@ -1571,7 +1574,7 @@ void ctrsm_blis_impl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'c', + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *uploa,*transa, *diaga, *m, *n, (void*)alpha,*lda, *ldb); @@ -1603,7 +1606,7 @@ void ctrsm_blis_impl /* Quick return if possible. */ if ( *m == 0 || *n == 0 ) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. */ bli_finalize_auto(); @@ -1639,7 +1642,7 @@ void ctrsm_blis_impl (scomplex*) b, rs_b, cs_b, NULL, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1663,7 +1666,7 @@ void ctrsm_blis_impl (scomplex*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1680,7 +1683,7 @@ void ctrsm_blis_impl (scomplex*)b, rs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1734,7 +1737,7 @@ void ctrsm_blis_impl } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; @@ -1762,7 +1765,7 @@ void ctrsm_blis_impl (scomplex*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1784,7 +1787,7 @@ void ctrsm_blis_impl (scomplex*)b, cs_b, NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1838,7 +1841,7 @@ void ctrsm_blis_impl } } - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return; } @@ -1891,7 +1894,7 @@ void ctrsm_blis_impl ); if (status == BLIS_SUCCESS) { - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); /* Finalize BLIS. 
*/ bli_finalize_auto(); @@ -1911,7 +1914,7 @@ void ctrsm_blis_impl NULL ); - AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *side, *m, *n); + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) /* Finalize BLIS. */ bli_finalize_auto(); diff --git a/frame/compat/check/bla_gemm3m_check.h b/frame/compat/check/bla_gemm3m_check.h index b5a2887ce0..ffb70bbdda 100644 --- a/frame/compat/check/bla_gemm3m_check.h +++ b/frame/compat/check/bla_gemm3m_check.h @@ -78,6 +78,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_gemm_check.h b/frame/compat/check/bla_gemm_check.h index 8b68b22e0c..ca2add8bca 100644 --- a/frame/compat/check/bla_gemm_check.h +++ b/frame/compat/check/bla_gemm_check.h @@ -79,6 +79,12 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *dt_str, *m, *n, *k); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_gemmt_check.h b/frame/compat/check/bla_gemmt_check.h index fb81c70732..5ba66ee00f 100644 --- a/frame/compat/check/bla_gemmt_check.h +++ b/frame/compat/check/bla_gemmt_check.h @@ -82,6 +82,12 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_LOG_GEMMT_STATS(AOCL_DTL_LEVEL_TRACE_1, *dt_str, *n, *k); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. 
*/ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_gemv_check.h b/frame/compat/check/bla_gemv_check.h index 33b0665f56..6d93cc7ee6 100644 --- a/frame/compat/check/bla_gemv_check.h +++ b/frame/compat/check/bla_gemv_check.h @@ -64,6 +64,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_ger_check.h b/frame/compat/check/bla_ger_check.h index cd2f4c8de3..17719702ce 100644 --- a/frame/compat/check/bla_ger_check.h +++ b/frame/compat/check/bla_ger_check.h @@ -59,6 +59,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_hemm_check.h b/frame/compat/check/bla_hemm_check.h index 342f485f9f..f2a9879aaf 100644 --- a/frame/compat/check/bla_hemm_check.h +++ b/frame/compat/check/bla_hemm_check.h @@ -72,6 +72,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_hemv_check.h b/frame/compat/check/bla_hemv_check.h index 716d434f26..e000df9534 100644 --- a/frame/compat/check/bla_hemv_check.h +++ b/frame/compat/check/bla_hemv_check.h @@ -61,6 +61,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. 
*/ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_her2_check.h b/frame/compat/check/bla_her2_check.h index f9e100612e..a3bc1f8c7a 100644 --- a/frame/compat/check/bla_her2_check.h +++ b/frame/compat/check/bla_her2_check.h @@ -61,6 +61,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_her2k_check.h b/frame/compat/check/bla_her2k_check.h index 631977d245..67df79cc16 100644 --- a/frame/compat/check/bla_her2k_check.h +++ b/frame/compat/check/bla_her2k_check.h @@ -72,6 +72,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_her_check.h b/frame/compat/check/bla_her_check.h index 4120f8bf9d..1a30961fe0 100644 --- a/frame/compat/check/bla_her_check.h +++ b/frame/compat/check/bla_her_check.h @@ -59,6 +59,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_herk_check.h b/frame/compat/check/bla_herk_check.h index dca2314419..39ef7e5296 100644 --- a/frame/compat/check/bla_herk_check.h +++ b/frame/compat/check/bla_herk_check.h @@ -70,6 +70,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. 
*/ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_syr2k_check.h b/frame/compat/check/bla_syr2k_check.h index 66bffae1b5..cf420f704b 100644 --- a/frame/compat/check/bla_syr2k_check.h +++ b/frame/compat/check/bla_syr2k_check.h @@ -77,6 +77,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_syrk_check.h b/frame/compat/check/bla_syrk_check.h index 8a42eaae36..bdbd43dfc1 100644 --- a/frame/compat/check/bla_syrk_check.h +++ b/frame/compat/check/bla_syrk_check.h @@ -75,6 +75,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_trmm_check.h b/frame/compat/check/bla_trmm_check.h index af9d8ce493..4cb050da21 100644 --- a/frame/compat/check/bla_trmm_check.h +++ b/frame/compat/check/bla_trmm_check.h @@ -81,6 +81,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_trmv_check.h b/frame/compat/check/bla_trmv_check.h index e4577738fb..b1fb4faf8b 100644 --- a/frame/compat/check/bla_trmv_check.h +++ b/frame/compat/check/bla_trmv_check.h @@ -70,6 +70,11 @@ bli_string_mkupper( func_str ); \ \ PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. 
*/ \ + bli_finalize_auto(); \ \ return; \ } \ diff --git a/frame/compat/check/bla_trsm_check.h b/frame/compat/check/bla_trsm_check.h index 2372770bc8..2ab7bca1f5 100644 --- a/frame/compat/check/bla_trsm_check.h +++ b/frame/compat/check/bla_trsm_check.h @@ -33,4 +33,61 @@ */ -#define bla_trsm_check bla_trmm_check +#define bla_trsm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ +{ \ + f77_int info = 0; \ + f77_int left, right; \ + f77_int lower, upper; \ + f77_int nota, ta, conja; \ + f77_int unita, nonua; \ + f77_int nrowa; \ +\ + left = PASTE_LSAME( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ + right = PASTE_LSAME( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ + lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ + upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ + nota = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \ + ta = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \ + conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \ + unita = PASTE_LSAME( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ + nonua = PASTE_LSAME( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ +\ + if ( left ) { nrowa = *m; } \ + else { nrowa = *n; } \ +\ + if ( !left && !right ) \ + info = 1; \ + else if ( !lower && !upper ) \ + info = 2; \ + else if ( !nota && !ta && !conja ) \ + info = 3; \ + else if ( !unita && !nonua ) \ + info = 4; \ + else if ( *m < 0 ) \ + info = 5; \ + else if ( *n < 0 ) \ + info = 6; \ + else if ( *lda < bli_max( 1, nrowa ) ) \ + info = 9; \ + else if ( *ldb < bli_max( 1, *m ) ) \ + info = 11; \ +\ + if ( info != 0 ) \ + { \ + char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ +\ + sprintf( func_str, "%s%-5s", dt_str, op_str ); \ +\ + bli_string_mkupper( func_str ); \ +\ + PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *dt_str, *side, *m, *n); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ +\ + /* Finalize BLIS. 
*/ \ + bli_finalize_auto(); \ +\ + return; \ + } \ +} From 15f9a747af40516ea7d17e819d8412ec037260bf Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 8 Sep 2023 18:14:11 +0530 Subject: [PATCH 135/226] Fix for non-x86 builds: bli_gemmt_sup_var1n2m.c bli_gemmt_sup_var1n2m.c contained x86 specific code. Move to frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c and restore bli_gemmt_sup_var1n2m.c as of commit 10ca8710f0 as variant for non-AMD codepath builds. AMD-Internal: [CPUPL-3838] Change-Id: I88db20b93b2dbcbbf5092a4cb78f14dd1179975f --- frame/3/gemmt/CMakeLists.txt | 20 +- frame/3/gemmt/bli_gemmt_sup_var1n2m.c | 586 +---- frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c | 2798 +++++++++++++++++++++ 3 files changed, 2952 insertions(+), 452 deletions(-) create mode 100644 frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c diff --git a/frame/3/gemmt/CMakeLists.txt b/frame/3/gemmt/CMakeLists.txt index d2b8228336..44437e66a3 100644 --- a/frame/3/gemmt/CMakeLists.txt +++ b/frame/3/gemmt/CMakeLists.txt @@ -1,9 +1,25 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## target_sources("${PROJECT_NAME}" PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_front.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_sup_var1n2m.c ) +# Select AMD specific sources for AMD configurations. 
+if(${TARGET_ARCH} STREQUAL zen OR +${TARGET_ARCH} STREQUAL zen2 OR +${TARGET_ARCH} STREQUAL zen3 OR +${TARGET_ARCH} STREQUAL zen4 OR +${TARGET_ARCH} STREQUAL amdzen) + target_sources("${PROJECT_NAME}" + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_sup_var1n2m_amd.c + ) +else() + target_sources("${PROJECT_NAME}" + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_sup_var1n2m.c + ) +endif() + diff --git a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c index 0b5176a6ab..382ca6f67d 100644 --- a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c +++ b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2020 - 23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -55,70 +55,6 @@ typedef void (*FUNCPTR_T) rntm_t* restrict rntm, thrinfo_t* restrict thread ); - - -// Declaration of gemmt specific kernels function pointer -// This is aligned to bli_dgemmsup_rv_haswell_asm_6x8m function protype. 
-typedef void (*gemmt_ker_ft) - ( - conj_t conja, - conj_t conjb, - dim_t m0, - dim_t n0, - dim_t k0, - double* restrict alpha, - double* restrict a, inc_t rs_a0, inc_t cs_a0, - double* restrict b, inc_t rs_b0, inc_t cs_b0, - double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx - ); - -// these kernels are compiled as part of haswell config -// use them only when BLIS_KERNELS_HASWELL is defined -#ifdef BLIS_KERNELS_HASWELL -//Look-up table for Gemmt Upper Variant Kernels -gemmt_ker_ft ker_fpus[14] = - { - bli_dgemmsup_rv_haswell_asm_6x8m_0x0_U, - bli_dgemmsup_rv_haswell_asm_6x8m_6x0_U, - bli_dgemmsup_rv_haswell_asm_6x8m_6x8_U, - bli_dgemmsup_rv_haswell_asm_6x8m_12x8_U, - bli_dgemmsup_rv_haswell_asm_6x8m_12x16_U, - bli_dgemmsup_rv_haswell_asm_6x8m_18x16_U, - bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U, - bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U, - bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U, - bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U, - bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U, - bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U, - bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U, - bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U}; - -//Look-up table for Gemmt Lower Variant Kernels -gemmt_ker_ft ker_fpls[14] = - { - bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L, - bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L, - bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L, - bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L, - bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L, - bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L, - bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L, - bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L, - bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L, - bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L, - bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L, - bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L, - bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L, - bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L - }; -#else -gemmt_ker_ft ker_fpls[1]; -gemmt_ker_ft ker_fpus[1]; -#endif - // // -- var1n 
-------------------------------------------------------------------- // @@ -226,12 +162,6 @@ void bli_gemmtsup_ref_var1n cs_b = bli_obj_row_stride( b ); } - - // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); - - bool uploc; if( bli_obj_is_lower( c ) ) { @@ -255,6 +185,12 @@ void bli_gemmtsup_ref_var1n // function pointer. FUNCPTR_T f = ftypes_var1n[dt][uploc]; +#if 1 + // Optimize some storage/packing cases by transforming them into others. + // These optimizations are expressed by changing trans and/or eff_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); +#endif + if ( bli_is_notrans( trans ) ) { // Invoke the function. @@ -364,31 +300,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ stor_id = bli_stor3_trans( stor_id ); \ \ /* Query the context for various blocksizes. */ \ - dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ - dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ - dim_t NC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ - dim_t MC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ - dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ - /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. */ \ - dim_t MRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_MR, cntx ); \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ -\ - if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC0 ) || ( 0 == MC0 ) || ( 0 == KC0 ) ) \ - { \ - NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ - MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ - } \ - const dim_t MRE = MRM - MR; \ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ dim_t KC; \ if ( packa && packb ) \ @@ -432,6 +348,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \ const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ +\ + /* Query the maximum blocksize for MR, which implies a maximum blocksize + extension for the final iteration. */ \ + const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const dim_t MRE = MRM - MR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = rs_c; \ @@ -451,6 +372,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_c = cs_c * NR; \ const inc_t irstep_b = cs_b * NR; \ */ \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ ctype* restrict a_00 = a; \ ctype* restrict b_00 = b; \ @@ -848,31 +774,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ stor_id = bli_stor3_trans( stor_id ); \ \ /* Query the context for various blocksizes. */ \ - dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ - dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ - dim_t NC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ - dim_t MC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ - dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. */ \ - dim_t MRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_MR, cntx ); \ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ -\ - if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC0 ) || ( 0 == MC0 ) || ( 0 == KC0 ) ) \ - { \ - NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ - MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ - } \ - const dim_t MRE = MRM - MR; \ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t 
KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ dim_t KC; \ if ( packa && packb ) \ @@ -916,6 +822,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \ const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ +\ + /* Query the maximum blocksize for MR, which implies a maximum blocksize + extension for the final iteration. */ \ + const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const dim_t MRE = MRM - MR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = rs_c; \ @@ -935,6 +846,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_c = cs_c * NR; \ const inc_t irstep_b = cs_b * NR; \ */ \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ ctype* restrict a_00 = a; \ ctype* restrict b_00 = b; \ @@ -1380,13 +1296,8 @@ void bli_gemmtsup_ref_var2m cs_b = bli_obj_row_stride( b ); } - - // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); - - bool uploc; + if ( bli_is_notrans ( trans ) ) uploc = bli_obj_is_lower( c ) ? 0 : 1; else @@ -1405,7 +1316,11 @@ void bli_gemmtsup_ref_var2m // function pointer. FUNCPTR_T f = ftypes_var2m[dt][uploc]; - +#if 0 + // Optimize some storage/packing cases by transforming them into others. + // These optimizations are expressed by changing trans and/or eff_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); +#endif if ( bli_is_notrans( trans ) ) { @@ -1507,31 +1422,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ } \ \ /* Query the context for various blocksizes. 
*/ \ - dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ - dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ - dim_t NC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ - dim_t MC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ - dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ - /* Query the maximum blocksize for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - dim_t NRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ -\ - if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC ) || ( 0 == MC ) || ( 0 == KC0 ) ) \ - { \ - NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ - NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ - } \ - const dim_t NRE = NRM - NR; \ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ dim_t KC; \ if ( packa && packb ) \ @@ -1570,6 +1465,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ } \ +\ + /* Query the maximum blocksize for NR, which implies a 
maximum blocksize + extension for the final iteration. */ \ + const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + const dim_t NRE = NRM - NR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ @@ -1593,11 +1493,15 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_a = rs_a * MR; \ */ \ \ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ /* storage-scheme of ct should be same as that of C. Since update routines only support row-major order, - col_pref flag is used to induce transpose to matrices before + col_pref flag is used to induce transpose to matrices before passing to update routine whenever C is col-stored */ \ const bool col_pref = (rs_c == 1)? 1 : 0; \ \ @@ -1929,144 +1833,40 @@ void PASTEMACT(ch,opname,uplo,varname) \ { \ const dim_t mr_cur = (i+MR-1) < mc_cur ? MR : mc_cur - i; \ \ - /* Prerequisites : MR = 6, NR = 8. - An optimization: allow the last jr iteration to contain up to NRE - In DGEMMT API implementation, kernel operates on 6x8 block. MR and - NR are set as 6 and 8 respectively. 24 being the LCM of 6 and 8, - the diagonal pattern repeats for every 24x24 block. - This pattern is exploited to achieve the optimization in diagonal - blocks by computing only the required elements. In the previous - implementation, all the 48 outputs of the given 6x8 block are - computed and stored into a temporary buffer. Later, the required - elements are copied into the final C output buffer. - With this optimization, we are avoiding copy operation and also - reducing the number of computations. 
- Variables m_off_24 and n_off_24 respectively store the m and n - offsets from the starting point of the corresponding 24x24 block. - Variables m_idx and n_idx store indices of the current 6x8 block - along m and n dimensions, in 24x24 block. m_idx is computed as - (m_off_24 / MR) while n_idx is computed as (n_off_24 / NR). - Range of m_idx is 0 <= m_idx <= 3 and the range of n_idx is - 0 <= n_idx <= 2. Based on these indices, for the given 6x8 block, - logic is implemented to identify the relevant kernel from the - look-up table. - During instances, where m is not a multiple of 6 or n is not a - multiple of 8, it goes to the default gemm kernel. MR and NR must be - 6 and 8 for these kernels to achieve the expected functionality.*/ \ -\ - dim_t m_off_24 = m_off_cblock % 24; \ - dim_t n_off_24 = n_off_cblock % 24; \ - dim_t m_idx = (dim_t)(m_off_24 / MR); \ - dim_t n_idx = (dim_t)(n_off_24 / NR); \ -\ - /* Check if m, n indices are multiple of MR and NR respectively - and current block is a complete 6x8 block */ \ - bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0)\ - && (MR == 6) && (NR == 8) \ - && (bli_cpuid_is_avx2fma3_supported() == TRUE) && (mr_cur == MR) && (nr_cur == NR); \ -\ - /* m_idx and n_idx would be equal only if the current block is - a diagonal block */\ - if( (dt == BLIS_DOUBLE) && (m_idx == n_idx) && (idx_supported) ) { \ - /* index of kernel in lookup table is 2*m_idx) */ \ - dim_t ker_idx; \ - ker_idx = m_idx<<1; \ -\ - /* If there is another 6x8 diagonal block pending for computation - after the current 6x8 diagonal block, then the two blocks can - be computed together(12x8). This combined kernel is implemented - only for the case where n_idx = 2 i.e., n_off_24 = 16. To call - this, it has to be ensured that at least 12 rows are pending in - C for computation. (m_off + 2 * MR <=m). 
Usage of this combined - kernel saves the entire time to execute one kernel*/ \ - if( (n_idx == 2) && (m_off_cblock + MR + MR <= m) ) {\ - ker_idx = 6; /* use combined kernel, index of combined kernel - in lookup table is 6 */\ - } \ - /* use rd kernel if B is column major storage */ \ - if( stor_id == BLIS_RRC ) { \ - ker_idx += 7; /* index of rd kernel*/ \ - } \ - gemmt_ker_ft ker_fp = ker_fpls[ker_idx]; \ - ker_fp \ - ( \ - conja, \ - conjb, \ - mr_cur, \ - nr_cur, \ - kc_cur, \ - (double*) alpha_cast, \ - (double*) a_ir, rs_a_use, cs_a_use, \ - (double*) b_jr, rs_b_use, cs_b_use, \ - (double*) beta_use, \ - (double*) c_ir, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ + /* Invoke the gemmsup millikernel. */ \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + alpha_cast, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ + /* Scale the bottom edge of C and add the result from above. */ \ + /* If c and ct are col-major, induce transpose and call update for upper-triangle of C */ \ + if( col_pref ) \ + { \ + PASTEMAC(ch,update_upper_triang)( n_off_cblock, m_off_cblock, \ + nr_cur, mr_cur, \ + ct, cs_ct, rs_ct, \ + beta_use, \ + c_ir, cs_c, rs_c ); \ } \ - /* 6x8 block where m_idx == n_idx+1 also has some parts of the diagonal */\ - else if( (dt == BLIS_DOUBLE) && (m_idx == n_idx+1) && (idx_supported) ) { \ - /* If current block was already computed in the combined kernel it - can be skipped combined kernel is only implemented for n_idx=2, - i == m_zero is only true for the first iteration therefore if - i == m_zero then the current 6x8 block was not computed in - combined kernel*/ \ - if( (n_idx != 2) || (i == m_zero) ) { \ - dim_t ker_idx = (n_idx << 1) + 1; \ - /* use rd kernel if B is column major storage */ \ - if( stor_id == BLIS_RRC ) { ker_idx += 7; } \ - gemmt_ker_ft ker_fp = ker_fpls[ker_idx]; \ - ker_fp \ - ( \ - conja, \ - conjb, \ - mr_cur, \ - nr_cur, \ - 
kc_cur, \ - (double*) alpha_cast, \ - (double*) a_ir, rs_a_use, cs_a_use, \ - (double*) b_jr, rs_b_use, cs_b_use, \ - (double*) beta_use, \ - (double*) c_ir, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ + else \ + { \ + PASTEMAC(ch,update_lower_triang)( m_off_cblock, n_off_cblock, \ + mr_cur, nr_cur, \ + ct, rs_ct, cs_ct, \ + beta_use, \ + c_ir, rs_c, cs_c ); \ } \ - /* Call the regular kernel for non applicable cases */ \ - else { \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - mr_cur, \ - nr_cur, \ - kc_cur, \ - alpha_cast, \ - a_ir, rs_a_use, cs_a_use, \ - b_jr, rs_b_use, cs_b_use, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ - if( col_pref ) \ - { \ - PASTEMAC(ch,update_upper_triang)( n_off_cblock, m_off_cblock, \ - nr_cur, mr_cur, \ - ct, cs_ct, rs_ct, \ - beta_use, \ - c_ir, cs_c, rs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,update_lower_triang)( m_off_cblock, n_off_cblock, \ - mr_cur, nr_cur, \ - ct, rs_ct, cs_ct, \ - beta_use, \ - c_ir, rs_c, cs_c ); \ - }\ - }\ \ a_ir += ps_a_use; \ c_ir += irstep_c; \ @@ -2174,32 +1974,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ } \ \ /* Query the context for various blocksizes. */ \ - dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ - dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ - dim_t NC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ - dim_t MC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ - dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - /* Query the maximum blocksize for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - dim_t NRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ -\ - if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC ) || ( 0 == MC ) || ( 0 == KC0 ) ) \ - { \ - NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ - NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ - } \ - const dim_t NRE = NRM - NR; \ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ dim_t KC; \ if ( packa && packb ) \ @@ -2245,6 +2024,11 @@ void PASTEMACT(ch,opname,uplo,varname) \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ } \ +\ + /* Query the maximum blocksize for NR, which implies a maximum blocksize + extension for the final iteration. */ \ + const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + const dim_t NRE = NRM - NR; \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ @@ -2268,6 +2052,10 @@ void PASTEMACT(ch,opname,uplo,varname) \ const inc_t irstep_a = rs_a * MR; \ */ \ \ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ /* Storage scheme of ct should be same as that of C. @@ -2621,142 +2409,40 @@ void PASTEMACT(ch,opname,uplo,varname) \ for( dim_t i = m_rect;( i < mc_cur) && (m_off_cblock < n_off_cblock + nr_cur); i += MR ) \ { \ const dim_t mr_cur = (i+MR-1) < mc_cur ? MR : mc_cur - i; \ - /* Prerequisites : MR = 6, NR = 8. - An optimization: allow the last jr iteration to contain up to NRE - In DGEMMT API implementation, kernel operates on 6x8 block. MR and - NR are set as 6 and 8 respectively. 24 being the LCM of 6 and 8, - the diagonal pattern repeats for every 24x24 block. - This pattern is exploited to achieve the optimization in diagonal - blocks by computing only the required elements. In the previous - implementation, all the 48 outputs of the given 6x8 block are - computed and stored into a temporary buffer. Later, the required - elements are copied into the final C output buffer. - With this optimization, we are avoiding copy operation and also - reducing the number of computations. - Variables m_off_24 and n_off_24 respectively store the m and n - offsets from the starting point of the corresponding 24x24 block. - Variables m_idx and n_idx store indices of the current 6x8 block - along m and n dimensions, in 24x24 block. m_idx is computed as - (m_off_24 / MR) while n_idx is computed as (n_off_24 / NR). - Range of m_idx is 0 <= m_idx <= 3 and the range of n_idx is - 0 <= n_idx <= 2. Based on these indices, for the given 6x8 block, - logic is implemented to identify the relevant kernel from the - look-up table. - During instances, where m is not a multiple of 6 or n is not a - multiple of 8, it goes to the default gemm kernel. 
MR and NR must be - 6 and 8 for these kernels to achieve the expected functionality.*/ \ - dim_t m_off_24 = m_off_cblock % 24; \ - dim_t n_off_24 = n_off_cblock % 24; \ - dim_t m_idx = (dim_t)(m_off_24 / MR); \ - dim_t n_idx = (dim_t)(n_off_24 / NR); \ -\ - /* Check if m, n indices are multiple of MR and NR respectively - and current block is a complete 6x8 block */ \ - bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0)\ - && (MR == 6) && (NR == 8) \ - && (bli_cpuid_is_avx2fma3_supported() == TRUE) && (mr_cur==MR) && (nr_cur==NR); \ -\ - /* m_idx and n_idx would be equal only if the current block is - a diagonal block */\ - if( (dt == BLIS_DOUBLE) && (m_idx == n_idx) && idx_supported ) { \ - dim_t ker_idx = m_idx<<1; \ - /* If there is another 6x8 diagonal block pending for computation - after the current 6x8 diagonal block, then the two blocks can - be computed together(12x8). This combined kernel is implemented - only for the case where n_idx = 0 i.e., n_off_24 = 0. To call - this, it has to be ensured that at least 12 rows are pending in - C for computation (i+ MR + MR <= mc_cur). 
Usage of this combined - kernel saves the entire time to execute one kernel*/ \ - if( (n_idx == 0) && (i+ MR + MR <= mc_cur) ) { \ - ker_idx = 6; /* use combined kernel, index of combined kernel - in lookup table is 6 */\ - } \ - /* if B is column storage we use rd kernel*/ \ - if( stor_id == BLIS_RRC ) { \ - ker_idx += 7; /* index of rd kernel*/\ - } \ - gemmt_ker_ft ker_fp = ker_fpus[ker_idx]; \ - ker_fp \ - ( \ - conja, \ - conjb, \ - mr_cur, \ - nr_cur, \ - kc_cur, \ - (double*) alpha_cast, \ - (double*) a_ir, rs_a_use, cs_a_use, \ - (double*) b_jr, rs_b_use, cs_b_use, \ - (double*) beta_use, \ - (double*) c_ir, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - /* 6x8 block where m_idx == n_idx+1 also has some parts of the diagonal */\ - else if( (dt == BLIS_DOUBLE) && (m_idx == n_idx+1) && (idx_supported) ) { \ - /* If current block was already computed in the combined kernel it - can be skipped combined kernel is only implemented for n_idx=0, - i == m_rect is only true for the first iteration therefore if - i == m_rect then the current 6x8 block was not computed in - combined kernel*/ \ - if( (n_idx != 0) || (i == m_rect) ) { \ - dim_t ker_idx = (n_idx << 1) + 1 ; \ - /* use rd kernel if B is column major storage */ \ - if( stor_id == BLIS_RRC ) { ker_idx += 7; } \ - gemmt_ker_ft ker_fp = ker_fpus[ker_idx]; \ - ker_fp \ - ( \ - conja, \ - conjb, \ - mr_cur, \ - nr_cur, \ - kc_cur, \ - (double*) alpha_cast, \ - (double*) a_ir, rs_a_use, cs_a_use, \ - (double*) b_jr, rs_b_use, cs_b_use, \ - (double*) beta_use, \ - (double*) c_ir, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ +\ + /* Invoke the gemmsup millikernel. 
*/ \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + alpha_cast, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + if( col_pref ) \ + { \ + PASTEMAC(ch,update_lower_triang)( n_off_cblock, m_off_cblock, \ + nr_cur, mr_cur, \ + ct, cs_ct, rs_ct, \ + beta_use, \ + c_ir, cs_c, rs_c ); \ } \ - /* call the regular kernel for non applicable cases */ \ - else { \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - mr_cur, \ - nr_cur, \ - kc_cur, \ - alpha_cast, \ - a_ir, rs_a_use, cs_a_use, \ - b_jr, rs_b_use, cs_b_use, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ - \ - if( col_pref ) \ - { \ - PASTEMAC(ch,update_lower_triang)( n_off_cblock, m_off_cblock, \ - nr_cur, mr_cur, \ - ct, cs_ct, rs_ct, \ - beta_use, \ - c_ir, cs_c, rs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,update_upper_triang)( m_off_cblock, n_off_cblock, \ - mr_cur, nr_cur, \ - ct, rs_ct, cs_ct, \ - beta_use, \ - c_ir, rs_c, cs_c ); \ - } \ + else \ + { \ + PASTEMAC(ch,update_upper_triang)( m_off_cblock, n_off_cblock, \ + mr_cur, nr_cur, \ + ct, rs_ct, cs_ct, \ + beta_use, \ + c_ir, rs_c, cs_c ); \ } \ -\ a_ir += ps_a_use; \ c_ir += irstep_c; \ m_off_cblock += mr_cur; \ diff --git a/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c b/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c new file mode 100644 index 0000000000..0b5176a6ab --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c @@ -0,0 +1,2798 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020 - 23, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define FUNCPTR_T gemmtsup_fp
+// Signature of the typed gemmt sup variants that the object-based wrappers below dispatch to.
+typedef void (*FUNCPTR_T)
+     (
+       bool             packa,
+       bool             packb,
+       conj_t           conja,
+       conj_t           conjb,
+       dim_t            m,
+       dim_t            n,
+       dim_t            k,
+       void*   restrict alpha,
+       void*   restrict a, inc_t rs_a, inc_t cs_a,
+       void*   restrict b, inc_t rs_b, inc_t cs_b,
+       void*   restrict beta,
+       void*   restrict c, inc_t rs_c, inc_t cs_c,
+       stor3_t          eff_id,
+       cntx_t* restrict cntx,
+       rntm_t* restrict rntm,
+       thrinfo_t* restrict thread
+     );
+
+
+// Declaration of the function pointer type for the gemmt-specific kernels.
+// This is aligned to the bli_dgemmsup_rv_haswell_asm_6x8m function prototype.
+typedef void (*gemmt_ker_ft)
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     );
+
+// These kernels are compiled as part of the haswell config;
+// use them only when BLIS_KERNELS_HASWELL is defined.
+#ifdef BLIS_KERNELS_HASWELL
+// Look-up table for gemmt upper-variant kernels: [0..5] rv kernels, [6] rv combined kernel, [7..13] the rd counterparts in the same order.
+gemmt_ker_ft ker_fpus[14] =
+    {
+     bli_dgemmsup_rv_haswell_asm_6x8m_0x0_U,
+     bli_dgemmsup_rv_haswell_asm_6x8m_6x0_U,
+     bli_dgemmsup_rv_haswell_asm_6x8m_6x8_U,
+     bli_dgemmsup_rv_haswell_asm_6x8m_12x8_U,
+     bli_dgemmsup_rv_haswell_asm_6x8m_12x16_U,
+     bli_dgemmsup_rv_haswell_asm_6x8m_18x16_U,
+     bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U,
+     bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U};
+
+// Look-up table for gemmt lower-variant kernels: [0..5] rv kernels, [6] rv combined kernel, [7..13] the rd counterparts in the same order.
+gemmt_ker_ft ker_fpls[14] =
+    {
+     bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L,
+     bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L,
+     bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L,
+     bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L,
+     bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L,
+     bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L,
+     bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L,
+     bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
+    };
+#else // BLIS_KERNELS_HASWELL not defined: the tables are defined as one-element arrays with no kernels listed.
+gemmt_ker_ft ker_fpls[1];
+gemmt_ker_ft ker_fpus[1];
+#endif // BLIS_KERNELS_HASWELL
+
+//
+// -- var1n 
--------------------------------------------------------------------
+//
+static FUNCPTR_T GENARRAY_T(ftypes_var1n,gemmtsup,ref_var1n);
+// Object-based front end: unpacks the obj_t operands and calls the typed var1n implementation (with A and B swapped when bli_is_notrans( trans ) is false).
+void bli_gemmtsup_ref_var1n
+     (
+       trans_t    trans,
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       stor3_t    eff_id,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       thrinfo_t* thread
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5);
+#if 0 // Disabled alternative: alias A and B and induce fast transpositions on the aliases.
+    obj_t at, bt;
+
+    bli_obj_alias_to( a, &at );
+    bli_obj_alias_to( b, &bt );
+
+    // Induce transpositions on A and/or B if either object is marked for
+    // transposition. We can induce "fast" transpositions since the objects
+    // are guaranteed to not have structure or be packed.
+    if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
+    if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
+
+    const num_t dt = bli_obj_dt( c );
+
+    const conj_t conja = bli_obj_conj_status( a );
+    const conj_t conjb = bli_obj_conj_status( b );
+
+    const dim_t m = bli_obj_length( c );
+    const dim_t n = bli_obj_width( c );
+
+    const dim_t k = bli_obj_width( &at );
+
+    void* restrict buf_a = bli_obj_buffer_at_off( &at );
+    const inc_t rs_a = bli_obj_row_stride( &at );
+    const inc_t cs_a = bli_obj_col_stride( &at );
+
+    void* restrict buf_b = bli_obj_buffer_at_off( &bt );
+    const inc_t rs_b = bli_obj_row_stride( &bt );
+    const inc_t cs_b = bli_obj_col_stride( &bt );
+
+    void* restrict buf_c = bli_obj_buffer_at_off( c );
+    const inc_t rs_c = bli_obj_row_stride( c );
+    const inc_t cs_c = bli_obj_col_stride( c );
+
+    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+    void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
+
+#else // Active path: read the strides directly and fold any transposition of A/B into swapped row/column strides.
+    const num_t dt = bli_obj_dt( c );
+
+    const bool packa = bli_rntm_pack_a( rntm );
+    const bool packb = bli_rntm_pack_b( rntm );
+
+    const conj_t conja = bli_obj_conj_status( a );
+    const conj_t conjb = bli_obj_conj_status( b );
+
+    const dim_t m = bli_obj_length( c );
+    const dim_t n = bli_obj_width( c );
+    dim_t k;
+
+    void* restrict buf_a = bli_obj_buffer_at_off( a );
+    inc_t rs_a;
+    inc_t cs_a;
+
+    void* restrict buf_b = bli_obj_buffer_at_off( b );
+    inc_t rs_b;
+    inc_t cs_b;
+
+    if ( bli_obj_has_notrans( a ) )
+    {
+        k = bli_obj_width( a );
+
+        rs_a = bli_obj_row_stride( a );
+        cs_a = bli_obj_col_stride( a );
+    }
+    else // if ( bli_obj_has_trans( a ) )
+    {
+        // Assign the variables with an implicit transposition.
+        k = bli_obj_length( a );
+
+        rs_a = bli_obj_col_stride( a );
+        cs_a = bli_obj_row_stride( a );
+    }
+
+    if ( bli_obj_has_notrans( b ) )
+    {
+        rs_b = bli_obj_row_stride( b );
+        cs_b = bli_obj_col_stride( b );
+    }
+    else // if ( bli_obj_has_trans( b ) )
+    {
+        // Assign the variables with an implicit transposition.
+        rs_b = bli_obj_col_stride( b );
+        cs_b = bli_obj_row_stride( b );
+    }
+
+
+    // Optimize some storage/packing cases by transforming them into others.
+    // These optimizations are expressed by changing trans and/or eff_id.
+    bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
+
+    // Select the second dispatch-table index from the stored triangle of C: 0 if lower, 1 otherwise.
+    bool uploc;
+    if( bli_obj_is_lower( c ) )
+    {
+        uploc = 0;
+    }
+    else
+    {
+        uploc = 1;
+    }
+
+    void* restrict buf_c = bli_obj_buffer_at_off( c );
+    const inc_t rs_c = bli_obj_row_stride( c );
+    const inc_t cs_c = bli_obj_col_stride( c );
+
+    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
+    void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
+
+#endif
+
+    // Index into the type combination array (by datatype, then by uplo) to
+    // extract the correct function pointer.
+    FUNCPTR_T f = ftypes_var1n[dt][uploc];
+    // NOTE(review): uploc comes from C as stored and is not toggled on the transposed path below; confirm the typed variants expect that.
+    if ( bli_is_notrans( trans ) )
+    {
+        // Invoke the function.
+        f
+        (
+          packa,
+          packb,
+          conja,
+          conjb,
+          m,
+          n,
+          k,
+          buf_alpha,
+          buf_a, rs_a, cs_a,
+          buf_b, rs_b, cs_b,
+          buf_beta,
+          buf_c, rs_c, cs_c,
+          eff_id,
+          cntx,
+          rntm,
+          thread
+        );
+    }
+    else
+    {
+        // Invoke the function (transposing the operation).
+        f
+        (
+          packb,
+          packa,
+          conjb, // swap the conj values.
+          conja,
+          n, // swap the m and n dimensions.
+          m,
+          k,
+          buf_alpha,
+          buf_b, cs_b, rs_b, // swap the positions of A and B.
+          buf_a, cs_a, rs_a, // swap the strides of A and B.
+          buf_beta,
+          buf_c, cs_c, rs_c, // swap the strides of C.
+          bli_stor3_trans( eff_id ), // transpose the stor3_t id.
+          cntx,
+          rntm,
+          thread
+        );
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5);
+}
+
+
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname, uplo, varname ) \
+\
+void PASTEMACT(ch,opname,uplo,varname) \
+     ( \
+       bool             packa, \
+       bool             packb, \
+       conj_t           conja, \
+       conj_t           conjb, \
+       dim_t            m, \
+       dim_t            n, \
+       dim_t            k, \
+       void*   restrict alpha, \
+       void*   restrict a, inc_t rs_a, inc_t cs_a, \
+       void*   restrict b, inc_t rs_b, inc_t cs_b, \
+       void*   restrict beta, \
+       void*   restrict c, inc_t rs_c, inc_t cs_c, \
+       stor3_t          stor_id, \
+       cntx_t* restrict cntx, \
+       rntm_t* restrict rntm, \
+       thrinfo_t* restrict thread \
+     ) \
+{ \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    /* If m or n is zero, return immediately. */ \
+    if ( bli_zero_dim2( m, n ) ) return; \
+\
+    /* If k < 1 or alpha is zero, scale by beta and return. */ \
+    if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
+    { \
+        if ( bli_thread_am_ochief( thread ) ) \
+        { \
+            PASTEMAC(ch,scalm) \
+            ( \
+              BLIS_NO_CONJUGATE, \
+              0, \
+              BLIS_NONUNIT_DIAG, \
+              BLIS_DENSE, \
+              m, n, \
+              beta, \
+              c, rs_c, cs_c \
+            ); \
+        } \
+        return; \
+    } \
+\
+    /* This transposition of the stor3_t id value is inherent to variant 1.
+       The reason: we assume that variant 2 is the "main" variant. The
+       consequence of this is that we assume that the millikernels that
+       iterate over m are registered to the "primary" kernel group associated
+       with the kernel IO preference; similarly, mkernels that iterate over
+       n are assumed to be registered to the "non-primary" group associated
+       with the ("non-primary") anti-preference. Note that this pattern holds
+       regardless of whether the mkernel set has a row or column preference.)
+       See bli_l3_sup_int.c for a higher-level view of how this choice is made.
*/ \ + stor_id = bli_stor3_trans( stor_id ); \ +\ + /* Query the context for various blocksizes. */ \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ + /* Query the maximum blocksize for MR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t MRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_MR, cntx ); \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC0 ) || ( 0 == MC0 ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t MRE = MRM - MR; \ +\ + dim_t KC; \ + if ( packa && packb ) \ + { \ + KC = KC0; \ + } \ + else if ( packb ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else if ( packa ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = 
KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else /* if ( !packa && !packb ) */ \ + { \ + if ( FALSE ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( m <= MR && n <= NR ) KC = KC0; \ + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ + else KC = (( KC0 / 5 ) / 4 ) * 4; \ + } \ +\ + /* Nudge NC up to a multiple of MR and MC up to a multiple of NR. + NOTE: This is unique to variant 1 (ie: not performed in variant 2) + because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \ + const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ + const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = rs_c; \ + const inc_t jcstep_a = rs_a; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = cs_c; \ + const inc_t icstep_b = cs_b; \ +\ + const inc_t jrstep_c = rs_c * MR; \ +\ + /* + const inc_t jrstep_a = rs_a * MR; \ +\ + const inc_t irstep_c = cs_c * NR; \ + const inc_t irstep_b = cs_b * NR; \ + */ \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of beta and one scalars to prevent any unnecessary + sharing of cache lines between the cores' caches. */ \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ +\ + auxinfo_t aux; \ +\ + /* Parse and interpret the contents of the rntm_t object to properly + set the ways of parallelism for each loop. */ \ + /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ +\ + /* Initialize a mem_t entry for A and B. 
Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. An alternative way of initializing the + mem_t entries is: + + bli_mem_clear( &mem_a ); \ + bli_mem_clear( &mem_b ); \ + */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. + NOTE: These bszid_t values, and their order, match that of the bp + algorithm (variant 2) because they are not used to query actual + blocksizes but rather query the ways of parallelism for the various + loops. For example, the 2nd loop in variant 1 partitions in the m + dimension (in increments of MR), but parallelizes that m dimension + with BLIS_JR_NT. The only difference is that the _packa and _packb + arrays have been adjusted for the semantic difference in order in + which packa and packb nodes are encountered in the thrinfo tree. + That is, this panel-block algorithm partitions an NC x KC submatrix + of A to be packed in the 4th loop, and a KC x MC submatrix of B + to be packed in the 3rd loop. */ \ + /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ +\ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. 
*/ \ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. */ \ + const bool is_mt = bli_rntm_calc_num_threads( rntm ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \ + const dim_t m_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = m_local % NC; \ +\ + /* Loop over the m dimension (NC rows/columns at a time). */ \ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict a_jc = a_00 + jj * jcstep_a; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. 
*/ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_jc + pp * pcstep_a; \ + ctype* restrict b_pc = b_00 + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing A. If we won't be packing A, we alias to + the _pc variables so that code further down can unconditionally + reference the _pa variables. Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_pc[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pa = &bszids_pc[0]; \ + thread_pa = thread_pc; } \ +\ + /* Determine the packing buffer and related parameters for matrix + A. (If A will not be packed, then a_use will be set to point to + a and the _a_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. + NOTE: packing matrix A in this panel-block algorithm corresponds + to packing matrix B in the block-panel algorithm. */ \ + PASTEMAC(ch,packm_sup_a) \ + ( \ + packa, \ + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \ + stor_id, /* a "panel of B". 
*/ \ + BLIS_NO_TRANSPOSE, \ + NC, KC, /* This "panel of B" is (at most) NC x KC. */ \ + nc_cur, kc_cur, MR, \ + &one_local, \ + a_pc, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_pc_use = a_use; \ +\ + /* We don't need to embed the panel stride of A within the auxinfo_t + object because this variant iterates through A in the jr loop, + which occurs here, within the macrokernel, not within the + millikernel. */ \ + /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_ic = &bszids_pa[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \ + const dim_t n_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = n_local % MC; \ +\ + /* Loop over the n dimension (MC rows at a time). */ \ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict b_ic = b_pc + ii * icstep_b; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing A. If we won't be packing A, we alias to + the _pc variables so that code further down can unconditionally + reference the _pa variables. 
Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_ic[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pb = &bszids_ic[0]; \ + thread_pb = thread_ic; } \ +\ + /* Determine the packing buffer and related parameters for matrix + B. (If B will not be packed, then b_use will be set to point to + b and the _b_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. + NOTE: packing matrix B in this panel-block algorithm corresponds + to packing matrix A in the block-panel algorithm. */ \ + PASTEMAC(ch,packm_sup_b) \ + ( \ + packb, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \ + stor_id, /* a "block of A". */ \ + BLIS_NO_TRANSPOSE, \ + KC, MC, /* This "block of A" is (at most) KC x MC. */ \ + kc_cur, mc_cur, NR, \ + &one_local, \ + b_ic, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_ic_use = b_use; \ +\ + /* Embed the panel stride of B within the auxinfo_t object. The + millikernel will query and use this to iterate through + micropanels of B. */ \ + bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jr = &bszids_pb[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. 
*/ \ + dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ + dim_t jr_left = nc_cur % MR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* An optimization: allow the last jr iteration to contain up to MRE + rows of C and A. (If MRE > MR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing A + since packing an extended edge case is not yet supported. */ \ + if ( !packa && !is_mt ) \ + if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \ + { \ + jr_iter--; jr_left += MR; \ + } \ +\ + /* Loop over the m dimension (NR columns at a time). */ \ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \ +\ + /* + ctype* restrict a_jr = a_pc + j * jrstep_a; \ + */ \ + ctype* restrict a_jr = a_pc_use + j * ps_a_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* + const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ + const dim_t ir_left = mc_cur % NR; \ + */ \ +\ + /* Loop over the n dimension (MR rows at a time). */ \ + { \ + /* Invoke the gemmsup millikernel. */ \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + nr_cur, /* Notice: nr_cur <= MR. */ \ + mc_cur, /* Recall: mc_cur partitions the n dimension! */ \ + kc_cur, \ + alpha_cast, \ + a_jr, rs_a_use, cs_a_use, \ + b_ic_use, rs_b_use, cs_b_use, \ + beta_use, \ + c_jr, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* NOTE: This barrier is only needed if we are packing A (since + that matrix is packed within the pc loop of this variant). */ \ + if ( packa ) bli_thread_barrier( thread_pa ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. 
*/ \ + PASTEMAC(ch,packm_sup_finalize_mem_a) \ + ( \ + packa, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTEMAC(ch,packm_sup_finalize_mem_b) \ + ( \ + packb, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_L( gemmtsup, ref_var1n ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, uplo, varname ) \ +\ +void PASTEMACT(ch,opname,uplo,varname) \ + ( \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* If m or n is zero, return immediately. */ \ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + /* If k < 1 or alpha is zero, scale by beta and return. */ \ + if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + PASTEMAC(ch,scalm) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m, n, \ + beta, \ + c, rs_c, cs_c \ + ); \ + } \ + return; \ + } \ +\ + /* This transposition of the stor3_t id value is inherent to variant 1. + The reason: we assume that variant 2 is the "main" variant. 
The + consequence of this is that we assume that the millikernels that + iterate over m are registered to the "primary" kernel group associated + with the kernel IO preference; similarly, mkernels that iterate over + n are assumed to be registered to the "non-primary" group associated + with the ("non-primary") anti-preference. (Note that this pattern holds + regardless of whether the mkernel set has a row or column preference.) + See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \ + stor_id = bli_stor3_trans( stor_id ); \ +\ + /* Query the context for various blocksizes. */ \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the maximum blocksize for MR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t MRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_MR, cntx ); \ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC0 ) || ( 0 == MC0 ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t MRE = MRM - MR; \ +\ + dim_t KC; \ + if ( packa && packb ) \ + { \ + KC = KC0; \ + } \ + else if ( packb ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else if ( packa ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else /* if ( !packa && !packb ) */ \ + { \ + if ( FALSE ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( m <= MR && n <= NR ) KC = KC0; \ + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ + else KC = (( KC0 / 5 ) / 4 ) * 4; \ + } \ +\ + /* Nudge NC up to a multiple of MR and MC up to a multiple of NR. + NOTE: This is unique to variant 1 (ie: not performed in variant 2) + because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. 
*/ \ + const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ + const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = rs_c; \ + const inc_t jcstep_a = rs_a; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = cs_c; \ + const inc_t icstep_b = cs_b; \ +\ + const inc_t jrstep_c = rs_c * MR; \ +\ + /* + const inc_t jrstep_a = rs_a * MR; \ +\ + const inc_t irstep_c = cs_c * NR; \ + const inc_t irstep_b = cs_b * NR; \ + */ \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of beta and one scalars to prevent any unnecessary + sharing of cache lines between the cores' caches. */ \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ +\ + auxinfo_t aux; \ +\ + /* Parse and interpret the contents of the rntm_t object to properly + set the ways of parallelism for each loop. */ \ + /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. An alternative way of initializing the + mem_t entries is: + + bli_mem_clear( &mem_a ); \ + bli_mem_clear( &mem_b ); \ + */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. + NOTE: These bszid_t values, and their order, match that of the bp + algorithm (variant 2) because they are not used to query actual + blocksizes but rather query the ways of parallelism for the various + loops. For example, the 2nd loop in variant 1 partitions in the m + dimension (in increments of MR), but parallelizes that m dimension + with BLIS_JR_NT. 
The only difference is that the _packa and _packb + arrays have been adjusted for the semantic difference in order in + which packa and packb nodes are encountered in the thrinfo tree. + That is, this panel-block algorithm partitions an NC x KC submatrix + of A to be packed in the 4th loop, and a KC x MC submatrix of B + to be packed in the 3rd loop. */ \ + /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ +\ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. */ \ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. NOTE(review): the value returned by bli_rntm_calc_num_threads() is converted to bool as-is, so is_mt is true for any nonzero count -- confirm whether a '> 1' comparison was intended. */ \ + const bool is_mt = bli_rntm_calc_num_threads( rntm ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. 
*/ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \ + const dim_t m_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. 
*/ \ + /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = m_local % NC; \ +\ + /* Loop over the m dimension (NC rows/columns at a time). */ \ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict a_jc = a_00 + jj * jcstep_a; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_jc + pp * pcstep_a; \ + ctype* restrict b_pc = b_00 + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing A. If we won't be packing A, we alias to + the _pc variables so that code further down can unconditionally + reference the _pa variables. 
Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_pc[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pa = &bszids_pc[0]; \ + thread_pa = thread_pc; } \ +\ + /* Determine the packing buffer and related parameters for matrix + A. (If A will not be packed, then a_use will be set to point to + a and the _a_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. + NOTE: packing matrix A in this panel-block algorithm corresponds + to packing matrix B in the block-panel algorithm. */ \ + PASTEMAC(ch,packm_sup_a) \ + ( \ + packa, \ + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \ + stor_id, /* a "panel of B". */ \ + BLIS_NO_TRANSPOSE, \ + NC, KC, /* This "panel of B" is (at most) NC x KC. */ \ + nc_cur, kc_cur, MR, \ + &one_local, \ + a_pc, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_pc_use = a_use; \ +\ + /* We don't need to embed the panel stride of A within the auxinfo_t + object because this variant iterates through A in the jr loop, + which occurs here, within the macrokernel, not within the + millikernel. */ \ + /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_ic = &bszids_pa[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. 
*/ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \ + const dim_t n_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = n_local % MC; \ +\ + /* Loop over the n dimension (MC columns at a time). */ \ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict b_ic = b_pc + ii * icstep_b; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing B, we alias to + the _ic variables so that code further down can unconditionally + reference the _pb variables. Note that *if* we will be packing + B, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_ic[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pb = &bszids_ic[0]; \ + thread_pb = thread_ic; } \ +\ + /* Determine the packing buffer and related parameters for matrix + B. (If B will not be packed, then b_use will be set to point to + b and the _b_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. + NOTE: packing matrix B in this panel-block algorithm corresponds + to packing matrix A in the block-panel algorithm. 
*/ \ + PASTEMAC(ch,packm_sup_b) \ + ( \ + packb, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \ + stor_id, /* a "block of A". */ \ + BLIS_NO_TRANSPOSE, \ + KC, MC, /* This "block of A" is (at most) KC x MC. */ \ + kc_cur, mc_cur, NR, \ + &one_local, \ + b_ic, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_ic_use = b_use; \ +\ + /* Embed the panel stride of B within the auxinfo_t object. The + millikernel will query and use this to iterate through + micropanels of B. */ \ + bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jr = &bszids_pb[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ + dim_t jr_left = nc_cur % MR; \ +\ + /* Compute the JR loop thread range for the current thread. NOTE(review): this range is derived from jr_iter before the MRE adjustment below can decrement it -- confirm the ordering is intended. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* An optimization: allow the last jr iteration to contain up to MRE + rows of C and A. (If MRE > MR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing A + since packing an extended edge case is not yet supported. */ \ + if ( !packa && !is_mt ) \ + if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \ + { \ + jr_iter--; jr_left += MR; \ + } \ +\ + /* Loop over the m dimension (MR rows at a time). 
*/ \ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? 
MR : jr_left ); \ +\ + /* + ctype* restrict a_jr = a_pc + j * jrstep_a; \ + */ \ + ctype* restrict a_jr = a_pc_use + j * ps_a_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* + const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ + const dim_t ir_left = mc_cur % NR; \ + */ \ +\ + /* Loop over the n dimension (MR rows at a time). */ \ + { \ + /* Invoke the gemmsup millikernel. */ \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + nr_cur, /* Notice: nr_cur <= MR. */ \ + mc_cur, /* Recall: mc_cur partitions the n dimension! */ \ + kc_cur, \ + alpha_cast, \ + a_jr, rs_a_use, cs_a_use, \ + b_ic_use, rs_b_use, cs_b_use, \ + beta_use, \ + c_jr, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* NOTE: This barrier is only needed if we are packing A (since + that matrix is packed within the pc loop of this variant). */ \ + if ( packa ) bli_thread_barrier( thread_pa ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. */ \ + PASTEMAC(ch,packm_sup_finalize_mem_a) \ + ( \ + packa, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTEMAC(ch,packm_sup_finalize_mem_b) \ + ( \ + packb, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_U( gemmtsup, ref_var1n ) + + +// +// -- var2m -------------------------------------------------------------------- +// + +static FUNCPTR_T GENARRAY_T(ftypes_var2m,gemmtsup,ref_var2m); + +void bli_gemmtsup_ref_var2m + ( + trans_t trans, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + stor3_t eff_id, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5); +#if 0 + obj_t at, bt; + 
+ bli_obj_alias_to( a, &at ); + bli_obj_alias_to( b, &bt ); + + // Induce transpositions on A and/or B if either object is marked for + // transposition. We can induce "fast" transpositions since the objects + // are guaranteed to not have structure or be packed. + if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } + if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } + + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + + const dim_t k = bli_obj_width( &at ); + + void* restrict buf_a = bli_obj_buffer_at_off( &at ); + const inc_t rs_a = bli_obj_row_stride( &at ); + const inc_t cs_a = bli_obj_col_stride( &at ); + + void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + const inc_t rs_b = bli_obj_row_stride( &bt ); + const inc_t cs_b = bli_obj_col_stride( &bt ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + +#else + const num_t dt = bli_obj_dt( c ); + + const bool packa = bli_rntm_pack_a( rntm ); + const bool packb = bli_rntm_pack_b( rntm ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + dim_t k; + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a; + inc_t cs_a; + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b; + inc_t cs_b; + + if ( bli_obj_has_notrans( a ) ) + { + k = bli_obj_width( a ); + + rs_a = bli_obj_row_stride( a ); + cs_a = bli_obj_col_stride( a ); + } + else // if ( bli_obj_has_trans( a ) ) + { + // Assign the variables with an implicit transposition. 
+ k = bli_obj_length( a ); + + rs_a = bli_obj_col_stride( a ); + cs_a = bli_obj_row_stride( a ); + } + + if ( bli_obj_has_notrans( b ) ) + { + rs_b = bli_obj_row_stride( b ); + cs_b = bli_obj_col_stride( b ); + } + else // if ( bli_obj_has_trans( b ) ) + { + // Assign the variables with an implicit transposition. + rs_b = bli_obj_col_stride( b ); + cs_b = bli_obj_row_stride( b ); + } + + + // Optimize some storage/packing cases by transforming them into others. + // These optimizations are expressed by changing trans and/or eff_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + + + bool uploc; + if ( bli_is_notrans ( trans ) ) + uploc = bli_obj_is_lower( c ) ? 0 : 1; + else + uploc = bli_obj_is_lower( c ) ? 1 : 0; + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + +#endif + + // Index into the type combination array by datatype and uploc to extract the + // correct function pointer. + FUNCPTR_T f = ftypes_var2m[dt][uploc]; + + + + if ( bli_is_notrans( trans ) ) + { + // Invoke the function. + f + ( + packa, + packb, + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + eff_id, + cntx, + rntm, + thread + ); + } + else + { + // Invoke the function (transposing the operation). + f + ( + packb, // swap the pack values. + packa, + conjb, // swap the conj values. + conja, + n, // swap the m and n dimensions. + m, + k, + buf_alpha, + buf_b, cs_b, rs_b, // swap the positions of A and B. + buf_a, cs_a, rs_a, // swap the strides of A and B. + buf_beta, + buf_c, cs_c, rs_c, // swap the strides of C. + bli_stor3_trans( eff_id ), // transpose the stor3_t id. 
+ cntx, + rntm, + thread + ); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, uplo, varname ) \ +\ +void PASTEMACT(ch,opname,uplo,varname) \ + ( \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + /* If m or n is zero, return immediately. */ \ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + /* If k < 1 or alpha is zero, scale by beta and return. */ \ + if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + PASTEMAC(ch,scalm) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m, n, \ + beta, \ + c, rs_c, cs_c \ + ); \ + } \ + return; \ + } \ +\ + /* Query the context for various blocksizes. */ \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ + /* Query the maximum blocksize for NR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t NRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC ) || ( 0 == MC ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t NRE = NRM - NR; \ +\ + dim_t KC; \ + if ( packa && packb ) \ + { \ + KC = KC0; \ + } \ + else if ( packb ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else if ( packa ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else /* if ( !packa && !packb ) */ \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( m <= MR && n <= NR ) KC = KC0; \ + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ + else KC = (( KC0 / 5 ) / 4 ) * 4; \ + } \ +\ + /* Compute partitioning step values for each matrix of each loop. 
*/ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + /* + const inc_t jrstep_b = cs_b * NR; \ + ( void )jrstep_b; \ +\ + const inc_t irstep_c = rs_c * MR; \ + const inc_t irstep_a = rs_a * MR; \ + */ \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ +\ + /* storage-scheme of ct should be same as that of C. + Since update routines only support row-major order, + col_pref flag is used to induce transpose to matrices before + passing to update routine whenever C is col-stored */ \ + const bool col_pref = (rs_c == 1)? 1 : 0; \ +\ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of beta and one scalars to prevent any unnecessary + sharing of cache lines between the cores' caches. */ \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ +\ + auxinfo_t aux; \ +\ + /* Parse and interpret the contents of the rntm_t object to properly + set the ways of parallelism for each loop. */ \ + /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. An alternative way of initializing the + mem_t entries is: + + bli_mem_clear( &mem_a ); \ + bli_mem_clear( &mem_b ); \ + */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. 
*/ \ + /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ +\ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. */ \ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. */ \ + const bool is_mt = bli_rntm_calc_num_threads( rntm ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_weighted_sub( thread_jc, 0, BLIS_LOWER, m, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + dim_t m_off_cblock, n_off_cblock; \ + dim_t m_off = 0; \ + dim_t n_off = 0; \ + doff_t diagoffc; \ + dim_t i, ip; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). 
*/ \ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + m_off = 0; \ + n_off = jj; \ + diagoffc = m_off - n_off; \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing B, we alias to + the _pc variables so that code further down can unconditionally + reference the _pb variables. Note that *if* we will be packing + B, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. 
to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_pc[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pb = &bszids_pc[0]; \ + thread_pb = thread_pc; } \ +\ + /* Determine the packing buffer and related parameters for matrix + B. (If B will not be packed, then a_use will be set to point to + b and the _b_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. */ \ + PASTEMAC(ch,packm_sup_b) \ + ( \ + packb, \ + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ + stor_id, /* a "panel of B." */ \ + BLIS_NO_TRANSPOSE, \ + KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ + kc_cur, nc_cur, NR, \ + &one_local, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* We don't need to embed the panel stride of B within the auxinfo_t + object because this variant iterates through B in the jr loop, + which occurs here, within the macrokernel, not within the + millikernel. */ \ + /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_ic = &bszids_pb[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_weighted_sub( thread_ic, -diagoffc, BLIS_UPPER, nc_cur, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. 
*/ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ + dim_t nc_pruned = nc_cur; \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + m_off = ii; \ +\ + if(bli_gemmt_is_strictly_above_diag( m_off, n_off, mc_cur, nc_cur ) ) continue; \ +\ + diagoffc = m_off - n_off; \ +\ + if( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + mc_cur = mc_cur - i; \ + diagoffc = -diagoffc % MR; \ + m_off += i; \ + c_ic = c_ic + ( i ) * rs_c; \ + a_ic = a_ic + ( i ) * rs_a; \ + } \ +\ + if( ( diagoffc + mc_cur ) < nc_cur ) \ + { \ + nc_pruned = diagoffc + mc_cur; \ + } \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing A, we alias to + the _ic variables so that code further down can unconditionally + reference the _pa variables. Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_ic[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pa = &bszids_ic[0]; \ + thread_pa = thread_ic; } \ +\ + /* Determine the packing buffer and related parameters for matrix + A. (If A will not be packed, then a_use will be set to point to + a and the _a_use strides will be set accordingly.) 
Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. */ \ + PASTEMAC(ch,packm_sup_a) \ + ( \ + packa, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ + stor_id, /* a "block of A." */ \ + BLIS_NO_TRANSPOSE, \ + MC, KC, /* This "block of A" is (at most) MC x KC. */ \ + mc_cur, kc_cur, MR, \ + &one_local, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Embed the panel stride of A within the auxinfo_t object. The + millikernel will query and use this to iterate through + micropanels of A (if needed). */ \ + bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jr = &bszids_pa[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_pruned + NR - 1 ) / NR; \ + dim_t jr_left = nc_pruned % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* An optimization: allow the last jr iteration to contain up to NRE + columns of C and B. (If NRE > NR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing B + since packing an extended edge case is not yet supported. */ \ + if ( !packb && !is_mt ) \ + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ + { \ + jr_iter--; jr_left += NR; \ + } \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + /* + ctype* restrict b_jr = b_pc_use + j * jrstep_b; \ + */ \ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + dim_t i; \ + dim_t m_zero = 0; \ + dim_t n_iter_zero = 0; \ +\ + m_off_cblock = m_off; \ + n_off_cblock = n_off + j * NR; \ +\ + if(bli_gemmt_is_strictly_below_diag(m_off_cblock, n_off_cblock, mc_cur, nc_cur)) \ + { \ + m_zero = 0; \ + } \ + else \ + { \ + /* compute number of rows that are filled with zeroes and can be ignored */ \ + n_iter_zero = (n_off_cblock < m_off_cblock)? 0 : (n_off_cblock - m_off)/MR; \ + m_zero = n_iter_zero * MR; \ + } \ +\ + ctype* restrict a_ir = a_ic_use + n_iter_zero * ps_a_use; \ + ctype* restrict c_ir = c_jr + n_iter_zero * irstep_c; \ +\ + /* Ignore the zero region */ \ + m_off_cblock += m_zero; \ +\ + /* Compute the triangular part */ \ + for( i = m_zero; (i < mc_cur) && ( m_off_cblock < n_off_cblock + nr_cur); i += MR ) \ + { \ + const dim_t mr_cur = (i+MR-1) < mc_cur ? MR : mc_cur - i; \ +\ + /* Prerequisites : MR = 6, NR = 8. + An optimization: allow the last jr iteration to contain up to NRE + In DGEMMT API implementation, kernel operates on 6x8 block. MR and + NR are set as 6 and 8 respectively. 24 being the LCM of 6 and 8, + the diagonal pattern repeats for every 24x24 block. + This pattern is exploited to achieve the optimization in diagonal + blocks by computing only the required elements. In the previous + implementation, all the 48 outputs of the given 6x8 block are + computed and stored into a temporary buffer. Later, the required + elements are copied into the final C output buffer. + With this optimization, we are avoiding copy operation and also + reducing the number of computations. 
+ Variables m_off_24 and n_off_24 respectively store the m and n + offsets from the starting point of the corresponding 24x24 block. + Variables m_idx and n_idx store indices of the current 6x8 block + along m and n dimensions, in 24x24 block. m_idx is computed as + (m_off_24 / MR) while n_idx is computed as (n_off_24 / NR). + Range of m_idx is 0 <= m_idx <= 3 and the range of n_idx is + 0 <= n_idx <= 2. Based on these indices, for the given 6x8 block, + logic is implemented to identify the relevant kernel from the + look-up table. + During instances, where m is not a multiple of 6 or n is not a + multiple of 8, it goes to the default gemm kernel. MR and NR must be + 6 and 8 for these kernels to achieve the expected functionality.*/ \ +\ + dim_t m_off_24 = m_off_cblock % 24; \ + dim_t n_off_24 = n_off_cblock % 24; \ + dim_t m_idx = (dim_t)(m_off_24 / MR); \ + dim_t n_idx = (dim_t)(n_off_24 / NR); \ +\ + /* Check if m, n indices are multiple of MR and NR respectively + and current block is a complete 6x8 block */ \ + bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0)\ + && (MR == 6) && (NR == 8) \ + && (bli_cpuid_is_avx2fma3_supported() == TRUE) && (mr_cur == MR) && (nr_cur == NR); \ +\ + /* m_idx and n_idx would be equal only if the current block is + a diagonal block */\ + if( (dt == BLIS_DOUBLE) && (m_idx == n_idx) && (idx_supported) ) { \ + /* index of kernel in lookup table is 2*m_idx) */ \ + dim_t ker_idx; \ + ker_idx = m_idx<<1; \ +\ + /* If there is another 6x8 diagonal block pending for computation + after the current 6x8 diagonal block, then the two blocks can + be computed together(12x8). This combined kernel is implemented + only for the case where n_idx = 2 i.e., n_off_24 = 16. To call + this, it has to be ensured that at least 12 rows are pending in + C for computation. (m_off + 2 * MR <=m). 
Usage of this combined + kernel saves the entire time to execute one kernel*/ \ + if( (n_idx == 2) && (m_off_cblock + MR + MR <= m) ) {\ + ker_idx = 6; /* use combined kernel, index of combined kernel + in lookup table is 6 */\ + } \ + /* use rd kernel if B is column major storage */ \ + if( stor_id == BLIS_RRC ) { \ + ker_idx += 7; /* index of rd kernel*/ \ + } \ + gemmt_ker_ft ker_fp = ker_fpls[ker_idx]; \ + ker_fp \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + (double*) alpha_cast, \ + (double*) a_ir, rs_a_use, cs_a_use, \ + (double*) b_jr, rs_b_use, cs_b_use, \ + (double*) beta_use, \ + (double*) c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + /* 6x8 block where m_idx == n_idx+1 also has some parts of the diagonal */\ + else if( (dt == BLIS_DOUBLE) && (m_idx == n_idx+1) && (idx_supported) ) { \ + /* If current block was already computed in the combined kernel it + can be skipped combined kernel is only implemented for n_idx=2, + i == m_zero is only true for the first iteration therefore if + i == m_zero then the current 6x8 block was not computed in + combined kernel*/ \ + if( (n_idx != 2) || (i == m_zero) ) { \ + dim_t ker_idx = (n_idx << 1) + 1; \ + /* use rd kernel if B is column major storage */ \ + if( stor_id == BLIS_RRC ) { ker_idx += 7; } \ + gemmt_ker_ft ker_fp = ker_fpls[ker_idx]; \ + ker_fp \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + (double*) alpha_cast, \ + (double*) a_ir, rs_a_use, cs_a_use, \ + (double*) b_jr, rs_b_use, cs_b_use, \ + (double*) beta_use, \ + (double*) c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + /* Call the regular kernel for non applicable cases */ \ + else { \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + alpha_cast, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ + if( col_pref ) \ + { \ + PASTEMAC(ch,update_upper_triang)( n_off_cblock, m_off_cblock, \ + 
nr_cur, mr_cur, \ + ct, cs_ct, rs_ct, \ + beta_use, \ + c_ir, cs_c, rs_c ); \ + } \ + else \ + { \ + PASTEMAC(ch,update_lower_triang)( m_off_cblock, n_off_cblock, \ + mr_cur, nr_cur, \ + ct, rs_ct, cs_ct, \ + beta_use, \ + c_ir, rs_c, cs_c ); \ + }\ + }\ +\ + a_ir += ps_a_use; \ + c_ir += irstep_c; \ + m_off_cblock += mr_cur; \ + } \ +\ + /* Invoke the gemmsup millikerneli for remaining rectangular part. */ \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + (i > mc_cur)? 0: mc_cur - i, \ + nr_cur, \ + kc_cur, \ + alpha_cast, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ +\ + } \ + } \ +\ + /* NOTE: This barrier is only needed if we are packing B (since + that matrix is packed within the pc loop of this variant). */ \ + if ( packb ) bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. */ \ + PASTEMAC(ch,packm_sup_finalize_mem_a) \ + ( \ + packa, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTEMAC(ch,packm_sup_finalize_mem_b) \ + ( \ + packb, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_L( gemmtsup, ref_var2m ) + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, uplo, varname ) \ +\ +void PASTEMACT(ch,opname,uplo,varname) \ + ( \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* restrict cntx, \ + 
rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + /* If m or n is zero, return immediately. */ \ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + /* If k < 1 or alpha is zero, scale by beta and return. */ \ + if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + PASTEMAC(ch,scalm) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m, n, \ + beta, \ + c, rs_c, cs_c \ + ); \ + } \ + return; \ + } \ +\ + /* Query the context for various blocksizes. */ \ + dim_t NR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NR, cntx ); \ + dim_t MR = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MR, cntx ); \ + dim_t NC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_NC, cntx ); \ + dim_t MC = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_MC, cntx ); \ + dim_t KC0 = bli_cntx_get_l3_sup_tri_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the maximum blocksize for NR, which implies a maximum blocksize + extension for the final iteration. */ \ + dim_t NRM = bli_cntx_get_l3_sup_tri_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + /* Query the context for the sup microkernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_tri_ker_dt( dt, stor_id, cntx ); \ +\ + if( ( 0 == NR ) || ( 0 == MR ) || ( 0 == NC ) || ( 0 == MC ) || ( 0 == KC0 ) ) \ + { \ + NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ + NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + } \ + const dim_t NRE = NRM - NR; \ +\ + dim_t KC; \ + if ( packa && packb ) \ + { \ + KC = KC0; \ + } \ + else if ( packb ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else if ( packa ) \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR || \ + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ + else KC = KC0; \ + } \ + else /* if ( !packa && !packb ) */ \ + { \ + if ( stor_id == BLIS_RRR || \ + stor_id == BLIS_CCC ) KC = KC0; \ + else if ( stor_id == BLIS_RRC || \ + stor_id == BLIS_CRC ) KC = KC0; \ + else if ( stor_id == BLIS_RCR ) \ + { \ + if ( m <= 4*MR ) KC = KC0; \ + else if ( m <= 36*MR ) KC = KC0 / 2; \ + else if ( m <= 56*MR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ + else KC = KC0 / 4; \ + } \ + else if ( m <= MR && n <= NR ) KC = KC0; \ + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ + else KC = (( KC0 / 5 ) / 4 ) * 4; \ + } \ 
+\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + /* + const inc_t jrstep_b = cs_b * NR; \ + ( void )jrstep_b; \ +\ + const inc_t irstep_c = rs_c * MR; \ + const inc_t irstep_a = rs_a * MR; \ + */ \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ +\ + /* Storage scheme of ct should be same as that of C. + Since update routines only support row-major order, + col_pref flag is used to induce transpose to matrices before + passing to update routine whenever C is col-stored */ \ + const bool col_pref = (rs_c == 1) ? 1 : 0; \ +\ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of beta and one scalars to prevent any unnecessary + sharing of cache lines between the cores' caches. */ \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ +\ + auxinfo_t aux; \ +\ + /* Parse and interpret the contents of the rntm_t object to properly + set the ways of parallelism for each loop. */ \ + /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. 
An alternative way of initializing the + mem_t entries is: + + bli_mem_clear( &mem_a ); \ + bli_mem_clear( &mem_b ); \ + */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. */ \ + /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ +\ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. */ \ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. */ \ + const bool is_mt = bli_rntm_calc_num_threads( rntm ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. 
*/ \ + dim_t jc_start, jc_end; \ + bli_thread_range_weighted_sub( thread_jc, 0, BLIS_UPPER, m, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + dim_t m_off = 0; \ + dim_t n_off = 0; \ + doff_t diagoffc; \ + dim_t m_off_cblock, n_off_cblock; \ + dim_t jp, j; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? 
&beta_local : &one_local ); \ +\ + m_off = 0; \ + n_off = jj; \ + diagoffc = m_off - n_off; \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing B, we alias to + the _pc variables so that code further down can unconditionally + reference the _pb variables. Note that *if* we will be packing + B, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_pc[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pb = &bszids_pc[0]; \ + thread_pb = thread_pc; } \ +\ + /* Determine the packing buffer and related parameters for matrix + B. (If B will not be packed, then a_use will be set to point to + b and the _b_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. */ \ + PASTEMAC(ch,packm_sup_b) \ + ( \ + packb, \ + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ + stor_id, /* a "panel of B." */ \ + BLIS_NO_TRANSPOSE, \ + KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ + kc_cur, nc_cur, NR, \ + &one_local, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* We don't need to embed the panel stride of B within the auxinfo_t + object because this variant iterates through B in the jr loop, + which occurs here, within the macrokernel, not within the + millikernel. */ \ + /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ +\ + /* Grow the thrinfo_t tree. 
*/ \ + bszid_t* restrict bszids_ic = &bszids_pb[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_weighted_sub( thread_ic, -diagoffc, BLIS_LOWER, nc_cur, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + dim_t nc_pruned = nc_cur; \ +\ + m_off = ii; \ + n_off = jj; \ +\ + if(bli_gemmt_is_strictly_below_diag(m_off, n_off, mc_cur, nc_cur)) continue; \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + doff_t diagoffc = m_off - n_off; \ +\ + ctype* restrict b_pc_pruned = b_pc_use; \ +\ + if(diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + nc_pruned = nc_cur - j; \ + n_off += j; \ + diagoffc = diagoffc % NR; \ + c_ic = c_ic + ( j ) * cs_c; \ + b_pc_pruned = b_pc_use + ( jp ) * ps_b_use; \ + } \ +\ + if( ( ( -diagoffc ) + nc_pruned ) < mc_cur ) \ + { \ + mc_cur = -diagoffc + nc_pruned; \ + } \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing A, we alias to + the _ic variables so that code further down can unconditionally + reference the _pa variables. 
Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_ic[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pa = &bszids_ic[0]; \ + thread_pa = thread_ic; } \ +\ + /* Determine the packing buffer and related parameters for matrix + A. (If A will not be packed, then a_use will be set to point to + a and the _a_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. */ \ + PASTEMAC(ch,packm_sup_a) \ + ( \ + packa, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ + stor_id, /* a "block of A." */ \ + BLIS_NO_TRANSPOSE, \ + MC, KC, /* This "block of A" is (at most) MC x KC. */ \ + mc_cur, kc_cur, MR, \ + &one_local, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Embed the panel stride of A within the auxinfo_t object. The + millikernel will query and use this to iterate through + micropanels of A (if needed). */ \ + bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jr = &bszids_pa[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_pruned + NR - 1 ) / NR; \ + dim_t jr_left = nc_pruned % NR; \ +\ + /* Compute the JR loop thread range for the current thread. 
*/ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* An optimization: allow the last jr iteration to contain up to NRE + columns of C and B. (If NRE > NR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing B + since packing an extended edge case is not yet supported. */ \ + if ( !packb && !is_mt ) \ + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ + { \ + jr_iter--; jr_left += NR; \ + } \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + /* + ctype* restrict b_jr = b_pc_use + j * jrstep_b; \ + */ \ + ctype* restrict b_jr = b_pc_pruned + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ + dim_t m_rect = 0; \ + dim_t n_iter_rect = 0; \ +\ + m_off_cblock = m_off; \ + n_off_cblock = n_off + j * NR; \ +\ + if(bli_gemmt_is_strictly_above_diag(m_off_cblock, n_off_cblock, mc_cur, nr_cur)) \ + { \ + m_rect = mc_cur; \ + } \ + else \ + { \ + /* calculate the number of rows in rectangular region of the block */ \ + n_iter_rect = n_off_cblock < m_off_cblock ? 
0: (n_off_cblock - m_off_cblock) / MR; \ + m_rect = n_iter_rect * MR; \ + } \ +\ + /* Compute the rectangular part */ \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + m_rect, \ + nr_cur, \ + kc_cur, \ + alpha_cast, \ + a_ic_use, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + beta_use, \ + c_jr, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ +\ + m_off_cblock = m_off + m_rect; \ +\ + ctype* restrict a_ir = a_ic_use + n_iter_rect * ps_a_use; \ + ctype* restrict c_ir = c_jr + n_iter_rect * irstep_c; \ +\ + /* compute the remaining triangular part */ \ + for( dim_t i = m_rect;( i < mc_cur) && (m_off_cblock < n_off_cblock + nr_cur); i += MR ) \ + { \ + const dim_t mr_cur = (i+MR-1) < mc_cur ? MR : mc_cur - i; \ + /* Prerequisites : MR = 6, NR = 8. + An optimization: allow the last jr iteration to contain up to NRE + In DGEMMT API implementation, kernel operates on 6x8 block. MR and + NR are set as 6 and 8 respectively. 24 being the LCM of 6 and 8, + the diagonal pattern repeats for every 24x24 block. + This pattern is exploited to achieve the optimization in diagonal + blocks by computing only the required elements. In the previous + implementation, all the 48 outputs of the given 6x8 block are + computed and stored into a temporary buffer. Later, the required + elements are copied into the final C output buffer. + With this optimization, we are avoiding copy operation and also + reducing the number of computations. + Variables m_off_24 and n_off_24 respectively store the m and n + offsets from the starting point of the corresponding 24x24 block. + Variables m_idx and n_idx store indices of the current 6x8 block + along m and n dimensions, in 24x24 block. m_idx is computed as + (m_off_24 / MR) while n_idx is computed as (n_off_24 / NR). + Range of m_idx is 0 <= m_idx <= 3 and the range of n_idx is + 0 <= n_idx <= 2. Based on these indices, for the given 6x8 block, + logic is implemented to identify the relevant kernel from the + look-up table. 
+ During instances, where m is not a multiple of 6 or n is not a + multiple of 8, it goes to the default gemm kernel. MR and NR must be + 6 and 8 for these kernels to achieve the expected functionality.*/ \ + dim_t m_off_24 = m_off_cblock % 24; \ + dim_t n_off_24 = n_off_cblock % 24; \ + dim_t m_idx = (dim_t)(m_off_24 / MR); \ + dim_t n_idx = (dim_t)(n_off_24 / NR); \ +\ + /* Check if m, n indices are multiple of MR and NR respectively + and current block is a complete 6x8 block */ \ + bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0)\ + && (MR == 6) && (NR == 8) \ + && (bli_cpuid_is_avx2fma3_supported() == TRUE) && (mr_cur==MR) && (nr_cur==NR); \ +\ + /* m_idx and n_idx would be equal only if the current block is + a diagonal block */\ + if( (dt == BLIS_DOUBLE) && (m_idx == n_idx) && idx_supported ) { \ + dim_t ker_idx = m_idx<<1; \ + /* If there is another 6x8 diagonal block pending for computation + after the current 6x8 diagonal block, then the two blocks can + be computed together(12x8). This combined kernel is implemented + only for the case where n_idx = 0 i.e., n_off_24 = 0. To call + this, it has to be ensured that at least 12 rows are pending in + C for computation (i+ MR + MR <= mc_cur). 
Usage of this combined + kernel saves the entire time to execute one kernel*/ \ + if( (n_idx == 0) && (i+ MR + MR <= mc_cur) ) { \ + ker_idx = 6; /* use combined kernel, index of combined kernel + in lookup table is 6 */\ + } \ + /* if B is column storage we use rd kernel*/ \ + if( stor_id == BLIS_RRC ) { \ + ker_idx += 7; /* index of rd kernel*/\ + } \ + gemmt_ker_ft ker_fp = ker_fpus[ker_idx]; \ + ker_fp \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + (double*) alpha_cast, \ + (double*) a_ir, rs_a_use, cs_a_use, \ + (double*) b_jr, rs_b_use, cs_b_use, \ + (double*) beta_use, \ + (double*) c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + /* 6x8 block where m_idx == n_idx+1 also has some parts of the diagonal */\ + else if( (dt == BLIS_DOUBLE) && (m_idx == n_idx+1) && (idx_supported) ) { \ + /* If current block was already computed in the combined kernel it + can be skipped combined kernel is only implemented for n_idx=0, + i == m_rect is only true for the first iteration therefore if + i == m_rect then the current 6x8 block was not computed in + combined kernel*/ \ + if( (n_idx != 0) || (i == m_rect) ) { \ + dim_t ker_idx = (n_idx << 1) + 1 ; \ + /* use rd kernel if B is column major storage */ \ + if( stor_id == BLIS_RRC ) { ker_idx += 7; } \ + gemmt_ker_ft ker_fp = ker_fpus[ker_idx]; \ + ker_fp \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + (double*) alpha_cast, \ + (double*) a_ir, rs_a_use, cs_a_use, \ + (double*) b_jr, rs_b_use, cs_b_use, \ + (double*) beta_use, \ + (double*) c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + /* call the regular kernel for non applicable cases */ \ + else { \ + gemmsup_ker \ + ( \ + conja, \ + conjb, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + alpha_cast, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ + \ + if( col_pref ) \ + { \ + PASTEMAC(ch,update_lower_triang)( n_off_cblock, m_off_cblock, \ + nr_cur, 
mr_cur, \ + ct, cs_ct, rs_ct, \ + beta_use, \ + c_ir, cs_c, rs_c ); \ + } \ + else \ + { \ + PASTEMAC(ch,update_upper_triang)( m_off_cblock, n_off_cblock, \ + mr_cur, nr_cur, \ + ct, rs_ct, cs_ct, \ + beta_use, \ + c_ir, rs_c, cs_c ); \ + } \ + } \ +\ + a_ir += ps_a_use; \ + c_ir += irstep_c; \ + m_off_cblock += mr_cur; \ +\ + } \ + } \ + } \ +\ + /* NOTE: This barrier is only needed if we are packing B (since + that matrix is packed within the pc loop of this variant). */ \ + if ( packb ) bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. */ \ + PASTEMAC(ch,packm_sup_finalize_mem_a) \ + ( \ + packa, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTEMAC(ch,packm_sup_finalize_mem_b) \ + ( \ + packb, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_U( gemmtsup, ref_var2m ) + From ccb8dd26fd396fe190d503930e9585f0e8a99b18 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 8 Sep 2023 15:10:50 -0400 Subject: [PATCH 136/226] Compiler warnings when using --int-size=32 Correct compiler warnings when building with configure --int-size=32 - bla_imatcopy.c: Cast ints to longs to match %ld format specification in error printf statement and change this to fprintf to stderr. Also copy this additional fprintf statement to other variants of this function. - bli_type_defs.h: siz_t should always be the same size as a pointer. This corrects an issue in bli_malloc.c when casting from a pointer to a siz_t integer value. 
AMD-Internal: [CPUPL-3519] Change-Id: Ic87cd6142b8a6fed177b7c55bc0bb6013c5b69ab --- frame/compat/bla_imatcopy.c | 15 +++++++++++++-- frame/include/bli_type_defs.h | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/frame/compat/bla_imatcopy.c b/frame/compat/bla_imatcopy.c index 699d8b4243..67a82defd6 100644 --- a/frame/compat/bla_imatcopy.c +++ b/frame/compat/bla_imatcopy.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -376,6 +376,8 @@ static dim_t bli_siMatCopy_cn(dim_t rows,dim_t cols,const float alpha,float* a,d if ( rows <= 0 || cols <= 0 || a == NULL || lda < cols || ldb < cols) { + fprintf( stderr, " Invalid trans setting bli_siMatCopy_cn() %ld %ld %ld %ld \n", + ( long )rows, ( long )cols, ( long )lda, ( long )ldb); bli_print_msg( " Invalid function parameters bli_siMatCopy_cn() .", __FILE__, __LINE__ ); AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "Invalid function parameters"); return (0); @@ -422,7 +424,8 @@ static dim_t bli_diMatCopy_cn(dim_t rows,dim_t cols,const double alpha,double* a if ( rows <= 0 || cols <= 0 || a == NULL || lda < cols || ldb < cols) { - printf( " Invalid trans setting bli_diMatcopy_cn() %ld %ld %ld %ld \n", rows, cols, lda,ldb); + fprintf( stderr, " Invalid trans setting bli_diMatcopy_cn() %ld %ld %ld %ld \n", + ( long )rows, ( long )cols, ( long )lda, ( long )ldb); bli_print_msg( " Invalid function parameters bli_diMatCopy_cn() .", __FILE__, __LINE__ ); AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "Invalid function parameters"); return (0); @@ -470,6 +473,8 @@ static dim_t bli_ciMatCopy_cn(dim_t rows,dim_t cols,const scomplex alpha,scomple if ( rows <= 0 || cols <= 0 || 
a == NULL || lda < cols || ldb < cols) { + fprintf( stderr, " Invalid trans setting bli_ciMatCopy_cn() %ld %ld %ld %ld \n", + ( long )rows, ( long )cols, ( long )lda, ( long )ldb); bli_print_msg( " Invalid function parameters bli_ciMatCopy_cn() .", __FILE__, __LINE__ ); AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "Invalid function parameters"); return (0); @@ -518,6 +523,8 @@ static dim_t bli_ziMatCopy_cn(dim_t rows,dim_t cols,const dcomplex alpha,dcomple if ( rows <= 0 || cols <= 0 || a == NULL || lda < cols || ldb < cols) { + fprintf( stderr, " Invalid trans setting bli_ziMatCopy_cn() %ld %ld %ld %ld \n", + ( long )rows, ( long )cols, ( long )lda, ( long )ldb); bli_print_msg( " Invalid function parameters bli_ziMatCopy_cn() .", __FILE__, __LINE__ ); AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "Invalid function parameters"); return (0); @@ -565,6 +572,8 @@ static dim_t bli_ciMatCopy_cr(dim_t rows,dim_t cols,const scomplex alpha,scomple if ( rows <= 0 || cols <= 0 || a == NULL || lda < cols || ldb < cols) { + fprintf( stderr, " Invalid trans setting bli_ciMatCopy_cr() %ld %ld %ld %ld \n", + ( long )rows, ( long )cols, ( long )lda, ( long )ldb); bli_print_msg( " Invalid function parameters bli_ciMatCopy_cr() .", __FILE__, __LINE__ ); AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "Invalid function parameters"); return (0); @@ -612,6 +621,8 @@ static dim_t bli_ziMatCopy_cr(dim_t rows,dim_t cols,const dcomplex alpha,dcomple if ( rows <= 0 || cols <= 0 || a == NULL || lda < cols || ldb < cols) { + fprintf( stderr, " Invalid trans setting bli_ziMatCopy_cr() %ld %ld %ld %ld \n", + ( long )rows, ( long )cols, ( long )lda, ( long )ldb); bli_print_msg( " Invalid function parameters bli_ziMatCopy_cr() .", __FILE__, __LINE__ ); AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "Invalid function parameters"); return (0); diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 173c5d62e4..823410e0aa 100644 --- a/frame/include/bli_type_defs.h +++ 
b/frame/include/bli_type_defs.h @@ -118,7 +118,11 @@ typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type -typedef guint_t siz_t; // byte size type +#ifdef BLIS_ARCH_64 +typedef uint64_t siz_t; // byte size type +#else +typedef uint32_t siz_t; // byte size type +#endif typedef uint32_t objbits_t; // object information bit field // -- Real types -- From e437469a992cf925575e85259c46133091fe475b Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Wed, 30 Aug 2023 04:48:28 -0500 Subject: [PATCH 137/226] Optimized AVX2 DGEMM SUP edge kernels - For edge kernels, which handle the corner cases, and especially for cases where there is a really small amount of computation to be done, executing FMA efficiently becomes crucial. - In the previous implementation, edge kernels were using the same limited number of vector registers to hold FMA results, which indirectly creates a dependency on the previous FMA completing before the CPU can issue a new FMA. - This commit addresses this issue by using the other vector registers that are available to hold FMA results. - That way we hold FMA results in two sets of vector registers, so that subsequent FMAs won't have to wait for previous FMAs to complete. - At the end of the unrolled K loop, these two sets of vector registers are added together to store the correct result in the intended vector registers.
AMD-Internal: [CPUPL-3574] Change-Id: I48fa9e29b6650a785321097b9feeddc3326e3c54 --- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 1074 ++++++--- kernels/haswell/3/sup/d6x8/CMakeLists.txt | 1 + .../d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c | 1984 +++++++++++++++++ .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 1106 +++++---- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c | 385 ++-- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c | 999 ++++----- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c | 252 +-- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 1177 +++++----- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c | 248 +-- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 1202 +++++----- kernels/haswell/bli_kernels_haswell.h | 5 + 11 files changed, 5366 insertions(+), 3067 deletions(-) create mode 100644 kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index cdd6989820..14093d4f42 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -89,6 +89,22 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m cntx_t* restrict cntx ); +static void bli_dgemmsup_rv_haswell_asm_6x1m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ); + #define C_TRANSPOSE_6x7_TILE(R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12) \ /*Transposing 4x4 tile*/ \ vunpcklpd(ymm(R2), ymm(R1), ymm0)\ @@ -392,6 +408,35 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m vmovupd(xmm1, mem(rdx, rsi, 1))\ vmovupd(xmm2, mem(rdx, rsi, 2)) +#define C_TRANSPOSE_6x1_TILE(R1, R2, R3, R4, R5, R6) \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ 
+ vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*R1, R2, R3 holds final result*/ \ + vfmadd231pd(mem(rcx ), ymm3, ymm(R1))\ + vmovupd(ymm(R1), mem(rcx ))\ +\ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ +\ + /*Scaling C matrix by Beta and adding it to fma result.*/ \ + /*0, 1, 2 holds final result*/ \ + vfmadd231pd(mem(rdx ), xmm3, xmm0)\ + vmovupd(xmm0, mem(rdx ))\ + +#define C_TRANSPOSE_6x1_TILE_BZ(R1, R2, R3, R4, R5, R6) \ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ +\ + vunpcklpd(ymm(R6), ymm(R5), ymm0)\ +\ + vmovupd(xmm0, mem(rdx ))\ /* rrr: -------- ------ -------- @@ -526,44 +571,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m } case 1: { - dim_t ps_a0 = bli_auxinfo_ps_a( data ); - - if ( ps_a0 == 6 * rs_a0 ) - { - // Since A is not packed, we can use one gemv. - bli_dgemv_ex - ( - BLIS_NO_TRANSPOSE, conjb, m0, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, - beta, cij, rs_c0, cntx, NULL - ); - } - else - { - const dim_t mr = 6; - - // Since A is packed into row panels, we must use a loop over - // gemv. - dim_t m_iter = ( m0 + mr - 1 ) / mr; - dim_t m_left = m0 % mr; - - double* restrict ai_ii = ai; - double* restrict cij_ii = cij; - - for ( dim_t ii = 0; ii < m_iter; ii += 1 ) - { - dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) - ? 
mr : m_left ); - - bli_dgemv_ex - ( - BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, - alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, - beta, cij_ii, rs_c0, cntx, NULL - ); - cij_ii += mr*rs_c0; ai_ii += ps_a0; - } - } + bli_dgemmsup_rv_haswell_asm_6x1m + ( + conja, conjb, m0, n_left, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); break; } default: @@ -975,6 +988,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m label(.DROWSTORED) + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -989,39 +1003,36 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) - add(rdi, rcx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm8) + vmovupd(ymm8, mem(rbx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) - vmovupd(ymm9, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 1*32), ymm3, ymm9) + vmovupd(ymm9, mem(rbx, 1*32)) + add(rdi, rbx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm10) + vmovupd(ymm10, mem(rbx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) - vmovupd(ymm11, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 1*32), ymm3, ymm11) + vmovupd(ymm11, mem(rbx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) - vmovupd(ymm12, mem(rcx, 0*32)) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm12) + vmovupd(ymm12, mem(rdx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) - vmovupd(ymm13, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rdx, 1*32), ymm3, ymm13) + vmovupd(ymm13, mem(rdx, 1*32)) + add(rdi, rdx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) - vmovupd(ymm14, mem(rcx, 0*32)) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm14) + vmovupd(ymm14, mem(rdx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) - vmovupd(ymm15, mem(rcx, 1*32)) - //add(rdi, rcx) + vfmadd231pd(mem(rdx, 1*32), ymm3, 
ymm15) + vmovupd(ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. @@ -8867,6 +8878,8 @@ static void bli_dgemmsup_rv_haswell_asm_6x7m label(.DROWSTORED) + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) @@ -8885,47 +8898,44 @@ static void bli_dgemmsup_rv_haswell_asm_6x7m vmovupd(ymm5, mem(rcx, 0*32)) vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) //-----------------------2 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vmovupd(ymm7, mem(rbx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rbx, 1*32)) - add(rdi, rcx) + add(rdi, rbx) //-----------------------3 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm10) - vmovupd(ymm9, mem(rcx, 0*32)) - vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + vmovupd(ymm9, mem(rbx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rbx, 1*32)) - add(rdi, rcx) //-----------------------4 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm12) - vmovupd(ymm11, mem(rcx, 0*32)) - vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + vmovupd(ymm11, mem(rdx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rdx, 1*32)) - add(rdi, rcx) + add(rdi, rdx) //-----------------------5 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm13) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm13) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm14) - vmovupd(ymm13, mem(rcx, 0*32)) - vmaskmovpd(ymm14, ymm15, mem(rcx, 1*32)) + 
vmovupd(ymm13, mem(rdx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rdx, 1*32)) - add(rdi, rcx) //-----------------------6 jmp(.DDONE) // jump to end. @@ -9481,6 +9491,8 @@ static void bli_dgemmsup_rv_haswell_asm_6x5m label(.DROWSTORED) + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) @@ -9499,47 +9511,44 @@ static void bli_dgemmsup_rv_haswell_asm_6x5m vmovupd(ymm5, mem(rcx, 0*32)) vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) //-----------------------2 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vmovupd(ymm7, mem(rbx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rbx, 1*32)) - add(rdi, rcx) + add(rdi, rbx) //-----------------------3 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm10) - vmovupd(ymm9, mem(rcx, 0*32)) - vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + vmovupd(ymm9, mem(rbx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rbx, 1*32)) - add(rdi, rcx) //-----------------------4 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm12) - vmovupd(ymm11, mem(rcx, 0*32)) - vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + vmovupd(ymm11, mem(rdx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rdx, 1*32)) - add(rdi, rcx) + add(rdi, rdx) //-----------------------5 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm13) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm13) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm14) - 
vmovupd(ymm13, mem(rcx, 0*32)) - vmaskmovpd(ymm14, ymm15, mem(rcx, 1*32)) + vmovupd(ymm13, mem(rdx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rdx, 1*32)) - add(rdi, rcx) //-----------------------6 jmp(.DDONE) // jump to end. @@ -10018,6 +10027,9 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) + + lea(mem(rcx, rdi, 2), rbx) // load address of c + 4*rs_c; + //Loads 3 elements as per mask_3 mask vector vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm3, ymm4) @@ -10027,33 +10039,31 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm3, ymm6) vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) - add(rdi, rcx) - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm3, ymm8) - vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) - add(rdi, rcx) + vmaskmovpd(ymm8, ymm15, mem(rbx, 0*32)) + add(rdi, rbx) - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm3, ymm10) - vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) - add(rdi, rcx) + vmaskmovpd(ymm10, ymm15, mem(rbx, 0*32)) - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm3, ymm12) - vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) - add(rdi, rcx) + vmaskmovpd(ymm12, ymm15, mem(rdx, 0*32)) + add(rdi, rdx) - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm3, ymm14) - vmaskmovpd(ymm14, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm14, ymm15, mem(rdx, 0*32)) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) C_TRANSPOSE_6x3_TILE(4, 6, 8, 10, 12, 14) - jmp(.RESETPARAM) + jmp(.DDONE) label(.DBETAZERO) @@ -10085,18 +10095,12 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m label(.DCOLSTORBZ) C_TRANSPOSE_6x3_TILE_BZ(4, 6, 8, 10, 12, 14) - jmp(.RESETPARAM) - - label(.RESETPARAM) - mov(var(mask_vec), rdx) - vmovdqu(mem(rdx), ymm15) //load mask - jmp(.DDONE) // jump to end. + jmp(.DDONE) label(.DDONE) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - mov(var(mask_vec), rdx) - vmovdqu(mem(rdx), ymm15) //load + lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c @@ -10178,8 +10182,7 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m } - -void bli_dgemmsup_rv_haswell_asm_6x6m +static void bli_dgemmsup_rv_haswell_asm_6x1m ( conj_t conja, conj_t conjb, @@ -10195,10 +10198,30 @@ void bli_dgemmsup_rv_haswell_asm_6x6m cntx_t* restrict cntx ) { - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded and 4th element will be set to 0 in destination vector. +// // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; @@ -10218,13 +10241,13 @@ void bli_dgemmsup_rv_haswell_asm_6x6m uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a8 = ps_a * sizeof( double ); - if ( m_iter == 0 ) goto consider_edge_cases; + int64_t const *mask_vec = mask_1; - // ------------------------------------------------------------------------- + if ( m_iter == 0 ) goto consider_edge_cases_nleft_1; begin_asm() - - //vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load mask mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a @@ -10235,43 +10258,19 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b - //mov(var(cs_b), r11) // load cs_b - lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) - //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - // NOTE: We cannot pre-load elements of a or b - // because it could eventually, in the last - // unrolled iter or the cleanup loop, result - // in reading beyond the bounds allocated mem - // (the likely result: a segmentation fault). + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - - // During preamble and loops: - // r12 = rcx = c - // r14 = rax = a - // read rbx from var(b) near beginning of loop - // r11 = m dim index ii - mov(var(m_iter), r11) // ii = m_iter; - label(.DLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] - - + label(.DLOOP6X1I) // LOOP OVER ii = [ m_iter ... 1 0 ] -#if 0 - vzeroall() // zero all xmm/ymm registers. -#else - // skylake can execute 3 vxorpd ipc with - // a latency of 1 cycle, while vzeroall - // has a latency of 12 cycles. 
- vxorpd(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower - vxorpd(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us + vxorpd(ymm4, ymm4, ymm4) vmovapd( ymm4, ymm5) vmovapd( ymm4, ymm6) vmovapd( ymm4, ymm7) @@ -10282,14 +10281,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vmovapd( ymm4, ymm12) vmovapd( ymm4, ymm13) vmovapd( ymm4, ymm14) - vmovapd( ymm4, ymm15) -#endif + vmovapd( ymm4, ymm1) mov(var(b), rbx) // load address of b. - //mov(r12, rcx) // reset rcx to current utile of c. - mov(r14, rax) // reset rax to current upanel of a. - - + mov(r14, rax) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case @@ -10297,12 +10292,12 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 2*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c @@ -10314,9 +10309,6 @@ void bli_dgemmsup_rv_haswell_asm_6x6m prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c - prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c - prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c 
label(.DPOSTPFETCH) // done prefetching c @@ -10325,22 +10317,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6m mov(var(ps_a8), rdx) // load ps_a8 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; - // use rcx, rdx for prefetching lines - // from next upanel of a. + // use rcx, rdx for prefetching lines + // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. - + // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP @@ -10352,33 +10340,25 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, 5*8)) #endif - - vmovupd(mem(rbx, 0*32), ymm0) - vmovupd(mem(rbx, 1*32), xmm1) + //Loads 1 elements as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) - vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) - vfmadd231pd(ymm1, ymm3, ymm15) - // ---------------------------------- iteration 1 @@ -10387,32 +10367,25 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif - - vmovupd(mem(rbx, 0*32), ymm0) - vmovupd(mem(rbx, 1*32), xmm1) + //Loads 1 elements as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) add(r10, rbx) 
// b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm1, ymm2, ymm9) - vfmadd231pd(ymm0, ymm3, ymm10) - vfmadd231pd(ymm1, ymm3, ymm11) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm12) - vfmadd231pd(ymm1, ymm2, ymm13) - vfmadd231pd(ymm0, ymm3, ymm14) - vfmadd231pd(ymm1, ymm3, ymm15) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm1) // ---------------------------------- iteration 2 @@ -10422,8 +10395,531 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif + //Loads 1 elements as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; - vmovupd(mem(rbx, 0*32), ymm0) + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + + // ---------------------------------- iteration 3 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, rcx, 1, 5*8)) + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 1 elements as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + 
vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm11) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm1) + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + vaddpd(ymm13, ymm12, ymm12) + vaddpd(ymm1, ymm14, ymm14) + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 1 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm0, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm0, ymm3, ymm14) + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + label(.DPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm0, ymm14, ymm14) + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + label(.DROWSTORED) + + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm3, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm1) + vfmadd231pd(ymm1, ymm3, ymm6) + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm2) + vfmadd231pd(ymm2, ymm3, ymm8) + vmaskmovpd(ymm8, ymm15, mem(rbx, 0*32)) + add(rdi, rbx) + + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm4) + vfmadd231pd(ymm4, ymm3, ymm10) + vmaskmovpd(ymm10, ymm15, mem(rbx, 0*32)) + + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm5) + vfmadd231pd(ymm5, ymm3, ymm12) + vmaskmovpd(ymm12, ymm15, mem(rdx, 0*32)) + add(rdi, rdx) + + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm6) + vfmadd231pd(ymm6, ymm3, ymm14) + vmaskmovpd(ymm14, ymm15, mem(rdx, 0*32)) + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORED) + + C_TRANSPOSE_6x1_TILE(4, 6, 8, 10, 12, 14) + jmp(.DDONE) + + label(.DBETAZERO) + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + label(.DROWSTORBZ) + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm14, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORBZ) + + C_TRANSPOSE_6x1_TILE_BZ(4, 6, 8, 10, 12, 14) + jmp(.DDONE) + + label(.DDONE) + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a8), rax) // load ps_a8 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8 + + dec(r11) // ii -= 1; + jne(.DLOOP6X1I) // iterate again if ii != 0. + + label(.DRETURN) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a8] "m" (ps_a8), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [n0] "m" (n0), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c)/*, + [a_next] "m" (a_next), + [b_next] "m" (b_next)*/ + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", + "memory" + ) + + consider_edge_cases_nleft_1: + if ( m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + double* restrict cij = c + i_edge*rs_c; + double* restrict ai = a + m_iter * ps_a; + double* 
restrict bj = b; + + dgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_dgemmsup_rv_haswell_asm_1x1, + bli_dgemmsup_rv_haswell_asm_2x1, + bli_dgemmsup_rv_haswell_asm_3x1, + bli_dgemmsup_rv_haswell_asm_4x1, + bli_dgemmsup_rv_haswell_asm_5x1 + }; + + dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + + return; + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); +} + +void bli_dgemmsup_rv_haswell_asm_6x6m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a8 = ps_a * sizeof( double ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + + begin_asm() + + //vzeroall() // zero all xmm/ymm registers. + + mov(var(a), r14) // load address of a. 
+ mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + //mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.DLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + + +#if 0 + vzeroall() // zero all xmm/ymm registers. +#else + // skylake can execute 3 vxorpd ipc with + // a latency of 1 cycle, while vzeroall + // has a latency of 12 cycles. + vxorpd(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower + vxorpd(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us + vmovapd( ymm4, ymm5) + vmovapd( ymm4, ymm6) + vmovapd( ymm4, ymm7) + vmovapd( ymm4, ymm8) + vmovapd( ymm4, ymm9) + vmovapd( ymm4, ymm10) + vmovapd( ymm4, ymm11) + vmovapd( ymm4, ymm12) + vmovapd( ymm4, ymm13) + vmovapd( ymm4, ymm14) + vmovapd( ymm4, ymm15) +#endif + + mov(var(b), rbx) // load address of b. + //mov(r12, rcx) // reset rcx to current utile of c. + mov(r14, rax) // reset rax to current upanel of a. + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + mov(var(ps_a8), rdx) // load ps_a8 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. +#else + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, 5*8)) +#endif + + vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 1*32), xmm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 1, 5*8)) +#endif + + vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 1*32), xmm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vbroadcastsd(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + + // ---------------------------------- iteration 2 + +#if 0 + prefetch(0, mem(rdx, 5*8)) +#else + prefetch(0, mem(rdx, r9, 2, 5*8)) +#endif + + vmovupd(mem(rbx, 0*32), ymm0) 
vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -10595,6 +11091,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m label(.DROWSTORED) + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -10609,39 +11106,36 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) - add(rdi, rcx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm8) + vmovupd(ymm8, mem(rbx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) - vmovupd(xmm9, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 1*32), xmm3, xmm9) + vmovupd(xmm9, mem(rbx, 1*32)) + add(rdi, rbx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm10) + vmovupd(ymm10, mem(rbx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) - vmovupd(xmm11, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 1*32), xmm3, xmm11) + vmovupd(xmm11, mem(rbx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) - vmovupd(ymm12, mem(rcx, 0*32)) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm12) + vmovupd(ymm12, mem(rdx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) - vmovupd(xmm13, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rdx, 1*32), xmm3, xmm13) + vmovupd(xmm13, mem(rdx, 1*32)) + add(rdi, rdx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) - vmovupd(ymm14, mem(rcx, 0*32)) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm14) + vmovupd(ymm14, mem(rdx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) - vmovupd(xmm15, mem(rcx, 1*32)) - //add(rdi, rcx) + vfmadd231pd(mem(rdx, 1*32), xmm3, xmm15) + vmovupd(xmm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. @@ -11055,11 +11549,17 @@ void bli_dgemmsup_rv_haswell_asm_6x4m // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) + vmovapd( ymm4, ymm5) vmovapd( ymm4, ymm6) + vmovapd( ymm4, ymm7) vmovapd( ymm4, ymm8) + vmovapd( ymm4, ymm9) vmovapd( ymm4, ymm10) + vmovapd( ymm4, ymm11) vmovapd( ymm4, ymm12) + vmovapd( ymm4, ymm13) vmovapd( ymm4, ymm14) + vmovapd( ymm4, ymm15) #endif mov(var(b), rbx) // load address of b. @@ -11161,19 +11661,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm12) - vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm15) // ---------------------------------- iteration 2 @@ -11218,27 +11718,31 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm12) - vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + vaddpd(ymm13, ymm12, ymm12) + vaddpd(ymm15, ymm14, ymm14) @@ -11329,6 +11833,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m label(.DROWSTORED) + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -11337,27 +11842,24 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) - add(rdi, rcx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm8) + vmovupd(ymm8, mem(rbx, 0*32)) + add(rdi, rbx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm10) + vmovupd(ymm10, mem(rbx, 0*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) - vmovupd(ymm12, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm12) + vmovupd(ymm12, mem(rdx, 0*32)) + add(rdi, rdx) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) - vmovupd(ymm14, mem(rcx, 0*32)) - //add(rdi, rcx) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm14) + vmovupd(ymm14, mem(rdx, 0*32)) jmp(.DDONE) // jump to end. @@ -11715,11 +12217,17 @@ void bli_dgemmsup_rv_haswell_asm_6x2m // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(xmm4, xmm4, xmm4) + vmovapd( ymm4, ymm5) vmovapd( ymm4, ymm6) + vmovapd( ymm4, ymm7) vmovapd( ymm4, ymm8) + vmovapd( ymm4, ymm9) vmovapd( ymm4, ymm10) + vmovapd( ymm4, ymm11) vmovapd( ymm4, ymm12) + vmovapd( ymm4, ymm13) vmovapd( ymm4, ymm14) + vmovapd( ymm4, ymm15) #endif mov(var(b), rbx) // load address of b. 
@@ -11817,19 +12325,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(xmm0, xmm2, xmm4) - vfmadd231pd(xmm0, xmm3, xmm6) + vfmadd231pd(xmm0, xmm2, xmm5) + vfmadd231pd(xmm0, xmm3, xmm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(xmm0, xmm2, xmm8) - vfmadd231pd(xmm0, xmm3, xmm10) + vfmadd231pd(xmm0, xmm2, xmm9) + vfmadd231pd(xmm0, xmm3, xmm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm12) - vfmadd231pd(xmm0, xmm3, xmm14) + vfmadd231pd(xmm0, xmm2, xmm13) + vfmadd231pd(xmm0, xmm3, xmm15) // ---------------------------------- iteration 2 @@ -11874,29 +12382,31 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(xmm0, xmm2, xmm4) - vfmadd231pd(xmm0, xmm3, xmm6) + vfmadd231pd(xmm0, xmm2, xmm5) + vfmadd231pd(xmm0, xmm3, xmm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(xmm0, xmm2, xmm8) - vfmadd231pd(xmm0, xmm3, xmm10) + vfmadd231pd(xmm0, xmm2, xmm9) + vfmadd231pd(xmm0, xmm3, xmm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm12) - vfmadd231pd(xmm0, xmm3, xmm14) + vfmadd231pd(xmm0, xmm2, xmm13) + vfmadd231pd(xmm0, xmm3, xmm15) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + vaddpd(ymm13, ymm12, ymm12) + vaddpd(ymm15, ymm14, ymm14) label(.DCONSIDKLEFT) @@ -11984,6 +12494,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m label(.DROWSTORED) + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) @@ -11992,27 +12503,24 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) - add(rdi, rcx) - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) - vmovupd(xmm8, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 0*32), xmm3, xmm8) + vmovupd(xmm8, mem(rbx, 0*32)) + add(rdi, rbx) - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) - vmovupd(xmm10, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rbx, 0*32), xmm3, xmm10) + vmovupd(xmm10, mem(rbx, 0*32)) - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) - vmovupd(xmm12, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rdx, 0*32), xmm3, xmm12) + vmovupd(xmm12, mem(rdx, 0*32)) + add(rdi, rdx) - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14) - vmovupd(xmm14, mem(rcx, 0*32)) - //add(rdi, rcx) + vfmadd231pd(mem(rdx, 0*32), xmm3, xmm14) + vmovupd(xmm14, mem(rdx, 0*32)) jmp(.DDONE) // jump to end. 
diff --git a/kernels/haswell/3/sup/d6x8/CMakeLists.txt b/kernels/haswell/3/sup/d6x8/CMakeLists.txt index 24edd62ba5..5d41661142 100644 --- a/kernels/haswell/3/sup/d6x8/CMakeLists.txt +++ b/kernels/haswell/3/sup/d6x8/CMakeLists.txt @@ -7,6 +7,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx2.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx8.c +${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx2.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx3.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx4.c diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c new file mode 100644 index 0000000000..2c0f50c637 --- /dev/null +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c @@ -0,0 +1,1984 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + +#define C_TRANSPOSE_5x1_TILE(R1, R2, R3, R4, R5)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), ymm3, ymm(R1))\ + vmovupd(ymm(R1), mem(rcx ))\ +\ + vmovlpd(mem(rdx ), xmm0, xmm0)\ +\ + vfmadd213pd(ymm(R5), ymm3, ymm0)\ + vmovlpd(xmm0, mem(rdx ))\ + +#define C_TRANSPOSE_5x1_TILE_BZ(R1, R2, R3, R4, R5)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vmovupd(ymm(R1), mem(rcx ))\ +\ + vmovlpd(xmm(R5), mem(rdx ))\ + + +#define C_TRANSPOSE_4x1_TILE(R1, R2, R3, R4)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), ymm3, ymm(R1))\ + vmovupd(ymm(R1), mem(rcx ))\ + +#define C_TRANSPOSE_4x1_TILE_BZ(R1, R2, R3, R4)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(R4), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vmovupd(ymm(R1), mem(rcx )) + +#define 
C_TRANSPOSE_3x1_TILE(R1, R2, R3)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(10), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vextractf128(imm(0x1), ymm(R1), xmm12)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ +\ + vfmadd231pd(mem(rcx ), xmm3, xmm(R1))\ + vmovupd(xmm(R1), mem(rcx ))\ +\ + vfmadd231sd(mem(rdx ), xmm3, xmm12)\ + vmovsd(xmm12, mem(rdx )) + +#define C_TRANSPOSE_3x1_TILE_BZ(R1, R2, R3)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ + vunpcklpd(ymm(10), ymm(R3), ymm2)\ + vinsertf128(imm(0x1), xmm2, ymm0, ymm(R1))\ +\ + vextractf128(imm(0x1), ymm(R1), xmm12)\ +\ + vmovupd(xmm(R1), mem(rcx ))\ +\ + vmovlpd(xmm(12), mem(rdx )) + +#define C_TRANSPOSE_2x1_TILE(R1, R2)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ + vfmadd231pd(mem(rcx ), xmm3, xmm0)\ + vmovupd(xmm0, mem(rcx )) + + +#define C_TRANSPOSE_2x1_TILE_BZ(R1, R2)\ + vunpcklpd(ymm(R2), ymm(R1), ymm0)\ +\ + vmovupd(xmm0, mem(rcx )) + +#define C_TRANSPOSE_1x1_TILE(R1)\ + vmovlpd(mem(rcx ), xmm0, xmm0)\ +\ + vbroadcastsd(mem(rbx), ymm3)\ + vfmadd213pd(ymm(R1), ymm3, ymm0)\ +\ + vmovlpd(xmm0, mem(rcx )) + +#define C_TRANSPOSE_1x1_TILE_BZ(R1)\ + vmovlpd(xmm(R1), mem(rcx )) + +static const int64_t mask_1[4] = {-1, 0, 0, 0}; + + +void bli_dgemmsup_rv_haswell_asm_5x1 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + lea(mem(r9, r9, 2), r15) // r15 = 3*cs_a + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 5*8)) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm1, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 5*8)) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm1, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + +#if 1 + prefetch(0, mem(rdx, r15, 1, 5*8)) // a_prefetch += 3*cs_a; + lea(mem(rdx, r9, 4), 
rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + vaddpd(ymm13, ymm12, ymm12) + + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm2) + vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm1, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 4), ymm2) + vfmadd231pd(ymm1, ymm2, ymm12) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm12, ymm12) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rax, 0*32), ymm15, ymm2) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm3) + vmaskmovpd(mem(r8, 0*32), ymm15, ymm5) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm7) + + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + vfmadd231pd(ymm3, ymm1, ymm8) + vfmadd231pd(ymm5, ymm1, ymm10) + vfmadd231pd(ymm7, ymm1, ymm12) + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rbx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(r8, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rdx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + C_TRANSPOSE_5x1_TILE(4, 6, 8, 10, 12) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_5x1_TILE_BZ(4, 6, 8, 10, 12) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_4x1 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 
4 elements of AVX2 vector register. +// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP: executed k_iter times; each pass handles 4 consecutive k indices (iterations 0-3) + + + // ---------------------------------- iteration 0 (accumulates into ymm4/ymm6/ymm8/ymm10) + +#if 1 + prefetch(0, mem(rdx, 4*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm8) + vfmadd231pd(ymm1, ymm13, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 (accumulates into the second set ymm5/ymm7/ymm9/ymm11, summed into the first set after the loop) + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm9) + vfmadd231pd(ymm1, ymm13, ymm11) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 (accumulates into ymm4/ymm6/ymm8/ymm10) + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm8) + vfmadd231pd(ymm1, ymm13, ymm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 (accumulates into ymm5/ymm7/ymm9/ymm11) + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 1*cs_a; (3*cs_a total so far) + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 1*cs_a; (4*cs_a total for this pass) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + 
add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm9) + vfmadd231pd(ymm1, ymm13, ymm11) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm8) + vfmadd231pd(ymm1, ymm13, ymm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
+ + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm10, ymm10) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rax, 0*32), ymm15, ymm2) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm3) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm14) + + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + vfmadd231pd(ymm3, ymm1, ymm8) + vfmadd231pd(ymm14, ymm1, ymm10) + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rbx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_4x1_TILE(4, 6, 8, 10) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_4x1_TILE_BZ(4, 6, 8, 10) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_3x1 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP: executed k_iter times; each pass handles 4 consecutive k indices (iterations 0-3) + + + // ---------------------------------- iteration 0 (accumulates into ymm4/ymm6/ymm8) + +#if 1 + prefetch(0, mem(rdx, 3*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 (accumulates into the second set ymm9/ymm10/ymm11, summed into the first set after the loop) + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm1, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm11) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 (accumulates into ymm4/ymm6/ymm8) + +#if 1 + prefetch(0, mem(rdx, r9, 2, 4*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 (accumulates into ymm9/ymm10/ymm11) + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 1*cs_a; (3*cs_a total so far) + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 1*cs_a; (4*cs_a total for this pass) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm1, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + 
vfmadd231pd(ymm1, ymm12, ymm11) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + vaddpd(ymm9, ymm4, ymm4) // fold the second accumulator set into ymm4/ymm6/ymm8 + vaddpd(ymm10, ymm6, ymm6) + vaddpd(ymm11, ymm8, ymm8) + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, skip the k_left loop and go to post-accumulation. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP: one k index per pass, k_left passes in total + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm8) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm8, ymm8) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 2), rdx) // rdx = address of c + 2*rs_c (third row of the tile); + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rax, 0*32), ymm15, ymm2) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm3) + + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + vfmadd231pd(ymm3, ymm1, ymm8) + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_3x1_TILE(4, 6, 8) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + + vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_3x1_TILE_BZ(4, 6, 8) + jmp(.DDONE) // jump to end. 
+ + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_2x1 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 2*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 2*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 2*8)) // prefetch c + 4*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP: executed k_iter times; each pass handles 4 consecutive k indices (iterations 0-3) + + + // ---------------------------------- iteration 0 (b in ymm1, a in ymm2/ymm3, accumulates into ymm4/ymm6) + +#if 1 + prefetch(0, mem(rdx, 2*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 (b in ymm9, a in ymm10/ymm11, accumulates into the second set ymm7/ymm8) + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm9) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm9, ymm10, ymm7) + vfmadd231pd(ymm9, ymm11, ymm8) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 (same registers as iteration 0) + +#if 1 + prefetch(0, mem(rdx, r9, 2, 2*8)) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 (same registers as iteration 1) + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 1*cs_a; (3*cs_a total so far) + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 1*cs_a; (4*cs_a total for this pass) +#endif + // Load 1 element of b as per the mask_1 mask vector held in ymm15 + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm9) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm9, ymm10, ymm7) + vfmadd231pd(ymm9, ymm11, ymm8) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + vaddpd(ymm7, ymm4, ymm4) // fold the second accumulator set into ymm4/ymm6 + vaddpd(ymm8, ymm6, ymm6) + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. 
+ je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + vbroadcastsd(mem(rax, r8, 1), ymm2) + vfmadd231pd(ymm1, ymm2, ymm6) + + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm6, ymm6) + + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm2) + + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rdx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_2x1_TILE(4, 6) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + add(rdi, rcx) + + vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + label(.DCOLSTORBZ) + + C_TRANSPOSE_2x1_TILE_BZ(4, 6) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} + +void bli_dgemmsup_rv_haswell_asm_1x1 + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +// Sets up the mask for loading relevant remainder elements in load direction +// int64_t array of size 4 represents the mask for 4 elements of AVX2 vector register. 
+// +// Low end High end +// ________________________ +// | | | | | +// | 1 | 2 | 3 | 4 | ----> Source vector +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | -1 | 0 | 0 | 0 | ----> Mask vector( mask_1 ) +// |_____|_____|_____|_____| +// +// ________________________ +// | | | | | +// | 1 | 0 | 0 | 0 | ----> Destination vector +// |_____|_____|_____|_____| +// +// kernel is using mask_1 which is set to -1, 0, 0, 0 so that the +// 1 element will be loaded. +// + int64_t const *mask_vec = mask_1; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm15) //load + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) + lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(b), rbx) // load address of b. + mov(var(rs_b), r10) // load rs_b + //mov(var(cs_b), r11) // load cs_b + lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) + //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLPFETCH) // jump to column storage case + label(.DROWPFETCH) // row-stored prefetching on c + + lea(mem(rcx, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*rs_c + + jmp(.DPOSTPFETCH) // jump to end of prefetching c + label(.DCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) + lea(mem(rsi, rsi, 2), rdx) // rdx = 3*cs_c; + prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c + + label(.DPOSTPFETCH) // done prefetching c + + +#if 1 + lea(mem(rax, r9, 8), rdx) // + lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; +#endif + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.DLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + +#if 1 + prefetch(0, mem(rdx, 1*8)) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + +#if 0 + prefetch(0, mem(rdx, r9, 1, 4*8)) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm7) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm8) + vfmadd231pd(ymm7, ymm8, ymm5) + + add(r9, rax) // a += cs_a; + // ---------------------------------- iteration 2 + +#if 1 + prefetch(0, mem(rdx, r9, 2, 1*8)) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // 
---------------------------------- iteration 3 + +#if 1 + lea(mem(rdx, r9, 2), rdx) // a_prefetch += 2*cs_a; + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 3*cs_a; + prefetch(0, mem(rdx, 4*8)) + lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm7) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm8) + vfmadd231pd(ymm7, ymm8, ymm5) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + vaddpd(ymm5, ymm4, ymm4) + + label(.DCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + +#if 0 + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) +#endif + //Loads 1 element as per mask_1 mask vector + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastsd(mem(rax ), ymm2) + vfmadd231pd(ymm1, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm1) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DROWSTORED) + + + vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm4) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + C_TRANSPOSE_1x1_TILE(4) + jmp(.DDONE) // jump to end. + + label(.DBETAZERO) + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DROWSTORBZ) + + + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + C_TRANSPOSE_1x1_TILE_BZ(4) + jmp(.DDONE) // jump to end. + + label(.DDONE) + vzeroupper() + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [n0] "m" (n0), + [rs_c] "m" (rs_c), + [mask_vec] "m" (mask_vec), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", + "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c index 7c2fd21e1e..71178b2907 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... 
-------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -168,31 +168,31 @@ void bli_dgemmsup_rv_haswell_asm_6x2 prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -200,19 +200,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -226,25 +226,25 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) 
- + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -252,18 +252,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -278,43 +278,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -322,57 +322,57 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -381,42 +381,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14) vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -452,40 +452,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) @@ -517,13 +517,13 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -589,9 +589,9 @@ void bli_dgemmsup_rv_haswell_asm_5x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -606,7 +606,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -647,21 +647,21 @@ void bli_dgemmsup_rv_haswell_asm_5x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 5*8)) #endif @@ -673,17 +673,17 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -697,23 +697,23 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -721,16 +721,16 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) 
vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -745,41 +745,41 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -787,54 +787,54 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // r13 = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -843,37 +843,27 @@ void bli_dgemmsup_rv_haswell_asm_5x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) - vmovupd(xmm4, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) - vmovupd(xmm6, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) - vmovupd(xmm8, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) - vmovupd(xmm10, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) - vmovupd(xmm12, mem(rcx, 0*32)) - //add(rdi, rcx) - + vfmadd231pd(mem(rax, 0*32), xmm3, xmm6) + vfmadd231pd(mem(rbx, 0*32), xmm3, xmm8) + vfmadd231pd(mem(r8, 0*32), xmm3, xmm10) + vfmadd231pd(mem(rdx, 0*32), xmm3, xmm12) + vmovupd(xmm4, mem(rcx, 0*32)) + vmovupd(xmm6, mem(rax, 0*32)) + vmovupd(xmm8, mem(rbx, 0*32)) + vmovupd(xmm10, mem(r8, 0*32)) +
vmovupd(xmm12, mem(rdx, 0*32)) + + jmp(.DDONE) // jump to end. @@ -908,37 +898,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) @@ -948,7 +938,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -968,13 +958,13 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vmovhpd(xmm0, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1040,9 +1030,9 @@ void bli_dgemmsup_rv_haswell_asm_4x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1057,7 +1047,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1091,31 +1081,31 @@ void bli_dgemmsup_rv_haswell_asm_4x2 prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1123,14 +1113,14 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1144,20 +1134,20 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1165,13 +1155,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1186,89 +1176,89 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1277,32 +1267,25 @@ void bli_dgemmsup_rv_haswell_asm_4x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) + vfmadd231pd(mem(rax, 0*32), xmm3, xmm6) + vfmadd231pd(mem(rdx, 0*32), xmm3, xmm8) + vfmadd231pd(mem(rbx, 0*32), xmm3, xmm10) + vmovupd(xmm4, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) - vmovupd(xmm6, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) - vmovupd(xmm8, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) - vmovupd(xmm10, mem(rcx, 0*32)) - //add(rdi, rcx) - - + vmovupd(xmm6, mem(rax, 0*32)) + vmovupd(xmm8, mem(rdx, 0*32)) + vmovupd(xmm10, mem(rbx, 0*32)) + + jmp(.DDONE) // jump to end. @@ -1328,32 +1311,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1364,7 +1347,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -1377,13 +1360,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1449,9 +1432,9 @@ void bli_dgemmsup_rv_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1466,7 +1449,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1499,31 +1482,31 @@ void bli_dgemmsup_rv_haswell_asm_3x2 prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1531,12 +1514,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1548,20 +1531,20 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(xmm0, xmm2, xmm4) - vfmadd231pd(xmm0, xmm3, xmm6) - + vfmadd231pd(xmm0, xmm2, xmm9) + vfmadd231pd(xmm0, xmm3, xmm10) + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm8) - + vfmadd231pd(xmm0, xmm2, xmm11) + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1569,11 +1552,11 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) 
vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1586,38 +1569,37 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(xmm0, xmm2, xmm4) - vfmadd231pd(xmm0, xmm3, xmm6) - + vfmadd231pd(xmm0, xmm2, xmm9) + vfmadd231pd(xmm0, xmm3, xmm10) + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm8) - - - + vfmadd231pd(xmm0, xmm2, xmm11) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(xmm9, xmm4, xmm4) + vaddpd(xmm10, xmm6, xmm6) + vaddpd(xmm11, xmm8, xmm8) + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1625,78 +1607,73 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) - vmovupd(xmm4, mem(rcx, 0*32)) - add(rdi, rcx) + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) - vmovupd(xmm6, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) + vfmadd231pd(mem(rbx, 0*32), xmm3, xmm6) + vfmadd231pd(mem(rdx, 0*32), xmm3, xmm8) + + vmovupd(xmm4, mem(rcx, 0*32)) + vmovupd(xmm6, mem(rbx, 0*32)) + vmovupd(xmm8, mem(rdx, 0*32)) - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) - vmovupd(xmm8, mem(rcx, 0*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1725,26 +1702,26 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1755,8 +1732,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vmovupd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1784,12 +1761,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1856,9 +1833,9 @@ void bli_dgemmsup_rv_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1873,7 +1850,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1905,72 +1882,72 @@ void bli_dgemmsup_rv_haswell_asm_2x2 prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif - vmovupd(mem(rbx, 0*32), xmm0) + vmovupd(mem(rbx, 0*32), xmm9) add(r10, rbx) // b += rs_b; - - vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) + + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm4) - vfmadd231pd(xmm0, xmm3, xmm6) - - + vfmadd231pd(xmm9, xmm10, xmm7) + vfmadd231pd(xmm9, xmm11, xmm8) + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -1978,84 +1955,82 @@ void bli_dgemmsup_rv_haswell_asm_2x2 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif - vmovupd(mem(rbx, 0*32), xmm0) + vmovupd(mem(rbx, 0*32), xmm9) add(r10, rbx) // b += rs_b; - - vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) + + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm4) - vfmadd231pd(xmm0, xmm3, xmm6) - - - + vfmadd231pd(xmm9, xmm10, xmm7) + vfmadd231pd(xmm9, xmm11, xmm8) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(xmm7, xmm4, xmm4) + vaddpd(xmm8, xmm6, xmm6) + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2064,22 +2039,20 @@ void bli_dgemmsup_rv_haswell_asm_2x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) + vfmadd231pd(mem(rbx, 0*32), xmm3, xmm6) + vmovupd(xmm4, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) - vmovupd(xmm6, mem(rcx, 0*32)) - //add(rdi, rcx) - - + vmovupd(xmm6, mem(rbx, 0*32)) + + jmp(.DDONE) // jump to end. @@ -2099,34 +2072,34 @@ void bli_dgemmsup_rv_haswell_asm_2x2 jmp(.DDONE) // jump to end. - - - + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) - + vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -2135,13 +2108,13 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(xmm1, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2207,9 +2180,9 @@ void bli_dgemmsup_rv_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2224,7 +2197,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2255,31 +2228,31 @@ void bli_dgemmsup_rv_haswell_asm_1x2 prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2287,34 +2260,34 @@ void bli_dgemmsup_rv_haswell_asm_1x2 add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif - vmovupd(mem(rbx, 0*32), xmm0) + vmovupd(mem(rbx, 0*32), xmm6) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm7) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm4) - - + vfmadd231pd(xmm6, xmm7, xmm5) + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2322,98 +2295,95 @@ void bli_dgemmsup_rv_haswell_asm_1x2 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif - vmovupd(mem(rbx, 0*32), xmm0) + vmovupd(mem(rbx, 0*32), xmm6) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm7) add(r9, rax) // a += cs_a; - vfmadd231pd(xmm0, xmm2, xmm4) - - - + vfmadd231pd(xmm6, xmm7, xmm5) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(xmm5, xmm4, xmm4) + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2428,48 +2398,48 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) - + //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
- + label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vmovlpd(xmm4, mem(rcx )) vmovhpd(xmm4, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c index 795ca5772b..809986ab48 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c @@ -362,19 +362,17 @@ void bli_dgemmsup_rv_haswell_asm_5x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - + vbroadcastsd(mem(rax, r8, 2), ymm14) vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm14, ymm8) vfmadd231pd(ymm1, ymm2, ymm10) - vbroadcastsd(mem(rax, r8, 4), ymm2) - vfmadd231pd(ymm1, ymm2, ymm12) + vbroadcastsd(mem(rax, r8, 4), ymm3) + vfmadd231pd(ymm1, ymm3, ymm12) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 @@ -387,19 +385,17 @@ void bli_dgemmsup_rv_haswell_asm_5x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) + vbroadcastsd(mem(rax, r8, 2), ymm14) vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vfmadd231pd(ymm1, ymm14, ymm9) + vfmadd231pd(ymm1, ymm2, ymm11) - vbroadcastsd(mem(rax, r8, 4), ymm2) - vfmadd231pd(ymm1, ymm2, ymm12) + vbroadcastsd(mem(rax, r8, 4), ymm3) + vfmadd231pd(ymm1, ymm3, ymm13) add(r9, rax) // a += cs_a; // 
---------------------------------- iteration 2 @@ -411,19 +407,17 @@ void bli_dgemmsup_rv_haswell_asm_5x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - + vbroadcastsd(mem(rax, r8, 2), ymm14) vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm14, ymm8) vfmadd231pd(ymm1, ymm2, ymm10) - vbroadcastsd(mem(rax, r8, 4), ymm2) - vfmadd231pd(ymm1, ymm2, ymm12) + vbroadcastsd(mem(rax, r8, 4), ymm3) + vfmadd231pd(ymm1, ymm3, ymm12) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 @@ -437,27 +431,27 @@ void bli_dgemmsup_rv_haswell_asm_5x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) + vbroadcastsd(mem(rax, r8, 2), ymm14) vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vfmadd231pd(ymm1, ymm14, ymm9) + vfmadd231pd(ymm1, ymm2, ymm11) - vbroadcastsd(mem(rax, r8, 4), ymm2) - vfmadd231pd(ymm1, ymm2, ymm12) + vbroadcastsd(mem(rax, r8, 4), ymm3) + vfmadd231pd(ymm1, ymm3, ymm13) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + vaddpd(ymm13, ymm12, ymm12) label(.DCONSIDKLEFT) @@ -479,19 +473,17 @@ void bli_dgemmsup_rv_haswell_asm_5x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - + vbroadcastsd(mem(rax, r8, 2), ymm14) vbroadcastsd(mem(rax, r13, 1), ymm2) + vfmadd231pd(ymm1, ymm14, ymm8) vfmadd231pd(ymm1, ymm2, ymm10) - vbroadcastsd(mem(rax, r8, 4), ymm2) - vfmadd231pd(ymm1, ymm2, ymm12) + vbroadcastsd(mem(rax, r8, 4), ymm3) + vfmadd231pd(ymm1, ymm3, ymm12) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; @@ -544,37 +536,27 @@ void bli_dgemmsup_rv_haswell_asm_5x3 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm4) - vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------1 - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) - vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------2 - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm8) - vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------3 + vmaskmovpd(mem(rax, 0*32), ymm15, ymm2) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm3) + vmaskmovpd(mem(r8, 0*32), ymm15, ymm5) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm7) - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm10) - vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + vfmadd231pd(ymm3, ymm1, ymm8) + 
vfmadd231pd(ymm5, ymm1, ymm10) + vfmadd231pd(ymm7, ymm1, ymm12) - add(rdi, rcx) - //-----------------------4 - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm12) - vmaskmovpd(ymm12, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rbx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(r8, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rdx, 0*32)) jmp(.DDONE) // jump to end. @@ -793,16 +775,14 @@ void bli_dgemmsup_rv_haswell_asm_4x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm1, ymm3, ymm6) // row 1 uses its own broadcast (ymm3), not row 0's (ymm2) - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - - vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm8) + vfmadd231pd(ymm1, ymm13, ymm10) add(r9, rax) // a += cs_a; @@ -816,16 +796,14 @@ void bli_dgemmsup_rv_haswell_asm_4x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) // row 1 partial sum uses the mem(rax, r8, 1) broadcast in ymm3 - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - - vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm9) + vfmadd231pd(ymm1, ymm13, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -838,16 +816,14 @@ void bli_dgemmsup_rv_haswell_asm_4x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) - vbroadcastsd(mem(rax, r8, 1), ymm2) -
vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - - vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm8) + vfmadd231pd(ymm1, ymm13, ymm10) add(r9, rax) // a += cs_a; @@ -864,24 +840,24 @@ void bli_dgemmsup_rv_haswell_asm_4x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm1, ymm3, ymm7) - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - - vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm9) + vfmadd231pd(ymm1, ymm13, ymm11) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) @@ -904,16 +880,14 @@ void bli_dgemmsup_rv_haswell_asm_4x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm1, ymm3, ymm6) // row 1 uses its own broadcast (ymm3), not row 0's (ymm2) - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) - - vbroadcastsd(mem(rax, r13, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm10) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) + vfmadd231pd(ymm1, ymm12, ymm8) + vfmadd231pd(ymm1, ymm13, ymm10) add(r9, rax) // a += cs_a; @@ -965,33 +939,24 @@ void bli_dgemmsup_rv_haswell_asm_4x3 label(.DROWSTORED) - - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm4) - vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------1 + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) - vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------2 - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm8) - vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + vmaskmovpd(mem(rax, 0*32), ymm15, ymm2) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm3) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm14) - add(rdi, rcx) - //-----------------------3 - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm10) - vmaskmovpd(ymm10, ymm15, mem(rcx, 0*32)) + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + vfmadd231pd(ymm3, ymm1, ymm8) + vfmadd231pd(ymm14, ymm1, ymm10) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rbx, 0*32)) jmp(.DDONE) //
jump to end. @@ -1208,13 +1173,12 @@ void bli_dgemmsup_rv_haswell_asm_3x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm8) add(r9, rax) // a += cs_a; @@ -1228,13 +1192,12 @@ void bli_dgemmsup_rv_haswell_asm_3x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm1, ymm3, ymm10) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -1247,13 +1210,12 @@ void bli_dgemmsup_rv_haswell_asm_3x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm1, ymm3, ymm6) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm8) add(r9, rax) // a += cs_a; @@ -1270,13 +1232,12 @@ void bli_dgemmsup_rv_haswell_asm_3x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax, r8, 1), ymm3) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm1, ymm3, ymm10) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm1, ymm2, ymm8) + vbroadcastsd(mem(rax, r8, 2), ymm12) + vfmadd231pd(ymm1, ymm12, ymm11) add(r9, rax) // a += cs_a; @@ -1284,10 +1245,9 @@ 
void bli_dgemmsup_rv_haswell_asm_3x3 dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - + vaddpd(ymm9, ymm4, ymm4) + vaddpd(ymm10, ymm6, ymm6) + vaddpd(ymm11, ymm8, ymm8) label(.DCONSIDKLEFT) @@ -1366,27 +1326,19 @@ void bli_dgemmsup_rv_haswell_asm_3x3 label(.DROWSTORED) - + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; //Loads 3 elements as per mask_3 mask vector vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm4) - vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------1 - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) - vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) - - add(rdi, rcx) - //-----------------------2 - - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm8) - vmaskmovpd(ymm8, ymm15, mem(rcx, 0*32)) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm2) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm3) + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) + vfmadd231pd(ymm3, ymm1, ymm8) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rbx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 0*32)) jmp(.DDONE) // jump to end. 
@@ -1596,10 +1548,9 @@ void bli_dgemmsup_rv_haswell_asm_2x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm1, ymm3, ymm6) add(r9, rax) // a += cs_a; @@ -1609,14 +1560,13 @@ void bli_dgemmsup_rv_haswell_asm_2x3 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif //Loads 3 elements as per mask_3 mask vector - vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm9, ymm10, ymm7) + vfmadd231pd(ymm9, ymm11, ymm8) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -1629,10 +1579,9 @@ void bli_dgemmsup_rv_haswell_asm_2x3 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm1, ymm3, ymm6) add(r9, rax) // a += cs_a; @@ -1645,14 +1594,13 @@ void bli_dgemmsup_rv_haswell_asm_2x3 lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; #endif //Loads 3 elements as per mask_3 mask vector - vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) - - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm9, ymm10, ymm7) + vfmadd231pd(ymm9, ymm11, ymm8) add(r9, rax) // a += cs_a; @@ -1660,10 +1608,8 @@ void bli_dgemmsup_rv_haswell_asm_2x3 dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - + vaddpd(ymm7, ymm4, ymm4) + vaddpd(ymm8, ymm6, ymm6) label(.DCONSIDKLEFT) @@ -1739,17 +1685,15 @@ void bli_dgemmsup_rv_haswell_asm_2x3 label(.DROWSTORED) - + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm4) - vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(mem(rdx, 0*32), ymm15, ymm2) - add(rdi, rcx) - //-----------------------1 + vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(ymm2, ymm1, ymm6) - vmaskmovpd(mem(rcx, 0*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) - vmaskmovpd(ymm6, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rdx, 0*32)) jmp(.DDONE) // jump to end. @@ -1956,11 +1900,11 @@ void bli_dgemmsup_rv_haswell_asm_1x3 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif //Loads 3 elements as per mask_3 mask vector - vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm7) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax ), ymm8) + vfmadd231pd(ymm7, ymm8, ymm5) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -1986,21 +1930,18 @@ void bli_dgemmsup_rv_haswell_asm_1x3 lea(mem(rdx, r9, 1), rdx) // a_prefetch += 4*cs_a; #endif //Loads 3 elements as per mask_3 mask vector - vmaskmovpd(mem(rbx, 0*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 0*32), ymm15, ymm7) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax ), ymm8) + vfmadd231pd(ymm7, ymm8, ymm5) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - + vaddpd(ymm5, ymm4, ymm4) label(.DCONSIDKLEFT) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c index ad43e7ba57..99a128a238 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -612,9 +612,9 @@ void bli_dgemmsup_rv_haswell_asm_5x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -629,7 +629,7 @@ void bli_dgemmsup_rv_haswell_asm_5x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -672,19 +672,19 @@ void bli_dgemmsup_rv_haswell_asm_5x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -698,17 +698,17 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -720,18 +720,18 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm11) + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm12) - + vfmadd231pd(ymm0, ymm2, ymm13) + // ---------------------------------- iteration 2 @@ -746,16 +746,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, 
r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -768,43 +768,47 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm11) + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm12) - - - + vfmadd231pd(ymm0, ymm2, ymm13) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + vaddpd(ymm13, ymm12, ymm12) + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -812,54 +816,54 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -868,37 +872,27 @@ void bli_dgemmsup_rv_haswell_asm_5x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + lea(mem(rcx, rdi, 1), rax) // load address of c + 2*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 2*rs_c; + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rax, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(r8, 0*32), ymm3, ymm10) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm12) + vmovupd(ymm4, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) - vmovupd(ymm12, mem(rcx, 0*32)) - //add(rdi, rcx) - - + vmovupd(ymm6, mem(rax, 0*32)) + vmovupd(ymm8, mem(rbx, 0*32)) + vmovupd(ymm10, mem(r8, 0*32)) + vmovupd(ymm12, mem(rdx, 0*32)) + jmp(.DDONE) // jump to end. @@ -945,41 +939,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -1012,13 +1006,13 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vmovhpd(xmm1, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1085,9 +1079,9 @@ void bli_dgemmsup_rv_haswell_asm_4x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. 
- + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1102,7 +1096,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1138,8 +1132,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 prefetch(0, mem(rcx, rdx, 1, 3*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; @@ -1147,22 +1141,22 @@ void bli_dgemmsup_rv_haswell_asm_4x4 - + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1170,14 +1164,14 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vbroadcastsd(mem(rax, r13, 1), ymm3) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - - + vfmadd231pd(ymm0, ymm12, ymm8) + vfmadd231pd(ymm0, ymm13, ymm10) + + // ---------------------------------- iteration 1 #if 0 @@ -1186,39 +1180,39 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vbroadcastsd(mem(rax, r13, 1), ymm3) + 
vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - + vfmadd231pd(ymm0, ymm12, ymm9) + vfmadd231pd(ymm0, ymm13, ymm11) + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vbroadcastsd(mem(rax, r13, 1), ymm3) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - + vfmadd231pd(ymm0, ymm12, ymm8) + vfmadd231pd(ymm0, ymm13, ymm10) + // ---------------------------------- iteration 3 @@ -1228,128 +1222,123 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vbroadcastsd(mem(rax, r13, 1), ymm3) + vfmadd231pd(ymm0, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm7) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - - - + vfmadd231pd(ymm0, ymm12, ymm9) + vfmadd231pd(ymm0, ymm13, ymm11) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + vaddpd(ymm5, ymm4, ymm4) + vaddpd(ymm7, ymm6, ymm6) + vaddpd(ymm9, ymm8, ymm8) + vaddpd(ymm11, ymm10, ymm10) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) - vbroadcastsd(mem(rax, r13, 1), ymm3) + + vbroadcastsd(mem(rax, r8, 2), ymm12) + vbroadcastsd(mem(rax, r13, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm0, ymm3, ymm10) - - + vfmadd231pd(ymm0, ymm12, ymm8) + vfmadd231pd(ymm0, ymm13, ymm10) + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) - - + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*cs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rax, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm10) + vmovupd(ymm4, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) - //add(rdi, rcx) - - + vmovupd(ymm6, mem(rax, 0*32)) + vmovupd(ymm8, mem(rdx, 0*32)) + vmovupd(ymm10, mem(rbx, 0*32)) + + jmp(.DDONE) // jump to end. @@ -1381,33 +1370,33 @@ void bli_dgemmsup_rv_haswell_asm_4x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1417,7 +1406,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) @@ -1435,12 +1424,12 @@ void bli_dgemmsup_rv_haswell_asm_4x4 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1507,9 +1496,9 @@ void bli_dgemmsup_rv_haswell_asm_3x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1524,7 +1513,7 @@ void bli_dgemmsup_rv_haswell_asm_3x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1559,31 +1548,31 @@ void bli_dgemmsup_rv_haswell_asm_3x4 prefetch(0, mem(rcx, rdx, 1, 2*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1591,12 +1580,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) + + vbroadcastsd(mem(rax, r8, 2), ymm12) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - - + vfmadd231pd(ymm0, ymm12, ymm8) + + // ---------------------------------- iteration 1 #if 0 @@ -1608,20 +1597,20 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 2), ymm12) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - + vfmadd231pd(ymm0, ymm12, ymm11) + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) 
// b += rs_b; @@ -1629,11 +1618,11 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) + + vbroadcastsd(mem(rax, r8, 2), ymm12) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - + vfmadd231pd(ymm0, ymm12, ymm8) + // ---------------------------------- iteration 3 @@ -1646,38 +1635,37 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - - vbroadcastsd(mem(rax, r8, 2), ymm2) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + + vbroadcastsd(mem(rax, r8, 2), ymm12) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - - - + vfmadd231pd(ymm0, ymm12, ymm11) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(ymm9, ymm4, ymm4) + vaddpd(ymm10, ymm6, ymm6) + vaddpd(ymm11, ymm8, ymm8) + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1685,78 +1673,73 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) - add(rdi, rcx) + lea(mem(rcx, rdi, 1), rbx) // load address of c + 2*rs_c; - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - add(rdi, rcx) + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm8) + + vmovupd(ymm4, mem(rcx, 0*32)) + vmovupd(ymm6, mem(rbx, 0*32)) + vmovupd(ymm8, mem(rdx, 0*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1797,26 +1780,26 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1827,8 +1810,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vmovupd(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1864,12 +1847,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1936,9 +1919,9 @@ void bli_dgemmsup_rv_haswell_asm_2x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1953,7 +1936,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1987,31 +1970,31 @@ void bli_dgemmsup_rv_haswell_asm_2x4 prefetch(0, mem(rcx, rdx, 1, 1*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2020,26 +2003,26 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - + vfmadd231pd(ymm9, ymm10, ymm7) + vfmadd231pd(ymm9, ymm11, ymm8) + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2052,7 +2035,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2060,40 +2043,38 @@ void bli_dgemmsup_rv_haswell_asm_2x4 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) + vbroadcastsd(mem(rax ), ymm10) + vbroadcastsd(mem(rax, r8, 1), ymm11) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm0, ymm3, ymm6) - - - + vfmadd231pd(ymm9, ymm10, ymm7) + vfmadd231pd(ymm9, ymm11, ymm8) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(ymm7, ymm4, ymm4) + vaddpd(ymm8, ymm6, ymm6) + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2102,42 +2083,42 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2146,22 +2127,19 @@ void bli_dgemmsup_rv_haswell_asm_2x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm6) + vmovupd(ymm4, mem(rcx, 0*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - //add(rdi, rcx) - - + vmovupd(ymm6, mem(rdx, 0*32)) + jmp(.DDONE) // jump to end. @@ -2187,24 +2165,24 @@ void bli_dgemmsup_rv_haswell_asm_2x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2215,7 +2193,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) @@ -2228,13 +2206,13 @@ void bli_dgemmsup_rv_haswell_asm_2x4 vmovupd(xmm4, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2300,9 +2278,9 @@ void bli_dgemmsup_rv_haswell_asm_1x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2317,7 +2295,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2350,27 +2328,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4 prefetch(0, mem(rcx, rdx, 1, 0*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2381,35 +2359,35 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm1) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - + vfmadd231pd(ymm1, ymm3, ymm5) + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - - vmovupd(mem(rbx, 0*32), ymm0) + + vmovupd(mem(rbx, 0*32), ymm6) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm7) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - + vfmadd231pd(ymm6, ymm7, ymm4) + // ---------------------------------- iteration 3 @@ -2417,33 +2395,30 @@ void bli_dgemmsup_rv_haswell_asm_1x4 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm8) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm9) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - - - + vfmadd231pd(ymm8, ymm9, ymm5) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(ymm5, ymm4, ymm4) + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2455,41 +2430,41 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2497,17 +2472,17 @@ void bli_dgemmsup_rv_haswell_asm_1x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2529,15 +2504,15 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) - + //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) @@ -2545,10 +2520,10 @@ void bli_dgemmsup_rv_haswell_asm_1x4 jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2558,7 +2533,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vmovupd(ymm4, ymm0) @@ -2569,14 +2544,14 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - - + + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c index ac12db75c1..8c6a45c513 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c @@ -741,53 +741,45 @@ void bli_dgemmsup_rv_haswell_asm_5x5 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) - vmovupd(ymm3, mem(rcx, 0*32)) - vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rax, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rax, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm6) - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------2 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vfmadd231pd(mem(r8, 0*32), ymm1, ymm9) +
vmaskmovpd(mem(r8, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) - add(rdi, rcx) - //-----------------------3 + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm10) - vmovupd(ymm9, mem(rcx, 0*32)) - vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) - //-----------------------4 + vmovupd(ymm5, mem(rax, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm12) + vmovupd(ymm7, mem(rbx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rbx, 1*32)) - vmovupd(ymm11, mem(rcx, 0*32)) - vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + vmovupd(ymm9, mem(r8, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(r8, 1*32)) + + vmovupd(ymm11, mem(rdx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. 
@@ -1223,44 +1215,38 @@ void bli_dgemmsup_rv_haswell_asm_4x5 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) - vmovupd(ymm3, mem(rcx, 0*32)) - vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rax, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rax, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm6) - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) - add(rdi, rcx) - //-----------------------2 + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) - //-----------------------3 + vmovupd(ymm5, mem(rax, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm10) + vmovupd(ymm7, mem(rdx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 1*32)) - vmovupd(ymm9, mem(rcx, 0*32)) - vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) - //-----------------------4 + vmovupd(ymm9, mem(rbx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rbx, 1*32)) jmp(.DDONE) // jump to end. 
@@ -1513,16 +1499,16 @@ void bli_dgemmsup_rv_haswell_asm_3x5 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm0, ymm2, ymm7) - vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -1564,26 +1550,28 @@ void bli_dgemmsup_rv_haswell_asm_3x5 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm0, ymm2, ymm7) - vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - + vaddpd(ymm9, ymm3, ymm3) + vaddpd(ymm10, ymm4, ymm4) + vaddpd(ymm11, ymm5, ymm5) + vaddpd(ymm12, ymm6, ymm6) + vaddpd(ymm13, ymm7, ymm7) + vaddpd(ymm14, ymm8, ymm8) label(.DCONSIDKLEFT) @@ -1667,33 +1655,29 @@ void bli_dgemmsup_rv_haswell_asm_3x5 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) - vmovupd(ymm3, mem(rcx, 0*32)) - vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm6) - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) - add(rdi, rcx) - //-----------------------2 - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm8) + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vmovupd(ymm5, mem(rbx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rbx, 1*32)) + + vmovupd(ymm7, mem(rdx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. 
@@ -1914,9 +1898,9 @@ void bli_dgemmsup_rv_haswell_asm_2x5 vfmadd231pd(ymm0, ymm2, ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm0, ymm11, ymm5) + vfmadd231pd(ymm1, ymm11, ymm6) add(r9, rax) // a += cs_a; @@ -1933,12 +1917,12 @@ void bli_dgemmsup_rv_haswell_asm_2x5 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm0, ymm11, ymm9) + vfmadd231pd(ymm1, ymm11, ymm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -1957,9 +1941,9 @@ void bli_dgemmsup_rv_haswell_asm_2x5 vfmadd231pd(ymm0, ymm2, ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm0, ymm11, ymm5) + vfmadd231pd(ymm1, ymm11, ymm6) add(r9, rax) // a += cs_a; @@ -1976,22 +1960,22 @@ void bli_dgemmsup_rv_haswell_asm_2x5 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm7) + vfmadd231pd(ymm1, ymm2, ymm8) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm0, ymm11, ymm9) + vfmadd231pd(ymm1, ymm11, ymm10) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - + vaddpd(ymm7, ymm3, ymm3) + vaddpd(ymm8, ymm4, ymm4) + vaddpd(ymm9, ymm5, ymm5) + vaddpd(ymm10, ymm6, ymm6) label(.DCONSIDKLEFT) @@ -2018,9 +2002,9 @@ void bli_dgemmsup_rv_haswell_asm_2x5 vfmadd231pd(ymm0, ymm2, ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm11) + vfmadd231pd(ymm0, ymm11, ymm5) + vfmadd231pd(ymm1, ymm11, ymm6) add(r9, rax) // a += cs_a; @@ -2073,23 +2057,21 @@ void bli_dgemmsup_rv_haswell_asm_2x5 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm2) + vfmadd231pd(ymm2, ymm1, ymm6) + vmovupd(ymm3, mem(rcx, 0*32)) vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) - - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + vmovupd(ymm5, mem(rdx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. 
@@ -2315,14 +2297,14 @@ void bli_dgemmsup_rv_haswell_asm_1x5 #endif //Loads 4 element - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm8) //Loads 1 element as per mask_1 mask vector - vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax ), ymm7) + vfmadd231pd(ymm8, ymm7, ymm5) + vfmadd231pd(ymm9, ymm7, ymm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -2350,24 +2332,22 @@ void bli_dgemmsup_rv_haswell_asm_1x5 #endif //Loads 4 element - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm8) //Loads 1 element as per mask_1 mask vector - vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax ), ymm7) + vfmadd231pd(ymm8, ymm7, ymm5) + vfmadd231pd(ymm9, ymm7, ymm6) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - + vaddpd(ymm5, ymm3, ymm3) + vaddpd(ymm6, ymm4, ymm4) label(.DCONSIDKLEFT) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index 9f80ef2f0d..caa20a06cd 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... 
-------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -115,15 +115,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -180,18 +180,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -208,14 +208,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -224,7 +224,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -241,14 +241,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -256,8 +256,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 @@ -274,14 +274,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -289,7 +289,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) - + // ---------------------------------- iteration 3 @@ -307,14 +307,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -322,50 +322,50 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -373,22 +373,22 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -401,24 +401,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -427,60 +427,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -555,51 +555,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -656,12 +656,12 @@ void bli_dgemmsup_rv_haswell_asm_6x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -729,15 +729,15 @@ void bli_dgemmsup_rv_haswell_asm_5x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -793,18 +793,18 @@ void bli_dgemmsup_rv_haswell_asm_5x6 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -821,20 +821,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -851,20 +851,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; 
vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 @@ -881,19 +881,19 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -911,82 +911,82 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -997,24 +997,24 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1023,52 +1023,46 @@ void bli_dgemmsup_rv_haswell_asm_5x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) + + vfmadd231pd(mem(rax, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rax, 1*32), xmm3, xmm7) + + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rbx, 1*32), xmm3, xmm9) + + vfmadd231pd(mem(r8, 0*32), ymm3, ymm10) + vfmadd231pd(mem(r8, 1*32), xmm3, xmm11) + + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm12) + vfmadd231pd(mem(rdx, 1*32), xmm3, xmm13) + + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) - vmovupd(xmm7, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + vmovupd(ymm6, mem(rax, 0*32)) + vmovupd(xmm7, mem(rax, 1*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) - vmovupd(xmm9, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) + vmovupd(ymm8, mem(rbx, 0*32)) + vmovupd(xmm9, mem(rbx, 1*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) - vmovupd(xmm11, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) - vmovupd(ymm12, mem(rcx, 0*32)) + vmovupd(ymm10, mem(r8, 0*32)) + vmovupd(xmm11, mem(r8, 1*32)) + + vmovupd(ymm12, mem(rdx, 0*32)) + vmovupd(xmm13, mem(rdx, 1*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) - vmovupd(xmm13, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. @@ -1141,46 +1135,46 @@ void bli_dgemmsup_rv_haswell_asm_5x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1234,12 +1228,12 @@ void bli_dgemmsup_rv_haswell_asm_5x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1306,15 +1300,15 @@ void bli_dgemmsup_rv_haswell_asm_4x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1370,17 +1364,17 @@ void bli_dgemmsup_rv_haswell_asm_4x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1397,7 +1391,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1406,7 +1400,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1423,7 +1417,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1431,8 +1425,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 @@ -1449,7 +1443,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1457,7 +1451,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1475,7 +1469,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1483,43 +1477,43 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1527,22 +1521,22 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -1551,24 +1545,24 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) - - - - - - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; - //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + // now avoid loading C if 
beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1577,44 +1571,40 @@ void bli_dgemmsup_rv_haswell_asm_4x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) + + vfmadd231pd(mem(rax, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rax, 1*32), xmm3, xmm7) + + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rdx, 1*32), xmm3, xmm9) + + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm10) + vfmadd231pd(mem(rbx, 1*32), xmm3, xmm11) + + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) - vmovupd(xmm7, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + vmovupd(ymm6, mem(rax, 0*32)) + vmovupd(xmm7, mem(rax, 1*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) - vmovupd(xmm9, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) + vmovupd(ymm8, mem(rdx, 0*32)) + vmovupd(xmm9, mem(rdx, 1*32)) + + vmovupd(ymm10, mem(rbx, 0*32)) + vmovupd(xmm11, mem(rbx, 1*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) - vmovupd(xmm11, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. @@ -1663,41 +1653,41 @@ void bli_dgemmsup_rv_haswell_asm_4x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1734,9 +1724,9 @@ void bli_dgemmsup_rv_haswell_asm_4x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -1806,9 +1796,9 @@ void bli_dgemmsup_rv_haswell_asm_3x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1823,7 +1813,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1861,31 +1851,31 @@ void bli_dgemmsup_rv_haswell_asm_3x6 prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1896,13 +1886,13 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1915,19 +1905,19 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - + vfmadd231pd(ymm0, ymm2, ymm10) + vfmadd231pd(ymm1, ymm2, ymm11) + vfmadd231pd(ymm0, ymm3, ymm12) + vfmadd231pd(ymm1, ymm3, ymm13) + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm1, ymm2, ymm9) - + vfmadd231pd(ymm0, ymm2, ymm14) + vfmadd231pd(ymm1, ymm2, ymm15) + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -1942,12 +1932,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -1961,41 +1951,44 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - + vfmadd231pd(ymm0, ymm2, ymm10) + vfmadd231pd(ymm1, ymm2, ymm11) + vfmadd231pd(ymm0, ymm3, ymm12) + vfmadd231pd(ymm1, ymm3, ymm13) + 
vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm1, ymm2, ymm9) - - - + vfmadd231pd(ymm0, ymm2, ymm14) + vfmadd231pd(ymm1, ymm2, ymm15) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(ymm10, ymm4, ymm4) + vaddpd(ymm11, ymm5, ymm5) + vaddpd(ymm12, ymm6, ymm6) + vaddpd(ymm13, ymm7, ymm7) + vaddpd(ymm14, ymm8, ymm8) + vaddpd(ymm15, ymm9, ymm9) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2006,91 +1999,83 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) - - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) - vmovupd(xmm5, mem(rcx, 1*32)) - add(rdi, rcx) + label(.DROWSTORED) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) - vmovupd(xmm7, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rbx, 1*32), xmm3, xmm7) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rdx, 1*32), xmm3, xmm9) + vmovupd(ymm4, mem(rcx, 0*32)) + vmovupd(xmm5, mem(rcx, 1*32)) + vmovupd(ymm6, mem(rbx, 0*32)) + vmovupd(xmm7, mem(rbx, 1*32)) + vmovupd(ymm8, mem(rdx, 0*32)) + vmovupd(xmm9, mem(rdx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) - vmovupd(xmm9, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -2131,7 +2116,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 @@ -2162,26 +2147,26 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2193,8 +2178,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2251,12 +2236,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2323,15 +2308,15 @@ void bli_dgemmsup_rv_haswell_asm_2x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2385,17 +2370,17 @@ void bli_dgemmsup_rv_haswell_asm_2x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2414,7 +2399,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -2425,15 +2410,15 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) + vbroadcastsd(mem(rax ), ymm12) + vbroadcastsd(mem(rax, r8, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - - + vfmadd231pd(ymm0, ymm12, ymm8) + vfmadd231pd(ymm1, ymm12, ymm9) + vfmadd231pd(ymm0, ymm13, ymm10) + vfmadd231pd(ymm1, ymm13, ymm11) + + // ---------------------------------- iteration 2 #if 1 @@ -2451,7 +2436,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2463,43 +2448,44 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) + vbroadcastsd(mem(rax ), ymm12) + vbroadcastsd(mem(rax, r8, 1), ymm13) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - - - + vfmadd231pd(ymm0, ymm12, ymm8) + vfmadd231pd(ymm1, ymm12, ymm9) + vfmadd231pd(ymm0, ymm13, ymm10) + vfmadd231pd(ymm1, ymm13, ymm11) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(ymm8, ymm4, ymm4) + vaddpd(ymm9, ymm5, ymm5) + vaddpd(ymm10, ymm6, ymm6) + vaddpd(ymm11, ymm7, ymm7) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2507,44 +2493,44 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2553,28 +2539,24 @@ void bli_dgemmsup_rv_haswell_asm_2x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rdx, 1*32), xmm3, xmm7) + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) + vmovupd(ymm6, mem(rdx, 0*32)) + vmovupd(xmm7, mem(rdx, 1*32)) + - vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) - vmovupd(xmm7, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. @@ -2623,31 +2605,31 @@ void bli_dgemmsup_rv_haswell_asm_2x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2684,9 +2666,9 @@ void bli_dgemmsup_rv_haswell_asm_2x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -2756,15 +2738,15 @@ void bli_dgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2817,17 +2799,17 @@ void bli_dgemmsup_rv_haswell_asm_1x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2843,7 +2825,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -2854,12 +2836,12 @@ void bli_dgemmsup_rv_haswell_asm_1x6 vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - - + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + // ---------------------------------- iteration 2 #if 1 @@ -2874,7 +2856,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -2886,80 +2868,79 @@ void bli_dgemmsup_rv_haswell_asm_1x6 vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) + vbroadcastsd(mem(rax ), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - - - + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + vaddpd(ymm6, ymm4, ymm4) + vaddpd(ymm7, ymm5, ymm5) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2968,20 +2949,20 @@ void bli_dgemmsup_rv_haswell_asm_1x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3018,26 +2999,26 @@ void bli_dgemmsup_rv_haswell_asm_1x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -3063,9 +3044,9 @@ void bli_dgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c index 8c14eba4af..e25c67230c 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c @@ -818,53 +818,45 @@ void bli_dgemmsup_rv_haswell_asm_5x7 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) - vmovupd(ymm3, mem(rcx, 0*32)) - vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rax, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rax, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm6) - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------2 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vfmadd231pd(mem(r8, 0*32), ymm1, ymm9) + vmaskmovpd(mem(r8, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) - add(rdi, rcx) - //-----------------------3 + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm11) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm12) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm10) - vmovupd(ymm9, mem(rcx, 0*32)) - vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + vmovupd(ymm3, mem(rcx, 
0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) - //-----------------------4 + vmovupd(ymm5, mem(rax, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm11) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm12) + vmovupd(ymm7, mem(rbx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rbx, 1*32)) - vmovupd(ymm11, mem(rcx, 0*32)) - vmaskmovpd(ymm12, ymm15, mem(rcx, 1*32)) + vmovupd(ymm9, mem(r8, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(r8, 1*32)) + + vmovupd(ymm11, mem(rdx, 0*32)) + vmaskmovpd(ymm12, ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. @@ -1303,43 +1295,37 @@ void bli_dgemmsup_rv_haswell_asm_4x7 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) - vmovupd(ymm3, mem(rcx, 0*32)) - vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rax, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rax, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm6) - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) - - add(rdi, rcx) - //-----------------------2 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm9) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm10) - add(rdi, rcx) - //-----------------------3 + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm9) - 
vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm10) + vmovupd(ymm5, mem(rax, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rax, 1*32)) - vmovupd(ymm9, mem(rcx, 0*32)) - vmaskmovpd(ymm10, ymm15, mem(rcx, 1*32)) + vmovupd(ymm7, mem(rdx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 1*32)) + + vmovupd(ymm9, mem(rbx, 0*32)) + vmaskmovpd(ymm10, ymm15, mem(rbx, 1*32)) //-----------------------4 jmp(.DDONE) // jump to end. @@ -1592,16 +1578,16 @@ void bli_dgemmsup_rv_haswell_asm_3x7 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm0, ymm2, ymm7) - vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -1643,25 +1629,28 @@ void bli_dgemmsup_rv_haswell_asm_3x7 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm9) + vfmadd231pd(ymm1, ymm2, ymm10) vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm0, ymm2, ymm11) + vfmadd231pd(ymm1, ymm2, ymm12) vbroadcastsd(mem(rax, r8, 2), ymm2) - vfmadd231pd(ymm0, ymm2, ymm7) - vfmadd231pd(ymm1, ymm2, ymm8) + vfmadd231pd(ymm0, ymm2, ymm13) + vfmadd231pd(ymm1, ymm2, ymm14) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - + vaddpd(ymm9, ymm3, ymm3) + vaddpd(ymm10, ymm4, ymm4) + vaddpd(ymm11, ymm5, ymm5) + vaddpd(ymm12, ymm6, ymm6) + vaddpd(ymm13, ymm7, ymm7) + vaddpd(ymm14, ymm8, ymm8) label(.DCONSIDKLEFT) @@ -1747,33 +1736,29 @@ void bli_dgemmsup_rv_haswell_asm_3x7 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) - vmovupd(ymm3, mem(rcx, 0*32)) - vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) + vfmadd231pd(mem(rbx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm2) + vfmadd231pd(ymm2, ymm1, ymm6) - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm7) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm0) + vfmadd231pd(ymm0, ymm1, ymm8) - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) - //-----------------------2 + vmovupd(ymm3, mem(rcx, 0*32)) + vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm7) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm8) + vmovupd(ymm5, mem(rbx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rbx, 1*32)) - vmovupd(ymm7, mem(rcx, 0*32)) - vmaskmovpd(ymm8, ymm15, mem(rcx, 1*32)) + vmovupd(ymm7, mem(rdx, 0*32)) + vmaskmovpd(ymm8, ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. 
@@ -1998,9 +1983,9 @@ void bli_dgemmsup_rv_haswell_asm_2x7 vfmadd231pd(ymm0, ymm2, ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm14) + vfmadd231pd(ymm0, ymm14, ymm5) + vfmadd231pd(ymm1, ymm14, ymm6) add(r9, rax) // a += cs_a; @@ -2017,12 +2002,12 @@ void bli_dgemmsup_rv_haswell_asm_2x7 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm10) + vfmadd231pd(ymm1, ymm2, ymm11) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm14) + vfmadd231pd(ymm0, ymm14, ymm12) + vfmadd231pd(ymm1, ymm14, ymm13) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -2041,9 +2026,9 @@ void bli_dgemmsup_rv_haswell_asm_2x7 vfmadd231pd(ymm0, ymm2, ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm14) + vfmadd231pd(ymm0, ymm14, ymm5) + vfmadd231pd(ymm1, ymm14, ymm6) add(r9, rax) // a += cs_a; @@ -2060,21 +2045,22 @@ void bli_dgemmsup_rv_haswell_asm_2x7 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vfmadd231pd(ymm0, ymm2, ymm10) + vfmadd231pd(ymm1, ymm2, ymm11) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm14) + vfmadd231pd(ymm0, ymm14, ymm12) + vfmadd231pd(ymm1, ymm14, ymm13) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - + vaddpd(ymm10, ymm3, ymm3) + vaddpd(ymm11, ymm4, ymm4) + vaddpd(ymm12, ymm5, ymm5) + vaddpd(ymm13, ymm6, ymm6) label(.DCONSIDKLEFT) @@ -2102,9 +2088,9 @@ void bli_dgemmsup_rv_haswell_asm_2x7 vfmadd231pd(ymm0, ymm2, ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vbroadcastsd(mem(rax, r8, 1), ymm2) - vfmadd231pd(ymm0, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm6) + vbroadcastsd(mem(rax, r8, 1), ymm14) + vfmadd231pd(ymm0, ymm14, ymm5) + vfmadd231pd(ymm1, ymm14, ymm6) add(r9, rax) // a += cs_a; @@ -2153,23 +2139,21 @@ void bli_dgemmsup_rv_haswell_asm_2x7 label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; vfmadd231pd(mem(rcx, 0*32), ymm1, ymm3) vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) vfmadd231pd(ymm0, ymm1, ymm4) + vfmadd231pd(mem(rdx, 0*32), ymm1, ymm5) + vmaskmovpd(mem(rdx, 1*32), ymm15, ymm2) + vfmadd231pd(ymm2, ymm1, ymm6) + vmovupd(ymm3, mem(rcx, 0*32)) vmaskmovpd(ymm4, ymm15, mem(rcx, 1*32)) - add(rdi, rcx) - //-----------------------1 - - vfmadd231pd(mem(rcx, 0*32), ymm1, ymm5) - vmaskmovpd(mem(rcx, 1*32), ymm15, ymm0) - vfmadd231pd(ymm0, ymm1, ymm6) - - vmovupd(ymm5, mem(rcx, 0*32)) - vmaskmovpd(ymm6, ymm15, mem(rcx, 1*32)) + vmovupd(ymm5, mem(rdx, 0*32)) + vmaskmovpd(ymm6, ymm15, mem(rdx, 1*32)) jmp(.DDONE) // jump to end. 
@@ -2400,14 +2384,14 @@ void bli_dgemmsup_rv_haswell_asm_1x7 #endif //Loads 4 element - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm8) //Loads 3 elements as per mask_3 mask vector - vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax ), ymm10) + vfmadd231pd(ymm8, ymm10, ymm6) + vfmadd231pd(ymm9, ymm10, ymm7) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 @@ -2435,14 +2419,14 @@ void bli_dgemmsup_rv_haswell_asm_1x7 #endif //Loads 4 element - vmovupd(mem(rbx, 0*32), ymm0) + vmovupd(mem(rbx, 0*32), ymm8) //Loads 3 elements as per mask_3 mask vector - vmaskmovpd(mem(rbx, 1*32), ymm15, ymm1) + vmaskmovpd(mem(rbx, 1*32), ymm15, ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - vfmadd231pd(ymm0, ymm2, ymm3) - vfmadd231pd(ymm1, ymm2, ymm4) + vbroadcastsd(mem(rax ), ymm10) + vfmadd231pd(ymm8, ymm10, ymm6) + vfmadd231pd(ymm9, ymm10, ymm7) add(r9, rax) // a += cs_a; @@ -2450,8 +2434,8 @@ void bli_dgemmsup_rv_haswell_asm_1x7 jne(.DLOOPKITER) // iterate again if i != 0. - - + vaddpd(ymm6, ymm3, ymm3) + vaddpd(ymm7, ymm4, ymm4) label(.DCONSIDKLEFT) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c index 2a04011f37..cb581bf72a 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... 
-------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -178,7 +178,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; - } + } } // Advance C and B pointers by the mrs and nrs we just used, and @@ -208,9 +208,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -225,7 +225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -275,25 +275,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -304,14 +304,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -320,7 +320,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -337,14 +337,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), 
ymm3) add(r9, rax) // a += cs_a; @@ -352,14 +352,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -370,14 +370,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -385,7 +385,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -403,14 +403,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -418,50 +418,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -469,22 +469,22 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -497,24 +497,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -523,60 +523,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -663,51 +663,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -772,12 +772,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -845,15 +845,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -912,18 +912,18 @@ void bli_dgemmsup_rv_haswell_asm_5x8 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -940,20 +940,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -970,26 +970,26 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; 
vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1000,19 +1000,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1030,37 +1030,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 @@ -1071,41 +1071,41 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1116,24 +1116,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1142,52 +1142,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - - label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) - vmovupd(ymm5, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) - vmovupd(ymm7, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + label(.DROWSTORED) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) - vmovupd(ymm9, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rbx) // load address of c + 2*rs_c; + lea(mem(rbx, rdi, 1), r8) // load address of c + 3*rs_c; - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) - vmovupd(ymm11, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) - vmovupd(ymm12, mem(rcx, 0*32)) + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) + vfmadd231pd(mem(rax, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rax, 1*32), ymm3, ymm7) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rbx, 1*32), ymm3, ymm9) + vfmadd231pd(mem(r8, 0*32), ymm3, ymm10) + vfmadd231pd(mem(r8, 1*32), ymm3, ymm11) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm12) + vfmadd231pd(mem(rdx, 1*32), ymm3, ymm13) + + vmovupd(ymm4, mem(rcx, 0*32)) + vmovupd(ymm5, mem(rcx, 1*32)) + vmovupd(ymm6, mem(rax, 0*32)) + vmovupd(ymm7, mem(rax, 1*32)) + vmovupd(ymm8, mem(rbx, 0*32)) + vmovupd(ymm9, mem(rbx, 1*32)) + vmovupd(ymm10, mem(r8, 0*32)) + vmovupd(ymm11, mem(r8, 1*32)) + vmovupd(ymm12, mem(rdx, 0*32)) + vmovupd(ymm13, mem(rdx, 1*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) - vmovupd(ymm13, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. @@ -1272,46 +1257,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8 jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1370,9 +1355,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1442,9 +1427,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1459,7 +1444,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1501,31 +1486,31 @@ void bli_dgemmsup_rv_haswell_asm_4x8 prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1536,7 +1521,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1544,8 +1529,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1562,7 +1547,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1570,10 +1555,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -1588,7 +1573,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1596,7 +1581,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1614,7 +1599,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1622,27 +1607,27 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, 
ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -1658,7 +1643,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1666,22 +1651,22 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1690,72 +1675,62 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) - - - - - - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; - //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + + + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + + // now avoid 
loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) - vmovupd(ymm5, mem(rcx, 1*32)) - add(rdi, rcx) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) - vmovupd(ymm7, mem(rcx, 1*32)) - add(rdi, rcx) + label(.DROWSTORED) + lea(mem(rcx, rdi, 1), rax) // load address of c + 1*rs_c; + lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; + lea(mem(rdx, rdi, 1), rbx) // load address of c + 3*rs_c; - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) + vfmadd231pd(mem(rax, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rax, 1*32), ymm3, ymm7) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rdx, 1*32), ymm3, ymm9) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm10) + vfmadd231pd(mem(rbx, 1*32), ymm3, ymm11) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) - vmovupd(ymm9, mem(rcx, 1*32)) - add(rdi, rcx) + vmovupd(ymm4, mem(rcx, 0*32)) + vmovupd(ymm5, mem(rcx, 1*32)) + vmovupd(ymm6, mem(rax, 0*32)) + vmovupd(ymm7, mem(rax, 1*32)) + vmovupd(ymm8, mem(rdx, 0*32)) + vmovupd(ymm9, mem(rdx, 1*32)) + vmovupd(ymm10, mem(rbx, 0*32)) + vmovupd(ymm11, mem(rbx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) - vmovupd(ymm10, mem(rcx, 0*32)) - - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) - vmovupd(ymm11, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1810,19 +1785,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8 jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -1838,8 +1813,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1880,12 +1855,12 @@ void bli_dgemmsup_rv_haswell_asm_4x8 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1952,9 +1927,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1969,7 +1944,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2010,27 +1985,27 @@ void bli_dgemmsup_rv_haswell_asm_3x8 prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2045,13 +2020,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2064,19 +2039,19 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - + vfmadd231pd(ymm0, ymm2, ymm10) + vfmadd231pd(ymm1, ymm2, ymm11) + vfmadd231pd(ymm0, ymm3, ymm12) + vfmadd231pd(ymm1, ymm3, ymm13) + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - vfmadd231pd(ymm1, ymm2, ymm9) - + vfmadd231pd(ymm0, ymm2, ymm14) + vfmadd231pd(ymm1, ymm2, ymm15) + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2091,12 +2066,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2110,36 +2085,40 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - + vfmadd231pd(ymm0, ymm2, ymm10) + vfmadd231pd(ymm1, ymm2, ymm11) + vfmadd231pd(ymm0, ymm3, ymm12) + vfmadd231pd(ymm1, ymm3, ymm13) + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm8) - 
vfmadd231pd(ymm1, ymm2, ymm9) - - - + vfmadd231pd(ymm0, ymm2, ymm14) + vfmadd231pd(ymm1, ymm2, ymm15) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + vaddpd(ymm10, ymm4, ymm4) + vaddpd(ymm11, ymm5, ymm5) + vaddpd(ymm12, ymm6, ymm6) + vaddpd(ymm13, ymm7, ymm7) + vaddpd(ymm14, ymm8, ymm8) + vaddpd(ymm15, ymm9, ymm9) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2155,91 +2134,82 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) - - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) - vmovupd(ymm5, mem(rcx, 1*32)) - add(rdi, rcx) + label(.DROWSTORED) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) - vmovupd(ymm7, mem(rcx, 1*32)) - add(rdi, rcx) + lea(mem(rcx, rdi, 1), rbx) // load address of c + 1*rs_c; + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) + vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) + vfmadd231pd(mem(rbx, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rbx, 1*32), ymm3, ymm7) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm8) + vfmadd231pd(mem(rdx, 1*32), ymm3, ymm9) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) - vmovupd(ymm8, mem(rcx, 0*32)) + vmovupd(ymm4, mem(rcx, 0*32)) + vmovupd(ymm5, mem(rcx, 1*32)) + vmovupd(ymm6, mem(rbx, 0*32)) + vmovupd(ymm7, mem(rbx, 1*32)) + vmovupd(ymm8, mem(rdx, 0*32)) + vmovupd(ymm9, mem(rdx, 1*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) - vmovupd(ymm9, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -2280,7 +2250,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 @@ -2319,26 +2289,26 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2350,8 +2320,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -2416,12 +2386,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2488,9 +2458,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2505,7 +2475,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2545,27 +2515,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8 prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2581,29 +2551,29 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif - vmovupd(mem(rbx, 0*32), ymm0) - vmovupd(mem(rbx, 1*32), ymm1) + vmovupd(mem(rbx, 0*32), ymm8) + vmovupd(mem(rbx, 1*32), ymm9) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - + vfmadd231pd(ymm8, ymm2, ymm10) + vfmadd231pd(ymm9, ymm2, ymm11) + vfmadd231pd(ymm8, ymm3, ymm12) + vfmadd231pd(ymm9, ymm3, ymm13) + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2619,7 +2589,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2627,38 +2597,40 @@ void bli_dgemmsup_rv_haswell_asm_2x8 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif - vmovupd(mem(rbx, 0*32), ymm0) - vmovupd(mem(rbx, 1*32), ymm1) + vmovupd(mem(rbx, 0*32), ymm8) + vmovupd(mem(rbx, 1*32), ymm9) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm0, ymm3, ymm6) - vfmadd231pd(ymm1, ymm3, ymm7) - - - + vfmadd231pd(ymm8, ymm2, ymm10) + vfmadd231pd(ymm9, ymm2, ymm11) + vfmadd231pd(ymm8, ymm3, ymm12) + vfmadd231pd(ymm9, ymm3, ymm13) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + vaddpd(ymm10, ymm4, ymm4) + vaddpd(ymm11, ymm5, ymm5) + vaddpd(ymm12, ymm6, ymm6) + vaddpd(ymm13, ymm7, ymm7) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2675,44 +2647,44 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2721,28 +2693,24 @@ void bli_dgemmsup_rv_haswell_asm_2x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) - vmovupd(ymm4, mem(rcx, 0*32)) + lea(mem(rcx, rdi, 1), rdx) // load address of c + 1*rs_c; + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) - vmovupd(ymm5, mem(rcx, 1*32)) - add(rdi, rcx) + vfmadd231pd(mem(rdx, 0*32), ymm3, ymm6) + vfmadd231pd(mem(rdx, 1*32), ymm3, ymm7) + vmovupd(ymm4, mem(rcx, 0*32)) + vmovupd(ymm5, mem(rcx, 1*32)) + vmovupd(ymm6, mem(rdx, 0*32)) + vmovupd(ymm7, mem(rdx, 1*32)) - vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) - vmovupd(ymm6, mem(rcx, 0*32)) - vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) - vmovupd(ymm7, mem(rcx, 1*32)) - //add(rdi, rcx) - - jmp(.DDONE) // jump to end. @@ -2787,19 +2755,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8 jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2807,8 +2775,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2841,12 +2809,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2912,9 +2880,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2929,7 +2897,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2968,27 +2936,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2999,28 +2967,29 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; + vbroadcastsd(mem(rax ), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif - vmovupd(mem(rbx, 0*32), ymm0) - vmovupd(mem(rbx, 1*32), ymm1) + vmovupd(mem(rbx, 0*32), ymm8) + vmovupd(mem(rbx, 1*32), ymm9) add(r10, rbx) // b += rs_b; - - vbroadcastsd(mem(rax ), ymm2) - add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - + + vfmadd231pd(ymm8, ymm3, ymm6) + vfmadd231pd(ymm9, ymm3, ymm7) + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -3031,9 +3000,11 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; + vbroadcastsd(mem(rax ), ymm3) + add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, 
ymm5) - + // ---------------------------------- iteration 3 @@ -3041,35 +3012,34 @@ void bli_dgemmsup_rv_haswell_asm_1x8 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif - vmovupd(mem(rbx, 0*32), ymm0) - vmovupd(mem(rbx, 1*32), ymm1) + vmovupd(mem(rbx, 0*32), ymm8) + vmovupd(mem(rbx, 1*32), ymm9) add(r10, rbx) // b += rs_b; - vbroadcastsd(mem(rax ), ymm2) - add(r9, rax) // a += cs_a; - vfmadd231pd(ymm0, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm5) - - - + + vfmadd231pd(ymm8, ymm3, ymm6) + vfmadd231pd(ymm9, ymm3, ymm7) + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + vaddpd(ymm6, ymm4, ymm4) + vaddpd(ymm7, ymm5, ymm5) + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -3078,18 +3048,18 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) @@ -3098,27 +3068,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3126,20 +3096,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3160,7 +3130,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) - + lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 @@ -3183,26 +3153,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -3230,14 +3200,14 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/bli_kernels_haswell.h b/kernels/haswell/bli_kernels_haswell.h index 8c4e3c44ec..33e2ef125b 100644 --- a/kernels/haswell/bli_kernels_haswell.h +++ b/kernels/haswell/bli_kernels_haswell.h @@ -258,6 +258,11 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x1 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x1 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x1 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x1 ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) From f96e20b8940dad53c2cfc2097d4eda444ee1b04e Mon Sep 17 00:00:00 2001 From: jagar Date: Wed, 5 Jul 2023 14:33:48 +0530 Subject: [PATCH 138/226] Update in CMakeLists.txt to install on windows Updated CMakeLists.txt to copy library and headers into folder mentioned during cmake configuration. Steps to install 1. cmake .. -G ........ -DCMAKE_INSTALL_PREFIX=path_to_install 2. cmake --build . --config Release 3. cmake --install . (install lib and headers) Change-Id: Ic2728209a2e1d181cc92bab08b82a748bec583d4 --- CMakeLists.txt | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6143056a82..400214efdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -622,7 +622,7 @@ if (ENABLE_ASAN_TESTS) endif () # Set object libraries created in kernels directory to be added into BLIS library. 
-set(OBJECT_LIBRARIES +set(OBJECT_LIBRARIES $ $ $ @@ -634,7 +634,7 @@ set(OBJECT_LIBRARIES $ ) # Ammend the list of object libraries to include zen4 paths as appropriate. -if(${TARGET_ARCH} STREQUAL zen4 OR +if(${TARGET_ARCH} STREQUAL zen4 OR ${TARGET_ARCH} STREQUAL amdzen) set(OBJECT_LIBRARIES ${OBJECT_LIBRARIES} $ @@ -649,7 +649,7 @@ endif() if(BUILD_SHARED_LIBS) add_library("${PROJECT_NAME}" SHARED ${CMAKE_SOURCE_DIR}/bli_config.h ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h - ${headers} + ${headers} ${OBJECT_LIBRARIES} ) if(ENABLE_OPENMP) @@ -661,7 +661,7 @@ endif() if(NOT BUILD_SHARED_LIBS) add_library("${PROJECT_NAME}" STATIC ${CMAKE_SOURCE_DIR}/bli_config.h ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h - ${headers} + ${headers} ${OBJECT_LIBRARIES} ) set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${STATIC_LIB_OPTIONS}") @@ -681,6 +681,30 @@ endif() link_directories(${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) add_definitions(-DEXPMODULE) +if(NOT CMAKE_INSTALL_PREFIX) + if(WIN32) + set(CMAKE_INSTALL_PREFIX + "${PROJECT_BINARY_DIR}/libblis" + CACHE PATH "Install path prefix, prepended onto install directories") + else() + set(CMAKE_INSTALL_PREFIX + "/usr/local/blis" + CACHE PATH "Install path prefix, prepended onto install directories") + endif() +endif() + +# Public blis headers +set(BLIS_PUBLIC_HEADERS + ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/${BLIS_H} + ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/${CBLAS_H} +) + +set_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${BLIS_PUBLIC_HEADERS}") + +install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include) + add_subdirectory(config) add_subdirectory(ref_kernels) add_subdirectory(kernels) From 29711dd5a35ea3ab23e9b71dbf83f574cd3a5dcd Mon Sep 17 00:00:00 2001 From: jagar Date: Wed, 13 Sep 2023 16:18:39 
+0530 Subject: [PATCH 139/226] Gtestsuite: Updated testings_basics.* to print matrix/vector name AMD-Internal: [CPUPL-2732] Change-Id: I89b4ffc97ea852e66f42b82058af67c16144fbf6 --- .../inc/common/testing_basics.h | 4 ++- .../src/common/testing_basics.cpp | 27 ++++++++++--------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/gtestsuite/testinghelpers/inc/common/testing_basics.h b/gtestsuite/testinghelpers/inc/common/testing_basics.h index 46bcfeb4a8..67e51254df 100644 --- a/gtestsuite/testinghelpers/inc/common/testing_basics.h +++ b/gtestsuite/testinghelpers/inc/common/testing_basics.h @@ -327,7 +327,7 @@ void make_triangular( char storage, char uplo, gtint_t n, T* a, gtint_t ld ); /** * ========================================================================== * MKDIAG - * Make an m x n matrix A, which adds a scalar value to + * Make an m x n matrix A, which adds a scalar value to * every element along an arbitrary diagonal of a matrix. * It is assumed that the diagonal offset of A is zero * ========================================================================== @@ -351,6 +351,7 @@ void print_scalar( T x, const char *spec ); /** * print vector of length n + * @param[in] vec specifies the vector name * @param[in] n specifies the length of the given vector. * @param[in] a specifies pointer which points to the first element of a. * @param[in] incx specifies storage spacing between elements of a. @@ -361,6 +362,7 @@ void print_vector( const char *vec, gtint_t n, T *x, gtint_t incx, const char *s /** * print matrix of size m x n + * @param[in] mat specifies the matrix name * @param[in] storage specifies the storage format of matrix in memory. * @param[in] m specifies the number of rows of given matrix. * @param[in] n specifies the number of columns of given matrix. 
diff --git a/gtestsuite/testinghelpers/src/common/testing_basics.cpp b/gtestsuite/testinghelpers/src/common/testing_basics.cpp index c7f8d50dac..6f3c2b8f9c 100644 --- a/gtestsuite/testinghelpers/src/common/testing_basics.cpp +++ b/gtestsuite/testinghelpers/src/common/testing_basics.cpp @@ -493,7 +493,7 @@ template void make_triangular( char, char, gtint_t, dcomplex *, gtint_ /** * ========================================================================== * MKDIAG - * Make an m x n matrix A, which adds a scalar value to + * Make an m x n matrix A, which adds a scalar value to * every element along an arbitrary diagonal of a matrix. * It is assumed that the diagonal offset of A is zero * ========================================================================== @@ -551,17 +551,18 @@ template void print_scalar( dcomplex x, const char * ); /** * print vector of length n + * @param[in] vec specifies the vector name * @param[in] n specifies the length of the given vector. * @param[in] a specifies pointer which points to the first element of a. * @param[in] incx specifies storage spacing between elements of a. * @param[in] spec specifies the format specifer. */ template -void print_vector( gtint_t n, T *x, gtint_t incx, const char *spec ) +void print_vector( const char *vec, gtint_t n, T *x, gtint_t incx, const char *spec ) { gtint_t i, idx; T val; - + std::cout << "Vector " << vec << std::endl; for ( i = 0; i < n; i++ ) { idx = (incx > 0) ? 
(i * incx) : ( - ( n - i - 1 ) * incx ); @@ -571,13 +572,14 @@ void print_vector( gtint_t n, T *x, gtint_t incx, const char *spec ) } printf( "\n\n" ); } -template void print_vector( gtint_t, float *, gtint_t, const char * ); -template void print_vector( gtint_t, double *, gtint_t, const char * ); -template void print_vector( gtint_t, scomplex *, gtint_t, const char * ); -template void print_vector( gtint_t, dcomplex *, gtint_t, const char * ); +template void print_vector( const char *vec, gtint_t, float *, gtint_t, const char * ); +template void print_vector( const char *vec, gtint_t, double *, gtint_t, const char * ); +template void print_vector( const char *vec, gtint_t, scomplex *, gtint_t, const char * ); +template void print_vector( const char *vec, gtint_t, dcomplex *, gtint_t, const char * ); /** * print matrix of size m x n + * @param[in] mat specifies the matrix name * @param[in] storage specifies the storage format of matrix in memory. * @param[in] m specifies the number of rows of given matrix. * @param[in] n specifies the number of columns of given matrix. @@ -586,7 +588,7 @@ template void print_vector( gtint_t, dcomplex *, gtint_t, const char * * @param[in] spec specifies the format specifer. 
*/ template -void print_matrix( char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec ) +void print_matrix( const char *mat, char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec ) { gtint_t rs,cs; rs=cs=1; @@ -597,6 +599,7 @@ void print_matrix( char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const c rs = ld ; gtint_t i, j; + std::cout << "Matrix " << mat << std::endl; for ( i = 0; i < m; i++ ) { for ( j = 0; j < n; j++ ) @@ -609,10 +612,10 @@ void print_matrix( char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const c } printf( "\n" ); } -template void print_matrix( char, gtint_t, gtint_t, float *, gtint_t, const char * ); -template void print_matrix( char, gtint_t, gtint_t, double *, gtint_t, const char * ); -template void print_matrix( char, gtint_t, gtint_t, scomplex *, gtint_t, const char * ); -template void print_matrix( char, gtint_t, gtint_t, dcomplex *, gtint_t, const char * ); +template void print_matrix( const char *mat, char, gtint_t, gtint_t, float *, gtint_t, const char * ); +template void print_matrix( const char *mat, char, gtint_t, gtint_t, double *, gtint_t, const char * ); +template void print_matrix( const char *mat, char, gtint_t, gtint_t, scomplex *, gtint_t, const char * ); +template void print_matrix( const char *mat, char, gtint_t, gtint_t, dcomplex *, gtint_t, const char * ); /* From db4fbfe9a6f01e60cb674405a0e34d5534bad789 Mon Sep 17 00:00:00 2001 From: Kiran Varaganti Date: Sun, 20 Aug 2023 12:41:53 +0530 Subject: [PATCH 140/226] Fix compiler error for "inline" functions in LPGEMM bench Application Functions which are declared as "inline" may trigger compiler error "undefined function". This linker error is eliminated by using "static" before "inline". Therefore added "static" before all inline functions.
Change-Id: I5952fb71112fc4792011c3e29be930ccfbce4562 --- bench/bench_aocl_gemm/bench_lpgemm.c | 24 +++++++++++----------- bench/bench_aocl_gemm/bench_lpgemm_utils.c | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 75883cf7fe..9fd41dc42e 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -63,14 +63,14 @@ dim_t num_eltwise = 0; // To keep track of eltwise operations. #define GEN_FUNC_NAME(prototype,ctype) prototype ## ctype -inline void float_to_bf16( float* float_value, bfloat16* bf16_val ) +static inline void float_to_bf16( float* float_value, bfloat16* bf16_val ) { /*Set offset 2 to copy most significant 2 bytes of float to convert float values to bf16 values*/ memcpy( ( bf16_val ), (char *)( float_value ) + 2, sizeof ( bfloat16 ) ); } -inline float bf16_to_float +static inline float bf16_to_float ( bfloat16 bf16_val ) @@ -81,7 +81,7 @@ inline float bf16_to_float return float_value; } -inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size ) +static inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size ) { for (int i=0; i< size; i++) { @@ -353,7 +353,7 @@ int min (int a, int b) } #define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \ -inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \ +static inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \ (\ ACCUM_type temp_accum,\ aocl_post_op* post_op, \ @@ -370,7 +370,7 @@ GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,u8s8s32os8) GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,s8s8s32os8) GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,s8s8s16os8) -inline float mat_mul_accuracy_check_downscale_bf16bf16f32obf16 +static inline float mat_mul_accuracy_check_downscale_bf16bf16f32obf16 ( float temp_accum, aocl_post_op* post_op, @@ -381,7 +381,7 @@ inline 
float mat_mul_accuracy_check_downscale_bf16bf16f32obf16 } #define GEN_MAT_MUL_ACC_CHK_ACCUM(A_type, B_type, C_type,ACCUM_type,BLAS_SFX) \ -inline ACCUM_type mat_mul_accuracy_check_accum_ ## BLAS_SFX \ +static inline ACCUM_type mat_mul_accuracy_check_accum_ ## BLAS_SFX \ (\ A_type* a, \ B_type* b, \ @@ -421,7 +421,7 @@ GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) -inline float mat_mul_accuracy_check_accum_bf16bf16f32of32 +static inline float mat_mul_accuracy_check_accum_bf16bf16f32of32 ( bfloat16* a, bfloat16* b, @@ -451,7 +451,7 @@ inline float mat_mul_accuracy_check_accum_bf16bf16f32of32 return temp_accum; } -inline float mat_mul_accuracy_check_accum_bf16bf16f32obf16 +static inline float mat_mul_accuracy_check_accum_bf16bf16f32obf16 ( bfloat16* a, bfloat16* b, @@ -483,7 +483,7 @@ inline float mat_mul_accuracy_check_accum_bf16bf16f32obf16 } #define GEN_GELU_TANH_POSTOP_INT(ACCUM_type,BLAS_SFX) \ -inline ACCUM_type GELU_TANH_post_op_ ## BLAS_SFX \ +static inline ACCUM_type GELU_TANH_post_op_ ## BLAS_SFX \ (\ ACCUM_type temp_accum \ )\ @@ -505,7 +505,7 @@ GEN_GELU_TANH_POSTOP_INT(int16_t,s8s8s16os8) GEN_GELU_TANH_POSTOP_INT(int16_t,s8s8s16os16) #define GEN_GELU_TANH_POSTOP_FLOAT(BLAS_SFX) \ -inline float GELU_TANH_post_op_ ## BLAS_SFX \ +static inline float GELU_TANH_post_op_ ## BLAS_SFX \ (\ float temp_accum \ )\ @@ -521,7 +521,7 @@ GEN_GELU_TANH_POSTOP_FLOAT(bf16bf16f32of32) GEN_GELU_TANH_POSTOP_FLOAT(bf16bf16f32obf16) #define GEN_GELU_ERF_POSTOP_INT(ACCUM_type,BLAS_SFX) \ -inline ACCUM_type GELU_ERF_post_op_ ## BLAS_SFX \ +static inline ACCUM_type GELU_ERF_post_op_ ## BLAS_SFX \ (\ ACCUM_type temp_accum \ )\ @@ -541,7 +541,7 @@ GEN_GELU_ERF_POSTOP_INT(int16_t,s8s8s16os8) GEN_GELU_ERF_POSTOP_INT(int16_t,s8s8s16os16) #define GEN_GELU_ERF_POSTOP_FLOAT(BLAS_SFX) \ -inline float GELU_ERF_post_op_ ## BLAS_SFX \ 
+static inline float GELU_ERF_post_op_ ## BLAS_SFX \ (\ float temp_accum \ )\ diff --git a/bench/bench_aocl_gemm/bench_lpgemm_utils.c b/bench/bench_aocl_gemm/bench_lpgemm_utils.c index a95e214b4a..2f800ad63f 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm_utils.c +++ b/bench/bench_aocl_gemm/bench_lpgemm_utils.c @@ -157,7 +157,7 @@ void softmax_bench_driver_ ## SOFTMAX_SFX \ GEN_SOFTMAX_BENCH_DRV_FN(float,softmax_f32) -inline float gelu_tanh_f32 +static inline float gelu_tanh_f32 ( float temp_accum ) @@ -168,7 +168,7 @@ inline float gelu_tanh_f32 return temp_accum; }\ -inline float gelu_erf_f32 +static inline float gelu_erf_f32 ( float temp_accum ) From f0416cff085f17bbdf42907e7aa6761ddcb72b2b Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Tue, 25 Jul 2023 11:03:52 +0530 Subject: [PATCH 141/226] SGEMM SUP Panel Stride Bug Fix - The AVX512 SGEMM SUP rv m and n kernels did not account for the use of panel strides in case of packed matrices, thus resulting in incorrect matrix strides when packing was explicitly enabled using BLIS_PACK_A=1, BLIS_PACK_B=1 or both. - The kernels are updated to use panel strides for traversing both A and B matrix buffers accurately. [AMD-Internal]: CPUPL-3673 Change-Id: I4341ed7e1e1419cc3e2063b06f278edcb9145adb --- .../zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c | 209 +++++++----- .../zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c | 299 +++++++++--------- 2 files changed, 280 insertions(+), 228 deletions(-) diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c index 23f43052e8..2e55b698ca 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c @@ -53,9 +53,6 @@ - m0 and n0 are at most MR (6) and NR (64), respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A.
- - NOTE: These kernels currently do not have in-register transpose - implemented and hence they do not support column-oriented IO. */ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 ( @@ -95,7 +92,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 data, cntx ); cij += nr_cur * cs_c0; - bj += nr_cur * cs_b0; + bj += nr_cur * cs_b0; n_left -= nr_cur; } @@ -111,7 +108,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 data, cntx ); cij += nr_cur * cs_c0; - bj += nr_cur * cs_b0; + bj += nr_cur * cs_b0; n_left -= nr_cur; } @@ -127,7 +124,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 data, cntx ); cij += nr_cur * cs_c0; - bj += nr_cur * cs_b0; + bj += nr_cur * cs_b0; n_left -= nr_cur; } @@ -143,7 +140,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 data, cntx ); cij += nr_cur * cs_c0; - bj += nr_cur * cs_b0; + bj += nr_cur * cs_b0; n_left -= nr_cur; } @@ -195,21 +192,21 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 else { const dim_t mr = 6; - + // Since A is packed into row panels, // we must use a loop over gemv. dim_t m_iter = ( m0 + mr - 1 ) / mr; dim_t m_left = m0 % mr; - + float* restrict ai_ii = ai; float* restrict cij_ii = cij; - + for ( dim_t ii = 0; ii < m_iter; ii += 1 ) { dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? mr : m_left ); - - bli_sgemv_ex + + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -217,7 +214,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 ); cij_ii += mr_cur * rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } @@ -241,6 +238,10 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -474,18 +475,18 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 * 4x16 & 2x16 each. 
* These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles, * to get the transpose of 6x64 tile and are stored as 64x6 tile. - * - * |-----------------------------------| |------------------|--------| - * | | | | | | | | - * | | | | | | 16x4 | 16x2 | - * | 4x16 | 4x16 | 4x16 | 4x16 | | | | - * | | | | | |------------------|--------| - * | | | | | | | | - * |-----------------------------------| -> | 16x4 | 16x2 | - * | | | | | | | | - * | 2x16 | 2x16 | 2x16 | 2x16 | |------------------|--------| - * | | | | | | | | - * |-----------------------------------| | 16x4 | 16x2 | + * + * |-----------------------------------| |------------------|--------| + * | | | | | | | | + * | | | | | | 16x4 | 16x2 | + * | 4x16 | 4x16 | 4x16 | 4x16 | | | | + * | | | | | |------------------|--------| + * | | | | | | | | + * |-----------------------------------| -> | 16x4 | 16x2 | + * | | | | | | | | + * | 2x16 | 2x16 | 2x16 | 2x16 | |------------------|--------| + * | | | | | | | | + * |-----------------------------------| | 16x4 | 16x2 | * | | | * |------------------|--------| * | | | @@ -495,9 +496,9 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 */ /* Transposing 4x16 tiles to 16x4 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 - lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * rs_c + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) @@ -510,7 +511,11 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, 
r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + + TRANSPOSE_2X16( 24, 28 ) lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16( 25, 29 ) @@ -553,7 +558,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 mov( var( cbuf ), rcx ) // load address of c mov( var( cs_c ), rdi ) // load rs_c lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16_BZ( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) @@ -561,11 +566,14 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 lea( mem( rcx, r12, 4 ), rcx ) TRANSPOSE_4X16_BZ( 10, 14, 18, 22 ) lea( mem( rcx, r12, 4 ), rcx ) - TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) + TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + TRANSPOSE_2X16_BZ( 24, 28 ) lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16_BZ( 25, 29 ) @@ -579,13 +587,12 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 label( .SDONE ) - lea( mem( , r8, 2 ), rdx ) // rdx = rs_a * 2 - lea( mem( rdx, r8, 4 ), rdx ) // rdx = rs_a * 6 + mov( var( ps_a4 ), rdx ) // load panel stride of a; rdx = ps_a4 mov( var( abuf ), rax ) // load address of a - add( rdx, rax ) // a += rs_a * 6(MR) + add( rdx, rax ) // a += ps_a4 mov( rax, var( abuf ) ) // store updated a - mov( var( rs_c ), rdi ) + mov( var( rs_c ), rdi ) // load rs_c; rdi = rs_c lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 lea( mem( , rdi, 2 ), rdx ) // rdx = rs_c * 2 lea( mem( rdx, rdi, 4 ), rdx ) // rdx = rdi * 4 => rdx = rs_c * 6 @@ -604,6 +611,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -639,7 
+647,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge * rs_c; - float* restrict ai = a + i_edge * rs_a; + float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; if ( 4 <= m_left ) @@ -658,7 +666,7 @@ void bli_sgemmsup_rv_zen_asm_6x64m_avx512 ai += mr_cur * rs_a; m_left -= mr_cur; } - + if ( 2 <= m_left ) { const dim_t mr_cur = 2; @@ -724,6 +732,10 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -914,7 +926,7 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop - label(.SPOSTACCUM) + label(.SPOSTACCUM) // Scaling A * B with alpha. ALPHA_SCALE3( 7, 8, 9, 10 ) ALPHA_SCALE3( 7, 12, 13, 14 ) @@ -955,9 +967,9 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 * to get the transpose of 6x64 tile and are stored as 64x6 tile. 
*/ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 - lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * rs_c + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) @@ -967,7 +979,10 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 lea( mem( rcx, r12, 4 ), rcx ) mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + TRANSPOSE_2X16( 24, 28 ) lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16( 25, 29 ) @@ -1005,9 +1020,9 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 * to get the transpose of 6x64 tile and are stored as 64x6 tile. 
*/ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c /* Transposing 4x16 tiles to 16x4 tiles */ TRANSPOSE_4X16_BZ( 8, 12, 16, 20 ) @@ -1018,7 +1033,10 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + TRANSPOSE_2X16_BZ( 24, 28 ) lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16_BZ( 25, 29 ) @@ -1030,13 +1048,12 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 label( .SDONE ) - lea( mem( , r8, 2 ), rdx ) // rdx = rs_a * 2 - lea( mem( rdx, r8, 4 ), rdx ) // rdx = rs_a * 6 + mov( var( ps_a4 ), rdx ) // load panel stride of a mov( var( abuf ), rax ) // load address of a - add( rdx, rax ) // a += rs_a * 6(MR) + add( rdx, rax ) // a += ps_a4 mov( rax, var( abuf ) ) // store updated a - mov( var( rs_c ), rdi ) + mov( var( rs_c ), rdi ) // load rs_c; rdi = rs_c lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 lea( mem( , rdi, 2 ), rdx ) // rdx = rs_c * 2 lea( mem( rdx, rdi, 4 ), rdx ) // rdx = rdi * 4 => rdx = rs_c * 6 @@ -1055,6 +1072,7 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -1090,7 +1108,7 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + i_edge*rs_a; + float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; 
if ( 4 <= m_left ) @@ -1109,7 +1127,7 @@ void bli_sgemmsup_rv_zen_asm_6x48m_avx512 ai += mr_cur * rs_a; m_left -= mr_cur; } - + if ( 2 <= m_left ) { const dim_t mr_cur = 2; @@ -1175,6 +1193,10 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -1327,10 +1349,10 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 label( .CONSID_K_LEFT ) - mov( var( k_left ), rsi ) // i = k_left; - test( rsi, rsi ) // check i via logical AND. - je( .SPOSTACCUM ) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. + mov( var( k_left ), rsi ) // i = k_left; + test( rsi, rsi ) // check i via logical AND. + je( .SPOSTACCUM ) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. label( .K_LEFT_LOOP ) // Load 2 rows from B matrix. @@ -1397,9 +1419,9 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 * to get the transpose of 6x64 tile and are stored as 64x6 tile. 
*/ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 - lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * rs_c + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c /* Transposing 4x16 tiles to 16x4 tiles */ TRANSPOSE_4X16( 8, 12, 16, 20 ) @@ -1409,7 +1431,11 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + + TRANSPOSE_2X16( 24, 28 ) lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16( 25, 29 ) @@ -1445,9 +1471,9 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 */ /* Transposing 4x16 tiles to 16x4 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16_BZ( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) @@ -1455,7 +1481,10 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + TRANSPOSE_2X16_BZ( 24, 28 ) lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16_BZ( 25, 29 ) @@ -1465,13 +1494,12 @@ void 
bli_sgemmsup_rv_zen_asm_6x32m_avx512 label( .SDONE ) - lea( mem( , r8, 2 ), rdx ) // rdx = rs_a * 2 - lea( mem( rdx, r8, 4 ), rdx ) // rdx = rs_a * 6 + mov( var( ps_a4 ), rdx ) // load panel stride of a mov( var( abuf ), rax ) // load address of a - add( rdx, rax ) // a += rs_a * 6(MR) + add( rdx, rax ) // a += ps_a4 mov( rax, var( abuf ) ) // store updated a - mov( var( rs_c ), rdi ) + mov( var( rs_c ), rdi ) // load rs_c; rdi = rs_c lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 lea( mem( , rdi, 2 ), rdx ) // rdx = rs_c * 2 lea( mem( rdx, rdi, 4 ), rdx ) // rdx = rdi * 4 => rdx = rs_c * 6 @@ -1490,6 +1518,7 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -1525,7 +1554,7 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + i_edge*rs_a; + float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; if ( 4 <= m_left ) @@ -1544,7 +1573,7 @@ void bli_sgemmsup_rv_zen_asm_6x32m_avx512 ai += mr_cur * rs_a; m_left -= mr_cur; } - + if ( 2 <= m_left ) { const dim_t mr_cur = 2; @@ -1610,6 +1639,10 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of A and convert it to units of bytes. 
+ uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -1830,16 +1863,19 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 */ /* Transposing 4x16 tiles to 16x4 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 - lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * rs_c + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + TRANSPOSE_2X16( 24, 28 ) jmp( .SDONE ) // jump to the end @@ -1873,15 +1909,18 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 */ /* Transposing 4x16 tiles to 16x4 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16_BZ( 8, 12, 16, 20 ) /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - lea( mem( rcx, r10, 4 ), rcx ) + mov( var( rs_c ), r12 ) // load rs_c; r12 = rs_c + lea( mem( , r12, 4 ), r12 ) // r12 = rs_c*sizeof(dt) => r12 = rs_c*4 + lea( mem( rcx, r12, 4 ), rcx ) // rcx += 4 * r12 => rcx = 4 * rs_c + TRANSPOSE_2X16_BZ( 24, 28 ) jmp( .SDONE ) // jump to the end @@ -1889,13 
+1928,12 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 label( .SDONE ) - lea( mem( , r8, 2 ), rdx ) // rdx = rs_a * 2 - lea( mem( rdx, r8, 4 ), rdx ) // rdx = rs_a * 6 + mov( var( ps_a4 ), rdx ) // load panel stride of a mov( var( abuf ), rax ) // load address of a - add( rdx, rax ) // a += rs_a * 6(MR) + add( rdx, rax ) // a += ps_a4 mov( rax, var( abuf ) ) // store updated a - mov( var( rs_c ), rdi ) + mov( var( rs_c ), rdi ) // load rs_c; rdi = rs_c lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 lea( mem( , rdi, 2 ), rdx ) // rdx = rs_c * 2 lea( mem( rdx, rdi, 4 ), rdx ) // rdx = rdi * 4 => rdx = rs_c * 6 @@ -1914,6 +1952,7 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -1949,7 +1988,7 @@ void bli_sgemmsup_rv_zen_asm_6x16m_avx512 const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + i_edge*rs_a; + float* restrict ai = a + m_iter*ps_a; float* restrict bj = b; if ( 4 <= m_left ) @@ -2275,7 +2314,7 @@ void bli_sgemmsup_rv_zen_asm_4x64m_avx512 lea( mem( rcx, r12, 4 ), rcx ) TRANSPOSE_4X16_BZ( 10, 14, 18, 22 ) lea( mem( rcx, r12, 4 ), rcx ) - TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) + TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) jmp( .SDONE ) // jump to the end diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c index e4ce3d1490..08204eef20 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c +++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c @@ -41,21 +41,18 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... 
-------- + -------- ------ -------- + -------- ------ : + -------- ------ : Assumptions: - B is row-stored; - A is row-stored; - m0 and n0 are at most MR (6) and NR (64), respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. - - NOTE: These kernels currently do not have in-register transpose - implemented and hence they do not support column-oriented IO. */ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 ( @@ -139,6 +136,10 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of B and convert it to units of bytes. + uint64_t ps_b = bli_auxinfo_ps_b( data ); + uint64_t ps_b4 = ps_b * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -360,9 +361,9 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 */ /* Transposing 4x16 tiles to 16x4 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 - lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * rs_c + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) @@ -415,9 +416,9 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 */ /* Transposing 4x16 tiles to 16x4 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_4X16_BZ( 8, 12, 16, 20 ) lea( mem( rcx, r12, 4 ), rcx ) @@ -425,7 +426,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 lea( 
mem( rcx, r12, 4 ), rcx ) TRANSPOSE_4X16_BZ( 10, 14, 18, 22 ) lea( mem( rcx, r12, 4 ), rcx ) - TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) + TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c @@ -438,26 +439,24 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 lea( mem( rcx, rdi, 2 ), rcx ) TRANSPOSE_2X16_BZ( 27, 31 ) - jmp( .SDONE ) // jump to the end + jmp( .SDONE ) // jump to the end label( .SDONE ) - mov( var( cs_b ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_b * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx += cs_b * 8 => rdx = cs_b * 16 - mov( var( bbuf ), rbx ) - add( rdx, rbx ) + mov( var( ps_b4 ), rdx ) // load ps_b4; rdx = ps_b4 + mov( var( bbuf ), rbx ) // load b + add( rdx, rbx ) // b += ps_b4 mov( rbx, var( bbuf ) ) - mov( var( cs_c ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 => rdx = cs_c * 64 - mov( var( cbuf ), rcx ) // load address of c - add( rdx, rcx ) // c += rs_c * MR - mov( rcx, var( cbuf ) ) // store updated c + mov( var( cs_c ), rdx ) // load cs_c; rdx = cs_c + lea( mem( , rdx, 4 ), rdx ) // rdx = cs_c*sizeof(dt) => rdx = cs_c*4 + lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 + lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 + // => rdx = cs_c * 64 + mov( var( cbuf ), rcx ) // load address of c + add( rdx, rcx ) // c += rs_c * MR + mov( rcx, var( cbuf ) ) // store updated c dec( r11 ) jne( .N_LOOP_ITER ) @@ -473,6 +472,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), + [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), @@ -523,7 +523,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -539,7 +539,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 data,cntx ); cij += 
nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -555,7 +555,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -571,7 +571,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -637,7 +637,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? mr : m_left ); - bli_sgemv_ex + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -645,7 +645,7 @@ void bli_sgemmsup_rv_zen_asm_6x64n_avx512 ); cij_ii += mr_cur*rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } @@ -681,6 +681,10 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of B and convert it to units of bytes. + uint64_t ps_b = bli_auxinfo_ps_b( data ); + uint64_t ps_b4 = ps_b * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -741,7 +745,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 VFMA4( 4, 20, 21, 22, 23 ) vbroadcastss( mem( rax, r8, 4 ), zmm5 ) VFMA4( 5, 24, 25, 26, 27 ) - + add( r9, rbx ) add( r10, rax ) @@ -763,7 +767,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 VFMA4( 4, 20, 21, 22, 23 ) vbroadcastss( mem( rax, r8, 4 ), zmm5 ) VFMA4( 5, 24, 25, 26, 27 ) - + add( r9, rbx ) add( r10, rax ) @@ -785,7 +789,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 VFMA4( 4, 20, 21, 22, 23 ) vbroadcastss( mem( rax, r8, 4 ), zmm5 ) VFMA4( 5, 24, 25, 26, 27 ) - + add( r9, rbx ) add( r10, rax ) @@ -807,7 +811,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 VFMA4( 4, 20, 21, 22, 23 ) vbroadcastss( mem( rax, r8, 4 ), zmm5 ) VFMA4( 5, 24, 25, 26, 27 ) - + add( r9, rbx ) add( r10, rax ) @@ -842,7 +846,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 VFMA4( 4, 20, 21, 22, 23 ) vbroadcastss( mem( rax, r8, 4 ), zmm5 ) VFMA4( 5, 
24, 25, 26, 27 ) - + add( r9, rbx ) add( r10, rax ) dec( rsi ) @@ -958,7 +962,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 lea( mem( rcx, r12, 4 ), rcx ) TRANSPOSE_4X16_BZ( 10, 14, 18, 22 ) lea( mem( rcx, r12, 4 ), rcx ) - TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) + TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) /* Transposing 1x16 tiles to 16x1 tiles */ mov( var( cbuf ), rcx ) // load address of c @@ -979,12 +983,9 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 label( .SDONE ) - mov( var( cs_b ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_b * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx += cs_b * 8 => rdx = cs_b * 16 - mov( var( bbuf ), rbx ) - add( rdx, rbx ) + mov( var( ps_b4 ), rdx ) // load ps_b4 + mov( var( bbuf ), rbx ) // load b + add( rdx, rbx ) // b += ps_b4 mov( rbx, var( bbuf ) ) mov( var( cs_c ), rdx ) @@ -1009,6 +1010,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), + [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), @@ -1059,7 +1061,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1075,7 +1077,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1091,7 +1093,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1107,7 +1109,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1173,7 +1175,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? 
mr : m_left ); - bli_sgemv_ex + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -1181,7 +1183,7 @@ void bli_sgemmsup_rv_zen_asm_5x64n_avx512 ); cij_ii += mr_cur*rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } @@ -1217,6 +1219,10 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of B and convert it to units of bytes. + uint64_t ps_b = bli_auxinfo_ps_b( data ); + uint64_t ps_b4 = ps_b * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -1466,28 +1472,26 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 lea( mem( rcx, r12, 4 ), rcx ) TRANSPOSE_4X16_BZ( 10, 14, 18, 22 ) lea( mem( rcx, r12, 4 ), rcx ) - TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) + TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) - jmp( .SDONE ) // jump to the end + jmp( .SDONE ) // jump to the end label( .SDONE ) - mov( var( cs_b ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_b * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx += cs_b * 8 => rdx = cs_b * 16 - mov( var( bbuf ), rbx ) - add( rdx, rbx ) + mov( var( ps_b4 ), rdx ) // load ps_b4; rdx = ps_b4 + mov( var( bbuf ), rbx ) // load b + add( rdx, rbx ) // b += ps_b4 mov( rbx, var( bbuf ) ) - mov( var( cs_c ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 => rdx = cs_c * 64 - mov( var( cbuf ), rcx ) // load address of c - add( rdx, rcx ) // c += rs_c * MR - mov( rcx, var( cbuf ) ) // store updated c + mov( var( cs_c ), rdx ) // load cs_c; rdx = cs_c + lea( mem( , rdx, 4 ), rdx ) // rdx = cs_c*sizeof(dt) => rdx = cs_c*4 + lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 + lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 + // => rdx = cs_c * 64 + mov( var( cbuf ), rcx ) // load address of c + add( rdx, rcx ) // c += rs_c * MR + mov( rcx, var( cbuf ) ) // store updated c dec( r11 ) jne( .N_LOOP_ITER ) @@ 
-1503,6 +1507,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), + [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), @@ -1553,7 +1558,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1569,7 +1574,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1585,7 +1590,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1601,7 +1606,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -1667,7 +1672,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? mr : m_left ); - bli_sgemv_ex + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -1675,7 +1680,7 @@ void bli_sgemmsup_rv_zen_asm_4x64n_avx512 ); cij_ii += mr_cur*rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } @@ -1711,6 +1716,10 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of B and convert it to units of bytes. 
+ uint64_t ps_b = bli_auxinfo_ps_b( data ); + uint64_t ps_b4 = ps_b * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -1937,9 +1946,9 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c TRANSPOSE_2X16_BZ( 8, 12 ) lea( mem( rcx, rdi, 2 ), rcx ) @@ -1950,39 +1959,37 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 TRANSPOSE_2X16_BZ( 11, 15 ) /* Transposing 1x16 tiles to 16x1 tiles */ - mov( var( cbuf ), rcx ) - mov( var( rs_c ), rdi ) - lea( mem( , rdi, 4 ), rdi ) - lea( mem( rcx, rdi, 2 ), rcx ) - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cbuf ), rcx ) // load address of c + mov( var( rs_c ), rdi ) // load rs_c; rdi = rs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c*sizeof(dt) => rdi = rs_c*4 + lea( mem( rcx, rdi, 2 ), rcx ) // c += rdi * 2 + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c UPDATE_C_1X16_BZ( 16 ) UPDATE_C_1X16_BZ( 17 ) UPDATE_C_1X16_BZ( 18 ) UPDATE_C_1X16_BZ( 19 ) - jmp( .SDONE ) // jump to the end + jmp( .SDONE ) // jump to the end label( .SDONE ) - mov( var( cs_b ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_b * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx += cs_b * 8 => rdx = cs_b * 16 - mov( var( bbuf ), rbx ) - add( rdx, rbx ) + mov( var( ps_b4 ), rdx ) // load ps_b4 + mov( var( bbuf ), rbx ) // load b + add( rdx, rbx ) // b += ps_b4 mov( rbx, var( bbuf ) ) - mov( var( cs_c 
), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 => rdx = cs_c * 64 - mov( var( cbuf ), rcx ) // load address of c - add( rdx, rcx ) // c += rs_c * MR - mov( rcx, var( cbuf ) ) // store updated c + mov( var( cs_c ), rdx ) // load cs_c; rdx = cs_c + lea( mem( , rdx, 4 ), rdx ) // rdx = cs_c*sizeof(dt) => rdx = cs_c*4 + lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 + lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 + // => rdx = cs_c * 64 + mov( var( cbuf ), rcx ) // load address of c + add( rdx, rcx ) // c += rs_c * MR + mov( rcx, var( cbuf ) ) // store updated c dec( r11 ) jne( .N_LOOP_ITER ) @@ -1998,6 +2005,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), + [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), @@ -2048,7 +2056,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2064,7 +2072,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2080,7 +2088,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2096,7 +2104,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2162,7 +2170,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? 
mr : m_left ); - bli_sgemv_ex + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -2170,7 +2178,7 @@ void bli_sgemmsup_rv_zen_asm_3x64n_avx512 ); cij_ii += mr_cur*rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } @@ -2206,6 +2214,10 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of B and convert it to units of bytes. + uint64_t ps_b = bli_auxinfo_ps_b( data ); + uint64_t ps_b4 = ps_b * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -2422,12 +2434,9 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 label( .SDONE ) - mov( var( cs_b ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_b * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx += cs_b * 8 => rdx = cs_b * 16 - mov( var( bbuf ), rbx ) - add( rdx, rbx ) + mov( var( ps_b4 ), rdx ) // load ps_b4 + mov( var( bbuf ), rbx ) // load b + add( rdx, rbx ) // b += ps_b4 mov( rbx, var( bbuf ) ) mov( var( cs_c ), rdx ) @@ -2452,6 +2461,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), + [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), @@ -2502,7 +2512,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2518,7 +2528,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2534,7 +2544,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2550,7 +2560,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2616,7 +2626,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? 
mr : m_left ); - bli_sgemv_ex + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -2624,7 +2634,7 @@ void bli_sgemmsup_rv_zen_asm_2x64n_avx512 ); cij_ii += mr_cur*rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } @@ -2660,6 +2670,10 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // Query the panel stride of B and convert it to units of bytes. + uint64_t ps_b = bli_auxinfo_ps_b( data ); + uint64_t ps_b4 = ps_b * sizeof( float ); + float *abuf = a; float *bbuf = b; float *cbuf = c; @@ -2806,7 +2820,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 label( .SROWSTORED ) UPDATE_C4( 4, 8, 9, 10, 11 ) - + jmp( .SDONE ) // jump to the end @@ -2814,9 +2828,9 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 /* Transposing 1x16 tiles to 16x1 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rdi = rs_c *= sizeof(dt) => rs_c *= 4 - lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * rs_c + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c UPDATE_C_1X16( 8 ) UPDATE_C_1X16( 9 ) @@ -2843,9 +2857,9 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 /* Transposing 2x16 tiles to 16x2 tiles */ mov( var( cbuf ), rcx ) // load address of c - mov( var( cs_c ), rdi ) // load rs_c - lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) - lea( mem( rdi, rdi, 2 ), r12 ) + mov( var( cs_c ), rdi ) // load cs_c; rdi = cs_c + lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c*sizeof(dt) => rdi = cs_c*4 + lea( mem( rdi, rdi, 2 ), r12 ) // rdi += rdi * 2 => rdi = 3 * cs_c UPDATE_C_1X16_BZ( 8 ) UPDATE_C_1X16_BZ( 9 ) @@ -2857,21 +2871,19 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 label( .SDONE ) - mov( var( cs_b ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_b * 8 - lea( 
mem( , rdx, 8 ), rdx ) // rdx += cs_b * 8 => rdx = cs_b * 16 - mov( var( bbuf ), rbx ) - add( rdx, rbx ) + mov( var( ps_b4 ), rdx ) // load ps_b4 + mov( var( bbuf ), rbx ) // load b + add( rdx, rbx ) // b += ps_b4 mov( rbx, var( bbuf ) ) - mov( var( cs_c ), rdx ) - lea( mem( , rdx, 4 ), rdx ) - lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 - lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 => rdx = cs_c * 64 - mov( var( cbuf ), rcx ) // load address of c - add( rdx, rcx ) // c += rs_c * MR - mov( rcx, var( cbuf ) ) // store updated c + mov( var( cs_c ), rdx ) // load cs_c; rdx = cs_c + lea( mem( , rdx, 4 ), rdx ) // rdx = cs_c*sizeof(dt) => rdx = cs_c*4 + lea( mem( , rdx, 8 ), rdx ) // rdx = cs_c * 8 + lea( mem( , rdx, 8 ), rdx ) // rdx = rdx * 8 = cs_c * 8 * 8 + // => rdx = cs_c * 64 + mov( var( cbuf ), rcx ) // load address of c + add( rdx, rcx ) // c += rs_c * MR + mov( rcx, var( cbuf ) ) // store updated c dec( r11 ) jne( .N_LOOP_ITER ) @@ -2887,6 +2899,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), + [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), @@ -2937,7 +2950,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2953,7 +2966,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2969,7 +2982,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -2985,7 +2998,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 data,cntx ); cij += nr_cur*cs_c0; - bj += nr_cur*cs_b0; + bj += nr_cur*cs_b0; n_left -= nr_cur; } @@ -3051,7 +3064,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? 
mr : m_left ); - bli_sgemv_ex + bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, @@ -3059,7 +3072,7 @@ void bli_sgemmsup_rv_zen_asm_1x64n_avx512 ); cij_ii += mr_cur*rs_c0; ai_ii += ps_a0; - } + } } n_left -= nr_cur; } From df80f40ccd0c395565276e8b470f722d08a21e09 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Mon, 2 Oct 2023 12:39:14 +0530 Subject: [PATCH 142/226] Fixed incorrect ymm register usage in FMA operation. - Incorrect ymm registers were used in the dgemm SUP edge kernel while computing the FMA operation. - Using the wrong vector register produced incorrect results. - Corrected the vector register usage for the FMA operation. AMD-Internal: [CPUPL-3964] Change-Id: I37fcb5f8eeb5945fe994d8a5b69815a3bcca87df --- .../haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c index 809986ab48..31ee7ee1ab 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c @@ -777,7 +777,7 @@ void bli_dgemmsup_rv_haswell_asm_4x3 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm1, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm12) vbroadcastsd(mem(rax, r13, 1), ymm13) @@ -798,7 +798,7 @@ void bli_dgemmsup_rv_haswell_asm_4x3 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm5) - vfmadd231pd(ymm1, ymm2, ymm7) + vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm12) vbroadcastsd(mem(rax, r13, 1), ymm13) @@ -882,7 +882,7 @@ void bli_dgemmsup_rv_haswell_asm_4x3 vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm1, ymm2, ymm4) - vfmadd231pd(ymm1, ymm2, ymm6) + vfmadd231pd(ymm1, ymm3, ymm6)
vbroadcastsd(mem(rax, r8, 2), ymm12) vbroadcastsd(mem(rax, r13, 1), ymm13) From 24e4d58f9251d87494effe117d61bc57215d93cf Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 13 Sep 2023 04:39:29 -0400 Subject: [PATCH 143/226] Tidy zen bli_cntx_init and bli_family files Tidy formatting of config/*zen*/bli_cntx_init_zen*.c and config/*zen*/bli_family_*.c files to make them more consistent with each other and improve readability. AMD-Internal: [CPUPL-3519] Change-Id: I32c2bf6dc8365264a748a401cf3c83be4976f73b --- config/amdzen/bli_family_amdzen.h | 10 +- config/zen/bli_cntx_init_zen.c | 538 +++++++++++----------- config/zen/bli_family_zen.h | 8 +- config/zen2/bli_cntx_init_zen2.c | 164 +++---- config/zen2/bli_family_zen2.h | 8 +- config/zen3/bli_cntx_init_zen3.c | 541 +++++++++++----------- config/zen3/bli_family_zen3.h | 10 +- config/zen4/bli_cntx_init_zen4.c | 719 +++++++++++++++--------------- config/zen4/bli_family_zen4.h | 9 +- 9 files changed, 1019 insertions(+), 988 deletions(-) diff --git a/config/amdzen/bli_family_amdzen.h b/config/amdzen/bli_family_amdzen.h index eda853f356..aeacf75647 100644 --- a/config/amdzen/bli_family_amdzen.h +++ b/config/amdzen/bli_family_amdzen.h @@ -37,27 +37,23 @@ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops -// to be not paralleized. -// +// to be not parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 - #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM - // This will select the threshold below which small matrix code will be called. 
#define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 // When running HPL with pure MPI without DGEMM threading (Single-threaded // BLIS), defining this macro as 1 yields better performance. #define AOCL_BLIS_MULTIINSTANCE 0 #endif - diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 83ce2cf8b6..7f44b499fc 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -35,283 +35,289 @@ #include "blis.h" -//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) - void bli_cntx_init_zen( cntx_t* cntx ) { - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; - - // Set default kernel blocksizes and functions. - bli_cntx_init_zen_ref( cntx ); - - // ------------------------------------------------------------------------- - - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs - ( - 8, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, - // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, - // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); - - // Update the context with architecture specific threshold functions - bli_cntx_set_l3_thresh_funcs - ( - 2, - // GEMMT - BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, - // SYRK - BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, - cntx - ); - - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 12, - // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, - // dotxaxpyf - BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_zen_int_8, - BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_zen_int_8, - // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, - BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, - //axpy2v - BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, - BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_zen_int, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 29, - - // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - - // axpbyv - BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, - BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, - - // axpyv - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, - BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, - - // dotv - BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, - BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, - BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, - BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, - - // dotxv - BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int, - // scalv - - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, - BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int, - - // swapv - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - - // copyv - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, - - //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - - // scal2v - BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, - cntx - ); - - // Initialize level-3 blocksize objects with architecture-specific values. 
- // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_zen_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 8, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx + ); + + // Update the context with architecture specific threshold functions + bli_cntx_set_l3_thresh_funcs + ( + 2, + // GEMMT + BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, + // SYRK + BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, + cntx + ); + + // Update the context with optimized level-1f kernels. 
+ bli_cntx_set_l1f_kers + ( + 12, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, + // dotxaxpyf + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_zen_int_8, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_zen_int_8, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, + // axpy2v + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_zen_int, + cntx + ); + + // Update the context with optimized level-1v kernels. + bli_cntx_set_l1v_kers + ( + 29, + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + + // axpbyv + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, + + // axpyv + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, + + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, + + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int, + + // scalv + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + BLIS_SCALV_KER, 
BLIS_DCOMPLEX, bli_zscalv_zen_int, + + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + // copyv + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, + + // setv + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + + // scal2v + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); /* - Multi Instance performance improvement of DGEMM when binded to a CCX - In Multi instance each thread runs a sequential DGEMM. + Multi Instance performance improvement of DGEMM when binded to a CCX + In Multi instance each thread runs a sequential DGEMM. - a) If BLIS is run in a multi-instance mode with - CPU freq 2.6/2.2 Ghz - DDR4 clock frequency 2400Mhz - mc = 240, kc = 512, and nc = 2040 - has better performance on EPYC server, over the default block sizes. + a) If BLIS is run in a multi-instance mode with + CPU freq 2.6/2.2 Ghz + DDR4 clock frequency 2400Mhz + mc = 240, kc = 512, and nc = 2040 + has better performance on EPYC server, over the default block sizes. - b) If BLIS is run in Single Instance mode - mc = 510, kc = 1024 and nc = 4080 + b) If BLIS is run in Single Instance mode + mc = 510, kc = 1024 and nc = 4080 */ + // Initialize level-3 blocksize objects with architecture-specific values. 
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES - #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES + #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES + + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + #else - #else - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); - #endif + #endif #else - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); #endif - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. 
- bli_cntx_set_blkszs - ( - BLIS_NAT, 7, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - - // Update the context with the current architecture's register and cache - // blocksizes for level-3 TRSM execution. - bli_cntx_set_trsm_blkszs - ( - 5, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); - - // ------------------------------------------------------------------------- - - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 128 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 30, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); - - 
// Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -} + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 7, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + cntx + ); + + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM problems. + bli_cntx_set_trsm_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 128 ); + + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); + + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + 1, + BLIS_GEMM, bli_gemmsup_ref, + cntx + ); + + // Update the context with optimized small/unpacked gemm kernels. + bli_cntx_set_l3_sup_kers + ( + 30, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, 
BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + cntx + ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. + bli_cntx_set_l3_sup_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); +} diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index 8b31c32ca0..b833a11d1b 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,7 +38,7 @@ // By default, it is effective to parallelize the outer loops. 
// Setting these macros to 1 will force JR and IR inner loops -// to be not paralleized. +// to be not parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 @@ -50,7 +50,7 @@ #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 #endif diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 42eae35d95..0538c7defe 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -1,9 +1,11 @@ /* + BLIS An object-based framework for developing high-performance BLAS-like libraries. + Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -16,6 +18,7 @@ - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -27,6 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ #include "blis.h" @@ -46,7 +50,6 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_cntx_set_l3_nat_ukrs ( 8, - // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, @@ -56,7 +59,6 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, - // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, @@ -67,12 +69,12 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_cntx_set_l3_thresh_funcs ( 2, - //gemmt + // GEMMT BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, - //SYRK - BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, + // SYRK + BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, cntx - ); + ); // Update the context with optimized packm kernels. bli_cntx_set_packm_kers @@ -94,20 +96,20 @@ void bli_cntx_init_zen2( cntx_t* cntx ) ( 12, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, // dotxaxpyf BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_zen_int_8, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_zen_int_8, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, // axpy2v - BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, BLIS_AXPY2V_KER, BLIS_DCOMPLEX, 
bli_zaxpy2v_zen_int, cntx ); @@ -116,55 +118,54 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 29, - // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - // axpbyv - BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, + // axpbyv + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, // axpyv - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, - BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, // dotv - BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, - BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, - BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, - BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, // dotxv - BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int, // scalv - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + BLIS_SCALV_KER, BLIS_FLOAT, 
bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int, - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - //copy - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + // copyv + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, - //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + // setv + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, // scal2v - BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, cntx ); @@ -175,11 +176,11 @@ void bli_cntx_init_zen2( cntx_t* cntx ) #if AOCL_BLIS_MULTIINSTANCE bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 18 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 566 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 256 ); #else - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 566 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 ); #endif bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); @@ -204,31 +205,35 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // ------------------------------------------------------------------------- - //Initialize TRSM blocksize objects with architecture-specific values. 
- //Using different cache block sizes for TRSM instead of common level-3 block sizes. - //Tuning is done for double-precision only. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); - - // Update the context with the current architecture's register and cache - // blocksizes for level-3 TRSM problems. - bli_cntx_set_trsm_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); - - // Initialize sup thresholds with architecture-appropriate values. s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); + // Initialize TRSM blocksize objects with architecture-specific values. + // Using different cache block sizes for TRSM instead of common level-3 block sizes. + // Tuning is done for double-precision only. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM problems. + bli_cntx_set_trsm_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh @@ -244,16 +249,15 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_cntx_set_l3_sup_handlers ( 2, - BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, + BLIS_GEMM, bli_gemmsup_ref, + BLIS_GEMMT, bli_gemmtsup_ref, cntx ); // Update the context with optimized small/unpacked gemm kernels. bli_cntx_set_l3_sup_kers ( - 30, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, + 30, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, @@ -262,6 +266,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, @@ -270,6 +275,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, @@ -291,18 +297,19 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // Initialize level-3 sup blocksize objects with architecture-specific // values. 
// s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. bli_cntx_set_l3_sup_blkszs ( 5, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], @@ -311,4 +318,3 @@ void bli_cntx_init_zen2( cntx_t* cntx ) cntx ); } - diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h index 16fe50609e..ecff86be2e 100644 --- a/config/zen2/bli_family_zen2.h +++ b/config/zen2/bli_family_zen2.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,7 +38,7 @@ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops -// to be not paralleized. +// to be not parallelized. 
#define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 @@ -50,8 +50,8 @@ #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 // When running HPL with pure MPI without DGEMM threading (Single-threaded // BLIS), defining this macro as 1 yields better performance. diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 31a9ff5957..6059ba3bc7 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -37,272 +37,281 @@ void bli_cntx_init_zen3( cntx_t* cntx ) { - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; - // Set default kernel blocksizes and functions. - bli_cntx_init_zen3_ref( cntx ); - - // ------------------------------------------------------------------------- - - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs - ( - 8, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, - // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, - // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); - - // Update the context with architecture specific threshold functions - bli_cntx_set_l3_thresh_funcs - ( - 2, - // GEMMT - BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, - // SYRK - BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, - cntx - ); - - // packm kernels - bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); - - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( - 12, - // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, - // dotxaxpyf - BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_zen_int_8, - BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_zen_int_8, - // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, - BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, - // axpy2v - BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, - BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_zen_int, - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 29, - - // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - - // axpbyv - BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, - BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, - - // axpyv - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, - BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, - - // dotv - BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, - BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, - BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, - BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, - - // dotxv - BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int, - - // scalv - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, - BLIS_SCALV_KER, 
BLIS_DCOMPLEX, bli_zscalv_zen_int, - - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - - //copy - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, - - //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - - // scal2v - BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, - cntx - ); - - // Initialize level-3 blocksize objects with architecture-specific values. + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_zen3_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 8, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx + ); + + // Update the context with architecture specific threshold functions + bli_cntx_set_l3_thresh_funcs + ( + 2, + // GEMMT + BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, + // SYRK + BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, + cntx + ); + + // Update the context with optimized packm kernels. 
+ bli_cntx_set_packm_kers + ( + 8, + BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, + cntx + ); + + // Update the context with optimized level-1f kernels. + bli_cntx_set_l1f_kers + ( + 12, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, + // dotxaxpyf + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_zen_int_8, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_zen_int_8, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, + // axpy2v + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_zen_int, + cntx + ); + + // Update the context with optimized level-1v kernels. 
+ bli_cntx_set_l1v_kers + ( + 29, + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + + // axpbyv + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, + + // axpyv + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, + + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, + + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int, + + // scalv + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int, + + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + // copyv + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, + + // setv + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + + // scal2v + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. // // These are reference block sizes and may be overridden based on // number of threads used at runtime. 
- // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 566 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 ); - - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. - bli_cntx_set_blkszs - ( - BLIS_NAT, 7, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); -// ------------------------------------------------------------------------- - - //Initialize TRSM blocksize objects with architecture-specific values. - //Using different cache block sizes for TRSM instead of common level-3 block sizes. - //Tuning is done for double-precision only. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); - - // Update the context with the current architecture's register and cache - // blocksizes for level-3 TRSM problems. - bli_cntx_set_trsm_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); - - // Initialize sup thresholds with architecture-appropriate values. 
s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 2, - BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 30, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - 
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 566 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 ); + + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + BLIS_NAT, 7, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + cntx + ); + + // ------------------------------------------------------------------------- + + // Initialize TRSM blocksize objects with architecture-specific values. + // Using different cache block sizes for TRSM instead of common level-3 block sizes. + // Tuning is done for double-precision only. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM problems. + bli_cntx_set_trsm_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); + + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); + + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + 2, + BLIS_GEMM, bli_gemmsup_ref, + BLIS_GEMMT, bli_gemmtsup_ref, + cntx + ); + + // Update the context with optimized small/unpacked gemm kernels. 
+ bli_cntx_set_l3_sup_kers + ( + 30, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + cntx + ); + + // Initialize level-3 sup blocksize objects with 
architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. + bli_cntx_set_l3_sup_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); } diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h index ce84104c52..35ffc9f19d 100644 --- a/config/zen3/bli_family_zen3.h +++ b/config/zen3/bli_family_zen3.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,9 +38,7 @@ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops -// to be not paralleized. -// - +// to be not parallelized. 
#define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 @@ -52,7 +50,7 @@ #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 #endif diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index 8ef336d43b..5ac6f7b26b 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -40,362 +40,379 @@ */ #define BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs) \ - /* s d c z */ \ - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, 3, 12 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 6, 8, 4 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 128, 144, 60 ); \ - bli_blksz_init ( &blkszs[ BLIS_KC ], 480, 512, 256, 512, \ - 480, 320, 256, 160 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 6144, 4002, 4080, 2004 ); \ - \ - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \ + /* s d c z */ \ + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, 3, 12 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 6, 8, 4 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 128, 144, 60 ); \ + bli_blksz_init ( &blkszs[ BLIS_KC ], 480, 512, 256, 512, \ + 480, 320, 256, 160 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 6144, 4002, 4080, 2004 ); \ + \ + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \ #define BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs) \ - /* s d c z */ \ - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, 3, 12 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 6, 8, 4 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 64, 144, 60 ); \ - bli_blksz_init ( &blkszs[ BLIS_KC ], 480, 512, 256, 512, \ - 480, 320, 256, 160 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 6144, 
3600, 4080, 2004 ); \ - \ - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); \ - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \ + /* s d c z */ \ + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 32, 3, 12 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 6, 8, 4 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 64, 144, 60 ); \ + bli_blksz_init ( &blkszs[ BLIS_KC ], 480, 512, 256, 512, \ + 480, 320, 256, 160 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 6144, 3600, 4080, 2004 ); \ + \ + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \ void bli_cntx_init_zen4( cntx_t* cntx ) { - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; - // Set default kernel blocksizes and functions. - bli_cntx_init_zen4_ref( cntx ); - - // ------------------------------------------------------------------------- - - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs - ( - 10, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_32x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - /*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/ - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE, - - // Different GEMM kernels are used for TRSM for zen4 architecture - BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE, - - // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen4_asm_8x24, TRUE, - // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen4_asm_8x24, TRUE, - - cntx - ); - - // Update the context with architecture specific threshold functions - bli_cntx_set_l3_thresh_funcs - ( - 3, - // GEMM - BLIS_GEMM, bli_cntx_gemmsup_thresh_is_met_zen4, - // GEMMT - BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, - // SYRK - BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, - cntx - ); - - // packm kernels - bli_cntx_set_packm_kers - ( - 11, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_zen4_asm_8xk, - BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_zen4_asm_24xk, - BLIS_PACKM_32XK_KER, BLIS_DOUBLE, bli_dpackm_zen4_asm_32xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_12XK_KER, BLIS_DCOMPLEX, bli_zpackm_zen4_asm_12xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_zen4_asm_4xk, - cntx - ); - - // 
Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 9, - // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, - // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, - BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, - // axpy2v - BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 28, - - // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - - // axpbyv - BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, - BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, - BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, - - // axpyv - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_avx512, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_avx512, - BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, - BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, - - // dotv - BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_avx512, - BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_avx512, - BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, - BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, - - // dotxv - BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, - - // scalv - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_avx512, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_avx512, - BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int, - - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - - //copy - 
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, - - //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - - // scal2v - BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, - cntx - ); - - // Initialize level-3 blocksize objects with architecture-specific values. - // - // These are reference block sizes and may be overridden based on - // number of threads used at runtime. - - if ( bli_init_model_query_id() == BLIS_MODEL_BERGAMO ) - { - BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs); - } - else // BLIS_MODEL_DEFAULT choice, also currently used for BLIS_MODEL_GENOA and BLIS_MODEL_GENOA_X - { - BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs); - } - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. - bli_cntx_set_blkszs - ( - BLIS_NAT, 7, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native TRSMK execution. 
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 12 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 60 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 512 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 2004 ); - - bli_cntx_set_trsm_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); - // ------------------------------------------------------------------------- - - // Initialize sup thresholds with architecture-appropriate values. s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 682, 1000, 380, 110 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 1000, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 2, - BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 30, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, 
- cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 24, 3, 12, - 6, 9, 3, 12 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 144, 72, 48 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 480, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8064, 4080, 2040, 1020 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); - - // Initialize level-3 sup blocksize objects for operations dealing with - //triangular objects with architecture-specific values. - // - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. 
- bli_cntx_set_l3_sup_tri_blkszs - ( - 5, - // level-3 - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); - - bli_cntx_set_l3_sup_tri_kers - ( - 30, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRC, BLIS_DCOMPLEX, 
bli_zgemmsup_rd_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_zen4_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 10, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_skx_asm_32x12_l2, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_32x6, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + /*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/ + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE, + + // Different GEMM kernels are used for TRSM for zen4 architecture + BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen4_asm_8x24, TRUE, + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen4_asm_8x24, TRUE, + cntx + ); + + // Update the context with architecture specific threshold functions + bli_cntx_set_l3_thresh_funcs + ( + 3, + // GEMM + BLIS_GEMM, bli_cntx_gemmsup_thresh_is_met_zen4, + // GEMMT + BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, + // SYRK + BLIS_SYRK, bli_cntx_syrksup_thresh_is_met_zen, + cntx + ); + + // Update the context with optimized packm kernels. 
+ bli_cntx_set_packm_kers + ( + 11, + BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_zen4_asm_8xk, + BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_zen4_asm_24xk, + BLIS_PACKM_32XK_KER, BLIS_DOUBLE, bli_dpackm_zen4_asm_32xk, + BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_12XK_KER, BLIS_DCOMPLEX, bli_zpackm_zen4_asm_12xk, + BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_zen4_asm_4xk, + cntx + ); + + // Update the context with optimized level-1f kernels. + bli_cntx_set_l1f_kers + ( + 9, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_zen_int_5, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_zen_int_6, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_zen_int_6, + // axpy2v + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_zen_int, + cntx + ); + + // Update the context with optimized level-1v kernels. 
+ bli_cntx_set_l1v_kers + ( + 28, + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + + // axpbyv + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int, + + // axpyv + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_avx512, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_avx512, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, + + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_avx512, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_avx512, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, + + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int, + + // scalv + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_avx512, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_avx512, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int, + + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + // copyv + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, + + // setv + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + + // scal2v + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // + // These are reference block sizes and may be overridden based on + // number of threads used at runtime. 
+ + if ( bli_init_model_query_id() == BLIS_MODEL_BERGAMO ) + { + BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs); + } + else // BLIS_MODEL_DEFAULT choice, also currently used for BLIS_MODEL_GENOA and BLIS_MODEL_GENOA_X + { + BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs); + } + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 7, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + cntx + ); + + // ------------------------------------------------------------------------- + + // Initialize TRSM blocksize objects with architecture-specific values. + // Using different cache block sizes for TRSM instead of common level-3 block sizes. + // Tuning is done for double-precision only. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 12 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 60 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 512 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 2004 ); + + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM problems. + bli_cntx_set_trsm_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 682, 1000, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 1000, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); + + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); + + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + 2, + BLIS_GEMM, bli_gemmsup_ref, + BLIS_GEMMT, bli_gemmtsup_ref, + cntx + ); + + // Update the context with optimized small/unpacked gemm kernels. + bli_cntx_set_l3_sup_kers + ( + 30, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, + + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + 
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + cntx + ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 24, 3, 12, + 6, 9, 3, 12 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 144, 72, 48 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 480, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8064, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. + bli_cntx_set_l3_sup_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + // Initialize level-3 sup blocksize objects for operations dealing with + // triangular objects with architecture-specific values. 
+ // + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_l3_sup_tri_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + bli_cntx_set_l3_sup_tri_kers + ( + 30, + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + 
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + cntx + ); } diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index 0cd41b2b93..25b0ddd509 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -37,16 +37,15 @@ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops -// to be not paralleized. -// -#define BLIS_THREAD_MAX_IR 1 -#define BLIS_THREAD_MAX_JR 1 +// to be not parallelized. +#define BLIS_THREAD_MAX_IR 1 +#define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. -#define BLIS_SMALL_MATRIX_THRES 700 +#define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 From 30bdeecbcc7a7c62fa77786b8274cd2fe7f3c301 Mon Sep 17 00:00:00 2001 From: eashdash Date: Sun, 9 Jul 2023 23:33:02 +0530 Subject: [PATCH 144/226] Added BLAS Extension APIs - Get Size and Pack API 1. 4 new APIs are added to support packed compute GEMM operations 1. dgemm_pack_get_size 2. sgemm_pack_get_size 3. dgemm_pack 4. sgemm_pack 2. Pack_get_size API 1. Returns size in bytes required for packing of input 2. Requires identifier to identify the input matrix to be packed 3. 
Additionally requires 3 integer parameters for input dimensions 3. Packed buffer is allocated using the pack size computed 4. Pack API: 1. Performs full matrix packing of the input 2. Additionally, performs the alpha scaling 3. Packed buffer created contains the full packed matrix 5. The GEMM compute calls are required to be operated on the packed buffer with alpha = 1 since alpha scaling is already done by the Pack API 6. GEMM Pack API eliminates the cost of packing the input matrices by avoiding on-the-go packing in the GEMM 5 loop. Packing of input matrices is done when there is reuse of matrices across different GEMM calls. AMD-Internal: [CPUPL-3560] Change-Id: Ieeb5df2d2f3b10ebf2d00dab6f455cf64a047de3 --- frame/1m/packm/CMakeLists.txt | 3 +- frame/1m/packm/bli_pack_full.c | 391 ++++++++++++++++++++++ frame/1m/packm/bli_pack_full.h | 61 ++++ frame/1m/packm/bli_packm.h | 2 + frame/compat/CMakeLists.txt | 4 +- frame/compat/bla_gemm_pack.c | 197 +++++++++++ frame/compat/bla_gemm_pack.h | 95 ++++++ frame/compat/bla_gemm_pack_get_size.c | 255 ++++++++++++++ frame/compat/bla_gemm_pack_get_size.h | 77 +++++ frame/compat/bli_blas.h | 3 + frame/include/bli_gentprot_macro_defs.h | 7 +- frame/thread/CMakeLists.txt | 4 +- frame/thread/bli_pack_full_decor.h | 66 ++++ frame/thread/bli_pack_full_decor_openmp.c | 81 +++++ frame/thread/bli_pack_full_decor_openmp.h | 43 +++ frame/thread/bli_pack_full_decor_single.c | 73 ++++ frame/thread/bli_pack_full_decor_single.h | 43 +++ frame/thread/bli_thread.h | 4 + 18 files changed, 1404 insertions(+), 5 deletions(-) create mode 100644 frame/1m/packm/bli_pack_full.c create mode 100644 frame/1m/packm/bli_pack_full.h create mode 100644 frame/compat/bla_gemm_pack.c create mode 100644 frame/compat/bla_gemm_pack.h create mode 100644 frame/compat/bla_gemm_pack_get_size.c create mode 100644 frame/compat/bla_gemm_pack_get_size.h create mode 100644 frame/thread/bli_pack_full_decor.h create mode 100644 frame/thread/bli_pack_full_decor_openmp.c create
mode 100644 frame/thread/bli_pack_full_decor_openmp.h create mode 100644 frame/thread/bli_pack_full_decor_single.c create mode 100644 frame/thread/bli_pack_full_decor_single.h diff --git a/frame/1m/packm/CMakeLists.txt b/frame/1m/packm/CMakeLists.txt index ee70b7124a..37963d46c0 100644 --- a/frame/1m/packm/CMakeLists.txt +++ b/frame/1m/packm/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.## target_sources("${PROJECT_NAME}" PRIVATE @@ -22,5 +22,6 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk_rih.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_thrinfo.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_unb_var1.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_full.c ) diff --git a/frame/1m/packm/bli_pack_full.c b/frame/1m/packm/bli_pack_full.c new file mode 100644 index 0000000000..a6f30b0253 --- /dev/null +++ b/frame/1m/packm/bli_pack_full.c @@ -0,0 +1,391 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_pack_full_init + ( + const char* identifier, + obj_t* alpha_obj, + obj_t* src_obj, + obj_t* dest_obj, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // Initializing the cntx if one isn't already passed. + if ( cntx == NULL ) { + cntx = bli_gks_query_cntx(); + } + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) + { + bli_rntm_init_from_global( &rntm_l ); + rntm = &rntm_l; + } + else + { + rntm_l = *rntm; + rntm = &rntm_l; + } + + const num_t dt = bli_obj_dt( src_obj ); + + bli_pack_full_thread_decorator + ( + bli_is_float( dt ) ? 
bli_spackm_full: bli_dpackm_full, + identifier, + alpha_obj, + src_obj, + dest_obj, + cntx, + rntm + ); + +} + +// Full pack function for A matrix + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, tfuncname ) \ +\ +void PASTEMAC(ch,tfuncname) \ + ( \ + dim_t m, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict src, \ + inc_t rs, \ + inc_t cs, \ + ctype* restrict dest, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{\ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + dim_t KC = KC0; \ +\ + const inc_t pcstep_a = cs; \ + const inc_t icstep_a = rs; \ +\ + const inc_t pcstep_a_use = ( ( m + MR - 1 ) / MR ) * MR; \ +\ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + const inc_t icstep_a_use = kc_cur; \ +\ + ctype* restrict a_pc = src + pp * pcstep_a; \ + ctype* restrict a_pc_use = dest + pp * pcstep_a_use; \ +\ + /* Grow the thrinfo_t tree. */ \ + thread_ic = thread; \ +\ + /* Compute the IC loop thread range for the current thread. 
*/ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict a_ic_use = a_pc_use + ii * icstep_a_use; \ +\ + /* Packing is parallelized only at IC loop */ \ + thread_pa = &BLIS_GEMM_SINGLE_THREADED; \ +\ + pack_t schema = BLIS_PACKED_ROW_PANELS; \ + dim_t m_max = ( mc_cur / MR + ( mc_cur % MR ? 1 : 0 ) ) * MR; \ + dim_t k_max = kc_cur; \ +\ + rs_a_use = 1; \ + cs_a_use = MR; \ +\ + inc_t pd_a_use = MR; \ + ps_a_use = MR * kc_cur; \ +\ + /* For packing to column-stored row panels, use var1. */ \ + PASTEMAC(ch,packm_sup_var1) \ + ( \ + BLIS_NO_TRANSPOSE, \ + schema, \ + mc_cur, \ + kc_cur, \ + m_max, \ + k_max, \ + alpha, \ + a_ic, rs, cs, \ + a_ic_use, rs_a_use, cs_a_use, \ + pd_a_use, ps_a_use, \ + cntx, \ + thread_pa \ + ); \ +\ + } \ + } \ +\ +} \ + +INSERT_GENTFUNC_BASIC0_SD( pack_full_a ) + + + +// Full pack function for B matrix + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, tfuncname ) \ +\ +void PASTEMAC(ch,tfuncname) \ + ( \ + dim_t k, \ + dim_t n, \ + ctype* restrict alpha, \ + ctype* restrict src, \ + inc_t rs, \ + inc_t cs, \ + ctype* restrict dest, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. 
*/ \ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + dim_t KC = KC0; \ +\ + const inc_t jcstep_b = cs; \ + const inc_t pcstep_b = rs; \ +\ + const inc_t jcstep_b_use = k; \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ +\ + thread_jc = thread; \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \ +\ + ctype* restrict b_jc = src + jj * jcstep_b; \ + ctype* restrict b_jc_use = dest + jj * jcstep_b_use; \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? 
KC : pc_left ); \ +\ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + ctype* restrict b_pc_use = b_jc_use + pp * pcstep_b_use; \ +\ + /* Packing is parallelized only at JC loop */ \ + thread_pb = &BLIS_GEMM_SINGLE_THREADED; \ +\ + pack_t schema = BLIS_PACKED_COL_PANELS; \ + dim_t k_max = kc_cur; \ + dim_t n_max = ( nc_cur / NR + ( nc_cur % NR ? 1 : 0 ) ) * NR; \ +\ + rs_b_use = NR; \ + cs_b_use = 1; \ +\ + inc_t pd_b_use = NR; \ + ps_b_use = kc_cur * NR; \ +\ + /* For packing to row-stored column panels, use var1. */ \ + PASTEMAC(ch,packm_sup_var1) \ + ( \ + BLIS_NO_TRANSPOSE, \ + schema, \ + kc_cur, \ + nc_cur, \ + k_max, \ + n_max, \ + alpha, \ + b_pc, rs, cs, \ + b_pc_use, rs_b_use, cs_b_use, \ + pd_b_use, ps_b_use, \ + cntx, \ + thread_pb \ + ); \ +\ + } \ + } \ +\ +} \ + +INSERT_GENTFUNC_BASIC0_SD( pack_full_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, tfuncname ) \ +\ +void PASTEMAC(ch,tfuncname) \ + ( \ + const char* identifier, \ + obj_t* alpha_obj, \ + obj_t* src_obj, \ + obj_t* dest_obj, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ +\ + const num_t dt = bli_obj_dt( src_obj ); \ +\ + inc_t rs = bli_obj_row_stride( src_obj ); \ + inc_t cs = bli_obj_col_stride( src_obj ); \ + void* restrict src = bli_obj_buffer_at_off( src_obj ); \ + void* restrict dest = bli_obj_buffer_at_off( dest_obj ); \ + void* restrict alpha = bli_obj_buffer_for_1x1( dt, alpha_obj ); \ + dim_t length = bli_obj_length( src_obj ); \ + dim_t width = bli_obj_width(src_obj); \ +\ + if ( bli_obj_has_trans( src_obj ) ) \ + { \ + rs = cs; \ + cs = 1; \ + dim_t temp = length; \ + length = width; \ + width = temp; \ + } \ +\ + /*---------------------------------------A-----------------------------------*/\ + if (*identifier == 'a' || *identifier == 'A') \ + {\ + PASTEMAC(ch, pack_full_a) \ + ( \ + length, \ + width, \ + alpha, \ + src, \ + rs, \ + cs, \ + dest, \ + cntx, \ + rntm, \ + thread \ + ); \ + } \ +\ 
+/*---------------------------------------B-----------------------------------*/\ + if (*identifier == 'b' || *identifier == 'B') \ + {\ + PASTEMAC(ch, pack_full_b) \ + ( \ + length, \ + width, \ + alpha, \ + src, \ + rs, \ + cs, \ + dest, \ + cntx, \ + rntm, \ + thread \ + ); \ + } \ +\ +} \ + +INSERT_GENTFUNC_BASIC0_SD( packm_full ) + + diff --git a/frame/1m/packm/bli_pack_full.h b/frame/1m/packm/bli_pack_full.h new file mode 100644 index 0000000000..bbb3bbe234 --- /dev/null +++ b/frame/1m/packm/bli_pack_full.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_pack_full_init + ( + const char* identifier, + obj_t* alpha_obj, + obj_t* src_obj, + obj_t* dest_obj, + cntx_t* cntx, + rntm_t* rntm + ); + +#undef GENTPROT +#define GENTPROT( ctype, ch, tfuncname ) \ +\ +void PASTEMAC(ch,tfuncname) \ + ( \ + const char* identifier, \ + obj_t* alpha_obj, \ + obj_t* src_obj, \ + obj_t* dest, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ); \ + +INSERT_GENTPROT_BASIC0_SD( packm_full ) diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 85f7011655..b98fbc368c 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -54,6 +54,8 @@ #include "bli_packm_cxk_rih.h" #include "bli_packm_cxk_1er.h" +#include "bli_pack_full.h" + // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD #include "bli_packm_md.h" diff --git a/frame/compat/CMakeLists.txt b/frame/compat/CMakeLists.txt index bfe8e10508..3b1ab26705 100644 --- a/frame/compat/CMakeLists.txt +++ b/frame/compat/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## target_sources("${PROJECT_NAME}" PRIVATE @@ -49,6 +49,8 @@ ${TARGET_ARCH} STREQUAL amdzen) ${CMAKE_CURRENT_SOURCE_DIR}/bla_scal_amd.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_swap_amd.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_amd.c + ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c + ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c ) else() target_sources("${PROJECT_NAME}" diff --git a/frame/compat/bla_gemm_pack.c b/frame/compat/bla_gemm_pack.c new file mode 100644 index 0000000000..8feabc8af2 --- /dev/null +++ b/frame/compat/bla_gemm_pack.c @@ -0,0 +1,197 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// BLAS Extension APIs +/* ?gemm_pack.h */ +/* BLAS interface to perform scaling and packing of the */ +/* matrix to a packed matrix structure to be used in subsequent calls */ +/* Datatype : s & d (single and double precision only supported) */ +/* BLAS Extensions */ + +#include "blis.h" + +void sgemm_pack_blis_impl + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* mm, + const f77_int* nn, + const f77_int* kk, + const float* alpha, + const float* src, const f77_int* pld, + float* dest + ) +{ + dim_t m; + dim_t n; + dim_t k; + + dim_t m0 = 0; + dim_t n0 = 0; + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *mm, m ); + bli_convert_blas_dim1( *nn, n ); + bli_convert_blas_dim1( *kk, k ); + + inc_t cs = *pld; + inc_t rs = 1; + + trans_t blis_trans; + + num_t dt = BLIS_FLOAT; + + /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ + bli_param_map_netlib_to_blis_trans( *trans, &blis_trans ); + + obj_t src_obj = BLIS_OBJECT_INITIALIZER; + obj_t dest_obj = BLIS_OBJECT_INITIALIZER; + obj_t alpha_obj = BLIS_OBJECT_INITIALIZER; + + + if (*identifier == 'a' || *identifier == 'A') + { + bli_set_dims_with_trans( blis_trans, m, k, &m0, &n0 ); + } + else if (*identifier == 'b' || *identifier == 'B') + { + bli_set_dims_with_trans( blis_trans, k, n, &m0, &n0 ); + } + else + { + bli_print_msg( " Invalid IDENTIFIER setting sgemm_pack_() .", __FILE__, __LINE__ ); + return; + } + + bli_obj_init_finish_1x1( dt, (float*)alpha, &alpha_obj ); + + bli_obj_init_finish( dt, m0, n0, (float*)src, rs, cs, &src_obj ); + bli_obj_init_finish( dt, m0, n0, (float*)dest, rs, cs, &dest_obj ); + + bli_obj_set_conjtrans( blis_trans, &src_obj ); + + bli_pack_full_init(identifier, &alpha_obj, &src_obj, &dest_obj, NULL, NULL); +} + +void sgemm_pack_ + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* mm, + const f77_int* nn, + const f77_int* kk, + const float* alpha, + const float* src, const f77_int* pld, + float* dest + ) +{ + sgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void dgemm_pack_blis_impl + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* mm, + const f77_int* nn, + const f77_int* kk, + const double* alpha, + const double* src, const f77_int* pld, + double* dest + ) +{ + dim_t m; + dim_t n; + dim_t k; + + dim_t m0 = 0; + dim_t n0 = 0; + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *mm, m ); + bli_convert_blas_dim1( *nn, n ); + bli_convert_blas_dim1( *kk, k ); + + inc_t cs = *pld; + inc_t rs = 1; + + trans_t blis_trans; + + num_t dt = BLIS_DOUBLE; + + /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ + bli_param_map_netlib_to_blis_trans( *trans, &blis_trans ); + + obj_t src_obj = BLIS_OBJECT_INITIALIZER; + obj_t dest_obj = BLIS_OBJECT_INITIALIZER; + obj_t alpha_obj = BLIS_OBJECT_INITIALIZER; + + if (*identifier == 'a' || *identifier == 'A') + { + bli_set_dims_with_trans( blis_trans, m, k, &m0, &n0 ); + } + else if (*identifier == 'b' || *identifier == 'B') + { + bli_set_dims_with_trans( blis_trans, k, n, &m0, &n0 ); + } + else + { + bli_print_msg( " Invalid IDENTIFIER setting dgemm_pack_() .", __FILE__, __LINE__ ); + return; + } + + bli_obj_init_finish_1x1( dt, (double*)alpha, &alpha_obj ); + + bli_obj_init_finish( dt, m0, n0, (double*)src, rs, cs, &src_obj ); + bli_obj_init_finish( dt, m0, n0, (double*)dest, rs, cs, &dest_obj ); + + bli_obj_set_conjtrans( blis_trans, &src_obj ); + + bli_pack_full_init(identifier, &alpha_obj, &src_obj, &dest_obj, NULL, NULL); +} + +void dgemm_pack_ + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* mm, + const f77_int* nn, + const f77_int* kk, + const double* alpha, + const double* src, const f77_int* pld, + double* dest + ) +{ + dgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} diff --git a/frame/compat/bla_gemm_pack.h b/frame/compat/bla_gemm_pack.h new file mode 100644 index 0000000000..1694ef0e4f --- /dev/null +++ b/frame/compat/bla_gemm_pack.h @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// BLAS Extension APIs +/* ?gemm_pack.h */ +/* BLAS interface to perform scaling and packing of the */ +/* matrix to a packed matrix structure to be used in subsequent calls */ +/* Datatype : s & d (single and double precision only supported) */ +/* BLAS Extensions */ +/* output is a packed buffer */ + +// Currently we are not adding blis interfaces - these BLAS interfaces will be available by default + +#ifdef BLIS_ENABLE_BLAS +BLIS_EXPORT_BLAS void dgemm_pack_ + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const double* alpha, + const double* src, const f77_int* pld, + double* dest + ); +#endif + +BLIS_EXPORT_BLAS void dgemm_pack_blis_impl + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const double* alpha, + const double* src, const f77_int* pld, + double* dest + ); + +#ifdef BLIS_ENABLE_BLAS +BLIS_EXPORT_BLAS void sgemm_pack_ + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const float* alpha, + const float* src, const f77_int* pld, + float* dest + ); +#endif + +BLIS_EXPORT_BLAS void sgemm_pack_blis_impl + ( + const f77_char* identifier, + const f77_char* trans, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const float* alpha, + const float* src, const f77_int* pld, + float* dest + ); diff --git a/frame/compat/bla_gemm_pack_get_size.c b/frame/compat/bla_gemm_pack_get_size.c new file mode 100644 index 0000000000..53e64b5f2b --- /dev/null +++ b/frame/compat/bla_gemm_pack_get_size.c @@ -0,0 +1,255 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// BLAS Extension APIs +/* ?gemm_pack_get_size.c */ +/* This program is a C interface to query the size of storage */ +/* required for a packed matrix structure to be used in subsequent calls */ +/* Datatype : s & d (single and double precision only supported) */ +/* BLAS Extensions */ +/* returns number of bytes */ + +#include "blis.h" + +f77_int dgemm_pack_get_size_blis_impl + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + + bli_init_auto(); // initialize blis + cntx_t* cntx = bli_gks_query_cntx(); // Get processor specific context. + + num_t dt = BLIS_DOUBLE; // Double precision + f77_int tbytes = 0; // total number of bytes needed for packing. + f77_int m = *pm; + f77_int n = *pn; + f77_int k = *pk; + + // Retreive cache-blocking parameters used in GEMM + +#if 0 // Not needed, MR and NR should do + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); + +#endif + + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + + // Note: If one of the dimensions is zero - we return zero bytes + + // if we allocate memory based on MC, KC and NC. We might be wasting the memory + // for matrix sizes smaller than these values + // When packing A - MC x KC is size of a row-panel but when k < KC then + // we take only MC x k. Basically the row-panel size is MC x min(k, KC). + // But if m < MC. Then we make "m" aligned to MR. m_p_pad = m aligned to MR. + // Minimum unit of work the Kernel operates is by computing MR x NR block of C. + // Kernel: It multiplies MR x min(k, KC) of A column-micro panel with min(k, KC) x NR + // of B row-micro panel. + + // Therefore the packing sizes will be : + // For A pack - m_p_pad x k. 
where m_p_pad = m multiple of MR. + // For B pack - k x n_p_pad. where n_p_pad = n multiple of NR. + + if ( (*identifier == 'a') || (*identifier == 'A') ) + { + // Size of single packed A buffer is MC x KC elements - row-panels of A + // Number of elements in row-panel of A = MC x KC + // size of micro-panels is MR x KC + dim_t m_p_pad = ( (m + MR - 1)/MR ) * MR; + dim_t ps_n = m_p_pad * k; // size of all packed buffer (multiples of MR x k) + + // if A is transposed - then A' dimensions will be k x m + // here k should be multiple of MR + dim_t mt_p_pad = ((k + MR -1)/MR ) * MR; + + dim_t ps_t = mt_p_pad * m; + + // We pick the max size to ensure handling the transpose case. + dim_t ps_max = bli_max(ps_n, ps_t); + + tbytes = ps_max * sizeof( double ); + } + else if ( (*identifier == 'b') || (*identifier == 'B') ) + { + // Size of Single Packed B buffer is KC x NC elements. - Column panels of B + // Number of elements in column-panel of B = KC x NC + + // size of micro-panels is KC x NR + dim_t n_p_pad = ( (n + NR - 1)/NR ) * NR; + dim_t ps_n = k * n_p_pad; // size of packed buffer of B (multiples of k x NR) + + // if B is transposed then B' - dimension is n x k + // here k should be multiple of NR + dim_t nt_p_pad = ( (k + NR -1)/NR ) * NR; + dim_t ps_t = n * nt_p_pad; + + // We pick the max size to ensure handling the transpose case. 
+ dim_t ps_max = bli_max(ps_n, ps_t); + + tbytes = ps_max * sizeof( double ); + } + else + { + bli_print_msg( " Invalid IDENTIFIER setting dgemm_pack_get_size_() .", __FILE__, __LINE__ ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return tbytes; + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return tbytes; +} + +#ifdef BLIS_ENABLE_BLAS +f77_int dgemm_pack_get_size_ + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ) +{ + return dgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} +#endif + +f77_int sgemm_pack_get_size_blis_impl + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ) +{ + bli_init_auto(); // initialize blis + cntx_t* cntx = bli_gks_query_cntx(); // Get processor specific context. + + num_t dt = BLIS_FLOAT; // Single precision + f77_int tbytes = 0; // total number of bytes needed for packing. + f77_int m = *pm; + f77_int n = *pn; + f77_int k = *pk; + + // Retreive cache-blocking parameters used in GEMM + +#if 0 // Not needed, MR and NR should do + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); + +#endif + + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + + // Note: If one of the dimensions is zero - we return zero bytes + + // if we allocate memory based on MC, KC and NC. We might be wasting the memory + // for matrix sizes smaller than these values + // When packing A - MC x KC is size of a row-panel but when k < KC then + // we take only MC x k. Basically the row-panel size is MC x min(k, KC). + // But if m < MC. Then we make "m" aligned to MR. m_p_pad = m aligned to MR. + // Minimum unit of work the Kernel operates is by computing MR x NR block of C. 
+ // Kernel: It multiplies MR x min(k, KC) of A column-micro panel with min(k, KC) x NR + // of B row-micro panel. + + // Therefore the packing sizes will be : + // For A pack - m_p_pad x k. where m_p_pad = m multiple of MR. + // For B pack - k x n_p_pad. where n_p_pad = n multiple of NR. + + if ( (*identifier == 'a') || (*identifier == 'A') ) + { + // Size of single packed A buffer is MC x KC elements - row-panels of A + // Number of elements in row-panel of A = MC x KC + // size of micro-panels is MR x KC + dim_t m_p_pad = ( (m + MR - 1)/MR ) * MR; + dim_t ps_n = m_p_pad * k; // size of all packed buffer (multiples of MR x k) + + // if A is transposed - then A' dimensions will be k x m + // here k should be multiple of MR + dim_t mt_p_pad = ((k + MR -1)/MR ) * MR; + + dim_t ps_t = mt_p_pad * m; + + // We pick the max size to ensure handling the transpose case. + dim_t ps_max = bli_max(ps_n, ps_t); + + tbytes = ps_max * sizeof( float ); + } + else if ( (*identifier == 'b') || (*identifier == 'B')) + { + // Size of Single Packed B buffer is KC x NC elements. - Column panels of B + // Number of elements in column-panel of B = KC x NC + + // size of micro-panels is KC x NR + dim_t n_p_pad = ( (n + NR - 1)/NR ) * NR; + dim_t ps_n = k * n_p_pad; // size of packed buffer of B (multiples of k x NR) + + // if B is transposed then B' - dimension is n x k + // here k should be multiple of NR + dim_t nt_p_pad = ( (k + NR -1)/NR ) * NR; + dim_t ps_t = n * nt_p_pad; + + // We pick the max size to ensure handling the transpose case. 
+ dim_t ps_max = bli_max(ps_n, ps_t); + + tbytes = ps_max * sizeof( float ); + } + else + { + bli_print_msg( " Invalid IDENTIFIER setting sgemm_pack_get_size_() .", __FILE__, __LINE__ ); + return tbytes; + } + + return tbytes; +} + +#ifdef BLIS_ENABLE_BLAS +f77_int sgemm_pack_get_size_ + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ) +{ + return sgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} +#endif diff --git a/frame/compat/bla_gemm_pack_get_size.h b/frame/compat/bla_gemm_pack_get_size.h new file mode 100644 index 0000000000..67e389210c --- /dev/null +++ b/frame/compat/bla_gemm_pack_get_size.h @@ -0,0 +1,77 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// BLAS Extension APIs +/* ?gemm_pack_get_size.h */ +/* This program is a C interface to query the size of storage */ +/* required for a packed matrix structure to be used in subsequent calls */ +/* Datatype : s & d (single and double precision only supported) */ +/* BLAS Extensions */ +/* returns number of bytes */ + +#ifdef BLIS_ENABLE_BLAS +f77_int dgemm_pack_get_size_ + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ); +#endif + +BLIS_EXPORT_BLAS f77_int dgemm_pack_get_size_blis_impl + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ); + +#ifdef BLIS_ENABLE_BLAS +f77_int sgemm_pack_get_size_ + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ); +#endif + +BLIS_EXPORT_BLAS f77_int sgemm_pack_get_size_blis_impl + ( + const f77_char* identifier, + const f77_int* pm, + const f77_int* pn, + const f77_int* pk + ); diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index 750692061a..99e8ff5962 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -199,6 +199,9 @@ #include "bla_gemm_batch.h" #include "bla_gemm3m.h" #include "bla_gemm3m_check.h" +#include "bla_gemm_pack_get_size.h" +#include "bla_gemm_pack.h" + // -- Transpose and Copy Routines -- #include "bla_omatadd.h" #include "bla_omatcopy.h" diff --git a/frame/include/bli_gentprot_macro_defs.h 
b/frame/include/bli_gentprot_macro_defs.h index 703bef68d8..8190ffc020 100644 --- a/frame/include/bli_gentprot_macro_defs.h +++ b/frame/include/bli_gentprot_macro_defs.h @@ -6,7 +6,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -170,7 +170,10 @@ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) - +#define INSERT_GENTPROT_BASIC0_SD( tfuncname ) \ +\ +GENTPROT( float, s, tfuncname ) \ +GENTPROT( double, d, tfuncname ) // -- (one auxiliary argument) -- diff --git a/frame/thread/CMakeLists.txt b/frame/thread/CMakeLists.txt index e6b83e3c24..9e93e69b5a 100644 --- a/frame/thread/CMakeLists.txt +++ b/frame/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## target_sources("${PROJECT_NAME}" PRIVATE @@ -16,4 +16,6 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_thread.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrinfo.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrinfo_sup.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_full_decor_openmp.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_full_decor_single.c ) diff --git a/frame/thread/bli_pack_full_decor.h b/frame/thread/bli_pack_full_decor.h new file mode 100644 index 0000000000..65c689a5d8 --- /dev/null +++ b/frame/thread/bli_pack_full_decor.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_PACK_FULL_DECOR_H +#define BLIS_PACK_FULL_DECOR_H + +// Pack Full internal function type. +typedef void (*pack_full_t) + ( + const char* identifier, + obj_t* alpha_obj, + obj_t* src_obj, + obj_t* dest_obj, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Pack Full thread decorator prototype. 
+void bli_pack_full_thread_decorator + ( + pack_full_t func, + const char* identifier, + obj_t* alpha_obj, + obj_t* src_obj, + obj_t* dest_obj, + cntx_t* cntx, + rntm_t* rntm + ); + +#include "bli_pack_full_decor_single.h" +#include "bli_pack_full_decor_openmp.h" +// #include "bli_pack_full_decor_pthreads.h" + +#endif diff --git a/frame/thread/bli_pack_full_decor_openmp.c b/frame/thread/bli_pack_full_decor_openmp.c new file mode 100644 index 0000000000..a6f94afbb6 --- /dev/null +++ b/frame/thread/bli_pack_full_decor_openmp.c @@ -0,0 +1,81 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_OPENMP + +void* bli_pack_full_thread_entry( void* data_void ) { return NULL; } + +void bli_pack_full_thread_decorator + ( + pack_full_t func, + const char* identifier, + obj_t* alpha_obj, + obj_t* src_obj, + obj_t* dest_obj, + cntx_t* cntx, + rntm_t* rntm + ) +{ + dim_t n_threads = bli_rntm_num_threads( rntm ); + + /* Ensure n_threads is always greater than or equal to 1 */ + /* Passing BLIS_IC_NT and BLIS_JC_NT for pack can lead to n_threads */ + /* becoming negative. In that case, packing is done using 1 thread */ + n_threads = ( n_threads > 0 ) ? n_threads : 1; + + _Pragma( "omp parallel num_threads(n_threads)" ) + { + thrinfo_t thread; + bli_thrinfo_set_n_way( n_threads, &thread ); + bli_thrinfo_set_work_id( omp_get_thread_num(), &thread ); + + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + func + ( + identifier, + alpha_obj, + src_obj, + dest_obj, + cntx, + rntm_p, + &thread + ); + } +} +#endif + diff --git a/frame/thread/bli_pack_full_decor_openmp.h b/frame/thread/bli_pack_full_decor_openmp.h new file mode 100644 index 0000000000..278b3ea9b0 --- /dev/null +++ b/frame/thread/bli_pack_full_decor_openmp.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_PACK_FULL_DECOR_OPENMP_H +#define BLIS_PACK_FULL_DECOR_OPENMP_H + +// Definitions specific to situations when OpenMP multithreading is enabled. 
+#ifdef BLIS_ENABLE_OPENMP + +#endif + +#endif diff --git a/frame/thread/bli_pack_full_decor_single.c b/frame/thread/bli_pack_full_decor_single.c new file mode 100644 index 0000000000..d88b35019a --- /dev/null +++ b/frame/thread/bli_pack_full_decor_single.c @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifndef BLIS_ENABLE_OPENMP + +#define SKIP_THRINFO_TREE + +void* bli_pack_full_thread_entry( void* data_void ) { return NULL; } + +void bli_pack_full_thread_decorator + ( + pack_full_t func, + const char* identifier, + obj_t* alpha_obj, + obj_t* src_obj, + obj_t* dest_obj, + cntx_t* cntx, + rntm_t* rntm + ) +{ + thrinfo_t thread = BLIS_GEMM_SINGLE_THREADED; + + { + rntm_t* restrict rntm_p = rntm; + + func + ( + identifier, + alpha_obj, + src_obj, + dest_obj, + cntx, + rntm_p, + &thread + ); + } + +} + +#endif diff --git a/frame/thread/bli_pack_full_decor_single.h b/frame/thread/bli_pack_full_decor_single.h new file mode 100644 index 0000000000..010bee11da --- /dev/null +++ b/frame/thread/bli_pack_full_decor_single.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_PACK_FULL_DECOR_SINGLE_H +#define BLIS_PACK_FULL_DECOR_SINGLE_H + +// Definitions specific to situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +#endif + +#endif diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 00ae53dff4..2cbee8ef87 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -57,6 +57,10 @@ // for the sup code path. #include "bli_l3_sup_decor.h" +// Include the pack full thread decorator and related definitions and prototypes +// for the pack code path. +#include "bli_pack_full_decor.h" + // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_update_tl( void ); From 712a84d50f418d4b84b28a63abfd823811132ee2 Mon Sep 17 00:00:00 2001 From: jagar Date: Thu, 5 Oct 2023 15:06:45 +0530 Subject: [PATCH 145/226] Gtestsuite: Update in cmake to search reflib in given path AMD-Internal: [CPUPL-2732] Change-Id: Ide2b98a95f81f394c7c01cc3a3b5ae6fa0403a82 --- gtestsuite/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gtestsuite/CMakeLists.txt b/gtestsuite/CMakeLists.txt index f044ceaec2..bc55cdc834 100644 --- a/gtestsuite/CMakeLists.txt +++ b/gtestsuite/CMakeLists.txt @@ -156,7 +156,7 @@ endif() if(LINUX) if(REF_LIB) set(REFLIB_PATH ${REF_LIB}/..) 
- find_library(reflib NAMES openblas cblas mkl_rt PATHS ${REFLIB_PATH}) + find_library(reflib NAMES openblas cblas mkl_rt HINTS ${REFLIB_PATH} PATHS ${REFLIB_PATH}) if(${reflib} STREQUAL reflib-NOTFOUND) message(FATAL_ERROR "Reference Library not found : " ${REF_LIB}) else() @@ -169,7 +169,7 @@ if(LINUX) during CMake invokation when OpenBLAS is used for reference results. Please use \ $ cmake .. -DOPENBLAS_PATH=/home/username/openblas_installation") endif() - find_library(reflib NAMES openblas PATHS ${OPENBLAS_PATH}) + find_library(reflib NAMES openblas HINTS ${OPENBLAS_PATH} PATHS ${OPENBLAS_PATH}) if(${reflib} STREQUAL reflib-NOTFOUND) message(FATAL_ERROR "OpenBLAS Reference Library not found : " ${OPENBLAS_PATH}) else() @@ -183,9 +183,9 @@ if(LINUX) $ cmake .. -DNETLIB_PATH=/home/username/netlib_installation") endif() if(INT_SIZE STREQUAL "32") - find_library(netlib NAMES cblas PATHS ${NETLIB_PATH}) + find_library(netlib NAMES cblas HINTS ${NETLIB_PATH} PATHS ${NETLIB_PATH}) else() - find_library(netlib NAMES cblas64 PATHS ${NETLIB_PATH}) + find_library(netlib NAMES cblas64 HINTS ${NETLIB_PATH} PATHS ${NETLIB_PATH}) endif() if(${netlib} STREQUAL netlib-NOTFOUND) message(FATAL_ERROR "Netlib Reference Library not found : " ${NETLIB_PATH}) @@ -196,7 +196,7 @@ if(LINUX) elseif(REF_CBLAS STREQUAL "MKL") set(MKL_PATH $ENV{MKLROOT}/lib/intel64 CACHE STRING "The path to MKL.") - find_library(mkllib NAMES mkl_rt PATHS ${MKL_PATH}) + find_library(mkllib NAMES mkl_rt HINTS ${MKL_PATH} PATHS ${MKL_PATH}) if(${mkllib} STREQUAL mkllib-NOTFOUND) message(FATAL_ERROR "MKL Reference Library not found : " ${MKL_PATH}) else() From 5d578684eacc3786800f9d545f86c0eab9793350 Mon Sep 17 00:00:00 2001 From: jagar Date: Fri, 6 Oct 2023 16:29:32 +0530 Subject: [PATCH 146/226] GtestSuite: Update in source code to make it compatible on MSVC(windows) AMD-Internal: [CPUPL-2732] Change-Id: Ifd9372bf9b0f00c2bf24442ea8519bfcf4e5db5b --- gtestsuite/testinghelpers/inc/common/testing_basics.h | 1 + 
.../testsuite/util/nrm2/nrm2_underflow_overflow.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gtestsuite/testinghelpers/inc/common/testing_basics.h b/gtestsuite/testinghelpers/inc/common/testing_basics.h index 67e51254df..e7f92a9356 100644 --- a/gtestsuite/testinghelpers/inc/common/testing_basics.h +++ b/gtestsuite/testinghelpers/inc/common/testing_basics.h @@ -36,6 +36,7 @@ #include #include +#include #include "cblas.h" #include "common/type_info.h" diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp index 7ab2f99c91..9f7dc87d80 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp @@ -11,7 +11,7 @@ TYPED_TEST(OUT_nrm2, maxFP_scalar) { using T = TypeParam; using RT = typename testinghelpers::type_info::real_type; - RT maxval = std::numeric_limits::max(); + RT maxval = (std::numeric_limits::max)(); T x = T{maxval}; RT norm = nrm2(1, &x, 1); @@ -22,7 +22,7 @@ TYPED_TEST(OUT_nrm2, maxFP_vectorized) { using RT = typename testinghelpers::type_info::real_type; gtint_t n = 64; std::vector x(n, T{0}); - RT maxval = std::numeric_limits::max(); + RT maxval = (std::numeric_limits::max)(); x[17] = T{maxval}; RT norm = nrm2(n, x.data(), 1); computediff(maxval, norm); @@ -33,7 +33,7 @@ TYPED_TEST(OUT_nrm2, minFP_scalar) { using T = TypeParam; using RT = typename testinghelpers::type_info::real_type; - RT minval = std::numeric_limits::min(); + RT minval = (std::numeric_limits::min)(); T x = T{minval}; RT norm = nrm2(1, &x, 1); computediff(minval, norm); @@ -43,7 +43,7 @@ TYPED_TEST(OUT_nrm2, minFP_vectorized) { using RT = typename testinghelpers::type_info::real_type; gtint_t n = 64; std::vector x(n, T{0}); - RT minval = std::numeric_limits::min(); + RT minval = (std::numeric_limits::min)(); x[17] = T{minval}; RT norm = nrm2(n, x.data(), 1); computediff(minval, norm); From 
85f2bf6c4adc0d7e339b45bb7988294c278af88c Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 4 Oct 2023 00:18:57 +0530 Subject: [PATCH 147/226] Fix for x86_64 builds Configuration x86_64 includes all Intel and AMD sub-configurations. Fixes to enable this to work correctly again are: - In config_registry use amdzen rather than amd64 in x86_64 family. - Copy settings from config/amdzen/bli_family_amdzen.h to config/x86_64/bli_family_x86_64.h - Modify configure to set enable_aocl_zen=yes for x86_64, but not for amd64_legacy. - Add "if defined(BLIS_FAMILY_X86_64)" to frame/3/bli_l3_sup.c and frame/3/bli_l3_sup_int_amd.c so zen-specific code paths are enabled. Note: sub-configurations knl and bulldozer use instructions that are not supported on most x86_64 processors. AMD-Internal: [CPUPL-3838] Change-Id: I0bd8fd89ccd846f80e5491ef44ade7d409970b04 --- config/x86_64/bli_family_x86_64.h | 27 ++++++++++++++++++++++++--- config_registry | 2 +- configure | 9 +++++---- frame/3/bli_l3_sup.c | 2 +- frame/3/bli_l3_sup_int_amd.c | 4 ++-- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/config/x86_64/bli_family_x86_64.h b/config/x86_64/bli_family_x86_64.h index 21b44db870..c327a0b19a 100644 --- a/config/x86_64/bli_family_x86_64.h +++ b/config/x86_64/bli_family_x86_64.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,10 +33,30 @@ */ -//#ifndef BLIS_FAMILY_H -//#define BLIS_FAMILY_H +#ifndef BLIS_FAMILY_H +#define BLIS_FAMILY_H +// By default, it is effective to parallelize the outer loops. +// Setting these macros to 1 will force JR and IR inner loops +// to be not parallelized. 
+// +#define BLIS_THREAD_MAX_IR 1 +#define BLIS_THREAD_MAX_JR 1 +#define BLIS_ENABLE_SMALL_MATRIX +#define BLIS_ENABLE_SMALL_MATRIX_TRSM -//#endif +// This will select the threshold below which small matrix code will be called. +#define BLIS_SMALL_MATRIX_THRES 700 +#define BLIS_SMALL_M_RECT_MATRIX_THRES 160 +#define BLIS_SMALL_K_RECT_MATRIX_THRES 128 + +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 + +// When running HPL with pure MPI without DGEMM threading (Single-threaded +// BLIS), defining this macro as 1 yields better performance. +#define AOCL_BLIS_MULTIINSTANCE 0 + +#endif diff --git a/config_registry b/config_registry index 4e6716dfa1..8a3a47bfbd 100644 --- a/config_registry +++ b/config_registry @@ -8,7 +8,7 @@ # # Processor families. -x86_64: intel64 amd64 amd64_legacy +x86_64: intel64 amdzen amd64_legacy intel64: skx knl haswell sandybridge penryn generic amd64_legacy: excavator steamroller piledriver bulldozer generic amdzen: zen4 zen3 zen2 zen generic diff --git a/configure b/configure index a165c1ad51..96a803504a 100755 --- a/configure +++ b/configure @@ -3332,10 +3332,11 @@ main() uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" - #create a AOCL specific #define - #This macro is enabled only for zen family configurations. - #This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. - uconf=$(echo ${config_name} | grep -c 'zen\|amd64' | cut -d. -f1) + # Create a AOCL specific #define + # This macro is enabled only for zen family configurations. + # This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. + # Note: amd64_legacy is for pre-zen architectures. + uconf=$(echo ${config_name} | grep -v amd64_legacy |grep -c 'zen\|amd64\|x86_64' | cut -d. 
-f1) if [[ $uconf == 1 ]]; then enable_aocl_zen='yes' enable_aocl_zen_01=1 diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index afd74d2ee4..317956ba4d 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -107,7 +107,7 @@ err_t bli_gemmsup if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } -#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) if((bli_arch_query_id() == BLIS_ARCH_ZEN4)) { diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c index 69b691674c..2664f48bfa 100644 --- a/frame/3/bli_l3_sup_int_amd.c +++ b/frame/3/bli_l3_sup_int_amd.c @@ -134,7 +134,7 @@ err_t bli_gemmsup_int } } -#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA @@ -212,7 +212,7 @@ err_t bli_gemmsup_int } } -#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA From c3b97559c1c41f613e598c19286413a74b5b7f9f Mon Sep 17 00:00:00 2001 From: mkadavil Date: Fri, 8 Sep 2023 18:26:08 +0530 Subject: [PATCH 148/226] Zero Point support for 8s8s<32|16>os8 LPGEMM APIs -Downscaled / quantized value is calculated using the formula x' = (x / scale_factor) + zero_point. As it stands, the micro-kernels for these APIs only support scaling. Zero point addition is implemented as part of this commit, with it being fused as part of the downscale post-op in the micro-kernel. 
The zero point input is a vector of int8 values, and currently only vector based zero point addition is supported. -Bench enhancements to test/benchmark zero point addition. AMD-Internal: [SWLCSG-2332] Change-Id: I96b4b1e5a384a4683b50ca310dcfb63debb1ebea --- bench/bench_aocl_gemm/bench_lpgemm.c | 53 ++- .../s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c | 35 +- .../s8s8s16/lpgemm_s8_m_fringe_amd256.c | 64 +++- .../s8s8s16/lpgemm_s8_mn_fringe_amd256.c | 70 +++- .../s8s8s16/lpgemm_s8_n_fringe_amd256.c | 38 +- .../u8s8s16/lpgemm_6x32rowmajor_amd256.c | 43 ++- .../lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c | 64 +++- .../lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c | 70 +++- .../lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c | 38 +- .../lpgemm/u8s8s16/lpgemm_s16_kern_macros.h | 5 +- .../lpgemm_6x64rowmajor_s8_amd512vnni.c | 62 ++-- .../s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c | 180 ++++++---- .../s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c | 330 ++++++++++++------ .../s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c | 108 +++--- .../u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c | 62 ++-- .../u8s8s32/lpgemm_m_fringe_amd512vnni.c | 180 ++++++---- .../u8s8s32/lpgemm_mn_fringe_amd512vnni.c | 330 ++++++++++++------ .../lpgemm_n_extMR_fringe_amd512vnni.c | 99 +++--- .../u8s8s32/lpgemm_n_fringe_amd512vnni.c | 108 +++--- .../lpgemm/u8s8s32/lpgemm_s32_kern_macros.h | 10 +- 20 files changed, 1314 insertions(+), 635 deletions(-) diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 9fd41dc42e..47d6491c94 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -95,7 +95,7 @@ void fill_array_ ## ctype ( void* arr, dim_t size ) \ ctype* temp_arr = ( ctype* ) arr; \ for ( dim_t i = 0; i < size; ++i ) \ { \ - temp_arr[i] = ( ctype )( i % 10 ); \ + temp_arr[i] = ( ctype )( i % 20 ); \ } \ } \ @@ -352,23 +352,28 @@ int min (int a, int b) return ( a < b ? 
a : b ); } -#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \ +#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(C_type,ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \ static inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \ (\ ACCUM_type temp_accum,\ aocl_post_op* post_op, \ dim_t j \ )\ -{\ - ACCUM_type out_temp_accum = ( ACCUM_type ) min ( max ( nearbyintf( ( SCALE_type )temp_accum * \ - ( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ), S8_MIN ), S8_MAX ) ; \ +{ \ + ACCUM_type out_temp_accum = \ + ( ACCUM_type )min( \ + max( nearbyintf( ( SCALE_type )( temp_accum ) * \ + ( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ) + \ + *( ( C_type* )post_op->sum.zero_point + j ), \ + S8_MIN ), \ + S8_MAX ); \ return out_temp_accum; \ }\ -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,u8s8s16os8) -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,u8s8s32os8) -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,s8s8s32os8) -GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,s8s8s16os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,u8s8s16os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,u8s8s32os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,s8s8s32os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,s8s8s16os8) static inline float mat_mul_accuracy_check_downscale_bf16bf16f32obf16 ( @@ -735,7 +740,7 @@ GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int16_t,int16_t,float,s8s8s16os16,s8s GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int8_t,int16_t,float,s8s8s16os8,s8s8s16os8) /* Only supports bias followed by RELU and vice versa for now.*/ \ -#define GEN_MAT_MUL_POST_OPS_CREATOR(C_type,DSCALE_type,BLAS_SFX) \ +#define GEN_MAT_MUL_POST_OPS_CREATOR(C_DSCALE_type,C_type,DSCALE_type,BLAS_SFX) \ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ ( \ dim_t m, \ @@ -943,19 +948,27 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ { \ /* Allocate scale buffer, return early if alloc fails.*/ \ 
post_ops->sum.scale_factor = malloc( n * sizeof( DSCALE_type ) ); \ - if ( post_ops->sum.scale_factor == NULL ) \ + post_ops->sum.zero_point = malloc( n * sizeof( C_DSCALE_type ) ); \ + if ( ( post_ops->sum.scale_factor == NULL ) || \ + ( post_ops->sum.zero_point == NULL ) ) \ { \ free ( post_ops->eltwise ); \ free ( post_ops->bias.bias ); \ free( post_ops->seq_vector ); \ free( post_ops ); \ + if ( post_ops->sum.zero_point != NULL ) \ + { free( post_ops->sum.zero_point ); } \ + if ( post_ops->sum.scale_factor != NULL ) \ + { free( post_ops->sum.scale_factor ); } \ return NULL; \ } \ - /* Fill scale factor.*/ \ + /* Fill scale factor and zero points.*/ \ DSCALE_type* temp_dscale_ptr = ( DSCALE_type* )post_ops->sum.scale_factor; \ + C_DSCALE_type* temp_dzero_point_ptr = ( C_DSCALE_type* )post_ops->sum.zero_point; \ for ( dim_t i = 0; i < n; ++i ) \ { \ temp_dscale_ptr[i] = ( ( DSCALE_type )1 )/ ( ( DSCALE_type )1000 ); \ + temp_dzero_point_ptr[i] = (C_DSCALE_type)( i % 126 ); \ } \ } \ } \ @@ -965,12 +978,12 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ return post_ops; \ } \ -GEN_MAT_MUL_POST_OPS_CREATOR(int16_t,float,u8s8s16os16) -GEN_MAT_MUL_POST_OPS_CREATOR(int32_t,float,u8s8s32os32) -GEN_MAT_MUL_POST_OPS_CREATOR(float,float,bf16bf16f32of32) -GEN_MAT_MUL_POST_OPS_CREATOR(float,float,f32f32f32of32) -GEN_MAT_MUL_POST_OPS_CREATOR(int32_t,float,s8s8s32os32) -GEN_MAT_MUL_POST_OPS_CREATOR(int16_t,float,s8s8s16os16) +GEN_MAT_MUL_POST_OPS_CREATOR(int8_t,int16_t,float,u8s8s16os16) +GEN_MAT_MUL_POST_OPS_CREATOR(int8_t,int32_t,float,u8s8s32os32) +GEN_MAT_MUL_POST_OPS_CREATOR(bfloat16,float,float,bf16bf16f32of32) +GEN_MAT_MUL_POST_OPS_CREATOR(bfloat16,float,float,f32f32f32of32) +GEN_MAT_MUL_POST_OPS_CREATOR(int8_t,int32_t,float,s8s8s32os32) +GEN_MAT_MUL_POST_OPS_CREATOR(int8_t,int16_t,float,s8s8s16os16) void lpgemm_destroy_post_ops_struct( aocl_post_op* post_ops ) { @@ -998,6 +1011,10 @@ void lpgemm_destroy_post_ops_struct( aocl_post_op* post_ops ) { free( 
post_ops->sum.scale_factor ); } + if ( post_ops->sum.zero_point != NULL ) + { + free( post_ops->sum.zero_point ); + } if ( post_ops->bias.bias != NULL ) { free( post_ops->bias.bias ); diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c index 8b41f0e6da..10693f5f53 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c @@ -775,13 +775,19 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + __m128i zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 6 rows. - CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -792,13 +798,18 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c index 8d0bea859b..e6825cb2eb 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c @@ -521,6 +521,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -533,11 +534,17 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -548,11 +555,16 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 4 rows. - CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -918,6 +930,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -930,9 +943,15 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -943,9 +962,14 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 4 rows. - CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1205,6 +1229,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1217,8 +1242,14 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -1229,8 +1260,13 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 4 rows. - CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c index 79fa0bcd3f..a9a3e56eb5 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c @@ -384,6 +384,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -396,11 +397,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -809,6 +816,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -820,11 +828,17 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1121,6 +1135,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1133,9 +1148,15 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 2 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1418,6 +1439,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -1429,9 +1451,15 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 6 rows. - CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1656,6 +1684,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1668,8 +1697,14 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 2 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1892,6 +1927,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -1903,8 +1939,14 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 2 rows. - CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c index 69b7a9baa9..e904613d8e 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c @@ -505,6 +505,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -517,13 +518,19 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1120,6 +1127,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -1131,13 +1139,19 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c index 859a377ce0..a3f1f01865 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c @@ -703,20 +703,26 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) /* Load the scale vector values into the register*/ scale_1 = _mm256_loadu_ps( - (float *)post_ops_list_temp->scale_factor + - post_ops_attr.post_op_c_j + (0 * 8)); + ( float* )post_ops_list_temp->scale_factor + + post_ops_attr.post_op_c_j + ( 0 * 8 ) ); scale_2 = _mm256_loadu_ps( - (float *)post_ops_list_temp->scale_factor + - post_ops_attr.post_op_c_j + (1 * 8)); + ( float* )post_ops_list_temp->scale_factor + + post_ops_attr.post_op_c_j + ( 1 * 8 ) ); + + // Load zero points (2 byte values). + __m128i zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -727,13 +733,18 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c index 863c57a5b6..03834b4318 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c @@ -473,6 +473,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -485,11 +486,17 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -500,11 +507,16 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 4 rows. - CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -838,6 +850,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -850,9 +863,15 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -863,9 +882,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 4 rows. - CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1101,6 +1125,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1113,8 +1138,14 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) scale_1 = _mm256_loadu_ps( @@ -1125,8 +1156,13 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + // Scale next 16 columns of the 4 rows. - CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c index e4b04e80e1..9a02626e84 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c @@ -343,6 +343,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -355,11 +356,17 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 4 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -727,6 +734,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -738,11 +746,17 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1012,6 +1026,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1024,9 +1039,15 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 2 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1282,6 +1303,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -1293,9 +1315,15 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 6 rows. - CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1500,6 +1528,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1512,8 +1541,14 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 2 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1716,6 +1751,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -1727,8 +1763,14 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 2 rows. - CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c index a3270f3091..17ac89f3ad 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c @@ -449,6 +449,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -461,13 +462,19 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (1 * 8)); + // Load zero points (2 byte values). + zero_point_0 = + _mm_loadu_si128( + ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1008,6 +1015,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; + __m128i zero_point_0; __m256 res_1, res_2; float float_buf[16]; @@ -1019,13 +1027,19 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + // Scale first 16 columns of the 6 rows. 
- CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2) - CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2) + CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2, zero_point_0) + CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2, zero_point_0) POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h index 1ce68ed498..5f5cebbb7b 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h @@ -106,7 +106,7 @@ ); \ // Downscale macro -#define CVT_MULRND_CVT16(reg, scale0, scale1) \ +#define CVT_MULRND_CVT16(reg, scale0, scale1, zero_point_0) \ \ /* Extract the first 128 bits of the register*/ \ temp[0] = _mm256_extractf128_si256( reg, 0 ); \ @@ -159,6 +159,9 @@ \ /*Permute to make sure the order is correct*/ \ reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ + \ + /* Zero point addition.*/ \ + reg = _mm256_add_epi16( reg, _mm256_cvtepi8_epi16( zero_point_0 ) ); \ // Downscale store macro #define CVT_STORE_S16_S8(reg0, reg1, m_ind, n_ind) \ diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c index a2e487bcb3..302a723685 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c @@ -1060,77 +1060,91 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) _mm512_loadu_si512( ( float* 
)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + // int8_t zero point value. + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); // c[3, 0-15] - 
CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[3, 48-63] - CVT_MULRND_CVT32(c_int32_3p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_3p3,a_int32_1,zero_point3); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); // c[4, 48-63] - CVT_MULRND_CVT32(c_int32_4p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_4p3,a_int32_1,zero_point3); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); // c[5, 32-47] - CVT_MULRND_CVT32(c_int32_5p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_5p2,a_int32_0,zero_point2); // c[5, 48-63] - CVT_MULRND_CVT32(c_int32_5p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_5p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c index a338484df6..aae48e260a 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c @@ -825,66 +825,78 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j 
+ ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - 
CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[3, 48-63] - CVT_MULRND_CVT32(c_int32_3p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_3p3,a_int32_1,zero_point3); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); // c[4, 48-63] - CVT_MULRND_CVT32(c_int32_4p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_4p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1684,54 +1696,66 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] 
- CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[3, 48-63] - CVT_MULRND_CVT32(c_int32_3p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_3p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2381,42 +2405,54 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 
0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2918,30 +2954,42 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i 
zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3292,18 +3340,30 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + 
CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c index c009bdeaf3..6cb4e5f615 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c @@ -432,21 +432,27 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); // c[4, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -835,18 +841,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + 
CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1177,15 +1189,21 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1458,12 +1476,18 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1678,9 +1702,15 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = 
_mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2072,21 +2102,24 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2451,18 +2484,21 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + 
CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2771,15 +2807,18 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3032,12 +3071,15 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3234,9 +3276,12 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3760,36 +3805,42 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) 
selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -4294,30 +4345,36 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 
16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -4737,24 +4794,30 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - 
CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -5089,18 +5152,24 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -5350,12 +5419,18 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -6012,51 +6087,60 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + 
post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + 
CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -6695,42 +6779,51 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + 
CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -7255,33 +7348,42 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -7693,24 +7795,33 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) a_int32_0 = _mm512_loadu_si512( ( float* 
)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -8008,15 +8119,24 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); 
// c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c index b88ef512d6..91af051c71 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c @@ -524,24 +524,30 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); // c[4, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1,zero_point); // c[5, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1129,24 +1135,27 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 
0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1884,42 +1893,48 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[4, 0-15] - 
CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2843,60 +2858,69 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - 
CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); // c[5, 32-47] - CVT_MULRND_CVT32(c_int32_5p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_5p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c index f79cd8775a..698e0817a4 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c @@ -906,77 +906,91 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + // int8_t zero point value. 
+ __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - 
CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[3, 48-63] - CVT_MULRND_CVT32(c_int32_3p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_3p3,a_int32_1,zero_point3); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); // c[4, 48-63] - CVT_MULRND_CVT32(c_int32_4p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_4p3,a_int32_1,zero_point3); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); // c[5, 32-47] - CVT_MULRND_CVT32(c_int32_5p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_5p2,a_int32_0,zero_point2); // c[5, 48-63] - CVT_MULRND_CVT32(c_int32_5p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_5p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c index bcaa2d81c3..0276e9b7d3 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c @@ -752,66 +752,78 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + 
post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[3, 
48-63] - CVT_MULRND_CVT32(c_int32_3p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_3p3,a_int32_1,zero_point3); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); // c[4, 48-63] - CVT_MULRND_CVT32(c_int32_4p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_4p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1547,54 +1559,66 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // 
c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[3, 48-63] - CVT_MULRND_CVT32(c_int32_3p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_3p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2191,42 +2215,54 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); 
// c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[2, 48-63] - CVT_MULRND_CVT32(c_int32_2p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_2p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2685,30 +2721,42 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) 
); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[1, 48-63] - CVT_MULRND_CVT32(c_int32_1p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_1p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3027,18 +3075,30 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) a_int32_1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point3 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + 
CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[0, 48-63] - CVT_MULRND_CVT32(c_int32_0p3,a_int32_1); + CVT_MULRND_CVT32(c_int32_0p3,a_int32_1,zero_point3); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c index 940d9e92fa..119d973a06 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c @@ -402,21 +402,27 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); // c[4, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -779,18 +785,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + 
CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1098,15 +1110,21 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1359,12 +1377,18 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1562,9 +1586,15 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] 
- CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1928,21 +1958,24 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2282,18 +2315,21 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2580,15 +2616,18 @@ 
LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2822,12 +2861,15 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3008,9 +3050,12 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -3498,36 +3543,42 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i 
zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -4000,30 +4051,36 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 
16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -4415,24 +4472,30 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); 
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -4743,18 +4806,24 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -4984,12 +5053,18 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -5602,51 +5677,60 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* 
)post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + 
CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -6246,42 +6330,51 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + 
CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -6772,33 +6865,42 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -7180,24 +7282,33 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( 
int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -7470,15 +7581,24 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); 
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c index f59c82721c..4b163350c8 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c @@ -783,42 +783,48 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); // c[4, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1,zero_point); // c[5, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1,zero_point); // c[6, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_6p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_6p0,selector1,zero_point); // c[7, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_7p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_7p0,selector1,zero_point); // c[8, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_8p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_8p0,selector1,zero_point); // c[9, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_9p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_9p0,selector1,zero_point); // c[10, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_10p0,selector1); + 
CVT_MULRND_CVT32_LT16(c_int32_10p0,selector1,zero_point); // c[11, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_11p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_11p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1664,42 +1670,45 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[6, 0-15] - CVT_MULRND_CVT32(c_int32_6p0,selector1); + CVT_MULRND_CVT32(c_int32_6p0,selector1,zero_point0); // c[7, 0-15] - CVT_MULRND_CVT32(c_int32_7p0,selector1); + CVT_MULRND_CVT32(c_int32_7p0,selector1,zero_point0); // c[8, 0-15] - CVT_MULRND_CVT32(c_int32_8p0,selector1); + CVT_MULRND_CVT32(c_int32_8p0,selector1,zero_point0); // c[9, 0-15] - CVT_MULRND_CVT32(c_int32_9p0,selector1); + CVT_MULRND_CVT32(c_int32_9p0,selector1,zero_point0); // c[10, 0-15] - CVT_MULRND_CVT32(c_int32_10p0,selector1); + CVT_MULRND_CVT32(c_int32_10p0,selector1,zero_point0); // c[11, 0-15] - CVT_MULRND_CVT32(c_int32_11p0,selector1); + CVT_MULRND_CVT32(c_int32_11p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2569,60 +2578,66 @@ 
LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); // c[6, 0-15] - CVT_MULRND_CVT32(c_int32_6p0,selector1); + CVT_MULRND_CVT32(c_int32_6p0,selector1,zero_point0); // c[6, 16-31] - CVT_MULRND_CVT32(c_int32_6p1,selector2); + 
CVT_MULRND_CVT32(c_int32_6p1,selector2,zero_point1); // c[7, 0-15] - CVT_MULRND_CVT32(c_int32_7p0,selector1); + CVT_MULRND_CVT32(c_int32_7p0,selector1,zero_point0); // c[7, 16-31] - CVT_MULRND_CVT32(c_int32_7p1,selector2); + CVT_MULRND_CVT32(c_int32_7p1,selector2,zero_point1); // c[8, 0-15] - CVT_MULRND_CVT32(c_int32_8p0,selector1); + CVT_MULRND_CVT32(c_int32_8p0,selector1,zero_point0); // c[8, 16-31] - CVT_MULRND_CVT32(c_int32_8p1,selector2); + CVT_MULRND_CVT32(c_int32_8p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c index d5f86338a6..f537057b3d 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c @@ -473,24 +473,30 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6xlt16) ( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j ) ); + __m128i zero_point = _mm_maskz_loadu_epi8 + ( + load_mask, + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ) + ); // c[0, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1,zero_point); // c[1, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1,zero_point); // c[2, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1,zero_point); // c[3, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1,zero_point); // c[4, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1,zero_point); // c[5, 0-15] - CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1); + CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1,zero_point); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1026,24 +1032,27 @@ 
LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) selector1 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -1724,42 +1733,48 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) selector2 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + 
CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } @@ -2609,60 +2624,69 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) a_int32_0 = _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + __m128i zero_point0 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + __m128i zero_point1 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + __m128i zero_point2 = + _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); // c[0, 0-15] - CVT_MULRND_CVT32(c_int32_0p0,selector1); + CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); // c[0, 16-31] - CVT_MULRND_CVT32(c_int32_0p1,selector2); + CVT_MULRND_CVT32(c_int32_0p1,selector2,zero_point1); // c[0, 32-47] - CVT_MULRND_CVT32(c_int32_0p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_0p2,a_int32_0,zero_point2); // c[1, 0-15] - CVT_MULRND_CVT32(c_int32_1p0,selector1); + CVT_MULRND_CVT32(c_int32_1p0,selector1,zero_point0); // c[1, 16-31] - CVT_MULRND_CVT32(c_int32_1p1,selector2); + 
CVT_MULRND_CVT32(c_int32_1p1,selector2,zero_point1); // c[1, 32-47] - CVT_MULRND_CVT32(c_int32_1p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_1p2,a_int32_0,zero_point2); // c[2, 0-15] - CVT_MULRND_CVT32(c_int32_2p0,selector1); + CVT_MULRND_CVT32(c_int32_2p0,selector1,zero_point0); // c[2, 16-31] - CVT_MULRND_CVT32(c_int32_2p1,selector2); + CVT_MULRND_CVT32(c_int32_2p1,selector2,zero_point1); // c[2, 32-47] - CVT_MULRND_CVT32(c_int32_2p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_2p2,a_int32_0,zero_point2); // c[3, 0-15] - CVT_MULRND_CVT32(c_int32_3p0,selector1); + CVT_MULRND_CVT32(c_int32_3p0,selector1,zero_point0); // c[3, 16-31] - CVT_MULRND_CVT32(c_int32_3p1,selector2); + CVT_MULRND_CVT32(c_int32_3p1,selector2,zero_point1); // c[3, 32-47] - CVT_MULRND_CVT32(c_int32_3p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_3p2,a_int32_0,zero_point2); // c[4, 0-15] - CVT_MULRND_CVT32(c_int32_4p0,selector1); + CVT_MULRND_CVT32(c_int32_4p0,selector1,zero_point0); // c[4, 16-31] - CVT_MULRND_CVT32(c_int32_4p1,selector2); + CVT_MULRND_CVT32(c_int32_4p1,selector2,zero_point1); // c[4, 32-47] - CVT_MULRND_CVT32(c_int32_4p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_4p2,a_int32_0,zero_point2); // c[5, 0-15] - CVT_MULRND_CVT32(c_int32_5p0,selector1); + CVT_MULRND_CVT32(c_int32_5p0,selector1,zero_point0); // c[5, 16-31] - CVT_MULRND_CVT32(c_int32_5p1,selector2); + CVT_MULRND_CVT32(c_int32_5p1,selector2,zero_point1); // c[5, 32-47] - CVT_MULRND_CVT32(c_int32_5p2,a_int32_0); + CVT_MULRND_CVT32(c_int32_5p2,a_int32_0,zero_point2); POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR } diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h index deb35e8e09..f870f8da0b 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h @@ -124,7 +124,7 @@ reg = _mm512_mask_mullo_epi32( reg, relu_cmp_mask, reg, selector2 ); \ // Downscale macro -#define CVT_MULRND_CVT32(reg,selector) \ +#define 
CVT_MULRND_CVT32(reg,selector,zero_point) \ reg = \ _mm512_cvtps_epi32 \ ( \ @@ -134,7 +134,8 @@ ( __m512 )selector, \ ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \ ) \ - ) \ + ); \ + reg = _mm512_add_epi32( reg, _mm512_cvtepi8_epi32( zero_point ) ); \ // Downscale store macro #define CVT_STORE_S32_S8(reg,m_ind,n_ind) \ @@ -147,7 +148,7 @@ ) \ // Downscale n < 16 macro -#define CVT_MULRND_CVT32_LT16(reg,selector) \ +#define CVT_MULRND_CVT32_LT16(reg,selector,zero_point) \ reg = \ _mm512_cvtps_epi32 \ ( \ @@ -157,7 +158,8 @@ ( __m512 )selector, \ ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \ ) \ - ) \ + ); \ + reg = _mm512_add_epi32( reg, _mm512_cvtepi8_epi32( zero_point ) ); \ /* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ) */ #define GELU_TANH_S32_AVX512(reg, y, r, r2, x, z, dn, x_tanh, q) \ From 9828039030902934c1cf11b86cf6b148c67d14e0 Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Tue, 3 Oct 2023 12:36:59 +0530 Subject: [PATCH 149/226] Bugfix : Inversion of sign bit with early return in SNRM2_ - The bli_snormfv_unb_var1( ... ) function returns early in case of n = 1, and uses the blis macro bli_fabs( ... ) to set the norm to the absolute value of the element. - This macro inverts the sign bit even if the element is 0.0. A check is added to re-invert the sign bit in this case, so that the norm is set to 0.0 instead of -0.0. - Added the same early exit condition on bli_dnormfv_unb_var1( ... ) when n = 1. AMD-Internal: [CPUPL-3923] Change-Id: If7f5ae41d2acfe89b505549d28215dde319d8c33 --- frame/util/bli_util_unb_var1.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 402af1c644..1c841fd41d 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -557,12 +557,17 @@ void bli_snormfv_unb_var1 rntm_t* rntm ) { - // Early return if n=1. + // Early return if n = 1. 
if ( n == 1 ) { - *norm = bli_fabs(*x); + *norm = bli_fabs( *x ); + + // If the value in x is 0.0, the sign bit gets inverted + // Reinvert the sign bit in this case. + if ( ( *norm ) == -0.0 ) ( *norm ) = 0.0; return; } + /* Disable AVX2 codepath. if( bli_cpuid_is_avx2fma3_supported() == TRUE ) { @@ -611,6 +616,17 @@ void bli_dnormfv_unb_var1 rntm_t* rntm ) { + // Early return if n = 1. + if ( n == 1 ) + { + *norm = bli_fabs( *x ); + + // If the value in x is 0.0, the sign bit gets inverted + // Reinvert the sign bit in this case. + if ( ( *norm ) == -0.0 ) ( *norm ) = 0.0; + return; + } + arch_t id = bli_arch_query_id(); switch (id) { From 6132194468ca45b786ff89fd006c6d9aa7e1d27a Mon Sep 17 00:00:00 2001 From: Chandrashekara K R Date: Fri, 6 Oct 2023 08:57:02 +0530 Subject: [PATCH 150/226] Updated "HEADER_PATH" in main CMakeLists.txt file. - Updated "HEADER_PATH" cmake variable to make sure blis header file is not missing function declarations on Windows. AMD-Internal: [CPUPL-3881] Change-Id: Id71ec16c800411cd727fc78e3f772ea1b751f971 --- CMakeLists.txt | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 400214efdb..d16e82207a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,16 +496,37 @@ if(${AOCL_BLIS_FAMILY} STREQUAL "zen") " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen2") " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen2/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" +elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen3") + " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen3/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen3/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" +elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen4") + " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen4/" + " 
${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen3/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" +elseif (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") " ${CMAKE_CURRENT_SOURCE_DIR}/config/amdzen/" " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen/" " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen2/" " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen4/" + " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen4/" " ${CMAKE_CURRENT_SOURCE_DIR}/config/generic/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen3/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" + " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/generic/" endif () " ${CMAKE_CURRENT_SOURCE_DIR}/frame/0/" " ${CMAKE_CURRENT_SOURCE_DIR}/frame/0/copysc/" From 25bab76f586e382f45959c8aa9490ce42c8061ee Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Wed, 11 Oct 2023 10:13:21 +0530 Subject: [PATCH 151/226] Changed threshold to use DTRSM Small MT - Threshold to use DTRSM small MT code path is lowered. 
AMD-Internal: [CPUPL-3781] Change-Id: Ie1f232aa6d216b839df23657b54edb0448a64267 --- frame/compat/bla_trsm_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c index 0e627f7832..6f5c1137fd 100644 --- a/frame/compat/bla_trsm_amd.c +++ b/frame/compat/bla_trsm_amd.c @@ -1086,7 +1086,7 @@ void dtrsm_blis_impl * is doing better than small multithread and native multithread */ bool is_parallel = bli_thread_get_is_parallel(); if ((!is_parallel && ((dim_a < 1500) && (size_b < 5e6)) ) || - (is_parallel && (m0+n0)<320)) + (is_parallel && (m0+n0)<200)) { switch(id) { From 4874895a68b8a1dfc4f1b14e7ce42aea7f092c92 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Thu, 31 Aug 2023 23:56:17 +0530 Subject: [PATCH 152/226] LPGEMM: Added transA support for bf16bf16f32o APIs Details: - Added new params(order, trans) to aocl_get_reorder_buf_size_ and aocl_reorder_ APIs. - Added new pack kernels that pack A matrix from either row-major or column-major input matrix to pack buffer with row-major format. - Updated cntx with pack kernel function pointers for packing A matrix. - Transpose of A matrix is handled by packing A matrix to row-major format during run-time. - Updated Early-return check conditions to account for trans parameters. - Updated bench file to test/benchmark transpose support. 
AMD-Internal: [SWLCSG-2268, SWLCSG-2442] Change-Id: I43a113dc4bc11e6bb7cc4d768c239a16cb6bbea4 --- addon/aocl_gemm/aocl_gemm.h | 2 +- addon/aocl_gemm/aocl_gemm_bf16_utils.c | 27 + addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c | 81 +- addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c | 78 +- addon/aocl_gemm/aocl_gemm_interface_apis.h | 4 + addon/aocl_gemm/config/lpgemm_config.c | 2 +- addon/aocl_gemm/config/lpgemm_func_map.h | 6 +- .../aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 42 +- .../frame/bf16bf16f32/lpgemm_reorder_bf16.c | 2 +- ...lpgemm_packb_bf16.h => lpgemm_pack_bf16.h} | 11 + bench/bench_aocl_gemm/bench_input.txt | 1958 ++++++++--------- bench/bench_aocl_gemm/bench_lpgemm.c | 181 +- .../lpgemm_packa_bf16_amd256vnni.c | 1493 +++++++++++++ 13 files changed, 2799 insertions(+), 1088 deletions(-) rename addon/aocl_gemm/kernels/bf16bf16f32/{lpgemm_packb_bf16.h => lpgemm_pack_bf16.h} (91%) create mode 100644 kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packa_bf16_amd256vnni.c diff --git a/addon/aocl_gemm/aocl_gemm.h b/addon/aocl_gemm/aocl_gemm.h index 44de4ac658..4a5e574b6d 100644 --- a/addon/aocl_gemm/aocl_gemm.h +++ b/addon/aocl_gemm/aocl_gemm.h @@ -43,7 +43,7 @@ #include "lpgemm_post_ops.h" #include "lpgemm_kernels.h" #include "lpgemm_utils_kernels.h" -#include "lpgemm_packb_bf16.h" +#include "lpgemm_pack_bf16.h" #include "lpgemm_packb_s16.h" #include "lpgemm_packa.h" #include "lpgemm_packb.h" diff --git a/addon/aocl_gemm/aocl_gemm_bf16_utils.c b/addon/aocl_gemm/aocl_gemm_bf16_utils.c index d8c332a423..cad3a07eaa 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16_utils.c +++ b/addon/aocl_gemm/aocl_gemm_bf16_utils.c @@ -68,6 +68,19 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32) return 0; // A reorder not supported. } + // Reorder of a col-major matrix is not supported yet. + if (!( (order == 'r') || ( order == 'R' ))) + { + printf("returning with order:%c\n", order); + return 0; + } + + // Reorder of matrix is only supported for non-trans matrices. 
+ if(!( ( trans == 'n' ) || ( trans == 'N' ) )) + { + printf("returning with trans:%c\n", trans); + return 0; + } // Extra space since packing does width in multiples of 16. The bf16 // instruction can be used as long as at least one zmm register can be fully // loaded; and since k_dim needs to be at least 2, having n_dim at least 16 @@ -112,6 +125,20 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32) { return; // A reorder not supported. } + + // Reorder of a col-major matrix is not supported yet. + if (!( (order == 'r') || ( order == 'R' ))) + { + printf("returning with order:%c\n", order); + return; + } + + // Reorder of matrix is only supported for non-trans matrices. + if (!( ( trans == 'n' ) || ( trans == 'N' ) )) + { + printf("Returning with trans:%c\n", trans); + return; + } // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c index 6c9e117358..cf81bcd1b1 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c @@ -83,14 +83,6 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); bli_param_map_netlib_to_blis_trans( transb, &blis_transb ); - /* Perform BLAS parameter checking. */ - // Transpose not supported. - if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || - ( blis_transb != BLIS_NO_TRANSPOSE ) ) - { - return; // Error. - } - // Sanitize order input. char order_use = ( ( order == 'r' ) || ( order == 'R' ) || @@ -100,15 +92,33 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - // Row major input expected with leading dimensions >= row stride. 
+ // Transpose is not supported for B matrix yet. + if ( ( is_row_major == TRUE ) && ( bli_is_trans( blis_transb ) ) ) + { + return; // Error. + } + else if ( ( is_column_major == TRUE ) && ( bli_is_trans( blis_transa ) ) ) + { + return; // Error. + } + + // Check if strides are valid for Row major inputs. if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + ( ( bli_is_notrans( blis_transa ) && ( lda < k ) ) || + ( bli_is_trans( blis_transa ) && ( lda < m ) ) || + ( bli_is_notrans( blis_transb ) && ( ldb < n ) ) || + ( bli_is_trans( blis_transb ) && ( ldb < k ) ) || + ( ldc < n ) ) ) { return; // Error. } // Column major input expected with leading dimensions >= column stride. else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) + ( ( bli_is_notrans( blis_transa ) && ( lda < m ) ) || + ( bli_is_trans( blis_transa ) && ( lda < k ) ) || + ( bli_is_notrans( blis_transb ) && ( ldb < k ) ) || + ( bli_is_trans( blis_transb ) && ( ldb < n ) ) || + ( ldc < m ) ) ) { return; // Error. } @@ -120,10 +130,24 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) return; // Error. } - const inc_t rs_a = lda; - const inc_t cs_a = 1; - const inc_t rs_b = ldb; - const inc_t cs_b = 1; + inc_t rs_a = lda; + inc_t cs_a = 1; + + if ( bli_is_trans( blis_transa ) ) + { + rs_a = 1; + cs_a = lda; + } + + inc_t rs_b = ldb; + inc_t cs_b = 1; + + if( bli_is_trans( blis_transb ) ) + { + rs_b = 1; + cs_b = ldb; + } + const inc_t rs_c = ldc; const inc_t cs_c = 1; @@ -133,6 +157,19 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a ); bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b ); + // Reorder is not supported for A matrix + if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) ) + { + return; + } + // Inputs swapped in column major, A becomes B from kernel point of view. + // Reorder is not supported for column major matrices. 
+ else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) ) + { + return; + } + + // From 5-loop function point of view, // B matrix needs to be packed in a certain format in order to be loaded // and used in bf16 instrution. As such the mtag_b always needs to be either // packed or reordered. B matrix as it is (unpacked) cannot be used, and @@ -147,15 +184,17 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) mtag_a = PACK; } - // Only unpacked A supported now. - if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) ) + // From 5-loop function point of view, + // A matrix when in column major storage needs to be packed to row-major + // storage as kernel expects A matrix to be in row-major format. + if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transa ) ) ) { - return; // Error. + mtag_a = PACK; } - // Inputs swapped in column major, B becomes A from kernel point of view. - else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) ) + // Inputs swapped in column major, A becomes B from kernel point of view. + else if ( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transb ) ) ) { - return; // Error. + mtag_b = PACK; } // Convert post op struct to post op linked list format. diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c index b37bd44703..9f799bae35 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c @@ -83,14 +83,6 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); bli_param_map_netlib_to_blis_trans( transb, &blis_transb ); - /* Perform BLAS parameter checking. */ - // Transpose not supported. - if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || - ( blis_transb != BLIS_NO_TRANSPOSE ) ) - { - return; // Error. - } - // Sanitize order input. 
char order_use = ( ( order == 'r' ) || ( order == 'R' ) || @@ -100,15 +92,33 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - // Row major input expected with leading dimensions >= row stride. + // Transpose is not supported for B matrix yet. + if ( ( is_row_major == TRUE ) && ( bli_is_trans( blis_transb ) ) ) + { + return; // Error. + } + else if ( ( is_column_major == TRUE ) && ( bli_is_trans( blis_transa ) ) ) + { + return; // Error. + } + + // Check if strides are valid for Row major inputs. if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + ( ( bli_is_notrans( blis_transa ) && ( lda < k ) ) || + ( bli_is_trans( blis_transa ) && ( lda < m ) ) || + ( bli_is_notrans( blis_transb ) && ( ldb < n ) ) || + ( bli_is_trans( blis_transb ) && ( ldb < k ) ) || + ( ldc < n ) ) ) { return; // Error. } // Column major input expected with leading dimensions >= column stride. else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) + ( ( bli_is_notrans( blis_transa ) && ( lda < m ) ) || + ( bli_is_trans( blis_transa ) && ( lda < k ) ) || + ( bli_is_notrans( blis_transb ) && ( ldb < k ) ) || + ( bli_is_trans( blis_transb ) && ( ldb < n ) ) || + ( ldc < m ) ) ) { return; // Error. } @@ -121,10 +131,23 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) } // The strides are set assuming a row major kernel. 
- const inc_t rs_a = lda; - const inc_t cs_a = 1; - const inc_t rs_b = ldb; - const inc_t cs_b = 1; + inc_t rs_a = lda; + inc_t cs_a = 1; + + if ( bli_is_trans( blis_transa ) ) + { + rs_a = 1; + cs_a = lda; + } + + inc_t rs_b = ldb; + inc_t cs_b = 1; + + if( bli_is_trans( blis_transb ) ) + { + rs_b = 1; + cs_b = ldb; + } const inc_t rs_c = ldc; const inc_t cs_c = 1; @@ -134,12 +157,19 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a ); bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b ); - if ( ( is_column_major == TRUE ) && ( mtag_b == REORDERED ) ) + // Reorder is not supported for A matrix + if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) ) + { + return; + } + // Inputs swapped in column major, A becomes B from kernel point of view. + // Reorder is not supported for column major matrices. + else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) ) { - // Reorder not supported with column major inputs. return; } + // From 5-loop function point of view // B matrix needs to be packed in a certain format in order to be loaded // and used in bf16 instrution. As such the mtag_b always needs to be either // packed or reordered. B matrix as it is (unpacked) cannot be used, and @@ -154,15 +184,17 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) mtag_a = PACK; } - // Only unpacked A supported now. - if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) ) + // From 5-loop function point of view, + // A matrix when in column major storage needs to be packed to row-major + // storage as kernel expects A matrix to be in row-major format. + if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transa ) ) ) { - return; // Error. + mtag_a = PACK; } - // Inputs swapped in column major, B becomes A from kernel point of view. 
- else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) ) + // Inputs swapped in column major, A becomes B from kernel point of view. + else if ( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transb ) ) ) { - return; // Error. + mtag_b = PACK; } // Convert post op struct to post op linked list format. diff --git a/addon/aocl_gemm/aocl_gemm_interface_apis.h b/addon/aocl_gemm/aocl_gemm_interface_apis.h index 718c0c3de2..41063343b1 100644 --- a/addon/aocl_gemm/aocl_gemm_interface_apis.h +++ b/addon/aocl_gemm/aocl_gemm_interface_apis.h @@ -42,6 +42,8 @@ #define AOCL_GEMM_GET_REORDER_BUF_SIZE(LP_SFX) \ BLIS_EXPORT_ADDON siz_t aocl_get_reorder_buf_size_ ## LP_SFX \ ( \ + const char order, \ + const char trans, \ const char mat_type, \ const dim_t k, \ const dim_t n \ @@ -60,6 +62,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s16os16); #define AOCL_GEMM_REORDER(B_type,LP_SFX) \ BLIS_EXPORT_ADDON void aocl_reorder_ ## LP_SFX \ ( \ + const char order, \ + const char trans, \ const char mat_type, \ const B_type* input_buf_addr, \ B_type* reorder_buf_addr, \ diff --git a/addon/aocl_gemm/config/lpgemm_config.c b/addon/aocl_gemm/config/lpgemm_config.c index 0dad8c88a7..93eb7e9b3e 100644 --- a/addon/aocl_gemm/config/lpgemm_config.c +++ b/addon/aocl_gemm/config/lpgemm_config.c @@ -37,7 +37,7 @@ #include "lpgemm_func_map.h" #include "lpgemm_blksz_map.h" #include "lpgemm_kernels.h" -#include "lpgemm_packb_bf16.h" +#include "lpgemm_pack_bf16.h" #include "lpgemm_packb_s16.h" #include "lpgemm_packa.h" #include "lpgemm_packb.h" diff --git a/addon/aocl_gemm/config/lpgemm_func_map.h b/addon/aocl_gemm/config/lpgemm_func_map.h index 864f84aef2..c54a6e28a0 100644 --- a/addon/aocl_gemm/config/lpgemm_func_map.h +++ b/addon/aocl_gemm/config/lpgemm_func_map.h @@ -84,7 +84,7 @@ #define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI \ PAMACRO(U8S8S16OS16, NULL) \ PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \ - PAMACRO(BF16BF16F32OF32, NULL) \ + PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \ 
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \ PAMACRO(S8S8S16OS16, NULL) \ @@ -112,7 +112,7 @@ #define LPGEMM_PACKA_FUNC_MAP_AVX512 \ PAMACRO(U8S8S16OS16, NULL) \ PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \ - PAMACRO(BF16BF16F32OF32, NULL) \ + PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \ PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \ PAMACRO(S8S8S16OS16, NULL) \ @@ -140,7 +140,7 @@ #define LPGEMM_PACKA_FUNC_MAP_AVX2 \ PAMACRO(U8S8S16OS16, NULL) \ PAMACRO(U8S8S32OS32, NULL) \ - PAMACRO(BF16BF16F32OF32, NULL) \ + PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \ PAMACRO(S8S8S32OS32, NULL) \ PAMACRO(S8S8S16OS16, NULL) \ diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index 09d86c6e6e..d6cc33fbb5 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -34,12 +34,14 @@ #include "blis.h" #include "lpgemm_5loop_interface_apis.h" -#include "lpgemm_packb_bf16.h" +#include "lpgemm_pack_bf16.h" #include "lpgemm_kernels.h" #include "lpgemm_utils.h" #include "lpgemm_thrinfo_utils.h" #include "lpgemm_config.h" + + // Kernel function prototypes typedef void (*lpgemm_rowvar_bf16) ( @@ -73,6 +75,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) const int16_t* a_use = NULL; dim_t cs_a_use = cs_a; + dim_t rs_a_use = rs_a; dim_t a_block_stride = 0; const int16_t* b_use = NULL; @@ -86,8 +89,11 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) // Pack buffer for B. bfloat16* pack_b_buffer_bf16; + bfloat16* pack_a_buffer_bf16; mem_t mem_b = BLIS_MEM_INITIALIZER; + mem_t mem_a = BLIS_MEM_INITIALIZER; siz_t mem_b_size_req = 0; + siz_t mem_a_size_req = 0; dim_t packb_min_NR = 16; // Temporary buffer for C accumulation when downscaling is required. @@ -315,6 +321,31 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) // Non bf16 based kernel requires update to this code. 
cs_a_use = 2; a_block_stride = rs_a; + rs_a_use = rs_a; + } + else if ( mtag_a == PACK ) + { + + mem_a_size_req = sizeof( bfloat16 ) * mc0 * kc0; + + lpgemm_alloc_mem_panel + ( + mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK, + &mem_a, rntm + ); + + pack_a_buffer_bf16 = + ( bfloat16* ) bli_mem_buffer( &mem_a ); + + ( packa_mr16_bf16bf16f32of32) + ( + pack_a_buffer_bf16, + ( a + ( rs_a * ic ) + ( cs_a * pc )), rs_a, cs_a, + ( ic_end - ic_start ), kc0, + &rs_a_use, &cs_a_use + ); + a_use = pack_a_buffer_bf16; + a_block_stride = rs_a_use; } for ( dim_t jr = 0; jr < nc0; jr += NR ) @@ -330,7 +361,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) ( ( lpgemm_rowvar_bf16 )lcntx->kern_fun_ptr ) ( mc0, nr0, kc0, - a_use, rs_a, cs_a_use, a_block_stride, + a_use, rs_a_use, cs_a_use, a_block_stride, ( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use, ( c_use_ic + jr ), rs_c_use, 1, alpha, beta0, @@ -364,6 +395,13 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) } } } + if( mtag_a == PACK ) + { + if ( bli_mem_is_alloc( &mem_a ) ) + { + bli_pba_release(rntm, &mem_a); + } + } if ( c_downscale == TRUE ) { if ( bli_mem_is_alloc( &mem_scale_c ) ) diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c index b90d339664..91a14b8918 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c @@ -35,7 +35,7 @@ #include "blis.h" #include "lpgemm_utils.h" #include "lpgemm_reorder_bf16.h" -#include "lpgemm_packb_bf16.h" +#include "lpgemm_pack_bf16.h" #include "lpgemm_config.h" #include "aocl_bf16_type.h" diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h similarity index 91% rename from addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h rename to addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h index db5d31e513..44c857c6ad 100644 --- 
a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h +++ b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h @@ -69,4 +69,15 @@ void packb_nr64_bf16bf16f32of32 dim_t* cs_b ); +void packa_mr16_bf16bf16f32of32 + ( + bfloat16* pack_a_buffer, + const bfloat16* a, + const dim_t rs_a, + const dim_t cs_a, + const dim_t MC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ); #endif //BLIS_GEMM_BF16_PACKB diff --git a/bench/bench_aocl_gemm/bench_input.txt b/bench/bench_aocl_gemm/bench_input.txt index 9034a0d550..3bfd6fb350 100644 --- a/bench/bench_aocl_gemm/bench_input.txt +++ b/bench/bench_aocl_gemm/bench_input.txt @@ -1,979 +1,979 @@ -u r p 480 20 2050 2050 20 20 -u r p 481 20 2050 2050 20 20 -u r p 482 20 2050 2050 20 20 -u r p 483 20 2050 2050 20 20 -u r R 484 20 2050 2050 20 20 -u r R 485 20 2050 2050 20 20 -u r R 480 39 2050 2050 39 39 -u r R 481 39 2050 2050 39 39 -u r R 482 39 2050 2050 39 39 -u r R 483 39 2050 2050 39 39 -u r R 484 39 2050 2050 39 39 -u r p 485 39 2050 2050 39 39 -u r p 480 50 2050 2050 50 50 -u r p 481 50 2050 2050 50 50 -u r p 482 50 2050 2050 50 50 -u r p 483 50 2050 2050 50 50 -u r p 484 50 2050 2050 50 50 -u r p 485 50 2050 2050 50 50 -u r R 480 1108 2050 2050 1108 1108 -u r R 481 1108 2050 2050 1108 1108 -u r R 482 1108 2050 2050 1108 1108 -u r R 483 1108 2050 2050 1108 1108 -u r R 484 1108 2050 2050 1108 1108 -u r R 485 1108 2050 2050 1108 1108 -u r R 480 1127 2050 2050 1127 1127 -u r R 481 1127 2050 2050 1127 1127 -u r R 482 1127 2050 2050 1127 1127 -u r R 483 1127 2050 2050 1127 1127 -u r p 484 1127 2050 2050 1127 1127 -u r p 485 1127 2050 2050 1127 1127 -u r p 480 1138 2050 2050 1138 1138 -u r p 481 1138 2050 2050 1138 1138 -u r p 482 1138 2050 2050 1138 1138 -u r p 483 1138 2050 2050 1138 1138 -u r p 484 1138 2050 2050 1138 1138 -u r p 485 1138 2050 2050 1138 1138 -u r p 1 1 3 3 1 1 -u r p 1 9 3 3 9 9 -u r p 1 2048 3 3 2048 2048 -u r p 1 2048 5192 5192 2048 2048 -u r p 9 1 3 3 1 1 -u r p 576 1 3500 3500 1 1 -u r p 1 1 1 1 1 1 
-u r p 102 1088 1024 1024 1088 1088 -u r p 102 2048 1024 1024 2048 2048 -u r p 485 656 1024 1024 656 656 -u r p 483 656 1024 1024 656 656 -u r p 81 128 3 3 128 128 -u r p 1022 512 515 515 512 512 -u r p 74 512 515 515 512 512 -u r p 253 2048 515 515 2048 2048 -u r p 8192 1040 515 515 1040 1040 -u r p 10 1029 515 515 1029 1029 -u r p 24 1040 2050 2050 1040 1040 -u r p 1024 1029 2050 2050 1029 1029 -u r p 480 660 2050 2050 660 660 -u r p 481 660 2050 2050 660 660 -u r p 482 660 2050 2050 660 660 -u r p 483 660 2050 2050 660 660 -u r p 484 660 2050 2050 660 660 -u r p 485 660 2050 2050 660 660 -u r p 480 679 2050 2050 679 679 -u r p 481 679 2050 2050 679 679 -u r p 482 679 2050 2050 679 679 -u r p 483 679 2050 2050 679 679 -u r p 484 679 2050 2050 679 679 -u r p 485 679 2050 2050 679 679 -u r p 480 690 2050 2050 690 690 -u r p 481 690 2050 2050 690 690 -u r p 482 690 2050 2050 690 690 -u r p 483 690 2050 2050 690 690 -u r p 484 690 2050 2050 690 690 -u r p 485 690 2050 2050 690 690 -u r p 480 660 2048 2048 660 660 -u r p 481 660 2048 2048 660 660 -u r p 482 660 2048 2048 660 660 -u r p 483 660 2048 2048 660 660 -u r p 484 660 2048 2048 660 660 -u r p 485 660 2048 2048 660 660 -u r p 480 679 2048 2048 679 679 -u r p 481 679 2048 2048 679 679 -u r p 482 679 2048 2048 679 679 -u r p 483 679 2048 2048 679 679 -u r p 484 679 2048 2048 679 679 -u r p 485 679 2048 2048 679 679 -u r p 480 690 2048 2048 690 690 -u r p 481 690 2048 2048 690 690 -u r p 482 690 2048 2048 690 690 -u r p 483 690 2048 2048 690 690 -u r p 484 690 2048 2048 690 690 -u r p 485 690 2048 2048 690 690 -u r p 480 656 1024 1024 656 656 -u r p 480 128 3 3 128 128 -u r p 1024 512 515 515 512 512 -u r p 1024 2048 1024 1024 2048 2048 -u r p 1024 2048 515 515 2048 2048 -u r p 1024 1040 515 515 1040 1040 -u r p 5 1029 515 515 1029 1029 -u r p 1024 1029 515 515 1029 1029 -u r p 1024 1040 2050 2050 1040 1040 -u r p 1029 1029 2050 2050 1029 1029 -u r R 480 646 2050 2050 646 646 -u r R 481 646 2050 2050 646 646 -u r 
R 482 646 2050 2050 646 646 -u r R 483 646 2050 2050 646 646 -u r R 484 646 2050 2050 646 646 -u r R 485 646 2050 2050 646 646 -u r R 481 656 2050 2050 656 656 -u r R 482 656 2050 2050 656 656 -u r R 483 656 2050 2050 656 656 -u r R 484 656 2050 2050 656 656 -u r p 485 656 2050 2050 656 656 -u r p 480 672 2050 2050 672 672 -u r p 481 672 2050 2050 672 672 -u r p 482 672 2050 2050 672 672 -u r p 483 672 2050 2050 672 672 -u r p 484 672 2050 2050 672 672 -u r p 485 672 2050 2050 672 672 -u r p 480 688 2050 2050 688 688 -u r p 481 688 2050 2050 688 688 -u r r 482 688 2050 2050 688 688 -u r r 483 688 2050 2050 688 688 -u r r 484 688 2050 2050 688 688 -u r r 485 688 2050 2050 688 688 -u r r 1024 512 64 64 512 512 -u r r 16 256 512 512 256 256 -u r r 480 640 512 512 640 640 -u r r 64 768 512 512 768 768 -u r r 128 128 128 128 128 128 -u r r 1024 64 512 512 64 64 -u r r 1024 256 32 32 256 256 -u r r 1024 512 64 64 512 512 -u r r 480 640 512 512 640 640 -u r p 1024 32 256 256 32 32 -u r P 1024 64 512 512 64 64 -u r P 64 800 320 320 800 800 -u r P 64 768 512 512 768 768 -u r P 16 256 512 512 256 256 -u r P 128 128 128 128 128 128 -u r P 256 512 256 256 512 512 -u r P 1024 1024 1024 1024 1024 1024 -u r P 480 640 1024 1024 640 640 -u r P 480 640 256 256 640 640 -u r P 8 64 32 32 64 64 -u r P 9 64 32 32 64 64 -u r P 10 128 64 64 128 128 -u r P 8 8 8 8 8 8 -u r P 12 12 12 12 12 12 -u r P 25 25 25 25 25 25 -u r P 25 25 20 20 25 25 -u r r 4096 256 5 5 256 256 -u r r 3000 256 128 128 256 256 -u r r 4096 1024 512 512 1024 1024 -u r r 144 256 5 5 256 256 -u r r 144 256 128 128 256 256 -u r r 144 1024 512 512 1024 1024 -u r r 480 688 256 256 688 688 -u r r 480 640 512 512 640 640 -u r r 480 640 1024 1024 640 640 -u r r 64 800 320 320 800 800 -u r r 64 768 512 512 768 768 -u r r 16 256 512 512 256 256 -u r r 128 128 128 128 128 128 -u r r 256 512 256 256 512 512 -u r r 1024 1024 1024 1024 1024 1024 -u r r 1024 32 256 256 32 32 -u r r 1024 64 512 512 64 64 -u r r 1024 256 32 32 256 256 
-u r r 1024 512 64 64 512 512 -u r r 512 32 256 256 32 32 -u r r 512 768 512 512 768 768 -u r r 512 256 32 32 256 256 -u r r 512 512 64 64 512 512 -u r r 512 256 768 768 256 256 -u r r 768 768 1024 1024 768 768 -u r r 768 768 768 768 768 768 -u r r 2048 2048 2048 2048 2048 2048 -u r r 4096 4096 4096 4096 4096 4096 -f c p 2482 1127 2050 2482 2050 2482 -f c p 2483 1127 2050 2483 2050 2483 -f c p 2484 1127 2050 2484 2050 2484 -f c p 2485 1127 2050 2485 2050 2485 -f c p 480 1138 2050 480 2050 480 -f c p 481 1138 2050 481 2050 481 -f c p 482 1138 2050 482 2050 482 -f c p 483 1138 2050 483 2050 483 -f c p 484 1138 2050 484 2050 484 -f c p 485 1138 2050 485 2050 485 -f c p 1 1 3 1 3 1 -f c p 1 9 3 1 3 1 -f c p 1 2048 3 1 3 1 -f c p 1 2048 5192 1 5192 1 -f c p 9 1 3 9 3 9 -f c p 576 1 3500 576 3500 576 -f c p 1 1 1 1 1 1 -f c p 102 1088 1024 102 1024 102 -b r r 480 20 2050 2050 20 20 -b r r 481 20 2050 2050 20 20 -b r r 482 20 2050 2050 20 20 -b r p 483 20 2050 2050 20 20 -b r R 484 20 2050 2050 20 20 -b r R 485 20 2050 2050 20 20 -b r R 480 39 2050 2050 39 39 -b r R 481 39 2050 2050 39 39 -b r R 482 39 2050 2050 39 39 -b r R 483 39 2050 2050 39 39 -b r R 484 39 2050 2050 39 39 -b r p 485 39 2050 2050 39 39 -b r p 480 50 2050 2050 50 50 -b r p 481 50 2050 2050 50 50 -b r p 482 50 2050 2050 50 50 -b r p 483 50 2050 2050 50 50 -b r p 484 50 2050 2050 50 50 -b r p 485 50 2050 2050 50 50 -b r R 480 1108 2050 2050 1108 1108 -b r R 481 1108 2050 2050 1108 1108 -b r R 482 1108 2050 2050 1108 1108 -b r R 483 1108 2050 2050 1108 1108 -b r R 484 1108 2050 2050 1108 1108 -b r R 485 1108 2050 2050 1108 1108 -b r R 480 1127 2050 2050 1127 1127 -b r R 481 1127 2050 2050 1127 1127 -b r R 482 1127 2050 2050 1127 1127 -b r R 483 1127 2050 2050 1127 1127 -b r p 484 1127 2050 2050 1127 1127 -b r p 485 1127 2050 2050 1127 1127 -b r p 480 1138 2050 2050 1138 1138 -b r p 481 1138 2050 2050 1138 1138 -b r p 482 1138 2050 2050 1138 1138 -b r p 483 1138 2050 2050 1138 1138 -b r p 484 1138 2050 
2050 1138 1138 -b r p 485 1138 2050 2050 1138 1138 -b r p 1 1 3 3 1 1 -b r p 1 9 3 3 9 9 -b r p 1 2048 3 3 2048 2048 -b r p 1 2048 5192 5192 2048 2048 -b r p 9 1 3 3 1 1 -b r p 576 1 3500 3500 1 1 -b r p 1 1 1 1 1 1 -b r p 102 1088 1024 1024 1088 1088 -b r p 102 2048 1024 1024 2048 2048 -b r p 485 656 1024 1024 656 656 -b r p 483 656 1024 1024 656 656 -b r p 81 128 3 3 128 128 -b r p 1022 512 515 515 512 512 -b r p 74 512 515 515 512 512 -b r p 253 2048 515 515 2048 2048 -b r p 8192 1040 515 515 1040 1040 -b r p 10 1029 515 515 1029 1029 -b r p 24 1040 2050 2050 1040 1040 -b r p 1024 1029 2050 2050 1029 1029 -b r p 480 660 2050 2050 660 660 -b r p 481 660 2050 2050 660 660 -b r p 482 660 2050 2050 660 660 -b r p 483 660 2050 2050 660 660 -b r p 484 660 2050 2050 660 660 -b r p 485 660 2050 2050 660 660 -b r p 480 679 2050 2050 679 679 -b r p 481 679 2050 2050 679 679 -b r p 482 679 2050 2050 679 679 -b r p 483 679 2050 2050 679 679 -b r p 484 679 2050 2050 679 679 -b r p 485 679 2050 2050 679 679 -b r p 480 690 2050 2050 690 690 -b r p 481 690 2050 2050 690 690 -b r p 482 690 2050 2050 690 690 -b r p 483 690 2050 2050 690 690 -b r p 484 690 2050 2050 690 690 -b r p 485 690 2050 2050 690 690 -b r p 480 660 2048 2048 660 660 -b r p 481 660 2048 2048 660 660 -b r p 482 660 2048 2048 660 660 -b r p 483 660 2048 2048 660 660 -b r p 484 660 2048 2048 660 660 -b r p 485 660 2048 2048 660 660 -b r p 480 679 2048 2048 679 679 -b r p 481 679 2048 2048 679 679 -b r p 482 679 2048 2048 679 679 -b r p 483 679 2048 2048 679 679 -b r p 484 679 2048 2048 679 679 -b r p 485 679 2048 2048 679 679 -b r p 480 690 2048 2048 690 690 -b r p 481 690 2048 2048 690 690 -b r p 482 690 2048 2048 690 690 -b r p 483 690 2048 2048 690 690 -b r p 484 690 2048 2048 690 690 -b r p 485 690 2048 2048 690 690 -b r p 480 656 1024 1024 656 656 -b r p 480 128 3 3 128 128 -b r p 1024 512 515 515 512 512 -b r p 1024 2048 1024 1024 2048 2048 -b r p 1024 2048 515 515 2048 2048 -b r p 1024 1040 515 515 1040 
1040 -b r p 5 1029 515 515 1029 1029 -b r p 1024 1029 515 515 1029 1029 -b r p 1024 1040 2050 2050 1040 1040 -b r p 1029 1029 2050 2050 1029 1029 -b r R 480 646 2050 2050 646 646 -b r R 481 646 2050 2050 646 646 -b r R 482 646 2050 2050 646 646 -b r R 483 646 2050 2050 646 646 -b r R 484 646 2050 2050 646 646 -b r R 485 646 2050 2050 646 646 -b r R 481 656 2050 2050 656 656 -b r R 482 656 2050 2050 656 656 -b r R 483 656 2050 2050 656 656 -b r R 484 656 2050 2050 656 656 -b r p 485 656 2050 2050 656 656 -b r p 480 672 2050 2050 672 672 -b r p 481 672 2050 2050 672 672 -b r p 482 672 2050 2050 672 672 -b r p 483 672 2050 2050 672 672 -b r p 484 672 2050 2050 672 672 -b r p 485 672 2050 2050 672 672 -b r p 480 688 2050 2050 688 688 -b r p 481 688 2050 2050 688 688 -b r r 482 688 2050 2050 688 688 -b r r 483 688 2050 2050 688 688 -b r r 484 688 2050 2050 688 688 -b r r 485 688 2050 2050 688 688 -b r r 1024 512 64 64 512 512 -b r r 16 256 512 512 256 256 -b r r 480 640 512 512 640 640 -b r r 64 768 512 512 768 768 -b r r 128 128 128 128 128 128 -b r r 1024 64 512 512 64 64 -b r r 1024 256 32 32 256 256 -b r r 1024 512 64 64 512 512 -b r r 480 640 512 512 640 640 -b r p 1024 32 256 256 32 32 -b r P 1024 64 512 512 64 64 -b r P 64 800 320 320 800 800 -b r P 64 768 512 512 768 768 -b r P 16 256 512 512 256 256 -b r P 128 128 128 128 128 128 -b r P 256 512 256 256 512 512 -b r P 1024 1024 1024 1024 1024 1024 -b r P 480 640 1024 1024 640 640 -b r P 480 640 256 256 640 640 -b r P 8 64 32 32 64 64 -b r P 9 64 32 32 64 64 -b r P 10 128 64 64 128 128 -b r P 8 8 8 8 8 8 -b r P 12 12 12 12 12 12 -b r P 25 25 25 25 25 25 -b r P 25 25 20 20 25 25 -b c p 485 39 2050 485 2050 485 -b c p 480 50 2050 480 2050 480 -b c p 481 50 2050 481 2050 481 -b c p 482 50 2050 482 2050 482 -b c p 483 50 2050 483 2050 483 -b c p 484 50 2050 484 2050 484 -b c p 485 50 2050 485 2050 485 -b c p 484 1127 2050 484 2050 484 -b c p 485 1127 2050 485 2050 485 -b c p 480 1138 2050 480 2050 480 -b c p 481 1138 
2050 481 2050 481 -b c p 482 1138 2050 482 2050 482 -b c p 483 1138 2050 483 2050 483 -b c p 484 1138 2050 484 2050 484 -b c p 485 1138 2050 485 2050 485 -b c p 1 1 3 1 3 1 -b c p 1 9 3 1 3 1 -b c p 1 2048 3 1 3 1 -b c p 1 2048 5192 1 5192 1 -b c p 9 1 3 9 3 9 -b c p 576 1 3500 576 3500 576 -b c p 1 1 1 1 1 1 -b c p 102 1088 1024 102 1024 102 -b c p 102 2048 1024 102 1024 102 -b c p 485 656 1024 485 1024 485 -b c p 483 656 1024 483 1024 483 -b c p 81 128 3 81 3 81 -b c p 1022 512 515 1022 515 1022 -b c p 74 512 515 74 515 74 -b c p 253 2048 515 253 515 253 -b c p 8192 1040 515 8192 515 8192 -b c p 10 1029 515 10 515 10 -b c p 24 1040 2050 24 2050 24 -b c p 1024 1029 2050 1024 2050 1024 -b c p 480 660 2050 480 2050 480 -b c p 481 660 2050 481 2050 481 -b c p 482 660 2050 482 2050 482 -b c p 483 660 2050 483 2050 483 -b c p 484 660 2050 484 2050 484 -b c p 485 660 2050 485 2050 485 -b c p 480 679 2050 480 2050 480 -b c p 481 679 2050 481 2050 481 -b c p 482 679 2050 482 2050 482 -b c p 483 679 2050 483 2050 483 -b c p 484 679 2050 484 2050 484 -b c p 485 679 2050 485 2050 485 -b c p 480 690 2050 480 2050 480 -b c p 481 690 2050 481 2050 481 -b c p 482 690 2050 482 2050 482 -b c p 483 690 2050 483 2050 483 -b c p 484 690 2050 484 2050 484 -b c p 485 690 2050 485 2050 485 -b c p 480 660 2048 480 2048 480 -b c p 481 660 2048 481 2048 481 -b c p 482 660 2048 482 2048 482 -b c p 483 660 2048 483 2048 483 -b c p 484 660 2048 484 2048 484 -b c p 485 660 2048 485 2048 485 -b c p 480 679 2048 480 2048 480 -b c p 481 679 2048 481 2048 481 -b c p 482 679 2048 482 2048 482 -b c p 483 679 2048 483 2048 483 -b c p 484 679 2048 484 2048 484 -b c p 485 679 2048 485 2048 485 -b c p 480 690 2048 480 2048 480 -b c p 481 690 2048 481 2048 481 -b c p 482 690 2048 482 2048 482 -b c p 483 690 2048 483 2048 483 -b c p 484 690 2048 484 2048 484 -b c p 485 690 2048 485 2048 485 -b c p 480 656 1024 480 1024 480 -b c p 480 128 3 480 3 480 -b c p 1024 512 515 1024 515 1024 -b c p 1024 2048 1024 
1024 1024 1024 -b c p 1024 2048 515 1024 515 1024 -b c p 1024 1040 515 1024 515 1024 -b c p 5 1029 515 5 515 5 -b c p 1024 1029 515 1024 515 1024 -b c p 1024 1040 2050 1024 2050 1024 -b c p 1029 1029 2050 1029 2050 1029 -b c p 485 656 2050 485 2050 485 -b c p 480 672 2050 480 2050 480 -b c p 481 672 2050 481 2050 481 -b c p 482 672 2050 482 2050 482 -b c p 483 672 2050 483 2050 483 -b c p 484 672 2050 484 2050 484 -b c p 485 672 2050 485 2050 485 -b c p 480 688 2050 480 2050 480 -b c p 481 688 2050 481 2050 481 -b c p 1024 32 256 1024 256 1024 -b c P 1024 64 512 1024 512 1024 -b c P 64 800 320 64 320 64 -b c P 64 768 512 64 512 64 -b c P 16 256 512 16 512 16 -b c P 128 128 128 128 128 128 -b c P 256 512 256 256 256 256 -b c P 1024 1024 1024 1024 1024 1024 -b c P 480 640 1024 480 1024 480 -b c P 480 640 256 480 256 480 -b c P 8 64 32 8 32 8 -b c P 9 64 32 9 32 9 -b c P 10 128 64 10 64 10 -b c P 8 8 8 8 8 8 -b c P 12 12 12 12 12 12 -b c P 25 25 25 25 25 25 -b c P 25 25 20 25 20 25 -s r r 480 20 2050 2050 20 20 -s r r 481 20 2050 2050 20 20 -s r r 482 20 2050 2050 20 20 -s r p 483 20 2050 2050 20 20 -s r R 484 20 2050 2050 20 20 -s r R 485 20 2050 2050 20 20 -s r R 480 39 2050 2050 39 39 -s r R 481 39 2050 2050 39 39 -s r R 482 39 2050 2050 39 39 -s r R 483 39 2050 2050 39 39 -s r R 484 39 2050 2050 39 39 -s r p 485 39 2050 2050 39 39 -s r p 480 50 2050 2050 50 50 -s r p 481 50 2050 2050 50 50 -s r p 482 50 2050 2050 50 50 -s r p 483 50 2050 2050 50 50 -s r p 484 50 2050 2050 50 50 -s r p 485 50 2050 2050 50 50 -s r R 480 1108 2050 2050 1108 1108 -s r R 481 1108 2050 2050 1108 1108 -s r R 482 1108 2050 2050 1108 1108 -s r R 483 1108 2050 2050 1108 1108 -s r R 484 1108 2050 2050 1108 1108 -s r R 485 1108 2050 2050 1108 1108 -s r R 480 1127 2050 2050 1127 1127 -s r R 481 1127 2050 2050 1127 1127 -s r R 482 1127 2050 2050 1127 1127 -s r R 483 1127 2050 2050 1127 1127 -s r p 484 1127 2050 2050 1127 1127 -s r p 485 1127 2050 2050 1127 1127 -s r p 480 1138 2050 2050 1138 
1138 -s r p 481 1138 2050 2050 1138 1138 -s r p 482 1138 2050 2050 1138 1138 -s r p 483 1138 2050 2050 1138 1138 -s r p 484 1138 2050 2050 1138 1138 -s r p 485 1138 2050 2050 1138 1138 -s r p 1 1 3 3 1 1 -s r p 1 9 3 3 9 9 -s r p 1 2048 3 3 2048 2048 -s r p 1 2048 5192 5192 2048 2048 -s r p 9 1 3 3 1 1 -s r p 576 1 3500 3500 1 1 -s r p 1 1 1 1 1 1 -s r p 102 1088 1024 1024 1088 1088 -s r p 102 2048 1024 1024 2048 2048 -s r p 485 656 1024 1024 656 656 -s r p 483 656 1024 1024 656 656 -s r p 81 128 3 3 128 128 -s r p 1022 512 515 515 512 512 -s r p 74 512 515 515 512 512 -s r p 253 2048 515 515 2048 2048 -s r p 8192 1040 515 515 1040 1040 -s r p 10 1029 515 515 1029 1029 -s r p 24 1040 2050 2050 1040 1040 -s r p 1024 1029 2050 2050 1029 1029 -s r p 480 660 2050 2050 660 660 -s r p 481 660 2050 2050 660 660 -s r p 482 660 2050 2050 660 660 -s r p 483 660 2050 2050 660 660 -s r p 484 660 2050 2050 660 660 -s r p 485 660 2050 2050 660 660 -s r p 480 679 2050 2050 679 679 -s r p 481 679 2050 2050 679 679 -s r p 482 679 2050 2050 679 679 -s r p 483 679 2050 2050 679 679 -s r p 484 679 2050 2050 679 679 -s r p 485 679 2050 2050 679 679 -s r p 480 690 2050 2050 690 690 -s r p 481 690 2050 2050 690 690 -s r p 482 690 2050 2050 690 690 -s r p 483 690 2050 2050 690 690 -s r p 484 690 2050 2050 690 690 -s r p 485 690 2050 2050 690 690 -s r p 480 660 2048 2048 660 660 -s r p 481 660 2048 2048 660 660 -s r p 482 660 2048 2048 660 660 -s r p 483 660 2048 2048 660 660 -s r p 484 660 2048 2048 660 660 -s r p 485 660 2048 2048 660 660 -s r p 480 679 2048 2048 679 679 -s r p 481 679 2048 2048 679 679 -s r p 482 679 2048 2048 679 679 -s r p 483 679 2048 2048 679 679 -s r p 484 679 2048 2048 679 679 -s r p 485 679 2048 2048 679 679 -s r p 480 690 2048 2048 690 690 -s r p 481 690 2048 2048 690 690 -s r p 482 690 2048 2048 690 690 -s r p 483 690 2048 2048 690 690 -s r p 484 690 2048 2048 690 690 -s r p 485 690 2048 2048 690 690 -s r p 480 656 1024 1024 656 656 -s r p 480 128 3 3 128 128 
-s r p 1024 512 515 515 512 512 -s r p 1024 2048 1024 1024 2048 2048 -s r p 1024 2048 515 515 2048 2048 -s r p 1024 1040 515 515 1040 1040 -s r p 5 1029 515 515 1029 1029 -s r p 1024 1029 515 515 1029 1029 -s r p 1024 1040 2050 2050 1040 1040 -s r p 1029 1029 2050 2050 1029 1029 -s r R 480 646 2050 2050 646 646 -s r R 481 646 2050 2050 646 646 -s r R 482 646 2050 2050 646 646 -s r R 483 646 2050 2050 646 646 -s r R 484 646 2050 2050 646 646 -s r R 485 646 2050 2050 646 646 -s r R 481 656 2050 2050 656 656 -s r R 482 656 2050 2050 656 656 -s r R 483 656 2050 2050 656 656 -s r R 484 656 2050 2050 656 656 -s r p 485 656 2050 2050 656 656 -s r p 480 672 2050 2050 672 672 -s r p 481 672 2050 2050 672 672 -s r p 482 672 2050 2050 672 672 -s r p 483 672 2050 2050 672 672 -s r p 484 672 2050 2050 672 672 -s r p 485 672 2050 2050 672 672 -s r p 480 688 2050 2050 688 688 -s r p 481 688 2050 2050 688 688 -s r r 482 688 2050 2050 688 688 -s r r 483 688 2050 2050 688 688 -s r r 484 688 2050 2050 688 688 -s r r 485 688 2050 2050 688 688 -s r r 1024 512 64 64 512 512 -s r r 16 256 512 512 256 256 -s r r 480 640 512 512 640 640 -s r r 64 768 512 512 768 768 -s r r 128 128 128 128 128 128 -s r r 1024 64 512 512 64 64 -s r r 1024 256 32 32 256 256 -s r r 1024 512 64 64 512 512 -s r r 480 640 512 512 640 640 -s r p 1024 32 256 256 32 32 -s r P 1024 64 512 512 64 64 -s r P 64 800 320 320 800 800 -s r P 64 768 512 512 768 768 -s r P 16 256 512 512 256 256 -s r P 128 128 128 128 128 128 -s r P 256 512 256 256 512 512 -s r P 1024 1024 1024 1024 1024 1024 -s r P 480 640 1024 1024 640 640 -s r P 480 640 256 256 640 640 -s r P 8 64 32 32 64 64 -s r P 9 64 32 32 64 64 -s r P 10 128 64 64 128 128 -s r P 8 8 8 8 8 8 -s r P 12 12 12 12 12 12 -s r P 25 25 25 25 25 25 -s r P 25 25 20 20 25 25 -i r p 480 20 2050 2050 20 20 -i r p 481 20 2050 2050 20 20 -i r p 482 20 2050 2050 20 20 -i r p 483 20 2050 2050 20 20 -i r R 484 20 2050 2050 20 20 -i r R 485 20 2050 2050 20 20 -i r R 480 39 2050 2050 39 
39 -i r R 481 39 2050 2050 39 39 -i r R 482 39 2050 2050 39 39 -i r R 483 39 2050 2050 39 39 -i r R 484 39 2050 2050 39 39 -i r p 485 39 2050 2050 39 39 -i r p 480 50 2050 2050 50 50 -i r p 481 50 2050 2050 50 50 -i r p 482 50 2050 2050 50 50 -i r p 483 50 2050 2050 50 50 -i r p 484 50 2050 2050 50 50 -i r p 485 50 2050 2050 50 50 -i r R 480 1108 2050 2050 1108 1108 -i r R 481 1108 2050 2050 1108 1108 -i r R 482 1108 2050 2050 1108 1108 -i r R 483 1108 2050 2050 1108 1108 -i r R 484 1108 2050 2050 1108 1108 -i r R 485 1108 2050 2050 1108 1108 -i r R 480 1127 2050 2050 1127 1127 -i r R 481 1127 2050 2050 1127 1127 -i r R 482 1127 2050 2050 1127 1127 -i r R 483 1127 2050 2050 1127 1127 -i r p 484 1127 2050 2050 1127 1127 -i r p 485 1127 2050 2050 1127 1127 -i r p 480 1138 2050 2050 1138 1138 -i r p 481 1138 2050 2050 1138 1138 -i r p 482 1138 2050 2050 1138 1138 -i r p 483 1138 2050 2050 1138 1138 -i r p 484 1138 2050 2050 1138 1138 -i r p 485 1138 2050 2050 1138 1138 -i r p 1 1 3 3 1 1 -i r p 1 9 3 3 9 9 -i r p 1 2048 3 3 2048 2048 -i r p 1 2048 5192 5192 2048 2048 -i r p 9 1 3 3 1 1 -i r p 576 1 3500 3500 1 1 -i r p 1 1 1 1 1 1 -i r p 102 1088 1024 1024 1088 1088 -i r p 102 2048 1024 1024 2048 2048 -i r p 485 656 1024 1024 656 656 -i r p 483 656 1024 1024 656 656 -i r p 81 128 3 3 128 128 -i r p 1022 512 515 515 512 512 -i r p 74 512 515 515 512 512 -i r p 253 2048 515 515 2048 2048 -i r p 8192 1040 515 515 1040 1040 -i r p 10 1029 515 515 1029 1029 -i r p 24 1040 2050 2050 1040 1040 -i r p 1024 1029 2050 2050 1029 1029 -i r p 480 660 2050 2050 660 660 -i r p 481 660 2050 2050 660 660 -i r p 482 660 2050 2050 660 660 -i r p 483 660 2050 2050 660 660 -i r p 484 660 2050 2050 660 660 -i r p 485 660 2050 2050 660 660 -i r p 480 679 2050 2050 679 679 -i r p 481 679 2050 2050 679 679 -i r p 482 679 2050 2050 679 679 -i r p 483 679 2050 2050 679 679 -i r p 484 679 2050 2050 679 679 -i r p 485 679 2050 2050 679 679 -i r p 480 690 2050 2050 690 690 -i r p 481 690 2050 2050 
690 690 -i r p 482 690 2050 2050 690 690 -i r p 483 690 2050 2050 690 690 -i r p 484 690 2050 2050 690 690 -i r p 485 690 2050 2050 690 690 -i r p 480 660 2048 2048 660 660 -i r p 481 660 2048 2048 660 660 -i r p 482 660 2048 2048 660 660 -i r p 483 660 2048 2048 660 660 -i r p 484 660 2048 2048 660 660 -i r p 485 660 2048 2048 660 660 -i r p 480 679 2048 2048 679 679 -i r p 481 679 2048 2048 679 679 -i r p 482 679 2048 2048 679 679 -i r p 483 679 2048 2048 679 679 -i r p 484 679 2048 2048 679 679 -i r p 485 679 2048 2048 679 679 -i r p 480 690 2048 2048 690 690 -i r p 481 690 2048 2048 690 690 -i r p 482 690 2048 2048 690 690 -i r p 483 690 2048 2048 690 690 -i r p 484 690 2048 2048 690 690 -i r p 485 690 2048 2048 690 690 -i r p 480 656 1024 1024 656 656 -i r p 480 128 3 3 128 128 -i r p 1024 512 515 515 512 512 -i r p 1024 2048 1024 1024 2048 2048 -i r p 1024 2048 515 515 2048 2048 -i r p 1024 1040 515 515 1040 1040 -i r p 5 1029 515 515 1029 1029 -i r p 1024 1029 515 515 1029 1029 -i r p 1024 1040 2050 2050 1040 1040 -i r p 1029 1029 2050 2050 1029 1029 -i r R 480 646 2050 2050 646 646 -i r R 481 646 2050 2050 646 646 -i r R 482 646 2050 2050 646 646 -i r R 483 646 2050 2050 646 646 -i r R 484 646 2050 2050 646 646 -i r R 485 646 2050 2050 646 646 -i r R 481 656 2050 2050 656 656 -i r R 482 656 2050 2050 656 656 -i r R 483 656 2050 2050 656 656 -i r R 484 656 2050 2050 656 656 -i r p 485 656 2050 2050 656 656 -i r p 480 672 2050 2050 672 672 -i r p 481 672 2050 2050 672 672 -i r p 482 672 2050 2050 672 672 -i r p 483 672 2050 2050 672 672 -i r p 484 672 2050 2050 672 672 -i r p 485 672 2050 2050 672 672 -i r p 480 688 2050 2050 688 688 -i r p 481 688 2050 2050 688 688 -i r r 482 688 2050 2050 688 688 -i r r 483 688 2050 2050 688 688 -i r r 484 688 2050 2050 688 688 -i r r 485 688 2050 2050 688 688 -i r r 1024 512 64 64 512 512 -i r r 16 256 512 512 256 256 -i r r 480 640 512 512 640 640 -i r r 64 768 512 512 768 768 -i r r 128 128 128 128 128 128 -i r r 1024 64 
512 512 64 64 -i r r 1024 256 32 32 256 256 -i r r 1024 512 64 64 512 512 -i r r 480 640 512 512 640 640 -i r p 1024 32 256 256 32 32 -i r P 1024 64 512 512 64 64 -i r P 64 800 320 320 800 800 -i r P 64 768 512 512 768 768 -i r P 16 256 512 512 256 256 -i r P 128 128 128 128 128 128 -i r P 256 512 256 256 512 512 -i r P 1024 1024 1024 1024 1024 1024 -i r P 480 640 1024 1024 640 640 -i r P 480 640 256 256 640 640 -i r P 8 64 32 32 64 64 -i r P 9 64 32 32 64 64 -i r P 10 128 64 64 128 128 -i r P 8 8 8 8 8 8 -i r P 12 12 12 12 12 12 -i r P 25 25 25 25 25 25 -i r P 25 25 20 20 25 25 -f r p 480 20 2050 2050 20 20 -f r p 481 20 2050 2050 20 20 -f r p 482 20 2050 2050 20 20 -f r p 483 20 2050 2050 20 20 -f r R 484 20 2050 2050 20 20 -f r R 485 20 2050 2050 20 20 -f r R 480 39 2050 2050 39 39 -f r R 481 39 2050 2050 39 39 -f r R 482 39 2050 2050 39 39 -f r R 483 39 2050 2050 39 39 -f r R 484 39 2050 2050 39 39 -f r p 485 39 2050 2050 39 39 -f r p 480 50 2050 2050 50 50 -f r p 481 50 2050 2050 50 50 -f r p 482 50 2050 2050 50 50 -f r p 483 50 2050 2050 50 50 -f r p 484 50 2050 2050 50 50 -f r p 485 50 2050 2050 50 50 -f r R 480 1108 2050 2050 1108 1108 -f r R 481 1108 2050 2050 1108 1108 -f r R 482 1108 2050 2050 1108 1108 -f r R 483 1108 2050 2050 1108 1108 -f r R 484 1108 2050 2050 1108 1108 -f r R 485 1108 2050 2050 1108 1108 -f r R 480 1127 2050 2050 1127 1127 -f r R 481 1127 2050 2050 1127 1127 -f r R 482 1127 2050 2050 1127 1127 -f r R 483 1127 2050 2050 1127 1127 -f r p 484 1127 2050 2050 1127 1127 -f r p 485 1127 2050 2050 1127 1127 -f r p 480 1138 2050 2050 1138 1138 -f r p 481 1138 2050 2050 1138 1138 -f r p 482 1138 2050 2050 1138 1138 -f r p 483 1138 2050 2050 1138 1138 -f r p 484 1138 2050 2050 1138 1138 -f r p 485 1138 2050 2050 1138 1138 -f r p 1 1 3 3 1 1 -f r p 1 9 3 3 9 9 -f r p 1 2048 3 3 2048 2048 -f r p 1 2048 5192 5192 2048 2048 -f r p 9 1 3 3 1 1 -f r p 576 1 3500 3500 1 1 -f r p 1 1 1 1 1 1 -f r p 102 1088 1024 1024 1088 1088 -f r p 102 2048 1024 
1024 2048 2048 -f r p 485 656 1024 1024 656 656 -f r p 483 656 1024 1024 656 656 -f r p 81 128 3 3 128 128 -f r p 1022 512 515 515 512 512 -f r p 74 512 515 515 512 512 -f r p 253 2048 515 515 2048 2048 -f r p 8192 1040 515 515 1040 1040 -f r p 10 1029 515 515 1029 1029 -f r p 24 1040 2050 2050 1040 1040 -f r p 1024 1029 2050 2050 1029 1029 -f r p 480 660 2050 2050 660 660 -f r p 481 660 2050 2050 660 660 -f r p 482 660 2050 2050 660 660 -f r p 483 660 2050 2050 660 660 -f r p 484 660 2050 2050 660 660 -f r p 485 660 2050 2050 660 660 -f r p 480 679 2050 2050 679 679 -f r p 481 679 2050 2050 679 679 -f r p 482 679 2050 2050 679 679 -f r p 483 679 2050 2050 679 679 -f r p 484 679 2050 2050 679 679 -f r p 485 679 2050 2050 679 679 -f r p 480 690 2050 2050 690 690 -f r p 481 690 2050 2050 690 690 -f r p 482 690 2050 2050 690 690 -f r p 483 690 2050 2050 690 690 -f r p 484 690 2050 2050 690 690 -f r p 485 690 2050 2050 690 690 -f r p 480 660 2048 2048 660 660 -f r p 481 660 2048 2048 660 660 -f r p 482 660 2048 2048 660 660 -f r p 483 660 2048 2048 660 660 -f r p 484 660 2048 2048 660 660 -f r p 485 660 2048 2048 660 660 -f r p 480 679 2048 2048 679 679 -f r p 481 679 2048 2048 679 679 -f r p 482 679 2048 2048 679 679 -f r p 483 679 2048 2048 679 679 -f r p 484 679 2048 2048 679 679 -f r p 485 679 2048 2048 679 679 -f r p 480 690 2048 2048 690 690 -f r p 481 690 2048 2048 690 690 -f r p 482 690 2048 2048 690 690 -f r p 483 690 2048 2048 690 690 -f r p 484 690 2048 2048 690 690 -f r p 485 690 2048 2048 690 690 -f r p 480 656 1024 1024 656 656 -f r p 480 128 3 3 128 128 -f r p 1024 512 515 515 512 512 -f r p 1024 2048 1024 1024 2048 2048 -f r p 1024 2048 515 515 2048 2048 -f r p 1024 1040 515 515 1040 1040 -f r p 5 1029 515 515 1029 1029 -f r p 1024 1029 515 515 1029 1029 -f r p 1024 1040 2050 2050 1040 1040 -f r p 1029 1029 2050 2050 1029 1029 -f r R 480 646 2050 2050 646 646 -f r R 481 646 2050 2050 646 646 -f r R 482 646 2050 2050 646 646 -f r R 483 646 2050 2050 646 
646 -f r R 484 646 2050 2050 646 646 -f r R 485 646 2050 2050 646 646 -f r R 481 656 2050 2050 656 656 -f r R 482 656 2050 2050 656 656 -f r R 483 656 2050 2050 656 656 -f r R 484 656 2050 2050 656 656 -f r p 485 656 2050 2050 656 656 -f r p 480 672 2050 2050 672 672 -f r p 481 672 2050 2050 672 672 -f r p 482 672 2050 2050 672 672 -f r p 483 672 2050 2050 672 672 -f r p 484 672 2050 2050 672 672 -f r p 485 672 2050 2050 672 672 -f r p 480 688 2050 2050 688 688 -f r p 481 688 2050 2050 688 688 -f r r 482 688 2050 2050 688 688 -f r r 483 688 2050 2050 688 688 -f r r 484 688 2050 2050 688 688 -f r r 485 688 2050 2050 688 688 -f r r 1024 512 64 64 512 512 -f r r 16 256 512 512 256 256 -f r r 480 640 512 512 640 640 -f r r 64 768 512 512 768 768 -f r r 128 128 128 128 128 128 -f r r 1024 64 512 512 64 64 -f r r 1024 256 32 32 256 256 -f r r 1024 512 64 64 512 512 -f r r 480 640 512 512 640 640 -f r p 1024 32 256 256 32 32 -f r P 1024 64 512 512 64 64 -f r P 64 800 320 320 800 800 -f r P 64 768 512 512 768 768 -f r P 16 256 512 512 256 256 -f r P 128 128 128 128 128 128 -f r P 256 512 256 256 512 512 -f r P 1024 1024 1024 1024 1024 1024 -f r P 480 640 1024 1024 640 640 -f r P 480 640 256 256 640 640 -f r P 8 64 32 32 64 64 -f r P 9 64 32 32 64 64 -f r P 10 128 64 64 128 128 -f r P 8 8 8 8 8 8 -f r P 12 12 12 12 12 12 -f r P 25 25 25 25 25 25 -f r P 25 25 20 20 25 25 -i r r 4096 256 5 5 256 256 -i r r 3000 256 128 128 256 256 -i r r 4096 1024 512 512 1024 1024 -i r r 144 256 5 5 256 256 -i r r 144 256 128 128 256 256 -i r r 144 1024 512 512 1024 1024 -i r r 480 688 256 256 688 688 -i r r 480 640 512 512 640 640 -i r r 480 640 1024 1024 640 640 -i r r 64 800 320 320 800 800 -i r r 64 768 512 512 768 768 -i r r 16 256 512 512 256 256 -i r r 128 128 128 128 128 128 -i r r 256 512 256 256 512 512 -i r r 1024 1024 1024 1024 1024 1024 -i r r 1024 32 256 256 32 32 -i r r 1024 64 512 512 64 64 -i r r 1024 256 32 32 256 256 -i r r 1024 512 64 64 512 512 -i r r 512 32 256 256 32 
32 -i r r 512 768 512 512 768 768 -i r r 512 256 32 32 256 256 -i r r 512 512 64 64 512 512 -i r r 512 256 768 768 256 256 -i r r 768 768 1024 1024 768 768 -i r r 768 768 768 768 768 768 -i r r 2048 2048 2048 2048 2048 2048 -i r r 4096 4096 4096 4096 4096 4096 -f r r 4096 256 5 5 256 256 -f r r 3000 256 128 128 256 256 -f r r 4096 1024 512 512 1024 1024 -f r r 144 256 5 5 256 256 -f r r 144 256 128 128 256 256 -f r r 144 1024 512 512 1024 1024 -f r r 480 688 256 256 688 688 -f r r 480 640 512 512 640 640 -f r r 480 640 1024 1024 640 640 -f r r 64 800 320 320 800 800 -f r r 64 768 512 512 768 768 -f r r 16 256 512 512 256 256 -f r r 128 128 128 128 128 128 -f r r 256 512 256 256 512 512 -f r r 1024 1024 1024 1024 1024 1024 -f r r 1024 32 256 256 32 32 -f r r 1024 64 512 512 64 64 -f r r 1024 256 32 32 256 256 -f r r 1024 512 64 64 512 512 -f r r 512 32 256 256 32 32 -f r r 512 768 512 512 768 768 -f r r 512 256 32 32 256 256 -f r r 512 512 64 64 512 512 -f r r 512 256 768 768 256 256 -f r r 768 768 1024 1024 768 768 -f r r 768 768 768 768 768 768 -f r r 2048 2048 2048 2048 2048 2048 -f r r 4096 4096 4096 4096 4096 4096 -f r r 2048 1024 1024 1024 1024 1024 -f r r 2048 4096 1024 1024 4096 4096 -f r r 2048 1024 4096 4096 1024 1024 -f r r 2048 1024 2 2 1024 1024 -f r r 128 1024 1024 1024 1024 1024 -f r r 1536 768 768 768 768 768 -f r r 1536 3072 768 768 3072 3072 -f r r 1536 768 3072 3072 768 768 -f r r 1536 768 2 2 768 768 -f r r 128 768 768 768 768 768 -f r r 1024 8 13 13 8 8 -f r r 1024 4 8 8 4 4 -f r r 1024 128 355 355 128 128 -f r r 1024 64 128 128 64 64 -f r r 1024 1 64 64 1 1 -f r r 480 1 256 256 1 1 -f r r 480 256 512 512 256 256 -f r r 480 1024 845 845 1024 1024 -f r r 480 512 1024 1024 512 512 -f r r 10 17191 128 128 17191 17191 -f r r 10 512 256 256 512 512 +u r n n n p 480 20 2050 2050 20 20 +u r n n n p 481 20 2050 2050 20 20 +u r n n n p 482 20 2050 2050 20 20 +u r n n n p 483 20 2050 2050 20 20 +u r n n n R 484 20 2050 2050 20 20 +u r n n n R 485 20 2050 
2050 20 20 +u r n n n R 480 39 2050 2050 39 39 +u r n n n R 481 39 2050 2050 39 39 +u r n n n R 482 39 2050 2050 39 39 +u r n n n R 483 39 2050 2050 39 39 +u r n n n R 484 39 2050 2050 39 39 +u r n n n p 485 39 2050 2050 39 39 +u r n n n p 480 50 2050 2050 50 50 +u r n n n p 481 50 2050 2050 50 50 +u r n n n p 482 50 2050 2050 50 50 +u r n n n p 483 50 2050 2050 50 50 +u r n n n p 484 50 2050 2050 50 50 +u r n n n p 485 50 2050 2050 50 50 +u r n n n R 480 1108 2050 2050 1108 1108 +u r n n n R 481 1108 2050 2050 1108 1108 +u r n n n R 482 1108 2050 2050 1108 1108 +u r n n n R 483 1108 2050 2050 1108 1108 +u r n n n R 484 1108 2050 2050 1108 1108 +u r n n n R 485 1108 2050 2050 1108 1108 +u r n n n R 480 1127 2050 2050 1127 1127 +u r n n n R 481 1127 2050 2050 1127 1127 +u r n n n R 482 1127 2050 2050 1127 1127 +u r n n n R 483 1127 2050 2050 1127 1127 +u r n n n p 484 1127 2050 2050 1127 1127 +u r n n n p 485 1127 2050 2050 1127 1127 +u r n n n p 480 1138 2050 2050 1138 1138 +u r n n n p 481 1138 2050 2050 1138 1138 +u r n n n p 482 1138 2050 2050 1138 1138 +u r n n n p 483 1138 2050 2050 1138 1138 +u r n n n p 484 1138 2050 2050 1138 1138 +u r n n n p 485 1138 2050 2050 1138 1138 +u r n n n p 1 1 3 3 1 1 +u r n n n p 1 9 3 3 9 9 +u r n n n p 1 2048 3 3 2048 2048 +u r n n n p 1 2048 5192 5192 2048 2048 +u r n n n p 9 1 3 3 1 1 +u r n n n p 576 1 3500 3500 1 1 +u r n n n p 1 1 1 1 1 1 +u r n n n p 102 1088 1024 1024 1088 1088 +u r n n n p 102 2048 1024 1024 2048 2048 +u r n n n p 485 656 1024 1024 656 656 +u r n n n p 483 656 1024 1024 656 656 +u r n n n p 81 128 3 3 128 128 +u r n n n p 1022 512 515 515 512 512 +u r n n n p 74 512 515 515 512 512 +u r n n n p 253 2048 515 515 2048 2048 +u r n n n p 8192 1040 515 515 1040 1040 +u r n n n p 10 1029 515 515 1029 1029 +u r n n n p 24 1040 2050 2050 1040 1040 +u r n n n p 1024 1029 2050 2050 1029 1029 +u r n n n p 480 660 2050 2050 660 660 +u r n n n p 481 660 2050 2050 660 660 +u r n n n p 482 660 2050 2050 660 660 +u r 
n n n p 483 660 2050 2050 660 660 +u r n n n p 484 660 2050 2050 660 660 +u r n n n p 485 660 2050 2050 660 660 +u r n n n p 480 679 2050 2050 679 679 +u r n n n p 481 679 2050 2050 679 679 +u r n n n p 482 679 2050 2050 679 679 +u r n n n p 483 679 2050 2050 679 679 +u r n n n p 484 679 2050 2050 679 679 +u r n n n p 485 679 2050 2050 679 679 +u r n n n p 480 690 2050 2050 690 690 +u r n n n p 481 690 2050 2050 690 690 +u r n n n p 482 690 2050 2050 690 690 +u r n n n p 483 690 2050 2050 690 690 +u r n n n p 484 690 2050 2050 690 690 +u r n n n p 485 690 2050 2050 690 690 +u r n n n p 480 660 2048 2048 660 660 +u r n n n p 481 660 2048 2048 660 660 +u r n n n p 482 660 2048 2048 660 660 +u r n n n p 483 660 2048 2048 660 660 +u r n n n p 484 660 2048 2048 660 660 +u r n n n p 485 660 2048 2048 660 660 +u r n n n p 480 679 2048 2048 679 679 +u r n n n p 481 679 2048 2048 679 679 +u r n n n p 482 679 2048 2048 679 679 +u r n n n p 483 679 2048 2048 679 679 +u r n n n p 484 679 2048 2048 679 679 +u r n n n p 485 679 2048 2048 679 679 +u r n n n p 480 690 2048 2048 690 690 +u r n n n p 481 690 2048 2048 690 690 +u r n n n p 482 690 2048 2048 690 690 +u r n n n p 483 690 2048 2048 690 690 +u r n n n p 484 690 2048 2048 690 690 +u r n n n p 485 690 2048 2048 690 690 +u r n n n p 480 656 1024 1024 656 656 +u r n n n p 480 128 3 3 128 128 +u r n n n p 1024 512 515 515 512 512 +u r n n n p 1024 2048 1024 1024 2048 2048 +u r n n n p 1024 2048 515 515 2048 2048 +u r n n n p 1024 1040 515 515 1040 1040 +u r n n n p 5 1029 515 515 1029 1029 +u r n n n p 1024 1029 515 515 1029 1029 +u r n n n p 1024 1040 2050 2050 1040 1040 +u r n n n p 1029 1029 2050 2050 1029 1029 +u r n n n R 480 646 2050 2050 646 646 +u r n n n R 481 646 2050 2050 646 646 +u r n n n R 482 646 2050 2050 646 646 +u r n n n R 483 646 2050 2050 646 646 +u r n n n R 484 646 2050 2050 646 646 +u r n n n R 485 646 2050 2050 646 646 +u r n n n R 481 656 2050 2050 656 656 +u r n n n R 482 656 2050 2050 656 656 +u r 
n n n R 483 656 2050 2050 656 656 +u r n n n R 484 656 2050 2050 656 656 +u r n n n p 485 656 2050 2050 656 656 +u r n n n p 480 672 2050 2050 672 672 +u r n n n p 481 672 2050 2050 672 672 +u r n n n p 482 672 2050 2050 672 672 +u r n n n p 483 672 2050 2050 672 672 +u r n n n p 484 672 2050 2050 672 672 +u r n n n p 485 672 2050 2050 672 672 +u r n n n p 480 688 2050 2050 688 688 +u r n n n p 481 688 2050 2050 688 688 +u r n n n r 482 688 2050 2050 688 688 +u r n n n r 483 688 2050 2050 688 688 +u r n n n r 484 688 2050 2050 688 688 +u r n n n r 485 688 2050 2050 688 688 +u r n n n r 1024 512 64 64 512 512 +u r n n n r 16 256 512 512 256 256 +u r n n n r 480 640 512 512 640 640 +u r n n n r 64 768 512 512 768 768 +u r n n n r 128 128 128 128 128 128 +u r n n n r 1024 64 512 512 64 64 +u r n n n r 1024 256 32 32 256 256 +u r n n n r 1024 512 64 64 512 512 +u r n n n r 480 640 512 512 640 640 +u r n n n p 1024 32 256 256 32 32 +u r n n n P 1024 64 512 512 64 64 +u r n n n P 64 800 320 320 800 800 +u r n n n P 64 768 512 512 768 768 +u r n n n P 16 256 512 512 256 256 +u r n n n P 128 128 128 128 128 128 +u r n n n P 256 512 256 256 512 512 +u r n n n P 1024 1024 1024 1024 1024 1024 +u r n n n P 480 640 1024 1024 640 640 +u r n n n P 480 640 256 256 640 640 +u r n n n P 8 64 32 32 64 64 +u r n n n P 9 64 32 32 64 64 +u r n n n P 10 128 64 64 128 128 +u r n n n P 8 8 8 8 8 8 +u r n n n P 12 12 12 12 12 12 +u r n n n P 25 25 25 25 25 25 +u r n n n P 25 25 20 20 25 25 +u r n n n r 4096 256 5 5 256 256 +u r n n n r 3000 256 128 128 256 256 +u r n n n r 4096 1024 512 512 1024 1024 +u r n n n r 144 256 5 5 256 256 +u r n n n r 144 256 128 128 256 256 +u r n n n r 144 1024 512 512 1024 1024 +u r n n n r 480 688 256 256 688 688 +u r n n n r 480 640 512 512 640 640 +u r n n n r 480 640 1024 1024 640 640 +u r n n n r 64 800 320 320 800 800 +u r n n n r 64 768 512 512 768 768 +u r n n n r 16 256 512 512 256 256 +u r n n n r 128 128 128 128 128 128 +u r n n n r 256 512 256 256 
512 512 +u r n n n r 1024 1024 1024 1024 1024 1024 +u r n n n r 1024 32 256 256 32 32 +u r n n n r 1024 64 512 512 64 64 +u r n n n r 1024 256 32 32 256 256 +u r n n n r 1024 512 64 64 512 512 +u r n n n r 512 32 256 256 32 32 +u r n n n r 512 768 512 512 768 768 +u r n n n r 512 256 32 32 256 256 +u r n n n r 512 512 64 64 512 512 +u r n n n r 512 256 768 768 256 256 +u r n n n r 768 768 1024 1024 768 768 +u r n n n r 768 768 768 768 768 768 +u r n n n r 2048 2048 2048 2048 2048 2048 +u r n n n r 4096 4096 4096 4096 4096 4096 +f c n n n p 2482 1127 2050 2482 2050 2482 +f c n n n p 2483 1127 2050 2483 2050 2483 +f c n n n p 2484 1127 2050 2484 2050 2484 +f c n n n p 2485 1127 2050 2485 2050 2485 +f c n n n p 480 1138 2050 480 2050 480 +f c n n n p 481 1138 2050 481 2050 481 +f c n n n p 482 1138 2050 482 2050 482 +f c n n n p 483 1138 2050 483 2050 483 +f c n n n p 484 1138 2050 484 2050 484 +f c n n n p 485 1138 2050 485 2050 485 +f c n n n p 1 1 3 1 3 1 +f c n n n p 1 9 3 1 3 1 +f c n n n p 1 2048 3 1 3 1 +f c n n n p 1 2048 5192 1 5192 1 +f c n n n p 9 1 3 9 3 9 +f c n n n p 576 1 3500 576 3500 576 +f c n n n p 1 1 1 1 1 1 +f c n n n p 102 1088 1024 102 1024 102 +b r n n n r 480 20 2050 2050 20 20 +b r n n n r 481 20 2050 2050 20 20 +b r n n n r 482 20 2050 2050 20 20 +b r n n n p 483 20 2050 2050 20 20 +b r n n n R 484 20 2050 2050 20 20 +b r n n n R 485 20 2050 2050 20 20 +b r n n n R 480 39 2050 2050 39 39 +b r n n n R 481 39 2050 2050 39 39 +b r n n n R 482 39 2050 2050 39 39 +b r n n n R 483 39 2050 2050 39 39 +b r n n n R 484 39 2050 2050 39 39 +b r n n n p 485 39 2050 2050 39 39 +b r n n n p 480 50 2050 2050 50 50 +b r n n n p 481 50 2050 2050 50 50 +b r n n n p 482 50 2050 2050 50 50 +b r n n n p 483 50 2050 2050 50 50 +b r n n n p 484 50 2050 2050 50 50 +b r n n n p 485 50 2050 2050 50 50 +b r n n n R 480 1108 2050 2050 1108 1108 +b r n n n R 481 1108 2050 2050 1108 1108 +b r n n n R 482 1108 2050 2050 1108 1108 +b r n n n R 483 1108 2050 2050 1108 1108 
+b r n n n R 484 1108 2050 2050 1108 1108 +b r n n n R 485 1108 2050 2050 1108 1108 +b r n n n R 480 1127 2050 2050 1127 1127 +b r n n n R 481 1127 2050 2050 1127 1127 +b r n n n R 482 1127 2050 2050 1127 1127 +b r n n n R 483 1127 2050 2050 1127 1127 +b r n n n p 484 1127 2050 2050 1127 1127 +b r n n n p 485 1127 2050 2050 1127 1127 +b r n n n p 480 1138 2050 2050 1138 1138 +b r n n n p 481 1138 2050 2050 1138 1138 +b r n n n p 482 1138 2050 2050 1138 1138 +b r n n n p 483 1138 2050 2050 1138 1138 +b r n n n p 484 1138 2050 2050 1138 1138 +b r n n n p 485 1138 2050 2050 1138 1138 +b r n n n p 1 1 3 3 1 1 +b r n n n p 1 9 3 3 9 9 +b r n n n p 1 2048 3 3 2048 2048 +b r n n n p 1 2048 5192 5192 2048 2048 +b r n n n p 9 1 3 3 1 1 +b r n n n p 576 1 3500 3500 1 1 +b r n n n p 1 1 1 1 1 1 +b r n n n p 102 1088 1024 1024 1088 1088 +b r n n n p 102 2048 1024 1024 2048 2048 +b r n n n p 485 656 1024 1024 656 656 +b r n n n p 483 656 1024 1024 656 656 +b r n n n p 81 128 3 3 128 128 +b r n n n p 1022 512 515 515 512 512 +b r n n n p 74 512 515 515 512 512 +b r n n n p 253 2048 515 515 2048 2048 +b r n n n p 8192 1040 515 515 1040 1040 +b r n n n p 10 1029 515 515 1029 1029 +b r n n n p 24 1040 2050 2050 1040 1040 +b r n n n p 1024 1029 2050 2050 1029 1029 +b r n n n p 480 660 2050 2050 660 660 +b r n n n p 481 660 2050 2050 660 660 +b r n n n p 482 660 2050 2050 660 660 +b r n n n p 483 660 2050 2050 660 660 +b r n n n p 484 660 2050 2050 660 660 +b r n n n p 485 660 2050 2050 660 660 +b r n n n p 480 679 2050 2050 679 679 +b r n n n p 481 679 2050 2050 679 679 +b r n n n p 482 679 2050 2050 679 679 +b r n n n p 483 679 2050 2050 679 679 +b r n n n p 484 679 2050 2050 679 679 +b r n n n p 485 679 2050 2050 679 679 +b r n n n p 480 690 2050 2050 690 690 +b r n n n p 481 690 2050 2050 690 690 +b r n n n p 482 690 2050 2050 690 690 +b r n n n p 483 690 2050 2050 690 690 +b r n n n p 484 690 2050 2050 690 690 +b r n n n p 485 690 2050 2050 690 690 +b r n n n p 480 660 2048 2048 
660 660 +b r n n n p 481 660 2048 2048 660 660 +b r n n n p 482 660 2048 2048 660 660 +b r n n n p 483 660 2048 2048 660 660 +b r n n n p 484 660 2048 2048 660 660 +b r n n n p 485 660 2048 2048 660 660 +b r n n n p 480 679 2048 2048 679 679 +b r n n n p 481 679 2048 2048 679 679 +b r n n n p 482 679 2048 2048 679 679 +b r n n n p 483 679 2048 2048 679 679 +b r n n n p 484 679 2048 2048 679 679 +b r n n n p 485 679 2048 2048 679 679 +b r n n n p 480 690 2048 2048 690 690 +b r n n n p 481 690 2048 2048 690 690 +b r n n n p 482 690 2048 2048 690 690 +b r n n n p 483 690 2048 2048 690 690 +b r n n n p 484 690 2048 2048 690 690 +b r n n n p 485 690 2048 2048 690 690 +b r n n n p 480 656 1024 1024 656 656 +b r n n n p 480 128 3 3 128 128 +b r n n n p 1024 512 515 515 512 512 +b r n n n p 1024 2048 1024 1024 2048 2048 +b r n n n p 1024 2048 515 515 2048 2048 +b r n n n p 1024 1040 515 515 1040 1040 +b r n n n p 5 1029 515 515 1029 1029 +b r n n n p 1024 1029 515 515 1029 1029 +b r n n n p 1024 1040 2050 2050 1040 1040 +b r n n n p 1029 1029 2050 2050 1029 1029 +b r n n n R 480 646 2050 2050 646 646 +b r n n n R 481 646 2050 2050 646 646 +b r n n n R 482 646 2050 2050 646 646 +b r n n n R 483 646 2050 2050 646 646 +b r n n n R 484 646 2050 2050 646 646 +b r n n n R 485 646 2050 2050 646 646 +b r n n n R 481 656 2050 2050 656 656 +b r n n n R 482 656 2050 2050 656 656 +b r n n n R 483 656 2050 2050 656 656 +b r n n n R 484 656 2050 2050 656 656 +b r n n n p 485 656 2050 2050 656 656 +b r n n n p 480 672 2050 2050 672 672 +b r n n n p 481 672 2050 2050 672 672 +b r n n n p 482 672 2050 2050 672 672 +b r n n n p 483 672 2050 2050 672 672 +b r n n n p 484 672 2050 2050 672 672 +b r n n n p 485 672 2050 2050 672 672 +b r n n n p 480 688 2050 2050 688 688 +b r n n n p 481 688 2050 2050 688 688 +b r n n n r 482 688 2050 2050 688 688 +b r n n n r 483 688 2050 2050 688 688 +b r n n n r 484 688 2050 2050 688 688 +b r n n n r 485 688 2050 2050 688 688 +b r n n n r 1024 512 64 64 512 
512 +b r n n n r 16 256 512 512 256 256 +b r n n n r 480 640 512 512 640 640 +b r n n n r 64 768 512 512 768 768 +b r n n n r 128 128 128 128 128 128 +b r n n n r 1024 64 512 512 64 64 +b r n n n r 1024 256 32 32 256 256 +b r n n n r 1024 512 64 64 512 512 +b r n n n r 480 640 512 512 640 640 +b r n n n p 1024 32 256 256 32 32 +b r n n n P 1024 64 512 512 64 64 +b r n n n P 64 800 320 320 800 800 +b r n n n P 64 768 512 512 768 768 +b r n n n P 16 256 512 512 256 256 +b r n n n P 128 128 128 128 128 128 +b r n n n P 256 512 256 256 512 512 +b r n n n P 1024 1024 1024 1024 1024 1024 +b r n n n P 480 640 1024 1024 640 640 +b r n n n P 480 640 256 256 640 640 +b r n n n P 8 64 32 32 64 64 +b r n n n P 9 64 32 32 64 64 +b r n n n P 10 128 64 64 128 128 +b r n n n P 8 8 8 8 8 8 +b r n n n P 12 12 12 12 12 12 +b r n n n P 25 25 25 25 25 25 +b r n n n P 25 25 20 20 25 25 +b c n n n p 485 39 2050 485 2050 485 +b c n n n p 480 50 2050 480 2050 480 +b c n n n p 481 50 2050 481 2050 481 +b c n n n p 482 50 2050 482 2050 482 +b c n n n p 483 50 2050 483 2050 483 +b c n n n p 484 50 2050 484 2050 484 +b c n n n p 485 50 2050 485 2050 485 +b c n n n p 484 1127 2050 484 2050 484 +b c n n n p 485 1127 2050 485 2050 485 +b c n n n p 480 1138 2050 480 2050 480 +b c n n n p 481 1138 2050 481 2050 481 +b c n n n p 482 1138 2050 482 2050 482 +b c n n n p 483 1138 2050 483 2050 483 +b c n n n p 484 1138 2050 484 2050 484 +b c n n n p 485 1138 2050 485 2050 485 +b c n n n p 1 1 3 1 3 1 +b c n n n p 1 9 3 1 3 1 +b c n n n p 1 2048 3 1 3 1 +b c n n n p 1 2048 5192 1 5192 1 +b c n n n p 9 1 3 9 3 9 +b c n n n p 576 1 3500 576 3500 576 +b c n n n p 1 1 1 1 1 1 +b c n n n p 102 1088 1024 102 1024 102 +b c n n n p 102 2048 1024 102 1024 102 +b c n n n p 485 656 1024 485 1024 485 +b c n n n p 483 656 1024 483 1024 483 +b c n n n p 81 128 3 81 3 81 +b c n n n p 1022 512 515 1022 515 1022 +b c n n n p 74 512 515 74 515 74 +b c n n n p 253 2048 515 253 515 253 +b c n n n p 8192 1040 515 8192 515 
8192 +b c n n n p 10 1029 515 10 515 10 +b c n n n p 24 1040 2050 24 2050 24 +b c n n n p 1024 1029 2050 1024 2050 1024 +b c n n n p 480 660 2050 480 2050 480 +b c n n n p 481 660 2050 481 2050 481 +b c n n n p 482 660 2050 482 2050 482 +b c n n n p 483 660 2050 483 2050 483 +b c n n n p 484 660 2050 484 2050 484 +b c n n n p 485 660 2050 485 2050 485 +b c n n n p 480 679 2050 480 2050 480 +b c n n n p 481 679 2050 481 2050 481 +b c n n n p 482 679 2050 482 2050 482 +b c n n n p 483 679 2050 483 2050 483 +b c n n n p 484 679 2050 484 2050 484 +b c n n n p 485 679 2050 485 2050 485 +b c n n n p 480 690 2050 480 2050 480 +b c n n n p 481 690 2050 481 2050 481 +b c n n n p 482 690 2050 482 2050 482 +b c n n n p 483 690 2050 483 2050 483 +b c n n n p 484 690 2050 484 2050 484 +b c n n n p 485 690 2050 485 2050 485 +b c n n n p 480 660 2048 480 2048 480 +b c n n n p 481 660 2048 481 2048 481 +b c n n n p 482 660 2048 482 2048 482 +b c n n n p 483 660 2048 483 2048 483 +b c n n n p 484 660 2048 484 2048 484 +b c n n n p 485 660 2048 485 2048 485 +b c n n n p 480 679 2048 480 2048 480 +b c n n n p 481 679 2048 481 2048 481 +b c n n n p 482 679 2048 482 2048 482 +b c n n n p 483 679 2048 483 2048 483 +b c n n n p 484 679 2048 484 2048 484 +b c n n n p 485 679 2048 485 2048 485 +b c n n n p 480 690 2048 480 2048 480 +b c n n n p 481 690 2048 481 2048 481 +b c n n n p 482 690 2048 482 2048 482 +b c n n n p 483 690 2048 483 2048 483 +b c n n n p 484 690 2048 484 2048 484 +b c n n n p 485 690 2048 485 2048 485 +b c n n n p 480 656 1024 480 1024 480 +b c n n n p 480 128 3 480 3 480 +b c n n n p 1024 512 515 1024 515 1024 +b c n n n p 1024 2048 1024 1024 1024 1024 +b c n n n p 1024 2048 515 1024 515 1024 +b c p n n n 1024 1040 515 1024 515 1024 +b c p n n n 5 1029 515 5 515 5 +b c p n n n 1024 1029 515 1024 515 1024 +b c p n n n 1024 1040 2050 1024 2050 1024 +b c p n n n 1029 1029 2050 1029 2050 1029 +b c p n n n 485 656 2050 485 2050 485 +b c p n n n 480 672 2050 480 2050 480 
+b c p n n n 481 672 2050 481 2050 481 +b c p n n n 482 672 2050 482 2050 482 +b c p n n n 483 672 2050 483 2050 483 +b c p n n n 484 672 2050 484 2050 484 +b c p n n n 485 672 2050 485 2050 485 +b c p n n n 480 688 2050 480 2050 480 +b c p n n n 481 688 2050 481 2050 481 +b c p n n n 1024 32 256 1024 256 1024 +b c P n n n 1024 64 512 1024 512 1024 +b c P n n n 64 800 320 64 320 64 +b c P n n n 64 768 512 64 512 64 +b c P n n n 16 256 512 16 512 16 +b c P n n n 128 128 128 128 128 128 +b c P n n n 256 512 256 256 256 256 +b c P n n n 1024 1024 1024 1024 1024 1024 +b c P n n n 480 640 1024 480 1024 480 +b c P n n n 480 640 256 480 256 480 +b c P n n n 8 64 32 8 32 8 +b c P n n n 9 64 32 9 32 9 +b c P n n n 10 128 64 10 64 10 +b c P n n n 8 8 8 8 8 8 +b c P n n n 12 12 12 12 12 12 +b c P n n n 25 25 25 25 25 25 +b c P n n n 25 25 20 25 20 25 +s r n n n r 480 20 2050 2050 20 20 +s r n n n r 481 20 2050 2050 20 20 +s r n n n r 482 20 2050 2050 20 20 +s r n n n p 483 20 2050 2050 20 20 +s r n n n R 484 20 2050 2050 20 20 +s r n n n R 485 20 2050 2050 20 20 +s r n n n R 480 39 2050 2050 39 39 +s r n n n R 481 39 2050 2050 39 39 +s r n n n R 482 39 2050 2050 39 39 +s r n n n R 483 39 2050 2050 39 39 +s r n n n R 484 39 2050 2050 39 39 +s r n n n p 485 39 2050 2050 39 39 +s r n n n p 480 50 2050 2050 50 50 +s r n n n p 481 50 2050 2050 50 50 +s r n n n p 482 50 2050 2050 50 50 +s r n n n p 483 50 2050 2050 50 50 +s r n n n p 484 50 2050 2050 50 50 +s r n n n p 485 50 2050 2050 50 50 +s r n n n R 480 1108 2050 2050 1108 1108 +s r n n n R 481 1108 2050 2050 1108 1108 +s r n n n R 482 1108 2050 2050 1108 1108 +s r n n n R 483 1108 2050 2050 1108 1108 +s r n n n R 484 1108 2050 2050 1108 1108 +s r n n n R 485 1108 2050 2050 1108 1108 +s r n n n R 480 1127 2050 2050 1127 1127 +s r n n n R 481 1127 2050 2050 1127 1127 +s r n n n R 482 1127 2050 2050 1127 1127 +s r n n n R 483 1127 2050 2050 1127 1127 +s r n n n p 484 1127 2050 2050 1127 1127 +s r n n n p 485 1127 2050 2050 1127 
1127 +s r n n n p 480 1138 2050 2050 1138 1138 +s r n n n p 481 1138 2050 2050 1138 1138 +s r n n n p 482 1138 2050 2050 1138 1138 +s r n n n p 483 1138 2050 2050 1138 1138 +s r n n n p 484 1138 2050 2050 1138 1138 +s r n n n p 485 1138 2050 2050 1138 1138 +s r n n n p 1 1 3 3 1 1 +s r n n n p 1 9 3 3 9 9 +s r n n n p 1 2048 3 3 2048 2048 +s r n n n p 1 2048 5192 5192 2048 2048 +s r n n n p 9 1 3 3 1 1 +s r n n n p 576 1 3500 3500 1 1 +s r n n n p 1 1 1 1 1 1 +s r n n n p 102 1088 1024 1024 1088 1088 +s r n n n p 102 2048 1024 1024 2048 2048 +s r n n n p 485 656 1024 1024 656 656 +s r n n n p 483 656 1024 1024 656 656 +s r n n n p 81 128 3 3 128 128 +s r n n n p 1022 512 515 515 512 512 +s r n n n p 74 512 515 515 512 512 +s r n n n p 253 2048 515 515 2048 2048 +s r n n n p 8192 1040 515 515 1040 1040 +s r n n n p 10 1029 515 515 1029 1029 +s r n n n p 24 1040 2050 2050 1040 1040 +s r n n n p 1024 1029 2050 2050 1029 1029 +s r n n n p 480 660 2050 2050 660 660 +s r n n n p 481 660 2050 2050 660 660 +s r n n n p 482 660 2050 2050 660 660 +s r n n n p 483 660 2050 2050 660 660 +s r n n n p 484 660 2050 2050 660 660 +s r n n n p 485 660 2050 2050 660 660 +s r n n n p 480 679 2050 2050 679 679 +s r n n n p 481 679 2050 2050 679 679 +s r n n n p 482 679 2050 2050 679 679 +s r n n n p 483 679 2050 2050 679 679 +s r n n n p 484 679 2050 2050 679 679 +s r n n n p 485 679 2050 2050 679 679 +s r n n n p 480 690 2050 2050 690 690 +s r n n n p 481 690 2050 2050 690 690 +s r n n n p 482 690 2050 2050 690 690 +s r n n n p 483 690 2050 2050 690 690 +s r n n n p 484 690 2050 2050 690 690 +s r n n n p 485 690 2050 2050 690 690 +s r n n n p 480 660 2048 2048 660 660 +s r n n n p 481 660 2048 2048 660 660 +s r n n n p 482 660 2048 2048 660 660 +s r n n n p 483 660 2048 2048 660 660 +s r n n n p 484 660 2048 2048 660 660 +s r n n n p 485 660 2048 2048 660 660 +s r n n n p 480 679 2048 2048 679 679 +s r n n n p 481 679 2048 2048 679 679 +s r n n n p 482 679 2048 2048 679 679 +s r n n n 
p 483 679 2048 2048 679 679 +s r n n n p 484 679 2048 2048 679 679 +s r n n n p 485 679 2048 2048 679 679 +s r n n n p 480 690 2048 2048 690 690 +s r n n n p 481 690 2048 2048 690 690 +s r n n n p 482 690 2048 2048 690 690 +s r n n n p 483 690 2048 2048 690 690 +s r n n n p 484 690 2048 2048 690 690 +s r n n n p 485 690 2048 2048 690 690 +s r n n n p 480 656 1024 1024 656 656 +s r n n n p 480 128 3 3 128 128 +s r n n n p 1024 512 515 515 512 512 +s r n n n p 1024 2048 1024 1024 2048 2048 +s r n n n p 1024 2048 515 515 2048 2048 +s r n n n p 1024 1040 515 515 1040 1040 +s r n n n p 5 1029 515 515 1029 1029 +s r n n n p 1024 1029 515 515 1029 1029 +s r n n n p 1024 1040 2050 2050 1040 1040 +s r n n n p 1029 1029 2050 2050 1029 1029 +s r n n n R 480 646 2050 2050 646 646 +s r n n n R 481 646 2050 2050 646 646 +s r n n n R 482 646 2050 2050 646 646 +s r n n n R 483 646 2050 2050 646 646 +s r n n n R 484 646 2050 2050 646 646 +s r n n n R 485 646 2050 2050 646 646 +s r n n n R 481 656 2050 2050 656 656 +s r n n n R 482 656 2050 2050 656 656 +s r n n n R 483 656 2050 2050 656 656 +s r n n n R 484 656 2050 2050 656 656 +s r n n n p 485 656 2050 2050 656 656 +s r n n n p 480 672 2050 2050 672 672 +s r n n n p 481 672 2050 2050 672 672 +s r n n n p 482 672 2050 2050 672 672 +s r n n n p 483 672 2050 2050 672 672 +s r n n n p 484 672 2050 2050 672 672 +s r n n n p 485 672 2050 2050 672 672 +s r n n n p 480 688 2050 2050 688 688 +s r n n n p 481 688 2050 2050 688 688 +s r n n n r 482 688 2050 2050 688 688 +s r n n n r 483 688 2050 2050 688 688 +s r n n n r 484 688 2050 2050 688 688 +s r n n n r 485 688 2050 2050 688 688 +s r n n n r 1024 512 64 64 512 512 +s r n n n r 16 256 512 512 256 256 +s r n n n r 480 640 512 512 640 640 +s r n n n r 64 768 512 512 768 768 +s r n n n r 128 128 128 128 128 128 +s r n n n r 1024 64 512 512 64 64 +s r n n n r 1024 256 32 32 256 256 +s r n n n r 1024 512 64 64 512 512 +s r n n n r 480 640 512 512 640 640 +s r n n n p 1024 32 256 256 32 32 
+s r n n n P 1024 64 512 512 64 64 +s r n n n P 64 800 320 320 800 800 +s r n n n P 64 768 512 512 768 768 +s r n n n P 16 256 512 512 256 256 +s r n n n P 128 128 128 128 128 128 +s r n n n P 256 512 256 256 512 512 +s r n n n P 1024 1024 1024 1024 1024 1024 +s r n n n P 480 640 1024 1024 640 640 +s r n n n P 480 640 256 256 640 640 +s r n n n P 8 64 32 32 64 64 +s r n n n P 9 64 32 32 64 64 +s r n n n P 10 128 64 64 128 128 +s r n n n P 8 8 8 8 8 8 +s r n n n P 12 12 12 12 12 12 +s r n n n P 25 25 25 25 25 25 +s r n n n P 25 25 20 20 25 25 +i r n n n p 480 20 2050 2050 20 20 +i r n n n p 481 20 2050 2050 20 20 +i r n n n p 482 20 2050 2050 20 20 +i r n n n p 483 20 2050 2050 20 20 +i r n n n R 484 20 2050 2050 20 20 +i r n n n R 485 20 2050 2050 20 20 +i r n n n R 480 39 2050 2050 39 39 +i r n n n R 481 39 2050 2050 39 39 +i r n n n R 482 39 2050 2050 39 39 +i r n n n R 483 39 2050 2050 39 39 +i r n n n R 484 39 2050 2050 39 39 +i r n n n p 485 39 2050 2050 39 39 +i r n n n p 480 50 2050 2050 50 50 +i r n n n p 481 50 2050 2050 50 50 +i r n n n p 482 50 2050 2050 50 50 +i r n n n p 483 50 2050 2050 50 50 +i r n n n p 484 50 2050 2050 50 50 +i r n n n p 485 50 2050 2050 50 50 +i r n n n R 480 1108 2050 2050 1108 1108 +i r n n n R 481 1108 2050 2050 1108 1108 +i r n n n R 482 1108 2050 2050 1108 1108 +i r n n n R 483 1108 2050 2050 1108 1108 +i r n n n R 484 1108 2050 2050 1108 1108 +i r n n n R 485 1108 2050 2050 1108 1108 +i r n n n R 480 1127 2050 2050 1127 1127 +i r n n n R 481 1127 2050 2050 1127 1127 +i r n n n R 482 1127 2050 2050 1127 1127 +i r n n n R 483 1127 2050 2050 1127 1127 +i r n n n p 484 1127 2050 2050 1127 1127 +i r n n n p 485 1127 2050 2050 1127 1127 +i r n n n p 480 1138 2050 2050 1138 1138 +i r n n n p 481 1138 2050 2050 1138 1138 +i r n n n p 482 1138 2050 2050 1138 1138 +i r n n n p 483 1138 2050 2050 1138 1138 +i r n n n p 484 1138 2050 2050 1138 1138 +i r n n n p 485 1138 2050 2050 1138 1138 +i r n n n p 1 1 3 3 1 1 +i r n n n p 1 9 3 3 9 
9 +i r n n n p 1 2048 3 3 2048 2048 +i r n n n p 1 2048 5192 5192 2048 2048 +i r n n n p 9 1 3 3 1 1 +i r n n n p 576 1 3500 3500 1 1 +i r n n n p 1 1 1 1 1 1 +i r n n n p 102 1088 1024 1024 1088 1088 +i r n n n p 102 2048 1024 1024 2048 2048 +i r n n n p 485 656 1024 1024 656 656 +i r n n n p 483 656 1024 1024 656 656 +i r n n n p 81 128 3 3 128 128 +i r n n n p 1022 512 515 515 512 512 +i r n n n p 74 512 515 515 512 512 +i r n n n p 253 2048 515 515 2048 2048 +i r n n n p 8192 1040 515 515 1040 1040 +i r n n n p 10 1029 515 515 1029 1029 +i r n n n p 24 1040 2050 2050 1040 1040 +i r n n n p 1024 1029 2050 2050 1029 1029 +i r n n n p 480 660 2050 2050 660 660 +i r n n n p 481 660 2050 2050 660 660 +i r n n n p 482 660 2050 2050 660 660 +i r n n n p 483 660 2050 2050 660 660 +i r n n n p 484 660 2050 2050 660 660 +i r n n n p 485 660 2050 2050 660 660 +i r n n n p 480 679 2050 2050 679 679 +i r n n n p 481 679 2050 2050 679 679 +i r n n n p 482 679 2050 2050 679 679 +i r n n n p 483 679 2050 2050 679 679 +i r n n n p 484 679 2050 2050 679 679 +i r n n n p 485 679 2050 2050 679 679 +i r n n n p 480 690 2050 2050 690 690 +i r n n n p 481 690 2050 2050 690 690 +i r n n n p 482 690 2050 2050 690 690 +i r n n n p 483 690 2050 2050 690 690 +i r n n n p 484 690 2050 2050 690 690 +i r n n n p 485 690 2050 2050 690 690 +i r n n n p 480 660 2048 2048 660 660 +i r n n n p 481 660 2048 2048 660 660 +i r n n n p 482 660 2048 2048 660 660 +i r n n n p 483 660 2048 2048 660 660 +i r n n n p 484 660 2048 2048 660 660 +i r n n n p 485 660 2048 2048 660 660 +i r n n n p 480 679 2048 2048 679 679 +i r n n n p 481 679 2048 2048 679 679 +i r n n n p 482 679 2048 2048 679 679 +i r n n n p 483 679 2048 2048 679 679 +i r n n n p 484 679 2048 2048 679 679 +i r n n n p 485 679 2048 2048 679 679 +i r n n n p 480 690 2048 2048 690 690 +i r n n n p 481 690 2048 2048 690 690 +i r n n n p 482 690 2048 2048 690 690 +i r n n n p 483 690 2048 2048 690 690 +i r n n n p 484 690 2048 2048 690 690 +i 
r n n n p 485 690 2048 2048 690 690 +i r n n n p 480 656 1024 1024 656 656 +i r n n n p 480 128 3 3 128 128 +i r n n n p 1024 512 515 515 512 512 +i r n n n p 1024 2048 1024 1024 2048 2048 +i r n n n p 1024 2048 515 515 2048 2048 +i r n n n p 1024 1040 515 515 1040 1040 +i r n n n p 5 1029 515 515 1029 1029 +i r n n n p 1024 1029 515 515 1029 1029 +i r n n n p 1024 1040 2050 2050 1040 1040 +i r n n n p 1029 1029 2050 2050 1029 1029 +i r n n n R 480 646 2050 2050 646 646 +i r n n n R 481 646 2050 2050 646 646 +i r n n n R 482 646 2050 2050 646 646 +i r n n n R 483 646 2050 2050 646 646 +i r n n n R 484 646 2050 2050 646 646 +i r n n n R 485 646 2050 2050 646 646 +i r n n n R 481 656 2050 2050 656 656 +i r n n n R 482 656 2050 2050 656 656 +i r n n n R 483 656 2050 2050 656 656 +i r n n n R 484 656 2050 2050 656 656 +i r n n n p 485 656 2050 2050 656 656 +i r n n n p 480 672 2050 2050 672 672 +i r n n n p 481 672 2050 2050 672 672 +i r n n n p 482 672 2050 2050 672 672 +i r n n n p 483 672 2050 2050 672 672 +i r n n n p 484 672 2050 2050 672 672 +i r n n n p 485 672 2050 2050 672 672 +i r n n n p 480 688 2050 2050 688 688 +i r n n n p 481 688 2050 2050 688 688 +i r n n n r 482 688 2050 2050 688 688 +i r n n n r 483 688 2050 2050 688 688 +i r n n n r 484 688 2050 2050 688 688 +i r n n n r 485 688 2050 2050 688 688 +i r n n n r 1024 512 64 64 512 512 +i r n n n r 16 256 512 512 256 256 +i r n n n r 480 640 512 512 640 640 +i r n n n r 64 768 512 512 768 768 +i r n n n r 128 128 128 128 128 128 +i r n n n r 1024 64 512 512 64 64 +i r n n n r 1024 256 32 32 256 256 +i r n n n r 1024 512 64 64 512 512 +i r n n n r 480 640 512 512 640 640 +i r n n n p 1024 32 256 256 32 32 +i r n n n P 1024 64 512 512 64 64 +i r n n n P 64 800 320 320 800 800 +i r n n n P 64 768 512 512 768 768 +i r n n n P 16 256 512 512 256 256 +i r n n n P 128 128 128 128 128 128 +i r n n n P 256 512 256 256 512 512 +i r n n n P 1024 1024 1024 1024 1024 1024 +i r n n n P 480 640 1024 1024 640 640 +i r n 
n n P 480 640 256 256 640 640 +i r n n n P 8 64 32 32 64 64 +i r n n n P 9 64 32 32 64 64 +i r n n n P 10 128 64 64 128 128 +i r n n n P 8 8 8 8 8 8 +i r n n n P 12 12 12 12 12 12 +i r n n n P 25 25 25 25 25 25 +i r n n n P 25 25 20 20 25 25 +f r n n n p 480 20 2050 2050 20 20 +f r n n n p 481 20 2050 2050 20 20 +f r n n n p 482 20 2050 2050 20 20 +f r n n n p 483 20 2050 2050 20 20 +f r n n n R 484 20 2050 2050 20 20 +f r n n n R 485 20 2050 2050 20 20 +f r n n n R 480 39 2050 2050 39 39 +f r n n n R 481 39 2050 2050 39 39 +f r n n n R 482 39 2050 2050 39 39 +f r n n n R 483 39 2050 2050 39 39 +f r n n n R 484 39 2050 2050 39 39 +f r n n n p 485 39 2050 2050 39 39 +f r n n n p 480 50 2050 2050 50 50 +f r n n n p 481 50 2050 2050 50 50 +f r n n n p 482 50 2050 2050 50 50 +f r n n n p 483 50 2050 2050 50 50 +f r n n n p 484 50 2050 2050 50 50 +f r n n n p 485 50 2050 2050 50 50 +f r n n n R 480 1108 2050 2050 1108 1108 +f r n n n R 481 1108 2050 2050 1108 1108 +f r n n n R 482 1108 2050 2050 1108 1108 +f r n n n R 483 1108 2050 2050 1108 1108 +f r n n n R 484 1108 2050 2050 1108 1108 +f r n n n R 485 1108 2050 2050 1108 1108 +f r n n n R 480 1127 2050 2050 1127 1127 +f r n n n R 481 1127 2050 2050 1127 1127 +f r n n n R 482 1127 2050 2050 1127 1127 +f r n n n R 483 1127 2050 2050 1127 1127 +f r n n n p 484 1127 2050 2050 1127 1127 +f r n n n p 485 1127 2050 2050 1127 1127 +f r n n n p 480 1138 2050 2050 1138 1138 +f r n n n p 481 1138 2050 2050 1138 1138 +f r n n n p 482 1138 2050 2050 1138 1138 +f r n n n p 483 1138 2050 2050 1138 1138 +f r n n n p 484 1138 2050 2050 1138 1138 +f r n n n p 485 1138 2050 2050 1138 1138 +f r n n n p 1 1 3 3 1 1 +f r n n n p 1 9 3 3 9 9 +f r n n n p 1 2048 3 3 2048 2048 +f r n n n p 1 2048 5192 5192 2048 2048 +f r n n n p 9 1 3 3 1 1 +f r n n n p 576 1 3500 3500 1 1 +f r n n n p 1 1 1 1 1 1 +f r n n n p 102 1088 1024 1024 1088 1088 +f r n n n p 102 2048 1024 1024 2048 2048 +f r n n n p 485 656 1024 1024 656 656 +f r n n n p 483 656 
1024 1024 656 656 +f r n n n p 81 128 3 3 128 128 +f r n n n p 1022 512 515 515 512 512 +f r n n n p 74 512 515 515 512 512 +f r n n n p 253 2048 515 515 2048 2048 +f r n n n p 8192 1040 515 515 1040 1040 +f r n n n p 10 1029 515 515 1029 1029 +f r n n n p 24 1040 2050 2050 1040 1040 +f r n n n p 1024 1029 2050 2050 1029 1029 +f r n n n p 480 660 2050 2050 660 660 +f r n n n p 481 660 2050 2050 660 660 +f r n n n p 482 660 2050 2050 660 660 +f r n n n p 483 660 2050 2050 660 660 +f r n n n p 484 660 2050 2050 660 660 +f r n n n p 485 660 2050 2050 660 660 +f r n n n p 480 679 2050 2050 679 679 +f r n n n p 481 679 2050 2050 679 679 +f r n n n p 482 679 2050 2050 679 679 +f r n n n p 483 679 2050 2050 679 679 +f r n n n p 484 679 2050 2050 679 679 +f r n n n p 485 679 2050 2050 679 679 +f r n n n p 480 690 2050 2050 690 690 +f r n n n p 481 690 2050 2050 690 690 +f r n n n p 482 690 2050 2050 690 690 +f r n n n p 483 690 2050 2050 690 690 +f r n n n p 484 690 2050 2050 690 690 +f r n n n p 485 690 2050 2050 690 690 +f r n n n p 480 660 2048 2048 660 660 +f r n n n p 481 660 2048 2048 660 660 +f r n n n p 482 660 2048 2048 660 660 +f r n n n p 483 660 2048 2048 660 660 +f r n n n p 484 660 2048 2048 660 660 +f r n n n p 485 660 2048 2048 660 660 +f r n n n p 480 679 2048 2048 679 679 +f r n n n p 481 679 2048 2048 679 679 +f r n n n p 482 679 2048 2048 679 679 +f r n n n p 483 679 2048 2048 679 679 +f r n n n p 484 679 2048 2048 679 679 +f r n n n p 485 679 2048 2048 679 679 +f r n n n p 480 690 2048 2048 690 690 +f r n n n p 481 690 2048 2048 690 690 +f r n n n p 482 690 2048 2048 690 690 +f r n n n p 483 690 2048 2048 690 690 +f r n n n p 484 690 2048 2048 690 690 +f r n n n p 485 690 2048 2048 690 690 +f r n n n p 480 656 1024 1024 656 656 +f r n n n p 480 128 3 3 128 128 +f r n n n p 1024 512 515 515 512 512 +f r n n n p 1024 2048 1024 1024 2048 2048 +f r n n n p 1024 2048 515 515 2048 2048 +f r n n n p 1024 1040 515 515 1040 1040 +f r n n n p 5 1029 515 515 1029 
1029 +f r n n n p 1024 1029 515 515 1029 1029 +f r n n n p 1024 1040 2050 2050 1040 1040 +f r n n n p 1029 1029 2050 2050 1029 1029 +f r n n n R 480 646 2050 2050 646 646 +f r n n n R 481 646 2050 2050 646 646 +f r n n n R 482 646 2050 2050 646 646 +f r n n n R 483 646 2050 2050 646 646 +f r n n n R 484 646 2050 2050 646 646 +f r n n n R 485 646 2050 2050 646 646 +f r n n n R 481 656 2050 2050 656 656 +f r n n n R 482 656 2050 2050 656 656 +f r n n n R 483 656 2050 2050 656 656 +f r n n n R 484 656 2050 2050 656 656 +f r n n n p 485 656 2050 2050 656 656 +f r n n n p 480 672 2050 2050 672 672 +f r n n n p 481 672 2050 2050 672 672 +f r n n n p 482 672 2050 2050 672 672 +f r n n n p 483 672 2050 2050 672 672 +f r n n n p 484 672 2050 2050 672 672 +f r n n n p 485 672 2050 2050 672 672 +f r n n n p 480 688 2050 2050 688 688 +f r n n n p 481 688 2050 2050 688 688 +f r n n n r 482 688 2050 2050 688 688 +f r n n n r 483 688 2050 2050 688 688 +f r n n n r 484 688 2050 2050 688 688 +f r n n n r 485 688 2050 2050 688 688 +f r n n n r 1024 512 64 64 512 512 +f r n n n r 16 256 512 512 256 256 +f r n n n r 480 640 512 512 640 640 +f r n n n r 64 768 512 512 768 768 +f r n n n r 128 128 128 128 128 128 +f r n n n r 1024 64 512 512 64 64 +f r n n n r 1024 256 32 32 256 256 +f r n n n r 1024 512 64 64 512 512 +f r n n n r 480 640 512 512 640 640 +f r n n n p 1024 32 256 256 32 32 +f r n n n P 1024 64 512 512 64 64 +f r n n n P 64 800 320 320 800 800 +f r n n n P 64 768 512 512 768 768 +f r n n n P 16 256 512 512 256 256 +f r n n n P 128 128 128 128 128 128 +f r n n n P 256 512 256 256 512 512 +f r n n n P 1024 1024 1024 1024 1024 1024 +f r n n n P 480 640 1024 1024 640 640 +f r n n n P 480 640 256 256 640 640 +f r n n n P 8 64 32 32 64 64 +f r n n n P 9 64 32 32 64 64 +f r n n n P 10 128 64 64 128 128 +f r n n n P 8 8 8 8 8 8 +f r n n n P 12 12 12 12 12 12 +f r n n n P 25 25 25 25 25 25 +f r n n n P 25 25 20 20 25 25 +i r n n n r 4096 256 5 5 256 256 +i r n n n r 3000 256 128 
128 256 256 +i r n n n r 4096 1024 512 512 1024 1024 +i r n n n r 144 256 5 5 256 256 +i r n n n r 144 256 128 128 256 256 +i r n n n r 144 1024 512 512 1024 1024 +i r n n n r 480 688 256 256 688 688 +i r n n n r 480 640 512 512 640 640 +i r n n n r 480 640 1024 1024 640 640 +i r n n n r 64 800 320 320 800 800 +i r n n n r 64 768 512 512 768 768 +i r n n n r 16 256 512 512 256 256 +i r n n n r 128 128 128 128 128 128 +i r n n n r 256 512 256 256 512 512 +i r n n n r 1024 1024 1024 1024 1024 1024 +i r n n n r 1024 32 256 256 32 32 +i r n n n r 1024 64 512 512 64 64 +i r n n n r 1024 256 32 32 256 256 +i r n n n r 1024 512 64 64 512 512 +i r n n n r 512 32 256 256 32 32 +i r n n n r 512 768 512 512 768 768 +i r n n n r 512 256 32 32 256 256 +i r n n n r 512 512 64 64 512 512 +i r n n n r 512 256 768 768 256 256 +i r n n n r 768 768 1024 1024 768 768 +i r n n n r 768 768 768 768 768 768 +i r n n n r 2048 2048 2048 2048 2048 2048 +i r n n n r 4096 4096 4096 4096 4096 4096 +f r n n n r 4096 256 5 5 256 256 +f r n n n r 3000 256 128 128 256 256 +f r n n n r 4096 1024 512 512 1024 1024 +f r n n n r 144 256 5 5 256 256 +f r n n n r 144 256 128 128 256 256 +f r n n n r 144 1024 512 512 1024 1024 +f r n n n r 480 688 256 256 688 688 +f r n n n r 480 640 512 512 640 640 +f r n n n r 480 640 1024 1024 640 640 +f r n n n r 64 800 320 320 800 800 +f r n n n r 64 768 512 512 768 768 +f r n n n r 16 256 512 512 256 256 +f r n n n r 128 128 128 128 128 128 +f r n n n r 256 512 256 256 512 512 +f r n n n r 1024 1024 1024 1024 1024 1024 +f r n n n r 1024 32 256 256 32 32 +f r n n n r 1024 64 512 512 64 64 +f r n n n r 1024 256 32 32 256 256 +f r n n n r 1024 512 64 64 512 512 +f r n n n r 512 32 256 256 32 32 +f r n n n r 512 768 512 512 768 768 +f r n n n r 512 256 32 32 256 256 +f r n n n r 512 512 64 64 512 512 +f r n n n r 512 256 768 768 256 256 +f r n n n r 768 768 1024 1024 768 768 +f r n n n r 768 768 768 768 768 768 +f r n n n r 2048 2048 2048 2048 2048 2048 +f r n n n r 
4096 4096 4096 4096 4096 4096 +f r n n n r 2048 1024 1024 1024 1024 1024 +f r n n n r 2048 4096 1024 1024 4096 4096 +f r n n n r 2048 1024 4096 4096 1024 1024 +f r n n n r 2048 1024 2 2 1024 1024 +f r n n n r 128 1024 1024 1024 1024 1024 +f r n n n r 1536 768 768 768 768 768 +f r n n n r 1536 3072 768 768 3072 3072 +f r n n n r 1536 768 3072 3072 768 768 +f r n n n r 1536 768 2 2 768 768 +f r n n n r 128 768 768 768 768 768 +f r n n n r 1024 8 13 13 8 8 +f r n n n r 1024 4 8 8 4 4 +f r n n n r 1024 128 355 355 128 128 +f r n n n r 1024 64 128 128 64 64 +f r n n n r 1024 1 64 64 1 1 +f r n n n r 480 1 256 256 1 1 +f r n n n r 480 256 512 512 256 256 +f r n n n r 480 1024 845 845 1024 1024 +f r n n n r 480 512 1024 1024 512 512 +f r n n n r 10 17191 128 128 17191 17191 +f r n n n r 10 512 256 256 512 512 diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 47d6491c94..765c293f8c 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -138,7 +138,10 @@ GEN_FILL_ARRAY_POST_OPS_FUNC(float) void mat_mul_ ## BLAS_SFX \ ( \ char stor_order, \ - char op_t, \ + char transa, \ + char transb, \ + char op_a, \ + char op_b, \ dim_t m, \ dim_t n, \ dim_t k, \ @@ -154,21 +157,26 @@ void mat_mul_ ## BLAS_SFX \ ) \ { \ char storage = stor_order; \ - char transa = 'n'; \ - char transb = 'n'; \ char reordera = 'n'; \ char reorderb = 'n'; \ \ - if ( ( op_t == 'p' ) || ( op_t == 'P' ) ) \ + if ( ( op_a == 'p' ) || ( op_a == 'P' ) ) \ + { \ + reordera = 'p'; \ + } \ + else if ( ( op_a == 'r' ) || ( op_a == 'R' ) ) \ + { \ + reordera = 'r'; \ + } \ + \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ { \ /* No reordering of B.*/ \ - reordera = 'n'; \ reorderb = 'n'; \ } \ - else if ( ( op_t == 'r' ) || ( op_t == 'R' ) ) \ + else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ { \ /* Reordered B.*/ \ - reordera = 'n'; \ reorderb = 'r'; \ } \ \ @@ -273,7 +281,7 @@ void print_result { double gflops = get_gflops( m, n, 
k, runtime ); printf("%s m: %ld, n: %ld, k: %ld, lda: %ld, ldb: %ld, ldc: %ld," \ - " Gops: %f, n_repeats: %d\n", + " Gops: %f, n_repeats: %d\n", msg, m, n, k, lda, ldb, ldc, gflops, n_repeats); } @@ -281,7 +289,10 @@ void print_result void mat_mul_bench_driver_ ## BLAS_SFX \ ( \ char stor_order, \ - char op_t, \ + char transa, \ + char transb, \ + char op_a, \ + char op_b, \ int32_t n_repeats, \ dim_t m, \ dim_t n, \ @@ -310,7 +321,7 @@ void mat_mul_bench_driver_ ## BLAS_SFX \ \ GEN_FUNC_NAME(mat_mul_,BLAS_SFX) \ ( \ - stor_order, op_t, m, n, k, \ + stor_order, transa, transb, op_a, op_b, m, n, k, \ alpha, \ a, lda, \ b, ldb, \ @@ -589,6 +600,8 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \ ( \ FILE* fout, \ const char stor_order, \ + char transa, \ + char transb, \ dim_t m, \ dim_t n, \ dim_t k, \ @@ -605,10 +618,28 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \ aocl_post_op* post_op\ ) \ { \ - dim_t rs_a = lda; \ - dim_t cs_a = 1; \ - dim_t rs_b = ldb; \ - dim_t cs_b = 1; \ + dim_t rs_a, cs_a; \ + if( ( transa == 'n' ) || ( transa == 'N' ) ) \ + { \ + rs_a = lda; \ + cs_a = 1; \ + } \ + else \ + { \ + rs_a = 1; \ + cs_a = lda; \ + } \ + dim_t rs_b, cs_b; \ + if( ( transb == 'n' ) || ( transb == 'N' ) ) \ + { \ + rs_b = ldb; \ + cs_b = 1; \ + } \ + else \ + { \ + rs_b = 1; \ + cs_b = ldb; \ + } \ dim_t rs_c = ldc; \ dim_t cs_c = 1; \ dim_t rs_c_ref = ldc_ref; \ @@ -616,10 +647,26 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \ \ if ( ( stor_order == 'C' ) || ( stor_order == 'c' ) ) \ { \ - rs_a = 1; \ - cs_a = lda; \ - rs_b = 1; \ - cs_b = ldb; \ + if( transa == 'n' || transa == 'N') \ + { \ + rs_a = 1; \ + cs_a = lda; \ + } \ + else \ + { \ + rs_a = lda; \ + cs_a = 1; \ + } \ + if( ( transb == 'n' ) || ( transb == 'N' ) ) \ + { \ + rs_b = 1; \ + cs_b = ldb; \ + } \ + else \ + { \ + rs_b = ldb; \ + cs_b = 1; \ + } \ rs_c = 1; \ cs_c = ldc; \ rs_c_ref = 1; \ @@ -1033,7 +1080,10 @@ void mat_mul_bench_main_ ## BLAS_SFX \ FILE* fin, \ FILE* fout, \ 
char stor_order, \ - char op_t, \ + char transa, \ + char transb, \ + char op_a, \ + char op_b, \ int32_t m, \ int32_t n, \ int32_t k, \ @@ -1043,9 +1093,16 @@ void mat_mul_bench_main_ ## BLAS_SFX \ char* post_ops_str \ ) \ { \ - if ( ( op_t != 'p' ) && ( op_t != 'P' ) && ( op_t != 'r' ) && ( op_t != 'R' ) ) \ + /* Reorder and pack of A matrix is not supported */ \ + if( ( op_a != 'N' ) && ( op_a != 'n' ) ) \ { \ - printf("The op_t ( 2nd arg in input.txt) is not valid\n"); \ + printf("The op_a ( 4th arg in input.txt) is not valid\n"); \ + return; \ + } \ + \ + if ( ( op_b != 'p' ) && ( op_b != 'P' ) && ( op_b != 'r' ) && ( op_b != 'R' ) && ( op_b != 'n' ) && ( op_b != 'N' ) ) \ + { \ + printf("The op_b ( 5th arg in input.txt) is not valid\n"); \ return; \ } \ \ @@ -1100,12 +1157,12 @@ void mat_mul_bench_main_ ## BLAS_SFX \ } \ } \ \ - if ( ( op_t == 'p' ) || ( op_t == 'P' ) ) \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ { \ /* No reordering of B.*/ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ ( \ - stor_order, op_t, n_repeats, m, n, k, \ + stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ alpha, \ a, stride_a, \ b, stride_b, \ @@ -1114,18 +1171,18 @@ void mat_mul_bench_main_ ## BLAS_SFX \ post_op \ ); \ } \ - else if ( ( op_t == 'r' ) || ( op_t == 'R' ) ) \ + else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ { \ /* Reorder B.*/ \ siz_t b_reorder_buf_siz_req = \ - GEN_FUNC_NAME(aocl_get_reorder_buf_size_,REORDER_SFX)( 'B', k, n ); \ + GEN_FUNC_NAME(aocl_get_reorder_buf_size_,REORDER_SFX)( stor_order, transb, 'B', k, n ); \ \ B_type* b_reorder = ( B_type* ) bli_malloc_user( b_reorder_buf_siz_req, &bli_errors ); \ - GEN_FUNC_NAME(aocl_reorder_,REORDER_SFX)( 'B', b, b_reorder, k, n, stride_b ); \ + GEN_FUNC_NAME(aocl_reorder_,REORDER_SFX)( stor_order, transb, 'B', b, b_reorder, k, n, stride_b ); \ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ ( \ - stor_order, op_t, n_repeats, m, n, k, \ + stor_order, transa, transb, op_a, op_b, n_repeats, m, n, 
k, \ alpha, \ a, stride_a, \ b_reorder, stride_b, \ @@ -1142,7 +1199,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ printf("Running accuracy check.\n"); \ GEN_FUNC_NAME(mat_mul_accuracy_check_driver_,BLAS_SFX) \ ( \ - fout, stor_order, m, n, k, \ + fout, stor_order, transa, transb, m, n, k, \ alpha, \ a, stride_a, \ b, stride_b, \ @@ -1188,8 +1245,11 @@ void mat_mul_bench_main_ ## BLAS_SFX \ ( \ FILE* fin, \ FILE* fout, \ - char stor_order, \ - char op_t, \ + char stor_order, \ + char transa, \ + char transb, \ + char op_a, \ + char op_b, \ int32_t m, \ int32_t n, \ int32_t k, \ @@ -1199,9 +1259,14 @@ void mat_mul_bench_main_ ## BLAS_SFX \ char* post_ops_str \ ) \ { \ - if ( ( op_t != 'p' ) && ( op_t != 'P' ) && ( op_t != 'r' ) && ( op_t != 'R' ) ) \ + /* Reorder is not supported for A matrix*/ \ + if( ( op_a != 'p' ) && ( op_a != 'P' ) && ( op_a != 'n' ) && ( op_a != 'N' ) ) \ { \ - printf("The op_t ( 2nd arg in input.txt) is not valid\n");\ + printf("The op_a (4th arg in input.txt) is not valid\n"); \ + } \ + if ( ( op_b != 'p' ) && ( op_b != 'P' ) && ( op_b != 'r' ) && ( op_b != 'R' ) && ( op_b != 'N' ) && ( op_b != 'n' ) ) \ + { \ + printf("The op_b ( 5th arg in input.txt) is not valid\n");\ return; \ } \ \ @@ -1216,9 +1281,10 @@ void mat_mul_bench_main_ ## BLAS_SFX \ bfloat16* a = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * m * k, &bli_errors ); \ float *a_float = bli_malloc_user( m * k * sizeof( float ), &bli_errors); \ for ( int32_t i = 0; i < m*k; ++i ) \ - { \ - a_float[i] = ( float ) ( i % 5 ); \ - } \ + { \ + a_float[i] = ( float ) ( i % 5 ); \ + } \ + \ convert_float_arr_to_bf16( a_float, a, m * k ); \ \ bfloat16* b = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * n * k, &bli_errors ); \ @@ -1265,12 +1331,12 @@ void mat_mul_bench_main_ ## BLAS_SFX \ } \ } \ \ - if ( ( op_t == 'p' ) || ( op_t == 'P' ) ) \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ { \ /* No reordering of B.*/ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ ( \ - 
stor_order, op_t, n_repeats, m, n, k, \ + stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ alpha, \ a, stride_a, \ b, stride_b, \ @@ -1279,18 +1345,18 @@ void mat_mul_bench_main_ ## BLAS_SFX \ post_op \ ); \ } \ - else if ( ( op_t == 'r' ) || ( op_t == 'R' ) ) \ + else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ { \ /* Reorder B.*/ \ siz_t b_reorder_buf_siz_req = \ - aocl_get_reorder_buf_size_bf16bf16f32of32( 'B', k, n ); \ + aocl_get_reorder_buf_size_bf16bf16f32of32( stor_order, transb, 'B', k, n ); \ \ bfloat16* b_reorder = ( bfloat16* ) bli_malloc_user( b_reorder_buf_siz_req, &bli_errors ); \ - aocl_reorder_bf16bf16f32of32( 'B', b, b_reorder, k, n, stride_b ); \ + aocl_reorder_bf16bf16f32of32( stor_order, transb, 'B', b, b_reorder, k, n, stride_b ); \ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ ( \ - stor_order, op_t, n_repeats, m, n, k, \ + stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ alpha, \ a, stride_a, \ b_reorder, stride_b, \ @@ -1305,7 +1371,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ printf(" Running accuracy check.\n"); \ GEN_FUNC_NAME(mat_mul_accuracy_check_driver_,BLAS_SFX) \ ( \ - fout, stor_order, m, n, k, \ + fout, stor_order, transa, transb, m, n, k, \ alpha, \ a, stride_a, \ b, stride_b, \ @@ -1450,8 +1516,9 @@ int main( int argc, char** argv ) fout = fopen( "lpgemm_accuracy_test_failures.txt", "w" ); char op_type_char; - char op_t; + char op_a, op_b; char stor_order; + char transa, transb; int32_t m, n, k; int32_t stride_a, stride_b, stride_c; @@ -1490,9 +1557,9 @@ int main( int argc, char** argv ) } // Input format: data_type stor_type pack/reorder m n k lda ldb ldc - while ( fscanf( fin, "%c %c %c %d %d %d %d %d %d\n", - &op_type_char, &stor_order, &op_t, &m, &n, &k, - &stride_a, &stride_b, &stride_c ) == 9 ) + while ( fscanf( fin, "%c %c %c %c %c %c %d %d %d %d %d %d\n", + &op_type_char, &stor_order, &transa, &transb, &op_a, &op_b, &m, &n, &k, + &stride_a, &stride_b, &stride_c ) == 12 ) { stor_order = ( ( 
stor_order == 'r' ) || ( stor_order == 'R' ) || ( stor_order == 'c' ) || ( stor_order == 'C' ) ) ? @@ -1504,7 +1571,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os32) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1513,7 +1580,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os8) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1523,7 +1590,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,f32f32f32of32) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1534,7 +1601,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os16) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1543,7 +1610,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os8) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1555,7 +1622,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_, bf16bf16f32of32) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1564,7 +1631,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_, bf16bf16f32obf16) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1576,7 +1643,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os32) ( - fin, fout, stor_order, 
op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1585,7 +1652,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os8) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1597,7 +1664,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os16) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); @@ -1606,7 +1673,7 @@ int main( int argc, char** argv ) { GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os8) ( - fin, fout, stor_order, op_t, + fin, fout, stor_order, transa, transb, op_a, op_b, m, n, k, stride_a, stride_b, stride_c, post_ops_str_dest ); diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packa_bf16_amd256vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packa_bf16_amd256vnni.c new file mode 100644 index 0000000000..183677e96a --- /dev/null +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packa_bf16_amd256vnni.c @@ -0,0 +1,1493 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include +#include "blis.h" + +#ifdef BLIS_ADDON_LPGEMM + + +#define UNPACKLO_EPI16 \ + b_reg[0] = _mm256_unpacklo_epi16(a_reg[0], a_reg[1]); \ + b_reg[1] = _mm256_unpacklo_epi16(a_reg[2], a_reg[3]); \ + b_reg[2] = _mm256_unpacklo_epi16(a_reg[4], a_reg[5]); \ + b_reg[3] = _mm256_unpacklo_epi16(a_reg[6], a_reg[7]); \ + b_reg[4] = _mm256_unpacklo_epi16(a_reg[8], a_reg[9]); \ + b_reg[5] = _mm256_unpacklo_epi16(a_reg[10], a_reg[11]); \ + b_reg[6] = _mm256_unpacklo_epi16(a_reg[12], a_reg[13]); \ + b_reg[7] = _mm256_unpacklo_epi16(a_reg[14], a_reg[15]); + +#define UNPACKHI_EPI16 \ + b_reg[8] = _mm256_unpackhi_epi16(a_reg[0], a_reg[1]); \ + b_reg[9] = _mm256_unpackhi_epi16(a_reg[2], a_reg[3]); \ + b_reg[10] = _mm256_unpackhi_epi16(a_reg[4], a_reg[5]); \ + b_reg[11] = _mm256_unpackhi_epi16(a_reg[6], a_reg[7]); \ + b_reg[12] = _mm256_unpackhi_epi16(a_reg[8], a_reg[9]); \ + b_reg[13] = _mm256_unpackhi_epi16(a_reg[10], a_reg[11]); \ + b_reg[14] = _mm256_unpackhi_epi16(a_reg[12], a_reg[13]); \ + 
b_reg[15] = _mm256_unpackhi_epi16(a_reg[14], a_reg[15]); + +#define UNPACKLO_EPI32 \ + a_reg[0] = _mm256_unpacklo_epi32(b_reg[0], b_reg[1]); \ + a_reg[1] = _mm256_unpacklo_epi32(b_reg[2], b_reg[3]); \ + a_reg[2] = _mm256_unpacklo_epi32(b_reg[4], b_reg[5]); \ + a_reg[3] = _mm256_unpacklo_epi32(b_reg[6], b_reg[7]); \ +\ + a_reg[8] = _mm256_unpacklo_epi32(b_reg[8], b_reg[9]); \ + a_reg[9] = _mm256_unpacklo_epi32(b_reg[10], b_reg[11]); \ + a_reg[10] = _mm256_unpacklo_epi32(b_reg[12], b_reg[13]); \ + a_reg[11] = _mm256_unpacklo_epi32(b_reg[14], b_reg[15]); + +#define UNPACKHI_EPI32 \ + a_reg[4] = _mm256_unpackhi_epi32(b_reg[0], b_reg[1]); \ + a_reg[5] = _mm256_unpackhi_epi32(b_reg[2], b_reg[3]); \ + a_reg[6] = _mm256_unpackhi_epi32(b_reg[4], b_reg[5]); \ + a_reg[7] = _mm256_unpackhi_epi32(b_reg[6], b_reg[7]); \ +\ + a_reg[12] = _mm256_unpackhi_epi32(b_reg[8], b_reg[9]); \ + a_reg[13] = _mm256_unpackhi_epi32(b_reg[10], b_reg[11]); \ + a_reg[14] = _mm256_unpackhi_epi32(b_reg[12], b_reg[13]); \ + a_reg[15] = _mm256_unpackhi_epi32(b_reg[14], b_reg[15]); + +#define UNPACKLO_EPI64 \ + b_reg[0] = _mm256_unpacklo_epi64(a_reg[0], a_reg[1]); \ + b_reg[1] = _mm256_unpacklo_epi64(a_reg[2], a_reg[3]); \ + b_reg[2] = _mm256_unpacklo_epi64(a_reg[4], a_reg[5]); \ + b_reg[3] = _mm256_unpacklo_epi64(a_reg[6], a_reg[7]); \ +\ + b_reg[8] = _mm256_unpacklo_epi64(a_reg[8], a_reg[9]); \ + b_reg[9] = _mm256_unpacklo_epi64(a_reg[10], a_reg[11]); \ + b_reg[10] = _mm256_unpacklo_epi64(a_reg[12], a_reg[13]); \ + b_reg[11] = _mm256_unpacklo_epi64(a_reg[14], a_reg[15]); + +#define UNPACKHI_EPI64 \ + b_reg[4] = _mm256_unpackhi_epi64(a_reg[0], a_reg[1]); \ + b_reg[5] = _mm256_unpackhi_epi64(a_reg[2], a_reg[3]); \ + b_reg[6] = _mm256_unpackhi_epi64(a_reg[4], a_reg[5]); \ + b_reg[7] = _mm256_unpackhi_epi64(a_reg[6], a_reg[7]); \ +\ + b_reg[12] = _mm256_unpackhi_epi64(a_reg[8], a_reg[9]); \ + b_reg[13] = _mm256_unpackhi_epi64(a_reg[10], a_reg[11]); \ + b_reg[14] = _mm256_unpackhi_epi64(a_reg[12], 
a_reg[13]); \ + b_reg[15] = _mm256_unpackhi_epi64(a_reg[14], a_reg[15]); + +#define SHUFFLE_64x2 \ + a_reg[0] = _mm256_shuffle_i64x2(b_reg[0], b_reg[1], 0x0); \ + a_reg[1] = _mm256_shuffle_i64x2(b_reg[0], b_reg[1], 0x3); \ + a_reg[2] = _mm256_shuffle_i64x2(b_reg[2], b_reg[3], 0x0); \ + a_reg[3] = _mm256_shuffle_i64x2(b_reg[2], b_reg[3], 0x3); \ +\ + a_reg[4] = _mm256_shuffle_i64x2(b_reg[4], b_reg[5], 0x0); \ + a_reg[5] = _mm256_shuffle_i64x2(b_reg[4], b_reg[5], 0x3); \ + a_reg[6] = _mm256_shuffle_i64x2(b_reg[6], b_reg[7], 0x0); \ + a_reg[7] = _mm256_shuffle_i64x2(b_reg[6], b_reg[7], 0x3); \ +\ + a_reg[8] = _mm256_shuffle_i64x2(b_reg[8], b_reg[9], 0x0); \ + a_reg[9] = _mm256_shuffle_i64x2(b_reg[8], b_reg[9], 0x3); \ + a_reg[10] = _mm256_shuffle_i64x2(b_reg[10], b_reg[11], 0x0); \ + a_reg[11] = _mm256_shuffle_i64x2(b_reg[10], b_reg[11], 0x3); \ +\ + a_reg[12] = _mm256_shuffle_i64x2(b_reg[12], b_reg[13], 0x0); \ + a_reg[13] = _mm256_shuffle_i64x2(b_reg[12], b_reg[13], 0x3); \ + a_reg[14] = _mm256_shuffle_i64x2(b_reg[14], b_reg[15], 0x0); \ + a_reg[15] = _mm256_shuffle_i64x2(b_reg[14], b_reg[15], 0x3); + +#define MASKED_STORE_EPI64(mask) \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+0) * KC + kr ), mask, a_reg[0]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+1) * KC + kr ), mask, a_reg[4]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+2) * KC + kr ), mask, a_reg[2]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+3) * KC + kr ), mask, a_reg[6]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+4) * KC + kr ), mask, a_reg[8]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+5) * KC + kr ), mask, a_reg[12]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+6) * KC + kr ), mask, a_reg[10]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+7) * KC + kr ), mask, a_reg[14]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+8) * KC + kr ), mask, a_reg[1]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+9) * KC + kr ), mask, a_reg[5]); \ + 
_mm256_mask_storeu_epi64((pack_a_buffer + (ic+10) * KC + kr ), mask, a_reg[3]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+11) * KC + kr ), mask, a_reg[7]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+12) * KC + kr ), mask, a_reg[9]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+13) * KC + kr ), mask, a_reg[13]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+14) * KC + kr ), mask, a_reg[11]); \ + _mm256_mask_storeu_epi64((pack_a_buffer + (ic+15) * KC + kr ), mask, a_reg[15]); + +#define MASKED_STORE_EPI32(mask) \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+0) * KC + kr ), mask, a_reg[0]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+1) * KC + kr ), mask, a_reg[4]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+2) * KC + kr ), mask, a_reg[2]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+3) * KC + kr ), mask, a_reg[6]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+4) * KC + kr ), mask, a_reg[8]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+5) * KC + kr ), mask, a_reg[12]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+6) * KC + kr ), mask, a_reg[10]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+7) * KC + kr ), mask, a_reg[14]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+8) * KC + kr ), mask, a_reg[1]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+9) * KC + kr ), mask, a_reg[5]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+10) * KC + kr ), mask, a_reg[3]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+11) * KC + kr ), mask, a_reg[7]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+12) * KC + kr ), mask, a_reg[9]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+13) * KC + kr ), mask, a_reg[13]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+14) * KC + kr ), mask, a_reg[11]); \ + _mm256_mask_storeu_epi32((pack_a_buffer + (ic+15) * KC + kr ), mask, a_reg[15]); + +#define MASKED_STORE_EPI16(mask) \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+0) * KC + kr ), mask, a_reg[0]); \ 
+ _mm256_mask_storeu_epi16((pack_a_buffer + (ic+1) * KC + kr ), mask, a_reg[4]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+2) * KC + kr ), mask, a_reg[2]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+3) * KC + kr ), mask, a_reg[6]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+4) * KC + kr ), mask, a_reg[8]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+5) * KC + kr ), mask, a_reg[12]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+6) * KC + kr ), mask, a_reg[10]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+7) * KC + kr ), mask, a_reg[14]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+8) * KC + kr ), mask, a_reg[1]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+9) * KC + kr ), mask, a_reg[5]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+10) * KC + kr ), mask, a_reg[3]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+11) * KC + kr ), mask, a_reg[7]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+12) * KC + kr ), mask, a_reg[9]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+13) * KC + kr ), mask, a_reg[13]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+14) * KC + kr ), mask, a_reg[11]); \ + _mm256_mask_storeu_epi16((pack_a_buffer + (ic+15) * KC + kr ), mask, a_reg[15]); + +#define MASKED_LOAD_32_ROWS_AVX512( mask ) \ + a_reg[0] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[1] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[2] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[3] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[4] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 4 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[5] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 5 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[6] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 6 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[7] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 7 
) * rs_a ) + ( kr * cs_a )); \ + a_reg[8] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 8 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[9] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 9 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[10] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 10 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[11] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 11 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[12] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 12 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[13] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 13 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[14] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 14 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[15] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 15 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[16] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 16 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[17] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 17 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[18] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 18 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[19] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 19 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[20] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 20 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[21] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 21 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[22] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 22 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[23] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 23 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[24] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 24 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[25] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 25 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[26] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 26 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[27] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 27 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[28] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 28 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[29] = 
_mm512_maskz_loadu_epi16( mask, a + ( ( ic + 29 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[30] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 30 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[31] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 31 ) * rs_a ) + ( kr * cs_a )); + +#define MASKED_STORE_32_ROWS_AVX512( mask ) \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, mask, a_reg[0] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, mask, a_reg[1] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr, mask, a_reg[2] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr, mask, a_reg[3] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 4 ) * KC ) + kr, mask, a_reg[4] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 5 ) * KC ) + kr, mask, a_reg[5] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 6 ) * KC ) + kr, mask, a_reg[6] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 7 ) * KC ) + kr, mask, a_reg[7] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 8 ) * KC ) + kr, mask, a_reg[8] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 9 ) * KC ) + kr, mask, a_reg[9] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 10 ) * KC ) + kr, mask, a_reg[10] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 11 ) * KC ) + kr, mask, a_reg[11] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 12 ) * KC ) + kr, mask, a_reg[12] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 13 ) * KC ) + kr, mask, a_reg[13] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 14 ) * KC ) + kr, mask, a_reg[14] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 15 ) * KC ) + kr, mask, a_reg[15] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 16 ) * KC ) + kr, mask, a_reg[16] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 17 ) * KC ) + kr, mask, a_reg[17] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic 
+ 18 ) * KC ) + kr, mask, a_reg[18] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 19 ) * KC ) + kr, mask, a_reg[19] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 20 ) * KC ) + kr, mask, a_reg[20] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 21 ) * KC ) + kr, mask, a_reg[21] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 22 ) * KC ) + kr, mask, a_reg[22] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 23 ) * KC ) + kr, mask, a_reg[23] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 24 ) * KC ) + kr, mask, a_reg[24] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 25 ) * KC ) + kr, mask, a_reg[25] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 26 ) * KC ) + kr, mask, a_reg[26] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 27 ) * KC ) + kr, mask, a_reg[27] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 28 ) * KC ) + kr, mask, a_reg[28] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 29 ) * KC ) + kr, mask, a_reg[29] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 30 ) * KC ) + kr, mask, a_reg[30] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 31 ) * KC ) + kr, mask, a_reg[31] ); + + +#define MASKED_LOAD_16_ROWS_AVX512( mask ) \ + a_reg[0] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[1] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[2] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[3] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[4] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 4 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[5] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 5 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[6] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 6 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[7] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 7 ) * rs_a ) + ( kr 
* cs_a )); \ + a_reg[8] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 8 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[9] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 9 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[10] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 10 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[11] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 11 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[12] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 12 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[13] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 13 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[14] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 14 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[15] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 15 ) * rs_a ) + ( kr * cs_a )); + +#define MASKED_STORE_16_ROWS_AVX512( mask ) \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, mask, a_reg[0] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, mask, a_reg[1] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr, mask, a_reg[2] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr, mask, a_reg[3] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 4 ) * KC ) + kr, mask, a_reg[4] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 5 ) * KC ) + kr, mask, a_reg[5] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 6 ) * KC ) + kr, mask, a_reg[6] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 7 ) * KC ) + kr, mask, a_reg[7] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 8 ) * KC ) + kr, mask, a_reg[8] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 9 ) * KC ) + kr, mask, a_reg[9] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 10 ) * KC ) + kr, mask, a_reg[10] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 11 ) * KC ) + kr, mask, a_reg[11] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 12 ) * KC ) + kr, mask, a_reg[12] ); \ + 
_mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 13 ) * KC ) + kr, mask, a_reg[13] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 14 ) * KC ) + kr, mask, a_reg[14] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 15 ) * KC ) + kr, mask, a_reg[15] ); + + +#define MASKED_LOAD_8_ROWS_AVX512( mask ) \ + a_reg[0] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[1] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[2] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[3] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[4] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 4 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[5] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 5 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[6] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 6 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[7] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 7 ) * rs_a ) + ( kr * cs_a )); + +#define MASKED_STORE_8_ROWS_AVX512( mask ) \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, mask, a_reg[0] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, mask, a_reg[1] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr, mask, a_reg[2] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr, mask, a_reg[3] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 4 ) * KC ) + kr, mask, a_reg[4] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 5 ) * KC ) + kr, mask, a_reg[5] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 6 ) * KC ) + kr, mask, a_reg[6] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 7 ) * KC ) + kr, mask, a_reg[7] ); + + +#define MASKED_LOAD_4_ROWS_AVX512( mask ) \ + a_reg[0] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[1] = _mm512_maskz_loadu_epi16( mask, a 
+ ( ( ic + 1 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[2] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a )); \ + a_reg[3] = _mm512_maskz_loadu_epi16( mask, a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a )); + +#define MASKED_STORE_4_ROWS_AVX512( mask ) \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, mask, a_reg[0] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, mask, a_reg[1] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr, mask, a_reg[2] ); \ + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr, mask, a_reg[3] ); + +void packa_mr16_bf16bf16f32of32_row_major + ( + bfloat16* pack_a_buffer, + const bfloat16* a, + const dim_t rs_a, + const dim_t cs_a, + const dim_t MC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ); + +void packa_mr16_bf16bf16f32of32_col_major + ( + bfloat16* pack_a_buffer, + const bfloat16* a, + const dim_t rs_a, + const dim_t cs_a, + const dim_t MC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ); + +void packa_mr16_bf16bf16f32of32 + ( + bfloat16* pack_a_buffer, + const bfloat16* a, + const dim_t rs_a, + const dim_t cs_a, + const dim_t MC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ) +{ + if( cs_a == 1 ) + { + packa_mr16_bf16bf16f32of32_row_major + ( pack_a_buffer, a, rs_a, cs_a, MC, KC, rs_p, cs_p); + } + else + { + packa_mr16_bf16bf16f32of32_col_major + ( pack_a_buffer, a, rs_a, cs_a, MC, KC, rs_p, cs_p); + } +} + +void packa_mr16_bf16bf16f32of32_row_major + ( + bfloat16* pack_a_buffer, + const bfloat16* a, + const dim_t rs_a, + const dim_t cs_a, + const dim_t MC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ) +{ + dim_t MR = 32; + + __m512i a_reg[32]; + + dim_t ic = 0, kr = 0; + + for( ic = 0; ( ic + MR - 1 ) < MC; ic += MR ) + { + for( kr = 0; ( kr + 32 - 1) < KC; kr += 32 ) + { + a_reg[0] = _mm512_loadu_si512( a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_loadu_si512( a + ( ( ic + 1 ) * rs_a ) + ( kr 
* cs_a ) ); + a_reg[2] = _mm512_loadu_si512( a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[3] = _mm512_loadu_si512( a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[4] = _mm512_loadu_si512( a + ( ( ic + 4 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[5] = _mm512_loadu_si512( a + ( ( ic + 5 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[6] = _mm512_loadu_si512( a + ( ( ic + 6 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[7] = _mm512_loadu_si512( a + ( ( ic + 7 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[8] = _mm512_loadu_si512( a + ( ( ic + 8 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[9] = _mm512_loadu_si512( a + ( ( ic + 9 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[10] = _mm512_loadu_si512( a + ( ( ic + 10 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[11] = _mm512_loadu_si512( a + ( ( ic + 11 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[12] = _mm512_loadu_si512( a + ( ( ic + 12 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[13] = _mm512_loadu_si512( a + ( ( ic + 13 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[14] = _mm512_loadu_si512( a + ( ( ic + 14 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[15] = _mm512_loadu_si512( a + ( ( ic + 15 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[16] = _mm512_loadu_si512( a + ( ( ic + 16 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[17] = _mm512_loadu_si512( a + ( ( ic + 17 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[18] = _mm512_loadu_si512( a + ( ( ic + 18 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[19] = _mm512_loadu_si512( a + ( ( ic + 19 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[20] = _mm512_loadu_si512( a + ( ( ic + 20 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[21] = _mm512_loadu_si512( a + ( ( ic + 21 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[22] = _mm512_loadu_si512( a + ( ( ic + 22 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[23] = _mm512_loadu_si512( a + ( ( ic + 23 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[24] = _mm512_loadu_si512( a + ( ( ic + 24 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[25] = _mm512_loadu_si512( a + ( ( ic + 25 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[26] = _mm512_loadu_si512( a + ( ( ic + 26 ) * rs_a ) + ( kr * cs_a ) ); + 
a_reg[27] = _mm512_loadu_si512( a + ( ( ic + 27 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[28] = _mm512_loadu_si512( a + ( ( ic + 28 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[29] = _mm512_loadu_si512( a + ( ( ic + 29 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[30] = _mm512_loadu_si512( a + ( ( ic + 30 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[31] = _mm512_loadu_si512( a + ( ( ic + 31 ) * rs_a ) + ( kr * cs_a ) ); + + + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr , a_reg[0] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr , a_reg[1] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr , a_reg[2] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr , a_reg[3] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 4 ) * KC ) + kr , a_reg[4] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 5 ) * KC ) + kr , a_reg[5] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 6 ) * KC ) + kr , a_reg[6] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 7 ) * KC ) + kr , a_reg[7] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 8 ) * KC ) + kr , a_reg[8] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 9 ) * KC ) + kr , a_reg[9] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 10 ) * KC ) + kr , a_reg[10] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 11 ) * KC ) + kr , a_reg[11] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 12 ) * KC ) + kr , a_reg[12] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 13 ) * KC ) + kr , a_reg[13] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 14 ) * KC ) + kr , a_reg[14] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 15 ) * KC ) + kr , a_reg[15] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 16 ) * KC ) + kr , a_reg[16] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 17 ) * KC ) + kr , a_reg[17] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 18 ) * KC ) + kr , a_reg[18] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 19 ) * KC ) + kr , a_reg[19] ); + 
_mm512_storeu_si512( pack_a_buffer + ( ( ic + 20 ) * KC ) + kr , a_reg[20] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 21 ) * KC ) + kr , a_reg[21] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 22 ) * KC ) + kr , a_reg[22] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 23 ) * KC ) + kr , a_reg[23] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 24 ) * KC ) + kr , a_reg[24] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 25 ) * KC ) + kr , a_reg[25] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 26 ) * KC ) + kr , a_reg[26] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 27 ) * KC ) + kr , a_reg[27] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 28 ) * KC ) + kr , a_reg[28] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 29 ) * KC ) + kr , a_reg[29] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 30 ) * KC ) + kr , a_reg[30] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 31 ) * KC ) + kr , a_reg[31] ); + } + for( ; ( kr + 15 ) < KC; kr += 16 ) + { + MASKED_LOAD_32_ROWS_AVX512( 0xFFFF ) + + MASKED_STORE_32_ROWS_AVX512( 0xFFFF ) + } + for( ; ( kr + 7 ) < KC; kr += 8 ) + { + MASKED_LOAD_32_ROWS_AVX512( 0xFF ) + + MASKED_STORE_32_ROWS_AVX512( 0xFF ) + } + for( ; ( kr + 3 ) < KC; kr += 4 ) + { + MASKED_LOAD_32_ROWS_AVX512( 0xF ) + + MASKED_STORE_32_ROWS_AVX512( 0xF ) + } + for( ; ( kr + 1 ) < KC; kr += 2 ) + { + MASKED_LOAD_32_ROWS_AVX512( 0x3 ) + + MASKED_STORE_32_ROWS_AVX512( 0x3 ) + } + for( ; ( kr ) < KC; kr += 1 ) + { + MASKED_LOAD_32_ROWS_AVX512( 0x1 ) + + MASKED_STORE_32_ROWS_AVX512( 0x1 ) + } + } + for( ; ( ic + 16 - 1 ) < MC; ic += 16 ) + { + for( kr = 0; ( kr + 32 - 1 ) < KC; kr += 32 ) + { + a_reg[0] = _mm512_loadu_si512( a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_loadu_si512( a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[2] = _mm512_loadu_si512( a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[3] = _mm512_loadu_si512( a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[4] = 
_mm512_loadu_si512( a + ( ( ic + 4 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[5] = _mm512_loadu_si512( a + ( ( ic + 5 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[6] = _mm512_loadu_si512( a + ( ( ic + 6 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[7] = _mm512_loadu_si512( a + ( ( ic + 7 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[8] = _mm512_loadu_si512( a + ( ( ic + 8 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[9] = _mm512_loadu_si512( a + ( ( ic + 9 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[10] = _mm512_loadu_si512( a + ( ( ic + 10 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[11] = _mm512_loadu_si512( a + ( ( ic + 11 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[12] = _mm512_loadu_si512( a + ( ( ic + 12 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[13] = _mm512_loadu_si512( a + ( ( ic + 13 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[14] = _mm512_loadu_si512( a + ( ( ic + 14 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[15] = _mm512_loadu_si512( a + ( ( ic + 15 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr , a_reg[0] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr , a_reg[1] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr , a_reg[2] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr , a_reg[3] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 4 ) * KC ) + kr , a_reg[4] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 5 ) * KC ) + kr , a_reg[5] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 6 ) * KC ) + kr , a_reg[6] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 7 ) * KC ) + kr , a_reg[7] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 8 ) * KC ) + kr , a_reg[8] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 9 ) * KC ) + kr , a_reg[9] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 10 ) * KC ) + kr , a_reg[10] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 11 ) * KC ) + kr , a_reg[11] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 12 ) * KC ) + kr , a_reg[12] ); + _mm512_storeu_si512( pack_a_buffer + ( ( 
ic + 13 ) * KC ) + kr , a_reg[13] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 14 ) * KC ) + kr , a_reg[14] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 15 ) * KC ) + kr , a_reg[15] ); + } + for( ; ( kr + 16 - 1 ) < KC; kr += 16 ) + { + MASKED_LOAD_16_ROWS_AVX512( 0xFFFF ) + MASKED_STORE_16_ROWS_AVX512( 0xFFFF ) + } + for( ; ( kr + 7 ) < KC; kr += 8 ) + { + MASKED_LOAD_16_ROWS_AVX512( 0xFF ) + + MASKED_STORE_16_ROWS_AVX512( 0xFF ) + } + for( ; ( kr + 3 ) < KC; kr += 4 ) + { + MASKED_LOAD_16_ROWS_AVX512( 0xF ) + + MASKED_STORE_16_ROWS_AVX512( 0xF ) + } + for( ; ( kr + 1 ) < KC; kr += 2 ) + { + MASKED_LOAD_16_ROWS_AVX512( 0x3 ) + + MASKED_STORE_16_ROWS_AVX512( 0x3 ) + } + for( ; ( kr ) < KC; kr += 1 ) + { + MASKED_LOAD_16_ROWS_AVX512( 0x1 ) + + MASKED_STORE_16_ROWS_AVX512( 0x1 ) + } + } + for( ; ( ic + 8 - 1 ) < MC; ic += 8 ) /* 8-row tile: run only while 8 full rows remain, since the body reads and writes rows ic through ic plus 7; a looser bound would also fire with 7 rows left and touch one row past MC */ + { + for( kr = 0; ( kr + 32 - 1 ) < KC; kr += 32 ) + { + a_reg[0] = _mm512_loadu_si512( a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_loadu_si512( a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[2] = _mm512_loadu_si512( a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[3] = _mm512_loadu_si512( a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[4] = _mm512_loadu_si512( a + ( ( ic + 4 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[5] = _mm512_loadu_si512( a + ( ( ic + 5 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[6] = _mm512_loadu_si512( a + ( ( ic + 6 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[7] = _mm512_loadu_si512( a + ( ( ic + 7 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr , a_reg[0] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr , a_reg[1] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr , a_reg[2] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr , a_reg[3] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 4 ) * KC ) + kr , a_reg[4] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 5 ) * KC ) + kr , a_reg[5] ); +
_mm512_storeu_si512( pack_a_buffer + ( ( ic + 6 ) * KC ) + kr , a_reg[6] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 7 ) * KC ) + kr , a_reg[7] ); + } + for( ; ( kr + 16 - 1 ) < KC; kr += 16 ) + { + MASKED_LOAD_8_ROWS_AVX512( 0xFFFF ) + MASKED_STORE_8_ROWS_AVX512( 0xFFFF ) + } + for( ; ( kr + 7 ) < KC; kr += 8 ) + { + MASKED_LOAD_8_ROWS_AVX512( 0xFF ) + + MASKED_STORE_8_ROWS_AVX512( 0xFF ) + } + for( ; ( kr + 3 ) < KC; kr += 4 ) + { + MASKED_LOAD_8_ROWS_AVX512( 0xF ) + + MASKED_STORE_8_ROWS_AVX512( 0xF ) + } + for( ; ( kr + 1 ) < KC; kr += 2 ) + { + MASKED_LOAD_8_ROWS_AVX512( 0x3 ) + + MASKED_STORE_8_ROWS_AVX512( 0x3 ) + } + for( ; ( kr ) < KC; kr += 1 ) + { + MASKED_LOAD_8_ROWS_AVX512( 0x1 ) + + MASKED_STORE_8_ROWS_AVX512( 0x1 ) + } + } + for( ; ( ic + 4 - 1 ) < MC; ic += 4 ) + { + for( kr = 0; ( kr + 32 - 1 ) < KC; kr += 32 ) + { + a_reg[0] = _mm512_loadu_si512( a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_loadu_si512( a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[2] = _mm512_loadu_si512( a + ( ( ic + 2 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[3] = _mm512_loadu_si512( a + ( ( ic + 3 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr , a_reg[0] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr , a_reg[1] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 2 ) * KC ) + kr , a_reg[2] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 3 ) * KC ) + kr , a_reg[3] ); + } + for( ; ( kr + 16 - 1 ) < KC; kr += 16 ) + { + MASKED_LOAD_4_ROWS_AVX512( 0xFFFF ) + MASKED_STORE_4_ROWS_AVX512( 0xFFFF ) + } + for( ; ( kr + 7 ) < KC; kr += 8 ) + { + MASKED_LOAD_4_ROWS_AVX512( 0xFF ) + + MASKED_STORE_4_ROWS_AVX512( 0xFF ) + } + for( ; ( kr + 3 ) < KC; kr += 4 ) + { + MASKED_LOAD_4_ROWS_AVX512( 0xF ) + + MASKED_STORE_4_ROWS_AVX512( 0xF ) + } + for( ; ( kr + 1 ) < KC; kr += 2 ) + { + MASKED_LOAD_4_ROWS_AVX512( 0x3 ) + + MASKED_STORE_4_ROWS_AVX512( 0x3 ) + } + for( ; ( kr ) < KC; kr += 1 ) + { + 
MASKED_LOAD_4_ROWS_AVX512( 0x1 ) + + MASKED_STORE_4_ROWS_AVX512( 0x1 ) + } + } + + for( ; ( ic + 2 - 1 ) < MC; ic += 2 ) + { + for( kr = 0; ( kr + 32 - 1 ) < KC; kr += 32 ) + { + a_reg[0] = _mm512_loadu_si512( a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_loadu_si512( a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr , a_reg[0] ); + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr , a_reg[1] ); + } + for( ; ( kr + 16 - 1 ) < KC; kr += 16 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0xFFFF, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_maskz_loadu_epi16( 0xFFFF, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0xFFFF, a_reg[0] ); + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, 0xFFFF, a_reg[1] ); + } + for( ; ( kr + 7 ) < KC; kr += 8 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0xFF, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_maskz_loadu_epi16( 0xFF, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0xFF, a_reg[0] ); + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, 0xFF, a_reg[1] ); + } + for( ; ( kr + 3 ) < KC; kr += 4 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0xF, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_maskz_loadu_epi16( 0xF, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0xF, a_reg[0] ); + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, 0xF, a_reg[1] ); + } + for( ; ( kr + 1 ) < KC; kr += 2 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0x3, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_maskz_loadu_epi16( 0x3, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + 
kr, 0x3, a_reg[0] ); + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, 0x3, a_reg[1] ); + } + for( ; ( kr ) < KC; kr += 1 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0x1, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + a_reg[1] = _mm512_maskz_loadu_epi16( 0x1, a + ( ( ic + 1 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0x1, a_reg[0] ); + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 1 ) * KC ) + kr, 0x1, a_reg[1] ); + } + } + for( ; ( ic ) < MC; ic += 1 ) + { + for( kr = 0; ( kr + 32 - 1 ) < KC; kr += 32 ) + { + a_reg[0] = _mm512_loadu_si512( a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_storeu_si512( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr , a_reg[0]); + } + for( ; ( kr + 16 - 1 ) < KC; kr += 16 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0xFFFF, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0xFFFF, a_reg[0] ); + } + for( ; ( kr + 7 ) < KC; kr += 8 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0xFF, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0xFF, a_reg[0] ); + } + for( ; ( kr + 3 ) < KC; kr += 4 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0xF, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0xF, a_reg[0] ); + } + for( ; ( kr + 1 ) < KC; kr += 2 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0x3, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0x3, a_reg[0] ); + } + for( ; ( kr ) < KC; kr += 1 ) + { + a_reg[0] = _mm512_maskz_loadu_epi16( 0x1, a + ( ( ic + 0 ) * rs_a ) + ( kr * cs_a ) ); + + _mm512_mask_storeu_epi16( pack_a_buffer + ( ( ic + 0 ) * KC ) + kr, 0x1, a_reg[0] ); + } + } + *rs_p = KC; + *cs_p = 2; + +} +void packa_mr16_bf16bf16f32of32_col_major + ( + bfloat16* pack_a_buffer, + const 
bfloat16* a, + const dim_t rs_a, + const dim_t cs_a, + const dim_t MC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ) +{ + dim_t MR = 16; + + dim_t m_left = MC % 4; + + __m256i a_reg[16], b_reg[16]; + + dim_t ic, kr; + + for( ic = 0; ( ic + MR - 1 ) < MC; ic += MR) + { + for( kr = 0; ( kr + 15 ) < KC; kr += 16) + { + a_reg[0] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ) ); + a_reg[1] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ) ); + a_reg[2] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ) ); + a_reg[3] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ) ); + a_reg[4] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ) ); + a_reg[5] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ) ); + a_reg[6] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ) ); + a_reg[7] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ) ); + a_reg[8] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 8 ) * cs_a ) ) ); + a_reg[9] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 9 ) * cs_a ) ) ); + a_reg[10] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 10 ) * cs_a ) ) ); + a_reg[11] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 11 ) * cs_a ) ) ); + a_reg[12] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 12 ) * cs_a ) ) ); + a_reg[13] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 13 ) * cs_a ) ) ); + a_reg[14] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 14 ) * cs_a ) ) ); + a_reg[15] = _mm256_loadu_si256( (__m256i const *) ( a + ( ic * rs_a ) + ( ( kr + 15 ) * cs_a ) ) ); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + 
UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 0 ) * KC + kr ), a_reg[0] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 1 ) * KC + kr ), a_reg[4] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 2 ) * KC + kr ), a_reg[2] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 3 ) * KC + kr ), a_reg[6] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 4 ) * KC + kr ), a_reg[8] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 5 ) * KC + kr ), a_reg[12] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 6 ) * KC + kr ), a_reg[10] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 7 ) * KC + kr ), a_reg[14] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 8 ) * KC + kr ), a_reg[1] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 9 ) * KC + kr ), a_reg[5] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 10 ) * KC + kr ), a_reg[3] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 11 ) * KC + kr ), a_reg[7] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 12 ) * KC + kr ), a_reg[9] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 13 ) * KC + kr ), a_reg[13] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 14 ) * KC + kr ), a_reg[11] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 15 ) * KC + kr ), a_reg[15] ); + } + + for( ; ( kr + 7 ) < KC; kr += 8) + { + a_reg[0] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ) ); + a_reg[1] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ) ); + a_reg[2] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ) ); + a_reg[3] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ) ); + a_reg[4] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + 
( ( kr + 4 ) * cs_a ) ) ); + a_reg[5] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ) ); + a_reg[6] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ) ); + a_reg[7] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ) ); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + MASKED_STORE_EPI64(0x03) + + } + for( ; ( kr + 3 ) < KC; kr += 4) + { + a_reg[0] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ) ); + a_reg[1] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ) ); + a_reg[2] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ) ); + a_reg[3] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ) ); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + MASKED_STORE_EPI64(0x01) + } + for( ; ( kr + 1 ) < KC; kr += 2) + { + a_reg[0] = _mm256_loadu_si256( (__m256i const *)( a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ) ); + a_reg[1] = _mm256_loadu_si256( (__m256i const 
*)( a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ) ); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + MASKED_STORE_EPI32(0x01) + } + for( ; ( kr ) < KC; kr += 1) + { + a_reg[0] = _mm256_loadu_si256( (__m256i const *)(a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ) ); + a_reg[1] = _mm256_setzero_si256(); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + MASKED_STORE_EPI16(0x01) + } + } + + for( ; ( ic + 8 - 1) < MC; ic += 8) + { + for( kr = 0; ( kr + 15 ) < KC; kr += 16) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + 
( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ); + a_reg[5] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ); + a_reg[6] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ); + a_reg[7] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ); + a_reg[8] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 8 ) * cs_a ) ); + a_reg[9] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 9 ) * cs_a ) ); + a_reg[10] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 10 ) * cs_a ) ); + a_reg[11] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 11 ) * cs_a ) ); + a_reg[12] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 12 ) * cs_a ) ); + a_reg[13] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 13 ) * cs_a ) ); + a_reg[14] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 14 ) * cs_a ) ); + a_reg[15] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 15 ) * cs_a ) ); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 0 ) * KC + kr ), a_reg[0] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 1 ) * KC + kr ), a_reg[4] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 2 ) * KC + kr ), a_reg[2] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 3 ) * KC + kr ), a_reg[6] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 4 ) * KC + kr ), a_reg[8] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 5 ) * KC + kr ), a_reg[12] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 6 ) * KC + kr ), a_reg[10] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 7 ) * KC + kr ), a_reg[14] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( 
ic + 8 ) * KC + kr ), a_reg[1] ); + } + + for( ; ( kr + 7 ) < KC; kr += 8) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ); + a_reg[5] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ); + a_reg[6] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ); + a_reg[7] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x03, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x03, a_reg[4] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x03, a_reg[2] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x03, a_reg[6] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 4 ) * KC + kr ), 0x03, a_reg[8] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 5 ) * KC + kr ), 0x03, a_reg[12] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 6 ) * KC + kr ), 0x03, a_reg[10] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 7 ) * KC + kr ), 0x03, a_reg[14] ); + } + for( ; ( kr + 3 ) < KC; kr += 4) + { + a_reg[0] = 
_mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x01, a_reg[6] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 4 ) * KC + kr ), 0x01, a_reg[8] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 5 ) * KC + kr ), 0x01, a_reg[12] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 6 ) * KC + kr ), 0x01, a_reg[10] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 7 ) * KC + kr ), 0x01, a_reg[14] ); + } + for( ; ( kr + 1 ) < KC; kr += 2) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + 
a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x01, a_reg[6] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 4 ) * KC + kr ), 0x01, a_reg[8] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 5 ) * KC + kr ), 0x01, a_reg[12] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 6 ) * KC + kr ), 0x01, a_reg[10] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 7 ) * KC + kr ), 0x01, a_reg[14] ); + } + for( ; ( kr ) < KC; kr += 1) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0xFF, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_setzero_si256(); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + 
SHUFFLE_64x2 + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x01, a_reg[6] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 4 ) * KC + kr ), 0x01, a_reg[8] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 5 ) * KC + kr ), 0x01, a_reg[12] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 6 ) * KC + kr ), 0x01, a_reg[10] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 7 ) * KC + kr ), 0x01, a_reg[14] ); + } + } + + for( ; ( ic + 4 - 1 ) < MC; ic += 4) + { + for( kr = 0; ( kr + 15 ) < KC; kr += 16) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ); + a_reg[5] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ); + a_reg[6] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ); + a_reg[7] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ); + a_reg[8] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 8 ) * cs_a ) ); + a_reg[9] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 9 ) * cs_a ) ); + a_reg[10] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 10 ) * cs_a ) ); + a_reg[11] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 11 ) * cs_a ) ); + a_reg[12] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 12 ) * cs_a ) ); + 
a_reg[13] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 13 ) * cs_a ) ); + a_reg[14] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 14 ) * cs_a ) ); + a_reg[15] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 15 ) * cs_a ) ); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 0 ) * KC + kr ), a_reg[0] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 1 ) * KC + kr ), a_reg[4] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 2 ) * KC + kr ), a_reg[2] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 3 ) * KC + kr ), a_reg[6] ); + } + + for( ; ( kr + 7 ) < KC; kr += 8) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ); + a_reg[5] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ); + a_reg[6] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ); + a_reg[7] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr 
), 0x03, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x03, a_reg[4] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x03, a_reg[2] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x03, a_reg[6] ); + } + for( ; ( kr + 3 ) < KC; kr += 4) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x01, a_reg[6] ); + } + for( ; ( kr + 1 ) < KC; kr += 2) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = 
_mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x01, a_reg[6] ); + } + for( ; ( kr ) < KC; kr += 1) + { + a_reg[0] = _mm256_maskz_loadu_epi16( 0x0F, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_setzero_si256(); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 3 ) * KC + kr ), 0x01, a_reg[6] ); + } 
+ } + + if( m_left ) + { + __mmask16 mask = 0xFFFF >> ( 16 - m_left ); + for( kr = 0; ( kr + 15 ) < KC; kr += 16) + { + a_reg[0] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ); + a_reg[5] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ); + a_reg[6] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ); + a_reg[7] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ); + a_reg[8] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 8 ) * cs_a ) ); + a_reg[9] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 9 ) * cs_a ) ); + a_reg[10] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 10 ) * cs_a ) ); + a_reg[11] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 11 ) * cs_a ) ); + a_reg[12] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 12 ) * cs_a ) ); + a_reg[13] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 13 ) * cs_a ) ); + a_reg[14] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 14 ) * cs_a ) ); + a_reg[15] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 15 ) * cs_a ) ); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + switch( m_left ) + { + case 3: + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 0 ) * KC + kr ), a_reg[0] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 1 ) * KC + kr ), a_reg[4] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 2 ) * KC + kr ), a_reg[2] ); + break; + 
case 2: + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 0 ) * KC + kr ), a_reg[0] ); + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 1 ) * KC + kr ), a_reg[4] ); + break; + case 1: + _mm256_storeu_si256( (__m256i *)( pack_a_buffer + ( ic + 0 ) * KC + kr ), a_reg[0] ); + break; + } + } + + for( ; ( kr + 7 ) < KC; kr += 8) + { + a_reg[0] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 4 ) * cs_a ) ); + a_reg[5] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 5 ) * cs_a ) ); + a_reg[6] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 6 ) * cs_a ) ); + a_reg[7] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 7 ) * cs_a ) ); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + switch( m_left ) + { + case 3: + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x03, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x03, a_reg[4] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x03, a_reg[2] ); + break; + case 2: + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x03, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x03, a_reg[4] ); + break; + case 
1: + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x03, a_reg[0] ); + break; + } + } + for( ; ( kr + 3 ) < KC; kr += 4) + { + a_reg[0] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 2 ) * cs_a ) ); + a_reg[3] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 3 ) * cs_a ) ); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + + switch( m_left ) + { + case 3: + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + break; + case 2: + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0]); + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4]); + break; + case 1: + _mm256_mask_storeu_epi64( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0]); + break; + } + } + for( ; ( kr + 1 ) < KC; kr += 2) + { + a_reg[0] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 1 ) * cs_a ) ); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = 
_mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + switch( m_left ) + { + case 3: + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + break; + case 2: + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi32( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + break; + case 1: + _mm256_mask_storeu_epi32( (pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + break; + } + } + for( ; ( kr ) < KC; kr += 1) + { + a_reg[0] = _mm256_maskz_loadu_epi16( mask, a + ( ic * rs_a ) + ( ( kr + 0 ) * cs_a ) ); + a_reg[1] = _mm256_setzero_si256(); + a_reg[2] = _mm256_setzero_si256(); + a_reg[3] = _mm256_setzero_si256(); + a_reg[4] = _mm256_setzero_si256(); + a_reg[5] = _mm256_setzero_si256(); + a_reg[6] = _mm256_setzero_si256(); + a_reg[7] = _mm256_setzero_si256(); + a_reg[8] = _mm256_setzero_si256(); + a_reg[9] = _mm256_setzero_si256(); + a_reg[10] = _mm256_setzero_si256(); + a_reg[11] = _mm256_setzero_si256(); + a_reg[12] = _mm256_setzero_si256(); + a_reg[13] = _mm256_setzero_si256(); + a_reg[14] = _mm256_setzero_si256(); + a_reg[15] = _mm256_setzero_si256(); + + UNPACKLO_EPI16 + UNPACKHI_EPI16 + UNPACKLO_EPI32 + UNPACKHI_EPI32 + UNPACKLO_EPI64 + UNPACKHI_EPI64 + SHUFFLE_64x2 + switch( m_left ) + { 
+ case 3: + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 2 ) * KC + kr ), 0x01, a_reg[2] ); + break; + case 2: + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 1 ) * KC + kr ), 0x01, a_reg[4] ); + break; + case 1: + _mm256_mask_storeu_epi16( ( pack_a_buffer + ( ic + 0 ) * KC + kr ), 0x01, a_reg[0] ); + break; + } + } + } + + *rs_p = KC; + *cs_p = 2; +} +#endif From 5fd24c27a7f883db2baf904b090099333cb8f03f Mon Sep 17 00:00:00 2001 From: bhaskarn Date: Wed, 11 Oct 2023 17:02:02 +0530 Subject: [PATCH 153/226] Updated expf max min precision fix nan issue in Tanh Description: The expf_max and expf_min have more precision than the computation, which leads to crossing the clipping threshold at the edge case and causes NaNs in the tanh output. Updated the thresholds to lower precision to clip the edge cases and avoid NaNs in the tanh output. 
AMD-Internal: [SWLCSG-2423 ] Change-Id: I25a665475692f47443f30ca5dd09e8e06a0bfe29 --- kernels/zen/lpgemm/math_utils_avx2.h | 12 ++++++------ kernels/zen4/lpgemm/math_utils_avx512.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernels/zen/lpgemm/math_utils_avx2.h b/kernels/zen/lpgemm/math_utils_avx2.h index e705adb8f7..bdd1dec86a 100644 --- a/kernels/zen/lpgemm/math_utils_avx2.h +++ b/kernels/zen/lpgemm/math_utils_avx2.h @@ -44,8 +44,8 @@ #define TBL_LN2 0x1.71547652b82fep+0 #define EXPF_HUGE 0x1.8p+23 -#define EXPF_MIN -88.7228393f -#define EXPF_MAX 88.7228393f +#define EXPF_MIN -88.0f +#define EXPF_MAX 88.0f #define inf 1.0/0.0 #define sign -2147483648 @@ -84,8 +84,8 @@ POLY_EVAL_6_AVX2 (r, r2, z); \ \ q = _mm256_add_epi32((__m256i) (r), _mm256_sllv_epi32 ((__m256i)dn, _mm256_set1_epi32 (23)) ); \ - q = (__m256i)_mm256_blendv_ps ((__m256)q, _mm256_set1_ps(inf), _mm256_cmp_ps (_mm256_set1_ps(88.0), x, 1)); \ - q = (__m256i)_mm256_blendv_ps ((__m256)q, _mm256_set1_ps(0.0), _mm256_cmp_ps (x, _mm256_set1_ps(-88.0), 1)); + q = (__m256i)_mm256_blendv_ps ((__m256)q, _mm256_set1_ps(inf), _mm256_cmp_ps (_mm256_set1_ps(EXPF_MAX), x, 1)); \ + q = (__m256i)_mm256_blendv_ps ((__m256)q, _mm256_set1_ps(0.0), _mm256_cmp_ps (x, _mm256_set1_ps(EXPF_MIN), 1)); #define TANHF_AVX2(x_tanh, r, r2, x, z, dn, q) \ x = _mm256_mul_ps (_mm256_andnot_ps(_mm256_set1_ps(-0.0f), x_tanh), _mm256_set1_ps(-2) ); \ @@ -132,8 +132,8 @@ POLY_EVAL_6_SSE (r, r2, z); \ \ q = _mm_add_epi32((__m128i) (r), _mm_sllv_epi32 ((__m128i)dn, _mm_set1_epi32 (23)) ); \ - q = (__m128i)_mm_blendv_ps ((__m128)q, _mm_set1_ps(inf), _mm_cmp_ps (_mm_set1_ps(88.0), x, 1)); \ - q = (__m128i)_mm_blendv_ps ((__m128)q, _mm_set1_ps(0.0), _mm_cmp_ps (x, _mm_set1_ps(-88.0), 1)); + q = (__m128i)_mm_blendv_ps ((__m128)q, _mm_set1_ps(inf), _mm_cmp_ps (_mm_set1_ps(EXPF_MAX), x, 1)); \ + q = (__m128i)_mm_blendv_ps ((__m128)q, _mm_set1_ps(0.0), _mm_cmp_ps (x, _mm_set1_ps(EXPF_MIN), 1)); #define 
TANHF_SSE(x_tanh, r, r2, x, z, dn, q) \ x = _mm_mul_ps (_mm_andnot_ps(_mm_set1_ps(-0.0f), x_tanh), _mm_set1_ps(-2) ); \ diff --git a/kernels/zen4/lpgemm/math_utils_avx512.h b/kernels/zen4/lpgemm/math_utils_avx512.h index 82c9c5650b..6221827c75 100644 --- a/kernels/zen4/lpgemm/math_utils_avx512.h +++ b/kernels/zen4/lpgemm/math_utils_avx512.h @@ -44,8 +44,8 @@ #define TBL_LN2 0x1.71547652b82fep+0 #define EXPF_HUGE 0x1.8p+23 -#define EXPF_MIN -88.7228393f -#define EXPF_MAX 88.7228393f +#define EXPF_MIN -88.0f +#define EXPF_MAX 88.0f #define inf 1.0/0.0 #define sign -2147483648 From 3a71550bc3debb3ceceff9f436ddf7f31a32aff7 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Tue, 10 Oct 2023 23:47:59 +0530 Subject: [PATCH 154/226] Enabling SUP blocksizes & kernels for generic config Details: - pack and compute extension APIs derive blocksizes(MR, NR...) from SUP cntx. - SUP blocksizes are not set for generic/skx configs. As a result pack and compute APIs cause floating point exceptions. - To fix these issues, we have enabled non-zero SUP blocksizes for generic config and zen4 SUP blocksizes for skx config. - However, these changes will not enable SUP path for skx/generic config as thresholds are set to zero. - To enable SUP path for skx config, more work is needed like non-zero thresholds and modifications to build system. Change-Id: I54483ab0c196845ca175b8cb8deeb9e9ac2a42b9 --- config/skx/bli_cntx_init_skx.c | 65 ++++++++++++++++++++++++++++++++++ config_registry | 2 +- ref_kernels/bli_cntx_ref.c | 46 ++++++++++++++++++------ 3 files changed, 101 insertions(+), 12 deletions(-) diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index f18503a7a7..91dd7e444f 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -128,5 +129,69 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); + + bli_cntx_set_l3_sup_kers + ( + 30, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE, + + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE, + + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CRR, 
BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE, + cntx + ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 24, 3, 12, + 6, 9, 3, 12 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 144, 72, 48 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 480, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8064, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. + bli_cntx_set_l3_sup_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + } diff --git a/config_registry b/config_registry index 8a3a47bfbd..30832c04e6 100644 --- a/config_registry +++ b/config_registry @@ -19,7 +19,7 @@ amdzen: zen4 zen3 zen2 zen generic #arm32: cortexa15 cortexa9 generic # Intel architectures. 
-skx: skx/skx/haswell/zen +skx: skx/skx/haswell/zen/zen4 knl: knl/knl/haswell/zen haswell: haswell/haswell/zen sandybridge: sandybridge diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 960a30d612..b0d47d26f1 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -482,6 +482,30 @@ void GENBARNAME(cntx_init) // -- Set level-3 small/unpacked micro-kernels and preferences ------------- + // -- Set SUP blocksizes ------------------------------------------------------- + // These blocksizes are copied from native blocksizes for ref + + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); + + // Initialize the context with the default blocksize objects and their + // multiples. + bli_cntx_set_l3_sup_blkszs + ( + 5, + // level-3 + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + funcs = bli_cntx_l3_sup_kers_buf( cntx ); mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); @@ -529,7 +553,7 @@ void GENBARNAME(cntx_init) bli_mbool_init( &mbools[ BLIS_XXX ], TRUE, TRUE, TRUE, TRUE ); - // -- Set level-3 small/unpacked micro-kernels, preferences and blocksizes + // -- Set level-3 small/unpacked micro-kernels, preferences and blocksizes // for matrices dealing with triangular matrices------------- // -- Set blocksizes ------------------------------------------------------- @@ -544,16 +568,16 @@ void GENBARNAME(cntx_init) // Initialize the context with the default blocksize objects and their // multiples. 
bli_cntx_set_l3_sup_tri_blkszs - ( - 5, - // level-3 - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); + ( + 5, + // level-3 + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); funcs = bli_cntx_l3_sup_tri_kers_buf( cntx ); mbools = bli_cntx_l3_sup_tri_kers_prefs_buf( cntx ); From a6a67fea2d0b87430f4dfe464fb3283c456e06c7 Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Wed, 16 Aug 2023 12:35:47 +0530 Subject: [PATCH 155/226] ZAXPBYV optimizations for handling unit and non-unit strides - Updated the bli_zaxpbyv_zen_int( ... ) kernel's computational logic. The kernel performs two different sets of compute based on the value of alpha, for both unit and non-unit strides. There are no constraints on beta scaling of the 'y' vector. - Updated the logic to support 'x' conjugate in the computation. The kernel supports conjugate/no conjugate operation through the usage of _mm256_fmsubadd_pd( ... ) and _mm256_addsub_pd( ... ) intrinsics. - Updated the early return condition in the kernel to adhere to the standard compliance. - Updated the scalar computation with vector computation(using 128 bit registers), in case of dealing with a single element(fringe case) in unit-stride or vectors with non-unit strides. A single dcomplex element occupies 128 bits in memory, thereby providing scope for this optimization. - Added accuracy and extreme value testing with sufficient sizes and initializations, to test the required main and fringe cases of the computation. 
AMD-Internal: [CPUPL-3623] Change-Id: I7ae918856e7aba49424162290f3e3d592c244826 --- .../inc/common/wrong_inputs_helpers.h | 17 +- .../testsuite/level1/axpbyv/IIT_ERS_test.cpp | 96 ++ .../testsuite/level1/axpbyv/test_axpbyv.h | 48 +- .../level1/axpbyv/zaxpbyv_evt_testing.cpp | 372 ++++++++ .../level1/axpbyv/zaxpbyv_generic.cpp | 176 ++-- kernels/zen/1/bli_axpbyv_zen_int.c | 871 +++++++++++------- 6 files changed, 1145 insertions(+), 435 deletions(-) create mode 100644 gtestsuite/testsuite/level1/axpbyv/IIT_ERS_test.cpp create mode 100644 gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp diff --git a/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h b/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h index e66f4f3168..f6eec48959 100644 --- a/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h +++ b/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h @@ -9,14 +9,14 @@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -42,6 +42,7 @@ namespace testinghelpers { namespace IIT { static const char STORAGE = 'c'; static const char TRANS = 'n'; + static const char CONJ = 'n'; static const char SIDE = 'l'; static const char UPLO = 'u'; static const char DIAG = 'u'; diff --git a/gtestsuite/testsuite/level1/axpbyv/IIT_ERS_test.cpp b/gtestsuite/testsuite/level1/axpbyv/IIT_ERS_test.cpp new file mode 100644 index 0000000000..5e568b0655 --- /dev/null +++ b/gtestsuite/testsuite/level1/axpbyv/IIT_ERS_test.cpp @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "common/testing_helpers.h" +#include "axpbyv.h" +#include "inc/check_error.h" +#include "common/wrong_inputs_helpers.h" + +template +class Axpby_IIT_ERS_Test : public ::testing::Test {}; +typedef ::testing::Types TypeParam; // The supported datatypes from BLAS calls for AXPBY +TYPED_TEST_SUITE(Axpby_IIT_ERS_Test, TypeParam); // Defining individual testsuites based on the datatype support. + +// Adding namespace to get default parameters(valid case) from testinghelpers/common/wrong_inputs_helpers.h. +using namespace testinghelpers::IIT; + +/* + Early Return Scenarios(ERS) : + + The AXPBY API is expected to return early in the following cases: + 1. When n <= 0. + +*/ + +#ifdef TEST_BLAS + +// When n < 0 +TYPED_TEST(Axpby_IIT_ERS_Test, n_lt_zero) +{ + using T = TypeParam; + // Defining the y vector with values for debugging purposes + std::vector y = testinghelpers::get_random_vector( -10, 10, N, INC ); + + T alpha, beta; + testinghelpers::initone( alpha ); + testinghelpers::initzero( beta ); + // Copy so that we check that the elements of y are not modified. + std::vector y_ref(y); + + axpbyv( CONJ, -1, alpha, nullptr, INC, beta, y.data(), INC ); + // Use bitwise comparison (no threshold). 
+ computediff( N, y.data(), y_ref.data(), INC ); +} + +// When n = 0 +TYPED_TEST(Axpby_IIT_ERS_Test, n_eq_zero) +{ + using T = TypeParam; + // Defining the y vector with values for debugging purposes + std::vector y = testinghelpers::get_random_vector( -10, 10, N, INC ); + + T alpha, beta; + testinghelpers::initone( alpha ); + testinghelpers::initzero( beta ); + // Copy so that we check that the elements of y are not modified. + std::vector y_ref(y); + + axpbyv( CONJ, 0, alpha, nullptr, INC, beta, y.data(), INC ); + // Use bitwise comparison (no threshold). + computediff( N, y.data(), y_ref.data(), INC ); +} + +#endif + diff --git a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h index 487b95c734..973f8ebab4 100644 --- a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h +++ b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h @@ -9,14 +9,14 @@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -68,4 +68,36 @@ static void test_axpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, // Compute component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); +} + +template +static void test_axpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, + T alpha, T beta, gtint_t xi, T xexval, gtint_t yj, + T yexval, double thresh ) +{ + //---------------------------------------------------------- + // Initialize vectors with random numbers. + //---------------------------------------------------------- + std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + std::vector y = testinghelpers::get_random_vector( -10, 10, n, incy ); + + x[xi*incx] = xexval; + y[yj*incy] = yexval; + + //---------------------------------------------------------- + // Call reference implementation to get ref results. + //---------------------------------------------------------- + // Create a copy of y so that we can check reference results. + std::vector y_ref(y); + testinghelpers::ref_axpbyv( conjx, n, alpha, x.data(), incx, beta, y_ref.data(), incy ); + + //---------------------------------------------------------- + // Call BLIS function. + //---------------------------------------------------------- + axpbyv( conjx, n, alpha, x.data(), incx, beta, y.data(), incy ); + + //---------------------------------------------------------- + // Compute component-wise error. 
+ //---------------------------------------------------------- + computediff( n, y.data(), y_ref.data(), incy, thresh, true ); } \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp new file mode 100644 index 0000000000..5b3f251851 --- /dev/null +++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp @@ -0,0 +1,372 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "test_axpbyv.h" + +class zaxpbyvEVTTest : + public ::testing::TestWithParam> {}; +// Tests using random integers as vector elements. +TEST_P(zaxpbyvEVTTest, RandomData) +{ + using T = dcomplex; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // denotes whether x or conj(x) will be added to y: + char conj_x = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // stride size for y: + gtint_t incy = std::get<3>(GetParam()); + // index for exval in x + gtint_t xi = std::get<4>(GetParam()); + // exval for x + T xexval = std::get<5>(GetParam()); + // index for exval in y + gtint_t yj = std::get<6>(GetParam()); + // exval for y + T yexval = std::get<7>(GetParam()); + // alpha + T alpha = std::get<8>(GetParam()); + // beta + T beta = std::get<9>(GetParam()); + + // Set the threshold for the errors: + double thresh = 20 * testinghelpers::getEpsilon(); + + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_axpbyv(conj_x, n, incx, incy, alpha, beta, xi, xexval, + yj, yexval, thresh); +} + +// Used to generate 
a test case with a sensible name. +// Beware that we cannot use fp numbers (e.g., 2.3) in the names, +// so we are only printing int(2.3). This should be enough for debugging purposes. +// If this poses an issue, please reach out. +class zaxpbyvEVTVecPrint +{ +public: + std::string operator()( + testing::TestParamInfo> str) const + { + char conj = std::get<0>(str.param); + gtint_t n = std::get<1>(str.param); + gtint_t incx = std::get<2>(str.param); + gtint_t incy = std::get<3>(str.param); + gtint_t xi = std::get<4>(str.param); + dcomplex xexval = std::get<5>(str.param); + gtint_t yj = std::get<6>(str.param); + dcomplex yexval = std::get<7>(str.param); + dcomplex alpha = std::get<8>(str.param); + dcomplex beta = std::get<9>(str.param); +#ifdef TEST_BLAS + std::string str_name = "zaxpby_"; +#elif TEST_CBLAS + std::string str_name = "cblas_zaxpby"; +#else // #elif TEST_BLIS_TYPED + std::string str_name = "bli_zaxpbyv"; +#endif + str_name += "_" + std::to_string(n); + str_name += "_" + std::string(&conj, 1); + std::string incx_str = (incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name += "_" + incx_str; + std::string incy_str = (incy > 0) ? 
std::to_string(incy) : "m" + std::to_string(std::abs(incy)); + str_name += "_" + incy_str; + std::string xexval_str = testinghelpers::get_value_string(xexval); + std::string yexval_str = testinghelpers::get_value_string(yexval); + str_name = str_name + "_X_" + std::to_string(xi); + str_name = str_name + "_" + xexval_str; + str_name = str_name + "_Y_" + std::to_string(yj); + str_name = str_name + "_" + yexval_str; + std::string alpha_str = testinghelpers::get_value_string(alpha); + std::string beta_str = testinghelpers::get_value_string(beta); + str_name = str_name + "_a" + alpha_str; + str_name = str_name + "_b" + beta_str; + return str_name; + } +}; + +class zaxpbyvAlphaBetaPrint +{ +public: + std::string operator()( + testing::TestParamInfo> str) const + { + char conj = std::get<0>(str.param); + gtint_t n = std::get<1>(str.param); + gtint_t incx = std::get<2>(str.param); + gtint_t incy = std::get<3>(str.param); + dcomplex alpha = std::get<8>(str.param); + dcomplex beta = std::get<9>(str.param); +#ifdef TEST_BLAS + std::string str_name = "zaxpby_"; +#elif TEST_CBLAS + std::string str_name = "cblas_zaxpby"; +#else // #elif TEST_BLIS_TYPED + std::string str_name = "bli_zaxpbyv"; +#endif + str_name += "_" + std::to_string(n); + str_name += "_" + std::string(&conj, 1); + std::string incx_str = (incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + str_name += "_" + incx_str; + std::string incy_str = (incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); + str_name += "_" + incy_str; + std::string alpha_str = testinghelpers::get_value_string(alpha); + std::string beta_str = testinghelpers::get_value_string(beta); + str_name = str_name + "_a" + alpha_str; + str_name = str_name + "_b" + beta_str; + return str_name; + } +}; + +static double NaN = std::numeric_limits::quiet_NaN(); +static double Inf = std::numeric_limits::infinity(); + +/* + The code structure for bli_zaxpbyv_zen_int( ... 
) is as follows : + For unit strides : + Main loop : In blocks of 8 --> L8 + Fringe loops : In blocks of 6 --> L6 + In blocks of 4 --> L4 + In blocks of 2 --> L2 + + For non-unit strides : A single loop, to process element wise. + NOTE : Any size, requiring the fringe case of 1 with unit stride falls to + the non-unit stride loop and executes it once for just the last element. + + With regards to exception value testing, every loop is tested separately. + The indices for setting exception values on the vectors are such that + every load associated with the loop has an exception value in it. Thus, + every arithmetic instruction associated with each load will be tested + for exception value handling. +*/ + +// Exception value testing(on vectors) for L8 +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_vec_L8, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(8)), // m, size of vector to enter L8 directly. 
+ ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(gtint_t(1), gtint_t(3), gtint_t(4), gtint_t(7)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on x + ::testing::Values(gtint_t(0), gtint_t(2), gtint_t(5), gtint_t(6)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{0.9, 4.5}) // beta + ), + ::zaxpbyvEVTVecPrint()); + +// Exception value testing(on vectors) for L6 +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_vec_L6, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(6)), // m, size of vector to enter L6 directly. 
+ ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(gtint_t(1), gtint_t(3), gtint_t(4)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on x + ::testing::Values(gtint_t(0), gtint_t(2), gtint_t(5)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{0.9, 4.5}) // beta + ), + ::zaxpbyvEVTVecPrint()); + +// Exception value testing(on vectors) for L4 +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_vec_L4, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(4)), // m, size of vector to enter L4 directly. 
+ ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(gtint_t(1), gtint_t(3)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on x + ::testing::Values(gtint_t(0), gtint_t(2)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{0.9, 4.5}) // beta + ), + ::zaxpbyvEVTVecPrint()); + +// Exception value testing(on vectors) for L2 +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_vec_L2, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(2)), // m, size of vector to enter L2 directly. 
+ ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(gtint_t(1)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on x + ::testing::Values(gtint_t(0)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}, + dcomplex{NaN, -Inf}), // exception values to set on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{0.9, 4.5}) // beta + ), + ::zaxpbyvEVTVecPrint()); + +// Exception value testing(on vectors) with non unit strides +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_vec_NUS, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(1), gtint_t(5)), // m, size of vector to enter NUS loop directly. 
+ ::testing::Values(gtint_t(3)), // stride size for x + ::testing::Values(gtint_t(-4)), // stride size for y + ::testing::Values(gtint_t(0)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}), // exception values to set on x + ::testing::Values(gtint_t(0)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{NaN, 2.3}, + dcomplex{-Inf, 0.0}, dcomplex{Inf, NaN}), // exception values to set on y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{0.9, 4.5}) // beta + ), + ::zaxpbyvEVTVecPrint()); + +// Exception value testing(on alpha/beta) with unit stride +/* + NOTE : Here, every loop is tested for, with alpha and beta having exception values + Furthermore, the first element of x and second element of y are set to 0, which + includes testing that cover cases where NaN might be induced due to 0 * (Inf or -Inf). +*/ +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_alphabeta_US, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(8), gtint_t(6), gtint_t(4), gtint_t(2)), // m size of vector to enter L8, L6, L4 and L2 respectively. 
+ ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(gtint_t(0)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}), // exception values to set on x + ::testing::Values(gtint_t(1)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}), // exception values to set on y + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, dcomplex{-Inf, NaN}), // alpha + ::testing::Values(dcomplex{-0.9, NaN}, dcomplex{0.0, -Inf}, dcomplex{NaN, Inf}) // beta + ), + ::zaxpbyvEVTVecPrint()); + +// Exception value testing(on alpha/beta) with non-unit stride +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_evt_alphabeta_NUS, + zaxpbyvEVTTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(5)), // m, size of vector to enter NUS loop directly. 
+ ::testing::Values(gtint_t(3)), // stride size for x + ::testing::Values(gtint_t(-4)), // stride size for y + ::testing::Values(gtint_t(0)), // indices to set exception values on x + ::testing::Values(dcomplex{0.0, 0.0}), // exception values to set on x + ::testing::Values(gtint_t(0)), // indices to set exception values on y + ::testing::Values(dcomplex{0.0, 0.0}), // exception values to set on y + ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, dcomplex{-Inf, NaN}), // alpha + ::testing::Values(dcomplex{-0.9, NaN}, dcomplex{0.0, -Inf}, dcomplex{NaN, Inf}) // beta + ), + ::zaxpbyvEVTVecPrint()); \ No newline at end of file diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp index d43e5a70f0..83cd127b77 100644 --- a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp @@ -9,14 +9,14 @@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -35,7 +35,7 @@ #include #include "test_axpbyv.h" -class zaxpbyvGenericTest : +class zaxpbyvAccTest : public ::testing::TestWithParam> {}; // Tests using random integers as vector elements. -TEST_P( zaxpbyvGenericTest, RandomData ) +TEST_P(zaxpbyvAccTest, RandomData) { using T = dcomplex; //---------------------------------------------------------- @@ -64,106 +64,138 @@ TEST_P( zaxpbyvGenericTest, RandomData ) T beta = std::get<5>(GetParam()); // Set the threshold for the errors: - double thresh = 20*testinghelpers::getEpsilon(); + double thresh = 20 * testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call generic test body using those parameters //---------------------------------------------------------- - test_axpbyv( conj_x, n, incx, incy, alpha, beta, thresh ); + test_axpbyv(conj_x, n, incx, incy, alpha, beta, thresh); } // Used to generate a test case with a sensible name. // Beware that we cannot use fp numbers (e.g., 2.3) in the names, // so we are only printing int(2.3). This should be enough for debugging purposes. // If this poses an issue, please reach out. 
-class zaxpbyvGenericTestPrint { +class zaxpbyvAccTestPrint +{ public: std::string operator()( - testing::TestParamInfo> str) const { - char conj = std::get<0>(str.param); - gtint_t n = std::get<1>(str.param); - gtint_t incx = std::get<2>(str.param); - gtint_t incy = std::get<3>(str.param); + testing::TestParamInfo> str) const + { + char conj = std::get<0>(str.param); + gtint_t n = std::get<1>(str.param); + gtint_t incx = std::get<2>(str.param); + gtint_t incy = std::get<3>(str.param); dcomplex alpha = std::get<4>(str.param); - dcomplex beta = std::get<5>(str.param); + dcomplex beta = std::get<5>(str.param); #ifdef TEST_BLAS std::string str_name = "zaxpby_"; #elif TEST_CBLAS std::string str_name = "cblas_zaxpby"; -#else //#elif TEST_BLIS_TYPED +#else // #elif TEST_BLIS_TYPED std::string str_name = "bli_zaxpbyv"; #endif str_name += "_" + std::to_string(n); str_name += "_" + std::string(&conj, 1); - std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); + std::string incx_str = (incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); str_name += "_" + incx_str; - std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); + std::string incy_str = (incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); str_name += "_" + incy_str; - std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); - alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); - std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); - beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); + std::string alpha_str = (alpha.real > 0) ? 
std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); + alpha_str = alpha_str + "pi" + ((alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); + std::string beta_str = (beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); + beta_str = beta_str + "pi" + ((beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); str_name = str_name + "_a" + alpha_str; str_name = str_name + "_b" + beta_str; return str_name; } }; -// Black box testing for generic and main use of zaxpby. +/* + The code structure for bli_zaxpbyv_zen_int( ... ) is as follows : + For unit strides : + Main loop : In blocks of 8 --> L8 + Fringe loops : In blocks of 6 --> L6 + In blocks of 4 --> L4 + In blocks of 2 --> L2 + + For non-unit strides : A single loop, to process element wise. + NOTE : Any size, requiring the fringe case of 1 with unit stride falls to + the non-unit stride loop and executes it once for just the last element. +*/ + +// Accuracy testing of the main loop, single and multiple runs +INSTANTIATE_TEST_SUITE_P( + bli_zaxpbyv_zen_int_acc_US_main, + zaxpbyvAccTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. 
+#endif + ), + ::testing::Values(gtint_t(8), gtint_t(40)), // m + ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{1.0, 2.0}) // beta + ), + ::zaxpbyvAccTestPrint()); + +// Accuracy testing of different combinations of fringe loops(L6, L4, L2, 1) INSTANTIATE_TEST_SUITE_P( - Blackbox, - zaxpbyvGenericTest, - ::testing::Combine( - ::testing::Values('n' // n: use x, c: use conj(x) + bli_zaxpbyv_zen_int_acc_US_fringe, + zaxpbyvAccTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) #ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. + , + 'c' // this option is BLIS-api specific. #endif - ), - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(gtint_t(1)), // stride size for y - ::testing::Values(dcomplex{-3.0, 1.0}, dcomplex{1.0, 2.0}), // alpha - ::testing::Values(dcomplex{1.0, 2.0}) // beta + ), + ::testing::Range(gtint_t(1), gtint_t(7), 1), // m + ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{1.0, 2.0}) // beta ), - ::zaxpbyvGenericTestPrint() - ); + ::zaxpbyvAccTestPrint()); -// Test for non-unit increments. -// Only test very few cases as sanity check. -// We can modify the values using implementantion details. 
+// Accuracy testing of 3*L8 + L6 + L4 + L2 + 1, a case of main + all fringe cases taken INSTANTIATE_TEST_SUITE_P( - NonUnitPositiveIncrements, - zaxpbyvGenericTest, - ::testing::Combine( - ::testing::Values('n' + bli_zaxpbyv_zen_int_acc_US_combine, + zaxpbyvAccTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) #ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. + , + 'c' // this option is BLIS-api specific. #endif - ), // n: use x, c: use conj(x) - ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(2)), /*(gtint_t(-5), gtint_t(-17))*/// stride size for x - ::testing::Values(gtint_t(4)), /*(gtint_t(-12), gtint_t(-4))*/// stride size for y - ::testing::Values(dcomplex{4.0, 3.1}), // alpha - ::testing::Values(dcomplex{1.0, 2.0}) // beta + ), + ::testing::Values(gtint_t(30), gtint_t(34), gtint_t(36), gtint_t(37)), // m + ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(gtint_t(1)), // stride size for y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{1.0, 2.0}) // beta ), - ::zaxpbyvGenericTestPrint() - ); + ::zaxpbyvAccTestPrint()); -#ifndef TEST_BLIS_TYPED -// Test for negative increments. -// Only test very few cases as sanity check. -// We can modify the values using implementantion details. +// Accuracy testing with non-unit strides INSTANTIATE_TEST_SUITE_P( - NegativeIncrements, - zaxpbyvGenericTest, - ::testing::Combine( - ::testing::Values('n'), // n: use x, c: use conj(x) - ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. 
- ::testing::Values(gtint_t(11), gtint_t(-11)), // stride size for x - ::testing::Values(gtint_t(-3), gtint_t(4)), // stride size for y - ::testing::Values(dcomplex{4.0, 3.1}), // alpha - ::testing::Values(dcomplex{1.0, -2.0}) // beta + bli_zaxpbyv_zen_int_acc_NUS, + zaxpbyvAccTest, + ::testing::Combine( + ::testing::Values('n' // n: use x, c: use conj(x) +#ifdef TEST_BLIS_TYPED + , + 'c' // this option is BLIS-api specific. +#endif + ), + ::testing::Values(gtint_t(10), gtint_t(17)), // m + ::testing::Values(gtint_t(-3), gtint_t(4)), // stride size for x + ::testing::Values(gtint_t(6), gtint_t(-2)), // stride size for y + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha + ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{1.0, 2.0}) // beta ), - ::zaxpbyvGenericTestPrint() - ); -#endif \ No newline at end of file + ::zaxpbyvAccTestPrint()); \ No newline at end of file diff --git a/kernels/zen/1/bli_axpbyv_zen_int.c b/kernels/zen/1/bli_axpbyv_zen_int.c index c92d44ad3e..2ada1dcb67 100644 --- a/kernels/zen/1/bli_axpbyv_zen_int.c +++ b/kernels/zen/1/bli_axpbyv_zen_int.c @@ -734,416 +734,593 @@ void bli_zaxpbyv_zen_int ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4) - const dim_t n_elem_per_reg = 4; // number of elements per register - dim_t i; // iterator + dim_t i = 0; // iterator + // Local pointers to x and y vectors double* restrict x0; double* restrict y0; + // Variables to store real and imaginary components of alpha and beta double alphaR, alphaI, betaR, betaI; - __m256d alphaRv; - __m256d alphaIv; - __m256d betaRv; - __m256d betaIv; - __m256d xv[4]; - __m256d yv[4]; - __m256d iv[4]; // intermediate registers - + // Local variable to store the conjugate type conj_t conjx_use = conjx; - - /* if the vector dimension is zero, or if alpha & beta are zero, - return early. 
*/ - if ( bli_zero_dim1( n ) || - ( PASTEMAC( c, eq0 )( *alpha ) && PASTEMAC( c, eq0 )( *beta ) ) ) + + /* If the vector dimension is zero, return early. */ + if ( bli_zero_dim1( n ) ) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4) return; } - // initialize local pointers - x0 = ( double* ) x; - y0 = ( double* ) y; + // Initializing the local pointers + x0 = ( double* ) x; + y0 = ( double* ) y; alphaR = alpha->real; alphaI = alpha->imag; betaR = beta->real; betaI = beta->imag; - if ( incx == 1 && incy == 1 ) - { - //---------- Scalar algorithm BLIS_NO_CONJUGATE ------------- - // y = beta*y + alpha*x - // y = ( bR + ibI ) * ( yR + iyI ) + ( aR + iaI ) * ( xR + ixI ) - // y = bR.yR + ibR.yI + ibI.yR - ibIyI + aR.xR + iaR.xI + iaI.xR - aI.xI - // y = ( bR.yR - bI.yI + aR.xR - aI.xI ) + - // i ( bR.yI + bI.yR + aR.xI + aI.xR ) + // Vectors to store real and imaginary components of beta + __m256d betaRv, betaIv; - // SIMD Algorithm BLIS_NO_CONJUGATE - // yv = yR1 yI1 yR2 yI2 - // yv' = yI1 yR1 yI2 yR2 - // xv = xR1 xI1 xR2 xI2 - // xv' = xI1 xR1 xI2 xR2 - // arv = aR aR aR aR - // aiv = -aI aI -aI aI - // brv = bR bR bR bR - // biv = -bI bI -bI bI - // - // step 1: iv = brv * iv - // step 2: shuffle yv -> yv' - // step 3: FMA yv = biv * yv' + iv - // step 4: iv = arv * xv - // step 5: shuffle xv -> xv' - // step 6: FMA yv = aiv * xv' + iv + // Broadcasting real and imaginary components of beta onto the registers + betaRv = _mm256_broadcast_sd( &betaR ); + betaIv = _mm256_broadcast_sd( &betaI ); - //---------- Scalar algorithm BLIS_CONJUGATE ------------- - // y = beta*y + alpha*conj(x) - // y = ( bR + ibI ) * ( yR + iyI ) + ( aR + iaI ) * ( xR - ixI ) - // y = bR.yR + ibR.yI + ibI.yR - bI.yI + aR.xR - iaR.xI + iaI.xR + aI.xI - // y = ( bR.yR - bI.yI + aR.xR + aI.xI ) + - // i ( bR.yI + bI.yR - aR.xI + aI.xR ) + // Initializing a variable to classify the type of the computation + bool is_alpha_zero = bli_zeq0( *alpha ); - // SIMD Algorithm BLIS_CONJUGATE - // yv = yR1 
yI1 yR2 yI2 - // yv' = yI1 yR1 yI2 yR2 - // xv = xR1 xI1 xR2 xI2 - // xv' = xI1 xR1 xI2 xR2 - // arv = aR -aR aR -aR - // aiv = aI aI aI aI - // brv = bR bR bR bR - // biv = -bI bI -bI bI - // - // step 1: iv = brv * iv - // step 2: shuffle yv -> yv' - // step 3: FMA yv = biv * yv' + iv - // step 4: iv = arv * xv - // step 5: shuffle xv -> xv' - // step 6: FMA yv = aiv * xv' + iv + // In case of unit strides for x and y vectors + if ( incx == 1 && incy == 1 ) + { + // Number of double precision elements in a YMM register + const dim_t n_elem_per_reg = 4; - // broadcast alpha & beta to all elements of respective vector registers - if ( !bli_is_conj( conjx ) ) - { - // alphaRv = aR aR aR aR - // alphaIv = -aI aI -aI aI - // betaRv = bR bR bR bR - // betaIv = -bI bI -bI bI - alphaRv = _mm256_broadcast_sd( &alphaR ); - alphaIv = _mm256_set_pd( alphaI, -alphaI, alphaI, -alphaI ); - betaRv = _mm256_broadcast_sd( &betaR ); - betaIv = _mm256_set_pd( betaI, -betaI, betaI, -betaI ); - } - else - { - // alphaRv = aR -aR aR -aR - // alphaIv = aI aI aI aI - // betaRv = bR bR bR bR - // betaIv = -bI bI -bI bI - alphaRv = _mm256_set_pd( -alphaR, alphaR, -alphaR, alphaR ); - alphaIv = _mm256_broadcast_sd( &alphaI ); - betaRv = _mm256_broadcast_sd( &betaR ); - betaIv = _mm256_set_pd( betaI, -betaI, betaI, -betaI ); - } + // Scratch registers + __m256d xv[4]; + __m256d yv[4]; + __m256d iv[4]; - // Processing 8 elements per loop, 8 FMAs - for ( i = 0; ( i + 7 ) < n; i += 8 ) + // In case of alpha being 0, we just need to scale y by beta + if( is_alpha_zero ) { - // xv = xR1 xI1 xR2 xI2 - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); - - // yv = yR1 yI1 yR2 yI2 - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); - yv[3] = 
_mm256_loadu_pd( y0 + 3*n_elem_per_reg ); - - // iv = betaRv * yv - // = yR1.bR, yI1.bR, yR2.bR, yI2.bR, ... - iv[0] = _mm256_mul_pd( betaRv, yv[0] ); - iv[1] = _mm256_mul_pd( betaRv, yv[1] ); - iv[2] = _mm256_mul_pd( betaRv, yv[2] ); - iv[3] = _mm256_mul_pd( betaRv, yv[3] ); - - // yv' = yI1 yR1 yI2 yR2 - yv[0] = _mm256_permute_pd( yv[0], 5); - yv[1] = _mm256_permute_pd( yv[1], 5); - yv[2] = _mm256_permute_pd( yv[2], 5); - yv[3] = _mm256_permute_pd( yv[3], 5); - - // yv = betaIv * yv' + iv - // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... - yv[0] = _mm256_fmadd_pd( betaIv, yv[0], iv[0] ); - yv[1] = _mm256_fmadd_pd( betaIv, yv[1], iv[1] ); - yv[2] = _mm256_fmadd_pd( betaIv, yv[2], iv[2] ); - yv[3] = _mm256_fmadd_pd( betaIv, yv[3], iv[3] ); - - // iv = alphaRv * xv - // = xR1.aR, xI1.aR, xR2.aR, xI2.aR, ... - iv[0] = _mm256_mul_pd( alphaRv, xv[0] ); - iv[1] = _mm256_mul_pd( alphaRv, xv[1] ); - iv[2] = _mm256_mul_pd( alphaRv, xv[2] ); - iv[3] = _mm256_mul_pd( alphaRv, xv[3] ); - - // xv' = xI1 xR1 xI2 xR2 - xv[0] = _mm256_permute_pd( xv[0], 5); - xv[1] = _mm256_permute_pd( xv[1], 5); - xv[2] = _mm256_permute_pd( xv[2], 5); - xv[3] = _mm256_permute_pd( xv[3], 5); - - // yv = alphaIv * xv + yv - // = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ... 
- iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] ); - iv[1] = _mm256_fmadd_pd( alphaIv, xv[1], iv[1] ); - iv[2] = _mm256_fmadd_pd( alphaIv, xv[2], iv[2] ); - iv[3] = _mm256_fmadd_pd( alphaIv, xv[3], iv[3] ); + // Processing 8 elements per loop, 8 FMAs + for ( i = 0; ( i + 7 ) < n; i += 8 ) + { + // Load the y vector, 8 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + yv[1] = _mm256_loadu_pd( y0 + 1 * n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2 * n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3 * n_elem_per_reg ); + + // Permute the loaded vectors for the required compute + // xv = yI1 yR1 yI2 yR2 + xv[0] = _mm256_permute_pd( yv[0], 5 ); + xv[1] = _mm256_permute_pd( yv[1], 5 ); + xv[2] = _mm256_permute_pd( yv[2], 5 ); + xv[3] = _mm256_permute_pd( yv[3], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_mul_pd( betaIv, xv[0] ); + iv[1] = _mm256_mul_pd( betaIv, xv[1] ); + iv[2] = _mm256_mul_pd( betaIv, xv[2] ); + iv[3] = _mm256_mul_pd( betaIv, xv[3] ); + + // Using fmaddsub to scale with real component of beta and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... 
+ yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + yv[1] = _mm256_fmaddsub_pd( betaRv, yv[1], iv[1] ); + yv[2] = _mm256_fmaddsub_pd( betaRv, yv[2], iv[2] ); + yv[3] = _mm256_fmaddsub_pd( betaRv, yv[3], iv[3] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + _mm256_storeu_pd( ( y0 + 1 * n_elem_per_reg ), yv[1] ); + _mm256_storeu_pd( ( y0 + 2 * n_elem_per_reg ), yv[2] ); + _mm256_storeu_pd( ( y0 + 3 * n_elem_per_reg ), yv[3] ); + + // Adjusting the pointers for the next iteration + y0 += 4 * n_elem_per_reg; + x0 += 4 * n_elem_per_reg; + } - yv[0] = _mm256_add_pd( yv[0], iv[0] ); - yv[1] = _mm256_add_pd( yv[1], iv[1] ); - yv[2] = _mm256_add_pd( yv[2], iv[2] ); - yv[3] = _mm256_add_pd( yv[3], iv[3] ); + // Processing 6 elements per loop, 6 FMAs + for ( ; ( i + 5 ) < n; i += 6 ) + { + // Load the y vector, 6 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + yv[1] = _mm256_loadu_pd( y0 + 1 * n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2 * n_elem_per_reg ); + + // Permute the loaded vectors for the required compute + // xv = yI1 yR1 yI2 yR2 + xv[0] = _mm256_permute_pd( yv[0], 5 ); + xv[1] = _mm256_permute_pd( yv[1], 5 ); + xv[2] = _mm256_permute_pd( yv[2], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_mul_pd( betaIv, xv[0] ); + iv[1] = _mm256_mul_pd( betaIv, xv[1] ); + iv[2] = _mm256_mul_pd( betaIv, xv[2] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... 
+ yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + yv[1] = _mm256_fmaddsub_pd( betaRv, yv[1], iv[1] ); + yv[2] = _mm256_fmaddsub_pd( betaRv, yv[2], iv[2] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + _mm256_storeu_pd( ( y0 + 1 * n_elem_per_reg ), yv[1] ); + _mm256_storeu_pd( ( y0 + 2 * n_elem_per_reg ), yv[2] ); + + // Adjusting the pointers for the next iteration + y0 += 3 * n_elem_per_reg; + x0 += 3 * n_elem_per_reg; + } - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] ); - _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), yv[2] ); - _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), yv[3] ); + // Processing 4 elements per loop, 4 FMAs + for ( ; ( i + 3 ) < n; i += 4 ) + { + // Load the y vector, 4 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + yv[1] = _mm256_loadu_pd( y0 + 1 * n_elem_per_reg ); + + // Permute the loaded vectors for the required compute + // xv = yI1 yR1 yI2 yR2 + xv[0] = _mm256_permute_pd( yv[0], 5 ); + xv[1] = _mm256_permute_pd( yv[1], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = yI1.bI, yR1.bI, yI2.bI, yR2.bI + iv[0] = _mm256_mul_pd( betaIv, xv[0] ); + iv[1] = _mm256_mul_pd( betaIv, xv[1] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... 
+ yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + yv[1] = _mm256_fmaddsub_pd( betaRv, yv[1], iv[1] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + _mm256_storeu_pd( ( y0 + 1 * n_elem_per_reg ), yv[1] ); + + // Adjusting the pointers for the next iteration + y0 += 2 * n_elem_per_reg; + x0 += 2 * n_elem_per_reg; + } - y0 += 4*n_elem_per_reg; - x0 += 4*n_elem_per_reg; + // Processing 2 elements per loop, 3 FMAs + for ( ; ( i + 1 ) < n; i += 2 ) + { + // Load the y vector, 2 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + + // Permute the loaded vectors for the required compute + // xv = yI1 yR1 yI2 yR2 + xv[0] = _mm256_permute_pd( yv[0], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_mul_pd( betaIv, xv[0] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... + yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + + // Adjusting the pointers for the next iteration + y0 += 1 * n_elem_per_reg; + x0 += 1 * n_elem_per_reg; + } } - // Processing 6 elements per loop, 6 FMAs - for ( ; ( i + 5 ) < n; i += 6 ) + else { - // xv = xR1 xI1 xR2 xI2 - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); - - // yv = yR1 yI1 yR2 yI2 - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); - - // iv = betaRv * yv - // = yR1.bR, yI1.bR, yR2.bR, yI2.bR, ... 
- iv[0] = _mm256_mul_pd( betaRv, yv[0] ); - iv[1] = _mm256_mul_pd( betaRv, yv[1] ); - iv[2] = _mm256_mul_pd( betaRv, yv[2] ); + // Scratch registers for storing real and imaginary components of alpha + __m256d alphaRv, alphaIv; - // yv' = yI1 yR1 yI2 yR2 - yv[0] = _mm256_permute_pd( yv[0], 5); - yv[1] = _mm256_permute_pd( yv[1], 5); - yv[2] = _mm256_permute_pd( yv[2], 5); + iv[0] = _mm256_setzero_pd(); - // yv = betaIv * yv' + iv - // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... - yv[0] = _mm256_fmadd_pd( betaIv, yv[0], iv[0] ); - yv[1] = _mm256_fmadd_pd( betaIv, yv[1], iv[1] ); - yv[2] = _mm256_fmadd_pd( betaIv, yv[2], iv[2] ); + alphaRv = _mm256_broadcast_sd( &alphaR ); + alphaIv = _mm256_broadcast_sd( &alphaI ); - // iv = alphaRv * xv - // = xR1.aR, xI1.aR, xR2.aR, xI2.aR, ... - iv[0] = _mm256_mul_pd( alphaRv, xv[0] ); - iv[1] = _mm256_mul_pd( alphaRv, xv[1] ); - iv[2] = _mm256_mul_pd( alphaRv, xv[2] ); + // The changes on alphaRv and alphaIv are as follows : + // If conjugate is required: + // alphaRv = aR -aR aR -aR + // Else : + // alphaIv = -aI aI -aI aI + if( bli_is_conj( conjx_use ) ) + { + alphaRv = _mm256_fmsubadd_pd( iv[0], iv[0], alphaRv ); + } + else + { + alphaIv = _mm256_addsub_pd( iv[0], alphaIv ); + } - // xv' = xI1 xR1 xI2 xR2 - xv[0] = _mm256_permute_pd( xv[0], 5); - xv[1] = _mm256_permute_pd( xv[1], 5); - xv[2] = _mm256_permute_pd( xv[2], 5); + // Processing 8 elements per loop, 8 FMAs + for ( i = 0; ( i + 7 ) < n; i += 8 ) + { + // Load the y vector, 8 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + yv[1] = _mm256_loadu_pd( y0 + 1 * n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2 * n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3 * n_elem_per_reg ); + + // Load the x vector, 8 elements in total + // xv = xR1 xI1 xR2 xI2 + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg ); + xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg ); 
+ + // Permute the vectors from y for the required compute + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_permute_pd( yv[0], 5 ); + iv[1] = _mm256_permute_pd( yv[1], 5 ); + iv[2] = _mm256_permute_pd( yv[2], 5 ); + iv[3] = _mm256_permute_pd( yv[3], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = betaIv * yv + // = yI1.bI, yR1.bI, yI2.bI, yR2.bI, ... + iv[0] = _mm256_mul_pd( betaIv, iv[0] ); + iv[1] = _mm256_mul_pd( betaIv, iv[1] ); + iv[2] = _mm256_mul_pd( betaIv, iv[2] ); + iv[3] = _mm256_mul_pd( betaIv, iv[3] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... + yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + yv[1] = _mm256_fmaddsub_pd( betaRv, yv[1], iv[1] ); + yv[2] = _mm256_fmaddsub_pd( betaRv, yv[2], iv[2] ); + yv[3] = _mm256_fmaddsub_pd( betaRv, yv[3], iv[3] ); + + // Permute the loaded vectors from x for the required compute + // xv' = xI1 xR1 xI2 xR2 + iv[0] = _mm256_permute_pd( xv[0], 5 ); + iv[1] = _mm256_permute_pd( xv[1], 5 ); + iv[2] = _mm256_permute_pd( xv[2], 5 ); + iv[3] = _mm256_permute_pd( xv[3], 5 ); + + // yv = alphaRv * xv + yv + // = yR1.bR - yR1.bI + xR1.aR, yI1.bR + yI1.bI + xI1.aR, ... + yv[0] = _mm256_fmadd_pd( alphaRv, xv[0], yv[0] ); + yv[1] = _mm256_fmadd_pd( alphaRv, xv[1], yv[1] ); + yv[2] = _mm256_fmadd_pd( alphaRv, xv[2], yv[2] ); + yv[3] = _mm256_fmadd_pd( alphaRv, xv[3], yv[3] ); + + // yv = alphaIv * iv + yv + // = yR1.bR - yR1.bI - xI1.aI, yI1.bR + yI1.bI + xR1.aI, ... 
+ yv[0] = _mm256_fmadd_pd( alphaIv, iv[0], yv[0] ); + yv[1] = _mm256_fmadd_pd( alphaIv, iv[1], yv[1] ); + yv[2] = _mm256_fmadd_pd( alphaIv, iv[2], yv[2] ); + yv[3] = _mm256_fmadd_pd( alphaIv, iv[3], yv[3] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + _mm256_storeu_pd( ( y0 + 1 * n_elem_per_reg ), yv[1] ); + _mm256_storeu_pd( ( y0 + 2 * n_elem_per_reg ), yv[2] ); + _mm256_storeu_pd( ( y0 + 3 * n_elem_per_reg ), yv[3] ); + + // Adjusting the pointers for the next iteration + y0 += 4 * n_elem_per_reg; + x0 += 4 * n_elem_per_reg; + } - // yv = alphaIv * xv + yv - // = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ... - iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] ); - iv[1] = _mm256_fmadd_pd( alphaIv, xv[1], iv[1] ); - iv[2] = _mm256_fmadd_pd( alphaIv, xv[2], iv[2] ); + // Processing 6 elements per loop, 6 FMAs + for ( ; ( i + 5 ) < n; i += 6 ) + { + // Load the y vector, 6 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + yv[1] = _mm256_loadu_pd( y0 + 1 * n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2 * n_elem_per_reg ); + + // Load the x vector, 6 elements in total + // xv = xR1 xI1 xR2 xI2 + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg ); + + // Permute the vectors from y for the required compute + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_permute_pd( yv[0], 5 ); + iv[1] = _mm256_permute_pd( yv[1], 5 ); + iv[2] = _mm256_permute_pd( yv[2], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = betaIv * yv + // = yI1.bI, yR1.bI, yI2.bI, yR2.bI, ...` + iv[0] = _mm256_mul_pd( betaIv, iv[0] ); + iv[1] = _mm256_mul_pd( betaIv, iv[1] ); + iv[2] = _mm256_mul_pd( betaIv, iv[2] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... 
+ yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + yv[1] = _mm256_fmaddsub_pd( betaRv, yv[1], iv[1] ); + yv[2] = _mm256_fmaddsub_pd( betaRv, yv[2], iv[2] ); + + // Permute the loaded vectors from x for the required compute + // xv' = xI1 xR1 xI2 xR2 + iv[0] = _mm256_permute_pd( xv[0], 5 ); + iv[1] = _mm256_permute_pd( xv[1], 5 ); + iv[2] = _mm256_permute_pd( xv[2], 5 ); + + // yv = alphaRv * xv + yv + // = yR1.bR - yR1.bI + xR1.aR, yI1.bR + yI1.bI + xI1.aR, ... + yv[0] = _mm256_fmadd_pd( alphaRv, xv[0], yv[0] ); + yv[1] = _mm256_fmadd_pd( alphaRv, xv[1], yv[1] ); + yv[2] = _mm256_fmadd_pd( alphaRv, xv[2], yv[2] ); + + // yv = alphaIv * iv + yv + // = yR1.bR - yR1.bI - xI1.aI, yI1.bR + yI1.bI + xR1.aI, ... + yv[0] = _mm256_fmadd_pd( alphaIv, iv[0], yv[0] ); + yv[1] = _mm256_fmadd_pd( alphaIv, iv[1], yv[1] ); + yv[2] = _mm256_fmadd_pd( alphaIv, iv[2], yv[2] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + _mm256_storeu_pd( ( y0 + 1 * n_elem_per_reg ), yv[1] ); + _mm256_storeu_pd( ( y0 + 2 * n_elem_per_reg ), yv[2] ); + + // Adjusting the pointers for the next iteration + y0 += 3 * n_elem_per_reg; + x0 += 3 * n_elem_per_reg; + } - yv[0] = _mm256_add_pd( yv[0], iv[0] ); - yv[1] = _mm256_add_pd( yv[1], iv[1] ); - yv[2] = _mm256_add_pd( yv[2], iv[2] ); + // Processing 4 elements per loop, 4 FMAs + for ( ; ( i + 3 ) < n; i += 4 ) + { + // Load the y vector, 6 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + yv[1] = _mm256_loadu_pd( y0 + 1 * n_elem_per_reg ); + + // Load the x vector, 6 elements in total + // xv = xR1 xI1 xR2 xI2 + xv[0] = _mm256_loadu_pd( x0 ); + xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg ); + + // Permute the vectors from y for the required compute + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_permute_pd( yv[0], 5 ); + iv[1] = _mm256_permute_pd( yv[1], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = betaIv * yv + // = yI1.bI, yR1.bI, yI2.bI, yR2.bI, ... 
+ iv[0] = _mm256_mul_pd( betaIv, iv[0] ); + iv[1] = _mm256_mul_pd( betaIv, iv[1] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... + yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + yv[1] = _mm256_fmaddsub_pd( betaRv, yv[1], iv[1] ); + + // Permute the loaded vectors from x for the required compute + // xv' = xI1 xR1 xI2 xR2 + iv[0] = _mm256_permute_pd( xv[0], 5 ); + iv[1] = _mm256_permute_pd( xv[1], 5 ); + + // yv = alphaRv * xv + yv + // = yR1.bR - yR1.bI + xR1.aR, yI1.bR + yI1.bI + xI1.aR, ... + yv[0] = _mm256_fmadd_pd( alphaRv, xv[0], yv[0] ); + yv[1] = _mm256_fmadd_pd( alphaRv, xv[1], yv[1] ); + + // yv = alphaIv * iv + yv + // = yR1.bR - yR1.bI - xI1.aI, yI1.bR + yI1.bI + xR1.aI, ... + yv[0] = _mm256_fmadd_pd( alphaIv, iv[0], yv[0] ); + yv[1] = _mm256_fmadd_pd( alphaIv, iv[1], yv[1] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + _mm256_storeu_pd( ( y0 + 1 * n_elem_per_reg ), yv[1] ); + + // Adjusting the pointers for the next iteration + y0 += 2 * n_elem_per_reg; + x0 += 2 * n_elem_per_reg; + } - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] ); - _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), yv[2] ); + // Processing 2 elements per loop, 3 FMAs + for ( ; ( i + 1 ) < n; i += 2 ) + { + // Load the y vector, 6 elements in total + // yv = yR1 yI1 yR2 yI2 + yv[0] = _mm256_loadu_pd( y0 ); + + // Load the x vector, 6 elements in total + // xv = xR1 xI1 xR2 xI2 + xv[0] = _mm256_loadu_pd( x0 ); + + // Permute the vectors from y for the required compute + // iv = yI1 yR1 yI2 yR2 + iv[0] = _mm256_permute_pd( yv[0], 5 ); + + // Scale the permuted vectors with imaginary component of beta + // iv = betaIv * yv + // = yI1.bI, yR1.bI, yI2.bI, yR2.bI, ... 
+ iv[0] = _mm256_mul_pd( betaIv, iv[0] ); + + // Using fmaddsub to scale with real component of beta + // and sub/add to iv + // yv = betaRv * yv -/+ iv + // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... + yv[0] = _mm256_fmaddsub_pd( betaRv, yv[0], iv[0] ); + + // Permute the loaded vectors from x for the required compute + // xv' = xI1 xR1 xI2 xR2 + iv[0] = _mm256_permute_pd( xv[0], 5 ); + + // yv = alphaRv * xv + yv + // = yR1.bR - yR1.bI + xR1.aR, yI1.bR + yI1.bI + xI1.aR, ... + yv[0] = _mm256_fmadd_pd( alphaRv, xv[0], yv[0] ); + + // yv = alphaIv * iv + yv + // = yR1.bR - yR1.bI - xI1.aI, yI1.bR + yI1.bI + xR1.aI, ... + yv[0] = _mm256_fmadd_pd( alphaIv, iv[0], yv[0] ); + + // Storing the result to memory + _mm256_storeu_pd( ( y0 ), yv[0] ); + + // Adjusting the pointers for the next iteration + y0 += 1 * n_elem_per_reg; + x0 += 1 * n_elem_per_reg; + } - y0 += 3*n_elem_per_reg; - x0 += 3*n_elem_per_reg; } - // Processing 4 elements per loop, 4 FMAs - for ( ; ( i + 3 ) < n; i += 4 ) - { - // xv = xR1 xI1 xR2 xI2 - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); - - // yv = yR1 yI1 yR2 yI2 - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - - // iv = betaRv * yv - // = yR1.bR, yI1.bR, yR2.bR, yI2.bR, ... - iv[0] = _mm256_mul_pd( betaRv, yv[0] ); - iv[1] = _mm256_mul_pd( betaRv, yv[1] ); - - // yv' = yI1 yR1 yI2 yR2 - yv[0] = _mm256_permute_pd( yv[0], 5); - yv[1] = _mm256_permute_pd( yv[1], 5); - - // yv = betaIv * yv' + iv - // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... - yv[0] = _mm256_fmadd_pd( betaIv, yv[0], iv[0] ); - yv[1] = _mm256_fmadd_pd( betaIv, yv[1], iv[1] ); - - // iv = alphaRv * xv - // = xR1.aR, xI1.aR, xR2.aR, xI2.aR, ... - iv[0] = _mm256_mul_pd( alphaRv, xv[0] ); - iv[1] = _mm256_mul_pd( alphaRv, xv[1] ); + // Issue vzeroupper instruction to clear upper lanes of ymm registers. 
+ // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions. + _mm256_zeroupper(); - // xv' = xI1 xR1 xI2 xR2 - xv[0] = _mm256_permute_pd( xv[0], 5); - xv[1] = _mm256_permute_pd( xv[1], 5); + } - // yv = alphaIv * xv + yv - // = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ... - iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] ); - iv[1] = _mm256_fmadd_pd( alphaIv, xv[1], iv[1] ); + // Scratch registers to be used in case of non-unit strides or fringe case of 1. + __m128d x_elem, y_elem, x_perm, y_perm; + __m128d betaRv_128, betaIv_128; - yv[0] = _mm256_add_pd( yv[0], iv[0] ); - yv[1] = _mm256_add_pd( yv[1], iv[1] ); + // Casting the lower 128-bit lanes from betaRv and betaIv to its 128-bit alternative + // registers to avoid redundant broadcasts. + betaRv_128 = _mm256_castpd256_pd128( betaRv ); + betaIv_128 = _mm256_castpd256_pd128( betaIv ); - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] ); + // NOTE : We cannot similarly use _mm256_castpd256_pd128 to avoid loading alpha + // since alpha is loaded onto its YMM registers on requirement basis. + // In case of directly falling to this compute(non-unit stride cases), + // alpha wouldn't have been loaded onto any YMM registers. - y0 += 2*n_elem_per_reg; - x0 += 2*n_elem_per_reg; - } + // Changing betaIv_128 to { -bI bI } for the compute + x_elem = _mm_setzero_pd(); + betaIv_128 = _mm_addsub_pd( x_elem, betaIv_128 ); - // Processing 2 elements per loop, 3 FMAs - for ( ; ( i + 1 ) < n; i += 2 ) + // In case of alpha being 0, we just need to scale y by beta + if ( is_alpha_zero ) + { + // Iterate over y, one element at a time + for ( ; i < n; i += 1 ) { - // xv = xR1 xI1 xR2 xI2 - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - - // yv = yR1 yI1 yR2 yI2 - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - - // iv = betaRv * yv - // = yR1.bR, yI1.bR, yR2.bR, yI2.bR, ...
- iv[0] = _mm256_mul_pd( betaRv, yv[0] ); - - // yv' = yI1 yR1 yI2 yR2 - yv[0] = _mm256_permute_pd( yv[0], 5); - - // yv = betaIv * yv' + iv - // = yR1.bR - yI1.bI, yI1.bR + yR1.bI, ... - yv[0] = _mm256_fmadd_pd( betaIv, yv[0], iv[0] ); - - // iv = alphaRv * xv - // = xR1.aR, xI1.aR, xR2.aR, xI2.aR, ... - iv[0] = _mm256_mul_pd( alphaRv, xv[0] ); - - // xv' = xI1 xR1 xI2 xR2 - xv[0] = _mm256_permute_pd( xv[0], 5); - - // yv = alphaIv * xv + yv - // = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ... - iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] ); + // Load an element from y + // y_elem = yR1 yI1 + y_elem = _mm_loadu_pd( y0 ); - yv[0] = _mm256_add_pd( yv[0], iv[0] ); - - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] ); - - y0 += 1*n_elem_per_reg; - x0 += 1*n_elem_per_reg; - } - - // Issue vzeroupper instruction to clear upper lanes of ymm registers. - // This avoids a performance penalty caused by false dependencies when - // transitioning from AVX to SSE instructions (which may occur as soon - // as the n_left cleanup loop below if BLIS is compiled with - // -mfpmath=sse). 
- _mm256_zeroupper(); - - if ( !bli_is_conj( conjx_use ) ) - { - for ( ; i < n ; ++i ) - { - const double yRc = *y0; - const double yIc = *( y0 + 1 ); + // Permute y in accordance to its compute + // y_perm = yI1 yR1 + y_perm = _mm_permute_pd( y_elem, 0x1 ); - // yReal = ( bR.yR - bI.yI + aR.xR - aI.xI ) - *y0 = ( betaR * yRc ) - ( betaI * yIc ) + - ( alphaR * (*x0) ) - ( alphaI * (*(x0 + 1)) ); - // yImag = ( bR.yI + bI.yR + aR.xI + aI.xR ) - *(y0 + 1) = ( betaR * yIc ) + ( betaI * yRc ) + - ( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) ); + // Scale y_perm by the imaginary + // component of beta + // y_perm = -yI1.bI, yR1.bI + y_perm = _mm_mul_pd( betaIv_128, y_perm ); - x0 += 2; - y0 += 2; - } - } - else - { - for ( ; i < n ; ++i ) - { - const double yRc = *y0; - const double yIc = *( y0 + 1 ); + // Use fmadd to scale with real component of + // beta and add with intermediate result + // y_elem = yR1.bR - yI1.bI, yI1.bR + yR1.bI + y_elem = _mm_fmadd_pd( betaRv_128, y_elem, y_perm ); - // yReal = ( bR.yR - bI.yI + aR.xR - aI.xI ) - *y0 = ( betaR * yRc ) - ( betaI * yIc ) + - ( alphaR * (*x0) ) + ( alphaI * (*(x0 + 1)) ); - // yImag = ( bR.yI + bI.yR + aR.xI + aI.xR ) - *(y0 + 1) = ( betaR * yIc ) + ( betaI * yRc ) - - ( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) ); + // Storing the result to memory + _mm_storeu_pd( y0, y_elem ); - x0 += 2; - y0 += 2; - } + // Adjusting the pointer for the next iteration + y0 += incy * 2; } } else { - // for non-unit increments, use scaler code - if ( !bli_is_conj( conjx_use ) ) + // Scratch registers to store real and imaginary components + // of alpha onto XMM registers + __m128d alphaRv_128, alphaIv_128; + + // Broadcasting real and imaginary components of alpha + x_elem = _mm_setzero_pd(); + alphaRv_128 = _mm_loaddup_pd( &alphaR ); + alphaIv_128 = _mm_loaddup_pd( &alphaI ); + + // The changes on alphaRv_128 and alphaIv_128 are as follows : + // If conjugate is required: + // alphaRv_128 = aR -aR + // Else : + // alphaIv_128 = -aI 
aI + if( bli_is_conj( conjx_use ) ) { - for ( i = 0; i < n ; ++i ) - { - const double yRc = *y0; - const double yIc = *( y0 + 1 ); - - // yReal = ( bR.yR - bI.yI + aR.xR - aI.xI ) - *y0 = ( betaR * yRc ) - ( betaI * yIc ) + - ( alphaR * (*x0) ) - ( alphaI * (*(x0 + 1)) ); - // yImag = ( bR.yI + bI.yR + aR.xI + aI.xR ) - *(y0 + 1) = ( betaR * yIc ) + ( betaI * yRc ) + - ( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) ); - - x0 += incx * 2; - y0 += incy * 2; - } + alphaRv_128 = _mm_addsub_pd( x_elem, alphaRv_128 ); + alphaRv_128 = _mm_permute_pd( alphaRv_128, 0x1 ); } else { - for ( i = 0; i < n ; ++i ) - { - const double yRc = *y0; - const double yIc = *( y0 + 1 ); - - // yReal = ( bR.yR - bI.yI + aR.xR - aI.xI ) - *y0 = ( betaR * yRc ) - ( betaI * yIc ) + - ( alphaR * (*x0) ) + ( alphaI * (*(x0 + 1)) ); - // yImag = ( bR.yI + bI.yR + aR.xI + aI.xR ) - *(y0 + 1) = ( betaR * yIc ) + ( betaI * yRc ) - - ( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) ); + alphaIv_128 = _mm_addsub_pd( x_elem, alphaIv_128 ); + } - x0 += incx * 2; - y0 += incy * 2; - } + // Iterating over x and y vectors, one element at a time + for ( ; i < n; i += 1 ) + { + // Load an element from x and y + // y_elem = yR1 yI1 + // x_elem = xR1 xI1 + y_elem = _mm_loadu_pd( y0 ); + x_elem = _mm_loadu_pd( x0 ); + + // Permute y in accordance to its compute + // y_perm = yI1 yR1 + // x_perm = xI1 xR1 + y_perm = _mm_permute_pd( y_elem, 0x1 ); + x_perm = _mm_permute_pd( x_elem, 0x1 ); + + // Scale y_perm and x_perm by the imaginary + // component of beta and alpha + // y_perm = -yI1.bI, yR1.bI + // x_perm = -xI1.aI, xR1.aI + y_perm = _mm_mul_pd( betaIv_128, y_perm ); + x_perm = _mm_mul_pd( alphaIv_128, x_perm ); + + // Use fmadd to scale with y_elem with + // real component of beta and add with + // intermediate result. Similarly do + // for x_elem.
+ // y_elem = yR1.bR - yI1.bI, yI1.bR + yR1.bI + // x_elem = xR1.aR - xI1.aI, xI1.aR + xR1.aI + y_elem = _mm_fmadd_pd( betaRv_128, y_elem, y_perm ); + x_elem = _mm_fmadd_pd( alphaRv_128, x_elem, x_perm ); + + // Add the computed x and y vectors, store on y. + y_elem = _mm_add_pd( y_elem, x_elem ); + + // Storing the result to memory + _mm_storeu_pd( y0, y_elem ); + + // Adjusting the pointer for the next iteration + x0 += incx * 2; + y0 += incy * 2; } } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4) } From ea0324ab9558fe9bdf9579bb557706970e2ea2fb Mon Sep 17 00:00:00 2001 From: mkadavil Date: Mon, 9 Oct 2023 17:12:03 +0530 Subject: [PATCH 156/226] Multi data type downscaling support for u8s8s16 - u8s8s16 Downscaling is used when GEMM output is accumulated at a higher precision and needs to be converted to a lower precision afterwards. Currently the u8s8s16 flavor of api only supports downscaling to s8 (int8_t) via aocl_gemm_u8s8s16os8 after results are accumulated at int16_t. LPGEMM is modified to support downscaling to different data types, like u8, s16, apart from s8. The framework (5 loop) passes the downscale data type to the micro-kernels. Within the micro-kernel, based on the downscale type, appropriate beta scaling and output buffer store logic is executed. This support is only enabled for u8s8s16 flavor of api's. The LPGEMM bench is also modified to support passing downscale data type for performance and accuracy testing. 
AMD-Internal: [SWLCSG-2313] Change-Id: I723d0802baf8649e5e41236b239880a6043bfd30 --- addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c | 8 +- addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c | 8 +- addon/aocl_gemm/aocl_gemm_f32f32f32of32.c | 8 +- addon/aocl_gemm/aocl_gemm_interface_apis.h | 1 + addon/aocl_gemm/aocl_gemm_s8s8s16os16.c | 4 +- addon/aocl_gemm/aocl_gemm_s8s8s16os8.c | 4 +- addon/aocl_gemm/aocl_gemm_s8s8s32os32.c | 4 +- addon/aocl_gemm/aocl_gemm_s8s8s32os8.c | 4 +- addon/aocl_gemm/aocl_gemm_u8s8s16os16.c | 4 +- addon/aocl_gemm/aocl_gemm_u8s8s16os8.c | 4 +- addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c | 170 +++++++++ addon/aocl_gemm/aocl_gemm_u8s8s32os32.c | 4 +- addon/aocl_gemm/aocl_gemm_u8s8s32os8.c | 4 +- .../aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 11 +- .../frame/f32f32f32/lpgemm_f32f32f32.c | 3 +- .../frame/lpgemm_5loop_interface_apis.h | 2 +- addon/aocl_gemm/frame/lpgemm_post_ops.h | 1 + addon/aocl_gemm/frame/lpgemm_types.h | 18 + .../aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c | 11 +- .../aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c | 11 +- .../threading/lpgemm_thread_decor_openmp.c | 4 +- .../threading/lpgemm_thread_decor_openmp.h | 4 +- .../aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c | 11 +- .../aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c | 11 +- bench/bench_aocl_gemm/bench_lpgemm.c | 149 +++++--- .../u8s8s16/lpgemm_6x32rowmajor_amd256.c | 141 ++++++-- .../lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c | 194 +++++++--- .../lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c | 342 +++++++++++++----- .../lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c | 206 ++++++++--- .../lpgemm/u8s8s16/lpgemm_s16_kern_macros.h | 175 ++++++--- 30 files changed, 1142 insertions(+), 379 deletions(-) create mode 100644 addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c index cf81bcd1b1..9e27ae4fc7 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c @@ -225,7 +225,7 @@ 
AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) ( float* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, BF16 ); } else @@ -238,7 +238,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) ( float* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, BF16 ); } #else @@ -253,7 +253,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) ( float* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, BF16 ); } else @@ -266,7 +266,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) ( float* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, BF16 ); } #endif diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c index 9f799bae35..bbb53d4d92 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c @@ -225,7 +225,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } else @@ -238,7 +238,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } #else @@ -253,7 +253,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } else @@ -266,7 +266,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } #endif diff --git a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c index 0a4963cfc2..311b6a05e2 100644 --- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c @@ -197,7 +197,7 @@ 
AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } else @@ -210,7 +210,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } #else @@ -229,7 +229,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } else @@ -242,7 +242,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, F32 ); } #endif diff --git a/addon/aocl_gemm/aocl_gemm_interface_apis.h b/addon/aocl_gemm/aocl_gemm_interface_apis.h index 41063343b1..142f15fae9 100644 --- a/addon/aocl_gemm/aocl_gemm_interface_apis.h +++ b/addon/aocl_gemm/aocl_gemm_interface_apis.h @@ -110,6 +110,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16); AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32); AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8); AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8); +AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8); AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16); AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32); AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8); diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c index c8cb6fa528..f8f22c215d 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c @@ -153,7 +153,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S16 ); #else lpgemm_s8s8s16o16_thread_decorator @@ -164,7 +164,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - 
post_op_list, FALSE + post_op_list, S16 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c index dfc4954045..f1a640bcd7 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c @@ -153,7 +153,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) ( int16_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #else lpgemm_s8s8s16o16_thread_decorator @@ -164,7 +164,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) ( int16_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c index 26a3196e1d..9f4f565974 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c @@ -154,7 +154,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S32 ); #else lpgemm_s8s8s32o32_thread_decorator @@ -165,7 +165,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S32 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c index eb68d6ca41..e3562170e3 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c @@ -154,7 +154,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) ( int32_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #else lpgemm_s8s8s32o32_thread_decorator @@ -165,7 +165,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) ( int32_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c 
b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c index 62b43793ca..970200cf9e 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c @@ -153,7 +153,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S16 ); #else lpgemm_u8s8s16o16_thread_decorator @@ -164,7 +164,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S16 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c index 9cdc8b58d0..f40a558b5d 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c @@ -153,7 +153,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) ( int16_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #else lpgemm_u8s8s16o16_thread_decorator @@ -164,7 +164,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) ( int16_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c new file mode 100644 index 0000000000..80b1619ce1 --- /dev/null +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c @@ -0,0 +1,170 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "aocl_gemm_interface_apis.h" +#include "lpgemm_types.h" +#include "lpgemm_5loop_interface_apis.h" +#include "lpgemm_config.h" +#include "lpgemm_utils.h" +#include "lpgemm_thread_decor_openmp.h" +#include "lpgemm_post_ops.h" + +AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) +{ + trans_t blis_transa; + trans_t blis_transb; + + // Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it. + if ( bli_cpuid_is_avx2fma3_supported() == FALSE ) + { + bli_print_msg(" AVX2 ISA not supported by processor, " + "cannot perform u8s8s16 gemm.", __FILE__, __LINE__ ); + return; // Error. + } + + /* Initialize BLIS. */ + bli_init_auto(); + + // Set MC, NC, KC, NR, MR. + aocl_lpgemm_init_global_cntx(); + + // Null check for pointers. 
+ if ((a == NULL) || (b == NULL) || (c == NULL)) + { + return; // Error. + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_trans(transa, &blis_transa); + bli_param_map_netlib_to_blis_trans(transb, &blis_transb); + + /* Perform BLAS parameter checking. */ + // Transpose not supported. + if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || + ( blis_transb != BLIS_NO_TRANSPOSE ) ) + { + return; // Error. + } + + // Sanitize order input. + char order_use = + ( ( order == 'r' ) || ( order == 'R' ) || + ( order == 'c' ) || ( order == 'C' ) ) ? + order : 'r'; + if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + { + return; // Only row major supported. + } + + // Row major input expected with leading dimensions equal to row stride. + if ((lda != k) || (ldb != n) || (ldc != n)) + { + return; // Error. + } + + // Check if dimensions are valid. + if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0)) + { + return; // Error. + } + + const inc_t rs_a = lda; + const inc_t cs_a = 1; + const inc_t rs_b = ldb; + const inc_t cs_b = 1; + const inc_t rs_c = ldc; + const inc_t cs_c = 1; + + AOCL_MEMORY_TAG mtag_a; + AOCL_MEMORY_TAG mtag_b; + + bli_param_map_char_to_lpmtag(mem_format_a, &mtag_a); + bli_param_map_char_to_lpmtag(mem_format_b, &mtag_b); + + // B matrix needs to be packed in a certain format in order to be loaded + // and used in VNNI instruction. As such the mtag_b always needs to be either + // packed or reordered. B matrix as it is (unpacked) cannot be used, and + // the mtag_b is set to packed to enable runtime packing. + if (mtag_b == UNPACKED) + { + mtag_b = PACK; + } + + // Only unpacked A supported now. + if (mtag_a != UNPACKED) + { + return; // Error. + } + + // Convert post op struct to post op linked list format.
+ lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; + lpgemm_translate_to_post_ops_list + ( + post_op_unparsed, post_op_list, + ( void* )c, ( void* )( &order_use ) + ); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_g; + bli_rntm_init_from_global(&rntm_g); + bli_pba_rntm_set_pba(&rntm_g); + + lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 ); + +#ifdef BLIS_ENABLE_OPENMP + lpgemm_u8s8s16o16_openmp_thread_decorator + ( + m, n, k, + a, rs_a, cs_a, mtag_a, + b, rs_b, cs_b, mtag_b, + ( int16_t* )c, rs_c, cs_c, + alpha, beta, + &rntm_g, lcntx_g, + post_op_list, U8 + ); +#else + lpgemm_u8s8s16o16_thread_decorator + ( + m, n, k, + a, rs_a, cs_a, mtag_a, + b, rs_b, cs_b, mtag_b, + ( int16_t* )c, rs_c, cs_c, + alpha, beta, + &rntm_g, lcntx_g, + post_op_list, U8 + ); +#endif +} diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c index b092e2ed64..1f44770ec8 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c @@ -154,7 +154,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S32 ); #else lpgemm_u8s8s32o32_thread_decorator @@ -165,7 +165,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, FALSE + post_op_list, S32 ); #endif } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c index eabe2428ba..dad9c56ab9 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c @@ -154,7 +154,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) ( int32_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #else lpgemm_u8s8s32o32_thread_decorator @@ -165,7 +165,7 @@ 
AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) ( int32_t* )c, rs_c, cs_c, alpha, beta, &rntm_g, lcntx_g, - post_op_list, TRUE + post_op_list, S8 ); #endif } diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index d6cc33fbb5..da2427af4c 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -115,7 +115,8 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) bool is_first_k = FALSE; lpgemm_post_op_attr post_ops_attr; - if ( c_downscale == TRUE ) + post_ops_attr.c_stor_type = c_downscale; + if ( c_downscale < F32 ) { post_ops_attr.buf_downscale = c; } @@ -155,12 +156,12 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) ); } - if ( c_downscale == FALSE ) + if ( c_downscale == F32 ) { c_use_jc = c + jc; } // Temp accumulaton buffer for C allocation. - else if ( c_downscale == TRUE ) + else if ( c_downscale < F32 ) { // Buffer memory is only required if output needs to be // persisted across iterations of the pc/KC loop. @@ -303,7 +304,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) // Only per thread C matrix is stored in temp buffer, so both // per thread jc and ic start should be normalized to zero. 
- if ( c_downscale == TRUE ) + if ( c_downscale < F32 ) { c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) ); } @@ -402,7 +403,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) bli_pba_release(rntm, &mem_a); } } - if ( c_downscale == TRUE ) + if ( c_downscale < F32 ) { if ( bli_mem_is_alloc( &mem_scale_c ) ) { diff --git a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c index e513aa86d2..61e8cf8654 100644 --- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c +++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c @@ -150,7 +150,8 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32) bool is_first_k = FALSE; lpgemm_post_op_attr post_ops_attr; - if ( c_downscale == TRUE ) + post_ops_attr.c_stor_type = c_downscale; + if ( c_downscale < F32 ) { post_ops_attr.buf_downscale = c; } diff --git a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h index 62fc678faa..78ccc358a3 100644 --- a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h +++ b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h @@ -62,7 +62,7 @@ void lpgemm_rowvar_ ## LP_SFX \ lpgemm_thrinfo_t* thread, \ lpgemm_cntx_t* lcntx, \ lpgemm_post_op* post_op_list, \ - bool c_downscale \ + AOCL_STORAGE_TYPE c_downscale \ ) \ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32); diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.h b/addon/aocl_gemm/frame/lpgemm_post_ops.h index 7509e57a39..f0a0cea8b5 100644 --- a/addon/aocl_gemm/frame/lpgemm_post_ops.h +++ b/addon/aocl_gemm/frame/lpgemm_post_ops.h @@ -70,6 +70,7 @@ typedef struct lpgemm_post_op_attr_t void* buf_downscale; bool is_first_k; bool is_last_k; + AOCL_STORAGE_TYPE c_stor_type; dim_t b_sum_offset; int32_t* b_col_sum_vec; int16_t* b_col_sum_vec_s16; diff --git a/addon/aocl_gemm/frame/lpgemm_types.h b/addon/aocl_gemm/frame/lpgemm_types.h index b700c03878..02c1813369 100644 --- a/addon/aocl_gemm/frame/lpgemm_types.h +++ 
b/addon/aocl_gemm/frame/lpgemm_types.h @@ -42,6 +42,24 @@ typedef enum INT32 = 2 } AOCL_ARRAY_TYPE; +// Enum to denote the storage data type (output matrix). +// It is expected that the enum entries are in ascending order of +// storage data type size. +typedef enum +{ + S8 = 0, + U8 = 1, + S16 = 2, + U16 = 3, + BF16 = 4, + S32 = 5, + U32 = 6, + F32 = 7, + S64 = 8, + U64 = 9, + F64 = 10 +} AOCL_STORAGE_TYPE; + // Enum name template:A_mat_type ## B_mat_type ## Accumulate_type ## C_mat_type. typedef enum { diff --git a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c index 76c899ddd4..40d1f70ccb 100644 --- a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c +++ b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c @@ -116,7 +116,8 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) bool is_first_k = FALSE; lpgemm_post_op_attr post_ops_attr; - if ( c_downscale == TRUE ) + post_ops_attr.c_stor_type = c_downscale; + if ( c_downscale < S16 ) { post_ops_attr.buf_downscale = c; } @@ -156,12 +157,12 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) ); } - if ( c_downscale == FALSE ) + if ( c_downscale == S16 ) { c_use_jc = c + jc; } // Temp accumulaton buffer for C allocation. - else if ( c_downscale == TRUE ) + else if ( c_downscale < S16 ) { // Buffer memory is only required if output needs to be // persisted across iterations of the pc/KC loop. @@ -329,7 +330,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) // Only per thread C matrix is stored in temp buffer, so both // per thread jc and ic start should be normalized to zero. 
- if ( c_downscale == TRUE ) + if ( c_downscale < S16 ) { c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) ); } @@ -392,7 +393,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) } } } - if ( c_downscale == TRUE ) + if ( c_downscale < S16 ) { if ( bli_mem_is_alloc( &mem_scale_c ) ) { diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c index 15ebba6561..e8decd4ca2 100644 --- a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c +++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c @@ -123,7 +123,8 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) bool is_first_k = FALSE; lpgemm_post_op_attr post_ops_attr; - if ( c_downscale == TRUE ) + post_ops_attr.c_stor_type = c_downscale; + if ( c_downscale < S32 ) { post_ops_attr.buf_downscale = c; } @@ -163,12 +164,12 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) ); } - if ( c_downscale == FALSE ) + if ( c_downscale == S32 ) { c_use_jc = c + jc; } // Temp accumulaton buffer for C allocation. - else if ( c_downscale == TRUE ) + else if ( c_downscale < S32 ) { // Buffer memory is only required if output needs to be // persisted across iterations of the pc/KC loop. @@ -335,7 +336,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) // Only per thread C matrix is stored in temp buffer, so both // per thread jc and ic start should be normalized to zero. 
- if ( c_downscale == TRUE ) + if ( c_downscale < S32 ) { c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) ); } @@ -437,7 +438,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) bli_pba_release( rntm, &mem_a ); } } - if ( c_downscale == TRUE ) + if ( c_downscale < S32 ) { if ( bli_mem_is_alloc( &mem_scale_c ) ) { diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c index 36be28e570..a9f9d2a236 100644 --- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c +++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c @@ -657,7 +657,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \ rntm_t* rntm_g, \ lpgemm_cntx_t* lcntx, \ lpgemm_post_op* post_op_list, \ - bool c_downscale \ + AOCL_STORAGE_TYPE c_downscale \ ) \ { \ dim_t n_threads; \ @@ -759,7 +759,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \ rntm_t* rntm_g, \ lpgemm_cntx_t* lcntx, \ lpgemm_post_op* post_op_list, \ - bool c_downscale \ + AOCL_STORAGE_TYPE c_downscale \ ) \ { \ dim_t n_threads = 1; \ diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h index 80c657b230..a7460bb061 100644 --- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h +++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h @@ -63,7 +63,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \ rntm_t* rntm_g, \ lpgemm_cntx_t* lcntx, \ lpgemm_post_op* post_op_list, \ - bool c_downscale \ + AOCL_STORAGE_TYPE c_downscale \ ); \ GEN_LPGEMM_OPENMP_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16) @@ -97,7 +97,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \ rntm_t* rntm_g, \ lpgemm_cntx_t* lcntx, \ lpgemm_post_op* post_op_list, \ - bool c_downscale \ + AOCL_STORAGE_TYPE c_downscale \ ); \ GEN_LPGEMM_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16) diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c 
b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c index 8a1b03fcc5..1e8d9357c1 100644 --- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c +++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c @@ -113,7 +113,8 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16) bool is_first_k = FALSE; lpgemm_post_op_attr post_ops_attr; - if ( c_downscale == TRUE ) + post_ops_attr.c_stor_type = c_downscale; + if ( c_downscale < S16 ) { post_ops_attr.buf_downscale = c; } @@ -153,12 +154,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16) ); } - if ( c_downscale == FALSE ) + if ( c_downscale == S16 ) { c_use_jc = c + jc; } // Temp accumulaton buffer for C allocation. - else if ( c_downscale == TRUE ) + else if ( c_downscale < S16 ) { // Buffer memory is only required if output needs to be // persisted across iterations of the pc/KC loop. @@ -305,7 +306,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16) // Only per thread C matrix is stored in temp buffer, so both // per thread jc and ic start should be normalized to zero. 
- if ( c_downscale == TRUE ) + if ( c_downscale < S16 ) { c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) ); } @@ -365,7 +366,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16) } } } - if ( c_downscale == TRUE ) + if ( c_downscale < S16 ) { if ( bli_mem_is_alloc( &mem_scale_c ) ) { diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c index 078593097c..d0b06f207b 100644 --- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c +++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c @@ -122,7 +122,8 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) bool is_first_k = FALSE; lpgemm_post_op_attr post_ops_attr; - if ( c_downscale == TRUE ) + post_ops_attr.c_stor_type = c_downscale; + if ( c_downscale < S32 ) { post_ops_attr.buf_downscale = c; } @@ -162,12 +163,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) ); } - if ( c_downscale == FALSE ) + if ( c_downscale == S32 ) { c_use_jc = c + jc; } // Temp accumulaton buffer for C allocation. - else if ( c_downscale == TRUE ) + else if ( c_downscale < S32 ) { // Buffer memory is only required if output needs to be // persisted across iterations of the pc/KC loop. @@ -313,7 +314,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) // Only per thread C matrix is stored in temp buffer, so both // per thread jc and ic start should be normalized to zero. 
- if ( c_downscale == TRUE ) + if ( c_downscale < S32 ) { c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) ); } @@ -416,7 +417,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) bli_pba_release( rntm, &mem_a ); } } - if ( c_downscale == TRUE ) + if ( c_downscale < S32 ) { if ( bli_mem_is_alloc( &mem_scale_c ) ) { diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 765c293f8c..6f43dac4bf 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -43,8 +43,10 @@ #include "blis.h" -#define S8_MIN (-128) -#define S8_MAX (+127) +// Used to clip downscaled output, will be set in the main loop based +// on the accumulation and C data type. +int64_t DSCALE_CLIP_MIN = 0; +int64_t DSCALE_CLIP_MAX = 0; // Mode can be one of the follwoing: // 1. p - performance, used for benchmarks. @@ -77,7 +79,8 @@ static inline float bf16_to_float { int32_t inter_temp = *( ( int16_t* ) &bf16_val ); inter_temp = inter_temp << 16; - float float_value = *( float* ) ( &inter_temp ); + float float_value = 0.0; + memcpy( &float_value, &inter_temp, sizeof( int32_t ) ); return float_value; } @@ -245,6 +248,7 @@ void mat_mul_ ## BLAS_SFX \ GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) +GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) GEN_BLIS_MAT_MUL_FUNC(bfloat16,bfloat16,float,float,bf16bf16f32of32) @@ -343,6 +347,7 @@ void mat_mul_bench_driver_ ## BLAS_SFX \ GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) +GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) 
GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) GEN_MAT_MUL_BENCH_DRV_FUNC(bfloat16,bfloat16,float,float,bf16bf16f32of32) @@ -376,12 +381,13 @@ static inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX max( nearbyintf( ( SCALE_type )( temp_accum ) * \ ( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ) + \ *( ( C_type* )post_op->sum.zero_point + j ), \ - S8_MIN ), \ - S8_MAX ); \ + DSCALE_CLIP_MIN ), \ + DSCALE_CLIP_MAX ); \ return out_temp_accum; \ }\ GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,u8s8s16os8) +GEN_MAT_MUL_ACC_CHK_DOWNSCALE(uint8_t,int16_t,float,u8s8s16ou8) GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,u8s8s32os8) GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,s8s8s32os8) GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,s8s8s16os8) @@ -428,6 +434,7 @@ static inline ACCUM_type mat_mul_accuracy_check_accum_ ## BLAS_SFX \ }\ GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) +GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) @@ -512,6 +519,7 @@ static inline ACCUM_type GELU_TANH_post_op_ ## BLAS_SFX \ }\ GEN_GELU_TANH_POSTOP_INT(int16_t,u8s8s16os8) +GEN_GELU_TANH_POSTOP_INT(int16_t,u8s8s16ou8) GEN_GELU_TANH_POSTOP_INT(int16_t,u8s8s16os16) GEN_GELU_TANH_POSTOP_INT(int32_t,u8s8s32os8) GEN_GELU_TANH_POSTOP_INT(int32_t,u8s8s32os32) @@ -548,6 +556,7 @@ static inline ACCUM_type GELU_ERF_post_op_ ## BLAS_SFX \ }\ GEN_GELU_ERF_POSTOP_INT(int16_t,u8s8s16os8) +GEN_GELU_ERF_POSTOP_INT(int16_t,u8s8s16ou8) GEN_GELU_ERF_POSTOP_INT(int16_t,u8s8s16os16) GEN_GELU_ERF_POSTOP_INT(int32_t,u8s8s32os8) GEN_GELU_ERF_POSTOP_INT(int32_t,u8s8s32os32) @@ -584,6 +593,7 @@ GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int32_t,int32_t) GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int8_t,int32_t) 
GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int16_t,int16_t) GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int8_t,int16_t) +GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(uint8_t,int16_t) GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(float,float) void mat_mul_get_output_type_valfloatbfloat16 @@ -776,6 +786,7 @@ cleanup_acc: \ GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int16_t,int16_t,float,u8s8s16os16,u8s8s16os8) GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int8_t,int16_t,float,u8s8s16os8,u8s8s16os8) +GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,uint8_t,int16_t,float,u8s8s16ou8,u8s8s16ou8) GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int32_t,int32_t,float,u8s8s32os32,u8s8s32os8) GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int8_t,int32_t,float,u8s8s32os8,u8s8s32os8) GEN_MAT_MUL_ACC_CHK_DRV_FUNC(bfloat16,bfloat16,float,float,float,bf16bf16f32of32,bf16bf16f32obf16) @@ -786,7 +797,6 @@ GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int8_t,int32_t,float,s8s8s32os8,s8s8s GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int16_t,int16_t,float,s8s8s16os16,s8s8s16os8) GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int8_t,int16_t,float,s8s8s16os8,s8s8s16os8) -/* Only supports bias followed by RELU and vice versa for now.*/ \ #define GEN_MAT_MUL_POST_OPS_CREATOR(C_DSCALE_type,C_type,DSCALE_type,BLAS_SFX) \ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ ( \ @@ -840,12 +850,12 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ num_eltwise = 0; \ while ( ops_tok ) \ { \ - if ( strcmp( ops_tok, "bias") == 0 ) \ + if ( strcmp( ops_tok, "bias" ) == 0 ) \ { \ post_ops->seq_vector[cur_op_index] = BIAS; \ cur_op_index++; \ } \ - else if ( ( strcmp( ops_tok, "relu") == 0 ) && \ + else if ( ( strcmp( ops_tok, "relu" ) == 0 ) && \ ( is_activator_set == FALSE ) ) \ { \ post_ops->seq_vector[cur_op_index] = ELTWISE; \ @@ -855,7 +865,7 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ activator_idx = cur_op_index; \ cur_op_index++; \ } \ - else if ( ( strcmp( ops_tok, "prelu") == 0 ) && \ + else if ( ( strcmp( ops_tok, 
"prelu" ) == 0 ) && \ ( is_activator_set == FALSE ) ) \ { \ post_ops->seq_vector[cur_op_index] = ELTWISE; \ @@ -865,7 +875,7 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ activator_idx = cur_op_index; \ cur_op_index++; \ } \ - else if ( ( strcmp( ops_tok, "gelu_tanh") == 0 ) && \ + else if ( ( strcmp( ops_tok, "gelu_tanh" ) == 0 ) && \ ( is_activator_set == FALSE ) ) \ { \ post_ops->seq_vector[cur_op_index] = ELTWISE; \ @@ -875,7 +885,7 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ activator_idx = cur_op_index; \ cur_op_index++; \ } \ - else if ( ( strcmp( ops_tok, "gelu_erf") == 0 ) && \ + else if ( ( strcmp( ops_tok, "gelu_erf" ) == 0 ) && \ ( is_activator_set == FALSE ) ) \ { \ post_ops->seq_vector[cur_op_index] = ELTWISE; \ @@ -885,7 +895,7 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ activator_idx = cur_op_index; \ cur_op_index++; \ } \ - else if ( strcmp( ops_tok, "clip") == 0 ) \ + else if ( strcmp( ops_tok, "clip" ) == 0 ) \ { \ post_ops->seq_vector[cur_op_index] = ELTWISE; \ is_clip = TRUE; \ @@ -977,7 +987,7 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ ( post_ops->eltwise + clip_idx )->algo.alpha = malloc( sizeof( C_type ) ); \ ( post_ops->eltwise + clip_idx )->algo.beta = malloc( sizeof( C_type ) ); \ *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.alpha ) = ( C_type ) ( -64 ); \ - *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.beta ) = ( C_type ) ( 3 ); \ + *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.beta ) = ( C_type ) ( 23 ); \ ( post_ops->eltwise + clip_idx )->algo.algo_type = CLIP; \ } \ } \ @@ -1133,8 +1143,8 @@ void mat_mul_bench_main_ ## BLAS_SFX \ GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( m * n ) ); \ } \ \ - C_type alpha; \ - C_type beta; \ + C_type alpha = 0; \ + C_type beta = 0; \ if ( bench_mode == 'p' ) \ { \ alpha = 1; \ @@ -1232,6 +1242,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ 
GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int16_t,u8s8s16os16,u8s8s16os16) GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,u8s8s16os8,u8s8s16os16) +GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,uint8_t,u8s8s16ou8,u8s8s16os16) GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int32_t,u8s8s32os32,u8s8s32os32) GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,u8s8s32os8,u8s8s32os32) GEN_MAT_MUL_BENCH_MAIN_FUNC(float,float,float,f32f32f32of32,f32f32f32of32) @@ -1307,8 +1318,8 @@ void mat_mul_bench_main_ ## BLAS_SFX \ GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( m * n ) ); \ } \ \ - float alpha; \ - float beta; \ + float alpha = 0.0f; \ + float beta = 0.0f; \ if ( bench_mode == 'p' ) \ { \ alpha = 1; \ @@ -1454,11 +1465,12 @@ int main( int argc, char** argv ) char* file_name = NULL; char* post_ops_str = NULL; char* post_ops_str_dest = NULL; //Strtok is used to parse, need to maintain a copy. + char* dscale_type_str = NULL; // Parse CLI arguments. opterr = 0; int opt_val; - while ( ( opt_val = getopt( argc, argv, "i:m:n:o:d" ) ) != -1 ) + while ( ( opt_val = getopt( argc, argv, "i:m:n:o:d:" ) ) != -1 ) { switch ( opt_val ) { @@ -1476,6 +1488,7 @@ int main( int argc, char** argv ) break; case 'd': global_dscale_out = 'y'; + dscale_type_str = optarg; break; default: break; @@ -1578,12 +1591,22 @@ int main( int argc, char** argv ) } else { - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } } } else if ( ( op_type_char == 'f' ) || ( op_type_char == 'F' ) ) @@ -1608,12 +1631,34 @@ int 
main( int argc, char** argv ) } else { - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else if ( ( strcmp( dscale_type_str, "U8" ) == 0 ) || + ( strcmp( dscale_type_str, "u8" ) == 0 ) ) + { + DSCALE_CLIP_MIN = 0; + DSCALE_CLIP_MAX = +255; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16ou8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } } } else if ((op_type_char == 'b') || (op_type_char == 'B')) @@ -1650,12 +1695,22 @@ int main( int argc, char** argv ) } else { - GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } } } else if ( ( op_type_char == 'v' ) || ( op_type_char == 'V' ) ) @@ -1671,12 +1726,22 @@ int main( int argc, char** argv ) } else { - GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, 
"s8" ) == 0 ) ) + { + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } } } if ( post_ops_str != NULL ) diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c index a3f1f01865..286d6422b7 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c @@ -348,41 +348,82 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav) - // c[0, 16-31] - S8_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav) + // c[0, 16-31] + S8_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav) - // c[1,0-15] - S8_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav) + // c[1,0-15] + S8_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav) - // c[1,16-31] - S8_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav) + // c[1,16-31] + S8_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav) - // c[2,0-15] - S8_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav) + // c[2,0-15] + S8_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav) - // c[2,16-31] - S8_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav) + // c[2,16-31] + S8_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav) - // c[3,0-15] - S8_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav) + // c[3,0-15] + S8_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav) - // c[3,16-31] - S8_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav) + // c[3,16-31] + S8_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav) - // c[4,0-15] - S8_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav) + // c[4,0-15] + 
S8_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav) - // c[4,16-31] - S8_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav) + // c[4,16-31] + S8_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav) - // c[5,0-15] - S8_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav) + // c[5,0-15] + S8_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav) - // c[5,16-31] - S8_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav) + // c[5,16-31] + S8_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav) + + // c[0, 16-31] + U8_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav) + + // c[1,0-15] + U8_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav) + + // c[1,16-31] + U8_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav) + + // c[2,0-15] + U8_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav) + + // c[2,16-31] + U8_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav) + + // c[3,0-15] + U8_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav) + + // c[3,16-31] + U8_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav) + + // c[4,0-15] + U8_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav) + + // c[4,16-31] + U8_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav) + + // c[5,0-15] + U8_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav) + + // c[5,16-31] + U8_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav) + } } else { @@ -756,25 +797,49 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - // c[0,0-31] - CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). 
+ // c[0,0-31] + CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); - // c[1,0-31] - CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0); + // c[1,0-31] + CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0); - // c[2,0-31] - CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0); + // c[2,0-31] + CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0); - // c[3,0-31] - CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0); + // c[3,0-31] + CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0); - // c[4,0-31] - CVT_STORE_S16_S8(c_int16_4p0, c_int16_4p1, 4, 0); + // c[4,0-31] + CVT_STORE_S16_S8(c_int16_4p0, c_int16_4p1, 4, 0); - // c[5,0-31] - CVT_STORE_S16_S8(c_int16_5p0, c_int16_5p1, 5, 0); - } + // c[5,0-31] + CVT_STORE_S16_S8(c_int16_5p0, c_int16_5p1, 5, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + // c[0,0-31] + CVT_STORE_S16_U8(c_int16_0p0, c_int16_0p1, 0, 0); + + // c[1,0-31] + CVT_STORE_S16_U8(c_int16_1p0, c_int16_1p1, 1, 0); + + // c[2,0-31] + CVT_STORE_S16_U8(c_int16_2p0, c_int16_2p1, 2, 0); + + // c[3,0-31] + CVT_STORE_S16_U8(c_int16_3p0, c_int16_3p1, 3, 0); + + // c[4,0-31] + CVT_STORE_S16_U8(c_int16_4p0, c_int16_4p1, 4, 0); + + // c[5,0-31] + CVT_STORE_S16_U8(c_int16_5p0, c_int16_5p1, 5, 0); + } + } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
else diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c index 03834b4318..be65426b31 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c @@ -223,29 +223,58 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) - // c[0, 16-31] - S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + // c[0, 16-31] + S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) - // c[1,0-15] - S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + // c[1,0-15] + S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) - // c[1,16-31] - S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2) + // c[1,16-31] + S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2) - // c[2,0-15] - S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2) + // c[2,0-15] + S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2) - // c[2,16-31] - S8_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2) + // c[2,16-31] + S8_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2) - // c[3,0-15] - S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2) + // c[3,0-15] + S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2) - // c[3,16-31] - S8_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2) + // c[3,16-31] + S8_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + U8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + U8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2) + + // 
c[2,0-15] + U8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2) + + // c[2,16-31] + U8_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2) + + // c[3,0-15] + U8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2) + + // c[3,16-31] + U8_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2) + } } else { @@ -528,18 +557,36 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - // c[0,0-31] - CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + // c[0,0-31] + CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + + // c[1,0-31] + CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0); + + // c[2,0-31] + CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0); + + // c[3,0-31] + CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + // c[0,0-31] + CVT_STORE_S16_U8(c_int16_0p0, c_int16_0p1, 0, 0); - // c[1,0-31] - CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0); + // c[1,0-31] + CVT_STORE_S16_U8(c_int16_1p0, c_int16_1p1, 1, 0); - // c[2,0-31] - CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0); + // c[2,0-31] + CVT_STORE_S16_U8(c_int16_2p0, c_int16_2p1, 2, 0); - // c[3,0-31] - CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0); + // c[3,0-31] + CVT_STORE_S16_U8(c_int16_3p0, c_int16_3p1, 3, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
@@ -696,17 +743,34 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) - - // c[0, 16-31] - S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) - - // c[1,0-15] - S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) - - // c[1,16-31] - S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + + // c[1,0-15] + S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + U8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + + // c[1,16-31] + U8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2) + } } else { @@ -901,12 +965,24 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - // c[0,0-31] - CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + // c[0,0-31] + CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + + // c[1,0-31] + CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). 
+ // c[0,0-31] + CVT_STORE_S16_U8(c_int16_0p0, c_int16_0p1, 0, 0); - // c[1,0-31] - CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0); + // c[1,0-31] + CVT_STORE_S16_U8(c_int16_1p0, c_int16_1p1, 1, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. @@ -1019,11 +1095,22 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) - - // c[0, 16-31] - S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[0, 16-31] + U8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2) + } } else { @@ -1174,9 +1261,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - // c[0,0-31] - CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + // c[0,0-31] + CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + // c[0,0-31] + CVT_STORE_S16_U8(c_int16_0p0, c_int16_0p1, 0, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c index 9a02626e84..dc5108386b 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c @@ -192,17 +192,34 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) - - // c[1,0-15] - S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) - - // c[2,0-15] - S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2) - - // c[3,0-15] - S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[1,0-15] + S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + + // c[2,0-15] + S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2) + + // c[3,0-15] + S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + + // c[2,0-15] + U8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2) + + // c[3,0-15] + U8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2) + } } else { @@ -378,14 +395,28 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - __m128i temp[2]; + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). 
+ __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); - // c[0-1,0-15] - CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); + // c[2-3,0-15] + CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_U8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); - // c[2-3,0-15] - CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0); + // c[2-3,0-15] + CVT_STORE_S16_U8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. @@ -567,24 +598,48 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + if ( post_ops_attr.c_stor_type == S8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); - // c[0,0-15] - S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + // c[0,0-15] + S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) - // c[1,0-15] - S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + // c[1,0-15] + S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) - // c[2,0-15] - S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2) + // 
c[2,0-15] + S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2) - // c[3,0-15] - S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2) + // c[3,0-15] + S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + U8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + + // c[0,0-15] + U8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + + // c[2,0-15] + U8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2) + + // c[3,0-15] + U8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2) + } } else { @@ -768,21 +823,42 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - __m128i temp[2]; + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). 
+ __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); - // c[0-1,0-15] - CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); + // c[2-3,0-15] + CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3); - // c[2-3,0-15] - CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3); + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + // c[0-1,0-15] + CVT_STORE_S16_U8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); + + // c[2-3,0-15] + CVT_STORE_S16_U8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3); + + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
@@ -923,11 +999,22 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) - - // c[1,0-15] - S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[1,0-15] + S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2) + } } else { @@ -1059,11 +1146,22 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - __m128i temp[2]; + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + __m128i temp[2]; - // c[0-1,0-15] - CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); + // c[0-1,0-15] + CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_U8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
@@ -1188,16 +1286,32 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + if ( post_ops_attr.c_stor_type == S8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); - // c[0,0-15] - S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + // c[0,0-15] + S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) - // c[1,0-15] - S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + // c[1,0-15] + S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + U8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + + // c[0,0-15] + U8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + } } else { @@ -1335,16 +1449,32 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - __m128i temp[2]; + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). 
+ __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); + + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - // c[0-1,0-15] - CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_U8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. @@ -1449,8 +1579,16 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2) + } } else { @@ -1560,12 +1698,24 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). 
- __m128i temp[2]; - __m256i zero_reg = _mm256_setzero_si256(); + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + __m128i temp[2]; + __m256i zero_reg = _mm256_setzero_si256(); + + // c[0-1,0-15] + CVT_STORE_S16_S8_1ROW(c_int16_0p0, zero_reg, 0, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; + __m256i zero_reg = _mm256_setzero_si256(); - // c[0-1,0-15] - CVT_STORE_S16_S8_1ROW(c_int16_0p0, zero_reg, 0, 0); + // c[0-1,0-15] + CVT_STORE_S16_U8_1ROW(c_int16_0p0, zero_reg, 0, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. @@ -1662,12 +1812,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + if ( post_ops_attr.c_stor_type == S8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - // c[0,0-15] - S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + // c[0,0-15] + S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + U8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + + // c[0,0-15] + U8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + } } else { @@ -1782,16 +1944,32 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). 
- __m128i temp[2]; - __m256i zero_reg = _mm256_setzero_si256(); + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + __m128i temp[2]; + __m256i zero_reg = _mm256_setzero_si256(); - // c[0-1,0-15] - CVT_STORE_S16_S8_1ROW_NLT16(c_int16_0p0, zero_reg, buf0); + // c[0-1,0-15] + CVT_STORE_S16_S8_1ROW_NLT16(c_int16_0p0, zero_reg, buf0); - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; + __m256i zero_reg = _mm256_setzero_si256(); + + // c[0-1,0-15] + CVT_STORE_S16_U8_1ROW_NLT16(c_int16_0p0, zero_reg, buf0); + + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c index 17ac89f3ad..c4182324d8 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c @@ -250,23 +250,46 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - // c[0,0-15] - S8_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2) + if ( post_ops_attr.c_stor_type == S8 ) + { + // c[0,0-15] + S8_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2) - // c[1,0-15] - S8_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2) + // c[1,0-15] + S8_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2) - // c[2,0-15] - S8_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2) + // c[2,0-15] + S8_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2) - // c[3,0-15] - S8_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2) + // c[3,0-15] + S8_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2) - // c[4,0-15] - S8_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2) + // c[4,0-15] + S8_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2) - // c[5,0-15] - S8_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2) + // c[5,0-15] + S8_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2) + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // c[0,0-15] + U8_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2) + + // c[2,0-15] + U8_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2) + + // c[3,0-15] + U8_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2) + + // c[4,0-15] + U8_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2) + + // c[5,0-15] + U8_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2) + } } else { @@ -486,17 +509,34 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( 
post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - __m128i temp[2]; + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). + __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); + + // c[2-3,0-15] + CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0); + + // c[4-5,0-15] + CVT_STORE_S16_S8_2ROW(c_int16_4p0, c_int16_5p0, 4, 5, 0); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). + __m128i temp[2]; - // c[0-1,0-15] - CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); + // c[0-1,0-15] + CVT_STORE_S16_U8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0); - // c[2-3,0-15] - CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0); + // c[2-3,0-15] + CVT_STORE_S16_U8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0); - // c[4-5,0-15] - CVT_STORE_S16_S8_2ROW(c_int16_4p0, c_int16_5p0, 4, 5, 0); + // c[4-5,0-15] + CVT_STORE_S16_U8_2ROW(c_int16_4p0, c_int16_5p0, 4, 5, 0); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
@@ -796,32 +836,64 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_first_k == TRUE ) ) { - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + if ( post_ops_attr.c_stor_type == S8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes); - S8_S16_BETA_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes); + S8_S16_BETA_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes); - // c[0,0-15] - S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + // c[0,0-15] + S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) - // c[1,0-15] - S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + // c[1,0-15] + S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) - // c[2,0-15] - S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2) + // c[2,0-15] + S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2) - // c[3,0-15] - S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2) + // c[3,0-15] + S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2) - // c[4,0-15] - S8_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2) + // c[4,0-15] + S8_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2) - // c[5,0-15] - S8_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2) + // c[5,0-15] + S8_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2) + } 
+ else if ( post_ops_attr.c_stor_type == U8 ) + { + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + U8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes); + U8_S16_BETA_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes); + + // c[0,0-15] + U8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2) + + // c[1,0-15] + U8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2) + + // c[2,0-15] + U8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2) + + // c[3,0-15] + U8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2) + + // c[4,0-15] + U8_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2) + + // c[5,0-15] + U8_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2) + } } else { @@ -1051,26 +1123,52 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) ) { - // Store the results in downscaled type (int8 instead of int32). - __m128i temp[2]; + if ( post_ops_attr.c_stor_type == S8 ) + { + // Store the results in downscaled type (int8 instead of int16). 
+ __m128i temp[2]; + + // c[0-1,0-15] + CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); + + // c[2-3,0-15] + CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3); - // c[0-1,0-15] - CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); + // c[4-5,0-15] + CVT_STORE_S16_S8_2ROW_NLT16(c_int16_4p0, c_int16_5p0, buf4, buf5); - // c[2-3,0-15] - CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3); + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); - // c[4-5,0-15] - CVT_STORE_S16_S8_2ROW_NLT16(c_int16_4p0, c_int16_5p0, buf4, buf5); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes); + CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + // Store the results in downscaled type (uint8 instead of int16). 
+ __m128i temp[2]; - dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t ); + // c[0-1,0-15] + CVT_STORE_S16_U8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes); - CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes); + // c[2-3,0-15] + CVT_STORE_S16_U8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3); + + // c[4-5,0-15] + CVT_STORE_S16_U8_2ROW_NLT16(c_int16_4p0, c_int16_5p0, buf4, buf5); + + dim_t n0_rem_dscale_bytes = n0_rem * sizeof( uint8_t ); + + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes); + CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes); + } } // Case where the output C matrix is s16 or is the temp buffer used to // store intermediate s16 accumulated values for downscaled (C-s8) api. 
diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h index 5f5cebbb7b..cb04b2c8c3 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h @@ -77,7 +77,7 @@ scratch1 = _mm256_loadu_si256( ( __m256i const* )buf_ ); \ S16_BETA_FMA(reg,scratch1,scratch2) \ -// Downscale beta scale macro, scratch2=beta +// Downscale beta scale macro (s8 -> s16), scratch2=beta #define S8_S16_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ scratch1 = \ _mm256_cvtepi8_epi16 \ @@ -91,19 +91,44 @@ ); \ S16_BETA_FMA(reg,scratch1,scratch2) \ -// Downscale beta n < 16 scale macro, scratch2=beta +// Downscale beta scale macro (u8 -> s16), scratch2=beta +#define U8_S16_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \ + scratch1 = \ + _mm256_cvtepu8_epi16 \ + ( \ + _mm_loadu_si128 \ + ( \ + ( __m128i const* )( ( uint8_t* )post_ops_attr.buf_downscale + \ + ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ + post_ops_attr.post_op_c_j + ( n_ind * 16 ) )\ + ) \ + ); \ + S16_BETA_FMA(reg,scratch1,scratch2) \ + +// Downscale beta n < 16 scale macro (s8 -> s16), scratch2=beta #define S8_S16_BETA_OP_NLT16(reg,buf_,scratch1,scratch2) \ scratch1 = _mm256_cvtepi8_epi16( _mm_loadu_si128( ( __m128i const* )buf_ ) ); \ S16_BETA_FMA(reg,scratch1,scratch2) \ -#define S8_S16_BETA_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) \ +// Downscale beta n < 16 scale macro (u8 -> s16), scratch2=beta +#define U8_S16_BETA_OP_NLT16(reg,buf_,scratch1,scratch2) \ + scratch1 = _mm256_cvtepu8_epi16( _mm_loadu_si128( ( __m128i const* )buf_ ) ); \ + S16_BETA_FMA(reg,scratch1,scratch2) \ + +#define US8_S16_BETA_NLT16_MEMCP_HELPER(buf_,m_ind,bytes, C_type) \ memcpy \ ( \ buf_, \ - ( ( int8_t* )post_ops_attr.buf_downscale + \ + ( ( C_type* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j ), bytes \ 
); \ + +#define S8_S16_BETA_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) \ + US8_S16_BETA_NLT16_MEMCP_HELPER(buf_,m_ind,bytes,int8_t) \ + +#define U8_S16_BETA_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) \ + US8_S16_BETA_NLT16_MEMCP_HELPER(buf_,m_ind,bytes,uint8_t) \ // Downscale macro #define CVT_MULRND_CVT16(reg, scale0, scale1, zero_point_0) \ @@ -122,33 +147,17 @@ res_1 = _mm256_mul_ps( temp_float[0], scale0 ); \ res_2 = _mm256_mul_ps( temp_float[1], scale1 ); \ \ - /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */ \ + /* Round the resultant value to the nearest float value. */ \ res_1 = \ - _mm256_min_ps \ - ( \ - _mm256_max_ps \ - ( \ _mm256_round_ps \ ( \ res_1, ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \ - ), \ - _mm256_set1_ps( ( float )S8_MIN ) \ - ), \ - _mm256_set1_ps( ( float )S8_MAX ) \ - );\ + ); \ res_2 = \ - _mm256_min_ps \ - ( \ - _mm256_max_ps \ - ( \ _mm256_round_ps \ ( \ res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) \ - ), \ - _mm256_set1_ps( ( float )S8_MIN ) \ - ), \ - _mm256_set1_ps( ( float )S8_MAX ) \ - );\ + ); \ \ /* Convert the clipped float32 scaled rounded value to int32 */ \ temp_32[0] = _mm256_cvtps_epi32( res_1 ); \ @@ -163,96 +172,152 @@ /* Zero point addition.*/ \ reg = _mm256_add_epi16( reg, _mm256_cvtepi8_epi16( zero_point_0 ) ); \ -// Downscale store macro -#define CVT_STORE_S16_S8(reg0, reg1, m_ind, n_ind) \ - /* Convert the s16 to s8 */ \ - reg0 = _mm256_packs_epi16( reg0, reg1 ); \ - reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \ +// Downscale store macro helper +#define CVT_STORE_S16_SU8_HELPER(reg, m_ind, n_ind, C_type) \ + reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ \ _mm256_storeu_si256 \ ( \ - ( __m256i* )( ( int8_t* )post_ops_attr.buf_downscale + \ + ( __m256i* )( ( C_type* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 32 ) ), \ - reg0 \ - ) \ + reg \ + ); \ -// 
Downscale store macro for fringe cases -#define CVT_STORE_S16_S8_2ROW(reg0, reg1, m_ind0, m_ind1, n_ind) \ - /* Convert the s16 to s8 */ \ +// Downscale store macro (s16 -> s8) +#define CVT_STORE_S16_S8(reg0, reg1, m_ind, n_ind) \ + /* Convert the s16 to s8 */ \ reg0 = _mm256_packs_epi16( reg0, reg1 ); \ - reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \ + CVT_STORE_S16_SU8_HELPER(reg0, m_ind, n_ind, int8_t) \ + +// Downscale store macro (s16 -> u8) +#define CVT_STORE_S16_U8(reg0, reg1, m_ind, n_ind) \ + /* Convert the s16 to u8 */ \ + reg0 = _mm256_packus_epi16( reg0, reg1 ); \ + CVT_STORE_S16_SU8_HELPER(reg0, m_ind, n_ind, uint8_t) \ + +// Downscale store helper macro for fringe cases +#define CVT_STORE_S16_US8_2ROW_HELPER(reg, m_ind0, m_ind1, n_ind, C_type) \ + reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ \ /* Extract the first 128 bits of the register*/ \ - temp[0] = _mm256_extractf128_si256( reg0, 0 ); \ + temp[0] = _mm256_extractf128_si256( reg, 0 ); \ /* Extract the second 128 bits of the register*/ \ - temp[1] = _mm256_extractf128_si256( reg0, 1 ); \ + temp[1] = _mm256_extractf128_si256( reg, 1 ); \ \ _mm_storeu_si128 \ ( \ - ( __m128i* )( ( int8_t* )post_ops_attr.buf_downscale + \ + ( __m128i* )( ( C_type* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind0 ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ) ), \ temp[0] \ ); \ _mm_storeu_si128 \ ( \ - ( __m128i* )( ( int8_t* )post_ops_attr.buf_downscale + \ + ( __m128i* )( ( C_type* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind1 ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ) ), \ temp[1] \ ); \ -// Downscale store macro for fringe cases -#define CVT_STORE_S16_S8_1ROW(reg0, reg1, m_ind0, n_ind) \ +// Downscale store macro for fringe cases (s16 -> s8) +#define CVT_STORE_S16_S8_2ROW(reg0, reg1, m_ind0, m_ind1, n_ind) \ /* Convert the s16 to s8 */ \ reg0 = _mm256_packs_epi16( reg0, reg1 ); \ - reg0 
= _mm256_permute4x64_epi64( reg0, 0XD8 ); \ + CVT_STORE_S16_US8_2ROW_HELPER(reg0, m_ind0, m_ind1, n_ind, int8_t) \ + +// Downscale store macro for fringe cases (s16 -> u8) +#define CVT_STORE_S16_U8_2ROW(reg0, reg1, m_ind0, m_ind1, n_ind) \ + /* Convert the s16 to u8 */ \ + reg0 = _mm256_packus_epi16( reg0, reg1 ); \ + CVT_STORE_S16_US8_2ROW_HELPER(reg0, m_ind0, m_ind1, n_ind, uint8_t) \ + +// Downscale store helper macro for fringe cases +#define CVT_STORE_S16_US8_1ROW(reg, m_ind0, n_ind, C_type) \ + reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ \ /* Extract the first 128 bits of the register*/ \ - temp[0] = _mm256_extractf128_si256( reg0, 0 ); \ + temp[0] = _mm256_extractf128_si256( reg, 0 ); \ \ _mm_storeu_si128 \ ( \ - ( __m128i* )( ( int8_t* )post_ops_attr.buf_downscale + \ + ( __m128i* )( ( C_type* )post_ops_attr.buf_downscale + \ ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind0 ) ) + \ post_ops_attr.post_op_c_j + ( n_ind * 16 ) ), \ temp[0] \ ); \ -// Downscale store macro for n < 16 fringe cases -#define CVT_STORE_S16_S8_2ROW_NLT16(reg0, reg1, buf0, buf1) \ +// Downscale store (s16 -> s8) macro for fringe cases +#define CVT_STORE_S16_S8_1ROW(reg0, reg1, m_ind0, n_ind) \ /* Convert the s16 to s8 */ \ reg0 = _mm256_packs_epi16( reg0, reg1 ); \ - reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \ + CVT_STORE_S16_US8_1ROW(reg0, m_ind0, n_ind, int8_t) \ + +// Downscale store (s16 -> u8) macro for fringe cases +#define CVT_STORE_S16_U8_1ROW(reg0, reg1, m_ind0, n_ind) \ + /* Convert the s16 to u8 */ \ + reg0 = _mm256_packus_epi16( reg0, reg1 ); \ + CVT_STORE_S16_US8_1ROW(reg0, m_ind0, n_ind, uint8_t) \ + +// Downscale store helper macro for n < 16 fringe cases +#define CVT_STORE_S16_US8_2ROW_NLT16(reg, buf0, buf1) \ + reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ \ /* Extract the first 128 bits of the register*/ \ - temp[0] = _mm256_extractf128_si256( reg0, 0 ); \ + temp[0] = _mm256_extractf128_si256( reg, 0 ); \ /* Extract the second 128 bits 
of the register*/ \ - temp[1] = _mm256_extractf128_si256( reg0, 1 ); \ + temp[1] = _mm256_extractf128_si256( reg, 1 ); \ \ _mm_storeu_si128( ( __m128i* )buf0, temp[0] ); \ _mm_storeu_si128( ( __m128i* )buf1, temp[1] ); \ -// Downscale store macro for n < 16 fringe cases -#define CVT_STORE_S16_S8_1ROW_NLT16(reg0, reg1, buf0) \ +// Downscale store (int16 -> s8) macro for n < 16 fringe cases +#define CVT_STORE_S16_S8_2ROW_NLT16(reg0, reg1, buf0, buf1) \ /* Convert the s16 to s8 */ \ reg0 = _mm256_packs_epi16( reg0, reg1 ); \ - reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \ + CVT_STORE_S16_US8_2ROW_NLT16(reg0, buf0, buf1) \ + +// Downscale store (int16 -> u8) macro for n < 16 fringe cases +#define CVT_STORE_S16_U8_2ROW_NLT16(reg0, reg1, buf0, buf1) \ + /* Convert the s16 to u8 */ \ + reg0 = _mm256_packus_epi16( reg0, reg1 ); \ + CVT_STORE_S16_US8_2ROW_NLT16(reg0, buf0, buf1) \ + +// Downscale store helper macro for n < 16 fringe cases +#define CVT_STORE_S16_US8_1ROW_NLT16(reg, buf0) \ + reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ \ /* Extract the first 128 bits of the register*/ \ - temp[0] = _mm256_extractf128_si256( reg0, 0 ); \ + temp[0] = _mm256_extractf128_si256( reg, 0 ); \ \ _mm_storeu_si128( ( __m128i* )buf0, temp[0] ); \ -#define CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) \ +// Downscale store (s16 -> s8) macro for n < 16 fringe cases +#define CVT_STORE_S16_S8_1ROW_NLT16(reg0, reg1, buf0) \ + /* Convert the s16 to s8 */ \ + reg0 = _mm256_packs_epi16( reg0, reg1 ); \ + CVT_STORE_S16_US8_1ROW_NLT16(reg0, buf0) \ + +// Downscale store (s16 -> u8) macro for n < 16 fringe cases +#define CVT_STORE_S16_U8_1ROW_NLT16(reg0, reg1, buf0) \ + /* Convert the s16 to u8 */ \ + reg0 = _mm256_packus_epi16( reg0, reg1 ); \ + CVT_STORE_S16_US8_1ROW_NLT16(reg0, buf0) \ + +#define CVT_STORE_S16_US8_NLT16_MEMCP_HELPER(buf_,m_ind,bytes, C_type) \ memcpy \ ( \ - ( ( int8_t* )post_ops_attr.buf_downscale + \ + ( ( C_type* )post_ops_attr.buf_downscale + \ (
post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \ post_ops_attr.post_op_c_j ), buf_, bytes \ ); \ +#define CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) \ + CVT_STORE_S16_US8_NLT16_MEMCP_HELPER(buf_,m_ind,bytes, int8_t) \ + +#define CVT_STORE_S16_U8_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) \ + CVT_STORE_S16_US8_NLT16_MEMCP_HELPER(buf_,m_ind,bytes, uint8_t) \ + //-------------------------------------------------------------------------- /* GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ) */ #define GELU_TANH_S16_AVX2(reg, y1, y2, r, r2, x, z, dn, x_tanh, q) \ From eb5ab3f76285865f09458c3b7b93de88eac07055 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Fri, 22 Sep 2023 04:25:59 +0530 Subject: [PATCH 157/226] LPGEMM: Added transB support for bf16bf16f32o APIs Details: - Modified aocl_get_reorder_buf_size_ and aocl_reorder_ APIs to allow reordering from column major input matrix. - Added new pack kernels that packs/reorders B matrix from column-major input format. - Updated Early-return check conditions to account for trans parameters. - Updated bench file to test/benchmark transpose support. AMD-Internal: [CPUPL-2268] Change-Id: Ida66d7e3033c52cca0229c6b78d16976fbbecc4c --- addon/aocl_gemm/aocl_gemm_bf16_utils.c | 54 +- addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c | 13 +- .../aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 2 +- .../frame/bf16bf16f32/lpgemm_reorder_bf16.c | 11 +- .../kernels/bf16bf16f32/lpgemm_pack_bf16.h | 9 +- bench/bench_aocl_gemm/bench_lpgemm.c | 14 +- .../lpgemm_packb_bf16_amd512vnni.c | 697 ++++++++++++++++-- 7 files changed, 686 insertions(+), 114 deletions(-) diff --git a/addon/aocl_gemm/aocl_gemm_bf16_utils.c b/addon/aocl_gemm/aocl_gemm_bf16_utils.c index cad3a07eaa..020065a364 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16_utils.c +++ b/addon/aocl_gemm/aocl_gemm_bf16_utils.c @@ -68,19 +68,6 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32) return 0; // A reorder not supported. 
} - // Reorder of a col-major matrix is not supported yet. - if (!( (order == 'r') || ( order == 'R' ))) - { - printf("returning with order:%c\n", order); - return 0; - } - - // Reorder of matrix is only supported for non-trans matrices. - if(!( ( trans == 'n' ) || ( trans == 'N' ) )) - { - printf("returning with trans:%c\n", trans); - return 0; - } // Extra space since packing does width in multiples of 16. The bf16 // instruction can be used as long as at least one zmm register can be fully // loaded; and since k_dim needs to be at least 2, having n_dim at least 16 @@ -98,12 +85,34 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32) AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32) { + trans_t blis_trans; + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_trans( trans, &blis_trans ); + if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) || - ( k <= 0 ) || ( n <= 0 ) || ( ldb < n ) ) + ( k <= 0 ) || ( n <= 0 ) || ( bli_is_notrans( blis_trans ) && ( ldb < n ) ) || + ( bli_is_trans( blis_trans ) && ( ldb < k ) ) ) { return; // Error. } + inc_t rs_b, cs_b; + if( ( order == 'r') || ( order == 'R' ) ) + { + rs_b = bli_is_notrans( blis_trans ) ? ldb : 1; + cs_b = bli_is_notrans( blis_trans ) ? 1 : ldb; + } + else if ( ( order == 'c' ) || ( order == 'C' ) ) + { + rs_b = bli_is_notrans( blis_trans ) ? 1 : ldb; + cs_b = bli_is_notrans( blis_trans ) ? ldb : 1; + } + else + { + return; // Error + } + // Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it. if ( bli_cpuid_is_avx512bf16_supported() == FALSE ) { @@ -125,20 +134,6 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32) { return; // A reorder not supported. } - - // Reorder of a col-major matrix is not supported yet. - if (!( (order == 'r') || ( order == 'R' ))) - { - printf("returning with order:%c\n", order); - return; - } - - // Reorder of matrix is only supported for non-trans matrices. 
- if (!( ( trans == 'n' ) || ( trans == 'N' ) )) - { - printf("Returning with trans:%c\n", trans); - return; - } // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. @@ -155,7 +150,8 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32) // Create dummy original b obj; lpgemm_obj_t b; b.storage.aligned_buffer = ( void* )input_buf_addr; - b.rs = ldb; + b.rs = rs_b; + b.cs = cs_b; b.width = n; b.length = k; diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c index bbb53d4d92..475a39c4a0 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c @@ -92,17 +92,8 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - // Transpose is not supported for B matrix yet. - if ( ( is_row_major == TRUE ) && ( bli_is_trans( blis_transb ) ) ) - { - return; // Error. - } - else if ( ( is_column_major == TRUE ) && ( bli_is_trans( blis_transa ) ) ) - { - return; // Error. - } - // Check if strides are valid for Row major inputs. + // Check if strides are valid for Row major inputs. if ( ( is_row_major == TRUE ) && ( ( bli_is_notrans( blis_transa ) && ( lda < k ) ) || ( bli_is_trans( blis_transa ) && ( lda < m ) ) || @@ -112,7 +103,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) { return; // Error. } - // Column major input expected with leading dimensions >= column stride. + // Check if strides are valid for Column major inputs.
else if ( ( is_column_major == TRUE ) && ( ( bli_is_notrans( blis_transa ) && ( lda < m ) ) || ( bli_is_trans( blis_transa ) && ( lda < k ) ) || diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index da2427af4c..10855970d3 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -265,7 +265,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) ( pack_b_buffer_bf16 + ( jc_packb_start * kc0_updated ), ( b + ( rs_b * pc ) + ( cs_b * jc ) + - ( cs_b * jc_packb_start ) ), rs_b, + ( cs_b * jc_packb_start ) ), rs_b, cs_b, ( jc_packb_end - jc_packb_start ), kc0, &rs_b_use, &cs_b_use ); diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c index 91a14b8918..8774f2ea95 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c @@ -53,6 +53,7 @@ void reorderb_nr64_bf16bf16f32of32 // Extracting the matrix properties from the lpgemm object dim_t rs_b = b->rs; + dim_t cs_b = b->cs; dim_t n = b->width; dim_t k = b->length; @@ -150,12 +151,12 @@ void reorderb_nr64_bf16bf16f32of32 // + ( NC' * kc0_updated) ( ( packb_bf16 )lcntx->packb_fun_ptr ) ( - ( ( ( bfloat16* )b_reorder->storage.aligned_buffer ) + - ( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) + - ( jc_cur_loop_rem * kc0_updated ) ), + ( ( bfloat16* )b_reorder->storage.aligned_buffer ) + + ( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) + + ( jc_cur_loop_rem * kc0_updated ), ( ( ( bfloat16* )b->storage.aligned_buffer ) + - ( rs_b * pc ) + jc ), - rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder + ( rs_b * pc ) + (jc * cs_b)), + rs_b, cs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder ); } diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h index 44c857c6ad..cec9195f61 100644 --- 
a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h +++ b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h @@ -54,6 +54,7 @@ typedef void (*packb_bf16) const dim_t, const dim_t, const dim_t, + const dim_t, dim_t*, dim_t* ); @@ -62,13 +63,15 @@ void packb_nr64_bf16bf16f32of32 ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, - const dim_t ldb, + const dim_t rs_b, + const dim_t cs_b, const dim_t NC, const dim_t KC, - dim_t* rs_b, - dim_t* cs_b + dim_t* rs_p, + dim_t* cs_p ); + void packa_mr16_bf16bf16f32of32 ( bfloat16* pack_a_buffer, diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 6f43dac4bf..23acb828ea 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -1012,11 +1012,15 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ free ( post_ops->eltwise ); \ free ( post_ops->bias.bias ); \ free( post_ops->seq_vector ); \ - free( post_ops ); \ if ( post_ops->sum.zero_point != NULL ) \ - { free( post_ops->sum.zero_point ); } \ + { \ + free( post_ops->sum.zero_point ); \ + } \ if ( post_ops->sum.scale_factor != NULL ) \ - { free( post_ops->sum.scale_factor ); } \ + { \ + free( post_ops->sum.scale_factor ); \ + } \ + free( post_ops ); \ return NULL; \ } \ /* Fill scale factor and zero points.*/ \ @@ -1167,7 +1171,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ } \ } \ \ - if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) || ( op_b == 'n' ) || ( op_b == 'N' ) ) \ { \ /* No reordering of B.*/ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ @@ -1342,7 +1346,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ } \ } \ \ - if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) || ( op_b == 'n' ) || ( op_b == 'N' ) ) \ { \ /* No reordering of B.*/ \ GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c 
b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c index fe39c8c038..701d61a6ba 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c @@ -38,50 +38,116 @@ #ifdef BLIS_ADDON_LPGEMM -void packb_nrlt16_bf16bf16f32of32 + +void packb_nr64_bf16bf16f32of32_row_major + ( + bfloat16* pack_b_buffer_bf16bf16f32of32, + const bfloat16* b, + const dim_t ldb, + const dim_t NC, + const dim_t KC, + dim_t* rs_b, + dim_t* cs_b + ); + +void packb_nr64_bf16bf16f32of32_col_major + ( + bfloat16* pack_b_buffer_bf16bf16f32of32, + const bfloat16* b, + const dim_t ldb, + const dim_t NC, + const dim_t KC, + dim_t* rs_b, + dim_t* cs_b + ); + +void packb_nrlt16_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC, + const dim_t KC, const dim_t n0_partial_rem ); -void packb_nr16_bf16bf16f32of32 +void packb_nr16_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC + const dim_t KC ); -void packb_nr32_bf16bf16f32of32 +void packb_nr32_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC + const dim_t KC ); -void packb_nr48_bf16bf16f32of32 +void packb_nr48_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC + const dim_t KC ); + +void packb_nrlt16_bf16bf16f32of32_col_major + ( + bfloat16* pack_b_buffer_bf16bf16f32of32, + const bfloat16* b, + const dim_t ldb, + const dim_t KC, + const dim_t n0_partial_rem + ); + +void packb_nr_mult_16_bf16bf16f32of32_col_major + ( + bfloat16* pack_b_buffer, + const bfloat16* b, + const dim_t NR, + const dim_t ldb, + const dim_t KC + ); + + void packb_nr64_bf16bf16f32of32 + ( + bfloat16* pack_b_buffer_bf16bf16f32of32, + const bfloat16* b, + const dim_t rs_b, + const dim_t cs_b, + const dim_t 
NC, + const dim_t KC, + dim_t* rs_p, + dim_t* cs_p + ) +{ + if( cs_b == 1 ) + { + packb_nr64_bf16bf16f32of32_row_major( pack_b_buffer_bf16bf16f32of32, + b, rs_b, NC, KC, rs_p, cs_p ); + } + else + { + packb_nr64_bf16bf16f32of32_col_major( pack_b_buffer_bf16bf16f32of32, + b, cs_b, NC, KC, rs_p, cs_p ); + } +} +void packb_nr64_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t NC, - const dim_t KC, + const dim_t NC, + const dim_t KC, dim_t* rs_b, dim_t* cs_b ) -{ +{ dim_t NR = 64; // Used for permuting the mm512i elements for use in dpbf16_ps instruction. @@ -111,7 +177,7 @@ void packb_nr64_bf16bf16f32of32 } for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR ) - { + { for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) { // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. @@ -131,12 +197,12 @@ void packb_nr64_bf16bf16f32of32 a0 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); c0 = _mm512_permutex2var_epi64( c01, selector1_1, c0 ); - //store to pack_b buffer + //store to pack_b buffer _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ), b0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) + 32, a0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ), d0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) + 32, c0 ); - } + } // Handle k remainder. 
if( k_partial_pieces > 0) { @@ -156,12 +222,12 @@ void packb_nr64_bf16bf16f32of32 a0 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); c0 = _mm512_permutex2var_epi64( c01, selector1_1, c0 ); - //store to pack_b buffer + //store to pack_b buffer _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ), b0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) + 32, a0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ), d0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) + 32, c0 ); - } + } } if(n_partial_pieces > 0) @@ -178,64 +244,64 @@ void packb_nr64_bf16bf16f32of32 if ( n0_48 == 1 ) { - packb_nr48_bf16bf16f32of32 + packb_nr48_bf16bf16f32of32_row_major ( ( pack_b_buffer_bf16bf16f32of32 + ( n_full_pieces_loop_limit * KC_updated ) ), ( b + n_full_pieces_loop_limit ), ldb, KC ); n0_partial_pack = 48; - } + } else if ( n0_32 == 1 ) { - packb_nr32_bf16bf16f32of32 - ( + packb_nr32_bf16bf16f32of32_row_major + ( ( pack_b_buffer_bf16bf16f32of32 + ( n_full_pieces_loop_limit * KC_updated ) ), ( b + n_full_pieces_loop_limit ), ldb, KC ); n0_partial_pack = 32; - } + } else if ( n0_16 == 1 ) { - packb_nr16_bf16bf16f32of32 + packb_nr16_bf16bf16f32of32_row_major ( ( pack_b_buffer_bf16bf16f32of32 + ( n_full_pieces_loop_limit * KC_updated ) ), ( b + n_full_pieces_loop_limit ), ldb, KC ); n0_partial_pack = 16; - } + } if ( n0_partial_rem > 0 ) { - packb_nrlt16_bf16bf16f32of32 + packb_nrlt16_bf16bf16f32of32_row_major ( ( pack_b_buffer_bf16bf16f32of32 + ( n_full_pieces_loop_limit * KC_updated ) + ( n0_partial_pack * KC_updated ) ), ( b + n_full_pieces_loop_limit + n0_partial_pack ), ldb, KC, n0_partial_rem ); - } - } + } + } *rs_b = NR * 2; *cs_b = NR / 2; } -void packb_nr48_bf16bf16f32of32 +void packb_nr48_bf16bf16f32of32_row_major ( bfloat16* 
pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC + const dim_t KC ) -{ +{ dim_t NR1 = 32; dim_t NR2 = 16; // Used for permuting the mm512i elements for use in dpbf16_ps instruction. __m512i selector1 = _mm512_setr_epi64(0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB); - __m512i selector1_1 = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF ); + __m512i selector1_1 = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF ); __m512i a0x; __m512i b0x; @@ -256,21 +322,21 @@ void packb_nr48_bf16bf16f32of32 for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) { // Rearrange for dpbf16_ps, read 2 rows from B with 32 elements in each row. - a0x = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) ); + a0x = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) ); c0x = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) ); a01x = _mm512_unpacklo_epi16( a0x, c0x ); a0x = _mm512_unpackhi_epi16( a0x, c0x ); - b0x = _mm512_permutex2var_epi64( a01x, selector1, a0x ); + b0x = _mm512_permutex2var_epi64( a01x, selector1, a0x ); a0x = _mm512_permutex2var_epi64( a01x, selector1_1, a0x ); //First 2x32 elements - _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); - _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); // Rearrange for dpbf16_ps, read 2 rows from B with next 16 elements in each row. 
- a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + NR1 ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + NR1 ); c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + NR1 ); a01 = _mm256_unpacklo_epi16( a0, c0 ); @@ -279,7 +345,7 @@ void packb_nr48_bf16bf16f32of32 b0 = _mm256_permute2f128_si256(a01, a0, 0x20); a0 = _mm256_permute2f128_si256(a01, a0, 0x31); - //Last 2x16 elements + //Last 2x16 elements _mm256_mask_storeu_epi64 ( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), @@ -296,20 +362,20 @@ void packb_nr48_bf16bf16f32of32 // Handle k remainder. if ( k_partial_pieces > 0 ) { - a0x = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) ); + a0x = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) ); c0x = _mm512_setzero_si512(); a01x = _mm512_unpacklo_epi16( a0x, c0x ); a0x = _mm512_unpackhi_epi16( a0x, c0x ); - b0x = _mm512_permutex2var_epi64( a01x, selector1, a0x ); + b0x = _mm512_permutex2var_epi64( a01x, selector1, a0x ); a0x = _mm512_permutex2var_epi64( a01x, selector1_1, a0x ); //First 2x32 elements - _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); - _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); + _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x ); - a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + NR1 ); + a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + NR1 ); c0 = _mm256_setzero_si256(); a01 = _mm256_unpacklo_epi16( a0, c0 ); @@ -318,7 +384,7 @@ void packb_nr48_bf16bf16f32of32 b0 = _mm256_permute2f128_si256(a01, a0, 0x20); a0 = _mm256_permute2f128_si256(a01, a0, 0x31); - //Last 2x16 elements + //Last 2x16 elements _mm256_mask_storeu_epi64 ( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), @@ -332,12 +398,12 @@ void 
packb_nr48_bf16bf16f32of32 } } -void packb_nr32_bf16bf16f32of32 +void packb_nr32_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC + const dim_t KC ) { dim_t NR = 32; @@ -373,7 +439,7 @@ void packb_nr32_bf16bf16f32of32 _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); kr_new += 2; - } + } // Handle k remainder. if ( k_partial_pieces > 0 ) { @@ -389,14 +455,14 @@ void packb_nr32_bf16bf16f32of32 _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 ); _mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 ); } -} +} -void packb_nr16_bf16bf16f32of32 +void packb_nr16_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC + const dim_t KC ) { dim_t NR = 16; @@ -413,12 +479,12 @@ void packb_nr16_bf16bf16f32of32 dim_t kr_new = 0; for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 ) - { + { // Rearrange for dpbf16_ps, read 2 rows from B with 16 elements in each row. 
a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) ) ); - c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) ) ); + c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) ) ); - a01 = _mm256_unpacklo_epi16( a0, c0 ); + a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); b0 = _mm256_permute2f128_si256(a01, a0, 0x20); @@ -443,7 +509,7 @@ void packb_nr16_bf16bf16f32of32 a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) ); c0 = _mm256_setzero_si256(); - a01 = _mm256_unpacklo_epi16( a0, c0 ); + a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); b0 = _mm256_permute2f128_si256(a01, a0, 0x20); @@ -459,15 +525,15 @@ void packb_nr16_bf16bf16f32of32 pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), 0xFF, a0 ); - } -} + } +} -void packb_nrlt16_bf16bf16f32of32 +void packb_nrlt16_bf16bf16f32of32_row_major ( bfloat16* pack_b_buffer_bf16bf16f32of32, const bfloat16* b, const dim_t ldb, - const dim_t KC, + const dim_t KC, const dim_t n0_partial_rem ) { @@ -488,14 +554,14 @@ void packb_nrlt16_bf16bf16f32of32 bfloat16 buf1[16]; for ( int kr = 0; kr < k_full_pieces; kr += 2 ) - { + { memcpy( buf0, ( b + ( ldb * ( kr + 0 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) ); memcpy( buf1, ( b + ( ldb * ( kr + 1 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) ); // Rearrange for dpbf16_ps, read 2 rows from B with next 16 elements in each row. 
a0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf0 ); c0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf1 ); - a01 = _mm256_unpacklo_epi16( a0, c0 ); + a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); b0 = _mm256_permute2f128_si256(a01, a0, 0x20); @@ -521,7 +587,7 @@ void packb_nrlt16_bf16bf16f32of32 a0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf0 ); c0 = _mm256_setzero_si256(); - a01 = _mm256_unpacklo_epi16( a0, c0 ); + a01 = _mm256_unpacklo_epi16( a0, c0 ); a0 = _mm256_unpackhi_epi16( a0, c0 ); b0 = _mm256_permute2f128_si256(a01, a0, 0x20); @@ -537,6 +603,517 @@ void packb_nrlt16_bf16bf16f32of32 pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), 0xFF, a0 ); - } + } +} + +#define LOAD_16_COLS_AVX512 \ + a_reg[0] = _mm512_loadu_si512(b + ( ldb * ( jr + 0 ) ) + kr); \ + a_reg[1] = _mm512_loadu_si512(b + ( ldb * ( jr + 1 ) ) + kr); \ + a_reg[2] = _mm512_loadu_si512(b + ( ldb * ( jr + 2 ) ) + kr); \ + a_reg[3] = _mm512_loadu_si512(b + ( ldb * ( jr + 3 ) ) + kr); \ + a_reg[4] = _mm512_loadu_si512(b + ( ldb * ( jr + 4 ) ) + kr); \ + a_reg[5] = _mm512_loadu_si512(b + ( ldb * ( jr + 5 ) ) + kr); \ + a_reg[6] = _mm512_loadu_si512(b + ( ldb * ( jr + 6 ) ) + kr); \ + a_reg[7] = _mm512_loadu_si512(b + ( ldb * ( jr + 7 ) ) + kr); \ + a_reg[8] = _mm512_loadu_si512(b + ( ldb * ( jr + 8 ) ) + kr); \ + a_reg[9] = _mm512_loadu_si512(b + ( ldb * ( jr + 9 ) ) + kr); \ + a_reg[10] = _mm512_loadu_si512(b + ( ldb * ( jr + 10 ) ) + kr); \ + a_reg[11] = _mm512_loadu_si512(b + ( ldb * ( jr + 11 ) ) + kr); \ + a_reg[12] = _mm512_loadu_si512(b + ( ldb * ( jr + 12 ) ) + kr); \ + a_reg[13] = _mm512_loadu_si512(b + ( ldb * ( jr + 13 ) ) + kr); \ + a_reg[14] = _mm512_loadu_si512(b + ( ldb * ( jr + 14 ) ) + kr); \ + a_reg[15] = _mm512_loadu_si512(b + ( ldb * ( jr + 15 ) ) + kr); + +#define UNPACKHILO32_AVX512 \ + b_reg[0] = _mm512_unpacklo_epi32(a_reg[0], a_reg[1]); \ + b_reg[2] = _mm512_unpacklo_epi32(a_reg[2], a_reg[3]); \ + b_reg[4] = _mm512_unpacklo_epi32(a_reg[4], a_reg[5]); \ 
+ b_reg[6] = _mm512_unpacklo_epi32(a_reg[6], a_reg[7]); \ + b_reg[8] = _mm512_unpacklo_epi32(a_reg[8], a_reg[9]); \ + b_reg[10] = _mm512_unpacklo_epi32(a_reg[10], a_reg[11]); \ + b_reg[12] = _mm512_unpacklo_epi32(a_reg[12], a_reg[13]); \ + b_reg[14] = _mm512_unpacklo_epi32(a_reg[14], a_reg[15]); \ +\ + b_reg[1] = _mm512_unpackhi_epi32(a_reg[0], a_reg[1]); \ + b_reg[3] = _mm512_unpackhi_epi32(a_reg[2], a_reg[3]); \ + b_reg[5] = _mm512_unpackhi_epi32(a_reg[4], a_reg[5]); \ + b_reg[7] = _mm512_unpackhi_epi32(a_reg[6], a_reg[7]); \ + b_reg[9] = _mm512_unpackhi_epi32(a_reg[8], a_reg[9]); \ + b_reg[11] = _mm512_unpackhi_epi32(a_reg[10], a_reg[11]); \ + b_reg[13] = _mm512_unpackhi_epi32(a_reg[12], a_reg[13]); \ + b_reg[15] = _mm512_unpackhi_epi32(a_reg[14], a_reg[15]); + +#define UNPACKHILO64_AVX512 \ + a_reg[0] = _mm512_unpacklo_epi64(b_reg[0], b_reg[2]); \ + a_reg[1] = _mm512_unpacklo_epi64(b_reg[4], b_reg[6]); \ + a_reg[2] = _mm512_unpacklo_epi64(b_reg[8], b_reg[10]); \ + a_reg[3] = _mm512_unpacklo_epi64(b_reg[12], b_reg[14]); \ + a_reg[4] = _mm512_unpacklo_epi64(b_reg[1], b_reg[3]); \ + a_reg[5] = _mm512_unpacklo_epi64(b_reg[5], b_reg[7]); \ + a_reg[6] = _mm512_unpacklo_epi64(b_reg[9], b_reg[11]); \ + a_reg[7] = _mm512_unpacklo_epi64(b_reg[13], b_reg[15]); \ +\ + a_reg[8] = _mm512_unpackhi_epi64(b_reg[0], b_reg[2]); \ + a_reg[9] = _mm512_unpackhi_epi64(b_reg[4], b_reg[6]); \ + a_reg[10] = _mm512_unpackhi_epi64(b_reg[8], b_reg[10]); \ + a_reg[11] = _mm512_unpackhi_epi64(b_reg[12], b_reg[14]); \ + a_reg[12] = _mm512_unpackhi_epi64(b_reg[1], b_reg[3]); \ + a_reg[13] = _mm512_unpackhi_epi64(b_reg[5], b_reg[7]); \ + a_reg[14] = _mm512_unpackhi_epi64(b_reg[9], b_reg[11]); \ + a_reg[15] = _mm512_unpackhi_epi64(b_reg[13], b_reg[15]); + +#define PERMUTEX2_VAR64_AVX512 \ + b_reg[0] = _mm512_permutex2var_epi64(a_reg[0], selector1, a_reg[1]); \ + b_reg[1] = _mm512_permutex2var_epi64(a_reg[2], selector1, a_reg[3]); \ + b_reg[2] = _mm512_permutex2var_epi64(a_reg[8], selector1, 
a_reg[9]); \ + b_reg[3] = _mm512_permutex2var_epi64(a_reg[10], selector1, a_reg[11]); \ + b_reg[4] = _mm512_permutex2var_epi64(a_reg[4], selector1, a_reg[5]); \ + b_reg[5] = _mm512_permutex2var_epi64(a_reg[6], selector1, a_reg[7]); \ + b_reg[6] = _mm512_permutex2var_epi64(a_reg[12], selector1, a_reg[13]); \ + b_reg[7] = _mm512_permutex2var_epi64(a_reg[14], selector1, a_reg[15]); \ + b_reg[8] = _mm512_permutex2var_epi64(a_reg[0], selector2, a_reg[1]); \ + b_reg[9] = _mm512_permutex2var_epi64(a_reg[2], selector2, a_reg[3]); \ + b_reg[10] = _mm512_permutex2var_epi64(a_reg[8], selector2, a_reg[9]); \ + b_reg[11] = _mm512_permutex2var_epi64(a_reg[10], selector2, a_reg[11]); \ + b_reg[12] = _mm512_permutex2var_epi64(a_reg[4], selector2, a_reg[5]); \ + b_reg[13] = _mm512_permutex2var_epi64(a_reg[6], selector2, a_reg[7]); \ + b_reg[14] = _mm512_permutex2var_epi64(a_reg[12], selector2, a_reg[13]); \ + b_reg[15] = _mm512_permutex2var_epi64(a_reg[14], selector2, a_reg[15]); + +#define SHUFFLE64x2_AVX512 \ + a_reg[0] = _mm512_shuffle_i64x2(b_reg[0], b_reg[1], 0x44); \ + a_reg[1] = _mm512_shuffle_i64x2(b_reg[2], b_reg[3], 0x44); \ + a_reg[2] = _mm512_shuffle_i64x2(b_reg[4], b_reg[5], 0x44); \ + a_reg[3] = _mm512_shuffle_i64x2(b_reg[6], b_reg[7], 0x44); \ + a_reg[4] = _mm512_shuffle_i64x2(b_reg[8], b_reg[9], 0x44); \ + a_reg[5] = _mm512_shuffle_i64x2(b_reg[10], b_reg[11], 0x44); \ + a_reg[6] = _mm512_shuffle_i64x2(b_reg[12], b_reg[13], 0x44); \ + a_reg[7] = _mm512_shuffle_i64x2(b_reg[14], b_reg[15], 0x44); \ + a_reg[8] = _mm512_shuffle_i64x2(b_reg[0], b_reg[1], 0xEE); \ + a_reg[9] = _mm512_shuffle_i64x2(b_reg[2], b_reg[3], 0xEE); \ + a_reg[10] = _mm512_shuffle_i64x2(b_reg[4], b_reg[5], 0xEE); \ + a_reg[11] = _mm512_shuffle_i64x2(b_reg[6], b_reg[7], 0xEE); \ + a_reg[12] = _mm512_shuffle_i64x2(b_reg[8], b_reg[9], 0xEE); \ + a_reg[13] = _mm512_shuffle_i64x2(b_reg[10], b_reg[11], 0xEE); \ + a_reg[14] = _mm512_shuffle_i64x2(b_reg[12], b_reg[13], 0xEE); \ + a_reg[15] = 
_mm512_shuffle_i64x2(b_reg[14], b_reg[15], 0xEE); + +#define MASK_LOAD_16_COLS_AVX512(mask) \ + a_reg[0] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 0 ) ) + kr); \ + a_reg[1] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 1 ) ) + kr); \ + a_reg[2] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 2 ) ) + kr); \ + a_reg[3] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 3 ) ) + kr); \ + a_reg[4] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 4 ) ) + kr); \ + a_reg[5] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 5 ) ) + kr); \ + a_reg[6] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 6 ) ) + kr); \ + a_reg[7] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 7 ) ) + kr); \ + a_reg[8] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 8 ) ) + kr); \ + a_reg[9] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 9 ) ) + kr); \ + a_reg[10] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 10 ) ) + kr); \ + a_reg[11] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 11 ) ) + kr); \ + a_reg[12] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 12 ) ) + kr); \ + a_reg[13] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 13 ) ) + kr); \ + a_reg[14] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 14 ) ) + kr); \ + a_reg[15] = _mm512_maskz_loadu_epi16( mask, b + ( ldb * ( jr + 15 ) ) + kr); + +void packb_nr64_bf16bf16f32of32_col_major + ( + bfloat16* pack_b_buffer, + const bfloat16* b, + const dim_t ldb, + const dim_t NC, + const dim_t KC, + dim_t* rs_b, + dim_t* cs_b + ) +{ + dim_t NR = 64; + + dim_t n_full_pieces = NC / NR; + dim_t n_full_pieces_loop_limit = n_full_pieces * NR; + dim_t n_partial_pieces = NC % NR; + + dim_t k_partial_pieces = KC % 2; + + dim_t KC_updated = KC; + if ( k_partial_pieces > 0 ) + { + KC_updated += ( 2 - k_partial_pieces ); + } + + for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR ) + { + packb_nr_mult_16_bf16bf16f32of32_col_major + ( pack_b_buffer + (jc * KC_updated), + 
b + (jc * ldb), 64, ldb, KC + ); + } + + if(n_partial_pieces > 0) + { + dim_t n0_partial_rem = n_partial_pieces % 16; + dim_t n0_partial_pack = 0; + + // Split into multiple smaller fringe kernels, so as to maximize + // vectorization after packing. Any n0 < NR(64) can be expressed + // as n0 = 48 + n` / n0 = 32 + n` / n0 = 16 + n`, where n` < 16. + dim_t n0_48 = n_partial_pieces / 48; + dim_t n0_32 = n_partial_pieces / 32; + dim_t n0_16 = n_partial_pieces / 16; + + if ( n0_48 == 1 ) + { + packb_nr_mult_16_bf16bf16f32of32_col_major + ( + ( pack_b_buffer + ( n_full_pieces_loop_limit * KC_updated ) ), + ( b + n_full_pieces_loop_limit * ldb ), 48, ldb, KC + ); + + n0_partial_pack = 48; + } + else if ( n0_32 == 1 ) + { + packb_nr_mult_16_bf16bf16f32of32_col_major + ( + ( pack_b_buffer + ( n_full_pieces_loop_limit * KC_updated ) ), + ( b + n_full_pieces_loop_limit * ldb ), 32, ldb, KC + ); + + n0_partial_pack = 32; + } + else if ( n0_16 == 1 ) + { + packb_nr_mult_16_bf16bf16f32of32_col_major + ( + ( pack_b_buffer + ( n_full_pieces_loop_limit * KC_updated ) ), + ( b + n_full_pieces_loop_limit * ldb ), 16, ldb, KC + ); + + n0_partial_pack = 16; + } + + if ( n0_partial_rem > 0 ) + { + packb_nrlt16_bf16bf16f32of32_col_major + ( + ( pack_b_buffer + ( n_full_pieces_loop_limit * KC_updated ) + + ( n0_partial_pack * KC_updated ) ), + ( b + ( n_full_pieces_loop_limit + n0_partial_pack ) * ldb ), ldb, KC, + n0_partial_rem + ); + } + } + *rs_b = NR * 2; + *cs_b = NR / 2; +} + +void packb_nr_mult_16_bf16bf16f32of32_col_major + ( + bfloat16* pack_b_buffer, + const bfloat16* b, + const dim_t NR, + const dim_t ldb, + const dim_t KC + ) +{ + + // Used for permuting the mm512i elements for use in dpbf16_ps instruction. 
+ __m512i selector1 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x4, 0x5, 0xC, 0xD ); + __m512i selector2 = _mm512_setr_epi64( 0x2, 0x3, 0xA, 0xB, 0x6, 0x7, 0xE, 0xF ); + + __m512i a_reg[16]; + __m512i b_reg[16]; + + dim_t kr = 0; + for ( kr = 0; ( kr + 31 ) < KC; kr += 32 ) + { + for( dim_t jr = 0; jr < NR; jr += 16 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + + LOAD_16_COLS_AVX512 + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 2 ) * NR ), a_reg[1] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 4 ) * NR ), a_reg[2] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 6 ) * NR ), a_reg[3] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 8 ) * NR ), a_reg[4] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 10 ) * NR ), a_reg[5] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 12 ) * NR ), a_reg[6] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 14 ) * NR ), a_reg[7] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 16 ) * NR ), a_reg[8] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 18 ) * NR ), a_reg[9] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 20 ) * NR ), a_reg[10] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 22 ) * NR ), a_reg[11] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 24 ) * NR ), a_reg[12] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 26 ) * NR ), a_reg[13] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 28 ) * NR ), a_reg[14] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 30 ) * NR ), a_reg[15] ); + + } + } + for ( ; ( kr + 15 ) < KC; kr += 16 ) + { + for( dim_t jr = 0; jr 
< NR; jr += 16 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + + MASK_LOAD_16_COLS_AVX512( 0xFFFF ) + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 2 ) * NR ), a_reg[1] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 4 ) * NR ), a_reg[2] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 6 ) * NR ), a_reg[3] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 8 ) * NR ), a_reg[4] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 10 ) * NR ), a_reg[5] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 12 ) * NR ), a_reg[6] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 14 ) * NR ), a_reg[7] ); + } + } + + for( ; ( kr +7 ) < KC; kr += 8 ) + { + for( dim_t jr = 0; jr < NR; jr += 16 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + + MASK_LOAD_16_COLS_AVX512( 0xFF ) + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 2 ) * NR ), a_reg[1] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 4 ) * NR ), a_reg[2] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 6 ) * NR ), a_reg[3] ); + } + } + for( ; ( kr +3 ) < KC; kr += 4 ) + { + for( dim_t jr = 0; jr < NR; jr += 16 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. 
+ MASK_LOAD_16_COLS_AVX512( 0x0F ) + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr + 2 ) * NR ), a_reg[1] ); + } + } + for( ; ( kr +1 ) < KC; kr += 2 ) + { + for( dim_t jr = 0; jr < NR; jr += 16 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + MASK_LOAD_16_COLS_AVX512( 0x03 ) + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( ( kr ) * NR ), a_reg[0] ); + } + } + for( ; kr < KC; kr += 1 ) + { + for( dim_t jr = 0; jr < NR; jr += 16 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + MASK_LOAD_16_COLS_AVX512( 0x01 ) + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( jr * 2 ) + ( kr * NR ), a_reg[0] ); + } + } +} + + +void packb_nrlt16_bf16bf16f32of32_col_major + ( + bfloat16* pack_b_buffer, + const bfloat16* b, + const dim_t ldb, + const dim_t KC, + const dim_t n0_partial_rem + ) +{ + dim_t NR = 16; + + // Used for permuting the mm512i elements for use in dpbf16_ps instruction. + __m512i selector1 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x4, 0x5, 0xC, 0xD ); + __m512i selector2 = _mm512_setr_epi64( 0x2, 0x3, 0xA, 0xB, 0x6, 0x7, 0xE, 0xF ); + + __m512i a_reg[16]; + __m512i b_reg[16]; + + dim_t kr = 0, jr = 0; + for ( kr = 0; ( kr + 31 ) < KC; kr += 32 ) + { + for( jr = 0; jr < n0_partial_rem; jr += 1 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. 
+ a_reg[jr] = _mm512_loadu_si512( b + ( ldb * ( jr + 0 ) ) + kr ); + } + for(; jr < NR; jr++) + { + a_reg[jr] = _mm512_setzero_si512(); + } + + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 2 ) * NR ), a_reg[1] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 4 ) * NR ), a_reg[2] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 6 ) * NR ), a_reg[3] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 8 ) * NR ), a_reg[4] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 10 ) * NR ), a_reg[5] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 12 ) * NR ), a_reg[6] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 14 ) * NR ), a_reg[7] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 16 ) * NR ), a_reg[8] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 18 ) * NR ), a_reg[9] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 20 ) * NR ), a_reg[10] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 22 ) * NR ), a_reg[11] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 24 ) * NR ), a_reg[12] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 26 ) * NR ), a_reg[13] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 28 ) * NR ), a_reg[14] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 30 ) * NR ), a_reg[15] ); + + } + for ( ; ( kr + 15 ) < KC; kr += 16 ) + { + for( jr = 0; jr < n0_partial_rem; jr += 1 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. 
+ a_reg[jr] = _mm512_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( jr + 0 ) ) + kr ); + } + for( ; jr < NR; jr++ ) + { + a_reg[jr] = _mm512_setzero_si512(); + } + + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 2 ) * NR ), a_reg[1] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 4 ) * NR ), a_reg[2] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 6 ) * NR ), a_reg[3] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 8 ) * NR ), a_reg[4] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 10 ) * NR ), a_reg[5] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 12 ) * NR ), a_reg[6] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 14 ) * NR ), a_reg[7] ); + } + + for ( ; ( kr + 7 ) < KC; kr += 8 ) + { + for( jr = 0; jr < n0_partial_rem; jr += 1 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + a_reg[jr] = _mm512_maskz_loadu_epi16( 0xFF, b + ( ldb * ( jr + 0 ) ) + kr ); + } + for( ; jr < NR; jr++ ) + { + a_reg[jr] = _mm512_setzero_si512(); + } + + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 2 ) * NR ), a_reg[1] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 4 ) * NR ), a_reg[2] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 6 ) * NR ), a_reg[3] ); + } + for ( ; (kr+3) < KC; kr += 4 ) + { + for( jr = 0; jr < n0_partial_rem; jr += 1 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. 
+ a_reg[jr] = _mm512_maskz_loadu_epi16( 0x0F, b + ( ldb * ( jr + 0 ) ) + kr ); + } + for( ; jr < NR; jr++ ) + { + a_reg[jr] = _mm512_setzero_si512(); + } + + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 0 ) * NR ), a_reg[0] ); + _mm512_storeu_si512( pack_b_buffer + ( ( kr + 2 ) * NR ), a_reg[1] ); + } + for ( ; ( kr + 1 ) < KC; kr += 2 ) + { + for( jr = 0; jr < n0_partial_rem; jr += 1 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + a_reg[jr] = _mm512_maskz_loadu_epi16( 0x03, b + ( ldb * ( jr + 0 ) ) + kr ); + } + for( ; jr < NR; jr++ ) + { + a_reg[jr] = _mm512_setzero_si512(); + } + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( kr * NR ), a_reg[0] ); + } + for ( ; kr < KC; kr += 1 ) + { + for( jr = 0; jr < n0_partial_rem; jr += 1 ) + { + // Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row. + a_reg[jr] = _mm512_maskz_loadu_epi16( 0x01, b + ( ldb * ( jr + 0 ) ) + kr ); + } + for( ; jr < NR; jr++ ) + { + a_reg[jr] = _mm512_setzero_si512(); + } + + UNPACKHILO32_AVX512 + UNPACKHILO64_AVX512 + PERMUTEX2_VAR64_AVX512 + SHUFFLE64x2_AVX512 + + // store to pack_b buffer + _mm512_storeu_si512( pack_b_buffer + ( kr * NR ), a_reg[0] ); + } } #endif From 105de694cf837e3a80af8f2ff8e75380aec880e0 Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Mon, 4 Sep 2023 15:07:50 +0530 Subject: [PATCH 158/226] Optimized ZGEMV variant 1 - Added an explicit function definition for ZGEMV var 1. This removes the need to query the context for Zen architectures. - Added a new INSERT_GENTFUNC to generate the definition only for scomplex type. - Rewrote ZDOTXF kernel and added the function name for ZDOTV instead of querying it. - With this change fringe loop is vectorized using SSE instructions. 
AMD-Internal:[CPUPL-3997] Change-Id: I790214d528f9e39f63387bc95bf611f84d3faca3 --- frame/2/gemv/bli_gemv_unf_var1_amd.c | 195 +++++- frame/include/bli_gentfunc_macro_defs.h | 4 + kernels/zen/1f/bli_dotxf_zen_int_8.c | 771 +++++++++++++----------- 3 files changed, 603 insertions(+), 367 deletions(-) diff --git a/frame/2/gemv/bli_gemv_unf_var1_amd.c b/frame/2/gemv/bli_gemv_unf_var1_amd.c index bad0a4f27c..e0a1c861db 100644 --- a/frame/2/gemv/bli_gemv_unf_var1_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c @@ -677,5 +677,198 @@ void bli_sgemv_unf_var1 #endif// BLIS_ENABLE_OPENMP } -INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 ) +void bli_zgemv_unf_var1 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + dcomplex* alpha, + dcomplex* a, inc_t rs_a, inc_t cs_a, + dcomplex* x, inc_t incx, + dcomplex* beta, + dcomplex* y, inc_t incy, + cntx_t* cntx + ) +{ + + const num_t dt = PASTEMAC(z,type); + + dcomplex* A1; + dcomplex* x1; + dcomplex* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + /* Memory pool declarations for packing vector X. */ + mem_t mem_bufX; + rntm_t rntm; + dcomplex* x_temp = x; + inc_t temp_incx = incx; + /* + Boolean to check if the X has been packed + and memory needs to be freed in the end + */ + bool is_x_temp_buf_created = FALSE; + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_iter, &n_elem, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + /* + Function pointer declaration for the functions + that will be used by this API + */ + zdotxf_ker_ft dotxf_kr_ptr; // ZDOTXF + zscal2v_ker_ft scal2v_kr_ptr; // ZSCAL2V + + /* + Fatbinary config amdzen when run on non-AMD X86 will query for + the support of AVX512 or AVX2, if AVX512 - arch_id will be zen4 + or for AVX2 it will be zen3. 
+ */ + arch_t id = bli_arch_query_id(); + + switch (id) + { + case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: + + /* + Assign the AVX2 based kernel function pointers for + DOTXF, SCAL2V and corresponding fusing + factor of DOTXF kernel + */ + + dotxf_kr_ptr = bli_zdotxf_zen_int_6; + b_fuse = 6; + + scal2v_kr_ptr = bli_zscal2v_zen_int; + break; + + default: + // For non-Zen architectures, query the context if it is NULL + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + /* + Query the context for the kernel function pointers for + DOTXF, SCAL2V and corresponding fusing + factor of DOTXF kernel + */ + dotxf_kr_ptr = bli_cntx_get_l1f_ker_dt( BLIS_DCOMPLEX, BLIS_DOTXF_KER, cntx );; + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); + + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(dt, BLIS_SCAL2V_KER, cntx); + } + + if( incx > 1 ) + { + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_pba_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL. + */ + mem_bufX.pblk.buf = NULL; + mem_bufX.pblk.block_size = 0; + mem_bufX.buf_type = 0; + mem_bufX.size = 0; + mem_bufX.pool = NULL; + + /* + In order to get the buffer from pool via rntm access to memory broker + is needed. Following are initializations for rntm + */ + + bli_rntm_init_from_global(&rntm); + bli_rntm_set_num_threads_only(1, &rntm); + bli_pba_rntm_set_pba(&rntm); + + /* + Calculate the size required for n_elem dcomplex elements in vector X. + */ + size_t buffer_size = n_elem * sizeof(dcomplex); + + /* + Acquire a Buffer(n_elem*size(dcomplex)) from the memory broker + and save the associated mem_t entry to mem_bufX.
+ */ + bli_pba_acquire_m(&rntm, buffer_size, BLIS_BUFFER_FOR_B_PANEL, &mem_bufX); + + /* + Continue packing X if buffer memory is allocated + */ + if ((bli_mem_is_alloc(&mem_bufX))) + { + x_temp = bli_mem_buffer(&mem_bufX); + temp_incx = 1; + dcomplex* alpha_passed = PASTEMAC(z,1); + + /* + Invoke the ZSCAL2V function using the function pointer + */ + scal2v_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + alpha_passed, + x, incx, + x_temp, temp_incx, + cntx + ); + + /* + Set x is packed as the memory allocation was + successful and contents have been copied + */ + is_x_temp_buf_created = TRUE; + } + } + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x_temp + (0 )*temp_incx; + y1 = y + (i )*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + dotxf_kr_ptr + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, temp_incx, + beta, + y1, incy, + cntx + ); + + } + /* + Check if temp X buffer was used for compute + */ + if (is_x_temp_buf_created) + { + /* + Return the buffer to pool + */ + bli_pba_release(&rntm, &mem_bufX); + } +} + + +INSERT_GENTFUNC_BASIC0_C( gemv_unf_var1 ) diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 96a658110d..9836819b98 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -214,6 +214,10 @@ GENTFUNC( double, d, tfuncname ) GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) +#define INSERT_GENTFUNC_BASIC0_C( tfuncname ) \ +\ +GENTFUNC( scomplex, c, tfuncname ) + // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \ diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index 6f652db309..4a73fd0bc2 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -1616,419 +1616,458 @@ void bli_zdotxf_zen_int_6 cntx_t* 
restrict cntx ) { - /** - * Handles only unit stride cases and 6 column at a time - * b_n check for columns to be 6. - */ - if ( (inca == 1) && (incx == 1) && (incy == 1) && (b_n == 6) ) + /* If the vectors are empty or if alpha is zero, return early */ + if ( bli_zero_dim1( m ) || PASTEMAC(z,eq0)( *alpha ) ) { - /* Temporary rho buffer holds computed dot product result */ - dcomplex r[ 6 ]; + bli_zscalv_zen_int + ( + BLIS_NO_CONJUGATE, + b_n, + beta, + y, incy, + cntx + ); - /* If beta is zero, clear y. Otherwise, scale by beta. */ - if ( PASTEMAC(z,eq0)( *beta ) ) - { - for ( dim_t i = 0; i < 6; ++i ) - { - PASTEMAC(z,set0s)( y[i] ); - } - } - else + return; + } + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over dotxv. + if ( b_n != 6 ) + { + for ( dim_t i = 0; i < b_n; ++i ) { - for ( dim_t i = 0; i < 6; ++i ) - { - PASTEMAC(z,scals)( *beta, y[i] ); - } + dcomplex* restrict a1 = a + (0 )*inca + (i )*lda; + dcomplex* restrict x1 = x + (0 )*incx; + dcomplex* restrict psi1 = y + (i )*incy; + + bli_zdotxv_zen_int + ( + conjat, + conjx, + m, + alpha, + a1, inca, + x1, incx, + beta, + psi1, + cntx + ); } - /* If the vectors are empty or if alpha is zero, return early*/ - if ( bli_zero_dim1( m ) || PASTEMAC(z,eq0)( *alpha ) ) return; + return; + } - /* Initialize r vector to 0. */ - for ( dim_t i = 0; i < 6; ++i ) PASTEMAC(z,set0s)( r[i] ); + dim_t rem = m; - /* If a must be conjugated, we do so indirectly by first - * toggling the effective conjugation of x and then conjugating - * the resulting do products. - * Rather conjugating each element of a matrix, final computed result - * can be conjugated at the end of loop. This takes off the overhead - * of conjugating each element inside the loop and improves the - * performance. 
- */ - conj_t conjx_use = conjx; + double *restrict av[6]; + double *restrict x_temp = (double *)(x); - if ( bli_is_conj( conjat ) ) - { - bli_toggle_conj( &conjx_use ); - } + av[0] = (double *)(a + 0 * lda); + av[1] = (double *)(a + 1 * lda); + av[2] = (double *)(a + 2 * lda); + av[3] = (double *)(a + 3 * lda); + av[4] = (double *)(a + 4 * lda); + av[5] = (double *)(a + 5 * lda); - /* Setting rho vectors to 0 */ - v4df_t rho0v; rho0v.v = _mm256_setzero_pd(); - v4df_t rho1v; rho1v.v = _mm256_setzero_pd(); - v4df_t rho2v; rho2v.v = _mm256_setzero_pd(); - v4df_t rho3v; rho3v.v = _mm256_setzero_pd(); - v4df_t rho4v; rho4v.v = _mm256_setzero_pd(); - v4df_t rho5v; rho5v.v = _mm256_setzero_pd(); + dcomplex res[6]; - v4df_t rho6v; rho6v.v = _mm256_setzero_pd(); - v4df_t rho7v; rho7v.v = _mm256_setzero_pd(); - v4df_t rho8v; rho8v.v = _mm256_setzero_pd(); - v4df_t rho9v; rho9v.v = _mm256_setzero_pd(); - v4df_t rho10v; rho10v.v = _mm256_setzero_pd(); - v4df_t rho11v; rho11v.v = _mm256_setzero_pd(); + res[0] = res[1] = res[2] = res[3] = res[4] = res[5] = (*bli_z0); - /* Holds 2 dcomplex element of x vector - * for computing dot product with A tile - */ - v4df_t x0v, x1v; - /* Holds 2x6 tile of matrix A */ - v4df_t a0v, a1v, a2v, a3v, a4v, a5v; - /** - * Since complex datatype multiplication is - * being held in two sets of rho vectors. - * Where first set holds the computaion with - * real part of vector x and other holds - * imaginary part of vector x. - * For final computation, based on conj sign - * of imaginary component needs to be toggled. 
- */ - __m256d no_conju = _mm256_setr_pd(-1, 1, -1, 1); - __m256d conju = _mm256_setr_pd(1, -1, 1, -1); - dim_t iter = m / 2; - dim_t rem = m % 2; - dim_t i = 0; + conj_t conjx_use = conjx; + + if (bli_is_conj(conjat)) + { + bli_toggle_conj(&conjx_use); + } + + if (incx == 1 && inca == 1) + { + rem = m % 2; + v4df_t rhov[12], a_vec[6], xv[2], conj_mul; + + rhov[0].v = _mm256_setzero_pd(); + rhov[1].v = _mm256_setzero_pd(); + rhov[2].v = _mm256_setzero_pd(); + rhov[3].v = _mm256_setzero_pd(); + rhov[4].v = _mm256_setzero_pd(); + rhov[5].v = _mm256_setzero_pd(); + rhov[6].v = _mm256_setzero_pd(); + rhov[7].v = _mm256_setzero_pd(); + rhov[8].v = _mm256_setzero_pd(); + rhov[9].v = _mm256_setzero_pd(); + rhov[10].v = _mm256_setzero_pd(); + rhov[11].v = _mm256_setzero_pd(); - if ( bli_is_noconj( conjx_use ) ) + for (dim_t i = 0; (i + 1) < m; i += 2) { - if(iter) - { - for ( ; (i+1) < m; i+=2) - { - /*Load 2 dcomplex elements from - * vector x - */ - x0v.v = _mm256_loadu_pd( - (double *)(x + i) ); - /* x1v.v holds imaginary part of dcomplex - * elements from vector x - * It will do following operation. - * R0 I0 R1 I1 => I0 I0 I1 I1 - * - */ - x1v.v = _mm256_permute_pd( x0v.v, 15 ); - /* x1v.v holds real part of dcomplex - * elements from vector x - * It will do following operation. 
- * R0 I0 R1 I1 => R0 R0 R1 R1 - */ - x0v.v = _mm256_permute_pd( x0v.v, 0 ); - - /*Load 2x6 tile of matrix A*/ - a0v.v = _mm256_loadu_pd( (double *) - (a + i + 0 * lda) ); - a1v.v = _mm256_loadu_pd( (double *) - (a + i + 1 * lda) ); - a2v.v = _mm256_loadu_pd( (double *) - (a + i + 2 * lda) ); - a3v.v = _mm256_loadu_pd( (double *) - (a + i + 3 * lda) ); - a4v.v = _mm256_loadu_pd( (double *) - (a + i + 4 * lda) ); - a5v.v = _mm256_loadu_pd( (double *) - (a + i + 5 * lda) ); - - // perform: rho?v += a?v * x0v; - rho0v.v = _mm256_fmadd_pd( a0v.v, - x0v.v, rho0v.v ); - rho6v.v = _mm256_fmadd_pd( a0v.v, - x1v.v, rho6v.v ); - - rho1v.v = _mm256_fmadd_pd( a1v.v, - x0v.v, rho1v.v ); - rho7v.v = _mm256_fmadd_pd( a1v.v, - x1v.v, rho7v.v ); - - rho2v.v = _mm256_fmadd_pd( a2v.v, - x0v.v, rho2v.v ); - rho8v.v = _mm256_fmadd_pd( a2v.v, - x1v.v, rho8v.v ); - - rho3v.v = _mm256_fmadd_pd( a3v.v, - x0v.v, rho3v.v ); - rho9v.v = _mm256_fmadd_pd( a3v.v, - x1v.v, rho9v.v ); - - rho4v.v = _mm256_fmadd_pd( a4v.v, - x0v.v, rho4v.v ); - rho10v.v = _mm256_fmadd_pd( a4v.v, - x1v.v, rho10v.v ); - - rho5v.v = _mm256_fmadd_pd( a5v.v, - x0v.v, rho5v.v ); - rho11v.v = _mm256_fmadd_pd( a5v.v, - x1v.v, rho11v.v ); - } + // Load 2 dcomplex elements from vector x + xv[0].v = _mm256_loadu_pd(x_temp); - /*Swapping position of real and imag component - * for horizontal addition to get the final - * dot product computation - * rho register are holding computation which needs - * to be arranged in following manner. 
- * Ra0*Ix0 | Ia0*Ix0 | Ra1*Ix1 | Ia1*Ix1 - * || - * \/ - * Ia0*Ix0 | Ra0*Ix0 | Ia1*Ix1 | Ra1*Ix1 - */ - rho6v.v = _mm256_permute_pd(rho6v.v, 0x05); - rho7v.v = _mm256_permute_pd(rho7v.v, 0x05); - rho8v.v = _mm256_permute_pd(rho8v.v, 0x05); - rho9v.v = _mm256_permute_pd(rho9v.v, 0x05); - rho10v.v = _mm256_permute_pd(rho10v.v, 0x05); - rho11v.v = _mm256_permute_pd(rho11v.v, 0x05); + // xv[1].v - R0 I0 R1 I1 => I0 I0 I1 I1 + xv[1].v = _mm256_permute_pd(xv[0].v, 15); - /*Negating imaginary part for computing - * the final result of dcomplex multiplication - */ - rho6v.v = _mm256_mul_pd(rho6v.v, no_conju); - rho7v.v = _mm256_mul_pd(rho7v.v, no_conju); - rho8v.v = _mm256_mul_pd(rho8v.v, no_conju); - rho9v.v = _mm256_mul_pd(rho9v.v, no_conju); - rho10v.v = _mm256_mul_pd(rho10v.v, no_conju); - rho11v.v = _mm256_mul_pd(rho11v.v, no_conju); - - rho0v.v = _mm256_add_pd(rho0v.v, rho6v.v); - rho1v.v = _mm256_add_pd(rho1v.v, rho7v.v); - rho2v.v = _mm256_add_pd(rho2v.v, rho8v.v); - rho3v.v = _mm256_add_pd(rho3v.v, rho9v.v); - rho4v.v = _mm256_add_pd(rho4v.v, rho10v.v); - rho5v.v = _mm256_add_pd(rho5v.v, rho11v.v); - - /*rho0, rho1, rho2 holds final dot product - * result of 6 dcomplex elements. 
- */ - rho0v.d[0] += rho0v.d[2]; - rho0v.d[1] += rho0v.d[3]; + // xv[0].v - R0 I0 R1 I1 => R0 R0 R1 R1 + xv[0].v = _mm256_permute_pd(xv[0].v, 0); - rho0v.d[2] = rho1v.d[0] + rho1v.d[2]; - rho0v.d[3] = rho1v.d[1] + rho1v.d[3]; + a_vec[0].v = _mm256_loadu_pd((double *)(av[0])); + a_vec[1].v = _mm256_loadu_pd((double *)(av[1])); + a_vec[2].v = _mm256_loadu_pd((double *)(av[2])); + a_vec[3].v = _mm256_loadu_pd((double *)(av[3])); + a_vec[4].v = _mm256_loadu_pd((double *)(av[4])); + a_vec[5].v = _mm256_loadu_pd((double *)(av[5])); - rho1v.d[0] = rho2v.d[0] + rho2v.d[2]; - rho1v.d[1] = rho2v.d[1] + rho2v.d[3]; + // perform: rho?v += a?v * xv[0]; + rhov[0].v = _mm256_fmadd_pd(a_vec[0].v, xv[0].v, rhov[0].v); + rhov[6].v = _mm256_fmadd_pd(a_vec[0].v, xv[1].v, rhov[6].v); - rho1v.d[2] = rho3v.d[0] + rho3v.d[2]; - rho1v.d[3] = rho3v.d[1] + rho3v.d[3]; + rhov[1].v = _mm256_fmadd_pd(a_vec[1].v, xv[0].v, rhov[1].v); + rhov[7].v = _mm256_fmadd_pd(a_vec[1].v, xv[1].v, rhov[7].v); - rho2v.d[0] = rho4v.d[0] + rho4v.d[2]; - rho2v.d[1] = rho4v.d[1] + rho4v.d[3]; + rhov[2].v = _mm256_fmadd_pd(a_vec[2].v, xv[0].v, rhov[2].v); + rhov[8].v = _mm256_fmadd_pd(a_vec[2].v, xv[1].v, rhov[8].v); - rho2v.d[2] = rho5v.d[0] + rho5v.d[2]; - rho2v.d[3] = rho5v.d[1] + rho5v.d[3]; + rhov[3].v = _mm256_fmadd_pd(a_vec[3].v, xv[0].v, rhov[3].v); + rhov[9].v = _mm256_fmadd_pd(a_vec[3].v, xv[1].v, rhov[9].v); - /*Computed dot product result is being stored - * in temp buffer r for further computation. 
- */ - _mm256_storeu_pd((double *)r, rho0v.v); - _mm256_storeu_pd((double *)(r+2) , rho1v.v); - _mm256_storeu_pd((double *)(r+4) , rho2v.v); + rhov[4].v = _mm256_fmadd_pd(a_vec[4].v, xv[0].v, rhov[4].v); + rhov[10].v = _mm256_fmadd_pd(a_vec[4].v, xv[1].v, rhov[10].v); - } - /*handles remainder cases*/ - if(rem) - { - PRAGMA_SIMD - for(dim_t p = 0; p < 6 ; p++) - { - PASTEMAC(z,axpys)( a[i + p*lda] - , x[i], r[p] ); - } - } + rhov[5].v = _mm256_fmadd_pd(a_vec[5].v, xv[0].v, rhov[5].v); + rhov[11].v = _mm256_fmadd_pd(a_vec[5].v, xv[1].v, rhov[11].v); + + av[0] += 4; + av[1] += 4; + av[2] += 4; + av[3] += 4; + av[4] += 4; + av[5] += 4; + + x_temp += 4; + } + + if (bli_is_noconj(conjx_use)) + { + conj_mul.v = _mm256_setr_pd(-1, 1, -1, 1); } else { - if(iter) - { - for ( ; (i+1) < m; i+=2) - { - /*Load 2 dcomplex elements from - * vector x - */ - x0v.v = _mm256_loadu_pd( (double *) - (x + i) ); - /* x1v.v holds imaginary part of dcomplex - * elements from vector x - */ - x1v.v = _mm256_permute_pd( x0v.v, 15 ); - /* x1v.v holds real part of dcomplex - * elements from vector x - */ - x0v.v = _mm256_permute_pd( x0v.v, 0 ); - - /*Load 2x6 tile of matrix A*/ - a0v.v = _mm256_loadu_pd( (double *) - (a + i + 0 * lda)); - a1v.v = _mm256_loadu_pd( (double *) - (a + i + 1 * lda)); - a2v.v = _mm256_loadu_pd( (double *) - (a + i + 2 * lda)); - a3v.v = _mm256_loadu_pd( (double *) - (a + i + 3 * lda)); - a4v.v = _mm256_loadu_pd( (double *) - (a + i + 4 * lda)); - a5v.v = _mm256_loadu_pd( (double *) - (a + i + 5 * lda)); - - // perform: rho?v += a?v * x0v; - rho0v.v = _mm256_fmadd_pd( a0v.v, - x0v.v, rho0v.v ); - rho6v.v = _mm256_fmadd_pd( a0v.v, - x1v.v, rho6v.v ); - - rho1v.v = _mm256_fmadd_pd( a1v.v, - x0v.v, rho1v.v ); - rho7v.v = _mm256_fmadd_pd( a1v.v, - x1v.v, rho7v.v ); - - rho2v.v = _mm256_fmadd_pd( a2v.v, - x0v.v, rho2v.v ); - rho8v.v = _mm256_fmadd_pd( a2v.v, - x1v.v, rho8v.v ); - - rho3v.v = _mm256_fmadd_pd( a3v.v, - x0v.v, rho3v.v ); - rho9v.v = _mm256_fmadd_pd( a3v.v, - 
x1v.v, rho9v.v ); - - rho4v.v = _mm256_fmadd_pd( a4v.v, - x0v.v, rho4v.v ); - rho10v.v = _mm256_fmadd_pd( a4v.v, - x1v.v, rho10v.v ); - - rho5v.v = _mm256_fmadd_pd( a5v.v, - x0v.v, rho5v.v ); - rho11v.v = _mm256_fmadd_pd( a5v.v, - x1v.v, rho11v.v ); - } + conj_mul.v = _mm256_setr_pd(1, -1, 1, -1); + } - /*Swapping position of real and imag component - * for horizontal addition to get the final - * dot product computation - * rho register are holding computation which needs - * to be arranged in following manner. - * Ra0*Ix0 | Ia0*Ix0 | Ra1*Ix1 | Ia1*Ix1 - * || - * \/ - * Ia0*Ix0 | Ra0*Ix0 | Ia1*Ix1 | Ra1*Ix1 - */ - rho6v.v = _mm256_permute_pd(rho6v.v, 0x05); - rho7v.v = _mm256_permute_pd(rho7v.v, 0x05); - rho8v.v = _mm256_permute_pd(rho8v.v, 0x05); - rho9v.v = _mm256_permute_pd(rho9v.v, 0x05); - rho10v.v = _mm256_permute_pd(rho10v.v, 0x05); - rho11v.v = _mm256_permute_pd(rho11v.v, 0x05); + /*Swapping position of real and imag component + * for horizontal addition to get the final + * dot product computation + * rho register are holding computation which needs + * to be arranged in following manner. 
+ * Ra0*Ix0 | Ia0*Ix0 | Ra1*Ix1 | Ia1*Ix1 + * || + * \/ + * Ia0*Ix0 | Ra0*Ix0 | Ia1*Ix1 | Ra1*Ix1 + */ + rhov[6].v = _mm256_permute_pd(rhov[6].v, 0x05); + rhov[7].v = _mm256_permute_pd(rhov[7].v, 0x05); + rhov[8].v = _mm256_permute_pd(rhov[8].v, 0x05); + rhov[9].v = _mm256_permute_pd(rhov[9].v, 0x05); + rhov[10].v = _mm256_permute_pd(rhov[10].v, 0x05); + rhov[11].v = _mm256_permute_pd(rhov[11].v, 0x05); + + /* + Modifying the imag sign according to the conj value + */ + rhov[6].v = _mm256_mul_pd(rhov[6].v, conj_mul.v); + rhov[7].v = _mm256_mul_pd(rhov[7].v, conj_mul.v); + rhov[8].v = _mm256_mul_pd(rhov[8].v, conj_mul.v); + rhov[9].v = _mm256_mul_pd(rhov[9].v, conj_mul.v); + rhov[10].v = _mm256_mul_pd(rhov[10].v, conj_mul.v); + rhov[11].v = _mm256_mul_pd(rhov[11].v, conj_mul.v); + + rhov[0].v = _mm256_add_pd(rhov[0].v, rhov[6].v); + rhov[1].v = _mm256_add_pd(rhov[1].v, rhov[7].v); + rhov[2].v = _mm256_add_pd(rhov[2].v, rhov[8].v); + rhov[3].v = _mm256_add_pd(rhov[3].v, rhov[9].v); + rhov[4].v = _mm256_add_pd(rhov[4].v, rhov[10].v); + rhov[5].v = _mm256_add_pd(rhov[5].v, rhov[11].v); + + /*rho0, rho1, rho2 holds final dot product + * result of 6 dcomplex elements. + */ + rhov[0].d[0] += rhov[0].d[2]; + rhov[0].d[1] += rhov[0].d[3]; - /*Negating imaginary part for computing - * the final result of dcomplex multiplication - */ - rho6v.v = _mm256_mul_pd(rho6v.v, conju); - rho7v.v = _mm256_mul_pd(rho7v.v, conju); - rho8v.v = _mm256_mul_pd(rho8v.v, conju); - rho9v.v = _mm256_mul_pd(rho9v.v, conju); - rho10v.v = _mm256_mul_pd(rho10v.v, conju); - rho11v.v = _mm256_mul_pd(rho11v.v, conju); - - rho0v.v = _mm256_add_pd(rho0v.v, rho6v.v); - rho1v.v = _mm256_add_pd(rho1v.v, rho7v.v); - rho2v.v = _mm256_add_pd(rho2v.v, rho8v.v); - rho3v.v = _mm256_add_pd(rho3v.v, rho9v.v); - rho4v.v = _mm256_add_pd(rho4v.v, rho10v.v); - rho5v.v = _mm256_add_pd(rho5v.v, rho11v.v); - - /*rho0, rho1, rho2 holds final dot product - * result of 6 dcomplex elements. 
- */ - rho0v.d[0] += rho0v.d[2]; - rho0v.d[1] += rho0v.d[3]; + rhov[0].d[2] = rhov[1].d[0] + rhov[1].d[2]; + rhov[0].d[3] = rhov[1].d[1] + rhov[1].d[3]; - rho0v.d[2] = rho1v.d[0] + rho1v.d[2]; - rho0v.d[3] = rho1v.d[1] + rho1v.d[3]; + rhov[1].d[0] = rhov[2].d[0] + rhov[2].d[2]; + rhov[1].d[1] = rhov[2].d[1] + rhov[2].d[3]; - rho1v.d[0] = rho2v.d[0] + rho2v.d[2]; - rho1v.d[1] = rho2v.d[1] + rho2v.d[3]; + rhov[1].d[2] = rhov[3].d[0] + rhov[3].d[2]; + rhov[1].d[3] = rhov[3].d[1] + rhov[3].d[3]; - rho1v.d[2] = rho3v.d[0] + rho3v.d[2]; - rho1v.d[3] = rho3v.d[1] + rho3v.d[3]; + rhov[2].d[0] = rhov[4].d[0] + rhov[4].d[2]; + rhov[2].d[1] = rhov[4].d[1] + rhov[4].d[3]; - rho2v.d[0] = rho4v.d[0] + rho4v.d[2]; - rho2v.d[1] = rho4v.d[1] + rho4v.d[3]; + rhov[2].d[2] = rhov[5].d[0] + rhov[5].d[2]; + rhov[2].d[3] = rhov[5].d[1] + rhov[5].d[3]; - rho2v.d[2] = rho5v.d[0] + rho5v.d[2]; - rho2v.d[3] = rho5v.d[1] + rho5v.d[3]; + /* + Computed dot product result is being stored + in temp buffer r for further computation. + */ + _mm256_storeu_pd((double *)res, rhov[0].v); + _mm256_storeu_pd((double *)(res + 2), rhov[1].v); + _mm256_storeu_pd((double *)(res + 4), rhov[2].v); + } - /*Computed dot product result is being stored - * in temp buffer r for further computation. - */ - _mm256_storeu_pd((double *)r, rho0v.v); - _mm256_storeu_pd((double *)(r+2) , rho1v.v); - _mm256_storeu_pd((double *)(r+4) , rho2v.v); + // This section will have the whole of compute when incx != 1 || inca != 1 + if (rem) + { + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur later, + // especially if BLIS is compiled with -mfpmath=sse). 
+ _mm256_zeroupper(); + + v2df_t rhov[12], a_vec[6], xv[2], conj_mul; + + rhov[0].v = _mm_setzero_pd(); + rhov[1].v = _mm_setzero_pd(); + rhov[2].v = _mm_setzero_pd(); + rhov[3].v = _mm_setzero_pd(); + rhov[4].v = _mm_setzero_pd(); + rhov[5].v = _mm_setzero_pd(); + rhov[6].v = _mm_setzero_pd(); + rhov[7].v = _mm_setzero_pd(); + rhov[8].v = _mm_setzero_pd(); + rhov[9].v = _mm_setzero_pd(); + rhov[10].v = _mm_setzero_pd(); + rhov[11].v = _mm_setzero_pd(); + + for (dim_t i = 0; i < rem; i++) + { + // Load 1 dcomplex element (R0 I0) from vector x + xv[0].v = _mm_loadu_pd(x_temp); - } - if(rem) - { - PRAGMA_SIMD - for(dim_t p = 0; p < 6 ; p++) - { - PASTEMAC(z,axpyjs)(a[i + p*lda] - , x[i], r[p] ); - } - } - } + // xv[1].v - R0 I0 => I0 I0 + xv[1].v = _mm_permute_pd(xv[0].v, 0b11); - if ( bli_is_conj( conjat ) ) - for ( dim_t i = 0; i < 6; ++i ) - { - PASTEMAC(z,conjs)( r[i] ); - } + // xv[0].v - R0 I0 => R0 R0 + xv[0].v = _mm_permute_pd(xv[0].v, 0b00); - /*scaling dot product result with alpha and - * adding the result to vector - */ - for ( dim_t i = 0; i < 6; ++i ) + a_vec[0].v = _mm_loadu_pd((double *)(av[0])); + a_vec[1].v = _mm_loadu_pd((double *)(av[1])); + a_vec[2].v = _mm_loadu_pd((double *)(av[2])); + a_vec[3].v = _mm_loadu_pd((double *)(av[3])); + a_vec[4].v = _mm_loadu_pd((double *)(av[4])); + a_vec[5].v = _mm_loadu_pd((double *)(av[5])); + + // perform: rho?v += a?v * xv[0]; + rhov[0].v = _mm_fmadd_pd(a_vec[0].v, xv[0].v, rhov[0].v); + rhov[6].v = _mm_fmadd_pd(a_vec[0].v, xv[1].v, rhov[6].v); + + rhov[1].v = _mm_fmadd_pd(a_vec[1].v, xv[0].v, rhov[1].v); + rhov[7].v = _mm_fmadd_pd(a_vec[1].v, xv[1].v, rhov[7].v); + + rhov[2].v = _mm_fmadd_pd(a_vec[2].v, xv[0].v, rhov[2].v); + rhov[8].v = _mm_fmadd_pd(a_vec[2].v, xv[1].v, rhov[8].v); + + rhov[3].v = _mm_fmadd_pd(a_vec[3].v, xv[0].v, rhov[3].v); + rhov[9].v = _mm_fmadd_pd(a_vec[3].v, xv[1].v, rhov[9].v); + + rhov[4].v = _mm_fmadd_pd(a_vec[4].v, xv[0].v, rhov[4].v); + rhov[10].v = 
_mm_fmadd_pd(a_vec[4].v, xv[1].v, rhov[10].v); + + rhov[5].v = _mm_fmadd_pd(a_vec[5].v, xv[0].v, rhov[5].v); + rhov[11].v = _mm_fmadd_pd(a_vec[5].v, xv[1].v, rhov[11].v); + + av[0] += 2 * inca; + av[1] += 2 * inca; + av[2] += 2 * inca; + av[3] += 2 * inca; + av[4] += 2 * inca; + av[5] += 2 * inca; + + x_temp += 2 * incx; + } + + if (bli_is_noconj(conjx_use)) + { + conj_mul.v = _mm_setr_pd(-1, 1); + } + else { - PASTEMAC(z,axpys)( *alpha, r[i], y[i] ); + conj_mul.v = _mm_setr_pd(1, -1); } + + rhov[6].v = _mm_permute_pd(rhov[6].v, 0b01); + rhov[7].v = _mm_permute_pd(rhov[7].v, 0b01); + rhov[8].v = _mm_permute_pd(rhov[8].v, 0b01); + rhov[9].v = _mm_permute_pd(rhov[9].v, 0b01); + rhov[10].v = _mm_permute_pd(rhov[10].v, 0b01); + rhov[11].v = _mm_permute_pd(rhov[11].v, 0b01); + + /* + Modifying the imag sign according to the conj value + */ + rhov[6].v = _mm_mul_pd(rhov[6].v, conj_mul.v); + rhov[7].v = _mm_mul_pd(rhov[7].v, conj_mul.v); + rhov[8].v = _mm_mul_pd(rhov[8].v, conj_mul.v); + rhov[9].v = _mm_mul_pd(rhov[9].v, conj_mul.v); + rhov[10].v = _mm_mul_pd(rhov[10].v, conj_mul.v); + rhov[11].v = _mm_mul_pd(rhov[11].v, conj_mul.v); + + rhov[0].v = _mm_add_pd(rhov[0].v, rhov[6].v); + rhov[1].v = _mm_add_pd(rhov[1].v, rhov[7].v); + rhov[2].v = _mm_add_pd(rhov[2].v, rhov[8].v); + rhov[3].v = _mm_add_pd(rhov[3].v, rhov[9].v); + rhov[4].v = _mm_add_pd(rhov[4].v, rhov[10].v); + rhov[5].v = _mm_add_pd(rhov[5].v, rhov[11].v); + + rhov[6].v = _mm_loadu_pd((double *)(res)); + rhov[7].v = _mm_loadu_pd((double *)(res + 1)); + rhov[8].v = _mm_loadu_pd((double *)(res + 2)); + rhov[9].v = _mm_loadu_pd((double *)(res + 3)); + rhov[10].v = _mm_loadu_pd((double *)(res + 4)); + rhov[11].v = _mm_loadu_pd((double *)(res + 5)); + + rhov[0].v = _mm_add_pd(rhov[0].v, rhov[6].v); + rhov[1].v = _mm_add_pd(rhov[1].v, rhov[7].v); + rhov[2].v = _mm_add_pd(rhov[2].v, rhov[8].v); + rhov[3].v = _mm_add_pd(rhov[3].v, rhov[9].v); + rhov[4].v = _mm_add_pd(rhov[4].v, rhov[10].v); + rhov[5].v = 
_mm_add_pd(rhov[5].v, rhov[11].v); + + /* + Computed dot product result is being stored + in temp buffer r for further computation. + */ + _mm_storeu_pd((double *)res, rhov[0].v); + _mm_storeu_pd((double *)(res + 1), rhov[1].v); + _mm_storeu_pd((double *)(res + 2), rhov[2].v); + _mm_storeu_pd((double *)(res + 3), rhov[3].v); + _mm_storeu_pd((double *)(res + 4), rhov[4].v); + _mm_storeu_pd((double *)(res + 5), rhov[5].v); + + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur later, + // especially if BLIS is compiled with -mfpmath=sse). + _mm256_zeroupper(); } - else + + // Multiplying 'A' * 'x' by 'alpha' + __m256d alpha_r, alpha_i, temp_v[3]; + v4df_t rhov[3]; + + rhov[0].v = _mm256_loadu_pd((double *)(res)); + rhov[1].v = _mm256_loadu_pd((double *)(res + 2)); + rhov[2].v = _mm256_loadu_pd((double *)(res + 4)); + + if (bli_is_conj(conjat)) { - /* Query the context for the kernel function pointer. 
*/ - const num_t dt = PASTEMAC(z,type); - PASTECH(z,dotxv_ker_ft) kfp_dv - = - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); + __m256d conj_mul = _mm256_setr_pd(1, -1, 1, -1); - for ( dim_t i = 0; i < b_n; ++i ) - { - dcomplex* restrict a1 = a + (0 )*inca + (i )*lda; - dcomplex* restrict x1 = x + (0 )*incx; - dcomplex* restrict psi1 = y + (i )*incy; + rhov[0].v = _mm256_mul_pd(rhov[0].v, conj_mul); + rhov[1].v = _mm256_mul_pd(rhov[1].v, conj_mul); + rhov[2].v = _mm256_mul_pd(rhov[2].v, conj_mul); + } - kfp_dv - ( - conjat, - conjx, - m, - alpha, - a1, inca, - x1, incx, - beta, - psi1, - cntx - ); + alpha_r = _mm256_broadcast_sd(&((*alpha).real)); + alpha_i = _mm256_broadcast_sd(&((*alpha).imag)); + + temp_v[0] = _mm256_mul_pd(rhov[0].v, alpha_i); + temp_v[1] = _mm256_mul_pd(rhov[1].v, alpha_i); + temp_v[2] = _mm256_mul_pd(rhov[2].v, alpha_i); + + temp_v[0] = _mm256_permute_pd(temp_v[0], 0b0101); + temp_v[1] = _mm256_permute_pd(temp_v[1], 0b0101); + temp_v[2] = _mm256_permute_pd(temp_v[2], 0b0101); + + rhov[0].v = _mm256_fmaddsub_pd(rhov[0].v, alpha_r, temp_v[0]); + rhov[1].v = _mm256_fmaddsub_pd(rhov[1].v, alpha_r, temp_v[1]); + rhov[2].v = _mm256_fmaddsub_pd(rhov[2].v, alpha_r, temp_v[2]); + + // When 'beta' is not zero we need to multiply scale 'y' by 'beta' + if (!PASTEMAC(z, eq0)(*beta)) + { + v4df_t yv[3]; + __m256d beta_r, beta_i; + + beta_r = _mm256_broadcast_sd(&((*beta).real)); + beta_i = _mm256_broadcast_sd(&((*beta).imag)); + + if (incy == 1) + { + yv[0].v = _mm256_loadu_pd((double *)(y)); + yv[1].v = _mm256_loadu_pd((double *)(y + 2)); + yv[2].v = _mm256_loadu_pd((double *)(y + 4)); } + else + { + /* + This can be done using SSE instructions + but has been kept as scalar code to avoid + mixing SSE with AVX + */ + yv[0].d[0] = (*(y + 0 * incy)).real; + yv[0].d[1] = (*(y + 0 * incy)).imag; + yv[0].d[2] = (*(y + 1 * incy)).real; + yv[0].d[3] = (*(y + 1 * incy)).imag; + + yv[1].d[0] = (*(y + 2 * incy)).real; + yv[1].d[1] = (*(y + 2 * incy)).imag; + 
yv[1].d[2] = (*(y + 3 * incy)).real; + yv[1].d[3] = (*(y + 3 * incy)).imag; + + yv[2].d[0] = (*(y + 4 * incy)).real; + yv[2].d[1] = (*(y + 4 * incy)).imag; + yv[2].d[2] = (*(y + 5 * incy)).real; + yv[2].d[3] = (*(y + 5 * incy)).imag; + } + + temp_v[0] = _mm256_mul_pd(yv[0].v, beta_i); + temp_v[1] = _mm256_mul_pd(yv[1].v, beta_i); + temp_v[2] = _mm256_mul_pd(yv[2].v, beta_i); + + temp_v[0] = _mm256_permute_pd(temp_v[0], 0b0101); + temp_v[1] = _mm256_permute_pd(temp_v[1], 0b0101); + temp_v[2] = _mm256_permute_pd(temp_v[2], 0b0101); + + yv[0].v = _mm256_fmaddsub_pd(yv[0].v, beta_r, temp_v[0]); + yv[1].v = _mm256_fmaddsub_pd(yv[1].v, beta_r, temp_v[1]); + yv[2].v = _mm256_fmaddsub_pd(yv[2].v, beta_r, temp_v[2]); + + // Here we 'rhov' has 'alpha' * 'A' * 'x' that is added with 'y' + rhov[0].v = _mm256_add_pd(yv[0].v, rhov[0].v); + rhov[1].v = _mm256_add_pd(yv[1].v, rhov[1].v); + rhov[2].v = _mm256_add_pd(yv[2].v, rhov[2].v); } + if (incy == 1) + { + _mm256_storeu_pd((double *)y, rhov[0].v); + _mm256_storeu_pd((double *)(y + 2), rhov[1].v); + _mm256_storeu_pd((double *)(y + 4), rhov[2].v); + } + else + { + (*(y + 0 * incy)).real = rhov[0].d[0]; + (*(y + 0 * incy)).imag = rhov[0].d[1]; + (*(y + 1 * incy)).real = rhov[0].d[2]; + (*(y + 1 * incy)).imag = rhov[0].d[3]; + + (*(y + 2 * incy)).real = rhov[1].d[0]; + (*(y + 2 * incy)).imag = rhov[1].d[1]; + (*(y + 3 * incy)).real = rhov[1].d[2]; + (*(y + 3 * incy)).imag = rhov[1].d[3]; + + (*(y + 4 * incy)).real = rhov[2].d[0]; + (*(y + 4 * incy)).imag = rhov[2].d[1]; + (*(y + 5 * incy)).real = rhov[2].d[2]; + (*(y + 5 * incy)).imag = rhov[2].d[3]; + } } - /** * Performs dotxf operation on scomplex. * x and y are vectors and a is the matrix. From 9a2a4151ac415f65d4721d2378f343cbcf53b6bc Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Tue, 10 Oct 2023 11:25:36 +0530 Subject: [PATCH 159/226] Added improved ZTRSM AVX2 kernels - Added 2x6 ZGEMM row-preferred kernel. 
- Kernel supports prefetch_a, prefetch_b, prefetch_a_next and prefetch_b_next. - Multiple Ways to prefetch c are supported. - prefetch_a and prefetch_c are enabled by default. - K loop is divided into multiple subloops for better c prefetch. - Added 2x6 ZTRSM row-preferred lower and upper kernels using AVX2 ISA. - These kernels are used for ZTRSM only, zgemm still uses 3x4 kernel. - Kernels support row/col/gen storage. - Updated the zen3 and zen4 config to enable use of these kernels for TRSM in zen3 and zen4 path. - Updated CMakeLists.txt with ZGEMM kernels for windows build. AMD-Internal: [CPUPL-3781] Change-Id: I236205f63a7f6b60bf1a5127a677d27425511e73 --- config/zen3/bli_cntx_init_zen3.c | 13 +- config/zen4/bli_cntx_init_zen4.c | 13 +- kernels/zen/3/CMakeLists.txt | 3 + kernels/zen/3/bli_zgemm_zen_2x6.c | 652 ++++++++++++++++++++++++++++ kernels/zen/3/bli_zgemmtrsm_l_2x6.c | 559 ++++++++++++++++++++++++ kernels/zen/3/bli_zgemmtrsm_u_2x6.c | 561 ++++++++++++++++++++++++ kernels/zen/bli_kernels_zen.h | 5 + 7 files changed, 1797 insertions(+), 9 deletions(-) create mode 100644 kernels/zen/3/bli_zgemm_zen_2x6.c create mode 100644 kernels/zen/3/bli_zgemmtrsm_l_2x6.c create mode 100644 kernels/zen/3/bli_zgemmtrsm_u_2x6.c diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 6059ba3bc7..cc508c5cca 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -49,19 +49,22 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( - 8, + 11, // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_2x6, TRUE, // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_zen_asm_2x6, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_zen_asm_2x6, TRUE, cntx ); @@ -206,9 +209,11 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // Using different cache block sizes for TRSM instead of common level-3 block sizes. // Tuning is done for double-precision only. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 2 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 24 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 512 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 1536 ); // Update the context with the current architecture's register and cache // blocksizes for level-3 TRSM problems. diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index 5ac6f7b26b..c7f25fa5c3 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -79,7 +79,7 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( - 10, + 13, // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_skx_asm_32x12_l2, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_32x6, FALSE, @@ -90,13 +90,16 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // Different GEMM kernels are used for TRSM for zen4 architecture BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE, + BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_2x6, TRUE, // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen4_asm_8x24, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_zen_asm_2x6, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen4_asm_8x24, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_zen_asm_2x6, TRUE, cntx ); @@ -241,11 +244,11 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // Using different cache block sizes for TRSM instead of common level-3 block sizes. // Tuning is done for double-precision only. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 12 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 60 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 2 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 24 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 512 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 2004 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 1536 ); // Update the context with the current architecture's register and cache // blocksizes for level-3 TRSM problems. 
diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt index 97a067bb64..f8035d96b7 100644 --- a/kernels/zen/3/CMakeLists.txt +++ b/kernels/zen/3/CMakeLists.txt @@ -6,6 +6,9 @@ add_library(zen_3 ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_avx2_k1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_avx2_k1.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen_2x6.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_l_2x6.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_u_2x6.c ) target_compile_options(zen_3 PRIVATE /arch:AVX2) if(BUILD_SHARED_LIBS) diff --git a/kernels/zen/3/bli_zgemm_zen_2x6.c b/kernels/zen/3/bli_zgemm_zen_2x6.c new file mode 100644 index 0000000000..1aaec9c948 --- /dev/null +++ b/kernels/zen/3/bli_zgemm_zen_2x6.c @@ -0,0 +1,652 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_x86_asm_macros.h" + +#define A_L1_PREFETCH_DIST 4 +#define B_L1_PREFETCH_DIST 4 +#define TAIL_NITER 4 +#define PREFETCH_A +// #define PREFETCH_B +// #define PREFETCH_A_NEXT +// #define PREFETCH_B_NEXT +#define PREFETCH_C // prefetch c in middle loop over 2 iterations of k +// #define PREFETCH_C_SLOW // prefetch c in middle loop over 4 iterations of k +// #define PREFETCH_C_SIMPLE // prefetch c before k loop + + +#ifdef PREFETCH_A + #define PREFETCH_A_L1(n, k) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*2*16 + (2*n+k)*(16))) +#else + #define PREFETCH_A_L1(n, k) +#endif + +#ifdef PREFETCH_B + #define PREFETCH_B_L1(n, k) \ + PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*6*16 + (6*n+(2*k))*(16))) +#else + #define PREFETCH_B_L1(n, k) +#endif + + +/* + * A Registers: YMM3 + * B Registers: YMM0, YMM1, YMM2 + * C Registers: YMM[4-15] + */ + +#define LOOP_ALIGN ALIGN32 + +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n, 0)\ + VBROADCASTSD(YMM(3), MEM(RAX,(4*n+0)*8)) \ + VFMADD231PD(YMM(4), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(5), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(6), YMM(2), YMM(3)) \ + VBROADCASTSD(YMM(3), MEM(RAX,(4*n+1)*8)) \ + VFMADD231PD(YMM(7), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(8), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(9), YMM(2), YMM(3)) \ + \ + PREFETCH_B_L1(n, 0)\ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+2)*8)) \ + VFMADD231PD(YMM(10), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(11), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(12), YMM(2), YMM(3)) \ 
+ VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+3)*8)) \ + VFMADD231PD(YMM(13), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(14), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(15), YMM(2), YMM(3)) \ + \ + VMOVAPD(YMM(0), MEM(RBX,(6*n+0)*16)) \ + VMOVAPD(YMM(1), MEM(RBX,(6*n+2)*16)) \ + VMOVAPD(YMM(2), MEM(RBX,(6*n+4)*16)) \ + \ + + +/**********************************************************/ +/* Kernel : bli_zgemm_zen_asm_2x6 */ +/* It performs C = C * beta + alpha * A * B */ +/* It is row preferred kernel, A and B are packed */ +/* C could be Row/Col/Gen Stored Matrix */ +/* Registers are allocated as below */ +/* Broadcast A : YMM(3) */ +/* load B : YMM(0, 1, 2) */ +/* Accumulation of B(real,imag)*Areal : */ +/* YMM(4-6,10-12) */ +/* Accumulation of B(real,imag)*Aimag : */ +/* YMM(7-9,13-15) */ +/* Computation of A(real,imag)*B(real,imag): */ +/* YMM(4-6,10-12) */ +/**********************************************************/ +void bli_zgemm_zen_asm_2x6( + dim_t k_, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + const int64_t k = k_; + /*rowstride * size of one dcomplex element*/ + const int64_t rs_c = rs_c_*16; + /*colstride * size of one dcomplex element*/ + const int64_t cs_c = cs_c_*16; + + + char beta_mul_type = BLIS_MUL_DEFAULT; + if(beta->imag == 0.0 && beta->real == 0.0 ) + { + beta_mul_type = BLIS_MUL_ZERO; + } + + BEGIN_ASM() + + VXORPD(YMM( 4), YMM( 4), YMM( 4)) + VXORPD(YMM( 5), YMM( 5), YMM( 5)) + VMOVAPD(YMM(6) , YMM(4)) + VMOVAPD(YMM(7) , YMM(4)) + VMOVAPD(YMM(8) , YMM(4)) + VMOVAPD(YMM(9) , YMM(4)) + VMOVAPD(YMM(10), YMM(4)) + VMOVAPD(YMM(11), YMM(4)) + VMOVAPD(YMM(12), YMM(4)) + VMOVAPD(YMM(13), YMM(4)) + VMOVAPD(YMM(14), YMM(4)) + VMOVAPD(YMM(15), YMM(4)) + + MOV(RSI, VAR(k)) //loop index + MOV(RAX, VAR(a)) //load address of a + MOV(RBX, VAR(b)) //load address of b + 
MOV(RCX, VAR(c)) //load address of c + + #ifdef PREFETCH_C + LEA(R9, MEM(RCX, 63)) // c for prefetch, first cache line + LEA(R8, MEM(RCX, 95)) // c for prefetch, second cache line + #endif + + + VMOVAPD(YMM(0), MEM(RBX, 0*16)) //pre-load b + VMOVAPD(YMM(1), MEM(RBX, 2*16)) //pre-load b + VMOVAPD(YMM(2), MEM(RBX, 4*16)) //pre-load b + LEA(RBX, MEM(RBX,6*16)) //adjust b for pre-load + + MOV(R12, VAR(rs_c)) + MOV(R10, VAR(cs_c)) + + #if defined PREFETCH_A_NEXT || defined PREFETCH_B_NEXT + MOV(RDI, RSI) + IMUL(RDI, IMM(16*2)) // rdi = k * 16*2 + #endif + + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(RAX, RDI, 1)) // r14(a_next) = A + (k*16*2) + #endif + + #ifdef PREFETCH_B_NEXT + IMUL(RDI, IMM(3)) // rdi = k * 16*6 + LEA(R15, MEM(RBX, RDI, 1)) // r15(b_next) = B + (k*16*6) + #endif + + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + /************************************************************/ + /* Operation: */ + /* SUBITER = (Ar, Ai)*(Br, Bi) = Ar*(Br, Bi) , Ai*(Br, Bi) */ + /* Prefetch_C_SIMPLE: */ + /* LOOP1: k/4 - TAIL_NITER */ + /* LOOP2: 0 */ + /* LOOP3: 0 */ + /* LOOP4: TAIL_NITER */ + /* PREFETCH_C_SLOW: */ + /* LOOP1: k/4 - TAIL_NITER - 4 */ + /* LOOP2: 2 */ + /* LOOP3: 2 */ + /* LOOP4: TAIL_NITER */ + /* PREFETCH_C: */ + /* LOOP1: k/4 - TAIL_NITER - 2 */ + /* LOOP2: 2 */ + /* LOOP3: 0 */ + /* LOOP4: TAIL_NITER */ + /************************************************************/ + #ifdef PREFETCH_C + #ifdef PREFETCH_C_SIMPLE + /* prefetch c over 1 iteration of k*/ + SUB(RDI, IMM(0+TAIL_NITER)) + #elif defined PREFETCH_C_SLOW + /* prefetch c over 4 iterations of k*/ + SUB(RDI, IMM(4+TAIL_NITER)) + #else + /* prefetch c over 2 iterations of k*/ + SUB(RDI, IMM(2+TAIL_NITER)) + #endif + #endif + JLE(K_PREFETCH_C) + + LOOP_ALIGN + LABEL(LOOP1) + #ifdef PREFETCH_A_NEXT + PREFETCH(1, MEM(R14)) + #endif + SUBITER(0) + #ifdef PREFETCH_B_NEXT + PREFETCH(1, MEM(R15)) + #endif + SUBITER(1) + #ifdef PREFETCH_A_NEXT + PREFETCH(1, MEM(R14, 64)) + #endif + SUB(RDI, 
IMM(1)) + SUBITER(2) + #ifdef PREFETCH_B_NEXT + PREFETCH(1, MEM(R15, 64)) + #endif + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(R14,128)) + #endif + #ifdef PREFETCH_B_NEXT + LEA(R15, MEM(R15,64)) + #endif + + JNZ(LOOP1) + + LABEL(K_PREFETCH_C) + +#ifdef PREFETCH_C +#if defined PREFETCH_C_SIMPLE + /*****************************/ + /* prefetch 2x6 of C at once */ + /*****************************/ + PREFETCH(0, MEM(R9)) + PREFETCH(0, MEM(R9, 31)) + PREFETCH(0, MEM(R9,R12, 1)) + PREFETCH(0, MEM(R9,R12, 1, 31)) + PREFETCH(0, MEM(R9,R12, 2)) + PREFETCH(0, MEM(R9,R12, 2, 31)) +#else + ADD(RDI, IMM(2)) + JLE(K_TAIL_NITER) + + LOOP_ALIGN + LABEL(LOOP2) + #ifdef PREFETCH_C + PREFETCH(0, MEM(R9)) + #endif + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + #ifndef PREFETCH_C_SLOW + /************************************************/ + /* if prefetch is being done over 2 iterations, */ + /* prefetch 2 cache lines per iteration */ + /* prefetch one row of C per iteration of Loop2 */ + /************************************************/ + PREFETCH(0, MEM(R9,31)) + #endif + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + #ifdef PREFETCH_C + LEA(R9, MEM(R9,R12,1)) + #endif + JNZ(LOOP2) + + LABEL(K_TAIL_NITER) + + #ifdef PREFETCH_C_SLOW + ADD(RDI, IMM(2)) + JLE(K_TAIL_NITER_2) + + LOOP_ALIGN + LABEL(LOOP3) + #ifdef PREFETCH_C + PREFETCH(0, MEM(R8)) + #endif + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + #ifdef PREFETCH_C + LEA(R8, MEM(R8,R12,1)) + #endif + JNZ(LOOP3) + LABEL(K_TAIL_NITER_2) + + #endif //PREFETCH_C_SLOW + +#endif //PREFETCH_C_SIMPLE + ADD(RDI, IMM(0+TAIL_NITER)) + JLE(TAIL) + + LOOP_ALIGN + LABEL(LOOP4) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + + JNZ(LOOP4) + +#endif //PREFETCH_C + + LABEL(TAIL) + + 
TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + SUB(RSI, IMM(1)) + SUBITER(0) + LEA(RAX, MEM(RAX,2*16)) + LEA(RBX, MEM(RBX,6*16)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + + VPERMILPD(YMM( 7), YMM( 7), IMM(0x5)) + VPERMILPD(YMM( 8), YMM( 8), IMM(0x5)) + VPERMILPD(YMM( 9), YMM( 9), IMM(0x5)) + VPERMILPD(YMM(13), YMM(13), IMM(0x5)) + VPERMILPD(YMM(14), YMM(14), IMM(0x5)) + VPERMILPD(YMM(15), YMM(15), IMM(0x5)) + + VADDSUBPD(YMM(4), YMM(4), YMM(7)) + VADDSUBPD(YMM(5), YMM(5), YMM(8)) + VADDSUBPD(YMM(6), YMM(6), YMM(9)) + + VADDSUBPD(YMM(10), YMM(10), YMM(13)) + VADDSUBPD(YMM(11), YMM(11), YMM(14)) + VADDSUBPD(YMM(12), YMM(12), YMM(15)) + + /******************/ + /* scale by alpha */ + /******************/ + MOV(RAX, VAR(alpha)) + VBROADCASTSD(YMM(0), MEM(RAX)) + VBROADCASTSD(YMM(1), MEM(RAX, 8)) + + VPERMILPD(YMM(3), YMM(4), IMM(0X5)) + VMULPD(YMM(4), YMM(4), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(4), YMM(4), YMM(3)) + + VPERMILPD(YMM(3), YMM(5), IMM(0X5)) + VMULPD(YMM(5), YMM(5), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(5), YMM(5), YMM(3)) + + VPERMILPD(YMM(3), YMM(6), IMM(0X5)) + VMULPD(YMM(6), YMM(6), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(6), YMM(6), YMM(3)) + + // ROW 2 + VPERMILPD(YMM(3), YMM(10), IMM(0X5)) + VMULPD(YMM(10), YMM(10), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(10), YMM(10), YMM(3)) + + VPERMILPD(YMM(3), YMM(11), IMM(0X5)) + VMULPD(YMM(11), YMM(11), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(11), YMM(11), YMM(3)) + + VPERMILPD(YMM(3), YMM(12), IMM(0X5)) + VMULPD(YMM(12), YMM(12), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(12), YMM(12), YMM(3)) + + + MOV(RBX, VAR(beta)) + VBROADCASTSD(YMM(1), MEM(RBX)) + VBROADCASTSD(YMM(2), MEM(RBX, 8)) + + + MOV(AL, VAR(beta_mul_type)) + CMP(AL, IMM(0)) + JE(.ZBETAZERO) + + CMP(R10, IMM(16)) //CS == 1 IMPLIES ROW STORED + JNZ(.ZCOLSTORED) + + LABEL(.ZROWSTORED) + LEA(RDX, MEM(RCX, R12, 1)) + + // ROW 
1 + VMOVUPD(YMM(0), MEM(RCX)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(4)) + VMOVUPD(MEM(RCX), YMM(0)) + + VMOVUPD(YMM(0), MEM(RCX, R10, 2)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(5)) + VMOVUPD(MEM(RCX, R10, 2), YMM(0)) + + VMOVUPD(YMM(0), MEM(RCX, R10, 4)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(6)) + VMOVUPD(MEM(RCX, R10, 4), YMM(0)) + + //ROW 2 + VMOVUPD(YMM(0), MEM(RDX)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(10)) + VMOVUPD(MEM(RDX), YMM(0)) + + VMOVUPD(YMM(0), MEM(RDX, R10, 2)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(11)) + VMOVUPD(MEM(RDX, R10, 2), YMM(0)) + + VMOVUPD(YMM(0), MEM(RDX, R10, 4)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(12)) + VMOVUPD(MEM(RDX, R10, 4), YMM(0)) + + JMP(.ZDONE) + + LABEL(.ZCOLSTORED) + LEA(RDX, MEM(RCX, R12, 1)) + LEA(RDI, MEM(, R10, 2)) + + VMOVUPD(XMM(0), MEM(RCX )) + VMOVUPD(XMM(3), MEM(RCX, R10, 1)) + VINSERTF128(YMM(0), YMM(0), XMM(3), IMM(0x1)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(4)) + VEXTRACTF128(XMM(3), YMM(0), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(0)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VMOVUPD(XMM(0), MEM(RCX )) + 
VMOVUPD(XMM(3), MEM(RCX, R10, 1)) + VINSERTF128(YMM(0), YMM(0), XMM(3), IMM(0x1)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(5)) + VEXTRACTF128(XMM(3), YMM(0), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(0)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VMOVUPD(XMM(0), MEM(RCX )) + VMOVUPD(XMM(3), MEM(RCX, R10, 1)) + VINSERTF128(YMM(0), YMM(0), XMM(3), IMM(0x1)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(6)) + VEXTRACTF128(XMM(3), YMM(0), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(0)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + + + VMOVUPD(XMM(0), MEM(RDX )) + VMOVUPD(XMM(3), MEM(RDX, R10, 1)) + VINSERTF128(YMM(0), YMM(0), XMM(3), IMM(0x1)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(10)) + VEXTRACTF128(XMM(3), YMM(0), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(0)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VMOVUPD(XMM(0), MEM(RDX )) + VMOVUPD(XMM(3), MEM(RDX, R10, 1)) + VINSERTF128(YMM(0), YMM(0), XMM(3), IMM(0x1)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(11)) + VEXTRACTF128(XMM(3), YMM(0), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(0)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VMOVUPD(XMM(0), MEM(RDX )) + VMOVUPD(XMM(3), MEM(RDX, R10, 1)) + VINSERTF128(YMM(0), YMM(0), XMM(3), IMM(0x1)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VADDPD(YMM(0), YMM(0), YMM(12)) + VEXTRACTF128(XMM(3), YMM(0), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(0)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + 
ADD(RDX, RDI) + + + JMP(.ZDONE) + + LABEL(.ZBETAZERO) + CMP(R12, IMM(16)) + JNZ(.ZROWSTORBZ) + + LABEL(.ZCOLSTORBZ) + LEA(RDX, MEM(RCX, R12, 1)) + LEA(RDI, MEM(, R10, 2)) + + VEXTRACTF128(XMM(3), YMM(4), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(4)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VEXTRACTF128(XMM(3), YMM(5), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(5)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VEXTRACTF128(XMM(3), YMM(6), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(6)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + + + VEXTRACTF128(XMM(3), YMM(10), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(10)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VEXTRACTF128(XMM(3), YMM(11), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(11)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VEXTRACTF128(XMM(3), YMM(12), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(12)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + JMP(.ZDONE) + + + LABEL(.ZROWSTORBZ) + LEA(RDX, MEM(RCX, R12, 1)) + + VMOVUPD(MEM(RCX), YMM(4)) + VMOVUPD(MEM(RCX, R10, 2), YMM(5)) + VMOVUPD(MEM(RCX, R10, 4), YMM(6)) + + VMOVUPD(MEM(RDX), YMM(10)) + VMOVUPD(MEM(RDX, R10, 2), YMM(11)) + VMOVUPD(MEM(RDX, R10, 4), YMM(12)) + + + + LABEL(.ZDONE) + + + VZEROUPPER() + + END_ASM + ( + : // output operands (none) + : // input operands + [beta_mul_type] "m" (beta_mul_type), + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13", "xmm14", "xmm15", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen/3/bli_zgemmtrsm_l_2x6.c 
b/kernels/zen/3/bli_zgemmtrsm_l_2x6.c new file mode 100644 index 0000000000..4d11a6648b --- /dev/null +++ b/kernels/zen/3/bli_zgemmtrsm_l_2x6.c @@ -0,0 +1,559 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include "bli_x86_asm_macros.h" + +#define A_L1_PREFETCH_DIST 4 +#define B_L1_PREFETCH_DIST 4 +#define TAIL_NITER 6 + +#define PREFETCH_A_L1(n, k) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*2*16 + (2*n+k)*(16))) +#define PREFETCH_B_L1(n, k) \ + PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*6*16 + (2*n+k)*(48))) + +/* + * A Registers: YMM3 + * B Registers: YMM0, YMM1, YMM2 + * C Registers: YMM[4-15] + */ + +#define LOOP_ALIGN ALIGN32 + +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n, 0) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 0)*8)) \ + VFMADD231PD(YMM( 4), YMM(0), YMM(3)) \ + VFMADD231PD(YMM( 5), YMM(1), YMM(3)) \ + VFMADD231PD(YMM( 6), YMM(2), YMM(3)) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 1)*8)) \ + VFMADD231PD(YMM( 7), YMM(0), YMM(3)) \ + VFMADD231PD(YMM( 8), YMM(1), YMM(3)) \ + VFMADD231PD(YMM( 9), YMM(2), YMM(3)) \ + \ + PREFETCH_B_L1(n, 0) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 2)*8)) \ + VFMADD231PD(YMM(10), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(11), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(12), YMM(2), YMM(3)) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 3)*8)) \ + VFMADD231PD(YMM(13), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(14), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(15), YMM(2), YMM(3)) \ + \ + VMOVAPD(YMM(0), MEM(RBX,(6*n+0)*16)) \ + VMOVAPD(YMM(1), MEM(RBX,(6*n+2)*16)) \ + VMOVAPD(YMM(2), MEM(RBX,(6*n+4)*16)) \ + +// used for division of complex number if TRSM_PREINV is disabled +static double negative[4] __attribute__((aligned(64))) + = {-1, -1, -1, -1}; + +/**********************************************************/ +/* Kernel : bli_zgemmtrsm_l_zen_asm_2x6 */ +/* It performs A * X = alpha * B */ +/* It is row preferred kernel, A and B are packed */ +/* C could be Row/Col/Gen Stored Matrix */ +/* Registers are allocated as below */ +/* Broadcast A : YMM(3) */ +/* load B : YMM(0, 1, 2) */ +/* Accumulation of B(real,imag)*Areal : */ +/* YMM(4-6,10-12) */ +/* Accumulation of B(real,imag)*Aimag : */ +/* YMM(7-9,13-15) */ +/* Computation of 
A(real,imag)*B(real,imag): */ +/* YMM(4-6,10-12) */ +/**********************************************************/ +void bli_zgemmtrsm_l_zen_asm_2x6 + ( + dim_t k_, + dcomplex* restrict alpha, + dcomplex* restrict a10, + dcomplex* restrict a11, + dcomplex* restrict b01, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + const int64_t k = k_; + /*rowstride * size of one dcomplex element*/ + const int64_t rs_c = rs_c_*16; + /*colstride * size of one dcomplex element*/ + const int64_t cs_c = cs_c_*16; + const double* negPtr = &negative[0]; + + + BEGIN_ASM() + + VXORPD(YMM( 4), YMM( 4), YMM( 4)) + VXORPD(YMM( 5), YMM( 5), YMM( 5)) + VMOVAPD(YMM(6) , YMM(4)) + VMOVAPD(YMM(7) , YMM(4)) + VMOVAPD(YMM(8) , YMM(4)) + VMOVAPD(YMM(9) , YMM(4)) + VXORPD(YMM(10), YMM(10), YMM(10)) + VXORPD(YMM(11), YMM(11), YMM(11)) + VMOVAPD(YMM(12), YMM(4)) + VMOVAPD(YMM(13), YMM(4)) + VMOVAPD(YMM(14), YMM(4)) + VMOVAPD(YMM(15), YMM(4)) + + MOV(RSI, VAR(k)) //loop index + MOV(RAX, VAR(a10)) //load address of a + MOV(RBX, VAR(b01)) //load address of b + MOV(RCX, VAR(b11)) //load address of c + MOV(R9, VAR(c11)) // load C for prefetch + MOV(R11, VAR(negPtr)) + + VMOVAPD(YMM(0), MEM(RBX, 0*16)) //pre-load b + VMOVAPD(YMM(1), MEM(RBX, 2*16)) //pre-load b + VMOVAPD(YMM(2), MEM(RBX, 4*16)) //pre-load b + LEA(RBX, MEM(RBX,6*16)) //adjust b for pre-load + + MOV(R12, VAR(rs_c)) + MOV(R10, VAR(cs_c)) + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + /************************************************************/ + /* Operation: */ + /* SUBITER = (Ar, Ai)*(Br, Bi) = Ar*(Br, Bi) , Ai*(Br, Bi) */ + /* Loop counts: */ + /* LOOP1: k/4 - TAIL_NITER - 2 */ + /* LOOP2: 2 <--prefetch_c */ + /* LOOP3: TAIL_NITER */ + /************************************************************/ + SUB(RDI, IMM(2+TAIL_NITER)) + JLE(K_PREFETCH_C) + + LOOP_ALIGN + LABEL(LOOP1) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + 
SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + + + JNZ(LOOP1) + + LABEL(K_PREFETCH_C) + + ADD(RDI, IMM(2)) + JLE(K_TAIL_NITER) + + LOOP_ALIGN + LABEL(LOOP2) + + PREFETCH(0, MEM(R9)) + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + PREFETCH(0, MEM(R9,64)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + LEA(R9, MEM(R9,R12,1)) + + JNZ(LOOP2) + + LABEL(K_TAIL_NITER) + + ADD(RDI, IMM(0+TAIL_NITER)) + JLE(TAIL) + + LOOP_ALIGN + LABEL(LOOP3) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + + JNZ(LOOP3) + + LABEL(TAIL) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + SUB(RSI, IMM(1)) + SUBITER(0) + LEA(RAX, MEM(RAX,2*16)) + LEA(RBX, MEM(RBX,6*16)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + + /**************************************************/ + /* Permute imag component register. Shuffle even */ + /* and odd components */ + /* SRC: YMM7 =(Ai0*Br0, Ai0*Bi0, Ai0*Br1, Ai0*Bi1)*/ + /* DST: YMM7 =(Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1)*/ + /**************************************************/ + VPERMILPD(YMM( 7), YMM( 7), IMM(0x5)) + VPERMILPD(YMM( 8), YMM( 8), IMM(0x5)) + VPERMILPD(YMM( 9), YMM( 9), IMM(0x5)) + VPERMILPD(YMM(13), YMM(13), IMM(0x5)) + VPERMILPD(YMM(14), YMM(14), IMM(0x5)) + VPERMILPD(YMM(15), YMM(15), IMM(0x5)) + + /***************************************************/ + /* SRC: YMM4 = (Ar0*Br0, Ar0*Bi0, Ar0*Br1, Ar0*Bi1)*/ + /* SRC: YMM7 = (Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1)*/ + /* DST: YMM4 =(Ar0*Br0-Ai0*Bi0, Ai0*Br0+Ar0*Bi0, */ + /* Ar0*Br1-Ai0*Bi1, Ai0*Br1+Ar0*Bi1) */ + /***************************************************/ + VADDSUBPD(YMM(4), YMM(4), YMM(7)) + VADDSUBPD(YMM(5), YMM(5), YMM(8)) + VADDSUBPD(YMM(6), YMM(6), YMM(9)) + VADDSUBPD(YMM(10), YMM(10), YMM(13)) + VADDSUBPD(YMM(11), YMM(11), YMM(14)) + VADDSUBPD(YMM(12), YMM(12), YMM(15)) + + /*Load alpha*/ + MOV(R9, VAR(alpha)) + 
VBROADCASTSD(YMM(7), MEM(R9)) + VBROADCASTSD(YMM(8), MEM(R9, 8)) + MOV(RDX, RCX) + MOV(RDI, IMM(6*16)) + + VMOVUPD(YMM(0), MEM(RDX, 0*16)) + VMOVUPD(YMM(1), MEM(RDX, 2*16)) + VMOVUPD(YMM(2), MEM(RDX, 4*16)) + ADD(RDX, RDI) + + /************************************************************************/ + /* gemm_output = C * alpha - gemm_output */ + /* */ + /* Let C * alpha = (a + ib) * (c + id) */ + /* (a + ib) * (c + id) = (ac - bd) + i(ad + bc) */ + /* */ + /*Steps: */ + /* YMM(0) = a0, b0, a1, b1 */ + /* YMM(3) = b0, a0, b1, a1 */ + /* YMM(0) = a0*c0, b0*c0, a1*c1, b1*c1 */ + /* YMM(3) = b0*d0, a0*d0, b1*d1, a1*d1 */ + /* YMM(0) = (a0c0 - b0d0), (b0c0 + a0d0), (a1c1 - b1d1), (b1c1 + a1d1) */ + /************************************************************************/ + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(7)) // a*c, b*c + VMULPD(YMM(3), YMM(3), YMM(8)) // b*d, a*d + VADDSUBPD(YMM(0), YMM(0), YMM(3)) // ac - bd, bc + ad + VSUBPD(YMM(4), YMM(0), YMM(4)) // c * alpha - gemm_output + + VMOVUPD(YMM(0), MEM(RDX, 0*16)) + VPERMILPD(YMM(3), YMM(1), IMM(0x5)) + VMULPD(YMM(1), YMM(1), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(1), YMM(1), YMM(3)) + VSUBPD(YMM(5), YMM(1), YMM(5)) + + VMOVUPD(YMM(1), MEM(RDX, 2*16)) + VPERMILPD(YMM(3), YMM(2), IMM(0x5)) + VMULPD(YMM(2), YMM(2), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(2), YMM(2), YMM(3)) + VSUBPD(YMM(6), YMM(2), YMM(6)) + + VMOVUPD(YMM(2), MEM(RDX, 4*16)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VSUBPD(YMM(10), YMM(0), YMM(10)) + + VPERMILPD(YMM(3), YMM(1), IMM(0x5)) + VMULPD(YMM(1), YMM(1), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(1), YMM(1), YMM(3)) + VSUBPD(YMM(11), YMM(1), YMM(11)) + + VPERMILPD(YMM(3), YMM(2), IMM(0x5)) + VMULPD(YMM(2), YMM(2), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(2), YMM(2), YMM(3)) + VSUBPD(YMM(12), YMM(2), YMM(12)) + + + 
// REGION - TRSM + MOV(RAX, VAR(a11)) + //iteration 0 ------------------------------------- + VBROADCASTSD(YMM(0), MEM(RAX, (0+0*2)*16+0)) + VBROADCASTSD(YMM(1), MEM(RAX, (0+0*2)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + /****************************************************/ + /* C = C * A11 */ + /* (a + ib) * (c + id) = (ac - bd) + i(ad + bc) */ + /****************************************************/ + VPERMILPD(YMM(3), YMM(4), IMM(0x5)) + VMULPD(YMM(4), YMM(4), YMM(0)) //a*c, b*c + VMULPD(YMM(3), YMM(3), YMM(1)) //b*d, a*d + VADDSUBPD(YMM(4), YMM(4), YMM(3)) // (ac - bd), (bc + ad) + + VPERMILPD(YMM(3), YMM(5), IMM(0x5)) + VMULPD(YMM(5), YMM(5), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(5), YMM(5), YMM(3)) + + VPERMILPD(YMM(3), YMM(6), IMM(0x5)) + VMULPD(YMM(6), YMM(6), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(6), YMM(6), YMM(3)) + #else + /************************************************************************/ + /* C = C / A11 */ + /* */ + /* Let C / A11 = (a + ib) / (c + id) = */ + /* ((ac + bd) / (c^2 + d^2)) + i ((bc - ad) / (c^2+d^2)) */ + /* */ + /*Steps: */ + /* YMM(4) = a0, b0, a1, b1 */ + /* YMM(3) = b0, a0, b1, a1 */ + /* YMM(4) = a0*c0, b0*c0, a1*c1, b1*c1 */ + /* YMM(3) = b0*d0, a0*d0, b1*d1, a1*d1 */ + /* YMM(3) = -b0*d0, -a0*d0, -b1*d1, -a1*d1 */ + /* YMM(4) = (a0c0 + b0d0), (b0c0 - a0d0), (a1c1 + b1d1), (b1c1 - a1d1) */ + /* YMM(4) = (a0c0 + b0d0) / (c^2 + d^2), (b0c0 - a0d0) / (c^2 + d^2), */ + /* (a1c1 + b1d1) / (c^2 + d^2), (b1c1 - a1d1) / (c^2 + d^2) */ + /************************************************************************/ + VMOVUPD(YMM(2), MEM(R11)) // -1 + VMULPD(YMM(9), YMM(0), YMM(0)) + VFMADD231PD(YMM(9), YMM(1), YMM(1)) + + VPERMILPD(YMM(3), YMM(4), IMM(0x5)) + VMULPD(YMM(4), YMM(4), YMM(0)) // a*c, b*c + VMULPD(YMM(3), YMM(3), YMM(1)) // b*d, a*d + VMULPD(YMM(3), YMM(3), YMM(2)) // -bd, -ad + VADDSUBPD(YMM(4), YMM(4), YMM(3)) // ac + bd, bc - ad + VDIVPD(YMM(4), YMM(4), YMM(9)) // (ac + bd) / 
(c^2 + d^2), (bc - ad) / (c^2 + d^2) + + VPERMILPD(YMM(3), YMM(5), IMM(0x5)) + VMULPD(YMM(5), YMM(5), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(5), YMM(5), YMM(3)) + VDIVPD(YMM(5), YMM(5), YMM(9)) + + VPERMILPD(YMM(3), YMM(6), IMM(0x5)) + VMULPD(YMM(6), YMM(6), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(6), YMM(6), YMM(3)) + VDIVPD(YMM(6), YMM(6), YMM(9)) + #endif + VMOVUPD(MEM(RCX, 0*16), YMM(4)) + VMOVUPD(MEM(RCX, 2*16), YMM(5)) + VMOVUPD(MEM(RCX, 4*16), YMM(6)) + ADD(RCX, RDI) + + //iteration 1 ------------------------------------- + + VBROADCASTSD(YMM(0), MEM(RAX, (1+0*2)*16+0)) + VBROADCASTSD(YMM(1), MEM(RAX, (1+0*2)*16+8)) + + VPERMILPD(YMM(3), YMM(4), IMM(0x5)) + VMULPD(YMM(2), YMM(4), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(7), YMM(2), YMM(3)) + + VPERMILPD(YMM(3), YMM(5), IMM(0x5)) + VMULPD(YMM(2), YMM(5), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(8), YMM(2), YMM(3)) + + VPERMILPD(YMM(3), YMM(6), IMM(0x5)) + VMULPD(YMM(2), YMM(6), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(9), YMM(2), YMM(3)) + + VSUBPD(YMM(10), YMM(10), YMM(7)) + VSUBPD(YMM(11), YMM(11), YMM(8)) + VSUBPD(YMM(12), YMM(12), YMM(9)) + + VBROADCASTSD(YMM(0), MEM(RAX, (1+1*2)*16+0)) + VBROADCASTSD(YMM(1), MEM(RAX, (1+1*2)*16+8)) + + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + VPERMILPD(YMM(3), YMM(10), IMM(0x5)) + VMULPD(YMM(10), YMM(10), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(10), YMM(10), YMM(3)) + + VPERMILPD(YMM(3), YMM(11), IMM(0x5)) + VMULPD(YMM(11), YMM(11), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(11), YMM(11), YMM(3)) + + VPERMILPD(YMM(3), YMM(12), IMM(0x5)) + VMULPD(YMM(12), YMM(12), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(12), YMM(12), YMM(3)) + #else + VMOVUPD(YMM(2), MEM(R11)) + VMULPD(YMM(9), YMM(0), YMM(0)) + VFMADD231PD(YMM(9), YMM(1), YMM(1)) + + VPERMILPD(YMM(3), YMM(10), IMM(0x5)) + 
VMULPD(YMM(10), YMM(10), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(10), YMM(10), YMM(3)) + VDIVPD(YMM(10), YMM(10), YMM(9)) + + VPERMILPD(YMM(3), YMM(11), IMM(0x5)) + VMULPD(YMM(11), YMM(11), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(11), YMM(11), YMM(3)) + VDIVPD(YMM(11), YMM(11), YMM(9)) + + VPERMILPD(YMM(3), YMM(12), IMM(0x5)) + VMULPD(YMM(12), YMM(12), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(12), YMM(12), YMM(3)) + VDIVPD(YMM(12), YMM(12), YMM(9)) + #endif + VMOVUPD(MEM(RCX, 0*16), YMM(10)) + VMOVUPD(MEM(RCX, 2*16), YMM(11)) + VMOVUPD(MEM(RCX, 4*16), YMM(12)) + +// ENDREGION - TRSM + + MOV(RAX, R12) + MOV(RBX, R10) + MOV(RCX, VAR(c11)) + + CMP(RBX, IMM(16)) + JE(ROWUPDATE) + + LABEL(COLUPDATE) + LEA(RDX, MEM(RCX, R12, 1)) + LEA(RDI, MEM(, R10, 2)) + + VEXTRACTF128(XMM(3), YMM(4), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(4)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VEXTRACTF128(XMM(3), YMM(5), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(5)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VEXTRACTF128(XMM(3), YMM(6), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(6)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + + + VEXTRACTF128(XMM(3), YMM(10), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(10)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VEXTRACTF128(XMM(3), YMM(11), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(11)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VEXTRACTF128(XMM(3), YMM(12), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(12)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + JMP(END) + + + LABEL(ROWUPDATE) + LEA(RDX, MEM(RCX, R12, 1)) + + VMOVUPD(MEM(RCX ), YMM(4)) + VMOVUPD(MEM(RCX, R10, 2), YMM(5)) + VMOVUPD(MEM(RCX, R10, 4), YMM(6)) + + VMOVUPD(MEM(RDX ), YMM(10)) + VMOVUPD(MEM(RDX, R10, 2), YMM(11)) + VMOVUPD(MEM(RDX, R10, 4), YMM(12)) + JMP(END) + + LABEL(END) + + VZEROUPPER() + + + END_ASM + ( + : // output operands (none) + : // 
input operands + [a10] "m" (a10), + [k] "m" (k), + [b01] "m" (b01), + [a11] "m" (a11), + [b11] "m" (b11), + [c11] "m" (c11), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [alpha] "m" (alpha), + [negPtr] "m" (negPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13", "xmm14", "xmm15", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen/3/bli_zgemmtrsm_u_2x6.c b/kernels/zen/3/bli_zgemmtrsm_u_2x6.c new file mode 100644 index 0000000000..07bc47f016 --- /dev/null +++ b/kernels/zen/3/bli_zgemmtrsm_u_2x6.c @@ -0,0 +1,561 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_x86_asm_macros.h" + +#define A_L1_PREFETCH_DIST 4 +#define B_L1_PREFETCH_DIST 4 +#define TAIL_NITER 6 + +#define PREFETCH_A_L1(n, k) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*2*16 + (2*n+k)*(16))) +#define PREFETCH_B_L1(n, k) \ + PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*6*16 + (2*n+k)*(48))) + + +/* + * A Registers: YMM3 + * B Registers: YMM0, YMM1, YMM2 + * C Registers: YMM[4-15] + */ + +#define LOOP_ALIGN ALIGN32 + +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n, 0) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 0)*8)) \ + VFMADD231PD(YMM( 4), YMM(0), YMM(3)) \ + VFMADD231PD(YMM( 5), YMM(1), YMM(3)) \ + VFMADD231PD(YMM( 6), YMM(2), YMM(3)) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 1)*8)) \ + VFMADD231PD(YMM( 7), YMM(0), YMM(3)) \ + VFMADD231PD(YMM( 8), YMM(1), YMM(3)) \ + VFMADD231PD(YMM( 9), YMM(2), YMM(3)) \ + \ + PREFETCH_B_L1(n, 0) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 2)*8)) \ + VFMADD231PD(YMM(10), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(11), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(12), YMM(2), YMM(3)) \ + VBROADCASTSD(YMM( 3), MEM(RAX,(4*n+ 3)*8)) \ + VFMADD231PD(YMM(13), YMM(0), YMM(3)) \ + VFMADD231PD(YMM(14), YMM(1), YMM(3)) \ + VFMADD231PD(YMM(15), YMM(2), YMM(3)) \ + \ + VMOVAPD(YMM(0), MEM(RBX,(6*n+0)*16)) \ + VMOVAPD(YMM(1), MEM(RBX,(6*n+2)*16)) \ + VMOVAPD(YMM(2), MEM(RBX,(6*n+4)*16)) \ + +// used for division of complex number if TRSM_PREINV is disabled +static double negative[4] __attribute__((aligned(64))) + = {-1, 
-1, -1, -1}; + +/**********************************************************/ +/* Kernel : bli_zgemmtrsm_u_zen_asm_2x6 */ +/* It performs A * X = alpha * B */ +/* It is row preferred kernel, A and B are packed */ +/* C could be Row/Col/Gen Stored Matrix */ +/* Registers are allocated as below */ +/* Broadcast A : YMM(3) */ +/* load B : YMM(0, 1, 2) */ +/* Accumulation of B(real,imag)*Areal : */ +/* YMM(4-6,10-12) */ +/* Accumulation of B(real,imag)*Aimag : */ +/* YMM(7-9,13-15) */ +/* Computation of A(real,imag)*B(real,imag): */ +/* YMM(4-6,10-12) */ +/**********************************************************/ +void bli_zgemmtrsm_u_zen_asm_2x6 + ( + dim_t k_, + dcomplex* restrict alpha, + dcomplex* restrict a10, + dcomplex* restrict a11, + dcomplex* restrict b01, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + const int64_t k = k_; + /*rowstride * size of one dcomplex element*/ + const int64_t rs_c = rs_c_*16; + /*colstride * size of one dcomplex element*/ + const int64_t cs_c = cs_c_*16; + const double* negPtr = &negative[0]; + + + BEGIN_ASM() + + VXORPD(YMM( 4), YMM( 4), YMM( 4)) + VXORPD(YMM( 5), YMM( 5), YMM( 5)) + VMOVAPD(YMM(6) , YMM(4)) + VMOVAPD(YMM(7) , YMM(4)) + VMOVAPD(YMM(8) , YMM(4)) + VMOVAPD(YMM(9) , YMM(4)) + VXORPD(YMM(10), YMM(10), YMM(10)) + VXORPD(YMM(11), YMM(11), YMM(11)) + VMOVAPD(YMM(12), YMM(4)) + VMOVAPD(YMM(13), YMM(4)) + VMOVAPD(YMM(14), YMM(4)) + VMOVAPD(YMM(15), YMM(4)) + + MOV(RSI, VAR(k)) //loop index + MOV(RAX, VAR(a10)) //load address of a + MOV(RBX, VAR(b01)) //load address of b + MOV(RCX, VAR(b11)) //load address of c + MOV(R9, VAR(c11)) // load C for prefetch + MOV(R11, VAR(negPtr)) + + // MOV(R9, RCX) + + VMOVAPD(YMM(0), MEM(RBX, 0*16)) //pre-load b + VMOVAPD(YMM(1), MEM(RBX, 2*16)) //pre-load b + VMOVAPD(YMM(2), MEM(RBX, 4*16)) //pre-load b + LEA(RBX, MEM(RBX,6*16)) //adjust b for pre-load + + MOV(R12, VAR(rs_c)) + MOV(R10, VAR(cs_c)) + 
+ MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + /************************************************************/ + /* Operation: */ + /* SUBITER = (Ar, Ai)*(Br, Bi) = Ar*(Br, Bi) , Ai*(Br, Bi) */ + /* Loop counts: */ + /* LOOP1: k/4 - TAIL_NITER - 2 */ + /* LOOP2: 2 <--prefetch_c */ + /* LOOP3: TAIL_NITER */ + /************************************************************/ + SUB(RDI, IMM(2+TAIL_NITER)) + JLE(K_PREFETCH_C) + + LOOP_ALIGN + LABEL(LOOP1) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + + + JNZ(LOOP1) + + LABEL(K_PREFETCH_C) + + ADD(RDI, IMM(2)) + JLE(K_TAIL_NITER) + + LOOP_ALIGN + LABEL(LOOP2) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + + JNZ(LOOP2) + + LABEL(K_TAIL_NITER) + + ADD(RDI, IMM(0+TAIL_NITER)) + JLE(TAIL) + + LOOP_ALIGN + LABEL(LOOP3) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*2*16)) + LEA(RBX, MEM(RBX,4*6*16)) + + JNZ(LOOP3) + + LABEL(TAIL) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + SUB(RSI, IMM(1)) + SUBITER(0) + LEA(RAX, MEM(RAX,2*16)) + LEA(RBX, MEM(RBX,6*16)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + + /**************************************************/ + /* Permute imag component register. 
Shuffle even */ + /* and odd components */ + /* SRC: YMM7 =(Ai0*Br0, Ai0*Bi0, Ai0*Br1, Ai0*Bi1)*/ + /* DST: YMM7 =(Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1)*/ + /**************************************************/ + VPERMILPD(YMM( 7), YMM( 7), IMM(0x5)) + VPERMILPD(YMM( 8), YMM( 8), IMM(0x5)) + VPERMILPD(YMM( 9), YMM( 9), IMM(0x5)) + VPERMILPD(YMM(13), YMM(13), IMM(0x5)) + VPERMILPD(YMM(14), YMM(14), IMM(0x5)) + VPERMILPD(YMM(15), YMM(15), IMM(0x5)) + + /***************************************************/ + /* SRC: YMM4 = (Ar0*Br0, Ar0*Bi0, Ar0*Br1, Ar0*Bi1)*/ + /* SRC: YMM7 = (Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1)*/ + /* DST: YMM4 =(Ar0*Br0-Ai0*Bi0, Ai0*Br0+Ar0*Bi0, */ + /* Ar0*Br1-Ai0*Bi1, Ai0*Br1+Ar0*Bi1) */ + /***************************************************/ + VADDSUBPD(YMM(4), YMM(4), YMM(7)) + VADDSUBPD(YMM(5), YMM(5), YMM(8)) + VADDSUBPD(YMM(6), YMM(6), YMM(9)) + VADDSUBPD(YMM(10), YMM(10), YMM(13)) + VADDSUBPD(YMM(11), YMM(11), YMM(14)) + VADDSUBPD(YMM(12), YMM(12), YMM(15)) + + /*Load alpha*/ + MOV(R9, VAR(alpha)) + VBROADCASTSD(YMM(7), MEM(R9)) + VBROADCASTSD(YMM(8), MEM(R9, 8)) + MOV(RDX, RCX) + MOV(RDI, IMM(6*16)) + + VMOVUPD(YMM(0), MEM(RDX, 0*16)) + VMOVUPD(YMM(1), MEM(RDX, 2*16)) + VMOVUPD(YMM(2), MEM(RDX, 4*16)) + ADD(RDX, RDI) + + /************************************************************************/ + /* gemm_output = C * alpha - gemm_output */ + /* */ + /* Let C * alpha = (a + ib) * (c + id) */ + /* (a + ib) * (c + id) = (ac - bd) + i(ad + bc) */ + /* */ + /*Steps: */ + /* YMM(0) = a0, b0, a1, b1 */ + /* YMM(3) = b0, a0, b1, a1 */ + /* YMM(0) = a0*c0, b0*c0, a1*c1, b1*c1 */ + /* YMM(3) = b0*d0, a0*d0, b1*d1, a1*d1 */ + /* YMM(0) = (a0c0 - b0d0), (b0c0 + a0d0), (a1c1 - b1d1), (b1c1 + a1d1) */ + /************************************************************************/ + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(7)) // a*c, b*c + VMULPD(YMM(3), YMM(3), YMM(8)) // b*d, a*d + VADDSUBPD(YMM(0), YMM(0), YMM(3)) // ac - bd, bc + ad + 
VSUBPD(YMM(4), YMM(0), YMM(4)) // c * alpha - gemm_output + + VMOVUPD(YMM(0), MEM(RDX, 0*16)) + VPERMILPD(YMM(3), YMM(1), IMM(0x5)) + VMULPD(YMM(1), YMM(1), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(1), YMM(1), YMM(3)) + VSUBPD(YMM(5), YMM(1), YMM(5)) + + VMOVUPD(YMM(1), MEM(RDX, 2*16)) + VPERMILPD(YMM(3), YMM(2), IMM(0x5)) + VMULPD(YMM(2), YMM(2), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(2), YMM(2), YMM(3)) + VSUBPD(YMM(6), YMM(2), YMM(6)) + + VMOVUPD(YMM(2), MEM(RDX, 4*16)) + VPERMILPD(YMM(3), YMM(0), IMM(0x5)) + VMULPD(YMM(0), YMM(0), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(0), YMM(0), YMM(3)) + VSUBPD(YMM(10), YMM(0), YMM(10)) + + VPERMILPD(YMM(3), YMM(1), IMM(0x5)) + VMULPD(YMM(1), YMM(1), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(1), YMM(1), YMM(3)) + VSUBPD(YMM(11), YMM(1), YMM(11)) + + VPERMILPD(YMM(3), YMM(2), IMM(0x5)) + VMULPD(YMM(2), YMM(2), YMM(7)) + VMULPD(YMM(3), YMM(3), YMM(8)) + VADDSUBPD(YMM(2), YMM(2), YMM(3)) + VSUBPD(YMM(12), YMM(2), YMM(12)) + + + MOV(RAX, VAR(a11)) + ADD(RCX, RDI) + // REGION - TRSM + //iteration 0 ------------------------------------- + VBROADCASTSD(YMM(0), MEM(RAX, (1+1*2)*16+0)) + VBROADCASTSD(YMM(1), MEM(RAX, (1+1*2)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + /****************************************************/ + /* C = C * A11 */ + /* (a + ib) * (c + id) = (ac - bd) + i(ad + bc) */ + /****************************************************/ + VPERMILPD(YMM(3), YMM(10), IMM(0x5)) + VMULPD(YMM(10), YMM(10), YMM(0)) //a*c, b*c + VMULPD(YMM(3), YMM(3), YMM(1)) //b*d, a*d + VADDSUBPD(YMM(10), YMM(10), YMM(3)) // (ac - bd), (bc + ad) + + VPERMILPD(YMM(3), YMM(11), IMM(0x5)) + VMULPD(YMM(11), YMM(11), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(11), YMM(11), YMM(3)) + + VPERMILPD(YMM(3), YMM(12), IMM(0x5)) + VMULPD(YMM(12), YMM(12), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(12), YMM(12), YMM(3)) + #else + 
/************************************************************************/ + /* C = C / A11 */ + /* */ + /* Let C / A11 = (a + ib) / (c + id) = */ + /* ((ac + bd) / (c^2 + d^2)) + i ((bc - ad) / (c^2+d^2)) */ + /* */ + /*Steps: */ + /* YMM(10) = a0, b0, a1, b1 */ + /* YMM(3) = b0, a0, b1, a1 */ + /* YMM(10) = a0*c0, b0*c0, a1*c1, b1*c1 */ + /* YMM(3) = b0*d0, a0*d0, b1*d1, a1*d1 */ + /* YMM(3) = -b0*d0, -a0*d0, -b1*d1, -a1*d1 */ + /* YMM(10) = (a0c0 + b0d0), (b0c0 - a0d0), (a1c1 + b1d1), (b1c1 - a1d1) */ + /* YMM(10) = (a0c0 + b0d0) / (c^2 + d^2), (b0c0 - a0d0) / (c^2 + d^2), */ + /* (a1c1 + b1d1) / (c^2 + d^2), (b1c1 - a1d1) / (c^2 + d^2) */ + /************************************************************************/ + VMOVUPD(YMM(2), MEM(R11)) // -1 + VMULPD(YMM(9), YMM(0), YMM(0)) + VFMADD231PD(YMM(9), YMM(1), YMM(1)) + + VPERMILPD(YMM(3), YMM(10), IMM(0x5)) + VMULPD(YMM(10), YMM(10), YMM(0)) // a*c, b*c + VMULPD(YMM(3), YMM(3), YMM(1)) // b*d, a*d + VMULPD(YMM(3), YMM(3), YMM(2)) // -bd, -ad + VADDSUBPD(YMM(10), YMM(10), YMM(3)) // ac + bd, bc - ad + VDIVPD(YMM(10), YMM(10), YMM(9))//(ac + bd) / (c^2 + d^2),(bc - ad) / (c^2 + d^2) + + VPERMILPD(YMM(3), YMM(11), IMM(0x5)) + VMULPD(YMM(11), YMM(11), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(11), YMM(11), YMM(3)) + VDIVPD(YMM(11), YMM(11), YMM(9)) + + VPERMILPD(YMM(3), YMM(12), IMM(0x5)) + VMULPD(YMM(12), YMM(12), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(12), YMM(12), YMM(3)) + VDIVPD(YMM(12), YMM(12), YMM(9)) + + #endif + VMOVUPD(MEM(RCX, 0*16), YMM(10)) + VMOVUPD(MEM(RCX, 2*16), YMM(11)) + VMOVUPD(MEM(RCX, 4*16), YMM(12)) + SUB(RCX, RDI) + + //iteration 1 ------------------------------------- + + VBROADCASTSD(YMM(0), MEM(RAX, (0+1*2)*16+0)) + VBROADCASTSD(YMM(1), MEM(RAX, (0+1*2)*16+8)) + + VPERMILPD(YMM(3), YMM(10), IMM(0x5)) + VMULPD(YMM(2), YMM(10), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(7), 
YMM(2), YMM(3)) + + VPERMILPD(YMM(3), YMM(11), IMM(0x5)) + VMULPD(YMM(2), YMM(11), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(8), YMM(2), YMM(3)) + + VPERMILPD(YMM(3), YMM(12), IMM(0x5)) + VMULPD(YMM(2), YMM(12), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(9), YMM(2), YMM(3)) + + VSUBPD(YMM(4), YMM(4), YMM(7)) + VSUBPD(YMM(5), YMM(5), YMM(8)) + VSUBPD(YMM(6), YMM(6), YMM(9)) + + VBROADCASTSD(YMM(0), MEM(RAX, (0+0*2)*16+0)) + VBROADCASTSD(YMM(1), MEM(RAX, (0+0*2)*16+8)) + + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + VPERMILPD(YMM(3), YMM(4), IMM(0x5)) + VMULPD(YMM(4), YMM(4), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(4), YMM(4), YMM(3)) + + VPERMILPD(YMM(3), YMM(5), IMM(0x5)) + VMULPD(YMM(5), YMM(5), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(5), YMM(5), YMM(3)) + + VPERMILPD(YMM(3), YMM(6), IMM(0x5)) + VMULPD(YMM(6), YMM(6), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VADDSUBPD(YMM(6), YMM(6), YMM(3)) + #else + VMOVUPD(YMM(2), MEM(R11)) + VMULPD(YMM(9), YMM(0), YMM(0)) + VFMADD231PD(YMM(9), YMM(1), YMM(1)) + + VPERMILPD(YMM(3), YMM(4), IMM(0x5)) + VMULPD(YMM(4), YMM(4), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(4), YMM(4), YMM(3)) + VDIVPD(YMM(4), YMM(4), YMM(9)) + + VPERMILPD(YMM(3), YMM(5), IMM(0x5)) + VMULPD(YMM(5), YMM(5), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(5), YMM(5), YMM(3)) + VDIVPD(YMM(5), YMM(5), YMM(9)) + + VPERMILPD(YMM(3), YMM(6), IMM(0x5)) + VMULPD(YMM(6), YMM(6), YMM(0)) + VMULPD(YMM(3), YMM(3), YMM(1)) + VMULPD(YMM(3), YMM(3), YMM(2)) + VADDSUBPD(YMM(6), YMM(6), YMM(3)) + VDIVPD(YMM(6), YMM(6), YMM(9)) + #endif + VMOVUPD(MEM(RCX, 0*16), YMM(4)) + VMOVUPD(MEM(RCX, 2*16), YMM(5)) + VMOVUPD(MEM(RCX, 4*16), YMM(6)) + +// ENDREGION - TRSM + + MOV(RAX, R12) + MOV(RBX, R10) + MOV(RCX, VAR(c11)) + + CMP(RBX, IMM(16)) + JE(ROWUPDATE) + + LABEL(COLUPDATE) + LEA(RDX, MEM(RCX, R12, 1)) + LEA(RDI, MEM(, R10, 2)) 
+ + VEXTRACTF128(XMM(3), YMM(4), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(4)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VEXTRACTF128(XMM(3), YMM(5), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(5)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + ADD(RCX, RDI) + + VEXTRACTF128(XMM(3), YMM(6), IMM(0x1)) + VMOVUPD(MEM(RCX ), XMM(6)) + VMOVUPD(MEM(RCX, R10, 1), XMM(3)) + + + VEXTRACTF128(XMM(3), YMM(10), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(10)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VEXTRACTF128(XMM(3), YMM(11), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(11)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + ADD(RDX, RDI) + + VEXTRACTF128(XMM(3), YMM(12), IMM(0x1)) + VMOVUPD(MEM(RDX ), XMM(12)) + VMOVUPD(MEM(RDX, R10, 1), XMM(3)) + JMP(END) + + + LABEL(ROWUPDATE) + LEA(RDX, MEM(RCX, R12, 1)) + + VMOVUPD(MEM(RCX ), YMM(4)) + VMOVUPD(MEM(RCX, R10, 2), YMM(5)) + VMOVUPD(MEM(RCX, R10, 4), YMM(6)) + + VMOVUPD(MEM(RDX ), YMM(10)) + VMOVUPD(MEM(RDX, R10, 2), YMM(11)) + VMOVUPD(MEM(RDX, R10, 4), YMM(12)) + JMP(END) + + LABEL(END) + + VZEROUPPER() + + + END_ASM + ( + : // output operands (none) + : // input operands + [a10] "m" (a10), + [k] "m" (k), + [b01] "m" (b01), + [a11] "m" (a11), + [b11] "m" (b11), + [c11] "m" (c11), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [alpha] "m" (alpha), + [negPtr] "m" (negPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13", "xmm14", "xmm15", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 8c338006ad..78831e715c 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -427,3 +427,8 @@ void bli_dznorm2fv_unb_var1_avx2 
double* norm, cntx_t* cntx ); + +GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_2x6) + +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_l_zen_asm_2x6) +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_u_zen_asm_2x6) \ No newline at end of file From 46459a958daef944dbb624b395de80dd9a07b4ba Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Thu, 5 Oct 2023 12:01:58 +0530 Subject: [PATCH 160/226] Updating BLIS C++ interface trsm test. - Making A diagonally dominant to ensure that the problem at hand is solvable. AMD-Internal: [CPUPL-3575] Change-Id: I27cc76a212d4d10aacce880895e1e0d7532e4eb7 --- vendor/testcpp/test.hh | 2 +- vendor/testcpp/test_trsm.cc | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vendor/testcpp/test.hh b/vendor/testcpp/test.hh index b1be412d64..ccd5804332 100644 --- a/vendor/testcpp/test.hh +++ b/vendor/testcpp/test.hh @@ -138,7 +138,7 @@ int computeErrorM( for ( i = 0; i < m; i ++ ) { for ( j = 0; j < n; j ++ ) { if ( (fabs (A( i, j )) - fabs( A_ref( i, j ))) > 0.0000001 ) { - cout << A(i,j) << A_ref(i,j); + cout << A(i,j) << A_ref(i,j)<< "\n"; ret = 1; break; } diff --git a/vendor/testcpp/test_trsm.cc b/vendor/testcpp/test_trsm.cc index 4c5ead3bcf..1c0c570a53 100644 --- a/vendor/testcpp/test_trsm.cc +++ b/vendor/testcpp/test_trsm.cc @@ -102,6 +102,12 @@ void test_trsm( ) allocate_init_buffer(B , m , n); copy_buffer(B, B_ref , m ,n); + // Make A diagonally dominant to guarantee that the system has a solution. + for(int i=0; i Date: Fri, 6 Oct 2023 11:49:03 +0530 Subject: [PATCH 161/226] Optimized AVX512 DGEMM SUP edge kernels - For edge kernels which handle the corner cases, and especially for cases where there is only a really small amount of computation to be done, executing FMA efficiently becomes very crucial. - In the previous implementation, edge kernels were using the same, limited number of vector registers to hold the FMA result, which indirectly creates a dependency on the previous FMA to complete before the CPU can issue a new FMA. 
- This commit addresses this issue by using different vector registers that are available at our disposal to hold the FMA result. - That way we hold FMA results in two sets of vector registers, so that a subsequent FMA won't have to wait for the previous FMA to complete. - At the end of the unrolled K loop these two sets of vector registers are added together to store the correct result in the intended vector registers. - The following kernels are modified: bli_dgemmsup_rv_zen4_asm_24x4m, bli_dgemmsup_rv_zen4_asm_24x3m, bli_dgemmsup_rv_zen4_asm_24x2m, bli_dgemmsup_rv_zen4_asm_24x1m, bli_dgemmsup_rv_zen4_asm_24x1, bli_dgemmsup_rv_zen4_asm_16x1, bli_dgemmsup_rv_zen4_asm_8x1, bli_dgemmsup_rv_zen4_asm_24x2, bli_dgemmsup_rv_zen4_asm_16x2, bli_dgemmsup_rv_zen4_asm_8x2, bli_dgemmsup_rv_zen4_asm_24x3, bli_dgemmsup_rv_zen4_asm_16x3, bli_dgemmsup_rv_zen4_asm_8x3, bli_dgemmsup_rv_zen4_asm_16x4, bli_dgemmsup_rv_zen4_asm_8x4, bli_dgemmsup_rv_zen4_asm_16x5, bli_dgemmsup_rv_zen4_asm_8x5, bli_dgemmsup_rv_zen4_asm_16x6, bli_dgemmsup_rv_zen4_asm_8x6, bli_dgemmsup_rv_zen4_asm_8x7, bli_dgemmsup_rv_zen4_asm_8x8 AMD-Internal: [CPUPL-3574] Change-Id: I318ff8e2f075820bcc0505aa1c13d0679f73af44 --- .../3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c | 847 ++++++++++-------- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c | 196 ++-- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c | 355 +++++--- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c | 514 ++++++----- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c | 344 ++++--- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c | 422 +++++---- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c | 501 ++++++----- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c | 197 ++-- .../sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c | 223 +++-- 9 files changed, 2083 insertions(+), 1516 deletions(-) diff --git a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c index 97ac0985dc..649aa416b5 100644 --- a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c +++ 
b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c @@ -6079,6 +6079,18 @@ void bli_dgemmsup_rv_zen4_asm_24x4m vxorpd(zmm12, zmm12, zmm12) vxorpd(zmm13, zmm13, zmm13) vxorpd(zmm27,zmm27, zmm27) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21, zmm21, zmm21) + vxorpd(zmm22, zmm22, zmm22) + vxorpd(zmm23, zmm23, zmm23) + vxorpd(zmm24, zmm24, zmm24) + vxorpd(zmm25, zmm25, zmm25) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -6089,7 +6101,22 @@ void bli_dgemmsup_rv_zen4_asm_24x4m jle(.PREFETCHLOOP) // jump if i <= 0 label(.LOOP1) - + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29, zmm10, zmm11, zmm26, zmm12, zmm13, zmm27 + * to hold fma result. + * While even iterations uses zmm14, zmm15, zmm16, zmm17, zmm18 + * zmm19, zmm20, zmm21, zmm22, zmm23, zmm24, zmm25 to hold fma + * result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29, zmm10, zmm11, zmm26, zmm12, zmm13, zmm27. 
+ */ // ---------------------------------- iteration 1 vmovupd( mem(rax),zmm0 ) // load A @@ -6138,21 +6165,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 3 @@ -6198,21 +6225,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( 
zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 5 @@ -6256,21 +6283,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 7 @@ -6310,21 +6337,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + 
vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -6383,21 +6410,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -6442,21 +6469,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -6498,21 +6525,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + 
vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -6550,28 +6577,28 @@ void bli_dgemmsup_rv_zen4_asm_24x4m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 jnz(.LOOP2) // iterate again if i != 0. 
label(.TAILITER) add(imm(TAIL_NITER), rsi) // i += TAIL_NITER - jle(.TAIL) // jump if i <= 0 + jle(.TAIL) // jump if i <= 0 label(.LOOP3) @@ -6621,21 +6648,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -6679,21 +6706,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( 
zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -6735,21 +6762,21 @@ void bli_dgemmsup_rv_zen4_asm_24x4m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -6787,25 +6814,37 @@ void bli_dgemmsup_rv_zen4_asm_24x4m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) 
vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) - vfmadd231pd( zmm5,zmm31,zmm27 ) + vfmadd231pd( zmm3,zmm31,zmm23 ) + vfmadd231pd( zmm4,zmm31,zmm24 ) + vfmadd231pd( zmm5,zmm31,zmm25 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm28, zmm28) + vaddpd(zmm17, zmm8, zmm8) + vaddpd(zmm18, zmm9, zmm9) + vaddpd(zmm19, zmm29, zmm29) + vaddpd(zmm20, zmm10, zmm10) + vaddpd(zmm21, zmm11, zmm11) + vaddpd(zmm22, zmm26, zmm26) + vaddpd(zmm23, zmm12, zmm12) + vaddpd(zmm24, zmm13, zmm13) + vaddpd(zmm25, zmm27, zmm27) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -7177,6 +7216,15 @@ void bli_dgemmsup_rv_zen4_asm_24x3m vxorpd(zmm10, zmm10, zmm10) vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21, zmm21, zmm21) + vxorpd(zmm22, zmm22, zmm22) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -7186,6 +7234,22 @@ void bli_dgemmsup_rv_zen4_asm_24x3m sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. 
+ * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29, zmm10, zmm11, zmm26 to hold fma result. + * While even iterations uses zmm14, zmm15, zmm16, zmm17, zmm18 + * zmm19, zmm20, zmm21, zmm22 to hold fma + * result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29, zmm10, zmm11, zmm26. + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -7232,17 +7296,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 3 @@ -7283,17 +7347,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - 
vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 5 @@ -7333,17 +7397,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 7 @@ -7379,17 +7443,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - 
vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -7444,17 +7508,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) - add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) + add( r8,rbx ) // b += rs_b + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -7494,17 +7558,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + 
vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -7542,17 +7606,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -7586,17 +7650,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -7649,17 +7713,17 @@ 
void bli_dgemmsup_rv_zen4_asm_24x3m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -7698,17 +7762,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -7746,17 +7810,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( 
zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -7790,21 +7854,30 @@ void bli_dgemmsup_rv_zen4_asm_24x3m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm28, zmm28) + vaddpd(zmm17, zmm8, zmm8) + vaddpd(zmm18, zmm9, zmm9) + vaddpd(zmm19, zmm29, zmm29) + vaddpd(zmm20, zmm10, zmm10) + vaddpd(zmm21, zmm11, zmm11) + vaddpd(zmm22, zmm26, zmm26) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -8157,6 +8230,12 @@ void bli_dgemmsup_rv_zen4_asm_24x2m vxorpd(zmm8, zmm8, zmm8) vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -8166,6 +8245,21 @@ void bli_dgemmsup_rv_zen4_asm_24x2m sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register banks + * to hold fma results. + * Once the K loop is completed these two vector register banks + * are added together and the final result is available in one + * register bank. + * Here odd iterations use vector registers zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29 to hold fma results, + * while even iterations use zmm14, zmm15, zmm16, zmm17, zmm18, + * zmm19 to hold fma results. + * At the end of the K loop, these two banks are added together and + * the final result is available in vector registers zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29.
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -8208,13 +8302,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 3 @@ -8250,13 +8344,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 5 @@ -8292,13 +8386,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- 
iteration 7 @@ -8330,13 +8424,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -8387,13 +8481,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -8428,13 +8522,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // 
---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -8468,13 +8562,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -8504,13 +8598,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -8559,13 +8653,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + 
vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -8599,13 +8693,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -8639,13 +8733,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -8675,17 +8769,23 @@ void bli_dgemmsup_rv_zen4_asm_24x2m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + 
vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm28, zmm28) + vaddpd(zmm17, zmm8, zmm8) + vaddpd(zmm18, zmm9, zmm9) + vaddpd(zmm19, zmm29, zmm29) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -9022,7 +9122,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m vxorpd(zmm6, zmm6, zmm6) vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm28, zmm28, zmm28) - + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b @@ -9031,6 +9133,20 @@ void bli_dgemmsup_rv_zen4_asm_24x1m sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register banks + * to hold fma results. + * Once the K loop is completed these two vector register banks + * are added together and the final result is available in one + * register bank. + * Here odd iterations use vector registers zmm6, zmm7, zmm28 + * to hold fma results, + * while even iterations use zmm14, zmm15, zmm16 to hold fma + * results.
+ * At the end of the K loop, these two banks are added together and + * the final result is available in vector registers zmm6, zmm7, zmm28. + */ label(.LOOP1) // ---------------------------------- iteration 1 @@ -9068,9 +9184,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 3 @@ -9102,9 +9218,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 5 @@ -9136,9 +9252,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 7 @@ -9166,9 +9282,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0.
@@ -9214,9 +9330,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -9247,9 +9363,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -9279,9 +9395,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -9307,9 +9423,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -9353,9 +9469,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - 
vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -9385,9 +9501,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -9417,9 +9533,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -9445,13 +9561,16 @@ void bli_dgemmsup_rv_zen4_asm_24x1m add( r10,r14 ) // a_next += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm28, zmm28) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c index d8806362e8..690404628e 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c @@ -472,6 +472,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vxorpd(zmm6, zmm6, zmm6) vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm10, zmm10, zmm10) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -481,6 +484,20 @@ void bli_dgemmsup_rv_zen4_asm_24x1 sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register banks + * to hold fma results. + * Once the K loop is completed these two vector register banks + * are added together and the final result is available in one + * register bank. + * Here odd iterations use vector registers zmm6, zmm7, zmm28 + * to hold fma results, + * while even iterations use zmm8, zmm9, zmm10 to hold fma + * results.
+ * At the end of the K loop, these two banks are added together and + * the final result is available in vector registers zmm6, zmm7, zmm28. + */ label(.LOOP1) // ---------------------------------- iteration 1 @@ -508,9 +525,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 3 @@ -532,9 +549,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 5 @@ -556,9 +573,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 7 @@ -576,9 +593,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0.
@@ -614,9 +631,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -637,9 +654,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -659,9 +676,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -677,9 +694,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -713,9 +730,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) 
+ vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -735,9 +752,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -757,9 +774,9 @@ void bli_dgemmsup_rv_zen4_asm_24x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -775,13 +792,16 @@ void bli_dgemmsup_rv_zen4_asm_24x1 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) + vfmadd231pd( zmm5,zmm30,zmm10 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm8, zmm6, zmm6) + vaddpd(zmm9, zmm7, zmm7) + vaddpd(zmm10, zmm28, zmm28) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -1168,6 +1188,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) vxorpd(zmm7, zmm7, zmm7) + vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1177,6 +1199,19 @@ void bli_dgemmsup_rv_zen4_asm_16x1 sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register banks + * to hold fma results. + * Once the K loop is completed these two vector register banks + * are added together and the final result is available in one + * register bank. + * Here odd iterations use vector registers zmm6, zmm7 + * to hold fma results, + * while even iterations use zmm8, zmm9 to hold fma results. + * At the end of the K loop, these two banks are added together and + * the final result is available in vector registers zmm6, zmm7.
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1200,8 +1235,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 3 @@ -1220,8 +1255,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 5 @@ -1240,8 +1275,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 7 @@ -1257,8 +1292,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. 
@@ -1290,8 +1325,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1308,8 +1343,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1326,8 +1361,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1341,8 +1376,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -1372,8 +1407,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1390,8 +1425,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - 
vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1408,8 +1443,8 @@ void bli_dgemmsup_rv_zen4_asm_16x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1423,12 +1458,14 @@ void bli_dgemmsup_rv_zen4_asm_16x1 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm8 ) + vfmadd231pd( zmm4,zmm30,zmm9 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm8, zmm6, zmm6) + vaddpd(zmm9, zmm7, zmm7) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -1783,6 +1820,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1792,6 +1830,19 @@ void bli_dgemmsup_rv_zen4_asm_8x1 sub(imm( 1+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register banks + * to hold fma results. + * Once the K loop is completed these two vector register banks + * are added together and the final result is available in one + * register bank. + * Here odd iterations use vector register zmm6 + * to hold fma results, + * while even iterations use zmm7 to hold fma results. + * At the end of the K loop, these two banks are added together and + * the final result is available in vector register zmm6.
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1811,7 +1862,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 3 @@ -1827,7 +1878,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 5 @@ -1843,7 +1894,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 7 @@ -1857,7 +1908,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. 
@@ -1884,7 +1935,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -1898,7 +1949,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -1912,7 +1963,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -1924,7 +1975,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -1950,7 +2001,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -1964,7 +2015,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // 
Load A with mask and zero hint @@ -1978,7 +2029,7 @@ void bli_dgemmsup_rv_zen4_asm_8x1 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -1990,11 +2041,12 @@ void bli_dgemmsup_rv_zen4_asm_8x1 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm7, zmm6, zmm6) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c index d8b5c73ad8..67a58c1b82 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c @@ -476,6 +476,12 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vxorpd(zmm8, zmm8, zmm8) vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm29, zmm29, zmm29) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -485,6 +491,21 @@ void bli_dgemmsup_rv_zen4_asm_24x2 sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29 to hold fma result. 
+ * While even iterations uses zmm14, zmm15, zmm16, zmm17, zmm18 + * zmm19 to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29. + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -517,13 +538,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 3 @@ -549,13 +570,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 5 @@ -581,13 +602,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( 
zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 7 @@ -609,13 +630,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -656,13 +677,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -687,13 +708,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - 
vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -717,13 +738,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -743,13 +764,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -788,13 +809,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( 
zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -818,13 +839,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -848,13 +869,13 @@ void bli_dgemmsup_rv_zen4_asm_24x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -874,17 +895,23 @@ void bli_dgemmsup_rv_zen4_asm_24x2 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( 
zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm28, zmm28) + vaddpd(zmm17, zmm8, zmm8) + vaddpd(zmm18, zmm9, zmm9) + vaddpd(zmm19, zmm29, zmm29) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -1292,6 +1319,10 @@ void bli_dgemmsup_rv_zen4_asm_16x2 vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) vxorpd(zmm9, zmm9, zmm9) + vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1301,6 +1332,21 @@ void bli_dgemmsup_rv_zen4_asm_16x2 sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, + * zmm8, zmm9 to hold fma result. + * While even iterations uses zmm10, zmm11, zmm12, zmm13 + * to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7,zmm8, + * zmm9. 
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1328,11 +1374,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 3 @@ -1354,11 +1400,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 5 @@ -1380,11 +1426,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 7 @@ -1403,11 +1449,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( 
zmm4,zmm31,zmm13 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -1443,11 +1489,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1467,11 +1513,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1491,11 +1537,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1512,11 +1558,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( 
zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -1550,11 +1596,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1574,11 +1620,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1598,11 +1644,11 @@ void bli_dgemmsup_rv_zen4_asm_16x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1619,15 +1665,19 @@ void bli_dgemmsup_rv_zen4_asm_16x2 
// ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm4,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm4,zmm31,zmm13 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm10, zmm6, zmm6) + vaddpd(zmm11, zmm7, zmm7) + vaddpd(zmm12, zmm8, zmm8) + vaddpd(zmm13, zmm9, zmm9) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -1995,7 +2045,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -2005,6 +2057,19 @@ void bli_dgemmsup_rv_zen4_asm_8x2 sub(imm( 2+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm8 + * to hold fma result. + * While even iterations uses zmm7, zmm9 to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm8. 
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -2027,9 +2092,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 3 @@ -2047,9 +2112,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 5 @@ -2067,9 +2132,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 7 @@ -2085,9 +2150,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. 
@@ -2117,9 +2182,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2135,9 +2200,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2153,9 +2218,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2169,9 +2234,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -2200,9 +2265,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( 
zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2218,9 +2283,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2236,9 +2301,9 @@ void bli_dgemmsup_rv_zen4_asm_8x2 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2252,13 +2317,15 @@ void bli_dgemmsup_rv_zen4_asm_8x2 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c index a739183e98..ee6c3c573d 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c @@ -480,6 +480,15 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vxorpd(zmm10, zmm10, zmm10) vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21, zmm21, zmm21) + vxorpd(zmm22, zmm22, zmm22) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -489,6 +498,21 @@ void bli_dgemmsup_rv_zen4_asm_24x3 sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29, zmm10, zmm11, zmm26 to hold fma result. + * While even iterations uses zmm14, zmm15, zmm16, zmm17, zmm18 + * zmm19, zmm20, zmm21, zmm22 to hold fma + * result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, zmm28, + * zmm8, zmm9, zmm29, zmm10, zmm11, zmm26. 
+ */ label(.LOOP1) // ---------------------------------- iteration 1 @@ -525,17 +549,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 3 @@ -566,17 +590,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 5 @@ -606,17 +630,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( 
zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 7 @@ -642,17 +666,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. 
@@ -697,17 +721,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 3 prefetchw0( mem(rdx, 128)) // prefetch C @@ -737,17 +761,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -775,17 +799,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - 
vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -809,17 +833,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -862,17 +886,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( 
zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -901,17 +925,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -939,17 +963,17 @@ void bli_dgemmsup_rv_zen4_asm_24x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( 
zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -973,21 +997,30 @@ void bli_dgemmsup_rv_zen4_asm_24x3 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) - vfmadd231pd( zmm5,zmm30,zmm28 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm5,zmm30,zmm16 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) - vfmadd231pd( zmm5,zmm31,zmm29 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) + vfmadd231pd( zmm4,zmm31,zmm18 ) + vfmadd231pd( zmm5,zmm31,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) - vfmadd231pd( zmm5,zmm30,zmm26 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) + vfmadd231pd( zmm5,zmm30,zmm22 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm28, zmm28) + vaddpd(zmm17, zmm8, zmm8) + vaddpd(zmm18, zmm9, zmm9) + vaddpd(zmm19, zmm29, zmm29) + vaddpd(zmm20, zmm10, zmm10) + vaddpd(zmm21, zmm11, zmm11) + vaddpd(zmm22, zmm26, zmm26) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -1410,6 +1443,12 @@ void bli_dgemmsup_rv_zen4_asm_16x3 vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) vxorpd(zmm11, zmm11, zmm11) + vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1419,6 +1458,22 @@ void bli_dgemmsup_rv_zen4_asm_16x3 sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, zmm8, + * zmm9, zmm10, zmm11 to hold fma result. + * While even iterations uses zmm12, zmm13, zmm14, zmm15, zmm16 + * zmm17 to hold fma + * result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, zmm8, + * zmm9, zmm10, zmm11. 
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1449,14 +1504,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 3 @@ -1482,14 +1537,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 5 @@ -1514,14 +1569,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( 
zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 7 @@ -1543,14 +1598,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -1589,14 +1644,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1620,14 +1675,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += 
rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1650,14 +1705,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1677,14 +1732,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -1721,14 +1776,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) 
vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1752,14 +1807,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1782,14 +1837,14 @@ void bli_dgemmsup_rv_zen4_asm_16x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1809,18 +1864,24 @@ void bli_dgemmsup_rv_zen4_asm_16x3 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm12 ) + vfmadd231pd( zmm4,zmm30,zmm13 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm14 ) + vfmadd231pd( zmm4,zmm31,zmm15 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm12, zmm6, zmm6) + vaddpd(zmm13, zmm7, zmm7) + vaddpd(zmm14, zmm8, zmm8) + vaddpd(zmm15, zmm9, zmm9) + vaddpd(zmm16, zmm10, zmm10) + vaddpd(zmm17, zmm11, zmm11) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -2197,8 +2258,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -2208,6 +2272,21 @@ void bli_dgemmsup_rv_zen4_asm_8x3 sub(imm( 3+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm8, + * zmm10 to hold fma result. + * While even iterations uses zmm7, zmm9, zmm11 to hold fma + * result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm8, + * zmm10. 
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -2232,11 +2311,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 3 @@ -2257,11 +2336,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 5 @@ -2281,11 +2360,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 7 @@ -2303,11 +2382,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. 
@@ -2339,11 +2418,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2362,11 +2441,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2384,11 +2463,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2404,11 +2483,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( 
r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -2439,11 +2518,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2462,11 +2541,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2484,11 +2563,11 @@ void bli_dgemmsup_rv_zen4_asm_8x3 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2504,15 +2583,18 @@ void bli_dgemmsup_rv_zen4_asm_8x3 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) 
vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) + vaddpd(zmm11, zmm10, zmm10) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c index e5d70ae5fd..f8a3968f7b 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c @@ -1529,6 +1529,15 @@ void bli_dgemmsup_rv_zen4_asm_16x4 vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm12, zmm12, zmm12) vxorpd(zmm13, zmm13, zmm13) + vxorpd(zmm14,zmm14, zmm14) + vxorpd(zmm15,zmm15, zmm15) + vxorpd(zmm16,zmm16, zmm16) + vxorpd(zmm17,zmm17, zmm17) + vxorpd(zmm18,zmm18, zmm18) + vxorpd(zmm19,zmm19, zmm19) + vxorpd(zmm20,zmm20, zmm20) + vxorpd(zmm21,zmm21, zmm21) + // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1538,6 +1547,22 @@ void bli_dgemmsup_rv_zen4_asm_16x4 sub(imm( 4+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, + * zmm8, zmm9, zmm10, zmm11, zmm12, zmm13 + * to hold fma result. + * While even iterations uses zmm14, zmm15, zmm16, zmm17, zmm18 + * zmm19, zmm20, zmm21 to hold fma result. 
+ * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, + * zmm8, zmm9, zmm10, zmm11, zmm12, zmm13 + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1571,17 +1596,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 3 @@ -1611,17 +1636,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 5 @@ -1649,17 +1674,17 @@ void 
bli_dgemmsup_rv_zen4_asm_16x4 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 7 @@ -1684,17 +1709,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. 
@@ -1736,17 +1761,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1774,17 +1799,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1810,17 +1835,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( 
zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -1843,17 +1868,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -1893,17 +1918,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( 
zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1931,17 +1956,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -1967,17 +1992,17 @@ void bli_dgemmsup_rv_zen4_asm_16x4 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + 
vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -2000,21 +2025,29 @@ void bli_dgemmsup_rv_zen4_asm_16x4 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm4,zmm30,zmm15 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm4,zmm31,zmm17 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm14, zmm6, zmm6) + vaddpd(zmm15, zmm7, zmm7) + vaddpd(zmm16, zmm8, zmm8) + vaddpd(zmm17, zmm9, zmm9) + vaddpd(zmm18, zmm10, zmm10) + vaddpd(zmm19, zmm11, zmm11) + vaddpd(zmm20, zmm12, zmm12) + vaddpd(zmm21, zmm13, zmm13) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -2406,9 +2439,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -2418,6 +2455,21 @@ void bli_dgemmsup_rv_zen4_asm_8x4 sub(imm( 4+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, + * zmm8, zmm10, zmm12 to hold fma result. + * While even iterations uses zmm7, zmm9, zmm11, zmm13 + * to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, + * zmm8, zmm10, zmm12.
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -2444,13 +2496,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 3 @@ -2474,13 +2526,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 5 @@ -2502,13 +2554,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 7 @@ -2528,13 +2580,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( 
zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP1) // iterate again if i != 0. @@ -2568,13 +2620,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2596,13 +2648,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2622,13 +2674,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2646,13 +2698,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b sub(imm(1), rsi) // i -= 1 @@ -2685,13 +2737,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2713,13 +2765,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2739,13 +2791,13 @@ void bli_dgemmsup_rv_zen4_asm_8x4 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2763,17 +2815,21 @@ void bli_dgemmsup_rv_zen4_asm_8x4 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) + vaddpd(zmm11, zmm10, zmm10) + vaddpd(zmm13, zmm12, zmm12) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c index a41cbc4905..d014358c84 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c @@ -1707,6 +1707,17 @@ void bli_dgemmsup_rv_zen4_asm_16x5 vxorpd(zmm13, zmm13, zmm13) vxorpd(zmm14, zmm14, zmm14) vxorpd(zmm15, zmm15, zmm15) + vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21, zmm21, zmm21) + vxorpd(zmm22, zmm22, zmm22) + vxorpd(zmm23, zmm23, zmm23) + vxorpd(zmm24, zmm24, zmm24) + vxorpd(zmm25, zmm25, zmm25) + // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1716,6 +1727,22 @@ void bli_dgemmsup_rv_zen4_asm_16x5 sub(imm( 5+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, + * zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15 + * to hold fma result. + * While even iterations uses zmm16, zmm17, zmm18, zmm19, zmm20 + * zmm21, zmm22, zmm23, zmm24, zmm25 to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, + * zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15. 
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1753,21 +1780,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 3 @@ -1801,21 +1828,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // 
second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 5 @@ -1848,21 +1875,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 7 @@ -1891,21 +1918,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) 
+ vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b dec(rsi) // i -= 1 @@ -1952,21 +1979,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -1998,21 +2025,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - 
vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -2043,21 +2070,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -2084,21 +2111,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) 
vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b @@ -2143,21 +2170,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -2189,21 +2216,21 @@ void 
bli_dgemmsup_rv_zen4_asm_16x5 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -2234,21 +2261,21 @@ void bli_dgemmsup_rv_zen4_asm_16x5 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( 
zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -2275,26 +2302,36 @@ void bli_dgemmsup_rv_zen4_asm_16x5 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm16 ) + vfmadd231pd( zmm4,zmm30,zmm17 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm18 ) + vfmadd231pd( zmm4,zmm31,zmm19 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm20 ) + vfmadd231pd( zmm4,zmm30,zmm21 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm22 ) + vfmadd231pd( zmm4,zmm31,zmm23 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm24 ) + vfmadd231pd( zmm4,zmm30,zmm25 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm16, zmm6, zmm6) + vaddpd(zmm17, zmm7, zmm7) + vaddpd(zmm18, zmm8, zmm8) + vaddpd(zmm19, zmm9, zmm9) + vaddpd(zmm20, zmm10, zmm10) + vaddpd(zmm21, zmm11, zmm11) + vaddpd(zmm22, zmm12, zmm12) + vaddpd(zmm23, zmm13, zmm13) + vaddpd(zmm24, zmm14, zmm14) + vaddpd(zmm25, zmm15, zmm15) label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -2715,10 +2752,15 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -2728,6 +2770,21 @@ void bli_dgemmsup_rv_zen4_asm_8x5 sub(imm( 5+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, + * zmm8, zmm10, zmm12, zmm14 to hold fma result. + * While even iterations uses zmm7, zmm9, zmm11, zmm13, zmm15 + * to hold fma result. 
+ * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, + * zmm8, zmm10, zmm12, zmm14 + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -2757,16 +2814,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 3 @@ -2793,16 +2850,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 5 @@ -2828,16 +2885,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 
) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 7 @@ -2860,16 +2917,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b dec(rsi) // i -= 1 @@ -2907,16 +2964,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2941,16 +2998,16 @@ void 
bli_dgemmsup_rv_zen4_asm_8x5 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -2974,16 +3031,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3004,16 +3061,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + 
vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b @@ -3050,16 +3107,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3084,16 +3141,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load 
A // Load A with mask and zero hint @@ -3117,16 +3174,16 @@ void bli_dgemmsup_rv_zen4_asm_8x5 add( r10,rax ) // a += cs_a vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3147,21 +3204,26 @@ void bli_dgemmsup_rv_zen4_asm_8x5 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) + vaddpd(zmm11, zmm10, zmm10) + vaddpd(zmm13, zmm12, zmm12) + vaddpd(zmm15, zmm14, zmm14) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c index fe638c320f..db9ba7cae2 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c @@ -1828,6 +1828,18 @@ void bli_dgemmsup_rv_zen4_asm_16x6 vxorpd(zmm15, zmm15, zmm15) vxorpd(zmm16, zmm16, zmm16) vxorpd(zmm17, zmm17, zmm17) + vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) + vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21, zmm21, zmm21) + vxorpd(zmm22, zmm22, zmm22) + vxorpd(zmm23, zmm23, zmm23) + vxorpd(zmm24, zmm24, zmm24) + vxorpd(zmm25, zmm25, zmm25) + vxorpd(zmm26, zmm26, zmm26) + vxorpd(zmm27, zmm27, zmm27) + vxorpd(zmm28, zmm28, zmm28) + vxorpd(zmm29, zmm29, zmm29) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -1837,6 +1849,23 @@ void bli_dgemmsup_rv_zen4_asm_16x6 sub(imm( 6+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, zmm7, + * zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15 + * zmm16, zmm17 to hold fma result. + * While even iterations uses zmm18, zmm19, zmm20 + * zmm21, zmm22, zmm23, zmm24, zmm25, zmm26, zmm27, zmm28, zmm29 + * to hold fma result. 
+ * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, zmm7, + * zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15, zmm16, zmm17 + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -1877,24 +1906,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 3 @@ -1931,24 +1960,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( 
mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 5 @@ -1985,24 +2014,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 7 
@@ -2034,24 +2063,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b dec(rsi) // i -= 1 @@ -2101,24 +2130,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( 
zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -2153,24 +2182,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -2205,24 +2234,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( 
mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -2252,24 +2281,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( 
mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b @@ -2317,24 +2346,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 ) // load A @@ -2369,24 +2398,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + 
vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 ) // load A @@ -2421,24 +2450,24 @@ void bli_dgemmsup_rv_zen4_asm_16x6 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( 
zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 ) // load A @@ -2468,29 +2497,42 @@ void bli_dgemmsup_rv_zen4_asm_16x6 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) - vfmadd231pd( zmm4,zmm30,zmm7 ) + vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm4,zmm30,zmm19 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) - vfmadd231pd( zmm4,zmm31,zmm9 ) + vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm4,zmm31,zmm21 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) - vfmadd231pd( zmm4,zmm30,zmm11 ) + vfmadd231pd( zmm3,zmm30,zmm22 ) + vfmadd231pd( zmm4,zmm30,zmm23 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) - vfmadd231pd( zmm4,zmm31,zmm13 ) + vfmadd231pd( zmm3,zmm31,zmm24 ) + vfmadd231pd( zmm4,zmm31,zmm25 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) - vfmadd231pd( zmm4,zmm30,zmm15 ) + vfmadd231pd( zmm3,zmm30,zmm26 ) + vfmadd231pd( zmm4,zmm30,zmm27 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) - vfmadd231pd( zmm4,zmm31,zmm17 ) + vfmadd231pd( zmm3,zmm31,zmm28 ) + vfmadd231pd( zmm4,zmm31,zmm29 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. 
+ vaddpd(zmm18, zmm6, zmm6) + vaddpd(zmm19, zmm7, zmm7) + vaddpd(zmm20, zmm8, zmm8) + vaddpd(zmm21, zmm9, zmm9) + vaddpd(zmm22, zmm10, zmm10) + vaddpd(zmm23, zmm11, zmm11) + vaddpd(zmm24, zmm12, zmm12) + vaddpd(zmm25, zmm13, zmm13) + vaddpd(zmm26, zmm14, zmm14) + vaddpd(zmm27, zmm15, zmm15) + vaddpd(zmm28, zmm16, zmm16) + vaddpd(zmm29, zmm17, zmm17) + label(.TAIL) mov(var(k_left), rsi) // i = k_left @@ -2924,11 +2966,17 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -2938,6 +2986,21 @@ void bli_dgemmsup_rv_zen4_asm_8x6 sub(imm( 6+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, + * zmm8, zmm10, zmm12, zmm14, zmm16 to hold fma result. + * While even iterations uses zmm7, zmm9, zmm11 + * zmm13, zmm15, zmm17 to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, + * zmm8, zmm10, zmm12, zmm14, zmm16. 
+ */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -2969,18 +3032,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 3 @@ -3009,18 +3072,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 5 @@ -3049,18 +3112,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( 
mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 7 @@ -3085,18 +3148,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b dec(rsi) // i -= 1 @@ -3136,18 +3199,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( 
zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3174,18 +3237,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3212,18 +3275,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - 
vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3246,18 +3309,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b @@ -3296,18 +3359,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) 
) // load A // Load A with mask and zero hint @@ -3334,18 +3397,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3372,18 +3435,18 @@ void bli_dgemmsup_rv_zen4_asm_8x6 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3406,23 +3469,29 @@ void bli_dgemmsup_rv_zen4_asm_8x6 // ---------------------------------- iteration 8 vbroadcastsd( 
mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) + vaddpd(zmm11, zmm10, zmm10) + vaddpd(zmm13, zmm12, zmm12) + vaddpd(zmm15, zmm14, zmm14) + vaddpd(zmm17, zmm16, zmm16) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c index 610871ab2e..9e4194c118 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c @@ -3128,12 +3128,19 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each iteration of K, @@ -3143,6 +3150,21 @@ void bli_dgemmsup_rv_zen4_asm_8x7 
sub(imm( 7+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, + * zmm8, zmm10, zmm12, zmm14, zmm16, zmm18 to hold fma result. + * While even iterations uses zmm7, zmm9, zmm11 + * zmm13, zmm15, zmm17, zmm19 to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, + * zmm8, zmm10, zmm12, zmm14, zmm16, zmm18 + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -3176,20 +3198,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 3 @@ -3220,20 +3242,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( 
zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 5 @@ -3264,20 +3286,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 7 @@ -3305,20 +3327,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) 
vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b dec(rsi) // i -= 1 @@ -3360,20 +3382,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3402,20 +3424,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + 
vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3444,20 +3466,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3483,20 +3505,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( 
mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b @@ -3537,20 +3559,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3579,20 +3601,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( 
zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3621,20 +3643,20 @@ void bli_dgemmsup_rv_zen4_asm_8x7 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3660,25 +3682,32 @@ void bli_dgemmsup_rv_zen4_asm_8x7 // ---------------------------------- iteration 8 vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) 
vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) + vaddpd(zmm11, zmm10, zmm10) + vaddpd(zmm13, zmm12, zmm12) + vaddpd(zmm15, zmm14, zmm14) + vaddpd(zmm17, zmm16, zmm16) + vaddpd(zmm19, zmm18, zmm18) label(.TAIL) mov(var(k_left), rsi) // i = k_left diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c index 8cf46b43c5..065cbd5bb6 100644 --- a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c +++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c @@ -3284,13 +3284,21 @@ void bli_dgemmsup_rv_zen4_asm_8x8 // zero out all accumulation registers vxorpd(zmm6, zmm6, zmm6) + vxorpd(zmm7, zmm7, zmm7) vxorpd(zmm8, zmm8, zmm8) + vxorpd(zmm9, zmm9, zmm9) vxorpd(zmm10, zmm10, zmm10) + vxorpd(zmm11, zmm11, zmm11) vxorpd(zmm12, zmm12, zmm12) + vxorpd(zmm13, zmm13, zmm13) vxorpd(zmm14, zmm14, zmm14) + vxorpd(zmm15, zmm15, zmm15) vxorpd(zmm16, zmm16, zmm16) + vxorpd(zmm17, zmm17, zmm17) vxorpd(zmm18, zmm18, zmm18) + vxorpd(zmm19, zmm19, zmm19) vxorpd(zmm20, zmm20, zmm20) + vxorpd(zmm21, zmm21, zmm21) // K is unrolled by 8 to facilitate prefetch of B // Assuming B to be col-stored, for each 
iteration of K, @@ -3300,6 +3308,21 @@ void bli_dgemmsup_rv_zen4_asm_8x8 sub(imm( 8+TAIL_NITER), rsi) // i -= NR + TAIL_NITER jle(.PREFETCHLOOP) // jump if i <= 0 + /** + * This edge kernel uses two separate vector register bank + * to hold fma result. + * Once the K loop is completed these two vector register banks + * are added together and final result is available in one + * register bank. + * Here odd iterations uses vector register zmm6, + * zmm8, zmm10, zmm12, zmm14, zmm16, zmm18, zmm20 to hold fma result. + * While even iterations uses zmm7, zmm9, zmm11 + * zmm13, zmm15, zmm17, zmm19, zmm21 to hold fma result. + * At the end of K loop, these two banks are added together and + * final result is available in vector register zmm6, + * zmm8, zmm10, zmm12, zmm14, zmm16, zmm18, zmm20 + */ + label(.LOOP1) // ---------------------------------- iteration 1 @@ -3335,22 +3358,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 3 @@ -3383,22 +3406,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r11,r13,1) ) // 
prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 5 @@ -3431,22 +3454,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 7 @@ 
-3477,22 +3500,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r15,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer to b_next += 8*rs_b dec(rsi) // i -= 1 @@ -3536,22 +3559,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( 
r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3582,22 +3605,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3628,22 +3651,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + 
vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3672,22 +3695,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r15,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) lea(mem(rdx, rdi, 1), rdx) // C += cs_c lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // second pointer of b_next += 8*rs_b @@ -3730,22 +3753,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r11,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - 
vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 3 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3776,22 +3799,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r11,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 5 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3822,22 +3845,22 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r15,r9,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 
) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) // ---------------------------------- iteration 7 vmovupd( mem(rax),zmm3 MASK_KZ(2) ) // load A // Load A with mask and zero hint @@ -3866,27 +3889,35 @@ void bli_dgemmsup_rv_zen4_asm_8x8 prefetch( 0,mem(r15,r13,1) ) // prefetch B vbroadcastsd( mem(rbx),zmm30 ) vbroadcastsd( mem(rbx,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm6 ) + vfmadd231pd( zmm3,zmm30,zmm7 ) vbroadcastsd( mem(rbx,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm8 ) + vfmadd231pd( zmm3,zmm31,zmm9 ) vbroadcastsd( mem(rbx,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm10 ) + vfmadd231pd( zmm3,zmm30,zmm11 ) vbroadcastsd( mem(r12),zmm30 ) add( r8,rbx ) // b += rs_b - vfmadd231pd( zmm3,zmm31,zmm12 ) + vfmadd231pd( zmm3,zmm31,zmm13 ) vbroadcastsd( mem(r12,r9,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm14 ) + vfmadd231pd( zmm3,zmm30,zmm15 ) vbroadcastsd( mem(r12,r9,2),zmm30 ) - vfmadd231pd( zmm3,zmm31,zmm16 ) + vfmadd231pd( zmm3,zmm31,zmm17 ) vbroadcastsd( mem(r12,r13,1),zmm31 ) - vfmadd231pd( zmm3,zmm30,zmm18 ) + vfmadd231pd( zmm3,zmm30,zmm19 ) add( r8,r12 ) // second pointer of b += rs_b - vfmadd231pd( zmm3,zmm31,zmm20 ) + vfmadd231pd( zmm3,zmm31,zmm21 ) 
lea(mem(r11,r8,8), r11) // b_next += 8*rs_b lea(mem(r15,r8,8), r15) // Second pointer of b_next += 8*rs_b dec(rsi) // i -= 1 jnz(.LOOP3) // iterate again if i != 0. + vaddpd(zmm7, zmm6, zmm6) + vaddpd(zmm9, zmm8, zmm8) + vaddpd(zmm11, zmm10, zmm10) + vaddpd(zmm13, zmm12, zmm12) + vaddpd(zmm15, zmm14, zmm14) + vaddpd(zmm17, zmm16, zmm16) + vaddpd(zmm19, zmm18, zmm18) + vaddpd(zmm21, zmm20, zmm20) label(.TAIL) mov(var(k_left), rsi) // i = k_left From 7a4f84fbac706c335c497c38f2fbd4c56ef5db67 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Fri, 22 Sep 2023 04:58:27 -0500 Subject: [PATCH 162/226] Optimized dgemm for tiny input sizes. - This commit focuses on enhancing the performance of dgemm for matrices with very small dimensions. - The bli_dgemm_tiny function re-uses dgemm sup kernels, bypassing the conventional SUP framework code path. The SUP framework code path requires the creation and initialization of blis objects, accessing all the needed meta-information from objects, and querying contexts, which adds a performance penalty while computing for matrices with very small dimensions. - To avoid such a performance penalty, the bli_dgemm_tiny function implements lightweight support code so that it can re-use dgemm SUP kernels in such a way that it directly operates on input buffers. It avoids the framework overhead of creating and initializing blis objects, context initialization, and accessing other large framework data structures. - The bli_dgemm_tiny function checks for a threshold condition to match before picking the kernel. For zen, zen2, zen3 architecture the tiny kernel is invoked for any shape as long as m <= 8 and k <= 1500, or m <= 1000 and n <= 24 and 4 <= k <= 1500. For zen4, the tiny kernel is invoked as long as the m, n, k dimensions are less than 1500. - The bli_dgemm_tiny function supports single threaded computation as of now.
AMD-Internal: [CPUPL-3574] Change-Id: Ife66d35b51add4fccbeebd29911e0c957e59a05f --- frame/compat/bla_gemm_amd.c | 37 +- .../testsuite/level3/gemm/dgemm_generic.cpp | 162 +++++- kernels/zen/3/CMakeLists.txt | 1 + kernels/zen/3/bli_gemm_tiny.c | 485 ++++++++++++++++++ kernels/zen/bli_kernels_zen.h | 14 + 5 files changed, 696 insertions(+), 3 deletions(-) create mode 100644 kernels/zen/3/bli_gemm_tiny.c diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index decd7e1aa5..87b5c107a7 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -468,8 +468,14 @@ void dgemm_blis_impl return; } - /* If alpha is zero scale C by beta and return early. */ - if( PASTEMAC(d,eq0)( *alpha )) + /** + * If alpha is zero or k is zero scale C by beta and return early. + * Since k is zero, the only operation to be done is scaling of C by beta. + * Scalm function checks for beta = zero internally, if it is zero it invokes + setm kernel, otherwise it goes ahead and do the scaling + of C matrix. + */ + if( (PASTEMAC(d,eq0)( *alpha )) || (*k == 0) ) { bli_convert_blas_dim1(*m, m0); bli_convert_blas_dim1(*n, n0); @@ -652,6 +658,33 @@ void dgemm_blis_impl return; } + /** + *Early check for tiny sizes. + *if inputs are in range of tiny gemm kernel, + *we avoid creating and initalizing objects and directly + *operate on memory buffers. + *Function return failure in case of input matrix sizes are + *beyond threshold(larger inputs). + *It also returns failure for multi-threaded computation as it + *supports single threaded computation as of now. 
+ */ + err_t tiny_ret = bli_dgemm_tiny + ( + blis_transa, + blis_transb, + m0, n0, k0, + alpha, + a, rs_a, cs_a, + b, rs_b, cs_b, + beta, + c, rs_c, cs_c + ); + + if(tiny_ret == BLIS_SUCCESS) + { + return; + } + const num_t dt = BLIS_DOUBLE; obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; diff --git a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp index 4c76593b7a..b74f63aea2 100644 --- a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp @@ -147,4 +147,164 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::DGemmTestPrint() - ); \ No newline at end of file + ); + + +// Tests 5 loops +INSTANTIATE_TEST_SUITE_P( + tiny_dgemm_kernel, + DGemmTest, + ::testing::Combine( + // No condition based on storage scheme of matrices + ::testing::Values('c'), // storage format + // No conditions based on trans of matrices + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + + ::testing::Values(13, 25, 48, 60, 256, 512, 1000), // m + + ::testing::Values(8, 48, 72, 144, 237), // n + + ::testing::Values(16, 24, 48, 64, 128, 557), // k + // No condition based on alpha + ::testing::Values( -1.0), // alpha + // No condition based on betaa + ::testing::Values(-1.0), // beta + ::testing::Values(0,3), // increment to the leading dim of a + ::testing::Values(0,3), // increment to the leading dim of b + ::testing::Values(0,3) // increment to the leading dim of c + ), + ::DGemmTestPrint() + ); + +//zero beta test case +INSTANTIATE_TEST_SUITE_P( + zero_beta, + DGemmTest, + ::testing::Combine( + // No condition based on storage scheme of matrices + ::testing::Values('c'), // storage format + // No conditions based on trans of matrices + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + + ::testing::Values(13, 25, 48, 60, 256, 512, 1000), // m + + 
::testing::Values(8, 48, 72, 144, 237), // n + + ::testing::Values(16, 24, 48, 64, 128, 557), // k + + ::testing::Values( -1.0), // alpha + ::testing::Values(0.0), // beta + ::testing::Values(0,3), // increment to the leading dim of a + ::testing::Values(0,3), // increment to the leading dim of b + ::testing::Values(0,3) // increment to the leading dim of c + ), + ::DGemmTestPrint() + ); + +//zero alpha test case +INSTANTIATE_TEST_SUITE_P( + zero_alpha, + DGemmTest, + ::testing::Combine( + // No condition based on storage scheme of matrices + ::testing::Values('c'), // storage format + // No conditions based on trans of matrices + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + + ::testing::Values(13, 25, 48, 60, 256, 512, 1000), // m + + ::testing::Values(8, 48, 72, 144, 237), // n + + ::testing::Values(16, 24, 48, 64, 128, 557), // k + + ::testing::Values( 0.0), // alpha + ::testing::Values(-1.0), // beta + ::testing::Values(0,3), // increment to the leading dim of a + ::testing::Values(0,3), // increment to the leading dim of b + ::testing::Values(0,3) // increment to the leading dim of c + ), + ::DGemmTestPrint() + ); + +//unit beta test case +INSTANTIATE_TEST_SUITE_P( + unit_beta, + DGemmTest, + ::testing::Combine( + // No condition based on storage scheme of matrices + ::testing::Values('c'), // storage format + // No conditions based on trans of matrices + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + + ::testing::Values(13, 25, 48, 60, 256, 512, 1000), // m + + ::testing::Values(8, 48, 72, 144, 237), // n + + ::testing::Values(16, 24, 48, 64, 128, 557), // k + + ::testing::Values( -1.0), // alpha + ::testing::Values(1.0), // beta + ::testing::Values(0,3), // increment to the leading dim of a + ::testing::Values(0,3), // increment to the leading dim of b + ::testing::Values(0,3) // increment to the leading dim of c + ), + ::DGemmTestPrint() + ); + +// Covers all corner cases of tiny dgemm 
kernel +INSTANTIATE_TEST_SUITE_P( + tiny_edge_kernels, + DGemmTest, + ::testing::Combine( + // To test col storage of C + // Storage of A and B is handled by packing + ::testing::Values('c'), // storage format + // Tests scalar code of 8xk and 6xk pack kernels for both storage formats + ::testing::Values('n','t'), // transa + ::testing::Values('n','t'), // transb + + ::testing::Range(gtint_t(1), gtint_t(23), 1), // m + ::testing::Range(gtint_t(1), gtint_t(7), 1), // n + + ::testing::Values(24), // k + // No condition based on alpha + ::testing::Values( -1.0, 1.0), // alpha + // checks for beta-zero and beta non-zero cases + ::testing::Values(0.0, 1.0, -1.0), // beta + ::testing::Values(23), // increment to the leading dim of a + ::testing::Values(23), // increment to the leading dim of b + ::testing::Values(23) // increment to the leading dim of c + ), + ::DGemmTestPrint() + ); + + +//m = 0, n = 0 k = 0 testcase +INSTANTIATE_TEST_SUITE_P( + mnkzero, + DGemmTest, + ::testing::Combine( + // No condition based on storage scheme of matrices + ::testing::Values('c'), // storage format + // No conditions based on trans of matrices + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + + ::testing::Values(0, 8, 24), // m + + ::testing::Values(0, 6, 8), // n + + ::testing::Values(3), // k + + ::testing::Values( -1.0), // alpha + ::testing::Values(1.0), // beta + ::testing::Values(0,3), // increment to the leading dim of a + ::testing::Values(0,3), // increment to the leading dim of b + ::testing::Values(0,3) // increment to the leading dim of c + ), + ::DGemmTestPrint() + ); diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt index f8035d96b7..3c99cbafb8 100644 --- a/kernels/zen/3/CMakeLists.txt +++ b/kernels/zen/3/CMakeLists.txt @@ -3,6 +3,7 @@ add_library(zen_3 OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_small.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_tiny.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small.c 
${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_avx2_k1.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_avx2_k1.c diff --git a/kernels/zen/3/bli_gemm_tiny.c b/kernels/zen/3/bli_gemm_tiny.c new file mode 100644 index 0000000000..10f37d714a --- /dev/null +++ b/kernels/zen/3/bli_gemm_tiny.c @@ -0,0 +1,485 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +#include "immintrin.h" +#include "xmmintrin.h" +#include "blis.h" + +static dgemmsup_ker_ft kern_fp[] = +{ + bli_dgemmsup_rv_haswell_asm_6x8m, + bli_dgemmsup_rd_haswell_asm_6x8m, + bli_dgemmsup_rv_haswell_asm_6x8m, + bli_dgemmsup_rv_haswell_asm_6x8n, + bli_dgemmsup_rv_haswell_asm_6x8m, + bli_dgemmsup_rd_haswell_asm_6x8n, + bli_dgemmsup_rv_haswell_asm_6x8n, + bli_dgemmsup_rv_haswell_asm_6x8n +}; + +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) +static err_t bli_dgemm_tiny_24x8_kernel + ( + conj_t conja, + conj_t conjb, + trans_t transa, + trans_t transb, + dim_t m, + dim_t n, + dim_t k, + const double* alpha, + const double* a, const inc_t rs_a0, const inc_t cs_a0, + const double* b, const inc_t rs_b0, const inc_t cs_b0, + const double* beta, + double* c, const inc_t rs_c0, const inc_t cs_c0 + ) +{ + double *a_local = (double *)a; + double *b_local = (double *)b; + double *c_local = (double *)c; + guint_t cs_a = cs_a0; + guint_t rs_a = rs_a0; + guint_t cs_b = cs_b0; + guint_t rs_b = rs_b0; + guint_t cs_c = cs_c0; + guint_t rs_c = rs_c0; + inc_t rs_a_local = rs_a0; + inc_t cs_a_local = cs_a0; + inc_t rs_b_local = rs_b0; + inc_t cs_b_local = cs_b0; + inc_t rs_c_local = rs_c0; + inc_t cs_c_local = cs_c0; + + gint_t M = m; + gint_t N = n; + gint_t K = k; + + inc_t storage = 0; + if(transb == BLIS_NO_TRANSPOSE || transb == BLIS_CONJ_NO_TRANSPOSE) + { + storage = 1 * (rs_b == 1); //1st bit + } + else if(transb == BLIS_TRANSPOSE || transb == BLIS_CONJ_TRANSPOSE) + { + storage = 1 * (cs_b == 1); //1st bit + rs_b = cs_b0; + cs_b = rs_b0; + } + + if(transa == BLIS_NO_TRANSPOSE || transa == BLIS_CONJ_NO_TRANSPOSE) + { + storage |= ((1 * (rs_a == 1)) << 1); //2nd bit + } + else if(transa == BLIS_TRANSPOSE || transa == BLIS_CONJ_TRANSPOSE) + { + storage |= ((1 * (cs_a == 1)) << 1); //2nd bit + rs_a = cs_a0; + cs_a = rs_a0; + } + + storage |= ((1 * (rs_c == 1)) << 2); //3rd bit + + stor3_t stor_id = (stor3_t) storage; + + 
//Early return, since we do not support dot product gemm kernels. + if(stor_id == BLIS_CRC || stor_id == BLIS_RRC) + { + return BLIS_FAILURE; + } + + const bool is_rrr_rrc_rcr_crr = ( + stor_id == BLIS_RRR || + stor_id == BLIS_RRC || + stor_id == BLIS_RCR || + stor_id == BLIS_CRR + ); + + const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; + const bool row_pref = false; + const bool col_pref = !row_pref; + + const bool is_primary = ( row_pref && is_rrr_rrc_rcr_crr ) || + ( col_pref && is_rcc_crc_ccr_ccc ); + + /** + * Based on matrix storage scheme and kernel preference, + * decision is made here that whether it is primary storage + * scheme or not. + */ + if ( !is_primary ) + { + /** + * For non-primary storage scheme, we configure parameters, + * for kernel re-use. + */ + a_local = (double *)b; + b_local = (double *)a; + rs_a_local = cs_b; + cs_a_local = rs_b; + rs_b_local = cs_a; + cs_b_local = rs_a; + rs_c_local = cs_c0; + cs_c_local = rs_c0; + M = n; + N = m; + + rs_a = rs_a_local; + cs_a = cs_a_local; + cs_c = cs_c_local; + rs_b = rs_b_local; + cs_b = cs_b_local; + rs_c = rs_c_local; + } + + double *A = a_local; + double *B = b_local; + double *C = c_local; + double *alpha_cast, *beta_cast; + alpha_cast = (double *)alpha; + beta_cast = (double *)beta; + /** + * Set blocking and micro tile parameters before computing + */ + dim_t NC = 4080; + dim_t MC = 144; + dim_t KC = 480; + dim_t MR_ = 24; + dim_t NR_ = 8; + /** + * NC and MC must be in multiple of MR_ and NR_. + * if not return early. + */ + if( (NC % NR_ != 0) || (MC % MR_ != 0) ) + { + return BLIS_FAILURE; + } + dim_t n_part_rem = N % NC; + dim_t n_rem = N % NR_; + dim_t m_part_rem = M % MC; + dim_t k_rem = K % KC; + dim_t n_part = 0; + dim_t n_cur = 0; + dim_t m_cur = 0; + dim_t k_cur = 0; + auxinfo_t aux; + inc_t ps_a_use = (MR_ * rs_a); + bli_auxinfo_set_ps_a( ps_a_use, &aux ); + for ( dim_t n_iter = 0; n_iter < N; n_iter += NC ) + { + n_part = (NC <= (N - n_iter) ? 
NC : n_part_rem); + for ( dim_t k_iter = 0; k_iter < K; k_iter += KC ) + { + k_cur = (KC <= (K - k_iter) ? KC : k_rem); + for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) + { + m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); + for ( dim_t jr_iter = 0; jr_iter < n_part; jr_iter += NR_ ) + { + n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); + bli_dgemmsup_rv_zen4_asm_24x8m(conja, conjb, m_cur, n_cur, k_cur, + alpha_cast, A + m_iter * rs_a, + rs_a, cs_a, + B + jr_iter * cs_b, rs_b, cs_b, + beta_cast, + (C + jr_iter * cs_c + m_iter * rs_c), rs_c, cs_c, &aux, NULL); + } + } + } + } + + return BLIS_SUCCESS; +} +#endif + +static err_t bli_dgemm_tiny_6x8_kernel + ( + conj_t conja, + conj_t conjb, + trans_t transa, + trans_t transb, + dim_t m, + dim_t n, + dim_t k, + const double* alpha, + const double* a, const inc_t rs_a0, const inc_t cs_a0, + const double* b, const inc_t rs_b0, const inc_t cs_b0, + const double* beta, + double* c, const inc_t rs_c0, const inc_t cs_c0 + ) +{ + double *a_local = (double *)a; + double *b_local = (double *)b; + double *c_local = (double *)c; + guint_t cs_a = cs_a0; + guint_t rs_a = rs_a0; + guint_t cs_b = cs_b0; + guint_t rs_b = rs_b0; + guint_t cs_c = cs_c0; + guint_t rs_c = rs_c0; + inc_t rs_a_local = rs_a0; + inc_t cs_a_local = cs_a0; + inc_t rs_b_local = rs_b0; + inc_t cs_b_local = cs_b0; + inc_t rs_c_local = rs_c0; + inc_t cs_c_local = cs_c0; + + gint_t M = m; + gint_t N = n; + gint_t K = k; + + inc_t storage = 0; + if(transb == BLIS_NO_TRANSPOSE || transb == BLIS_CONJ_NO_TRANSPOSE) + { + storage = 1 * (rs_b == 1); //1st bit + } + else if(transb == BLIS_TRANSPOSE || transb == BLIS_CONJ_TRANSPOSE) + { + storage = 1 * (cs_b == 1); //1st bit + rs_b = cs_b0; + cs_b = rs_b0; + } + + if(transa == BLIS_NO_TRANSPOSE || transa == BLIS_CONJ_NO_TRANSPOSE) + { + storage |= ((1 * (rs_a == 1)) << 1); //2nd bit + } + else if(transa == BLIS_TRANSPOSE || transa == BLIS_CONJ_TRANSPOSE) + { + storage |= ((1 * (cs_a == 1)) << 1); //2nd bit + rs_a = cs_a0; + 
cs_a = rs_a0; + } + + storage |= ((1 * (rs_c == 1)) << 2); //3rd bit + + /** + * typecast storage into stor_idd, + * stores default storage scheme before we optimze + * for respective gemm kernel. */ + stor3_t stor_idd = (stor3_t) storage; + stor3_t stor_id = 0; + + stor_id = stor_idd; + + const bool is_rrr_rrc_rcr_crr = ( + stor_idd == BLIS_RRR || + stor_idd == BLIS_RRC || + stor_idd == BLIS_RCR || + stor_idd == BLIS_CRR + ); + + const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; + const bool row_pref = true; + const bool col_pref = !row_pref; + + /** + * Based on matrix storage scheme and kernel preference, + * decision is made here that whether it is primary storage + * scheme or not. + */ + const bool is_primary = ( row_pref && is_rrr_rrc_rcr_crr ) || + ( col_pref && is_rcc_crc_ccr_ccc ); + + /** + * For non-primary storage scheme, we configure parameters, + * for kernel re-use. + */ + if ( !is_primary ) + { + a_local = (double *)b; + b_local = (double *)a; + rs_a_local = cs_b; + cs_a_local = rs_b; + rs_b_local = cs_a; + cs_b_local = rs_a; + rs_c_local = cs_c0; + cs_c_local = rs_c0; + M = n; + N = m; + + stor_id = bli_stor3_trans(stor_idd); + + rs_a = rs_a_local; + cs_a = cs_a_local; + cs_c = cs_c_local; + rs_b = rs_b_local; + cs_b = cs_b_local; + rs_c = rs_c_local; + } + + double *A = a_local; + double *B = b_local; + double *C = c_local; + double *alpha_cast, *beta_cast; + alpha_cast = (double *)alpha; + beta_cast = (double *)beta; + /** + * Set blocking and micro tile parameters before computing + */ + dim_t NC = 4080; + dim_t MC = 72; + dim_t KC = 256; + dim_t MR_ = 6; + dim_t NR_ = 8; + /** + * NC and MC must be in multiple of MR_ and NR_. + * if not return early. 
+ */ + if( (NC % NR_ != 0) || (MC % MR_ != 0) ) + { + return BLIS_FAILURE; + } + dim_t n_part_rem = N % NC; + dim_t n_rem = N % NR_; + dim_t m_part_rem = M % MC; + dim_t k_rem = K % KC; + dim_t n_part = 0; + dim_t n_cur = 0; + dim_t m_cur = 0; + dim_t k_cur = 0; + auxinfo_t aux; + inc_t ps_a_use = (MR_ * rs_a); + bli_auxinfo_set_ps_a( ps_a_use, &aux ); + for ( dim_t n_iter = 0; n_iter < N; n_iter += NC ) + { + n_part = (NC <= (N - n_iter) ? NC : n_part_rem); + for ( dim_t k_iter = 0; k_iter < K; k_iter += KC ) + { + k_cur = (KC <= (K - k_iter) ? KC : k_rem); + for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) + { + m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); + for ( dim_t jr_iter = 0; jr_iter < n_part; jr_iter += NR_ ) + { + n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); + kern_fp[stor_id](conja, conjb, m_cur, n_cur, k_cur, + alpha_cast, A + m_iter * rs_a, + rs_a, cs_a, + B + jr_iter * cs_b, rs_b, cs_b, + beta_cast, + (C + jr_iter * cs_c + m_iter * rs_c), rs_c, cs_c, &aux, NULL); + } + } + } + } + + return BLIS_SUCCESS; +} + +static arch_t get_arch_id(void) +{ + static arch_t arch_id = BLIS_NUM_ARCHS + 1; + if(arch_id == BLIS_NUM_ARCHS + 1) + { + arch_id = bli_cpuid_query_id(); + } + + return arch_id; +} + +err_t bli_dgemm_tiny +( + trans_t transa, + trans_t transb, + dim_t m, + dim_t n, + dim_t k, + const double* alpha, + const double* a, const inc_t rs_a0, const inc_t cs_a0, + const double* b, const inc_t rs_b0, const inc_t cs_b0, + const double* beta, + double* c, const inc_t rs_c0, const inc_t cs_c0 +) +{ + arch_t arch_id = get_arch_id(); + if(FALSE == bli_thread_get_is_parallel()) + { + if( + BLIS_ARCH_ZEN == arch_id || + BLIS_ARCH_ZEN2 == arch_id || + BLIS_ARCH_ZEN3 == arch_id + ) + { + if( ( (m <= 8) || ( (m <= 1000) && (n <= 24) && (k >= 4) ) ) && (k <= 1500) ) + { + return bli_dgemm_tiny_6x8_kernel + ( + 1 * (transa == BLIS_CONJ_NO_TRANSPOSE), + 1 * (transb == BLIS_CONJ_NO_TRANSPOSE), + transa, + transb, + m, + n, + k, + alpha, + a, rs_a0, cs_a0, + b, 
rs_b0, cs_b0, + beta, + c, rs_c0, cs_c0 + ); + } + } +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) + else if(BLIS_ARCH_ZEN4 == arch_id) + { + if(((m == n) && (m < 400) && (k < 1000)) || + ( (m != n) && (( ((m + n -k) < 1500) && + ((m + k-n) < 1500) && ((n + k-m) < 1500) ) || + ((n <= 100) && (k <=100))))) + { + return bli_dgemm_tiny_24x8_kernel + ( + 1 * (transa == BLIS_CONJ_NO_TRANSPOSE), + 1 * (transb == BLIS_CONJ_NO_TRANSPOSE), + transa, + transb, + m, + n, + k, + alpha, + a, rs_a0, cs_a0, + b, rs_b0, cs_b0, + beta, + c, rs_c0, cs_c0 + ); + } + } +#endif + else + { + ;//Return failure + } + } + + return BLIS_FAILURE; +} diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 78831e715c..1266d33ea6 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -260,6 +260,20 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) +err_t bli_dgemm_tiny +( + trans_t transa, + trans_t transb, + dim_t m, + dim_t n, + dim_t k, + const double* alpha, + const double* a, const inc_t rs_a0, const inc_t cs_a0, + const double* b, const inc_t rs_b0, const inc_t cs_b0, + const double* beta, + double* c, const inc_t rs_c0, const inc_t cs_c0 +); + err_t bli_dgemm_small ( obj_t* alpha, From 81161066e53d42788e678fe4998310935923040d Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Wed, 27 Sep 2023 11:01:16 +0530 Subject: [PATCH 163/226] Multithreading the DNRM2 and DZNRM2 API - Updated the bli_dnormfv_unb_var1( ... ) and bli_znormfv_unb_var1( ... ) function to support multithreaded calls to the respective computational kernels, if and when the OpenMP support is enabled. - Added the logic to distribute the job among the threads such that only one thread has to deal with fringe case(if required). 
The remaining threads will execute only the AVX-2 code section of the computational kernel. - Added reduction logic post parallel region, to handle overflow and/or underflow conditions as per the mandate. The reduction for both the APIs involves calling the vectorized kernel of dnormfv operation. - Added changes to the kernel to have the scaling factors and thresholds prebroadcasted onto the registers, instead of broadcasting every time on a need basis. - Non-unit stride cases are packed to be redirected to the vectorized implementation. In case the packing fails, the input is handled by the fringe case loop in the kernel. - Added the SSE implementation in bli_dnorm2fv_unb_var1_avx2( ... ) and bli_dznorm2fv_unb_var1_avx2( ... ) kernels, to handle fringe cases of size = 2 ( and ) size = 1 or non-unit strides respectively. AMD-Internal: [CPUPL-3916][CPUPL-3633] Change-Id: Ib9131568d4c048b7e5f2b82526145622a5e8f93d --- frame/thread/bli_thread.c | 88 ++ frame/thread/bli_thread.h | 11 + frame/util/bli_util_unb_var1.c | 500 +++++++++- .../util/nrm2/dnrm2_extreme_values.cpp | 85 +- .../testsuite/util/nrm2/dnrm2_generic.cpp | 103 +- .../util/nrm2/dznrm2_extreme_values.cpp | 84 +- .../testsuite/util/nrm2/dznrm2_generic.cpp | 106 ++- .../testsuite/util/nrm2/nrm2_corner_cases.cpp | 23 + .../util/nrm2/nrm2_underflow_overflow.cpp | 93 ++ gtestsuite/testsuite/util/nrm2/test_nrm2.h | 4 +- kernels/zen/1/bli_norm2_zen_int.c | 892 ++++++++---------- 11 files changed, 1457 insertions(+), 532 deletions(-) diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index b5e9cfed73..0e21ab0f6a 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -2119,3 +2119,91 @@ void bli_thread_vector_partition } } } + +/* + Functionality : + -------------- + This function calculates the amount of work the calling thread is supposed + to perform on a vector, in case of the norm api.
+ + Function signature + ------------------- + + This function takes the following input: + + * n_elem - Number of element in the vector + * t_count - Number of threads in the group + * start - Vector start index (where the thread should start its processing) + * compute_len - Size of the chunk it needs to process + * block_size - The factor by which the size should be a multiple for the AVX-2 + code-section alone to be executed in the kernel. + * incx - Increment of the vector + * thread_id - ID of the thread + + Exception + ---------- + + None +*/ +void bli_normfv_thread_partition + ( + dim_t n_elem, + dim_t t_count, + dim_t* start, + dim_t* compute_len, + dim_t block_size, + dim_t incx, + dim_t thread_id + ) +{ + dim_t job_per_thread = n_elem / t_count; + dim_t job_rem = n_elem % t_count; + dim_t job_rem_per_thread = job_per_thread % block_size; + dim_t thread_lim_excess = 0; + + // Code-section to make job_per_thread as its nearset multiple of block_size + if( job_rem_per_thread ) + { + job_rem += t_count * job_rem_per_thread; + job_per_thread -= job_rem_per_thread; + } + + // Limit for the thread index, until which each thread gets block_size more elements + thread_lim_excess = job_rem / block_size; + + // Add block_size to a thread's job size if its thread_id is within the thread limit + if ( thread_id < thread_lim_excess ) + { + job_per_thread += block_size; + *start = thread_id * job_per_thread * incx; + } + + // The last thread that has to deal with fringe cases, if they are present + else if ( thread_id == ( t_count - 1 ) ) + { + *start = ( thread_lim_excess * block_size + thread_id * job_per_thread ) * incx; + job_per_thread += job_rem % block_size; + } + + // Job allocation to the remaining threads + else + { + *start = ( thread_lim_excess * block_size + thread_id * job_per_thread ) * incx; + } + + /* + As an example, let us consider the case where n_elem is 57 and t_count is 4. + Let us take block_size to be 4. 
+ + Thread 0 - 16 + Thread 1 - 16 + Thread 2 - 12 + Thread 3 - 13 + + Here, only thread-3(last thread) has to deal with fringe cases. Every other thread has their + job size being the nearest upper/lower multiple of 4(block_size). Thus, the maximum + job difference between any two threads is 4(block_size). + */ + + *compute_len = job_per_thread; +} diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 2cbee8ef87..ea41dbeecf 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -203,6 +203,17 @@ void bli_thread_vector_partition dim_t thread_id ); +void bli_normfv_thread_partition + ( + dim_t n_elem, + dim_t t_count, + dim_t* start, + dim_t* compute_len, + dim_t block_size, + dim_t incx, + dim_t thread_id + ); + // ----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 1c841fd41d..9913a94ee6 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -372,15 +372,48 @@ void bli_znormfv_unb_var1 rntm_t* rntm ) { + /* + Declaring a function pointer to point to the supported vectorized kernels. + Based on the arch_id support, the appropriate function is set to the function + pointer. Deployment happens post the switch cases. + + NOTE : A separate function pointer type is set to NULL, which will be used + only for reduction purpose. This is because the norm(per thread) + is of type double, and thus requires call to the vectorized + kernel for dnormfv operation. 
+ */ + void ( *norm_fp )( dim_t, dcomplex*, inc_t, double*, cntx_t* ) = NULL; + void ( *reduce_fp )( dim_t, double*, inc_t, double*, cntx_t* ) = NULL; + + dcomplex *x_buf = x; + dim_t nt_ideal = -1; arch_t id = bli_arch_query_id(); - switch (id) + switch ( id ) { case BLIS_ARCH_ZEN4: case BLIS_ARCH_ZEN3: case BLIS_ARCH_ZEN2: case BLIS_ARCH_ZEN: #ifdef BLIS_KERNELS_ZEN - bli_dznorm2fv_unb_var1_avx2( n, x, incx, norm, cntx ); + + norm_fp = bli_dznorm2fv_unb_var1_avx2; + reduce_fp = bli_dnorm2fv_unb_var1_avx2; + + // Setting the ideal number of threads if support is enabled + #if defined( BLIS_ENABLE_OPENMP ) && defined( AOCL_DYNAMIC ) + if ( n < 2000 ) + nt_ideal = 1; + else if ( n < 6500 ) + nt_ideal = 4; + else if ( n < 71000 ) + nt_ideal = 8; + else if ( n < 200000 ) + nt_ideal = 16; + else if ( n < 1530000 ) + nt_ideal = 32; + + #endif + break; #endif default:; @@ -414,6 +447,219 @@ void bli_znormfv_unb_var1 // Store the final value to the output variable. bli_dcopys( sqrt_sumsq, *norm ); } + + /* + If the function signature to vectorized kernel was not set, + the default case would have been performed. Thus exit early. + + NOTE : Both the pointers are used here to avoid compilation warning. + */ + if ( norm_fp == NULL && reduce_fp == NULL ) + return; + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } + + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_pba_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . 
+ */ + + mem_t mem_buf_X = { 0 }; + inc_t incx_buf = incx; + dim_t nt; + + nt = bli_rntm_num_threads( &rntm_l ); + + // nt is less than 1 if BLIS was configured with default settings for parallelism + nt = ( nt < 1 )? 1 : nt; + + // Altering the ideal thread count if it was not set or if it is greater than nt + if ( ( nt_ideal == -1 ) || ( nt_ideal > nt ) ) + nt_ideal = nt; + + // Packing for non-unit strided vector x. + // In order to get the buffer from pool via rntm access to memory broker + // is needed. Following are initializations for rntm. + bli_rntm_set_num_threads_only( 1, &rntm_l ); + bli_pba_rntm_set_pba( &rntm_l ); + + if ( incx == 0 ) nt_ideal = 1; + else if ( incx != 1 ) + { + // Calculate the size required for "n" double elements in vector x. + size_t buffer_size = n * sizeof( dcomplex ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_znorm2fv_unb_var1(): get mem pool block\n" ); + #endif + + // Acquire a buffer of the required size from the memory broker + // and save the associated mem_t entry to mem_buf_X. + bli_pba_acquire_m( + &rntm_l, + buffer_size, + BLIS_BITVAL_BUFFER_FOR_A_BLOCK, + &mem_buf_X + ); + + + // Continue packing X if buffer memory is allocated. + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + x_buf = bli_mem_buffer( &mem_buf_X ); + // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. + for ( dim_t x_index = 0; x_index < n; x_index++ ) + { + *( x_buf + x_index ) = *( x + ( x_index * incx ) ); + } + incx_buf = 1; + } + else + { + nt_ideal = 1; + } + } + + #ifdef BLIS_ENABLE_OPENMP + + if( nt_ideal == 1 ) + { + #endif + /* + The overhead cost with OpenMP is avoided in case + the ideal number of threads needed is 1. + */ + + norm_fp( n, x_buf, incx_buf, norm, cntx ); + + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_znorm2fv_unb_var1(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool. 
+ bli_pba_release( &rntm_l , &mem_buf_X ); + } + return; + + #ifdef BLIS_ENABLE_OPENMP + } + + /* + The following code-section is touched only in the case of + requiring multiple threads for the computation. + + Every thread will calculate its own local norm, and all + the local results will finally be reduced as per the mandate. + */ + + mem_t mem_buf_norm = { 0 }; + + double *norm_per_thread = NULL; + + // Calculate the size required for buffer. + size_t buffer_size = nt_ideal * sizeof(double); + + /* + Acquire a buffer (nt_ideal * size(double)) from the memory broker + and save the associated mem_t entry to mem_buf_norm. + */ + + bli_pba_acquire_m( + &rntm_l, + buffer_size, + BLIS_BITVAL_BUFFER_FOR_A_BLOCK, + &mem_buf_norm + ); + + /* Continue if norm buffer memory is allocated*/ + if ( bli_mem_is_alloc( &mem_buf_norm ) ) + { + norm_per_thread = bli_mem_buffer( &mem_buf_norm ); + + /* + In case the number of threads launched is not + equal to the number of threads required, we will + need to ensure that the garbage values are not part + of the reduction step. + + Every local norm is initialized to 0.0 to avoid this. + */ + + for ( dim_t i = 0; i < nt_ideal; i++ ) + norm_per_thread[i] = 0.0; + + // Parallel code-section + _Pragma("omp parallel num_threads(nt_ideal)") + { + /* + The number of actual threads spawned is + obtained here, so as to distribute the + job precisely. + */ + + dim_t n_threads = omp_get_num_threads(); + dim_t thread_id = omp_get_thread_num(); + dcomplex *x_start; + + // Obtain the job-size and region for compute + dim_t job_per_thread, offset; + + bli_normfv_thread_partition( n, n_threads, &offset, &job_per_thread, 2, incx_buf, thread_id ); + x_start = x_buf + offset; + + // Call to the kernel with the appropriate starting address + norm_fp( job_per_thread, x_start, incx_buf, ( norm_per_thread + thread_id ), cntx ); + } + + /* + Reduce the partial results onto a final scalar, based + on the mandate. 
+ + Every partial result needs to be subjected to overflow or + underflow handling if needed. Thus this reduction step involves + the same logic as the one present in the kernel. The kernel is + therefore reused for the reduction step. + */ + + reduce_fp( nt_ideal, norm_per_thread, 1, norm, cntx ); + + // Releasing the allocated memory if it was allocated + bli_pba_release( &rntm_l, &mem_buf_norm ); + } + + /* + In case of failing to acquire the buffer from the memory + pool, call the single-threaded kernel and return. + */ + else + { + norm_fp( n, x_buf, incx_buf, norm, cntx ); + } + + /* + By this point, the norm value would have been set by the appropriate + code-section that was touched. The assignment is not abstracted outside + in order to avoid unnecessary conditionals. + */ + + #endif + + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_znorm2fv_unb_var1(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool. + bli_pba_release( &rntm_l , &mem_buf_X ); + } } #undef GENTFUNCR @@ -626,19 +872,48 @@ void bli_dnormfv_unb_var1 if ( ( *norm ) == -0.0 ) ( *norm ) = 0.0; return; } - + /* + Declaring a function pointer to point to the supported vectorized kernels. + Based on the arch_id support, the appropriate function is set to the function + pointer. Deployment happens post the switch cases. In case of adding any + AVX-512 kernel, the code for deployment remains the same. 
+ */ + void ( *norm_fp )( dim_t, double*, inc_t, double*, cntx_t* ) = NULL; + + double *x_buf = x; + dim_t nt_ideal = -1; arch_t id = bli_arch_query_id(); - switch (id) + switch ( id ) { case BLIS_ARCH_ZEN4: case BLIS_ARCH_ZEN3: case BLIS_ARCH_ZEN2: case BLIS_ARCH_ZEN: #ifdef BLIS_KERNELS_ZEN - bli_dnorm2fv_unb_var1_avx2( n, x, incx, norm, cntx ); - break; + + norm_fp = bli_dnorm2fv_unb_var1_avx2; + + // Setting the ideal number of threads if support is enabled + #if defined( BLIS_ENABLE_OPENMP ) && defined( AOCL_DYNAMIC ) + + if ( n < 4000 ) + nt_ideal = 1; + else if ( n < 17000 ) + nt_ideal = 4; + else if ( n < 136000 ) + nt_ideal = 8; + else if ( n < 365000 ) + nt_ideal = 16; + else if ( n < 2950000 ) + nt_ideal = 32; + + #endif + + break; #endif default:; + // The following call to the kernel is + // single threaded in this case. double* zero = bli_d0; double* one = bli_d1; double scale; @@ -650,7 +925,7 @@ void bli_dnormfv_unb_var1 bli_ddcopys( *one, sumsq ); // Compute the sum of the squares of the vector. - bli_dsumsqv_unb_var1 + bli_dsumsqv_unb_var1 ( n, x, @@ -668,6 +943,217 @@ void bli_dnormfv_unb_var1 // Store the final value to the output variable. bli_dcopys( sqrt_sumsq, *norm ); } + + /* + If the function signature to vectorized kernel was not set, + the default case would have been performed. Thus exit early. + */ + if ( norm_fp == NULL ) + return; + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } + + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_pba_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . 
+ */ + + mem_t mem_buf_X = { 0 }; + inc_t incx_buf = incx; + dim_t nt; + + nt = bli_rntm_num_threads( &rntm_l ); + + // nt is less than 1 if BLIS was configured with default settings for parallelism + nt = ( nt < 1 )? 1 : nt; + + if ( ( nt_ideal == -1 ) || ( nt_ideal > nt ) ) + nt_ideal = nt; + + // Packing for non-unit strided vector x. + // In order to get the buffer from pool via rntm access to memory broker + // is needed. Following are initializations for rntm. + bli_rntm_set_num_threads_only( 1, &rntm_l ); + bli_pba_rntm_set_pba( &rntm_l ); + + if ( incx == 0 ) nt_ideal = 1; + else if ( incx != 1 ) + { + // Calculate the size required for "n" double elements in vector x. + size_t buffer_size = n * sizeof( double ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" ); + #endif + + // Acquire a buffer of the required size from the memory broker + // and save the associated mem_t entry to mem_buf_X. + bli_pba_acquire_m( + &rntm_l, + buffer_size, + BLIS_BITVAL_BUFFER_FOR_A_BLOCK, + &mem_buf_X + ); + + + // Continue packing X if buffer memory is allocated. + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + x_buf = bli_mem_buffer( &mem_buf_X ); + // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. + for ( dim_t x_index = 0; x_index < n; x_index++ ) + { + *( x_buf + x_index ) = *( x + ( x_index * incx ) ); + } + incx_buf = 1; + } + else + { + nt_ideal = 1; + } + } + + #ifdef BLIS_ENABLE_OPENMP + + if( nt_ideal == 1 ) + { + #endif + /* + The overhead cost with OpenMP is avoided in case + the ideal number of threads needed is 1. + */ + + norm_fp( n, x_buf, incx_buf, norm, cntx ); + + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool. 
+ bli_pba_release( &rntm_l , &mem_buf_X ); + } + return; + + #ifdef BLIS_ENABLE_OPENMP + } + + /* + The following code-section is touched only in the case of + requiring multiple threads for the computation. + + Every thread will calculate its own local norm, and all + the local results will finally be reduced as per the mandate. + */ + + mem_t mem_buf_norm = { 0 }; + + double *norm_per_thread = NULL; + + // Calculate the size required for buffer. + size_t buffer_size = nt_ideal * sizeof(double); + + /* + Acquire a buffer (nt_ideal * size(double)) from the memory broker + and save the associated mem_t entry to mem_buf_norm. + */ + + bli_pba_acquire_m( + &rntm_l, + buffer_size, + BLIS_BITVAL_BUFFER_FOR_A_BLOCK, + &mem_buf_norm + ); + + /* Continue if norm buffer memory is allocated*/ + if ( bli_mem_is_alloc( &mem_buf_norm ) ) + { + norm_per_thread = bli_mem_buffer( &mem_buf_norm ); + + /* + In case the number of threads launched is not + equal to the number of threads required, we will + need to ensure that the garbage values are not part + of the reduction step. + + Every local norm is initialized to 0.0 to avoid this. + */ + + for ( dim_t i = 0; i < nt_ideal; i++ ) + norm_per_thread[i] = 0.0; + + // Parallel code-section + _Pragma("omp parallel num_threads(nt_ideal)") + { + /* + The number of actual threads spawned is + obtained here, so as to distribute the + job precisely. + */ + + dim_t n_threads = omp_get_num_threads(); + + dim_t thread_id = omp_get_thread_num(); + double *x_start; + + // Obtain the job-size and region for compute + dim_t job_per_thread, offset; + bli_normfv_thread_partition( n, n_threads, &offset, &job_per_thread, 4, incx_buf, thread_id ); + + x_start = x_buf + offset; + + // Call to the kernel with the appropriate starting address + norm_fp( job_per_thread, x_start, incx_buf, ( norm_per_thread + thread_id ), cntx ); + } + + /* + Reduce the partial results onto a final scalar, based + on the mandate. 
+ + Every partial result needs to be subjected to overflow or + underflow handling if needed. Thus this reduction step involves + the same logic as the one present in the kernel. The kernel is + therefore reused for the reduction step. + */ + + norm_fp( nt_ideal, norm_per_thread, 1, norm, cntx ); + + // Releasing the allocated memory if it was allocated + bli_pba_release( &rntm_l, &mem_buf_norm ); + } + + /* + In case of failing to acquire the buffer from the memory + pool, call the single-threaded kernel and return. + */ + else + { + norm_fp( n, x_buf, incx_buf, norm, cntx ); + } + + /* + By this point, the norm value would have been set by the appropriate + code-section that was touched. The assignment is not abstracted outside + in order to avoid unnecessary conditionals. + */ + + #endif + + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool. + bli_pba_release( &rntm_l , &mem_buf_X ); + } } #undef GENTFUNCR diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp index 04e9d1fc37..b1642c6dfb 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp @@ -120,7 +120,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(3)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(0), // iexval ::testing::Values(NaN, Inf, -Inf), @@ -138,7 +138,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(8)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(3), // iexval ::testing::Values(NaN, Inf, -Inf), @@ -158,7 +158,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(12)), // stride size for 
x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(9), // iexval ::testing::Values(NaN, Inf, -Inf), @@ -168,7 +168,7 @@ INSTANTIATE_TEST_SUITE_P( ::dnrm2_TestPrint() ); -// Now let's check the combination of a vectorized path and +// Now let's check the combination of a vectorized path and // the scalar path, by putting an extreme value in each // to check that the checks are integrated correctly. INSTANTIATE_TEST_SUITE_P( @@ -179,7 +179,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(10)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(5), // iexval ::testing::Values(NaN, Inf, -Inf), @@ -189,3 +189,78 @@ INSTANTIATE_TEST_SUITE_P( ::dnrm2_TestPrint() ); +// Multithreading unit tester +/* + The following instantiator has data points that would suffice + the unit testing with <= 64 threads. + + Sizes from 256 to 259 ensure that each thread gets a minimum + size of 4, with some sizes inducing fringe cases. + + Sizes from 512 to 515 ensure that each thread gets a minimum + size of 8, with some sizes inducing fringe cases. + + Sizes from 768 to 771 ensure that each thread gets a minimum + size of 12, with some sizes inducing fringe cases. + + NOTE : Extreme values are induced at indices that are valid + for all the listed sizes in the instantiator. + + Non-unit strides are also tested, since they might get packed. 
+*/ +INSTANTIATE_TEST_SUITE_P( + EVT_MT_Unit_Tester, + dnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(256), + gtint_t(257), + gtint_t(258), + gtint_t(259), + gtint_t(512), + gtint_t(513), + gtint_t(514), + gtint_t(515), + gtint_t(768), + gtint_t(769), + gtint_t(770), + gtint_t(771)), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(5)), + // i : index of x that has value iexval + ::testing::Values(0, 5, 100, 255), + // iexval + ::testing::Values(NaN, Inf, -Inf), + ::testing::Values(4, 17, 125, 201), + ::testing::Values(1.0, NaN, Inf, -Inf) + ), + ::dnrm2_TestPrint() + ); + +// Instantiator if AOCL_DYNAMIC is enabled +/* + The instantiator here checks for correctness of + the compute with sizes large enough to bypass + the thread setting logic with AOCL_DYNAMIC enabled +*/ +INSTANTIATE_TEST_SUITE_P( + EVT_MT_AOCL_DYNAMIC, + dnrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(2950000), + gtint_t(2950001), + gtint_t(2950002), + gtint_t(2950003) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(5)), + // i : index of x that has value iexval + ::testing::Values(1000000, 2000000), + // iexval + ::testing::Values(NaN, Inf), + ::testing::Values(1500000, 2500000), + ::testing::Values(-Inf, NaN) + ), + ::dnrm2_TestPrint() + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp index 419c8499d7..eb18436788 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp @@ -9,14 +9,14 @@ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -87,13 +87,14 @@ class dnrm2TestPrint { * - for-loop for multiples of 4 (F4) * - scalar path for n<=4 (S) */ + INSTANTIATE_TEST_SUITE_P( - AT, + AT_1T, dnrm2Test, ::testing::Combine( // m size of vector ::testing::Values(gtint_t(1), // trivial case n=1 - gtint_t(3), // will only go through S + gtint_t(3), // will go through SSE and scalar gtint_t(8), // 1*8 - will only go through F8 gtint_t(24), // 3*8 - will go through F8 gtint_t(34), // 4*8 + 2 - will go through F8 & S @@ -105,11 +106,87 @@ INSTANTIATE_TEST_SUITE_P( gtint_t(217) ), // stride size for x - ::testing::Values(gtint_t(1), gtint_t(4) + ::testing::Values(gtint_t(1), gtint_t(3) +#ifndef TEST_BLIS_TYPED + , gtint_t(-1), gtint_t(-7) +#endif + ) + ), + ::dnrm2TestPrint() + ); + +// Multithreading unit tester +/* + NOTE : The following instantiator is the most useful if BLIS + configured with aocl-dynamic disabled, since then it + 
would be sufficient to verify functionality up to 64 + threads. + + The following instantiator has data points that would suffice + the extreme value testing with <= 64 threads. + + Sizes from 256 to 259 ensure that each thread gets a minimum + size of 4, with some sizes inducing fringe cases. + + Sizes from 512 to 515 ensure that each thread gets a minimum + size of 8, with some sizes inducing fringe cases. + + Sizes from 768 to 771 ensure that each thread gets a minimum + size of 12( i.e 8-block loop + 4-block loop), with some sizes + inducing fringe cases. + + Non-unit strides are also tested, since they might get packed. +*/ +INSTANTIATE_TEST_SUITE_P( + AT_MT_Unit_Tester, + dnrm2Test, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(256), + gtint_t(257), + gtint_t(258), + gtint_t(259), + gtint_t(512), + gtint_t(513), + gtint_t(514), + gtint_t(515), + gtint_t(768), + gtint_t(769), + gtint_t(770), + gtint_t(771) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3) +#ifndef TEST_BLIS_TYPED + , gtint_t(-1), gtint_t(-7) +#endif + ) + ), + ::dnrm2TestPrint() + ); + +// Instantiator if AOCL_DYNAMIC is enabled +/* + The instantiator here checks for correctness of + the compute with sizes large enough to bypass + the thread setting logic with AOCL_DYNAMIC enabled +*/ +INSTANTIATE_TEST_SUITE_P( + AT_MT_AOCL_DYNAMIC, + dnrm2Test, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(2950000), + gtint_t(2950001), + gtint_t(2950002), + gtint_t(2950003) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3) #ifndef TEST_BLIS_TYPED - , gtint_t(-1), gtint_t(-5) + , gtint_t(-1), gtint_t(-7) #endif - ) // stride size for x + ) ), ::dnrm2TestPrint() ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp index 3d61719eea..6eab297ac6 100644 --- a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp +++
b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp @@ -119,7 +119,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(0), // iexval ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), @@ -137,7 +137,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(4)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(1), // iexval ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), @@ -157,7 +157,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(6)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(4), // iexval ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), @@ -167,7 +167,7 @@ INSTANTIATE_TEST_SUITE_P( ::dznrm2_TestPrint() ); -// Now let's check the combination of a vectorized path and +// Now let's check the combination of a vectorized path and // the scalar path, by putting an extreme value in each // to check that the checks are integrated correctly. 
INSTANTIATE_TEST_SUITE_P( @@ -178,7 +178,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(7)), // stride size for x ::testing::Values(gtint_t(1)), - // i : index of x that has value iexval + // i : index of x that has value iexval ::testing::Values(2), // iexval ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), @@ -188,3 +188,77 @@ INSTANTIATE_TEST_SUITE_P( ::dznrm2_TestPrint() ); +// Multithreading Unit Tester +/* + The following instantiator has data points that would suffice + the extreme value testing with 64 threads. + + Sizes 128 and 129 ensure that each thread gets size 2, with + the first thread dealing with fringe case also, if required. + + Sizes 256, 257 and 259 ensure that each thread gets size 4, with + the first two threads dealing with extra AVX and SSE cases also, + if required. + + Sizes from 384 to 389 ensure that each thread gets size 6, with + the first few threads dealing with extra AVX and SSE cases if needed. + + NOTE : Extreme values are induced at indices that are valid + for all the listed sizes in the instantiator.
+ + Non-unit strides are also tested, since they might get packed +*/ +INSTANTIATE_TEST_SUITE_P( + EVT_MT_Unit_Tester, + dznrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(128), + gtint_t(129), + gtint_t(256), + gtint_t(257), + gtint_t(259), + gtint_t(384), + gtint_t(385), + gtint_t(386), + gtint_t(387), + gtint_t(388), + gtint_t(389) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3)), + // i : index of x that has value iexval + ::testing::Values(2, 17, 65, 110), + // iexval + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}, dcomplex{NaN, Inf}, dcomplex{Inf, NaN}), + ::testing::Values(6, 25, 64, 127), + ::testing::Values(dcomplex{NaN, 1.0}, dcomplex{Inf, 9.0}, dcomplex{-1.0, -Inf}, dcomplex{2.0, NaN}) + ), + ::dznrm2_TestPrint() + ); + +// Instantiator if AOCL_DYNAMIC is enabled +/* + The instantiator here checks for correctness of + the compute with sizes large enough to bypass + the thread setting logic with AOCL_DYNAMIC enabled +*/ +INSTANTIATE_TEST_SUITE_P( + EVT_MT_AOCL_DYNAMIC, + dznrm2_EVT, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(1530000), + gtint_t(1530001) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(5)), + // i : index of x that has value iexval + ::testing::Values(800000, 1000000), + // iexval + ::testing::Values(dcomplex{NaN, Inf}, dcomplex{-Inf, NaN}, dcomplex{Inf, 0.0}), + ::testing::Values(1100000, 1500000), + ::testing::Values(dcomplex{NaN, Inf}, dcomplex{-Inf, NaN}, dcomplex{Inf, 0.0}) + ), + ::dznrm2_TestPrint() + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp index e6477ff427..dfabea06ae 100644 --- a/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp @@ -1,3 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance 
BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + #include #include "test_nrm2.h" @@ -54,12 +88,12 @@ class dznrm2TestPrint { * - scalar path for n<=2 (S) */ INSTANTIATE_TEST_SUITE_P( - AT, + AT_1T, dznrm2Test, ::testing::Combine( // m size of vector ::testing::Values(gtint_t(1), // trivial case n=1 - gtint_t(2), // will only go through S + gtint_t(2), // 1*2 - will only go through F2 gtint_t(4), // 1*4 - will only go through F4 gtint_t(12), // 3*4 - will go through F4 gtint_t(17), // 4*4 + 1 - will go through F4 & S @@ -74,6 +108,74 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(1), gtint_t(3) #ifndef TEST_BLIS_TYPED , gtint_t(-1), gtint_t(-7) +#endif + ) + ), + ::dznrm2TestPrint() + ); + +// Multithreading unit tester +/* + The following instantiator has data points that would suffice + the unit testing with 64 threads. + + Sizes 128 and 129 ensure that each thread gets a minimum + size of 2, with some sizes inducing fringe cases. + + Sizes 256, 257 and 259 ensure that each thread gets a minimum + size of 4, with some sizes inducing fringe cases. + + Sizes from 384 to 389 ensure that each thread gets a minimum + size of 6( 4-block loop + 2-block loop), with some sizes inducing + fringe cases. + + Non-unit strides are also tested, since they might get packed. 
+*/ +INSTANTIATE_TEST_SUITE_P( + AT_MT_Unit_Tester, + dznrm2Test, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(128), + gtint_t(129), + gtint_t(256), + gtint_t(257), + gtint_t(259), + gtint_t(384), + gtint_t(385), + gtint_t(386), + gtint_t(387), + gtint_t(388), + gtint_t(389) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3) +#ifndef TEST_BLIS_TYPED + , gtint_t(-1), gtint_t(-7) +#endif + ) + ), + ::dznrm2TestPrint() + ); + +// Instantiator if AOCL_DYNAMIC is enabled +/* + The instantiator here checks for correctness of + the compute with sizes large enough to bypass + the thread setting logic with AOCL_DYNAMIC enabled +*/ +INSTANTIATE_TEST_SUITE_P( + AT_MT_AOCL_DYNAMIC, + dznrm2Test, + ::testing::Combine( + // m size of vector + ::testing::Values(gtint_t(1530000), + gtint_t(1530001) + ), + // stride size for x + ::testing::Values(gtint_t(1), gtint_t(3) +#ifndef TEST_BLIS_TYPED + , gtint_t(-1), gtint_t(-7) #endif ) ), diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp index ac8f104697..3134c88897 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp @@ -70,4 +70,27 @@ TYPED_TEST(nrm2_EIC, zero_incx_vectorized) { blis_norm = nrm2(n, x.data(), incx); RT ref_norm = testinghelpers::ref_nrm2(n, x.data(), incx); computediff(blis_norm, ref_norm); +} + +/* + The following test is specific to dnrm2 and dznrm2 apis. + In case of multithreading, each thread will calculate its + norm based on the data it operates on. All these norms will + be reduced post the parallel region. +*/ +TYPED_TEST( nrm2_EIC, zero_incx_MT ) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 2950000; + gtint_t incx = 0; + std::vector x(n); + for (auto &xi : x) + testinghelpers::initone(xi); + // For incx=0, nrm2 iterates through the first element n-times. 
+ // So, we initialize x[0] with a different value than the rest + // of the elements. + x[0] = T{10.0}*x[0]; + RT blis_norm = nrm2(n, x.data(), incx); + RT ref_norm = testinghelpers::ref_nrm2(n, x.data(), incx); + computediff(blis_norm, ref_norm); } \ No newline at end of file diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp index 9f7dc87d80..22e0141292 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_underflow_overflow.cpp @@ -69,6 +69,99 @@ TYPED_TEST(OUT_nrm2, zeroFP_vectorized) { computediff(0, norm); } +/* + Adding a type-parameterized test to check for + overflow and underflow handling with multiple threads + in case of dnrm2 and dznrm2. Can also be used if snrm2 + and scnrm2 are multithreaded. +*/ + +// Checking only for overflow, based on the threshold +TYPED_TEST( OUT_nrm2, OFlow_MT ) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 2950000; + std::vector x(n, T{1.0}); // A normal value + RT bigval; + if constexpr ( std::is_same::value ) + { + bigval = powf( ( float )FLT_RADIX, floorf( ( FLT_MAX_EXP - 23) * 0.5f ) ) * ( 1.0f + FLT_EPSILON ); + } + else + { + bigval = pow( ( double )FLT_RADIX, floor( ( DBL_MAX_EXP - 52) * 0.5 ) ) * ( 1.0 + DBL_EPSILON ); + } + + // Set the threshold for the errors: + double thresh = 2*testinghelpers::getEpsilon(); + x[1000] = T{ bigval }; + x[50000] = T{ bigval }; + x[151001] = T{ bigval }; + x[2949999] = T{ bigval }; + + RT norm = nrm2( n, x.data(), 1 ); + RT ref_norm = testinghelpers::ref_nrm2( n, x.data(), 1 ); + computediff( norm, ref_norm, thresh ); +} + +// Checking only for underflow, based on the threshold +TYPED_TEST( OUT_nrm2, UFlow_MT ) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 2950000; + std::vector x(n, T{1.0}); // A normal value + RT smlval; + if constexpr ( 
std::is_same::value ) + { + smlval = powf( ( float )FLT_RADIX, ceilf( ( FLT_MIN_EXP - 1 ) * 0.5f ) ) * ( 1.0f - FLT_EPSILON ); + } + else + { + smlval = pow( ( double )FLT_RADIX, ceil( ( DBL_MIN_EXP - 1 ) * 0.5 ) ) * ( 1.0 - DBL_EPSILON ); + } + + // Set the threshold for the errors: + double thresh = 2*testinghelpers::getEpsilon(); + x[1000] = T{ smlval }; + x[50000] = T{ smlval }; + x[151001] = T{ smlval }; + x[2949999] = T{ smlval }; + + RT norm = nrm2( n, x.data(), 1 ); + RT ref_norm = testinghelpers::ref_nrm2( n, x.data(), 1 ); + computediff( norm, ref_norm, thresh ); +} + +// Checking for both overflow and underflow, based on the thresholds +TYPED_TEST( OUT_nrm2, OUFlow_MT ) { + using T = TypeParam; + using RT = typename testinghelpers::type_info::real_type; + gtint_t n = 2950000; + std::vector x(n, T{1.0}); // A normal value + RT bigval, smlval; + if constexpr ( std::is_same::value ) + { + bigval = powf( ( float )FLT_RADIX, floorf( ( FLT_MAX_EXP - 23) * 0.5f ) ) * ( 1.0f + FLT_EPSILON ); + smlval = powf( ( float )FLT_RADIX, ceilf( ( FLT_MIN_EXP - 1 ) * 0.5f ) ) * ( 1.0f - FLT_EPSILON ); + } + else + { + bigval = pow( ( double )FLT_RADIX, floor( ( DBL_MAX_EXP - 52) * 0.5 ) ) * ( 1.0 + DBL_EPSILON ); + smlval = pow( ( double )FLT_RADIX, ceil( ( DBL_MIN_EXP - 1 ) * 0.5 ) ) * ( 1.0 - DBL_EPSILON ); + } + + // Set the threshold for the errors: + double thresh = 2*testinghelpers::getEpsilon(); + x[1000] = T{ smlval }; + x[50000] = T{ bigval }; + x[151001] = T{ bigval }; + x[2949999] = T{ smlval }; + + RT norm = nrm2( n, x.data(), 1 ); + RT ref_norm = testinghelpers::ref_nrm2( n, x.data(), 1 ); + computediff( norm, ref_norm, thresh ); +} + // Specific test case used by an ISV. // Checks for overflow. 
TEST(dnrm2, largeDouble) { diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h index a0ec2f3b35..b2fdf213e1 100644 --- a/gtestsuite/testsuite/util/nrm2/test_nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h @@ -80,9 +80,9 @@ void test_nrm2( gtint_t n, gtint_t incx, gtint_t i, T iexval, gtint_t j = 0, T j //---------------------------------------------------------- std::vector x = testinghelpers::get_random_vector(-10, 10, n, incx); // Initialize ith element of vector x to iexval. - x[i*incx] = iexval; + x[i*std::abs(incx)] = iexval; // Initialize jth element of vector x to jexval. - x[j*incx] = jexval; + x[j*std::abs(incx)] = jexval; //---------------------------------------------------------- // Call reference implementation to get ref results. //---------------------------------------------------------- diff --git a/kernels/zen/1/bli_norm2_zen_int.c b/kernels/zen/1/bli_norm2_zen_int.c index d6fcaf902d..c2d0ebe7cd 100644 --- a/kernels/zen/1/bli_norm2_zen_int.c +++ b/kernels/zen/1/bli_norm2_zen_int.c @@ -50,6 +50,14 @@ typedef union double d[4] __attribute__( ( aligned( 64 ) ) ); } v4df_t; +// Union data structure to access SSE registers +// One 128-bit AVX register holds 2 DP elements. +typedef union +{ + __m128d v; + double d[2] __attribute__( ( aligned( 64 ) ) ); +} v2df_t; + // Return a mask which indicates either: // v <= t or v >= T #define CMP256_sf( v, t, T ) \ @@ -58,6 +66,9 @@ typedef union #define CMP256_df( v, t, T ) \ _mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) ); +#define CMP128_df( v, t, T ) \ + _mm_or_pd( _mm_cmp_pd( v, t, _CMP_LE_OS ), _mm_cmp_pd( v, T, _CMP_GE_OS ) ); + // Returns true if any of the values in the mask vector a is true, // and false, otherwise. // In more detail, __mm256_testz_ps() performs the bitwise (a AND b) operation and returns: @@ -75,6 +86,7 @@ typedef union // 1 (true) if the mask is true for at least one element in a. 
static inline bool bli_horizontal_or_sf( __m256 a ) { return ! _mm256_testz_ps( a, a ); } static inline bool bli_horizontal_or_df( __m256d a ) { return ! _mm256_testz_pd( a, a ); } +static inline bool bli_horizontal_or_df_128( __m128d a ) { return ! _mm_testz_pd( a, a ); } float horizontal_add_sf(__m256 const a) { __m256 t1 = _mm256_hadd_ps(a, a); @@ -2051,64 +2063,16 @@ void bli_dnorm2fv_unb_var1_avx2 AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 ); double sumsq = 0; - dim_t i = 0; - dim_t n_remainder = 0; - double *x_buf = x; - - // Memory pool declarations for packing vector X. - // Initialize mem pool buffer to NULL and size to 0. - // "buf" and "size" fields are assigned once memory - // is allocated from the pool in bli_pba_acquire_m(). - // This will ensure bli_mem_is_alloc() will be passed on - // an allocated memory if created or a NULL. - mem_t mem_bufX = {0}; - rntm_t rntm; - - // Packing for non-unit strided vector x. - if ( incx != 1 ) - { - // In order to get the buffer from pool via rntm access to memory broker - //is needed. Following are initializations for rntm. - bli_rntm_init_from_global( &rntm ); - bli_rntm_set_num_threads_only( 1, &rntm ); - bli_pba_rntm_set_pba( &rntm ); - - // Calculate the size required for "n" double elements in vector x. - size_t buffer_size = n * sizeof( double ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" ); - #endif - - // Acquire a Buffer(n*size(double)) from the memory broker - // and save the associated mem_t entry to mem_bufX. - bli_pba_acquire_m - ( - &rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX - ); - // Continue packing X if buffer memory is allocated. - if ( ( bli_mem_is_alloc( &mem_bufX ) ) ) - { - x_buf = bli_mem_buffer( &mem_bufX ); - // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. 
- for ( dim_t x_index = 0; x_index < n; x_index++ ) - { - *( x_buf + x_index ) = *( x + ( x_index * incx ) ); - } - } - } + double *xt = x; - double *xt = x_buf; + dim_t n_rem = n % 4; // Compute the sum of squares on 3 accumulators to avoid overflow // and underflow, depending on the vector element value. // Accumulator for small values; using scaling to avoid underflow. double sum_sml = 0; - // Accumulator for medium values; no scaling required. + // Accumulator for medium values; no scaling required. double sum_med = 0; // Accumulator for big values; using scaling to avoid overflow. double sum_big = 0; @@ -2120,21 +2084,123 @@ void bli_dnorm2fv_unb_var1_avx2 const double scale_big = pow( ( double )FLT_RADIX, - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) ); double scale; - double abs_chi; bool isbig = false; - if ( n > 4 ) + dim_t i = 0; + + if( incx == 1 ) { - // Constants used for comparisons. - v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1; - temp.v = _mm256_set1_pd( -0.0 ); - thres_sml_vec.v = _mm256_set1_pd( thres_sml ); - thres_big_vec.v = _mm256_set1_pd( thres_big ); - v4df_t x0v, x1v, mask_vec0, mask_vec1; - zerov.v = _mm256_setzero_pd(); + // Attending to the fringe case requiring SSE code section. + if ( n_rem >= 2 ) + { + // Clearing the upper 128-bit lanes if and when required. + // This ensures that the AVX-SSE transition penalty is avoided. 
+ _mm256_zeroupper(); + + // Partial sums used for scaling, and registers to store thresholds + // and scaling factors + v2df_t sum_med_vec, sum_big_vec, sum_sml_vec; + v2df_t thres_sml_vec, thres_big_vec; + v2df_t scale_sml_vec, scale_big_vec; + + // Vectors used for intermediate arithmetic and absolute value + v2df_t temp, zerov; + sum_med_vec.v = _mm_setzero_pd(); + sum_big_vec.v = _mm_setzero_pd(); + sum_sml_vec.v = _mm_setzero_pd(); + + temp.v = _mm_set1_pd( -0.0 ); + thres_big_vec.v = _mm_loaddup_pd( &thres_big ); + thres_sml_vec.v = _mm_loaddup_pd( &thres_sml ); + + // Vectors used for loading from memory and setting masks + v2df_t x0v, mask_vec; + + v2df_t med_blend, non_med_blend; + + x0v.v = _mm_loadu_pd( xt ); + + // Getting the abs of the vector elements. + x0v.v = _mm_andnot_pd( temp.v, x0v.v ); + + // Check if any of the values is a NaN and if so, return. + mask_vec.v = _mm_cmp_pd( x0v.v, x0v.v, _CMP_UNORD_Q ); + + // Checking for the presence of atleast one NaN + if ( bli_horizontal_or_df_128( mask_vec.v ) ) + { + *norm = NAN; + return; + } + + mask_vec.v = CMP128_df( x0v.v, thres_sml_vec.v, thres_big_vec.v ); + + if ( !bli_horizontal_or_df_128( mask_vec.v ) ) + { + // Scaling is not necessary; only medium values. + sum_med_vec.v = _mm_fmadd_pd( x0v.v, x0v.v, sum_med_vec.v ); + } + else + { + // Mask vector which indicate whether xi > thres_big. + mask_vec.v = _mm_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm_setzero_pd(); + + if ( bli_horizontal_or_df_128( mask_vec.v ) ) + { + scale_big_vec.v = _mm_loaddup_pd( &scale_big ); + isbig = true; + + // Fill sum_med vector without scaling. + med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); + sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); + + // Fill sum_big vector using scaling. 
+ zerov.v = _mm_setzero_pd(); + non_med_blend.v = _mm_blendv_pd( zerov.v, scale_big_vec.v, mask_vec.v ); + non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); + sum_big_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_big_vec.v ); + } + else + { + scale_sml_vec.v = _mm_loaddup_pd( &scale_sml ); + // Mask vector which indicates whether xi > thres_small. + mask_vec.v = _mm_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); + // Fill sum_med vector without scaling. + med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); + sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); + + // Accumulate small values only if there have not been any big values so far. + if ( !isbig ) + { + // Fill sum_sml vector using scaling. + zerov.v = _mm_setzero_pd(); + non_med_blend.v = _mm_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec.v ); + non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); + sum_sml_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_sml_vec.v ); + } + } + } + + // Final accumulation on the appropriate scalars + sum_sml += sum_sml_vec.v[0] + sum_sml_vec.v[1]; + sum_med += sum_med_vec.v[0] + sum_med_vec.v[1]; + sum_big += sum_big_vec.v[0] + sum_big_vec.v[1]; + + xt += 2; + i += 2; + } + + // AVX-2 code-section // Partial sums used for scaling. - v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1; + v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0; + v4df_t sum_med_vec1, sum_big_vec1, sum_sml_vec1; + + // Vectors used for comparisons and getting absolute values. 
+ v4df_t thres_sml_vec, thres_big_vec, scale_sml_vec, scale_big_vec; + v4df_t temp, zerov; + sum_med_vec0.v = _mm256_setzero_pd(); sum_big_vec0.v = _mm256_setzero_pd(); sum_sml_vec0.v = _mm256_setzero_pd(); @@ -2142,51 +2208,40 @@ void bli_dnorm2fv_unb_var1_avx2 sum_big_vec1.v = _mm256_setzero_pd(); sum_sml_vec1.v = _mm256_setzero_pd(); - for (; ( i + 8 ) <= n; i = i + 8) + // Pre-broadcasting the thresholds and scale factors before entering the loops + thres_sml_vec.v = _mm256_broadcast_sd( &thres_sml ); + thres_big_vec.v = _mm256_broadcast_sd( &thres_big ); + scale_sml_vec.v = _mm256_broadcast_sd( &scale_sml ); + scale_big_vec.v = _mm256_broadcast_sd( &scale_big ); + + // This is used to convert the values in a vector to their absolute value + temp.v = _mm256_set1_pd( -0.0 ); + + // Vectors used for loading from memory and setting masks + v4df_t x0v, x1v, mask_vec0, mask_vec1; + + for ( ; ( i + 8 ) <= n; i = i + 8 ) { x0v.v = _mm256_loadu_pd( xt ); x1v.v = _mm256_loadu_pd( xt + 4 ); - // Getting the abs of the vector elements. - x0v.v = _mm256_andnot_pd( temp.v, x0v.v ); - x1v.v = _mm256_andnot_pd( temp.v, x1v.v ); - // Check if any of the values is a NaN and if so, return. - mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q); - mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_df( mask_vec0.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } + mask_vec0.v = _mm256_cmp_pd( x0v.v, x0v.v, _CMP_UNORD_Q ); + mask_vec1.v = _mm256_cmp_pd( x1v.v, x1v.v, _CMP_UNORD_Q ); - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_df( mask_vec1.v ) ) + // Checking for the presence of atleast one NaN + if ( bli_horizontal_or_df( mask_vec0.v ) || bli_horizontal_or_df( mask_vec1.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } + // Getting the abs of the vector elements. + x0v.v = _mm256_andnot_pd( temp.v, x0v.v ); + x1v.v = _mm256_andnot_pd( temp.v, x1v.v ); + // Mask vectors which indicate whether - // xi<=thres_sml or xi>=thres_big. + // xi <= thres_sml or xi >= thres_big. mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v ); mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v ); @@ -2199,39 +2254,38 @@ void bli_dnorm2fv_unb_var1_avx2 { // Mask vector which indicate whether xi > thres_big. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm256_setzero_pd(); if ( bli_horizontal_or_df( mask_vec0.v ) ) { isbig = true; // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Fill sum_big vector using scaling. 
- temp.v = _mm256_set1_pd( scale_big ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_big_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_big_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_big_vec0.v ); } else { // Mask vector which indicates whether xi > thres_small. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Accumulate small values only if there have not been any big values so far. if ( !isbig ) { // Fill sum_sml vector using scaling. - temp.v = _mm256_set1_pd( scale_sml ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_sml_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_sml_vec0.v ); } } } @@ -2246,38 +2300,38 @@ void bli_dnorm2fv_unb_var1_avx2 // Mask vector which indicate whether xi > thres_big. mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm256_setzero_pd(); + if ( bli_horizontal_or_df( mask_vec1.v ) ) { isbig = true; // Fill sum_med vector without scaling. 
- ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); - sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v ); + zerov.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); + sum_med_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec1.v ); // Fill sum_big vector using scaling. - temp.v = _mm256_set1_pd( scale_big ); - ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); - ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v ); - sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_big_vec.v, mask_vec1.v ); + zerov.v = _mm256_mul_pd( x1v.v, zerov.v ); + sum_big_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_big_vec1.v ); } else { // Mask vector which indicates whether xi > thres_small. mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ ); // Fill sum_med vector without scaling. - ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); - sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v ); + zerov.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); + sum_med_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec1.v ); // Accumulate small values only if there have not been any big values so far. if ( !isbig ) { // Fill sum_sml vector using scaling. - temp.v = _mm256_set1_pd( scale_sml ); - ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); - ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v ); - sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec1.v ); + zerov.v = _mm256_mul_pd( x1v.v, zerov.v ); + sum_sml_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_sml_vec1.v ); } } } @@ -2293,20 +2347,11 @@ void bli_dnorm2fv_unb_var1_avx2 x0v.v = _mm256_andnot_pd( temp.v, x0v.v ); // Check if any of the values is a NaN and if so, return. 
- mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q); + mask_vec0.v = _mm256_cmp_pd( x0v.v, x0v.v, _CMP_UNORD_Q ); + if ( bli_horizontal_or_df( mask_vec0.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } @@ -2323,39 +2368,38 @@ void bli_dnorm2fv_unb_var1_avx2 { // Mask vector which indicate whether xi > thres_big. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm256_setzero_pd(); if ( bli_horizontal_or_df( mask_vec0.v ) ) { isbig = true; // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Fill sum_big vector using scaling. - temp.v = _mm256_set1_pd( scale_big ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_big_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_big_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_big_vec0.v ); } else { // Mask vector which indicates whether xi > thres_small. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); // Fill sum_med vector without scaling. 
- ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Accumulate small values only if there have not been any big values so far. if ( !isbig ) { // Fill sum_sml vector using scaling. - temp.v = _mm256_set1_pd( scale_sml ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_sml_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_sml_vec0.v ); } } } @@ -2366,82 +2410,44 @@ void bli_dnorm2fv_unb_var1_avx2 sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v ); sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v ); - sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1] - + sum_sml_vec0.v[2] + sum_sml_vec0.v[3]; - sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1] - + sum_med_vec0.v[2] + sum_med_vec0.v[3]; - sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1] - + sum_big_vec0.v[2] + sum_big_vec0.v[3]; + sum_sml_vec0.v = _mm256_hadd_pd( sum_sml_vec0.v, sum_sml_vec0.v ); + sum_med_vec0.v = _mm256_hadd_pd( sum_med_vec0.v, sum_med_vec0.v ); + sum_big_vec0.v = _mm256_hadd_pd( sum_big_vec0.v, sum_big_vec0.v ); + + sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[2]; + sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[2]; + sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[2]; } - n_remainder = n - i; - bool hasInf = false; - if ( ( n_remainder > 0 ) ) + // Dealing with fringe cases + for( ; i < n; i += 1 ) { - // Put first the most likely to happen to avoid evaluations on if statements. 
- for (i = 0; i < n_remainder; i++) + double abs_chi; + abs_chi = bli_fabs( *xt ); + // Any thread encountering a NAN sets the sum_med accumalator to NAN + if ( bli_isnan( abs_chi ) ) { - abs_chi = bli_fabs( *xt ); - // If any of the elements is NaN, then return NaN as a result. - if ( bli_isnan( abs_chi ) ) - { - *norm = abs_chi; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - // Else, if any of the elements is an Inf, then return +Inf as a result. - if ( bli_isinf( abs_chi ) ) - { - *norm = abs_chi; - // Instead of returning immediately, use this flag - // to denote that there is an Inf element in the vector. - // That is used to avoid cases where there is a NaN which comes - // after an Inf. - hasInf = true; - } - // Most likely case: medium values, not over/under-flow. - if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) - { - sum_med += abs_chi * abs_chi; - } - // Case where there could be an overflow. Scaling is required. - else if ( abs_chi > thres_big ) - { - sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); - isbig = true; - } - // Case where there could be an underflow. Scaling is required. - else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) - { - sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); - } - xt++; + *norm = NAN; + return; } - } - - // Early return if there is an Inf. - if ( hasInf ) - { - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) + // Most likely case: medium values, not over/under-flow. + else if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); + sum_med += abs_chi * abs_chi; + } + // Case where there could be an overflow. Scaling is required. + else if ( abs_chi > thres_big ) + { + sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + isbig = true; + } + // Case where there could be an underflow. Scaling is required. + else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + { + sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; + xt += incx; } // Combine accumulators. @@ -2492,15 +2498,6 @@ void bli_dnorm2fv_unb_var1_avx2 *norm = scale * sqrt( sumsq ); - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -2515,67 +2512,15 @@ void bli_dznorm2fv_unb_var1_avx2 cntx_t* cntx ) { - AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 ); - double sumsq = 0; - dim_t i = 0; - dim_t n_remainder = 0; - dcomplex *x_buf = x; - // Memory pool declarations for packing vector X. - // Initialize mem pool buffer to NULL and size to 0. - // "buf" and "size" fields are assigned once memory - // is allocated from the pool in bli_pba_acquire_m(). - // This will ensure bli_mem_is_alloc() will be passed on - // an allocated memory if created or a NULL. - mem_t mem_bufX = {0}; - rntm_t rntm; - - // Packing for non-unit strided vector x. - if ( incx != 1 ) - { - // In order to get the buffer from pool via rntm access to memory broker - //is needed. Following are initializations for rntm. - bli_rntm_init_from_global( &rntm ); - bli_rntm_set_num_threads_only( 1, &rntm ); - bli_pba_rntm_set_pba( &rntm ); - - // Calculate the size required for "n" dcomplex elements in vector x. 
- size_t buffer_size = n * sizeof( dcomplex ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1(): get mem pool block\n" ); - #endif - - // Acquire a Buffer(n*size(dcomplex)) from the memory broker - // and save the associated mem_t entry to mem_bufX. - bli_pba_acquire_m - ( - &rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX - ); - - // Continue packing X if buffer memory is allocated. - if ( ( bli_mem_is_alloc( &mem_bufX ) ) ) - { - x_buf = bli_mem_buffer( &mem_bufX ); - // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. - for ( dim_t x_index = 0; x_index < n; x_index++ ) - { - *( x_buf + x_index ) = *( x + ( x_index * incx ) ); - } - } - } - - dcomplex *xt = x_buf; + dcomplex *xt = x; // Compute the sum of squares on 3 accumulators to avoid overflow // and underflow, depending on the vector element value. // Accumulator for small values; using scaling to avoid underflow. double sum_sml = 0; - // Accumulator for medium values; no scaling required. + // Accumulator for medium values; no scaling required. double sum_med = 0; // Accumulator for big values; using scaling to avoid overflow. double sum_big = 0; @@ -2587,21 +2532,20 @@ void bli_dznorm2fv_unb_var1_avx2 const double scale_big = pow( ( double )FLT_RADIX, - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) ); double scale; - double abs_chi; bool isbig = false; - if ( n > 2 ) - { - // Constants used for comparisons. - v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1; - temp.v = _mm256_set1_pd( -0.0 ); - thres_sml_vec.v = _mm256_set1_pd( thres_sml ); - thres_big_vec.v = _mm256_set1_pd( thres_big ); - v4df_t x0v, x1v, mask_vec0, mask_vec1; - zerov.v = _mm256_setzero_pd(); + dim_t i = 0; + if ( incx == 1 ) + { // Partial sums used for scaling. 
- v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1; + v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0; + v4df_t sum_med_vec1, sum_big_vec1, sum_sml_vec1; + + // Vectors used for comparisons and getting absolute values. + v4df_t thres_sml_vec, thres_big_vec, scale_sml_vec, scale_big_vec; + v4df_t temp, zerov; + sum_med_vec0.v = _mm256_setzero_pd(); sum_big_vec0.v = _mm256_setzero_pd(); sum_sml_vec0.v = _mm256_setzero_pd(); @@ -2609,51 +2553,40 @@ void bli_dznorm2fv_unb_var1_avx2 sum_big_vec1.v = _mm256_setzero_pd(); sum_sml_vec1.v = _mm256_setzero_pd(); - for (; ( i + 4 ) <= n; i = i + 4) - { - x0v.v = _mm256_loadu_pd( (double*) xt ); - x1v.v = _mm256_loadu_pd( (double*) (xt + 2) ); + // Pre-broadcasting the thresholds and scale factors before entering the loops + thres_sml_vec.v = _mm256_broadcast_sd( &thres_sml ); + thres_big_vec.v = _mm256_broadcast_sd( &thres_big ); + scale_sml_vec.v = _mm256_broadcast_sd( &scale_sml ); + scale_big_vec.v = _mm256_broadcast_sd( &scale_big ); - // Getting the abs of the vector elements. - x0v.v = _mm256_andnot_pd( temp.v, x0v.v ); - x1v.v = _mm256_andnot_pd( temp.v, x1v.v ); + // This is used to convert the values in a vector to their absolute value + temp.v = _mm256_set1_pd( -0.0 ); + + // Vectors used for loading from memory and setting masks + v4df_t x0v, x1v, mask_vec0, mask_vec1; + + for ( ; ( i + 4 ) <= n; i += 4 ) + { + x0v.v = _mm256_loadu_pd( ( const double * )xt ); + x1v.v = _mm256_loadu_pd( ( const double * )( xt + 2 ) ); // Check if any of the values is a NaN and if so, return. - mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q); - mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_df( mask_vec0.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } + mask_vec0.v = _mm256_cmp_pd( x0v.v, x0v.v, _CMP_UNORD_Q ); + mask_vec1.v = _mm256_cmp_pd( x1v.v, x1v.v, _CMP_UNORD_Q ); - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_df( mask_vec1.v ) ) + // Checking for the presence of atleast one NaN + if ( bli_horizontal_or_df( mask_vec0.v ) || bli_horizontal_or_df( mask_vec1.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } + // Getting the abs of the vector elements. + x0v.v = _mm256_andnot_pd( temp.v, x0v.v ); + x1v.v = _mm256_andnot_pd( temp.v, x1v.v ); + // Mask vectors which indicate whether - // xi<=thres_sml or xi>=thres_big. + // xi <= thres_sml or xi >= thres_big. mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v ); mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v ); @@ -2666,39 +2599,38 @@ void bli_dznorm2fv_unb_var1_avx2 { // Mask vector which indicate whether xi > thres_big. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm256_setzero_pd(); if ( bli_horizontal_or_df( mask_vec0.v ) ) { isbig = true; // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Fill sum_big vector using scaling. 
- temp.v = _mm256_set1_pd( scale_big ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_big_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_big_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_big_vec0.v ); } else { // Mask vector which indicates whether xi > thres_small. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Accumulate small values only if there have not been any big values so far. if ( !isbig ) { // Fill sum_sml vector using scaling. - temp.v = _mm256_set1_pd( scale_sml ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_sml_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_sml_vec0.v ); } } } @@ -2713,38 +2645,38 @@ void bli_dznorm2fv_unb_var1_avx2 // Mask vector which indicate whether xi > thres_big. mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm256_setzero_pd(); + if ( bli_horizontal_or_df( mask_vec1.v ) ) { isbig = true; // Fill sum_med vector without scaling. 
- ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); - sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v ); + zerov.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); + sum_med_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec1.v ); // Fill sum_big vector using scaling. - temp.v = _mm256_set1_pd( scale_big ); - ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); - ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v ); - sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_big_vec.v, mask_vec1.v ); + zerov.v = _mm256_mul_pd( x1v.v, zerov.v ); + sum_big_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_big_vec1.v ); } else { // Mask vector which indicates whether xi > thres_small. mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ ); // Fill sum_med vector without scaling. - ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); - sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v ); + zerov.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v ); + sum_med_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec1.v ); // Accumulate small values only if there have not been any big values so far. if ( !isbig ) { // Fill sum_sml vector using scaling. 
- temp.v = _mm256_set1_pd( scale_sml ); - ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); - ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v ); - sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec1.v ); + zerov.v = _mm256_mul_pd( x1v.v, zerov.v ); + sum_sml_vec1.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_sml_vec1.v ); } } } @@ -2752,28 +2684,19 @@ void bli_dznorm2fv_unb_var1_avx2 xt += 4; } - for ( ; ( i + 2 ) <= n; i = i + 2 ) + for ( ; ( i + 2 ) <= n; i += 2 ) { - x0v.v = _mm256_loadu_pd( (double*) xt ); + x0v.v = _mm256_loadu_pd( ( const double * )xt ); // Getting the abs of the vector elements. x0v.v = _mm256_andnot_pd( temp.v, x0v.v ); // Check if any of the values is a NaN and if so, return. - mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q); + mask_vec0.v = _mm256_cmp_pd( x0v.v, x0v.v, _CMP_UNORD_Q ); + if ( bli_horizontal_or_df( mask_vec0.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } @@ -2790,39 +2713,38 @@ void bli_dznorm2fv_unb_var1_avx2 { // Mask vector which indicate whether xi > thres_big. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm256_setzero_pd(); if ( bli_horizontal_or_df( mask_vec0.v ) ) { isbig = true; // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Fill sum_big vector using scaling. 
- temp.v = _mm256_set1_pd( scale_big ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_big_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_big_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_big_vec0.v ); } else { // Mask vector which indicates whether xi > thres_small. mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); // Fill sum_med vector without scaling. - ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); - sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v ); + zerov.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v ); + sum_med_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_med_vec0.v ); // Accumulate small values only if there have not been any big values so far. if ( !isbig ) { // Fill sum_sml vector using scaling. 
- temp.v = _mm256_set1_pd( scale_sml ); - ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); - ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v ); - sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v ); - temp.v = _mm256_set1_pd( -0.0 ); + zerov.v = _mm256_setzero_pd(); + zerov.v = _mm256_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec0.v ); + zerov.v = _mm256_mul_pd( x0v.v, zerov.v ); + sum_sml_vec0.v = _mm256_fmadd_pd( zerov.v, zerov.v, sum_sml_vec0.v ); } } } @@ -2833,133 +2755,115 @@ void bli_dznorm2fv_unb_var1_avx2 sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v ); sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v ); - sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1] - + sum_sml_vec0.v[2] + sum_sml_vec0.v[3]; - sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1] - + sum_med_vec0.v[2] + sum_med_vec0.v[3]; - sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1] - + sum_big_vec0.v[2] + sum_big_vec0.v[3]; + sum_sml_vec0.v = _mm256_hadd_pd( sum_sml_vec0.v, sum_sml_vec0.v ); + sum_med_vec0.v = _mm256_hadd_pd( sum_med_vec0.v, sum_med_vec0.v ); + sum_big_vec0.v = _mm256_hadd_pd( sum_big_vec0.v, sum_big_vec0.v ); + + // Final accumulation on the appropriate scalars + sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[2]; + sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[2]; + sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[2]; } - n_remainder = n - i; - bool hasInf = false; - double chi_r, chi_i; - if ( ( n_remainder > 0 ) ) + + // Clearing the upper 128-bit lanes if and when required. + // This ensures that the AVX-SSE transition penalty is avoided. + _mm256_zeroupper(); + + // Dealing with fringe cases using SSE instructions and 128-bit registers. + // This is because each element of dcomplex type is 128 bits in size, thereby + // giving scope for this optimization. + for( ; i < n; i += 1 ) { - // Put first the most likely to happen to avoid evaluations on if statements. 
- for (i = 0; i < n_remainder; i++) + v2df_t sum_med_vec, sum_big_vec, sum_sml_vec; + v2df_t thres_sml_vec, thres_big_vec; + v2df_t scale_sml_vec, scale_big_vec; + + v2df_t temp, zerov; + + sum_med_vec.v = _mm_setzero_pd(); + sum_big_vec.v = _mm_setzero_pd(); + sum_sml_vec.v = _mm_setzero_pd(); + + temp.v = _mm_set1_pd( -0.0 ); + thres_big_vec.v = _mm_loaddup_pd( &thres_big ); + thres_sml_vec.v = _mm_loaddup_pd( &thres_sml ); + + // Vectors used for loading from memory and setting masks + v2df_t x0v, mask_vec; + + v2df_t med_blend, non_med_blend; + + x0v.v = _mm_loadu_pd( ( const double * )xt ); + + // Getting the abs of the vector elements. + x0v.v = _mm_andnot_pd( temp.v, x0v.v ); + + // Check if any of the values is a NaN and if so, return. + mask_vec.v = _mm_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q); + + // Checking for the presence of atleast one NaN + if ( bli_horizontal_or_df_128( mask_vec.v ) ) { - // Get real and imaginary component of the vector element. - bli_zdgets(*xt, chi_r, chi_i); + *norm = NAN; + return; + } - // Start with accumulating the real component of the vector element. - abs_chi = bli_fabs( chi_r ); - // If any of the elements is NaN, then return NaN as a result. - if ( bli_isnan( abs_chi ) ) - { - *norm = abs_chi; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } + mask_vec.v = CMP128_df( x0v.v, thres_sml_vec.v, thres_big_vec.v ); - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - // Else, if any of the elements is an Inf, then return +Inf as a result. - if ( bli_isinf( abs_chi ) ) - { - *norm = abs_chi; - // Instead of returning immediately, use this flag - // to denote that there is an Inf element in the vector. - // That is used to avoid cases where there is a NaN which comes - // after an Inf. 
- hasInf = true; - } - // Most likely case: medium values, not over/under-flow. - if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) - { - sum_med += abs_chi * abs_chi; - } - // Case where there could be an overflow. Scaling is required. - else if ( abs_chi > thres_big ) + if ( !bli_horizontal_or_df_128( mask_vec.v ) ) + { + // Scaling is not necessary; only medium values. + sum_med_vec.v = _mm_fmadd_pd( x0v.v, x0v.v, sum_med_vec.v ); + } + else + { + // Mask vector which indicate whether xi > thres_big. + mask_vec.v = _mm_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); + zerov.v = _mm_setzero_pd(); + + if ( bli_horizontal_or_df_128( mask_vec.v ) ) { - sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + scale_big_vec.v = _mm_loaddup_pd( &scale_big ); isbig = true; + + // Fill sum_med vector without scaling. + med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); + sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); + + // Fill sum_big vector using scaling. + zerov.v = _mm_setzero_pd(); + non_med_blend.v = _mm_blendv_pd( zerov.v, scale_big_vec.v, mask_vec.v ); + non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); + sum_big_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_big_vec.v ); } - // Case where there could be an underflow. Scaling is required. - else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + else { - sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); - } + scale_sml_vec.v = _mm_loaddup_pd( &scale_sml ); + // Mask vector which indicates whether xi > thres_small. + mask_vec.v = _mm_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); + // Fill sum_med vector without scaling. + med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); + sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); - // Accumulate the imaginary component of the vector element. - abs_chi = bli_fabs( chi_i ); - // If any of the elements is NaN, then return NaN as a result. 
- if ( bli_isnan( abs_chi ) ) - { - *norm = abs_chi; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) + // Accumulate small values only if there have not been any big values so far. + if ( !isbig ) { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); + // Fill sum_sml vector using scaling. + zerov.v = _mm_setzero_pd(); + non_med_blend.v = _mm_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec.v ); + non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); + sum_sml_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_sml_vec.v ); } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; } - // Else, if any of the elements is an Inf, then return +Inf as a result. - if ( bli_isinf( abs_chi ) ) - { - *norm = abs_chi; - // Instead of returning immediately, use this flag - // to denote that there is an Inf element in the vector. - // That is used to avoid cases where there is a NaN which comes - // after an Inf. - hasInf = true; - } - // Most likely case: medium values, not over/under-flow. - if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) - { - sum_med += abs_chi * abs_chi; - } - // Case where there could be an overflow. Scaling is required. - else if ( abs_chi > thres_big ) - { - sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); - isbig = true; - } - // Case where there could be an underflow. Scaling is required. - else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) - { - sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); - } - - xt++; } - } - // Early return if there is an Inf. - if ( hasInf ) - { - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } + // Final accumulation on the appropriate scalars + sum_sml += sum_sml_vec.v[0] + sum_sml_vec.v[1]; + sum_med += sum_med_vec.v[0] + sum_med_vec.v[1]; + sum_big += sum_big_vec.v[0] + sum_big_vec.v[1]; - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; + xt += incx; } // Combine accumulators. @@ -3001,6 +2905,7 @@ void bli_dznorm2fv_unb_var1_avx2 sumsq = sum_sml; } } + else { // If all values are mid-range: @@ -3010,15 +2915,6 @@ void bli_dznorm2fv_unb_var1_avx2 *norm = scale * sqrt( sumsq ); - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dznorm2fv_unb_var1(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; From c8f14edcf50fd4f90e183fee4f32452e46f0d118 Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Mon, 17 Jul 2023 12:44:42 +0530 Subject: [PATCH 164/226] BLAS Extension API - ?gemm_compute() - Added support for 2 new APIs: 1. sgemm_compute() 2. dgemm_compute() These are dependent on the ?gemm_pack_get_size() and ?gemm_pack() APIs. - ?gemm_compute() takes the packed matrix buffer (represented by the packed matrix identifier) and performs the GEMM operation: C := A * B + beta * C. - Whenever the kernel storage preference and the matrix storage scheme isn't matching, and the respective matrix being loaded isn't packed either, on-the-go packing has been enabled for such cases to pack that matrix. - Note: If both the matrices are packed using the ?gemm_pack() API, it is the responsibility of the user to pack only one matrix with alpha scalar and the other with a unit scalar. - Note: Support is presently limited to Single Thread only. Both, pack and compute APIs are forced to take n_threads=1. 
AMD-Internal: [CPUPL-3560] Change-Id: I825d98a0a5038d31668d2a4b84b3ccc204e6c158 --- bench/Makefile | 8 +- bench/bench_gemm_pack_compute.c | 930 ++++++++++++++++++ bench/inputgemmpackcompute.txt | 92 ++ frame/3/CMakeLists.txt | 9 +- frame/3/bli_l3.h | 5 +- frame/3/bli_l3_compute.c | 637 ++++++++++++ frame/3/bli_l3_compute.h | 80 ++ frame/base/bli_param_map.h | 3 +- frame/compat/CMakeLists.txt | 9 +- frame/compat/bla_gemm_compute.c | 285 ++++++ frame/compat/bla_gemm_compute.h | 72 ++ frame/compat/bli_blas.h | 2 + frame/compat/cblas/src/cblas.h | 186 ++++ frame/compat/cblas/src/cblas_dgemm_compute.c | 172 ++++ frame/compat/cblas/src/cblas_dgemm_pack.c | 157 +++ .../cblas/src/cblas_dgemm_pack_get_size.c | 83 ++ frame/compat/cblas/src/cblas_f77.h | 16 + frame/compat/cblas/src/cblas_sgemm_compute.c | 171 ++++ frame/compat/cblas/src/cblas_sgemm_pack.c | 157 +++ .../cblas/src/cblas_sgemm_pack_get_size.c | 83 ++ frame/compat/check/CMakeLists.txt | 7 +- frame/compat/check/bla_gemm_compute_check.h | 87 ++ frame/include/bli_macro_defs.h | 4 + frame/include/bli_type_defs.h | 3 +- frame/thread/CMakeLists.txt | 2 + frame/thread/bli_l3_compute_decor.h | 67 ++ frame/thread/bli_l3_compute_decor_openmp.c | 133 +++ frame/thread/bli_l3_compute_decor_openmp.h | 44 + frame/thread/bli_l3_compute_decor_single.c | 87 ++ frame/thread/bli_l3_compute_decor_single.h | 43 + frame/thread/bli_pack_full_decor_openmp.c | 6 +- frame/thread/bli_thread.h | 3 + 32 files changed, 3623 insertions(+), 20 deletions(-) create mode 100755 bench/bench_gemm_pack_compute.c create mode 100644 bench/inputgemmpackcompute.txt create mode 100644 frame/3/bli_l3_compute.c create mode 100644 frame/3/bli_l3_compute.h create mode 100644 frame/compat/bla_gemm_compute.c create mode 100644 frame/compat/bla_gemm_compute.h create mode 100644 frame/compat/cblas/src/cblas_dgemm_compute.c create mode 100644 frame/compat/cblas/src/cblas_dgemm_pack.c create mode 100644 frame/compat/cblas/src/cblas_dgemm_pack_get_size.c create mode 
100644 frame/compat/cblas/src/cblas_sgemm_compute.c create mode 100644 frame/compat/cblas/src/cblas_sgemm_pack.c create mode 100644 frame/compat/cblas/src/cblas_sgemm_pack_get_size.c create mode 100644 frame/compat/check/bla_gemm_compute_check.h create mode 100644 frame/thread/bli_l3_compute_decor.h create mode 100644 frame/thread/bli_l3_compute_decor_openmp.c create mode 100644 frame/thread/bli_l3_compute_decor_openmp.h create mode 100644 frame/thread/bli_l3_compute_decor_single.c create mode 100644 frame/thread/bli_l3_compute_decor_single.h diff --git a/bench/Makefile b/bench/Makefile index 751f7129a5..cc1b7297dc 100755 --- a/bench/Makefile +++ b/bench/Makefile @@ -6,7 +6,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -193,7 +193,8 @@ blis: \ bench_amaxv_blis.x \ bench_copyv_blis.x \ bench_swapv_blis.x \ - bench_axpbyv_blis.x + bench_axpbyv_blis.x \ + bench_gemm_pack_compute_blis.x openblas: \ bench_gemm_openblas.x \ @@ -240,7 +241,8 @@ mkl: \ bench_amaxv_mkl.x \ bench_copyv_mkl.x \ bench_swapv_mkl.x \ - bench_axpbyv_mkl.x + bench_axpbyv_mkl.x \ + bench_gemm_pack_compute_mkl.x # --Object file rules -- diff --git a/bench/bench_gemm_pack_compute.c b/bench/bench_gemm_pack_compute.c new file mode 100755 index 0000000000..2394f608b2 --- /dev/null +++ b/bench/bench_gemm_pack_compute.c @@ -0,0 +1,930 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef WIN32 +#include +#else +#include +#endif +#include "blis.h" + + +// Benchmark application to process aocl logs generated by BLIS library. 
+#ifndef DT +#define DT BLIS_DOUBLE +#endif + +#ifndef IND +#define IND BLIS_NAT +#endif + +#ifndef N_REPEAT +//#define N_REPEAT 100 +#endif + + +#define AOCL_MATRIX_INITIALISATION +#define BUFFER_SIZE 256 + +/* For BLIS since logs are collected at BLAS interfaces + * we disable cblas interfaces for this benchmark application + */ + +#ifdef BLIS_ENABLE_CBLAS +// #define CBLAS +#endif + +// #define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, b, c; + obj_t c_save; + obj_t alpha, beta, alpha_one; + dim_t m, n, k; + dim_t p_inc = 0; // to keep track of number of inputs + num_t dt; + // ind_t ind; + char dt_ch; + int r, n_repeats; + trans_t transa; + trans_t transb; + + double dtime; + double dtime_save; + double gflops; + + int packA, packB; + + FILE* fin = NULL; + FILE* fout = NULL; + + n_repeats = N_REPEAT; // This macro will get from Makefile. + + dt = DT; + + if (argc < 3) + { + printf("Usage: ./test_gemm_pack_compute_XX.x input.csv output.csv\n"); + exit(1); + } + fin = fopen(argv[1], "r"); + if (fin == NULL) + { + printf("Error opening the file %s\n", argv[1]); + exit(1); + } + fout = fopen(argv[2], "w"); + if (fout == NULL) + { + printf("Error opening output file %s\n", argv[2]); + exit(1); + } + if (argc > 3) + { + n_repeats = atoi(argv[3]); + } + + fprintf(fout, "Dt transa transb identifier m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n"); + + // Following variables are needed for scanf to read inputs properly + // however they are not used in bench. 
+ char api_name[BUFFER_SIZE]; // to store function name, line no present in logs + char dummy_buffer[BUFFER_SIZE]; + + // Variables extracted from the logs which are used by bench + char stor_scheme, transA_c, transB_c, packA_c, packB_c; + double alpha_r, beta_r, alpha_i, beta_i; + dim_t m_trans, n_trans; + inc_t lda, ldb, ldc; + + stor_scheme = 'C'; // By default set it to Column Major + + //{S, D, C, Z} transa, transb, packA, packB, m, n, k, alpha_real, + // alpha_imag, lda ldb, beta_real, beta_imag, ldc, + // + // number of threads, execution time, gflops ---> ignored by bench + while (fscanf(fin, "%s %c %c %c %c %c " INT_FS INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]", + api_name, &dt_ch, &transA_c, &transB_c, &packA_c, &packB_c, &m, &n, &k, &alpha_r, &alpha_i, + &lda, &ldb, &beta_r, &beta_i, &ldc) == 16) + { + // Discard any extra data on current line in the input file. + fgets(dummy_buffer, BUFFER_SIZE, fin ); + + // At BLAS level only column major order is supported. 
+ stor_scheme = 'C'; + + if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE; + else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT; + else + { + printf("Invalid data type %c\n", dt_ch); + continue; + } + + if ( transA_c == 'n' || transA_c == 'N' ) transa = BLIS_NO_TRANSPOSE; + else if ( transA_c == 't' || transA_c == 'T' ) transa = BLIS_TRANSPOSE; + else if ( transA_c == 'c' || transA_c == 'C' ) transa = BLIS_CONJ_TRANSPOSE; + else + { + printf("Invalid option for transA \n"); + continue; + } + + if ( transB_c == 'n' || transB_c == 'N' ) transb = BLIS_NO_TRANSPOSE; + else if ( transB_c == 't' || transB_c == 'T' ) transb = BLIS_TRANSPOSE; + else if ( transB_c == 'c' || transB_c == 'C' ) transb = BLIS_CONJ_TRANSPOSE; + else + { + printf("Invalid option for transB \n"); + continue; + } + + if ( packA_c == 'p' || packA_c == 'P' ) packA = TRUE; + else if ( packA_c == 'u' || packA_c == 'U' ) packA = FALSE; + else + { + printf("Invalid option for packA \n"); + continue; + } + + if ( packB_c == 'p' || packB_c == 'P') packB = TRUE; + else if ( packB_c == 'u' || packB_c == 'U') packB = FALSE; + else + { + printf("Invalid option for packB \n"); + continue; + } + + bli_obj_create( dt, 1, 1, 0, 0, &alpha); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + bli_obj_create( dt, 1, 1, 0, 0, &alpha_one); + + if( (stor_scheme == 'C') || (stor_scheme == 'c') ) + { + // leading dimension should be greater than number of rows + // if ((m > lda) || (k > ldb) || (m > ldc)) continue; + // Since this bench app is run on logs generated by AOCL trace logs + // - we have relaxed the checks on the input parameters. 
+ + // if A is transpose - A(lda x m), lda >= max(1,k) + // if A is non-transpose - A (lda x k), lda >= max(1,m) + // if B is transpose - B (ldb x k), ldb >= max(1,n) + // if B is non-transpose - B (ldb x n), ldb >= max(1,k) + // C is ldc x n - ldc >= max(1, m) + //if(transa) lda = k; // We will end up overwriting lda + bli_set_dims_with_trans( transa, m, k, &m_trans, &n_trans); + bli_obj_create( dt, m_trans, n_trans, 1, lda, &a); + + //if(transb) ldb = n; // we will end up overwriting ldb, ldb >= n + bli_set_dims_with_trans( transb, k, n, &m_trans, &n_trans); + bli_obj_create( dt, m_trans, n_trans, 1, ldb, &b); + + bli_obj_create( dt, m, n, 1, ldc, &c); + bli_obj_create( dt, m, n, 1, ldc, &c_save ); + } + else if( (stor_scheme == 'r') || (stor_scheme == 'R') ) + { + //leading dimension should be greater than number of columns + //if ((k > lda) || (n > ldb) || (n > ldc)) continue; + // Since this bench app is run on logs generated by AOCL trace logs + // - we have relaxed the checks on the input parameters. + + // if A is transpose - A(k x lda), lda >= max(1,m) + // if A is non-transpose - A (m x lda), lda >= max(1,k) + // if B is transpose - B (n x ldb), ldb >= max(1,k) + // if B is non-transpose - B (k x ldb ), ldb >= max(1,n) + // C is m x ldc - ldc >= max(1, n) + + //if(transa) lda = m; // this will overwrite lda + bli_set_dims_with_trans(transa, m, k, &m_trans, &n_trans); + bli_obj_create( dt, m_trans, n_trans, lda, 1, &a); + + //if(transb) ldb = k; // this will overwrite ldb + bli_set_dims_with_trans(transb, k, n, &m_trans, &n_trans); + bli_obj_create( dt, m_trans, n_trans, ldb, 1, &b); + + bli_obj_create( dt, m, n, ldc, 1, &c); + bli_obj_create( dt, m, n, ldc, 1, &c_save ); + } + else + { + printf("Invalid storage scheme\n"); + continue; + } +#ifndef BLIS // Incase if we are using blis interface we don't have to check for col-storage. 
+ #ifndef CBLAS + if( ( stor_scheme == 'R' ) || ( stor_scheme == 'r' ) ) + { + printf("BLAS APIs doesn't support row-storage: Enable CBLAS\n"); + continue; + } + #endif +#endif + +#ifdef AOCL_MATRIX_INITIALISATION + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); +#endif + bli_copym( &c, &c_save ); + + bli_obj_set_conjtrans( transa, &a); + bli_obj_set_conjtrans( transb, &b); + + bli_setsc( 1.0, 1.0, &alpha_one ); + bli_setsc( alpha_r, alpha_i, &alpha ); + bli_setsc( beta_r, beta_i, &beta ); + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); +#ifdef PRINT + bli_printm( "a", &a, "%4.6f", "" ); + bli_printm( "b", &b, "%4.6f", "" ); + bli_printm( "c", &c, "%4.6f", "" ); +#endif + dtime = bli_clock(); + +#ifdef BLIS + + printf( "BLAS Extension APIs don't have a BLIS interface." + "Enable CBLAS or BLAS interface!\n" ); + +#else + +#ifdef CBLAS + enum CBLAS_ORDER cblas_order; + enum CBLAS_TRANSPOSE cblas_transa; + enum CBLAS_TRANSPOSE cblas_transb; + enum CBLAS_IDENTIFIER cblas_identifierA; + enum CBLAS_IDENTIFIER cblas_identifierB; + + size_t bufSizeA; + size_t bufSizeB; + + if ( ( stor_scheme == 'C' ) || ( stor_scheme == 'c' ) ) + cblas_order = CblasColMajor; + else + cblas_order = CblasRowMajor; + + if( bli_is_trans( transa ) ) + cblas_transa = CblasTrans; + else if( bli_is_conjtrans( transa ) ) + cblas_transa = CblasConjTrans; + else + cblas_transa = CblasNoTrans; + + if( bli_is_trans( transb ) ) + cblas_transb = CblasTrans; + else if( bli_is_conjtrans( transb ) ) + cblas_transb = CblasConjTrans; + else + cblas_transb = CblasNoTrans; + + if ( packA ) + cblas_identifierA = CblasAMatrix; + + if ( packB ) + cblas_identifierB = CblasBMatrix; +#else + f77_char f77_transa; + f77_char f77_transb; + f77_char f77_identifierA; + f77_char f77_identifierB; + f77_int f77_bufSizeA; + f77_int f77_bufSizeB; + + f77_char f77_packed = 'P'; + f77_identifierA = 'A'; + f77_identifierB = 'B'; + bli_param_map_blis_to_netlib_trans( transa, 
&f77_transa ); + bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + err_t err = BLIS_SUCCESS; + +#endif + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + + float* alphaonep = bli_obj_buffer( &alpha_one ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* bp = bli_obj_buffer( &b ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); + +#ifdef CBLAS + float* aBuffer; + float* bBuffer; + + if ( packA && !packB ) + { + // Only A is pre-packed. + bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix, + mm, + nn, + kk ); + aBuffer = (float*) bli_malloc_user( bufSizeA, &err ); + + cblas_sgemm_pack( cblas_order, + CblasAMatrix, + cblas_transa, + mm, + nn, + kk, + *alphap, + ap, lda, + aBuffer ); + + cblas_sgemm_compute( cblas_order, + CblasPacked, + cblas_transb, + mm, + nn, + kk, + aBuffer, lda, + bp, ldb, + *betap, + cp, ldc ); + + bli_free_user(aBuffer); + } + else if ( !packA && packB ) + { + // Only B is pre-packed. + bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix, + mm, + nn, + kk ); + bBuffer = (float*) bli_malloc_user( bufSizeB, &err ); + + cblas_sgemm_pack( cblas_order, + CblasBMatrix, + cblas_transb, + mm, + nn, + kk, + *alphap, + bp, ldb, + bBuffer ); + + cblas_sgemm_compute( cblas_order, + cblas_transa, + CblasPacked, + mm, + nn, + kk, + ap, lda, + bBuffer, ldb, + *betap, + cp, ldc ); + + bli_free_user(bBuffer); + } + else if ( packA && packB ) + { + // Both A & B are pre-packed. 
+ bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix, + mm, + nn, + kk ); + aBuffer = (float*) bli_malloc_user( bufSizeA, &err ); + + bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix, + mm, + nn, + kk ); + bBuffer = (float*) bli_malloc_user( bufSizeB, &err ); + + cblas_sgemm_pack( cblas_order, + CblasAMatrix, + cblas_transa, + mm, + nn, + kk, + *alphap, + ap, lda, + aBuffer ); + + cblas_sgemm_pack( cblas_order, + CblasBMatrix, + cblas_transb, + mm, + nn, + kk, + *alphaonep, + bp, ldb, + bBuffer ); + + cblas_sgemm_compute( cblas_order, + CblasPacked, + CblasPacked, + mm, + nn, + kk, + aBuffer, lda, + bBuffer, ldb, + *betap, + cp, ldc ); + + bli_free_user(aBuffer); + bli_free_user(bBuffer); + } + else + { + // Neither A nor B is pre-packed. + cblas_sgemm_compute( cblas_order, + cblas_transa, + cblas_transb, + mm, + nn, + kk, + ap, lda, + bp, ldb, + *betap, + cp, ldc ); + } +#else // -- BLAS API -- + float* aBuffer; + float* bBuffer; + + if ( packA && !packB ) + { + // Only A is pre-packed. + f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA, + &mm, + &nn, + &kk ); + aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err ); + + sgemm_pack_( &f77_identifierA, + &f77_transa, + &mm, + &nn, + &kk, + alphap, + ap, + (f77_int*)&lda, + aBuffer ); + + sgemm_compute_( &f77_packed, + &f77_transb, + &mm, + &nn, + &kk, + aBuffer, (f77_int*)&lda, + bp, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + + bli_free_user( aBuffer ); + } + else if ( !packA && packB ) + { + // Only B is pre-packed. 
+ f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB, + &mm, + &nn, + &kk ); + bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err ); + + sgemm_pack_( &f77_identifierB, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + bp, + (f77_int*)&ldb, + bBuffer ); + + sgemm_compute_( &f77_transa, + &f77_packed, + &mm, + &nn, + &kk, + ap, (f77_int*)&lda, + bBuffer, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + + bli_free_user( bBuffer ); + } + else if ( packA && packB ) + { + // Both A & B are pre-packed. + f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB, + &mm, + &nn, + &kk ); + + bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err ); + + f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA, + &mm, + &nn, + &kk ); + + aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err ); + + sgemm_pack_( &f77_identifierA, + &f77_transa, + &mm, + &nn, + &kk, + alphap, + ap, + (f77_int*)&lda, + aBuffer ); + + sgemm_pack_( &f77_identifierB, + &f77_transb, + &mm, + &nn, + &kk, + alphaonep, + bp, + (f77_int*)&ldb, + bBuffer ); + + sgemm_compute_( &f77_packed, + &f77_packed, + &mm, + &nn, + &kk, + aBuffer, (f77_int*)&lda, + bBuffer, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + + bli_free_user(aBuffer); + bli_free_user(bBuffer); + } + else + { + // Neither A nor B is reordered. + sgemm_compute_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + ap, (f77_int*)&lda, + bp, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + } +#endif + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + + double* alphap = bli_obj_buffer( &alpha ); + double* alphaonep = bli_obj_buffer( &alpha_one ); + double* ap = bli_obj_buffer( &a ); + double* bp = bli_obj_buffer( &b ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); + +#ifdef CBLAS + double* aBuffer; + double* bBuffer; + + if ( packA && !packB ) + { + // Only A is pre-packed. 
+ bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix, + mm, + nn, + kk ); + aBuffer = (double*) bli_malloc_user( bufSizeA, &err ); + + cblas_dgemm_pack( cblas_order, + CblasAMatrix, + cblas_transa, + mm, + nn, + kk, + *alphap, + ap, lda, + aBuffer ); + + cblas_dgemm_compute( cblas_order, + CblasPacked, + cblas_transb, + mm, + nn, + kk, + aBuffer, lda, + bp, ldb, + *betap, + cp, ldc ); + + bli_free_user(aBuffer); + } + else if ( !packA && packB ) + { + // Only B is pre-packed. + bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix, + mm, + nn, + kk ); + + cblas_dgemm_pack( cblas_order, + CblasBMatrix, + cblas_transb, + mm, + nn, + kk, + *alphap, + bp, ldb, + bBuffer ); + + cblas_dgemm_compute( cblas_order, + cblas_transa, + CblasPacked, + mm, + nn, + kk, + ap, lda, + bBuffer, ldb, + *betap, + cp, ldc ); + + bli_free_user(bBuffer); + } + else if ( packA && packB ) + { + // Both A & B are pre-packed. + bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix, + mm, + nn, + kk ); + aBuffer = (double*) bli_malloc_user( bufSizeA, &err ); + + bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix, + mm, + nn, + kk ); + bBuffer = (double*) bli_malloc_user( bufSizeB, &err ); + + cblas_dgemm_pack( cblas_order, + CblasAMatrix, + cblas_transa, + mm, + nn, + kk, + *alphap, + ap, lda, + aBuffer ); + + cblas_dgemm_pack( cblas_order, + CblasBMatrix, + cblas_transb, + mm, + nn, + kk, + *alphap, + bp, ldb, + bBuffer ); + + cblas_dgemm_compute( cblas_order, + CblasPacked, + CblasPacked, + mm, + nn, + kk, + aBuffer, lda, + bBuffer, ldb, + *betap, + cp, ldc ); + + bli_free_user(aBuffer); + bli_free_user(bBuffer); + } + else + { + // Neither A nor B is pre-packed. + cblas_dgemm_compute( cblas_order, + cblas_transa, + cblas_transb, + mm, + nn, + kk, + ap, lda, + bp, ldb, + *betap, + cp, ldc ); + } + +#else // -- BLAS API -- + double* aBuffer; + double* bBuffer; + + if ( packA && !packB ) + { + // Only A is pre-packed. 
+ f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA, + &mm, + &nn, + &kk ); + aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err ); + + dgemm_pack_( &f77_identifierA, + &f77_transa, + &mm, + &nn, + &kk, + alphap, + ap, + (f77_int*)&lda, + aBuffer ); + + dgemm_compute_( &f77_packed, + &f77_transb, + &mm, + &nn, + &kk, + aBuffer, (f77_int*)&lda, + bp, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + + bli_free_user( aBuffer ); + } + else if ( !packA && packB ) + { + // Only B is pre-packed. + f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB, + &mm, + &nn, + &kk ); + bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err ); + + dgemm_pack_( &f77_identifierB, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + bp, + (f77_int*)&ldb, + bBuffer ); + + dgemm_compute_( &f77_transa, + &f77_packed, + &mm, + &nn, + &kk, + ap, (f77_int*)&lda, + bBuffer, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + + bli_free_user( bBuffer ); + } + else if ( packA && packB ) + { + // Both A & B are pre-packed. + f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA, + &mm, + &nn, + &kk ); + aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err ); + + f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB, + &mm, + &nn, + &kk ); + bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err ); + + dgemm_pack_( &f77_identifierA, + &f77_transa, + &mm, + &nn, + &kk, + alphap, + ap, + (f77_int*)&lda, + aBuffer ); + + dgemm_pack_( &f77_identifierB, + &f77_transb, + &mm, + &nn, + &kk, + alphaonep, + bp, + (f77_int*)&ldb, + bBuffer ); + + dgemm_compute_( &f77_packed, + &f77_packed, + &mm, + &nn, + &kk, + aBuffer, (f77_int*)&lda, + bBuffer, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + + bli_free_user(aBuffer); + bli_free_user(bBuffer); + } + else + { + // Neither A nor B is reordered. 
+ dgemm_compute_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + ap, (f77_int*)&lda, + bp, (f77_int*)&ldb, + betap, + cp, (f77_int*)&ldc ); + } +#endif + } +#endif + +#ifdef PRINT + bli_printm( "c compute", &c, "%4.6f", "" ); +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + + printf( "data_%cgemm_%s", dt_ch, BLAS ); + + p_inc++; + printf("( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + (unsigned long)(p_inc), + (unsigned long)m, + (unsigned long)n, + (unsigned long)k, gflops); + + fprintf (fout, "%c %c %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \ + dt_ch, transA_c, transB_c, packA_c, packB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops); + + fflush(fout); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + fclose(fin); + fclose(fout); + + return 0; +} \ No newline at end of file diff --git a/bench/inputgemmpackcompute.txt b/bench/inputgemmpackcompute.txt new file mode 100644 index 0000000000..8b01d33d6b --- /dev/null +++ b/bench/inputgemmpackcompute.txt @@ -0,0 +1,92 @@ +sgemm_ S N N P U 1 1 1 1 0 1 1 1 0 1 +sgemm_ S N N P U 2 2 2 1 0 2 2 1 0 2 +sgemm_ S N N P U 3 3 3 1 0 3 3 1 0 3 +sgemm_ S N N P U 4 4 4 1 0 4 4 1 0 4 +sgemm_ S N N P U 5 5 5 1 0 5 5 1 0 5 +sgemm_ S N N P U 6 6 6 1 0 6 6 1 0 6 +sgemm_ S N N P U 7 7 7 1 0 7 7 1 0 7 +sgemm_ S N N P U 8 8 8 1 0 8 8 1 0 8 +sgemm_ S N N P U 9 9 9 1 0 9 9 1 0 9 +sgemm_ S N N P U 10 10 10 1 0 10 10 1 0 10 +sgemm_ S N N P U 20 20 20 1 0 20 20 1 0 20 +sgemm_ S N N P U 30 30 30 1 0 30 30 1 0 30 +sgemm_ S N N P U 40 40 40 1 0 40 40 1 0 40 +sgemm_ S N N P U 50 50 50 1 0 50 50 1 0 50 +sgemm_ S N N P U 60 60 60 1 0 60 60 1 0 60 +sgemm_ S N N P U 70 70 70 1 0 70 70 1 0 70 +sgemm_ S N N P U 80 80 80 1 0 80 80 1 0 80 +sgemm_ S N N P U 90 90 90 
1 0 90 90 1 0 90 +sgemm_ S N N P U 100 100 100 1 0 100 100 1 0 100 +sgemm_ S N N P U 200 200 200 1 0 200 200 1 0 200 +sgemm_ S N N P U 300 300 300 1 0 300 300 1 0 300 +sgemm_ S N N P U 400 400 400 1 0 400 400 1 0 400 +sgemm_ S N N P U 500 500 500 1 0 500 500 1 0 500 +dgemm_ D N N P U 1 1 1 1 0 1 1 1 0 1 +dgemm_ D N N P U 2 2 2 1 0 2 2 1 0 2 +dgemm_ D N N P U 3 3 3 1 0 3 3 1 0 3 +dgemm_ D N N P U 4 4 4 1 0 4 4 1 0 4 +dgemm_ D N N P U 5 5 5 1 0 5 5 1 0 5 +dgemm_ D N N P U 6 6 6 1 0 6 6 1 0 6 +dgemm_ D N N P U 7 7 7 1 0 7 7 1 0 7 +dgemm_ D N N P U 8 8 8 1 0 8 8 1 0 8 +dgemm_ D N N P U 9 9 9 1 0 9 9 1 0 9 +dgemm_ D N N P U 10 10 10 1 0 10 10 1 0 10 +dgemm_ D N N P U 20 20 20 1 0 20 20 1 0 20 +dgemm_ D N N P U 30 30 30 1 0 30 30 1 0 30 +dgemm_ D N N P U 40 40 40 1 0 40 40 1 0 40 +dgemm_ D N N P U 50 50 50 1 0 50 50 1 0 50 +dgemm_ D N N P U 60 60 60 1 0 60 60 1 0 60 +dgemm_ D N N P U 70 70 70 1 0 70 70 1 0 70 +dgemm_ D N N P U 80 80 80 1 0 80 80 1 0 80 +dgemm_ D N N P U 90 90 90 1 0 90 90 1 0 90 +dgemm_ D N N P U 100 100 100 1 0 100 100 1 0 100 +dgemm_ D N N P U 200 200 200 1 0 200 200 1 0 200 +dgemm_ D N N P U 300 300 300 1 0 300 300 1 0 300 +dgemm_ D N N P U 400 400 400 1 0 400 400 1 0 400 +dgemm_ D N N P U 500 500 500 1 0 500 500 1 0 500 +sgemm_ S N N U P 1 1 1 1 0 1 1 1 0 1 +sgemm_ S N N U P 2 2 2 1 0 2 2 1 0 2 +sgemm_ S N N U P 3 3 3 1 0 3 3 1 0 3 +sgemm_ S N N U P 4 4 4 1 0 4 4 1 0 4 +sgemm_ S N N U P 5 5 5 1 0 5 5 1 0 5 +sgemm_ S N N U P 6 6 6 1 0 6 6 1 0 6 +sgemm_ S N N U P 7 7 7 1 0 7 7 1 0 7 +sgemm_ S N N U P 8 8 8 1 0 8 8 1 0 8 +sgemm_ S N N U P 9 9 9 1 0 9 9 1 0 9 +sgemm_ S N N U P 10 10 10 1 0 10 10 1 0 10 +sgemm_ S N N U P 20 20 20 1 0 20 20 1 0 20 +sgemm_ S N N U P 30 30 30 1 0 30 30 1 0 30 +sgemm_ S N N U P 40 40 40 1 0 40 40 1 0 40 +sgemm_ S N N U P 50 50 50 1 0 50 50 1 0 50 +sgemm_ S N N U P 60 60 60 1 0 60 60 1 0 60 +sgemm_ S N N U P 70 70 70 1 0 70 70 1 0 70 +sgemm_ S N N U P 80 80 80 1 0 80 80 1 0 80 +sgemm_ S N N U P 90 90 90 1 0 90 90 1 0 90 
+sgemm_ S N N U P 100 100 100 1 0 100 100 1 0 100 +sgemm_ S N N U P 200 200 200 1 0 200 200 1 0 200 +sgemm_ S N N U P 300 300 300 1 0 300 300 1 0 300 +sgemm_ S N N U P 400 400 400 1 0 400 400 1 0 400 +sgemm_ S N N U P 500 500 500 1 0 500 500 1 0 500 +dgemm_ D N N U P 1 1 1 1 0 1 1 1 0 1 +dgemm_ D N N U P 2 2 2 1 0 2 2 1 0 2 +dgemm_ D N N U P 3 3 3 1 0 3 3 1 0 3 +dgemm_ D N N U P 4 4 4 1 0 4 4 1 0 4 +dgemm_ D N N U P 5 5 5 1 0 5 5 1 0 5 +dgemm_ D N N U P 6 6 6 1 0 6 6 1 0 6 +dgemm_ D N N U P 7 7 7 1 0 7 7 1 0 7 +dgemm_ D N N U P 8 8 8 1 0 8 8 1 0 8 +dgemm_ D N N U P 9 9 9 1 0 9 9 1 0 9 +dgemm_ D N N U P 10 10 10 1 0 10 10 1 0 10 +dgemm_ D N N U P 20 20 20 1 0 20 20 1 0 20 +dgemm_ D N N U P 30 30 30 1 0 30 30 1 0 30 +dgemm_ D N N U P 40 40 40 1 0 40 40 1 0 40 +dgemm_ D N N U P 50 50 50 1 0 50 50 1 0 50 +dgemm_ D N N U P 60 60 60 1 0 60 60 1 0 60 +dgemm_ D N N U P 70 70 70 1 0 70 70 1 0 70 +dgemm_ D N N U P 80 80 80 1 0 80 80 1 0 80 +dgemm_ D N N U P 90 90 90 1 0 90 90 1 0 90 +dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100 +dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200 +dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300 +dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400 +dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500 \ No newline at end of file diff --git a/frame/3/CMakeLists.txt b/frame/3/CMakeLists.txt index 734622344a..b3db987c3a 100644 --- a/frame/3/CMakeLists.txt +++ b/frame/3/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.## target_sources("${PROJECT_NAME}" PRIVATE @@ -26,12 +26,13 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_oapi.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_tapi.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_smart_threading.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute.c ) # Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR - ${TARGET_ARCH} STREQUAL zen2 OR +if(${TARGET_ARCH} STREQUAL zen OR + ${TARGET_ARCH} STREQUAL zen2 OR ${TARGET_ARCH} STREQUAL zen3 OR - ${TARGET_ARCH} STREQUAL zen4 OR + ${TARGET_ARCH} STREQUAL zen4 OR ${TARGET_ARCH} STREQUAL amdzen) target_sources("${PROJECT_NAME}" PRIVATE diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index af6d93a7fb..6250405995 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-22, Advanced Micro Devices, Inc. + Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -105,3 +105,6 @@ // Smart Threading API's. #include "bli_l3_smart_threading.h" + +// BLAS Extension API - Compute +#include "bli_l3_compute.h" \ No newline at end of file diff --git a/frame/3/bli_l3_compute.c b/frame/3/bli_l3_compute.c new file mode 100644 index 0000000000..c7c48a8f49 --- /dev/null +++ b/frame/3/bli_l3_compute.c @@ -0,0 +1,637 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_compute_init +( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm +) +{ + if ( bli_error_checking_is_enabled() ) + { + // @todo: Add call to error checking function here + } + + // Initializing the cntx if one isn't already passed. + if ( cntx == NULL ) { + cntx = bli_gks_query_cntx(); + } + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) + { + bli_rntm_init_from_global( &rntm_l ); + rntm = &rntm_l; + } + else + { + rntm_l = *rntm; + rntm = &rntm_l; + } + + // @todo: AOCL Dynamic yet to be implemented for pack-compute APIs. +#ifdef AOCL_DYNAMIC + // If dynamic-threading is enabled, calculate optimum number + // of threads. + // rntm will be updated with optimum number of threads. 
+ + // bli_nthreads_optimum(a, b, c, BLIS_GEMM, rntm ); +#endif + + // Explicitly set n_threads=1 and update rntm since only ST supported. + dim_t n_threads = 1; + bli_rntm_set_num_threads( n_threads, rntm ); + bli_rntm_set_ways_from_rntm_sup + ( + bli_obj_length( c ), + bli_obj_width( c ), + bli_obj_width( a ), + rntm + ); + + bli_l3_compute_thread_decorator + ( + bli_gemm_compute, + BLIS_GEMM, + a, + b, + beta, + c, + cntx, + rntm + ); +} + +err_t bli_gemm_compute +( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread +) +{ + const num_t dt = bli_obj_dt( c ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a; + inc_t cs_a; + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b; + inc_t cs_b; + + stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); + const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + + // packedX defines whether matrix X is pre-packed (reordered) or not. + bool packeda = bli_obj_is_packed( a ); + bool packedb = bli_obj_is_packed( b ); + + // packX defines whether to pack matrix X on-the-go or not. + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); + const bool transa = bli_obj_has_trans( a ); + const bool transb = bli_obj_has_trans( b ); + + // is_col_stored_a = TRUE when, + // A is col stored and not transposed, + // or, A is row stored and transposed. + const bool is_col_stored_a = bli_obj_is_col_stored( a ) && !transa; + + // is_row_stored_b = TRUE when, + // B is row stored and not transposed, + // or, B is col stored and transposed. + const bool is_row_stored_b = bli_obj_is_row_stored( b ) && !transb; + + // If kernel is row-preferred but B is not row-stored and unpacked, + // enable on-the-go packing of B. 
+ // Else if kernel is col-preferred but A is not col-stored and unpacked, + // enable on-the-go packing of A. + if ( row_pref ) + { + if ( !packedb && !is_row_stored_b ) packb = TRUE; + } + else // if ( col_pref ) + { + if ( !packeda && !is_col_stored_a ) packa = TRUE; + } + + if ( bli_obj_has_notrans( a ) ) + { + k = bli_obj_width( a ); + + rs_a = bli_obj_row_stride( a ); + cs_a = bli_obj_col_stride( a ); + } + else // if ( bli_obj_has_trans( a ) ) + { + // Assign the variables with an implicit transposition. + k = bli_obj_length( a ); + + rs_a = bli_obj_col_stride( a ); + cs_a = bli_obj_row_stride( a ); + } + + if ( bli_obj_has_notrans( b ) ) + { + rs_b = bli_obj_row_stride( b ); + cs_b = bli_obj_col_stride( b ); + } + else // if ( bli_obj_has_trans( b ) ) + { + rs_b = bli_obj_col_stride( b ); + cs_b = bli_obj_row_stride( b ); + } + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Setting the packing status in rntm. + if ( packa ) bli_rntm_set_pack_a( 1, rntm ); + else bli_rntm_set_pack_a( 0, rntm ); + + if ( packb ) bli_rntm_set_pack_b( 1, rntm ); + else bli_rntm_set_pack_b( 0, rntm ); + + if ( bli_is_float( dt ) ) + { + PASTEMAC( s, gemm_compute ) + ( + packa, + packb, + packeda, + packedb, + m, + n, + k, + buf_a, rs_a, cs_a, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + BLIS_RRR, // Using BLIS_RRR since we want to redirect to m kernels. + cntx, + rntm, + thread + ); + } + else if ( bli_is_double( dt ) ) + { + PASTEMAC( d, gemm_compute ) + ( + packa, + packb, + packeda, + packedb, + m, + n, + k, + buf_a, rs_a, cs_a, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + BLIS_RRR, // Using BLIS_RRR since we want to redirect to m kernels. 
+ cntx, + rntm, + thread + ); + } + + return BLIS_SUCCESS; +} + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC( ch, varname ) \ + ( \ + bool packa, \ + bool packb, \ + bool packeda, \ + bool packedb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC( ch, type ); \ +\ + /* If m or n is zero, return immediately. */ \ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + /* @todo Add early return for k < 1 or alpha = 0 here. */ \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* @note: Modifications of KC are just a part of optimizations. + Such optimizations have been removed for simplicity and will be a part + of the optimizations patch. */ \ + dim_t KC; \ + KC = KC0; \ +\ + /* Query the maximum blocksize for NR, which implies a maximum blocksize + extension for the final iteration. */ \ + const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ + const dim_t NRE = NRM - NR; \ +\ + /* Compute partitioning step values for each matrix of each loop. 
*/ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t jcstep_b_use = k; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t pcstep_a_use = ( ( m + MR - 1 ) / MR ) * MR; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + PASTECH(ch,gemmsup_ker_ft) \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of beta and one scalars to prevent any unnecessary + sharing of cache lines between the cores' caches. */ \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ +\ + auxinfo_t aux; \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. */ \ + /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ +\ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. */ \ +\ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. 
*/ \ + const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ + const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict b_jc_use = b_00 + jj * jcstep_b_use; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). 
*/ \ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ + const inc_t icstep_a_use = kc_cur; \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + ctype* restrict b_pc_use; \ + ctype* restrict a_pc_use = a_00 + pp * pcstep_a_use; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing B, we alias to + the _pc variables so that code further down can unconditionally + reference the _pb variables. Note that *if* we will be packing + B, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_sup_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_pc[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pb = &bszids_pc[0]; \ + thread_pb = thread_pc; } \ +\ + /* Determine the packing buffer and related parameters for matrix + B. (If B will not be packed, then b_use will be set to point to + b and the _b_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. */ \ +\ + /* packedb == TRUE indicates that B is reordered thus, update the + necessary pointers. + Else, call packm routine to pack B on-the-go.
*/ \ + if ( packedb ) \ + { \ + rs_b_use = NR; \ + cs_b_use = 1; \ + ps_b_use = kc_cur * NR; \ + b_pc_use = b_jc_use + pp * pcstep_b_use; \ + } else \ + { \ + PASTEMAC(ch,packm_sup_b) \ + ( \ + packb, \ + BLIS_BUFFER_FOR_B_PANEL, \ + stor_id, \ + BLIS_NO_TRANSPOSE, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + b_pc_use = b_use; \ + } \ +\ + /* The panel stride of B is embedded within the auxinfo_t object here even + though this variant iterates through B in the jr loop, which occurs + here, within the macrokernel, not within the millikernel. + NOTE(review): confirm whether the millikernel actually reads ps_b. */ \ + bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_ic = &bszids_pb[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* restrict a_ic_use; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing A.
If we won't be packing A, we alias to + the _ic variables so that code further down can unconditionally + reference the _pa variables. Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_sup_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_ic[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pa = &bszids_ic[0]; \ + thread_pa = thread_ic; } \ +\ + /* Determine the packing buffer and related parameters for matrix + A. (If A will not be packed, then a_use will be set to point to + a and the _a_use strides will be set accordingly.) Then call + the packm sup variant chooser, which will call the appropriate + implementation based on the schema deduced from the stor_id. */ \ + /* packeda == TRUE indicates that A is reordered; thus, update the + necessary pointers. + Else, call packm routine to pack A on-the-go. */ \ + if ( packeda ) \ + { \ + rs_a_use = 1; \ + cs_a_use = MR; \ + ps_a_use = MR * kc_cur; \ + a_ic_use = a_pc_use + ii * icstep_a_use; \ + } \ + else \ + { \ + PASTEMAC(ch,packm_sup_a) \ + ( \ + packa, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ + stor_id, /* a "block of A." */ \ + BLIS_NO_TRANSPOSE, \ + MC, KC, /* This "block of A" is (at most) MC x KC. */ \ + mc_cur, kc_cur, MR, \ + &one_local, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + a_ic_use = a_use; \ + } \ +\ + /* Embed the panel stride of A within the auxinfo_t object. The + millikernel will query and use this to iterate through + micropanels of A (if needed). */ \ + bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ +\ + /* Grow the thrinfo_t tree.
*/ \ + bszid_t* restrict bszids_jr = &bszids_pa[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* An optimization: allow the last jr iteration to contain up to NRE + columns of C and B. (If NRE > NR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing B + since packing an extended edge case is not yet supported. */ \ + if ( !packb && !is_mt ) \ + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ + { \ + jr_iter--; jr_left += NR; \ + } \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + { \ + /* Invoke the gemmsup millikernel. */ \ + gemmsup_ker \ + ( \ + BLIS_NO_CONJUGATE, \ + BLIS_NO_CONJUGATE, \ + mc_cur, \ + nr_cur, \ + kc_cur, \ + &one_local, \ + a_ic_use, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + beta_use, \ + c_jr, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* NOTE: This barrier is only needed if we are packing B (since + that matrix is packed within the pc loop of this variant). */ \ + if ( packb ) bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. 
*/ \ + PASTEMAC(ch,packm_sup_finalize_mem_a) \ + ( \ + packa, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTEMAC(ch,packm_sup_finalize_mem_b) \ + ( \ + packb, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +} + +INSERT_GENTFUNC_BASIC0_SD( gemm_compute ) \ No newline at end of file diff --git a/frame/3/bli_l3_compute.h b/frame/3/bli_l3_compute.h new file mode 100644 index 0000000000..ed036d8d2d --- /dev/null +++ b/frame/3/bli_l3_compute.h @@ -0,0 +1,80 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm_compute_init +( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm +); + +err_t bli_gemm_compute +( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread +); + +// Prototype BLAS-like interfaces with void pointer operands. + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC( ch, varname ) \ + ( \ + bool packa, \ + bool packb, \ + bool packeda, \ + bool packedb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ); + +INSERT_GENTPROT_BASIC0( gemm_compute ) \ No newline at end of file diff --git a/frame/base/bli_param_map.h b/frame/base/bli_param_map.h index 58f179d006..5fc0fe9058 100644 --- a/frame/base/bli_param_map.h +++ b/frame/base/bli_param_map.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -84,6 +84,7 @@ BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_t if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; + else if ( trans == 'p' || trans == 'P' ) *blis_trans = BLIS_PACKED; else { // See comment for bli_param_map_netlib_to_blis_side() above. diff --git a/frame/compat/CMakeLists.txt b/frame/compat/CMakeLists.txt index 3b1ab26705..0cd2059d8a 100644 --- a/frame/compat/CMakeLists.txt +++ b/frame/compat/CMakeLists.txt @@ -30,11 +30,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_imatcopy.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy2.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatadd.c +${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c +${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c +${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute.c ) # Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR +if(${TARGET_ARCH} STREQUAL zen OR +${TARGET_ARCH} STREQUAL zen2 OR ${TARGET_ARCH} STREQUAL zen3 OR ${TARGET_ARCH} STREQUAL zen4 OR ${TARGET_ARCH} STREQUAL amdzen) @@ -49,8 +52,6 @@ ${TARGET_ARCH} STREQUAL amdzen) ${CMAKE_CURRENT_SOURCE_DIR}/bla_scal_amd.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_swap_amd.c ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c ) else() target_sources("${PROJECT_NAME}" diff --git a/frame/compat/bla_gemm_compute.c b/frame/compat/bla_gemm_compute.c new file mode 100644 index 0000000000..e68aa68df0 --- /dev/null +++ b/frame/compat/bla_gemm_compute.c @@ -0,0 +1,285 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// BLAS Extension APIs +/* ?gemm_compute.h */ +/* BLAS interface to compute matrix-matrix product */ +/* Datatype : s & d (single and double precision only supported) */ +/* BLAS Extensions */ +/* output is the gemm result */ + +#include "blis.h" + +void sgemm_compute_blis_impl +( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const float* a, const f77_int* rs_a, const f77_int* cs_a, + const float* b, const f77_int* rs_b, const f77_int* cs_b, + const float* beta, + float* c, const f77_int* rs_c, const f77_int* cs_c +) +{ + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + /* Initialize BLIS. */ + bli_init_auto(); + + // @todo: Add AOCL DTL logs + // AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + // AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, + // (void*)alpha, *lda, *ldb, (void*)beta, *ldc); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm_compute) + ( + MKSTR(s), + MKSTR(gemm), + transa, + transb, + m, + n, + k, + ( ( *rs_a != 1 ) ? rs_a : cs_a ), + ( ( *rs_b != 1 ) ? rs_b : cs_b ), + rs_c, cs_c + ); + + /* Quick return if possible. */ + if ( *m == 0 || *n == 0 ) + { + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1(*m, m0); + bli_convert_blas_dim1(*n, n0); + bli_convert_blas_dim1(*k, k0); + + const num_t dt = BLIS_FLOAT; + + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); + + bli_obj_init_finish_1x1( dt, (float*)beta, &betao ); + + bli_obj_init_finish( dt, m0_a, n0_a, (float*)a, *rs_a, *cs_a, &ao ); + bli_obj_init_finish( dt, m0_b, n0_b, (float*)b, *rs_b, *cs_b, &bo ); + bli_obj_init_finish( dt, m0, n0, (float*)c, *rs_c, *cs_c, &co ); + + bli_obj_set_conjtrans( blis_transa, &ao ); + bli_obj_set_conjtrans( blis_transb, &bo ); + + PASTEMAC0( gemm_compute_init ) + ( + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + + /* Finalize BLIS. 
*/ + bli_finalize_auto(); + return; +} + +#ifdef BLIS_ENABLE_BLAS +void sgemm_compute_ +( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const float* a, const f77_int* lda, + const float* b, const f77_int* ldb, + const float* beta, + float* c, const f77_int* ldc +) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + sgemm_compute_blis_impl( transa, + transb, + m, + n, + k, + a, &rs_a, lda, + b, &rs_b, ldb, + beta, + c, &rs_c, ldc ); +} +#endif + +void dgemm_compute_blis_impl +( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const double* a, const f77_int* rs_a, const f77_int* cs_a, + const double* b, const f77_int* rs_b, const f77_int* cs_b, + const double* beta, + double* c, const f77_int* rs_c, const f77_int* cs_c +) +{ + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + /* Initialize BLIS. */ + bli_init_auto(); + + // @todo: Add AOCL DTL logs + // AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + // AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, + // (void*)alpha, *lda, *ldb, (void*)beta, *ldc); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm_compute) + ( + MKSTR(d), + MKSTR(gemm), + transa, + transb, + m, + n, + k, + ( ( *rs_a != 1 ) ? rs_a : cs_a ), + ( ( *rs_b != 1 ) ? rs_b : cs_b ), + rs_c, cs_c + ); + + /* Quick return if possible. */ + if ( *m == 0 || *n == 0 ) + { + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); + + /* Typecast BLAS integers to BLIS integers. 
*/ + bli_convert_blas_dim1(*m, m0); + bli_convert_blas_dim1(*n, n0); + bli_convert_blas_dim1(*k, k0); + + const num_t dt = BLIS_DOUBLE; + + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); + + bli_obj_init_finish_1x1( dt, (double*)beta, &betao ); + + bli_obj_init_finish( dt, m0_a, n0_a, (double*)a, *rs_a, *cs_a, &ao ); + bli_obj_init_finish( dt, m0_b, n0_b, (double*)b, *rs_b, *cs_b, &bo ); + bli_obj_init_finish( dt, m0, n0, (double*)c, *rs_c, *cs_c, &co ); + + bli_obj_set_conjtrans( blis_transa, &ao ); + bli_obj_set_conjtrans( blis_transb, &bo ); + + PASTEMAC0( gemm_compute_init ) + ( + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + + /* Finalize BLIS. */ + bli_finalize_auto(); +} + +#ifdef BLIS_ENABLE_BLAS +BLIS_EXPORT_BLAS void dgemm_compute_ +( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const double* a, const f77_int* lda, + const double* b, const f77_int* ldb, + const double* beta, + double* c, const f77_int* ldc +) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + dgemm_compute_blis_impl( transa, + transb, + m, + n, + k, + a, &rs_a, lda, + b, &rs_b, ldb, + beta, + c, &rs_c, ldc ); +} +#endif \ No newline at end of file diff --git a/frame/compat/bla_gemm_compute.h b/frame/compat/bla_gemm_compute.h new file mode 100644 index 0000000000..c50e5b884d --- /dev/null +++ b/frame/compat/bla_gemm_compute.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// BLAS Extension APIs +/* ?gemm_compute.h */ +/* BLAS interface to compute matrix-matrix product */ +/* Datatype : s & d (single and double precision only supported) */ +/* BLAS Extensions */ +/* output is the gemm result */ + +#undef GENTPROTRO +#define GENTPROTRO( ftype, ch, blasname ) \ +\ +IF_BLIS_ENABLE_BLAS(\ +BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* a, const f77_int* lda, \ + const ftype* b, const f77_int* ldb, \ + const ftype* beta, \ + ftype* c, const f77_int* ldc \ + ); \ +)\ +BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* a, const f77_int* rs_a, const f77_int* cs_a, \ + const ftype* b, const f77_int* rs_b, const f77_int* cs_b, \ + const ftype* beta, \ + ftype* c, const f77_int* rs_c, const f77_int* cs_c \ + ); + +INSERT_GENTPROTRO_BLAS( gemm_compute ) \ No newline at end of file diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index 99e8ff5962..c3028c1e1f 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -183,6 +183,7 @@ #include "bla_trmm.h" #include "bla_trsm.h" #include "bla_gemmt.h" +#include "bla_gemm_compute.h" #include "bla_gemm_check.h" #include "bla_hemm_check.h" @@ -194,6 +195,7 @@ #include "bla_trmm_check.h" #include "bla_trsm_check.h" #include "bla_gemmt_check.h" +#include "bla_gemm_compute_check.h" // -- Batch Extension prototypes -- #include "bla_gemm_batch.h" diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h index dcccb07baa..fa957b9f84 100644 --- a/frame/compat/cblas/src/cblas.h +++ b/frame/compat/cblas/src/cblas.h @@ -48,6 +48,8 @@ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG 
{CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +enum CBLAS_STORAGE {CblasPacked=151}; +enum CBLAS_IDENTIFIER {CblasAMatrix=161, CblasBMatrix=162}; #ifdef __cplusplus extern "C" { @@ -993,6 +995,190 @@ BLIS_EXPORT_BLAS f77_int cblas_idamin(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamin(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamin(f77_int N, const void *X, f77_int incX); + +// -- PACK COMPUTE APIs -- +/** \addtogroup INTERFACE CBLAS INTERFACE + * @{ + */ + +/** +* cblas_sgemm_pack_get_size calculates and returns the number of bytes necessary +* to store the specified matrix after packing. +* +* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix. +* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C. +* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C. +* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B). +* @return The size in bytes required to store the specified matrix after packing. +*/ +BLIS_EXPORT_BLAS f77_int cblas_sgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier, + const f77_int M, const f77_int N, const f77_int K); + +/** +* cblas_dgemm_pack_get_size calculates and returns the number of bytes necessary +* to store the specified matrix after packing. +* +* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix. +* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C. +* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C. +* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B). +* @return The size in bytes required to store the specified matrix after packing. 
+*/ +BLIS_EXPORT_BLAS f77_int cblas_dgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier, + const f77_int M, const f77_int N, const f77_int K); + +/** +* cblas_sgemm_pack scales by alpha and packs the specified matrix into the +* allocated buffer. It is imperative to allocate a buffer of type float and size +* as returned by the cblas_sgemm_pack_get_size() before invoking this routine. +* +* @note If both the matrices are to be packed, the user must ensure that only +* one matrix is packed with the scalar alpha and the other with a unit-scalar. +* +* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor. +* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix. +* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication: +* if trans = CblasNoTrans, then Mat(X) = X; +* if trans = CblasTrans, then Mat(X) = \f$X^T\f$; +* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$. +* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C. +* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C. +* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B). +* @param[in] alpha Specifies the scalar alpha. +* @param[in] src The matrix to be packed. +* @param[in] ld Specifies the leading dimension of the matrix to be packed. +* @param[out] dest The buffer to store the scaled and packed matrix. +* @return None +*/ +BLIS_EXPORT_BLAS void cblas_sgemm_pack(enum CBLAS_ORDER Order, + enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans, + const f77_int M, const f77_int N, const f77_int K, + const float alpha, const float *src, const f77_int ld, + float* dest ); + +/** +* cblas_dgemm_pack scales by alpha and packs the specified matrix into the +* allocated buffer. 
It is imperative to allocate a buffer of type double and +* size as returned by the cblas_dgemm_pack_get_size() before invoking this +* routine. +* +* @note If both the matrices are to be packed, the user must ensure that only +* one matrix is packed with the scalar alpha and the other with a unit-scalar. +* +* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor. +* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix. +* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication: +* if trans = CblasNoTrans, then Mat(X) = X; +* if trans = CblasTrans, then Mat(X) = \f$X^T\f$; +* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$. +* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C. +* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C. +* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B). +* @param[in] alpha Specifies the scalar alpha. +* @param[in] src The matrix to be packed. +* @param[in] ld Specifies the leading dimension of the matrix to be packed. +* @param[out] dest The buffer to store the scaled and packed matrix. +* @return None +*/ +BLIS_EXPORT_BLAS void cblas_dgemm_pack(enum CBLAS_ORDER Order, + enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans, + const f77_int M, const f77_int N, const f77_int K, + const double alpha, const double *src, const f77_int ld, + double* dest ); + +/** +* cblas_sgemm_compute computes the matrix-matrix product where one or both the +* input matrices are packed and adds this to the scalar-matrix product. 
This +* operation is defined as: +* C := Mat(A) * Mat(B) + beta*C, +* where, +* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$, +* beta is a scalar, +* A, B and C are matrices: +* Mat(A) is an mxk matrix, or a packed matrix buffer, +* Mat(B) is a kxn matrix, or a packed matrix buffer, +* C is an mxn matrix. +* +* @note In case both the matrices are to be packed, the user must ensure that +* only one matrix is packed with alpha scalar and the other with a unit-scalar, +* during the packing process. +* +* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor. +* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication: +* if transa = CblasNoTrans, then Mat(A) = A; +* if transa = CblasTrans, then Mat(A) = \f$A^T\f$; +* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$; +* if transa = CblasPacked, then A matrix is packed and lda is ignored. +* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication: +* if transb = CblasNoTrans, then Mat(B) = B; +* if transb = CblasTrans, then Mat(B) = \f$B^T\f$; +* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$; +* if transb = CblasPacked, then B matrix is packed and ldb is ignored. +* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C. +* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C. +* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B). +* @param[in] A The array is float matrix A or a buffer with packed matrix A. +* @param[in] lda Specifies the leading dimension of A. +* @param[in] B The array is float matrix B or a buffer with packed matrix B. +* @param[in] ldb Specifies the leading dimension of B. +* @param[in] beta Specifies the scalar beta. +* @param[in,out] C The array is float matrix C. +* @param[in] ldc Specifies the leading dimension of C. 
+* @return None +*/ +BLIS_EXPORT_BLAS void cblas_sgemm_compute(enum CBLAS_ORDER Order, + f77_int TransA, f77_int TransB, + const f77_int M, const f77_int N, const f77_int K, + const float* A, f77_int lda, const float* B, f77_int ldb, + float beta, float* C, f77_int ldc); + +/** +* cblas_dgemm_compute computes the matrix-matrix product where one or both the +* input matrices are packed and adds this to the scalar-matrix product. This +* operation is defined as: +* C := Mat(A) * Mat(B) + beta*C, +* where, +* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$, +* beta is a scalar, +* A, B and C are matrices: +* Mat(A) is an mxk matrix, or a packed matrix buffer, +* Mat(B) is a kxn matrix, or a packed matrix buffer, +* C is an mxn matrix. +* +* @note In case both the matrices are to be packed, the user must ensure that +* only one matrix is packed with alpha scalar and the other with a unit-scalar, +* during the packing process. +* +* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor. +* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication: +* if transa = CblasNoTrans, then Mat(A) = A; +* if transa = CblasTrans, then Mat(A) = \f$A^T\f$; +* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$; +* if transa = CblasPacked, then A matrix is packed and lda is ignored. +* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication: +* if transb = CblasNoTrans, then Mat(B) = B; +* if transb = CblasTrans, then Mat(B) = \f$B^T\f$; +* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$; +* if transb = CblasPacked, then B matrix is packed and ldb is ignored. +* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C. +* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C. +* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B). 
+* @param[in] A The array is double matrix A or a buffer with packed matrix A. +* @param[in] lda Specifies the leading dimension of A. +* @param[in] B The array is double matrix B or a buffer with packed matrix B. +* @param[in] ldb Specifies the leading dimension of B. +* @param[in] beta Specifies the scalar beta. +* @param[in,out] C The array is double matrix C. +* @param[in] ldc Specifies the leading dimension of C. +* @return None +*/ +BLIS_EXPORT_BLAS void cblas_dgemm_compute(enum CBLAS_ORDER Order, + f77_int TransA, f77_int TransB, + const f77_int M, const f77_int N, const f77_int K, + const double* A, f77_int lda, const double* B, f77_int ldb, + double beta, double* C, f77_int ldc); +/** @}*/ + #ifdef __cplusplus } #endif diff --git a/frame/compat/cblas/src/cblas_dgemm_compute.c b/frame/compat/cblas/src/cblas_dgemm_compute.c new file mode 100644 index 0000000000..ed55f8a805 --- /dev/null +++ b/frame/compat/cblas/src/cblas_dgemm_compute.c @@ -0,0 +1,172 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS + +#include "cblas.h" +#include "cblas_f77.h" + +BLIS_EXPORT_BLAS void cblas_dgemm_compute( enum CBLAS_ORDER Order, + f77_int TransA, + f77_int TransB, + const f77_int M, const f77_int N, + const f77_int K, + const double* A, f77_int lda, + const double* B, f77_int ldb, + double beta, + double* C, f77_int ldc ) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if ( Order == CblasColMajor ) // CblasColMajor + { + if ( TransA == CblasTrans ) TA='T'; + else if ( TransA == CblasConjTrans ) TA='T'; + else if ( TransA == CblasNoTrans ) TA='N'; + else if ( TransA == CblasPacked ) TA='P'; + else + { + cblas_xerbla(2, "cblas_dgemm_compute", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( 
TransB == CblasTrans ) TB='T'; + else if ( TransB == CblasConjTrans ) TB='T'; + else if ( TransB == CblasNoTrans ) TB='N'; + else if ( TransB == CblasPacked ) TB='P'; + else + { + cblas_xerbla(3, "cblas_dgemm_compute", + "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + +#ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); +#endif + + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + + F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda, + B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc); + } + else if ( Order == CblasRowMajor ) // CblasRowMajor + { + RowMajorStrg = 1; + + // If Row Major, and A is not already reordered + // then toggle the transA parameter and interchange the strides. + if ( TransA == CblasPacked ) TA='P'; + else if ( TransA == CblasTrans ) TA='N'; + else if ( TransA == CblasNoTrans ) TA='T'; + else if ( TransA == CblasConjTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_dgemm_compute", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + // If Row Major, and B is not already reordered + // then toggle the transB parameter and interchange the strides. 
+ if ( TransB == CblasPacked ) TB='P'; + else if ( TransB == CblasTrans ) TB='N'; + else if ( TransB == CblasNoTrans ) TB='T'; + else if ( TransB == CblasConjTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_dgemm_compute", + "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + +#ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); +#endif + + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int cs_c = 1; + + F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda, + B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c ); + } + else + { + cblas_xerbla(1, "cblas_dgemm_compute", + "Illegal Order setting, %d\n", Order); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + return; +} +#endif \ No newline at end of file diff --git a/frame/compat/cblas/src/cblas_dgemm_pack.c b/frame/compat/cblas/src/cblas_dgemm_pack.c new file mode 100644 index 0000000000..9ddba3bcaa --- /dev/null +++ b/frame/compat/cblas/src/cblas_dgemm_pack.c @@ -0,0 +1,157 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS + +#include "cblas.h" +#include "cblas_f77.h" + +BLIS_EXPORT_BLAS void cblas_dgemm_pack( enum CBLAS_ORDER Order, + enum CBLAS_IDENTIFIER Identifier, + enum CBLAS_TRANSPOSE Trans, + const f77_int M, + const f77_int N, + const f77_int K, + const double alpha, + const double* src, const f77_int ld, + double* dest ) +{ + char TR; + char ID; + +#ifdef F77_CHAR + F77_CHAR F77_TR; + F77_CHAR F77_ID; +#else +#define F77_TR &TR +#define F77_ID &ID +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld; +#else + +#define F77_M M +#define F77_N N +#define F77_K K +#define F77_ld ld + +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + + if ( Order == CblasColMajor ) // CblasColMajor + { + if ( Trans == CblasNoTrans ) TR = 'N'; + else if ( Trans == CblasTrans ) TR = 'T'; + else if ( Trans == CblasConjTrans ) TR = 'T'; + else + { + cblas_xerbla(3, "cblas_dgemm_pack","Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Identifier == CblasAMatrix ) ID = 'A'; + else if ( Identifier == CblasBMatrix ) ID = 'B'; + else + { + 
cblas_xerbla(3, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + +#ifdef F77_CHAR + F77_TR = C2F_CHAR(&TR); + F77_ID = C2F_CHAR(&ID); +#endif + F77_dgemm_pack( F77_ID, + F77_TR, + &F77_M, + &F77_N, + &F77_K, + &alpha, + src, &F77_ld, + dest ); + } + else if ( Order == CblasRowMajor ) // CblasRowMajor + { + RowMajorStrg = 1; + if ( Trans == CblasNoTrans ) TR = 'T'; + else if ( Trans == CblasTrans ) TR = 'N'; + else if ( Trans == CblasConjTrans ) TR = 'N'; + else + { + cblas_xerbla(3, "cblas_dgemm_pack","Invalid Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Identifier == CblasAMatrix ) ID = 'A'; + else if ( Identifier == CblasBMatrix ) ID = 'B'; + else + { + cblas_xerbla(3, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + +#ifdef F77_CHAR + F77_TR = C2F_CHAR(&TR); + F77_ID = C2F_CHAR(&ID); +#endif + F77_dgemm_pack ( F77_ID, + F77_TR, + &F77_M, + &F77_N, + &F77_K, + &alpha, + src, &F77_ld, + dest ); + } + else cblas_xerbla(1, "cblas_dgemm_pack", "Invalid Order setting, %d\n", Order); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} +#endif \ No newline at end of file diff --git a/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c b/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c new file mode 100644 index 0000000000..5001ed15a8 --- /dev/null +++ b/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c @@ -0,0 +1,83 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS + +#include "cblas.h" +#include "cblas_f77.h" + +f77_int cblas_dgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier, + const f77_int M, + const f77_int N, + const f77_int K ) +{ + AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 ); + + char ID; + f77_int tbytes = 0; + +#ifdef F77_CHAR + F77_CHAR F77_ID; +#else + #define F77_ID &ID +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K; +#else + #define F77_M M + #define F77_N N + #define F77_K K +#endif + + if (Identifier == CblasAMatrix ) ID = 'A'; + else if (Identifier == CblasBMatrix ) ID = 'B'; + else + { + cblas_xerbla( 1, "cblas_dgemm_pack_get_size", + "Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier ); + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 ); + return 0; + } + +#ifdef F77_CHAR + F77_ID = C2F_CHAR( &ID ); +#endif + tbytes = F77_dgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K ); + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 ); + return tbytes; +} +#endif \ No newline at end of file diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h index be02986ae7..18bbad51b7 100644 --- a/frame/compat/cblas/src/cblas_f77.h +++ b/frame/compat/cblas/src/cblas_f77.h @@ -202,6 +202,14 @@ #define F77_cgemm_batch cgemm_batch #define F77_zgemm_batch zgemm_batch +// -- Pack-Compute APIs -- +#define F77_sgemm_pack_get_size sgemm_pack_get_size_blis_impl +#define F77_dgemm_pack_get_size dgemm_pack_get_size_blis_impl +#define F77_sgemm_pack sgemm_pack_blis_impl +#define F77_dgemm_pack dgemm_pack_blis_impl +#define F77_sgemm_compute sgemm_compute_blis_impl +#define F77_dgemm_compute dgemm_compute_blis_impl + // (BLIS_ENABLE_NO_UNDERSCORE_API) ends #else /* @@ -389,6 +397,14 @@ #define F77_dgemm_batch dgemm_batch_ #define F77_cgemm_batch cgemm_batch_ #define F77_zgemm_batch zgemm_batch_ + +// -- Pack-Compute APIs -- +#define F77_sgemm_pack_get_size sgemm_pack_get_size_blis_impl +#define F77_dgemm_pack_get_size 
dgemm_pack_get_size_blis_impl +#define F77_sgemm_pack sgemm_pack_blis_impl +#define F77_dgemm_pack dgemm_pack_blis_impl +#define F77_sgemm_compute sgemm_compute_blis_impl +#define F77_dgemm_compute dgemm_compute_blis_impl #endif #endif /* CBLAS_F77_H */ diff --git a/frame/compat/cblas/src/cblas_sgemm_compute.c b/frame/compat/cblas/src/cblas_sgemm_compute.c new file mode 100644 index 0000000000..4a3902db1a --- /dev/null +++ b/frame/compat/cblas/src/cblas_sgemm_compute.c @@ -0,0 +1,171 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS + +#include "cblas.h" +#include "cblas_f77.h" + +BLIS_EXPORT_BLAS void cblas_sgemm_compute( enum CBLAS_ORDER Order, + f77_int TransA, + f77_int TransB, + const f77_int M, + const f77_int N, + const f77_int K, + const float* A, f77_int lda, + const float* B, f77_int ldb, + float beta, + float* C, f77_int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if( Order == CblasColMajor ) // CblasColMajor + { + if ( TransA == CblasTrans ) TA='T'; + else if ( TransA == CblasConjTrans ) TA='T'; + else if ( TransA == CblasNoTrans ) TA='N'; + else if ( TransA == CblasPacked ) TA='P'; + else + { + cblas_xerbla(2, "cblas_sgemm_compute", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( TransB == CblasTrans ) TB='T'; + else if ( TransB == CblasConjTrans ) TB='T'; + else if ( TransB == CblasNoTrans ) TB='N'; + else if ( TransB == CblasPacked ) TB='P'; + else + { + cblas_xerbla(3, "cblas_sgemm_compute", + "Illegal TransB setting, %d\n", 
TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + + F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda, + B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc); + } + else if ( Order == CblasRowMajor ) // CblasRowMajor + { + RowMajorStrg = 1; + + // If Row Major, and A is not already reordered + // then toggle the transA parameter and interchange the strides. + if ( TransA == CblasPacked ) TA='P'; + else if ( TransA == CblasTrans ) TA='N'; + else if ( TransA == CblasNoTrans ) TA='T'; + else if ( TransA == CblasConjTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_sgemm_compute", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + // If Row Major, and B is not already reordered + // then toggle the transB parameter and interchange the strides. + if ( TransB == CblasPacked ) TB='P'; + else if ( TransB == CblasTrans ) TB='N'; + else if ( TransB == CblasNoTrans ) TB='T'; + else if ( TransB == CblasConjTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_sgemm_compute", // TransB is argument 3, as in the column-major branch. + "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int cs_c = 1; + + F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda, + B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c); + } + else + { + cblas_xerbla(1, "cblas_sgemm_compute", + "Illegal Order setting, %d\n", Order); + } + // Reset the CBLAS error-reporting globals on every remaining exit path (success included). 
+ CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} +#endif diff --git a/frame/compat/cblas/src/cblas_sgemm_pack.c b/frame/compat/cblas/src/cblas_sgemm_pack.c new file mode 100644 index 0000000000..39a6e055fe --- /dev/null +++ b/frame/compat/cblas/src/cblas_sgemm_pack.c @@ -0,0 +1,157 @@ +/* + + 
BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS + +#include "cblas.h" +#include "cblas_f77.h" + +BLIS_EXPORT_BLAS void cblas_sgemm_pack( enum CBLAS_ORDER Order, + enum CBLAS_IDENTIFIER Identifier, + enum CBLAS_TRANSPOSE Trans, + const f77_int M, + const f77_int N, + const f77_int K, + const float alpha, + const float* src, const f77_int ld, + float* dest ) +{ + char TR; + char ID; + +#ifdef F77_CHAR + F77_CHAR F77_TR; + F77_CHAR F77_ID; +#else +#define F77_TR &TR +#define F77_ID &ID +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld; +#else + +#define F77_M M +#define F77_N N +#define F77_K K +#define F77_ld ld + +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + + if ( Order == CblasColMajor ) // CblasColMajor + { + if ( Trans == CblasNoTrans ) TR = 'N'; + else if ( Trans == CblasTrans ) TR = 'T'; + else if ( Trans == CblasConjTrans ) TR = 'T'; + else + { + cblas_xerbla(3, "cblas_sgemm_pack","Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Identifier == CblasAMatrix ) ID = 'A'; + else if ( Identifier == CblasBMatrix ) ID = 'B'; + else + { + cblas_xerbla(2, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier); // Identifier is argument 2 (Trans is 3). 
+ CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + +#ifdef F77_CHAR + F77_TR = C2F_CHAR(&TR); + F77_ID = C2F_CHAR(&ID); +#endif + F77_sgemm_pack( F77_ID, + F77_TR, + &F77_M, + &F77_N, + &F77_K, + &alpha, + src, &F77_ld, + dest ); + } + else if ( Order == CblasRowMajor ) // CblasRowMajor + { + RowMajorStrg = 1; + if ( Trans == CblasNoTrans ) TR = 'T'; + else if ( Trans == CblasTrans ) TR = 'N'; + else if ( Trans == CblasConjTrans ) TR = 'N'; + else + { + cblas_xerbla(3, "cblas_sgemm_pack","Invalid Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Identifier == CblasAMatrix ) ID = 'A'; + else if ( Identifier == CblasBMatrix ) ID = 'B'; + else + { + 
cblas_xerbla(2, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier); // Identifier is argument 2 (Trans is 3). + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + +#ifdef F77_CHAR + F77_TR = C2F_CHAR(&TR); + F77_ID = C2F_CHAR(&ID); +#endif + F77_sgemm_pack ( F77_ID, + F77_TR, + &F77_M, + &F77_N, + &F77_K, + &alpha, + src, &F77_ld, + dest ); + } + else cblas_xerbla(1, "cblas_sgemm_pack", "Invalid Order setting, %d\n", Order); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} +#endif \ No newline at end of file diff --git a/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c b/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c new file mode 100644 index 0000000000..bf82bb104b --- /dev/null +++ b/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c @@ -0,0 +1,83 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS + +#include "cblas.h" +#include "cblas_f77.h" + +f77_int cblas_sgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier, + const f77_int M, + const f77_int N, + const f77_int K ) +{ + AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 ); + + char ID; + f77_int tbytes = 0; + +#ifdef F77_CHAR + F77_CHAR F77_ID; +#else + #define F77_ID &ID +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K; +#else + #define F77_M M + #define F77_N N + #define F77_K K +#endif + + if ( Identifier == CblasAMatrix ) ID = 'A'; + else if ( Identifier == CblasBMatrix ) ID = 'B'; + else + { + cblas_xerbla( 1, "cblas_sgemm_pack_get_size", + "Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier ); + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 ); + return 0; + } + +#ifdef F77_CHAR + F77_ID = C2F_CHAR( &ID ); +#endif + tbytes = F77_sgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K ); + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 ); + return tbytes; +} +#endif \ No newline at end of file diff --git a/frame/compat/check/CMakeLists.txt b/frame/compat/check/CMakeLists.txt index 518e6ff133..e3519ecfb5 100644 --- a/frame/compat/check/CMakeLists.txt +++ b/frame/compat/check/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc.## +##Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. 
## target_sources("${PROJECT_NAME}" PRIVATE @@ -23,8 +23,5 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_trmv_check.h ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_check.h ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsv_check.h ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm3m_check.h +${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute_check.h ) - - - - diff --git a/frame/compat/check/bla_gemm_compute_check.h b/frame/compat/check/bla_gemm_compute_check.h new file mode 100644 index 0000000000..1e24168110 --- /dev/null +++ b/frame/compat/check/bla_gemm_compute_check.h @@ -0,0 +1,87 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#define bla_gemm_compute_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, rs_c, cs_c ) \ +{ \ + f77_int info = 0; \ + f77_int nota, notb; \ + f77_int conja, conjb; \ + f77_int ta, tb; \ + f77_int packa, packb; \ + f77_int nrowa, nrowb; \ +\ + nota = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \ + notb = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \ + conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \ + conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \ + ta = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \ + tb = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \ + packa = PASTE_LSAME( transa, "P", (ftnlen)1, (ftnlen)1 ); \ + packb = PASTE_LSAME( transb, "P", (ftnlen)1, (ftnlen)1 ); \ +\ + if ( nota || packa ) { nrowa = *m; } \ + else { nrowa = *k; } \ + if ( notb || packb ) { nrowb = *k; } \ + else { nrowb = *n; } \ +\ + if ( !nota && !conja && !ta && !packa ) \ + info = 1; \ + else if ( !notb && !conjb && !tb && !packb ) \ + info = 2; \ + else if ( *m < 0 ) \ + info = 3; \ + else if ( *n < 0 ) \ + info = 4; \ + else if ( *k < 0 ) \ + info = 5; \ + else if ( !packa && *lda < bli_max( 1, nrowa ) ) /* lda is ignored when A is packed. */ \ + info = 7; \ + else if ( !packb && *ldb < bli_max( 1, nrowb ) ) /* ldb is ignored when B is packed. 
*/ \ + info = 9; \ + else if ( ( *rs_c == 1 && *cs_c < bli_max( 1, *m ) ) || ( *cs_c == 1 && *rs_c < bli_max( 1, *n ) ) ) \ + info = 12; \ +\ + if ( info != 0 ) \ + { \ + char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ +\ + snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); /* bounded write: func_str is a fixed-size buffer */ \ +\ + bli_string_mkupper( func_str ); \ +\ + PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + return; \ + } \ +} \ No newline at end of file diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index e131acb4ac..7946be6c75 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -287,6 +287,8 @@ #define dgemm_batch_ dgemm_batch #define cgemm_batch_ cgemm_batch #define zgemm_batch_ zgemm_batch +#define sgemm_compute_ sgemm_compute +#define dgemm_compute_ dgemm_compute #define saxpby_ saxpby #define daxpby_ daxpby #define caxpby_ caxpby @@ -391,6 +393,7 @@ #define dgbmv DGBMV #define dgemm DGEMM #define dgemm_batch DGEMM_BATCH +#define dgemm_compute DGEMM_COMPUTE #define dgemmt DGEMMT #define dgemv DGEMV #define dger DGER @@ -464,6 +467,7 @@ #define sgbmv SGBMV #define sgemm SGEMM #define sgemm_batch SGEMM_BATCH +#define sgemm_compute SGEMM_COMPUTE #define sgemmt SGEMMT #define sgemv SGEMV #define sger SGER diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 823410e0aa..304dfb7816 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -470,7 +470,8 @@ typedef enum BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, - BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS + BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS, + BLIS_PACKED = BLIS_BITVAL_PACKED_UNSPEC } trans_t; typedef enum diff --git a/frame/thread/CMakeLists.txt b/frame/thread/CMakeLists.txt index 9e93e69b5a..71c9d6f9b0 100644 --- a/frame/thread/CMakeLists.txt +++ b/frame/thread/CMakeLists.txt @@ -2,6 +2,8 @@ 
target_sources("${PROJECT_NAME}" PRIVATE + 
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_openmp.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_single.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_openmp.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_pthreads.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_single.c diff --git a/frame/thread/bli_l3_compute_decor.h b/frame/thread/bli_l3_compute_decor.h new file mode 100644 index 0000000000..83ce718ecc --- /dev/null +++ b/frame/thread/bli_l3_compute_decor.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_COMPUTE_DECOR_H +#define BLIS_L3_COMPUTE_DECOR_H + +// Level-3 compute internal function type. +typedef err_t (*l3computeint_t) + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Level-3 compute thread decorator prototype. +err_t bli_l3_compute_thread_decorator + ( + l3computeint_t func, + opid_t family, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +#include "bli_l3_compute_decor_single.h" +#include "bli_l3_compute_decor_openmp.h" +// #include "bli_l3_compute_decor_pthreads.h" + +#endif \ No newline at end of file diff --git a/frame/thread/bli_l3_compute_decor_openmp.c b/frame/thread/bli_l3_compute_decor_openmp.c new file mode 100644 index 0000000000..4219e76c8e --- /dev/null +++ b/frame/thread/bli_l3_compute_decor_openmp.c @@ -0,0 +1,133 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// @note: Presently MT is not supported, so n_threads has been explicitly +// initialized to 1 during initialization. Thus, even if BLIS is built with OpenMP +// support, the compute APIs work as an ST implementation. + +#include "blis.h" + +#ifdef BLIS_ENABLE_OPENMP + +void* bli_l3_compute_thread_entry( void* data_void ) { return NULL; } + +err_t bli_l3_compute_thread_decorator + ( + l3computeint_t func, + opid_t family, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // Query the total number of threads from the rntm_t object. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // Check out an array_t from the small block allocator. 
This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_pba_rntm_set_pba( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + _Pragma( "omp parallel num_threads(n_threads)" ) + { + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); + + // Check for a somewhat obscure OpenMP thread-mismatch issue. + // NOTE: This calls the same function used for the conventional/large + // code path. + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. 
+ bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + a, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + } + + // We shouldn't free the global communicator here since it is expected to + // have been freed by the communicator's chief thread in + // bli_l3_sup_thrinfo_free(), called at the end of the parallel region above. + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. + bli_sba_checkin_array( array ); + + return BLIS_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/frame/thread/bli_l3_compute_decor_openmp.h b/frame/thread/bli_l3_compute_decor_openmp.h new file mode 100644 index 0000000000..14e80314f9 --- /dev/null +++ b/frame/thread/bli_l3_compute_decor_openmp.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_COMPUTE_DECOR_OPENMP_H +#define BLIS_L3_COMPUTE_DECOR_OPENMP_H + +// Definitions specific to situations when OpenMP multithreading is enabled. +#ifdef BLIS_ENABLE_OPENMP + +#endif + +#endif + diff --git a/frame/thread/bli_l3_compute_decor_single.c b/frame/thread/bli_l3_compute_decor_single.c new file mode 100644 index 0000000000..cadcd413cf --- /dev/null +++ b/frame/thread/bli_l3_compute_decor_single.c @@ -0,0 +1,87 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifndef BLIS_ENABLE_MULTITHREADING + +err_t bli_l3_compute_thread_decorator + ( + l3computeint_t func, + opid_t family, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + const dim_t n_threads = 1; + array_t* restrict array = bli_sba_checkout_array( n_threads ); + bli_sba_rntm_set_pool( 0, array, rntm ); + bli_pba_rntm_set_pba( rntm ); + + { + rntm_t* restrict rntm_p = rntm; + const dim_t tid = 0; + + // This optimization allows us to use one of the global thrinfo_t + // objects for single-threaded execution rather than grow one from + // scratch. The key is that bli_thrinfo_sup_grow(), which is called + // from within the variants, will immediately return if it detects + // that the thrinfo_t* passed into it is either + // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. 
+ thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; + + ( void )tid; + + func + ( + a, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + } + + bli_sba_checkin_array( array ); + + return BLIS_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/frame/thread/bli_l3_compute_decor_single.h b/frame/thread/bli_l3_compute_decor_single.h new file mode 100644 index 0000000000..7b5d6fee3c --- /dev/null +++ b/frame/thread/bli_l3_compute_decor_single.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_COMPUTE_DECOR_SINGLE_H +#define BLIS_L3_COMPUTE_DECOR_SINGLE_H + +// Definitions specific to situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +#endif + +#endif \ No newline at end of file diff --git a/frame/thread/bli_pack_full_decor_openmp.c b/frame/thread/bli_pack_full_decor_openmp.c index a6f94afbb6..5d5034d193 100644 --- a/frame/thread/bli_pack_full_decor_openmp.c +++ b/frame/thread/bli_pack_full_decor_openmp.c @@ -54,7 +54,11 @@ void bli_pack_full_thread_decorator /* Ensure n_threads is always greater than or equal to 1 */ /* Passing BLIS_IC_NT and BLIS_JC_NT for pack can lead to n_threads */ /* becoming negative. In that case, packing is done using 1 thread */ - n_threads = ( n_threads > 0 ) ? n_threads : 1; + // n_threads = ( n_threads > 0 ) ? n_threads : 1; + + // Explicitly setting n_threads = 1 to force packing with only a single + // thread. + n_threads = 1; _Pragma( "omp parallel num_threads(n_threads)" ) { diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index ea41dbeecf..0f67ab7cd0 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -60,6 +60,9 @@ // Include the pack full thread decorator and related definitions and prototypes // for the pack code path. #include "bli_pack_full_decor.h" +// Include the level-3 thread decorator and related definitions and prototypes +// for the compute code path. 
+#include "bli_l3_compute_decor.h" // Initialization-related prototypes. void bli_thread_init( void ); From 6d0444497f6283f6d5beee959f09c59f471df169 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 11 Oct 2023 05:59:28 -0400 Subject: [PATCH 165/226] Improvements to xerbla functionality The following improvements have been implemented: - Option to stop in xerbla on error. This is controlled by setting the environment variable BLIS_STOP_ON_ERROR=1 - Option to disable printing of error message from BLIS. This is controlled by setting the environment variable BLIS_PRINT_ON_ERROR=0 - Added a function to return the value of INFO passed to xerbla, assuming xerbla was not set to stop on error. Example call is info = bli_info_get_info_value(); The default behaviour remains to print but don't stop on error, i.e. the equivalent to export BLIS_PRINT_ON_ERROR=1 BLIS_STOP_ON_ERROR=0 Implementation details: - Values of the environment variables are stored and retrieved from global_rntm. - Info value is stored and retrieved from tl_rntm. It is set to 0 during initialization for all calls and updated by xerbla if an error has occurred. 
- Call to bli_init_auto before calling PASTEBLACHK macro (which calls xerbla) will reinitialize info_value to 0 via call to bli_thread_update_rntm_from_env AMD-Internal: [CPUPL-3520] Change-Id: I151f6de9b5a437c3a6e3fcf453d5b8fa9c579b9d --- frame/base/bli_info.c | 10 +++++++- frame/base/bli_info.h | 4 ++- frame/base/bli_rntm.c | 2 +- frame/base/bli_rntm.h | 35 ++++++++++++++++++++++++++- frame/compat/bla_gemv_amd.c | 30 +++++++++++++++++------ frame/compat/blis/thread/b77_thread.c | 11 +++++++++ frame/compat/blis/thread/b77_thread.h | 4 +++ frame/compat/f2c/bla_xerbla.c | 30 ++++++++++++++++++++--- frame/include/bli_type_defs.h | 6 +++++ frame/thread/bli_thread.c | 33 +++++++++++++++++++++++++ 10 files changed, 149 insertions(+), 16 deletions(-) diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 2e4c1347ce..6d901c7288 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -35,6 +35,9 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) 
+extern BLIS_THREAD_LOCAL rntm_t tl_rntm; // -- General library information ---------------------------------------------- @@ -157,6 +160,11 @@ gint_t bli_info_get_enable_sandbox( void ) #endif } +// -- Error code produced from within xerbla (if called), otherwise 0 +gint_t bli_info_get_info_value( void ) +{ + return tl_rntm.info_value; +} // -- Kernel implementation-related -------------------------------------------- diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index d900ca4f51..4cb5b13219 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -75,6 +75,8 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); +// -- Get value of info from within xerbla (if called), otherwise 0 returned +BLIS_EXPORT_BLIS gint_t bli_info_get_info_value( void ); // -- Kernel implementation-related -------------------------------------------- diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index e8c29b0cc5..ce9a38798d 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -37,7 +37,7 @@ // The global rntm_t structure, which holds the global thread settings // along with a few other key parameters. 
-rntm_t global_rntm; +rntm_t global_rntm = BLIS_RNTM_INITIALIZER; // Make thread settings local to each thread calling BLIS routines BLIS_THREAD_LOCAL rntm_t tl_rntm = BLIS_RNTM_INITIALIZER; diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 13b5765681..5df21f811e 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -225,6 +225,21 @@ BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) } #endif +BLIS_INLINE bool bli_rntm_stop_on_error( rntm_t* rntm ) +{ + return rntm->stop_on_error; +} + +BLIS_INLINE bool bli_rntm_print_on_error( rntm_t* rntm ) +{ + return rntm->print_on_error; +} + +BLIS_INLINE gint_t bli_rntm_info_value( rntm_t* rntm ) +{ + return rntm->info_value; +} + // // -- rntm_t modification (internal use only) ---------------------------------- // @@ -312,6 +327,21 @@ BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) bli_rntm_set_pba( NULL, rntm ); } +BLIS_INLINE void bli_rntm_set_stop_on_error_only( bool stop_on_error, rntm_t* rntm ) +{ + rntm->stop_on_error = stop_on_error; +} + +BLIS_INLINE void bli_rntm_set_print_on_error_only( bool print_on_error, rntm_t* rntm ) +{ + rntm->print_on_error = print_on_error; +} + +BLIS_INLINE void bli_rntm_set_info_value_only( gint_t info_value, rntm_t* rntm ) +{ + rntm->info_value = info_value; +} + // // -- rntm_t modification (public API) ----------------------------------------- // @@ -422,7 +452,10 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) .blis_mt = FALSE, \ .sba_pool = NULL, \ .pba = NULL, \ - } \ + .stop_on_error = FALSE, \ + .print_on_error = TRUE, \ + .info_value = 0, \ + } BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { diff --git a/frame/compat/bla_gemv_amd.c b/frame/compat/bla_gemv_amd.c index c2d743e80d..4077711e57 100644 --- a/frame/compat/bla_gemv_amd.c +++ b/frame/compat/bla_gemv_amd.c @@ -54,8 +54,6 @@ void PASTEF77S(ch,blasname) \ ftype* y, const f77_int* incy \ ) \ { \ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ - 
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); \ trans_t blis_transa; \ dim_t m0, n0; \ dim_t m_y, n_x; \ @@ -64,6 +62,9 @@ void PASTEF77S(ch,blasname) \ inc_t incx0; \ inc_t incy0; \ inc_t rs_a, cs_a; \ +\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -186,6 +187,9 @@ void dgemv_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + /* Initialize BLIS. */ + bli_init_auto(); + /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) ( @@ -386,6 +390,10 @@ void sgemv_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + + /* Initialize BLIS. */ + bli_init_auto(); + /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) ( @@ -570,9 +578,6 @@ void cgemv_blis_impl scomplex* y, const f77_int* incy ) { - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - trans_t blis_transa; dim_t m0, n0; dim_t m_y, n_x; @@ -582,6 +587,12 @@ void cgemv_blis_impl inc_t incy0; inc_t rs_a, cs_a; + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + + /* Initialize BLIS. */ + bli_init_auto(); + /* Perform BLAS parameter checking. 
*/ PASTEBLACHK(gemv) ( @@ -808,9 +819,6 @@ void zgemv_blis_impl dcomplex* y, const f77_int* incy ) { - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - trans_t blis_transa; dim_t m0, n0; dim_t m_y, n_x; @@ -820,6 +828,12 @@ void zgemv_blis_impl inc_t incy0; inc_t rs_a, cs_a; + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + + /* Initialize BLIS. */ + bli_init_auto(); + /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) ( diff --git a/frame/compat/blis/thread/b77_thread.c b/frame/compat/blis/thread/b77_thread.c index fa28b959ba..d8446ebf60 100644 --- a/frame/compat/blis/thread/b77_thread.c +++ b/frame/compat/blis/thread/b77_thread.c @@ -91,3 +91,14 @@ void PASTEF770(bli_thread_set_num_threads) //bli_finalize_auto(); } +f77_int PASTEF770(bli_info_get_info_value) + ( + ) +{ + // Call the BLIS function. + gint_t info_value = bli_info_get_info_value(); + f77_int f77_info_value = (f77_int) info_value; + + return f77_info_value; +} + diff --git a/frame/compat/blis/thread/b77_thread.h b/frame/compat/blis/thread/b77_thread.h index 922ed6e13e..e3106d14ec 100644 --- a/frame/compat/blis/thread/b77_thread.h +++ b/frame/compat/blis/thread/b77_thread.h @@ -51,3 +51,7 @@ BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) const f77_int* nt ); +BLIS_EXPORT_BLAS f77_int PASTEF770(bli_info_get_info_value) + ( + ); + diff --git a/frame/compat/f2c/bla_xerbla.c b/frame/compat/f2c/bla_xerbla.c index 62dd6b5edf..0e0ec59d34 100644 --- a/frame/compat/f2c/bla_xerbla.c +++ b/frame/compat/f2c/bla_xerbla.c @@ -35,6 +35,13 @@ #include "blis.h" +// The global rntm_t structure. (The definition resides in bli_rntm.c.) +extern rntm_t global_rntm; + +// Make thread settings local to each thread calling BLIS routines. 
+// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* xerbla.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -76,10 +83,25 @@ //for ( i = 0; i < srname_len; ++i ) // srname[i] = toupper( srname[i] ); - printf("** On entry to %6s, parameter number %2i had an illegal value\n", - srname, (int)*info); - - //bli_abort(); + // Make sure rntm variables are initialized. + bli_init_once(); + + // Store info value in thread-local rntm data structure. + gint_t info_value = (gint_t) *info; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + + bool print_on_error = bli_rntm_print_on_error( &global_rntm ); + if (print_on_error) + { + printf("** On entry to %6s, parameter number %2i had an illegal value\n", + srname, (int)*info); + } + + bool stop_on_error = bli_rntm_stop_on_error( &global_rntm ); + if (stop_on_error) + { + bli_abort(); + } /* End of XERBLA */ diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 304dfb7816..4eb9c098c8 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1534,6 +1534,12 @@ typedef struct rntm_s // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; + // Store values of environment variables to control BLIS version of xerbla + // and error code from xerbla + bool stop_on_error; + bool print_on_error; + gint_t info_value; + } rntm_t; diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 0e21ab0f6a..3333df89cf 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1853,6 +1853,35 @@ void bli_thread_init_rntm_from_env #endif // BLIS_ENABLE_MULTITHREADING + // Check environment for options to control xerbla + + // Default: Don't stop on error + gint_t bli_stop_on_error_int = bli_env_get_var( "BLIS_STOP_ON_ERROR", 0 ); + bool bli_stop_on_error; + + if ( bli_stop_on_error_int != 0 ) + { + bli_stop_on_error = TRUE; + } + else + { + bli_stop_on_error = FALSE; + } + bli_rntm_set_stop_on_error_only(bli_stop_on_error, rntm); + + // Default: print on error + gint_t bli_print_on_error_int = bli_env_get_var( "BLIS_PRINT_ON_ERROR", 1 ); + bool bli_print_on_error; + if (bli_print_on_error_int != 0 ) + { + bli_print_on_error = TRUE; + } + else + { + bli_print_on_error = FALSE; + } + bli_rntm_set_print_on_error_only(bli_print_on_error, rntm); + // Save the results back in the runtime object. bli_rntm_set_auto_factor_only( auto_factor, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); @@ -2025,6 +2054,10 @@ void bli_thread_update_rntm_from_env bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); bli_rntm_set_blis_mt_only( blis_mt, rntm ); + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, rntm ); + #ifdef PRINT_THREADING printf( "bli_thread_update_rntm_from_env(): tl_rntm\n" ); bli_rntm_print( rntm ); From c1612f683891c42ad086607120d8c634e40193bf Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Mon, 28 Aug 2023 14:09:10 +0530 Subject: [PATCH 166/226] Gtestsuite Framework and Unit Tests for Pack and Compute Extension APIs - Added framework for unit testing of BLAS and CBLAS interfaces for the Pack and Compute Extension APIs. 
- These test the integrated functionality of the trio of ?gemm_pack_get_size(), ?gemm_pack() and ?gemm_compute() APIs. - Note: Only MKL can be used as reference for now. AMD-Internal: [CPUPL-3560] Change-Id: I801654447a716da06c9ccf9db01d553817871571 --- .../inc/level3/ref_gemm_compute.h | 69 +++ .../src/level3/ref_gemm_compute.cpp | 200 ++++++++ .../testsuite/level3/gemm/dgemm_generic.cpp | 2 +- .../gemm_compute/dgemm_compute_generic.cpp | 187 +++++++ .../level3/gemm_compute/gemm_compute.h | 456 ++++++++++++++++++ .../gemm_compute/gemm_compute_IIT_ERS.cpp | 222 +++++++++ .../gemm_compute/sgemm_compute_generic.cpp | 189 ++++++++ .../level3/gemm_compute/test_gemm_compute.h | 79 +++ 8 files changed, 1403 insertions(+), 1 deletion(-) create mode 100644 gtestsuite/testinghelpers/inc/level3/ref_gemm_compute.h create mode 100644 gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp create mode 100644 gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp create mode 100644 gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h create mode 100644 gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp create mode 100644 gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp create mode 100644 gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h diff --git a/gtestsuite/testinghelpers/inc/level3/ref_gemm_compute.h b/gtestsuite/testinghelpers/inc/level3/ref_gemm_compute.h new file mode 100644 index 0000000000..283a2b06ec --- /dev/null +++ b/gtestsuite/testinghelpers/inc/level3/ref_gemm_compute.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#pragma once + +#include "common/testing_helpers.h" + +/* + * ========================================================================== + * GEMM Compute performs one of the matrix-matrix operations + * C := op( A )*op( B ) + beta*C, + * where op( A ) is one of + * op( A ) = alpha * A or op( A ) = alpha * A**T + * op( A ) = A or op( A ) = A**T + * op( B ) is one of + * op( B ) = alpha * B or op( B ) = alpha * B**T + * op( B ) = B or op( B ) = B**T + * alpha and beta are scalars, and A, B and C are matrices, with op( A ) + * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix, + * where either op( A ) or op( B ) or both may be reordered. + * ========================================================================== + */ + +namespace testinghelpers { + +template +void ref_gemm_compute ( + char storage, char trnsa, char trnsb, + char pcka, char pckb, + gtint_t m, gtint_t n, gtint_t k, + T alpha, + T* ap, gtint_t lda, + T* bp, gtint_t ldb, + T beta, + T* cp, gtint_t ldc +); + +} //end of namespace testinghelpers diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp new file mode 100644 index 0000000000..2b15ffea2b --- /dev/null +++ b/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp @@ -0,0 +1,200 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include +#include "level3/ref_gemm_compute.h" + +/* + * ========================================================================== + * GEMM Pack and Compute Extension performs the GEMM matrix-matrix operations + * by first packing/reordering A/B matrix and computing the GEMM operation + * on the packed buffer. + * + * Pack: + * Reorders the A or B matrix or both the matrices and scales them with + * alpha. + * + * Compute: + * C := A * B + beta*C, + * where, + * Either A or B or both A and B matrices are packed matrices. + * Alpha and beta are scalars, and A, B and C are matrices, with A + * an m by k matrix, B a k by n matrix and C an m by n matrix, + * where either A or B or both may be scaled by alpha and reordered. 
+ * ========================================================================== + */ + +namespace testinghelpers { + +template +void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb, gtint_t m, gtint_t n, gtint_t k, T alpha, + T* ap, gtint_t lda, T* bp, gtint_t ldb, T beta, T* cp, gtint_t ldc) +{ + T unit_alpha = 1.0; + enum CBLAS_ORDER cblas_order; + enum CBLAS_TRANSPOSE cblas_transa; + enum CBLAS_TRANSPOSE cblas_transb; + + char_to_cblas_order( storage, &cblas_order ); + char_to_cblas_trans( trnsa, &cblas_transa ); + char_to_cblas_trans( trnsb, &cblas_transb ); + + using scalar_t = std::conditional_t::is_complex, T&, T>; + + typedef gint_t (*Fptr_ref_cblas_gemm_pack_get_size)( const CBLAS_IDENTIFIER, + const f77_int, const f77_int, const f77_int ); + Fptr_ref_cblas_gemm_pack_get_size ref_cblas_gemm_pack_get_size; + + typedef void (*Fptr_ref_cblas_gemm_pack)( const CBLAS_ORDER, const CBLAS_IDENTIFIER, const CBLAS_TRANSPOSE, + const f77_int, const f77_int, const f77_int, const T, const T*, f77_int, + T*); + Fptr_ref_cblas_gemm_pack ref_cblas_gemm_pack; + + typedef void (*Fptr_ref_cblas_gemm_compute)( const CBLAS_ORDER, const f77_int, const f77_int, + const f77_int, const f77_int, const f77_int, const T*, f77_int, + const T*, f77_int, const scalar_t, T*, f77_int); + Fptr_ref_cblas_gemm_compute ref_cblas_gemm_compute; + + // Call C function + /* Check the typename T passed to this function template and call respective function.*/ + if (typeid(T) == typeid(float)) + { + ref_cblas_gemm_pack_get_size = (Fptr_ref_cblas_gemm_pack_get_size)refCBLASModule.loadSymbol("cblas_sgemm_pack_get_size"); + ref_cblas_gemm_pack = (Fptr_ref_cblas_gemm_pack)refCBLASModule.loadSymbol("cblas_sgemm_pack"); + ref_cblas_gemm_compute = (Fptr_ref_cblas_gemm_compute)refCBLASModule.loadSymbol("cblas_sgemm_compute"); + } + else if (typeid(T) == typeid(double)) + { + ref_cblas_gemm_pack_get_size = 
(Fptr_ref_cblas_gemm_pack_get_size)refCBLASModule.loadSymbol("cblas_dgemm_pack_get_size"); + ref_cblas_gemm_pack = (Fptr_ref_cblas_gemm_pack)refCBLASModule.loadSymbol("cblas_dgemm_pack"); + ref_cblas_gemm_compute = (Fptr_ref_cblas_gemm_compute)refCBLASModule.loadSymbol("cblas_dgemm_compute"); + } + else + { + throw std::runtime_error("Error in ref_gemm.cpp: Invalid typename is passed function template."); + } + if( !ref_cblas_gemm_compute ) { + throw std::runtime_error("Error in ref_gemm.cpp: Function pointer == 0 -- symbol not found."); + } + + err_t err = BLIS_SUCCESS; + + if ( ( pcka == 'P' || pcka == 'p' ) && ( pckb == 'P' || pckb == 'p' ) ) + { + // Reorder A + CBLAS_IDENTIFIER cblas_identifierA = CblasAMatrix; + CBLAS_STORAGE cblas_packed = CblasPacked; + gtint_t bufSizeA = ref_cblas_gemm_pack_get_size( cblas_identifierA, + m, + n, + k ); + + T* aBuffer = (T*) bli_malloc_user( bufSizeA, &err ); + + ref_cblas_gemm_pack( cblas_order, cblas_identifierA, cblas_transa, + m, n, k, alpha, ap, lda, aBuffer ); + + // Reorder B + CBLAS_IDENTIFIER cblas_identifierB = CblasBMatrix; + gtint_t bufSizeB = ref_cblas_gemm_pack_get_size( cblas_identifierB, + m, + n, + k ); + + T* bBuffer = (T*) bli_malloc_user( bufSizeB, &err ); + + ref_cblas_gemm_pack( cblas_order, cblas_identifierB, cblas_transb, + m, n, k, unit_alpha, bp, ldb, bBuffer ); + + ref_cblas_gemm_compute( cblas_order, cblas_packed, cblas_packed, + m, n, k, aBuffer, lda, bBuffer, ldb, beta, cp, ldc ); + + bli_free_user( aBuffer ); + bli_free_user( bBuffer ); + } + else if ( ( pcka == 'P' || pcka == 'p' ) ) + { + // Reorder A + CBLAS_IDENTIFIER cblas_identifier = CblasAMatrix; + CBLAS_STORAGE cblas_packed = CblasPacked; + gtint_t bufSizeA = ref_cblas_gemm_pack_get_size( cblas_identifier, + m, + n, + k ); + + T* aBuffer = (T*) bli_malloc_user( bufSizeA, &err ); + + ref_cblas_gemm_pack( cblas_order, cblas_identifier, cblas_transa, + m, n, k, alpha, ap, lda, aBuffer ); + + ref_cblas_gemm_compute( cblas_order, 
cblas_packed, cblas_transb, + m, n, k, aBuffer, lda, bp, ldb, beta, cp, ldc ); + + bli_free_user( aBuffer ); + } + else if ( ( pckb == 'P' || pckb == 'p' ) ) + { + // Reorder B + CBLAS_IDENTIFIER cblas_identifier = CblasBMatrix; + CBLAS_STORAGE cblas_packed = CblasPacked; + gtint_t bufSizeB = ref_cblas_gemm_pack_get_size( cblas_identifier, + m, + n, + k ); + + T* bBuffer = (T*) bli_malloc_user( bufSizeB, &err ); + + ref_cblas_gemm_pack( cblas_order, cblas_identifier, cblas_transb, + m, n, k, alpha, bp, ldb, bBuffer ); + + ref_cblas_gemm_compute( cblas_order, cblas_transa, cblas_packed, + m, n, k, ap, lda, bBuffer, ldb, beta, cp, ldc ); + + bli_free_user( bBuffer ); + } + else + { + ref_cblas_gemm_compute( cblas_order, cblas_transa, cblas_transb, + m, n, k, ap, lda, bp, ldb, beta, cp, ldc ); + } +} + +// Explicit template instantiations +template void ref_gemm_compute(char, char, char, char, char, gtint_t, gtint_t, gtint_t, float, + float*, gtint_t, float*, gtint_t, float, float*, gtint_t ); +template void ref_gemm_compute(char, char, char, char, char, gtint_t, gtint_t, gtint_t, double, + double*, gtint_t, double*, gtint_t, double, double*, gtint_t ); + +} //end of namespace testinghelpers diff --git a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp index b74f63aea2..8d07668cc4 100644 --- a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp @@ -79,7 +79,7 @@ TEST_P(DGemmTest, RandomData) gtint_t ldc_inc = std::get<10>(GetParam()); // Set the threshold for the errors: - double thresh = 10*m*n*k*testinghelpers::getEpsilon(); + double thresh = 10*m*n*testinghelpers::getEpsilon(); //---------------------------------------------------------- // Call test body using these parameters diff --git a/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp b/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp new file mode 100644 index 
0000000000..82b89b7191 --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp @@ -0,0 +1,187 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include +#include "test_gemm_compute.h" + +class DGemmComputeTest : + public ::testing::TestWithParam> {}; + +TEST_P(DGemmComputeTest, RandomData) +{ + using T = double; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // matrix storage format(row major, column major) + char storage = std::get<0>(GetParam()); + // denotes whether matrix a is n,c,t + char transa = std::get<1>(GetParam()); + // denotes whether matrix b is n,c,t + char transb = std::get<2>(GetParam()); + // denotes whether matrix a is packed (p) or unpacked (u) + char packa = std::get<3>(GetParam()); + // denotes whether matrix b is packed (p) or unpacked (u) + char packb = std::get<4>(GetParam()); + // matrix size m + gtint_t m = std::get<5>(GetParam()); + // matrix size n + gtint_t n = std::get<6>(GetParam()); + // matrix size k + gtint_t k = std::get<7>(GetParam()); + // specifies alpha value + T alpha = std::get<8>(GetParam()); + // specifies beta value + T beta = std::get<9>(GetParam()); + // lda, ldb, ldc increments. + // If increments are zero, then the array size matches the matrix size. + // If increments are positive, the array size is bigger than the matrix size.
+ gtint_t lda_inc = std::get<10>(GetParam()); + gtint_t ldb_inc = std::get<11>(GetParam()); + gtint_t ldc_inc = std::get<12>(GetParam()); + + // Set the threshold for the errors: + double intermediate = (double)m*n*k; + double thresh = 10*intermediate*testinghelpers::getEpsilon(); + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_gemm_compute( storage, transa, transb, packa, packb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); +} + +class DGemmComputeTestPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + char sfm = std::get<0>(str.param); + char tsa = std::get<1>(str.param); + char tsb = std::get<2>(str.param); + char pka = std::get<3>(str.param); + char pkb = std::get<4>(str.param); + gtint_t m = std::get<5>(str.param); + gtint_t n = std::get<6>(str.param); + gtint_t k = std::get<7>(str.param); + double alpha = std::get<8>(str.param); + double beta = std::get<9>(str.param); + gtint_t lda_inc = std::get<10>(str.param); + gtint_t ldb_inc = std::get<11>(str.param); + gtint_t ldc_inc = std::get<12>(str.param); +#ifdef TEST_BLAS + std::string str_name = "dgemm_compute_"; +#elif TEST_CBLAS + std::string str_name = "cblas_dgemm_compute"; +#else //#elif TEST_BLIS_TYPED + // BLIS interface not yet implemented for pack and compute APIs. + std::string str_name = "blis_dgemm_compute"; +#endif + str_name = str_name + "_" + sfm+sfm+sfm; + str_name = str_name + "_" + tsa + tsb; + str_name = str_name + "_" + pka + pkb; + str_name = str_name + "_" + std::to_string(m); + str_name = str_name + "_" + std::to_string(n); + str_name = str_name + "_" + std::to_string(k); + std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); + str_name = str_name + "_a" + alpha_str; + std::string beta_str = ( beta > 0) ? 
std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); + str_name = str_name + "_b" + beta_str; + str_name = str_name + "_" + std::to_string(lda_inc); + str_name = str_name + "_" + std::to_string(ldb_inc); + str_name = str_name + "_" + std::to_string(ldc_inc); + return str_name; + } +}; + +// Black box testing. +INSTANTIATE_TEST_SUITE_P( + Blackbox, + DGemmComputeTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n', 't', 'c'), // transa + ::testing::Values('n', 't', 'c'), // transb + ::testing::Values('u', 'p'), // packa + ::testing::Values('u', 'p'), // packb + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m + ::testing::Range(gtint_t(10), gtint_t(31), 10), // n + ::testing::Range(gtint_t(10), gtint_t(31), 10), // k + ::testing::Values(0.0, 1.0, -1.2, 2.1), // alpha + ::testing::Values(0.0, 1.0, -1.2, 2.1), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::DGemmComputeTestPrint() + ); + +INSTANTIATE_TEST_SUITE_P( + DimensionsGtBlocksizes, // Dimensions > SUP Blocksizes + DGemmComputeTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Values('u', 'p'), // packa + ::testing::Values('u', 'p'), // packb + ::testing::Values(71, 73), // m (MC - 1, MC + 1) + ::testing::Values(4079, 4081), // n (NC - 1, NC + 1) + ::testing::Values(255, 257), // k (KC - 1, KC + 1) + ::testing::Values(1.0), // alpha + ::testing::Values(1.0), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + 
::DGemmComputeTestPrint() + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h new file mode 100644 index 0000000000..b57691dfe3 --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h @@ -0,0 +1,456 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#pragma once + +#include "blis.h" +#include "common/testing_helpers.h" + +/** + * @brief Performs the operation: + * C := op( A )*op( B ) + beta*C, + * where op( A ) is one of + * op( A ) = alpha * A or op( A ) = alpha * A**T + * op( A ) = A or op( A ) = A**T + * op( B ) is one of + * op( B ) = alpha * B or op( B ) = alpha * B**T + * op( B ) = B or op( B ) = B**T + * @param[in] transa specifies the form of op( A ) to be used in + the matrix multiplication. + * @param[in] transb specifies the form of op( B ) to be used in + the matrix multiplication. + * @param[in] packa specifies whether to reorder (pack) op( A ); 'P' or 'p' requests packing. + * @param[in] packb specifies whether to reorder (pack) op( B ); 'P' or 'p' requests packing. + * @param[in] m specifies the number of rows of the matrix + op( A ) and of the matrix C. + * @param[in] n specifies the number of columns of the matrix + op( B ) and the number of columns of the matrix C. + * @param[in] k specifies the number of columns of the matrix + op( A ) and the number of rows of the matrix op( B ). + * @param[in] alpha specifies pointer to the scalar alpha; it is applied while packing (to A when only A is packed, otherwise to B) and is not used when neither matrix is packed. + * @param[in] ap specifies pointer to the first element of the A matrix buffer. + * @param[in] lda specifies the leading dimension of ap. + * @param[in] bp specifies pointer to the first element of the B matrix buffer. + * @param[in] ldb specifies the leading dimension of bp. + * @param[in] beta specifies pointer to the scalar beta. + * @param[in,out] cp specifies pointer to the first element of the C matrix buffer. + * @param[in] ldc specifies the leading dimension of cp. 
+ */ + +template +static void gemm_compute_(char transa, char transb, char packa, char packb, gtint_t m, gtint_t n, gtint_t k, T* alpha, + T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc ) +{ + T unit_alpha = 1.0; + err_t err = BLIS_SUCCESS; + if constexpr (std::is_same::value) + { + if ( ( packa == 'P' || packa == 'p' ) && ( packb == 'P' || packb == 'p' ) ) + { + // Reorder A + char identifierA = 'A'; + gtint_t bufSizeA = sgemm_pack_get_size_( &identifierA, + &m, + &n, + &k ); + + float* aBuffer = (float*) bli_malloc_user( bufSizeA, &err ); + sgemm_pack_( &identifierA, + &transa, + &m, + &n, + &k, + &unit_alpha, + ap, + &lda, + aBuffer ); + + // Reorder B + char identifierB = 'B'; + gtint_t bufSizeB = sgemm_pack_get_size_( &identifierB, + &m, + &n, + &k ); + + float* bBuffer = (float*) bli_malloc_user( bufSizeB, &err ); + sgemm_pack_( &identifierB, + &transb, + &m, + &n, + &k, + alpha, + bp, + &ldb, + bBuffer ); + + sgemm_compute_( &packa, &packb, &m, &n, &k, aBuffer, &lda, bBuffer, &ldb, beta, cp, &ldc ); + + bli_free_user( aBuffer ); + bli_free_user( bBuffer ); + } + else if ( ( packa == 'P' || packa == 'p' ) ) + { + // Reorder A + char identifierA = 'A'; + gtint_t bufSizeA = sgemm_pack_get_size_( &identifierA, + &m, + &n, + &k ); + + float* aBuffer = (float*) bli_malloc_user( bufSizeA, &err ); + sgemm_pack_( &identifierA, + &transa, + &m, + &n, + &k, + alpha, + ap, + &lda, + aBuffer ); + + sgemm_compute_( &packa, &transb, &m, &n, &k, aBuffer, &lda, bp, &ldb, beta, cp, &ldc ); + bli_free_user( aBuffer ); + } + else if ( ( packb == 'P' || packb == 'p' ) ) + { + // Reorder B + char identifierB = 'B'; + gtint_t bufSizeB = sgemm_pack_get_size_( &identifierB, + &m, + &n, + &k ); + + float* bBuffer = (float*) bli_malloc_user( bufSizeB, &err ); + sgemm_pack_( &identifierB, + &transb, + &m, + &n, + &k, + alpha, + bp, + &ldb, + bBuffer ); + + sgemm_compute_( &transa, &packb, &m, &n, &k, ap, &lda, bBuffer, &ldb, beta, cp, &ldc ); + bli_free_user( 
bBuffer ); + } + else + { + sgemm_compute_( &transa, &transb, &m, &n, &k, ap, &lda, bp, &ldb, beta, cp, &ldc ); + } + } + else if constexpr (std::is_same::value) + { + if ( ( packa == 'P' || packa == 'p' ) && ( packb == 'P' || packb == 'p' ) ) + { + // Reorder A + char identifierA = 'A'; + gtint_t bufSizeA = dgemm_pack_get_size_( &identifierA, + &m, + &n, + &k ); + + double* aBuffer = (double*) bli_malloc_user( bufSizeA, &err ); + dgemm_pack_( &identifierA, + &transa, + &m, + &n, + &k, + &unit_alpha, + ap, + &lda, + aBuffer ); + + // Reorder B + char identifierB = 'B'; + gtint_t bufSizeB = dgemm_pack_get_size_( &identifierB, + &m, + &n, + &k ); + + double* bBuffer = (double*) bli_malloc_user( bufSizeB, &err ); + dgemm_pack_( &identifierB, + &transb, + &m, + &n, + &k, + alpha, + bp, + &ldb, + bBuffer ); + + dgemm_compute_( &packa, &packb, &m, &n, &k, aBuffer, &lda, bBuffer, &ldb, beta, cp, &ldc ); + bli_free_user( aBuffer ); + bli_free_user( bBuffer ); + } + else if ( ( packa == 'P' || packa == 'p' ) ) + { + // Reorder A + char identifierA = 'A'; + gtint_t bufSizeA = dgemm_pack_get_size_( &identifierA, + &m, + &n, + &k ); + + double* aBuffer = (double*) bli_malloc_user( bufSizeA, &err ); + dgemm_pack_( &identifierA, + &transa, + &m, + &n, + &k, + alpha, + ap, + &lda, + aBuffer ); + + dgemm_compute_( &packa, &transb, &m, &n, &k, aBuffer, &lda, bp, &ldb, beta, cp, &ldc ); + bli_free_user( aBuffer ); + } + else if ( ( packb == 'P' || packb == 'p' ) ) + { + // Reorder B + char identifierB = 'B'; + gtint_t bufSizeB = dgemm_pack_get_size_( &identifierB, + &m, + &n, + &k ); + + double* bBuffer = (double*) bli_malloc_user( bufSizeB, &err ); + dgemm_pack_( &identifierB, + &transb, + &m, + &n, + &k, + alpha, + bp, + &ldb, + bBuffer ); + + dgemm_compute_( &transa, &packb, &m, &n, &k, ap, &lda, bBuffer, &ldb, beta, cp, &ldc ); + bli_free_user( bBuffer ); + } + else + { + dgemm_compute_( &transa, &transb, &m, &n, &k, ap, &lda, bp, &ldb, beta, cp, &ldc ); + } + } + else + throw 
std::runtime_error("Error in testsuite/level3/gemm.h: Invalid typename in gemm_compute_()."); +} + +template +static void cblas_gemm_compute(char storage, char transa, char transb, char pcka, char pckb, + gtint_t m, gtint_t n, gtint_t k, T* alpha, T* ap, gtint_t lda, + T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc) +{ + enum CBLAS_ORDER cblas_order; + enum CBLAS_TRANSPOSE cblas_transa; + enum CBLAS_TRANSPOSE cblas_transb; + + testinghelpers::char_to_cblas_order( storage, &cblas_order ); + testinghelpers::char_to_cblas_trans( transa, &cblas_transa ); + testinghelpers::char_to_cblas_trans( transb, &cblas_transb ); + + T unit_alpha = 1.0; + CBLAS_IDENTIFIER cblas_identifierA = CblasAMatrix; + CBLAS_IDENTIFIER cblas_identifierB = CblasBMatrix; + CBLAS_STORAGE cblas_packed = CblasPacked; + + err_t err = BLIS_SUCCESS; + + if constexpr (std::is_same::value) + { + if ( ( pcka == 'p' || pcka == 'P' ) && ( pckb == 'p' || pckb == 'P' ) ) + { + gtint_t bufSizeA = cblas_sgemm_pack_get_size( cblas_identifierA, + m, + n, + k ); + + T* aBuffer = (T*) bli_malloc_user( bufSizeA, &err ); + + cblas_sgemm_pack( cblas_order, cblas_identifierA, cblas_transa, + m, n, k, *alpha, ap, lda, aBuffer ); + + gtint_t bufSizeB = cblas_sgemm_pack_get_size( cblas_identifierB, + m, + n, + k ); + + T* bBuffer = (T*) bli_malloc_user( bufSizeB, &err ); + + cblas_sgemm_pack( cblas_order, cblas_identifierB, cblas_transb, + m, n, k, unit_alpha, bp, ldb, bBuffer ); + + cblas_sgemm_compute( cblas_order, cblas_packed, cblas_packed, + m, n, k, aBuffer, lda, bBuffer, ldb, *beta, cp, ldc ); + + bli_free_user( aBuffer ); + bli_free_user( bBuffer ); + } + else if ( pcka == 'p' || pcka == 'P' ) + { + gtint_t bufSizeA = cblas_sgemm_pack_get_size( cblas_identifierA, + m, + n, + k ); + + T* aBuffer = (T*) bli_malloc_user( bufSizeA, &err ); + + cblas_sgemm_pack( cblas_order, cblas_identifierA, cblas_transa, + m, n, k, *alpha, ap, lda, aBuffer ); + + + cblas_sgemm_compute( cblas_order, cblas_packed, cblas_transb, + m, 
n, k, aBuffer, lda, bp, ldb, *beta, cp, ldc ); + + bli_free_user( aBuffer ); + } + else if ( pckb == 'p' || pckb == 'P' ) + { + gtint_t bufSizeB = cblas_sgemm_pack_get_size( cblas_identifierB, + m, + n, + k ); + + T* bBuffer = (T*) bli_malloc_user( bufSizeB, &err ); + + cblas_sgemm_pack( cblas_order, cblas_identifierB, cblas_transb, + m, n, k, *alpha, bp, ldb, bBuffer ); + + cblas_sgemm_compute( cblas_order, cblas_transa, cblas_packed, + m, n, k, ap, lda, bBuffer, ldb, *beta, cp, ldc ); + + bli_free_user( bBuffer ); + } + else + { + cblas_sgemm_compute( cblas_order, cblas_transa, cblas_transb, + m, n, k, ap, lda, bp, ldb, *beta, cp, ldc ); + } + } + else if constexpr (std::is_same::value) + { + if ( ( pcka == 'p' || pcka == 'P' ) && ( pckb == 'p' || pckb == 'P' ) ) + { + gtint_t bufSizeA = cblas_dgemm_pack_get_size( cblas_identifierA, + m, + n, + k ); + + T* aBuffer = (T*) bli_malloc_user( bufSizeA, &err ); + + cblas_dgemm_pack( cblas_order, cblas_identifierA, cblas_transa, + m, n, k, *alpha, ap, lda, aBuffer ); + + gtint_t bufSizeB = cblas_dgemm_pack_get_size( cblas_identifierB, + m, + n, + k ); + + T* bBuffer = (T*) bli_malloc_user( bufSizeB, &err ); + + cblas_dgemm_pack( cblas_order, cblas_identifierB, cblas_transb, + m, n, k, unit_alpha, bp, ldb, bBuffer ); + + cblas_dgemm_compute( cblas_order, cblas_packed, cblas_packed, + m, n, k, aBuffer, lda, bBuffer, ldb, *beta, cp, ldc ); + + bli_free_user( aBuffer ); + bli_free_user( bBuffer ); + } + else if ( pcka == 'p' || pcka == 'P' ) + { + gtint_t bufSizeA = cblas_dgemm_pack_get_size( cblas_identifierA, + m, + n, + k ); + + T* aBuffer = (T*) bli_malloc_user( bufSizeA, &err ); + + cblas_dgemm_pack( cblas_order, cblas_identifierA, cblas_transa, + m, n, k, *alpha, ap, lda, aBuffer ); + + + cblas_dgemm_compute( cblas_order, cblas_packed, cblas_transb, + m, n, k, aBuffer, lda, bp, ldb, *beta, cp, ldc ); + + bli_free_user( aBuffer ); + } + else if ( pckb == 'p' || pckb == 'P' ) + { + gtint_t bufSizeB = 
cblas_dgemm_pack_get_size( cblas_identifierB, + m, + n, + k ); + + T* bBuffer = (T*) bli_malloc_user( bufSizeB, &err ); + + cblas_dgemm_pack( cblas_order, cblas_identifierB, cblas_transb, + m, n, k, *alpha, bp, ldb, bBuffer ); + + cblas_dgemm_compute( cblas_order, cblas_transa, cblas_packed, + m, n, k, ap, lda, bBuffer, ldb, *beta, cp, ldc ); + + bli_free_user( bBuffer ); + } + else + { + cblas_dgemm_compute( cblas_order, cblas_transa, cblas_transb, + m, n, k, ap, lda, bp, ldb, *beta, cp, ldc ); + } + } + else + { + throw std::runtime_error("Error in testsuite/level3/gemm_compute.h: Invalid typename in cblas_gemm_compute()."); + } +} + +template +static void gemm_compute( char storage, char transa, char transb, char packa, char packb, gtint_t m, gtint_t n, gtint_t k, T* alpha, + T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc ) +{ +#ifdef TEST_BLAS + if( storage == 'c' || storage == 'C' ) + gemm_compute_( transa, transb, packa, packb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc ); + else + throw std::runtime_error("Error in testsuite/level3/gemm_compute.h: BLAS interface cannot be tested for row-major order."); + +#elif TEST_CBLAS + cblas_gemm_compute( storage, transa, transb, packa, packb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc ); +#elif TEST_BLIS_TYPED + throw std::runtime_error("Error in testsuite/level3/gemm_compute.h: BLIS interfaces not yet implemented for pack and compute BLAS extensions."); +#else + throw std::runtime_error("Error in testsuite/level3/gemm_compute.h: No interfaces are set to be tested."); +#endif +} \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp new file mode 100644 index 0000000000..c70a048bca --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp @@ -0,0 +1,222 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + 
libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include +#include "test_gemm_compute.h" +#include "common/wrong_inputs_helpers.h" +#include "common/testing_helpers.h" +#include "inc/check_error.h" + +template +class GEMM_Compute_IIT_ERS_Test : public ::testing::Test {}; +typedef ::testing::Types TypeParam; +TYPED_TEST_SUITE(GEMM_Compute_IIT_ERS_Test, TypeParam); + +using namespace testinghelpers::IIT; + +#ifdef TEST_BLAS + +/* + Incorrect Input Testing(IIT) + + BLAS exceptions get triggered in the following cases(for GEMM Compute): + 1. When TRANSA != 'N' && TRANSA != 'T' && TRANSA != 'C' && TRANSA != 'P' (info = 1) + 2. When TRANSB != 'N' && TRANSB != 'T' && TRANSB != 'C' && TRANSB != 'P' (info = 2) + 3. When m < 0 (info = 3) + 4. When n < 0 (info = 4) + 5. When k < 0 (info = 5) + 6. When lda < max(1, thresh) (info = 7), thresh set based on TRANSA value + 7. When ldb < max(1, thresh) (info = 9), thresh set based on TRANSB value + 8. When ldc < max(1, n) (info = 12) +*/ + +// When info == 1 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_transa) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid TRANS value for A. + gemm_compute( STORAGE, 'x', TRANS, 'U', 'U', M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 2 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_transb) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid TRANS value for B. 
+ gemm_compute( STORAGE, TRANS, 'x', 'U', 'U', M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 3 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, m_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid (negative) value for m. + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', -1, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 4 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, n_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid (negative) value for n. + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, -1, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 5 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, k_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid (negative) value for k. 
+ gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, N, -1, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 7 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_lda) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid value for lda (LDA - 1). + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, N, K, nullptr, nullptr, LDA - 1, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 9 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_ldb) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid value for ldb (LDB - 1). + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, N, K, nullptr, nullptr, LDA, nullptr, LDB - 1, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When info == 12 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_ldc) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with an invalid value for ldc (LDC - 1). 
+ gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC - 1 ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +/* + Early Return Scenarios(ERS) : + + The GEMM Compute API is expected to return early in the following cases: + + 1. When m == 0. + 2. When n == 0. +*/ + +// When m = 0 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, m_eq_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with m == 0 (early-return case). + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', 0, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + +// When n = 0 +TYPED_TEST(GEMM_Compute_IIT_ERS_Test, n_eq_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call gemm_compute with n == 0 (early-return case). + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, 0, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, LDC ); + // Use bitwise comparison (no threshold). 
+ computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} +#endif \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp b/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp new file mode 100644 index 0000000000..e261f65835 --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp @@ -0,0 +1,189 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "test_gemm_compute.h" + +class SGemmComputeTest : + public ::testing::TestWithParam> {}; + +TEST_P(SGemmComputeTest, RandomData) +{ +// printf("SGemmCompute_test!!\n"); + using T = float; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // matrix storage format(row major, column major) + char storage = std::get<0>(GetParam()); + // denotes whether matrix a is n,c,t,h + char transa = std::get<1>(GetParam()); + // denotes whether matrix b is n,c,t,h + char transb = std::get<2>(GetParam()); + // denotes whether matrix a is packed (p) or unpacked (u) + char packa = std::get<3>(GetParam()); + // denotes whether matrix b is packed (p) or unpacked (u) + char packb = std::get<4>(GetParam()); + // matrix size m + gtint_t m = std::get<5>(GetParam()); + // matrix size n + gtint_t n = std::get<6>(GetParam()); + // matrix size k + gtint_t k = std::get<7>(GetParam()); + // specifies alpha value + T alpha = std::get<8>(GetParam()); + // specifies beta value + T beta = std::get<9>(GetParam()); + // lda, ldb, ldc increments. + // If increments are zero, then the array size matches the matrix size. + // If increments are positive, the array size is bigger than the matrix size. 
+ gtint_t lda_inc = std::get<10>(GetParam()); + gtint_t ldb_inc = std::get<11>(GetParam()); + gtint_t ldc_inc = std::get<12>(GetParam()); + + // Set the threshold for the errors: + float intermediate = (float)m*n*k; + float thresh = 10*intermediate*testinghelpers::getEpsilon(); + + //---------------------------------------------------------- + // Call test body using these parameters + //---------------------------------------------------------- + test_gemm_compute( storage, transa, transb, packa, packb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh ); +} + +class SGemmComputeTestPrint { +public: + std::string operator()( + testing::TestParamInfo> str) const { + char sfm = std::get<0>(str.param); + char tsa = std::get<1>(str.param); + char tsb = std::get<2>(str.param); + char pka = std::get<3>(str.param); + char pkb = std::get<4>(str.param); + gtint_t m = std::get<5>(str.param); + gtint_t n = std::get<6>(str.param); + gtint_t k = std::get<7>(str.param); + float alpha = std::get<8>(str.param); + float beta = std::get<9>(str.param); + gtint_t lda_inc = std::get<10>(str.param); + gtint_t ldb_inc = std::get<11>(str.param); + gtint_t ldc_inc = std::get<12>(str.param); +#ifdef TEST_BLAS + std::string str_name = "sgemm_compute_"; +#elif TEST_CBLAS + std::string str_name = "cblas_sgemm_compute"; +#else //#elif TEST_BLIS_TYPED + // BLIS interface not yet implemented for pack and compute APIs. + std::string str_name = "blis_sgemm_compute"; +#endif + str_name = str_name + "_" + sfm+sfm+sfm; + str_name = str_name + "_" + tsa + tsb; + str_name = str_name + "_" + pka + pkb; + str_name = str_name + "_" + std::to_string(m); + str_name = str_name + "_" + std::to_string(n); + str_name = str_name + "_" + std::to_string(k); + std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); + str_name = str_name + "_a" + alpha_str; + std::string beta_str = ( beta > 0) ? 
std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); + str_name = str_name + "_b" + beta_str; + str_name = str_name + "_" + std::to_string(lda_inc); + str_name = str_name + "_" + std::to_string(ldb_inc); + str_name = str_name + "_" + std::to_string(ldc_inc); + return str_name; + } +}; + +// Black box testing. +INSTANTIATE_TEST_SUITE_P( + Blackbox, + SGemmComputeTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n', 't', 'c'), // transa + ::testing::Values('n', 't', 'c'), // transb + ::testing::Values('u', 'p'), // packa + ::testing::Values('u', 'p'), // packb + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m + ::testing::Range(gtint_t(10), gtint_t(31), 10), // n + ::testing::Range(gtint_t(10), gtint_t(31), 10), // k + ::testing::Values(0.0, 1.0, -1.2, 2.1), // alpha + ::testing::Values(0.0, 1.0, -1.2, 2.1), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::SGemmComputeTestPrint() + ); + +INSTANTIATE_TEST_SUITE_P( + DimensionsGtBlocksizes, // Dimensions > SUP Blocksizes + SGemmComputeTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Values('u', 'p'), // packa + ::testing::Values('u', 'p'), // packb + ::testing::Values(143, 145), // m (MC - 1, MC + 1) + ::testing::Values(8159, 8161), // n (NC - 1, NC + 1) + ::testing::Values(511, 513), // k (KC - 1, KC + 1) + ::testing::Values(1.0), // alpha + ::testing::Values(1.0), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + 
::SGemmComputeTestPrint() + ); \ No newline at end of file diff --git a/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h b/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h new file mode 100644 index 0000000000..7d1016941b --- /dev/null +++ b/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#pragma once + +#include "gemm_compute.h" +#include "level3/ref_gemm_compute.h" +#include "inc/check_error.h" +#include +#include + +template +void test_gemm_compute( char storage, char trnsa, char trnsb, char pcka, char pckb, + gtint_t m, gtint_t n, gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc, + T alpha, T beta, double thresh ) +{ + // Compute the leading dimensions of a, b, and c. + gtint_t lda = testinghelpers::get_leading_dimension( storage, trnsa, m, k, lda_inc ); + gtint_t ldb = testinghelpers::get_leading_dimension( storage, trnsb, k, n, ldb_inc ); + gtint_t ldc = testinghelpers::get_leading_dimension( storage, 'n', m, n, ldc_inc ); + + //---------------------------------------------------------- + // Initialize matrices with random numbers + //---------------------------------------------------------- + std::vector a = testinghelpers::get_random_matrix( -2, 8, storage, trnsa, m, k, lda ); + std::vector b = testinghelpers::get_random_matrix( -5, 2, storage, trnsb, k, n, ldb ); + std::vector c = testinghelpers::get_random_matrix( -3, 5, storage, 'n', m, n, ldc ); + + // Create a copy of c so that we can check reference results. + std::vector c_ref(c); + + //---------------------------------------------------------- + // Call BLIS function + //---------------------------------------------------------- + gemm_compute( storage, trnsa, trnsb, pcka, pckb, m, n, k, &alpha, a.data(), lda, + b.data(), ldb, &beta, c.data(), ldc ); + + //---------------------------------------------------------- + // Call reference implementation. + //---------------------------------------------------------- + testinghelpers::ref_gemm_compute( storage, trnsa, trnsb, pcka, pckb, m, n, k, alpha, + a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc ); + + //---------------------------------------------------------- + // Check component-wise error. 
+ //---------------------------------------------------------- + computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); +} \ No newline at end of file From 26d1ab5ebcec95fa97f8e888d042edadc5eac1de Mon Sep 17 00:00:00 2001 From: mkadavil Date: Mon, 23 Oct 2023 19:19:20 +0530 Subject: [PATCH 167/226] 8s8s<16|32>os8 memory allocation fix to circumvent scaling issue. -When the bli_pba_acquire_m api is used for packbuf type BLIS_BUFFER_FOR_<A_BLOCK|B_PANEL|C_PANEL>, the memory is allocated by checking out a block from an internal memory pool. In order to ensure thread safety, the memory pool checkout is protected using a mutex (bli_pba_lock/bli_pba_unlock). When the number of threads trying to check out memory (in parallel) is high, these locks tend to become a scaling bottleneck, especially when the memory is to be used for non-packing purposes (packing could hide some of this cost). LPGEMM uses bli_pba_acquire_m with BLIS_BUFFER_FOR_C_PANEL to check out memory when downscale is enabled for temporary C accumulation. This multi-threaded lock overhead becomes prominent when m/n dimensions are relatively small, even when k is large. In order to address this, bli_pba_acquire_m is used with BLIS_BUFFER_FOR_GEN_USE for LPGEMM. For *GEN_USE, the memory is allocated using aligned malloc instead of checking out from the memory pool. Experiments have shown malloc costs to be far lower than those of a memory pool guarded by locks, especially for higher thread counts. -LPGEMM bench fixes for a crash observed when benchmarking with post-ops enabled and no downscale. 
AMD-Internal: [SWLCSG-2354] Change-Id: I4e92feadd2cf638bb26dd03b773556800a1a3d50 --- addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c | 2 +- addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c | 2 +- addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c | 2 +- addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c | 2 +- bench/bench_aocl_gemm/bench_lpgemm.c | 5 ++--- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c index 40d1f70ccb..974ff4f3eb 100644 --- a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c +++ b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c @@ -175,7 +175,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16) lpgemm_alloc_mem_panel ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE, &mem_scale_c, rntm ); diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c index e8decd4ca2..21fa102fd4 100644 --- a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c +++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c @@ -182,7 +182,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32) lpgemm_alloc_mem_panel ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE, &mem_scale_c, rntm ); diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c index 1e8d9357c1..c55e4a39af 100644 --- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c +++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c @@ -172,7 +172,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16) lpgemm_alloc_mem_panel ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE, &mem_scale_c, rntm ); diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c index d0b06f207b..b69f5395f0 100644 --- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c +++ 
b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c @@ -181,7 +181,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32) lpgemm_alloc_mem_panel ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE, &mem_scale_c, rntm ); diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 23acb828ea..6f93dba961 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -834,6 +834,8 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ post_ops->eltwise = NULL; \ post_ops->bias.bias = NULL; \ post_ops->sum.scale_factor = NULL; \ + post_ops->sum.buff = NULL; \ + post_ops->sum.zero_point = NULL; \ if ( post_ops_str != NULL ) \ { \ char* ops_tok = strtok(post_ops_str, ", " ); \ @@ -998,9 +1000,6 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ cur_op_index++; \ \ post_ops->sum.is_power_of_2 = FALSE; \ - post_ops->sum.scale_factor = NULL; \ - post_ops->sum.buff = NULL; \ - post_ops->sum.zero_point = NULL; \ if ( global_dscale_out == 'y' ) \ { \ /* Allocate scale buffer, return early if alloc fails.*/ \ From ac3e8ff01ba0839961fe0eafa33cd5eca4a107d2 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Wed, 25 Oct 2023 05:53:46 +0530 Subject: [PATCH 168/226] Bug fix and enhancements in bf16bf16f32obf16|f32 Details: - Updated pack function call in ic loop to accept correct params. - Modified documentation in bench file to reflect updated usage of bench for downscaled APIs. - Modified memory allocation for C panel in BF16 APIs to use BLIS_BUFFER_FOR_GEN_USE while requesting for memory from pool. 
Change-Id: Id624ed92ae7c8dafd7f6a32fc1554d2357de4df5 --- addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 4 ++-- bench/bench_aocl_gemm/bench_lpgemm.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index 10855970d3..2c9e188ea1 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -174,7 +174,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) lpgemm_alloc_mem_panel ( - mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL, + mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE, &mem_scale_c, rntm ); @@ -342,7 +342,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) ( pack_a_buffer_bf16, ( a + ( rs_a * ic ) + ( cs_a * pc )), rs_a, cs_a, - ( ic_end - ic_start ), kc0, + mc0, kc0, &rs_a_use, &cs_a_use ); a_use = pack_a_buffer_bf16; diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 6f93dba961..f20c819c1f 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -1453,14 +1453,17 @@ int main( int argc, char** argv ) " It is to be noted only one activator can be used at a time.\n" \ " If more than one activator is used, only the first activator is\n" \ " applied and the other activators are ignored.\n" \ - "--Downscaled version of an API is enabled by using -d arg.\n" \ + "--Downscaled version of an API is enabled by using -d arg followed\n" \ + " by the datatype that needs to be downscaled to" " Downscaled api's are used to enable quantization workflows.\n" \ " Following downscaled api's are supported:\n" \ - " 1. u8s8s32os32 -d = u8s8s32os8.\n" \ - " 2. u8s8s16os16 -d = u8s8s16os8.\n" \ - " 3. bf16bf16f32obf32 -d = bf16bf16f32obf16.\n" \ - " 4. s8s8s32os32 -d = s8s8s32os8.\n" \ - " 5. s8s8s16os16 -d = s8s8s16os8.\n" \ + " 1. u8s8s32os32 -d s8 = u8s8s32os8.\n" \ + " 2. 
u8s8s16os16 -d s8 = u8s8s16os8.\n" \ + " 3. u8s8s16os16 -d u8 = u8s8s16ou8.\n" \ + " 4. bf16bf16f32obf32 -d bf16 = bf16bf16f32obf16.\n" \ + " 5. s8s8s32os32 -d s8 = s8s8s32os8.\n" \ + " 6. s8s8s16os16 -d s8 = s8s8s16os8.\n" \ + " Example: ./bench_lpgemm -m a -n 2 -o bias,relu -d bf16 -i input.txt\n" \ ); exit( 1 ); } From 7bcb701b79f6f3813c7458c3e1c46a3868ecf080 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Wed, 25 Oct 2023 14:38:16 +0530 Subject: [PATCH 169/226] Fixed functionality failure for dgemm tiny kernel. - For k > KC, C matrix is getting scaled by beta on each iteration. It should be scaled only once. Fixed the scaling of C matrix by beta in K loop. - Corrected A and B matrix buffer offsets, for cases where k > KC. AMD-Internal: [CPUPL-4078] AMD-Internal: [CPUPL-4079] AMD-Internal: [CPUPL-4081] AMD-Internal: [CPUPL-4080] AMD-Internal: [CPUPL-4087] Change-Id: I27f426caf48e094fd75f1f719acb4ac37d9daeaa --- frame/compat/bla_gemm_amd.c | 4 + kernels/zen/3/bli_gemm_tiny.c | 201 +++++++++++++++++++++++++--------- 2 files changed, 151 insertions(+), 54 deletions(-) diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index 87b5c107a7..c9cf3342e4 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -682,6 +682,10 @@ void dgemm_blis_impl if(tiny_ret == BLIS_SUCCESS) { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS */ + bli_finalize_auto(); return; } diff --git a/kernels/zen/3/bli_gemm_tiny.c b/kernels/zen/3/bli_gemm_tiny.c index 10f37d714a..667f776a18 100644 --- a/kernels/zen/3/bli_gemm_tiny.c +++ b/kernels/zen/3/bli_gemm_tiny.c @@ -164,55 +164,99 @@ static err_t bli_dgemm_tiny_24x8_kernel double *A = a_local; double *B = b_local; double *C = c_local; - double *alpha_cast, *beta_cast; + double *alpha_cast; + double beta_cast = *beta; + double one_local = 1.0; alpha_cast = (double *)alpha; - beta_cast = (double *)beta; /** * Set 
blocking and micro tile parameters before computing */ - dim_t NC = 4080; - dim_t MC = 144; - dim_t KC = 480; - dim_t MR_ = 24; - dim_t NR_ = 8; + const dim_t MC = 144; + const dim_t KC = 480; + const dim_t MR_ = 24; + const dim_t NR_ = 8; /** - * NC and MC must be in multiple of MR_ and NR_. + * MC must be in multiple of MR_. * if not return early. */ - if( (NC % NR_ != 0) || (MC % MR_ != 0) ) + if( MC % MR_ != 0 ) { return BLIS_FAILURE; } - dim_t n_part_rem = N % NC; dim_t n_rem = N % NR_; dim_t m_part_rem = M % MC; dim_t k_rem = K % KC; - dim_t n_part = 0; dim_t n_cur = 0; dim_t m_cur = 0; dim_t k_cur = 0; + dim_t k_iter = 0; auxinfo_t aux; inc_t ps_a_use = (MR_ * rs_a); bli_auxinfo_set_ps_a( ps_a_use, &aux ); - for ( dim_t n_iter = 0; n_iter < N; n_iter += NC ) + + /** + * JC Loop is eliminated as it iterates only once, So computation + * can start from K loop. + * Here K loop is divided into two parts to avoid repetitive check for Beta. + * For first iteration, it will use Beta to scale C matrix. + * Subsequent iterations will scale C matrix by 1. + */ + k_iter = 0; //1st k loop, scale C matrix by beta + k_cur = (KC <= K ? KC : k_rem); + for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) { - n_part = (NC <= (N - n_iter) ? NC : n_part_rem); - for ( dim_t k_iter = 0; k_iter < K; k_iter += KC ) + m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); + for ( dim_t jr_iter = 0; jr_iter < N; jr_iter += NR_ ) { - k_cur = (KC <= (K - k_iter) ? KC : k_rem); - for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) + n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); + bli_dgemmsup_rv_zen4_asm_24x8m(conja, + conjb, + m_cur, + n_cur, + k_cur, + alpha_cast, + (A + (m_iter * rs_a) + (k_iter * cs_a)), /*A matrix offset*/ + rs_a, + cs_a, + (B + (jr_iter * cs_b) + (k_iter * rs_b)), /*B matrix offset*/ + rs_b, + cs_b, + &beta_cast, + (C + jr_iter * cs_c + m_iter * rs_c), /*C matrix offset*/ + rs_c, + cs_c, + &aux, + NULL); + } + } + // k_iter = KC loop where C matrix is scaled by one. 
Beta is one. + for (k_iter = KC ; k_iter < K; k_iter += KC ) + { + k_cur = (KC <= (K - k_iter) ? KC : k_rem); + for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) + { + m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); + for ( dim_t jr_iter = 0; jr_iter < N; jr_iter += NR_ ) { - m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); - for ( dim_t jr_iter = 0; jr_iter < n_part; jr_iter += NR_ ) - { - n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); - bli_dgemmsup_rv_zen4_asm_24x8m(conja, conjb, m_cur, n_cur, k_cur, - alpha_cast, A + m_iter * rs_a, - rs_a, cs_a, - B + jr_iter * cs_b, rs_b, cs_b, - beta_cast, - (C + jr_iter * cs_c + m_iter * rs_c), rs_c, cs_c, &aux, NULL); - } + n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); + bli_dgemmsup_rv_zen4_asm_24x8m(conja, + conjb, + m_cur, + n_cur, + k_cur, + alpha_cast, + (A + (m_iter * rs_a) + (k_iter * cs_a)), /*A matrix offset*/ + rs_a, + cs_a, + (B + (jr_iter * cs_b) + (k_iter * rs_b)), /*B matrix offset*/ + rs_b, + cs_b, + &one_local, + (C + jr_iter * cs_c + m_iter * rs_c), /*C matrix offset*/ + rs_c, + cs_c, + &aux, + NULL); } } } @@ -340,55 +384,104 @@ static err_t bli_dgemm_tiny_6x8_kernel double *A = a_local; double *B = b_local; double *C = c_local; - double *alpha_cast, *beta_cast; + double *alpha_cast; + double beta_cast = *beta; + double one_local = 1.0; + alpha_cast = (double *)alpha; - beta_cast = (double *)beta; /** * Set blocking and micro tile parameters before computing */ - dim_t NC = 4080; - dim_t MC = 72; - dim_t KC = 256; - dim_t MR_ = 6; - dim_t NR_ = 8; + const dim_t MC = 72; + const dim_t KC = 256; + const dim_t MR_ = 6; + const dim_t NR_ = 8; + + /** - * NC and MC must be in multiple of MR_ and NR_. + * MC must be in multiple of MR_. * if not return early. 
*/ - if( (NC % NR_ != 0) || (MC % MR_ != 0) ) + if( MC % MR_ != 0 ) { return BLIS_FAILURE; } - dim_t n_part_rem = N % NC; dim_t n_rem = N % NR_; dim_t m_part_rem = M % MC; dim_t k_rem = K % KC; - dim_t n_part = 0; dim_t n_cur = 0; dim_t m_cur = 0; dim_t k_cur = 0; + dim_t k_iter = 0; + auxinfo_t aux; inc_t ps_a_use = (MR_ * rs_a); bli_auxinfo_set_ps_a( ps_a_use, &aux ); - for ( dim_t n_iter = 0; n_iter < N; n_iter += NC ) + dgemmsup_ker_ft kern_ptr = kern_fp[stor_id]; + + /** + * JC Loop is eliminated as it iterates only once, So computation + * can start from K loop. + * Here K loop is divided into parts to avoid repetitive check for Beta. + * For first iteration, it will use Beta to scale C matrix. + * Subsequent iterations will scale C matrix by 1. + */ + k_iter = 0; //1st k loop, scale C matrix by beta + k_cur = (KC <= K ? KC : k_rem); + for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) + { + m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); + for ( dim_t jr_iter = 0; jr_iter < N; jr_iter += NR_ ) + { + n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); + kern_ptr(conja, + conjb, + m_cur, + n_cur, + k_cur, + alpha_cast, + (A + (m_iter * rs_a) + (k_iter * cs_a)), /*A matrix offset*/ + rs_a, + cs_a, + (B + (jr_iter * cs_b) + (k_iter * rs_b)), /*B matrix offset*/ + rs_b, + cs_b, + &beta_cast, + (C + (jr_iter * cs_c) + (m_iter * rs_c)), /*C matrix offset*/ + rs_c, + cs_c, + &aux, + NULL); + } + } + // k_iter = KC loop where C matrix is scaled by one. Beta is one. + for (k_iter = KC; k_iter < K; k_iter += KC ) { - n_part = (NC <= (N - n_iter) ? NC : n_part_rem); - for ( dim_t k_iter = 0; k_iter < K; k_iter += KC ) + k_cur = (KC <= (K - k_iter) ? KC : k_rem); + for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) { - k_cur = (KC <= (K - k_iter) ? KC : k_rem); - for ( dim_t m_iter = 0; m_iter < M; m_iter += MC) + m_cur = (MC <= (M - m_iter) ? MC : m_part_rem); + for ( dim_t jr_iter = 0; jr_iter < N; jr_iter += NR_ ) { - m_cur = (MC <= (M - m_iter) ? 
MC : m_part_rem); - for ( dim_t jr_iter = 0; jr_iter < n_part; jr_iter += NR_ ) - { - n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); - kern_fp[stor_id](conja, conjb, m_cur, n_cur, k_cur, - alpha_cast, A + m_iter * rs_a, - rs_a, cs_a, - B + jr_iter * cs_b, rs_b, cs_b, - beta_cast, - (C + jr_iter * cs_c + m_iter * rs_c), rs_c, cs_c, &aux, NULL); - } + n_cur = (NR_ <= (N - jr_iter) ? NR_ : n_rem); + kern_ptr(conja, + conjb, + m_cur, + n_cur, + k_cur, + alpha_cast, + (A + (m_iter * rs_a) + (k_iter * cs_a)), /*A matrix offset*/ + rs_a, + cs_a, + (B + (jr_iter * cs_b) + (k_iter * rs_b)), /*B matrix offset*/ + rs_b, + cs_b, + &one_local, + (C + (jr_iter * cs_c) + (m_iter * rs_c)), /*C matrix offset*/ + rs_c, + cs_c, + &aux, + NULL); } } } From d45d1d68c65405f49126eacaf3b86087333fad42 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Wed, 25 Oct 2023 15:14:06 +0530 Subject: [PATCH 170/226] Reset ZMM Registers before exiting, in L3 APIs - Registers ZMM16 to ZMM31 are zeroed after L3 API calls. - This change is done only for the ZEN4 code path. - A bli_zero_zmm function is added, which resets these registers. 
AMD-Internal: [CPUPL-3882] Change-Id: I7f16fde567c72ae6e9d5d6c6d5d167dd7d54a3b8 (cherry picked from commit d245ef5fb264cd1fcfa03c842ea97a436a26e7a2) --- frame/compat/bla_gemm_amd.c | 43 +++++++++++++++++++++-- frame/compat/bla_gemmt.c | 20 +++++++++-- frame/compat/bla_hemm.c | 21 +++++++++-- frame/compat/bla_her2k.c | 21 +++++++++-- frame/compat/bla_herk.c | 21 +++++++++-- frame/compat/bla_symm.c | 21 +++++++++-- frame/compat/bla_syr2k.c | 21 +++++++++-- frame/compat/bla_syrk.c | 21 +++++++++-- frame/compat/bla_trmm.c | 21 +++++++++-- frame/compat/bla_trsm_amd.c | 50 ++++++++++++++++++++++++-- kernels/zen4/3/CMakeLists.txt | 1 + kernels/zen4/3/bli_zero_zmm.c | 62 +++++++++++++++++++++++++++++++++ kernels/zen4/bli_kernels_zen4.h | 3 ++ 13 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 kernels/zen4/3/bli_zero_zmm.c diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index c9cf3342e4..9d92763666 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -38,6 +38,24 @@ // // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define GEMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define GEMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + +#endif + + #define ENABLE_INDUCED_METHOD 0 #ifdef BLIS_BLAS3_CALLS_TAPI @@ -179,7 +197,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + GEMM_BLIS_IMPL(ch,blasname) \ } \ ) @@ -412,7 +430,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + GEMM_BLIS_IMPL(ch,blasname) \ } \ ) @@ -834,6 +852,13 @@ void dgemm_ ) { dgemm_blis_impl(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif void zgemm_blis_impl @@ -1188,6 +1213,13 @@ void zgemm_ ) { zgemm_blis_impl(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif INSERT_GENTFUNC_BLAS_SC( gemm, gemm ) @@ -1336,5 +1368,12 @@ void dzgemm_ ) { dzgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif diff --git a/frame/compat/bla_gemmt.c b/frame/compat/bla_gemmt.c index 815cab7372..043342fe40 100644 --- a/frame/compat/bla_gemmt.c +++ b/frame/compat/bla_gemmt.c @@ -38,6 +38,22 @@ // // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define GEMMT_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define GEMMT_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + +#endif #ifdef BLIS_BLAS3_CALLS_TAPI @@ -150,7 +166,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + GEMMT_BLIS_IMPL(ch,blasname) \ } \ ) @@ -283,7 +299,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + GEMMT_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_hemm.c b/frame/compat/bla_hemm.c index c0af5fe0ba..36390edb2b 100644 --- a/frame/compat/bla_hemm.c +++ b/frame/compat/bla_hemm.c @@ -40,6 +40,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define HEMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define HEMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNCCO @@ -166,7 +183,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + HEMM_BLIS_IMPL(ch,blasname) \ } \ ) @@ -318,7 +335,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + HEMM_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_her2k.c b/frame/compat/bla_her2k.c index 78456ee77f..6912cf5772 100755 --- a/frame/compat/bla_her2k.c +++ b/frame/compat/bla_her2k.c @@ -40,6 +40,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define HER2K_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define HER2K_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNCCO @@ -182,7 +199,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + HER2K_BLIS_IMPL(ch,blasname) \ } \ ) @@ -350,7 +367,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + HER2K_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_herk.c b/frame/compat/bla_herk.c index 9678ec4845..bd4a79eff2 100755 --- a/frame/compat/bla_herk.c +++ b/frame/compat/bla_herk.c @@ -40,6 +40,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define HERK_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define HERK_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNCCO @@ -175,7 +192,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + HERK_BLIS_IMPL(ch,blasname) \ } \ ) @@ -332,7 +349,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + HERK_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_symm.c b/frame/compat/bla_symm.c index f171c495fe..9d86662515 100755 --- a/frame/compat/bla_symm.c +++ b/frame/compat/bla_symm.c @@ -40,6 +40,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define SYMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define SYMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC @@ -165,7 +182,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + SYMM_BLIS_IMPL(ch,blasname) \ } \ ) @@ -316,7 +333,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \ + SYMM_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_syr2k.c b/frame/compat/bla_syr2k.c index 0cf6367537..b7ebfc0c7f 100644 --- a/frame/compat/bla_syr2k.c +++ b/frame/compat/bla_syr2k.c @@ -40,6 +40,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define SYR2K_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define SYR2K_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC @@ -174,7 +191,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + SYR2K_BLIS_IMPL(ch,blasname) \ } \ ) @@ -334,7 +351,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \ + SYR2K_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_syrk.c b/frame/compat/bla_syrk.c index dc93422146..ad9a51b67a 100644 --- a/frame/compat/bla_syrk.c +++ b/frame/compat/bla_syrk.c @@ -40,6 +40,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define SYRK_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define SYRK_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC @@ -166,7 +183,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + SYRK_BLIS_IMPL(ch,blasname) \ } \ ) @@ -313,7 +330,7 @@ void PASTEF77(ch,blasname) \ ftype* c, const f77_int* ldc \ ) \ { \ - PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \ + SYRK_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_trmm.c b/frame/compat/bla_trmm.c index a687850332..1390338135 100644 --- a/frame/compat/bla_trmm.c +++ b/frame/compat/bla_trmm.c @@ -39,6 +39,23 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define TRMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define TRMM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + +#endif + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC @@ -163,7 +180,7 @@ void PASTEF77(ch,blasname) \ ftype* b, const f77_int* ldb \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + TRMM_BLIS_IMPL(ch,blasname) \ } \ ) @@ -306,7 +323,7 @@ void PASTEF77(ch,blasname) \ ftype* b, const f77_int* ldb \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + TRMM_BLIS_IMPL(ch,blasname) \ } \ ) diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c index 6f5c1137fd..2294518b6a 100644 --- a/frame/compat/bla_trsm_amd.c +++ b/frame/compat/bla_trsm_amd.c @@ -40,6 +40,24 @@ // Define BLAS-to-BLIS interfaces. 
// +#if defined(BLIS_KERNELS_ZEN4) + + #define TRSM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + arch_t id = bli_arch_query_id(); \ + if (id == BLIS_ARCH_ZEN4) \ + { \ + bli_zero_zmm(); \ + } \ + +#else + + #define TRSM_BLIS_IMPL(ch, blasname) \ + PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + +#endif + + #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC @@ -169,7 +187,7 @@ void PASTEF77(ch,blasname) \ ftype* b, const f77_int* ldb \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + TRSM_BLIS_IMPL(ch,blasname) \ } \ ) #else @@ -474,7 +492,7 @@ void PASTEF77(ch,blasname) \ ftype* b, const f77_int* ldb \ ) \ { \ - PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \ + TRSM_BLIS_IMPL(ch, blasname) \ } \ #endif @@ -794,6 +812,13 @@ void strsm_ ) { strsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif void dtrsm_blis_impl @@ -1178,6 +1203,13 @@ void dtrsm_ ) { dtrsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif @@ -1557,6 +1589,13 @@ void ztrsm_ ) { ztrsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif @@ -1934,6 +1973,13 @@ void ctrsm_ ) { ctrsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); +#if defined(BLIS_KERNELS_ZEN4) + arch_t id = bli_arch_query_id(); + if (id == BLIS_ARCH_ZEN4) + { + bli_zero_zmm(); + } +#endif } #endif diff --git a/kernels/zen4/3/CMakeLists.txt 
b/kernels/zen4/3/CMakeLists.txt index 0b38920998..6b03b08ec8 100644 --- a/kernels/zen4/3/CMakeLists.txt +++ b/kernels/zen4/3/CMakeLists.txt @@ -10,6 +10,7 @@ add_library(zen4_3 ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_8x24.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_12x4.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zero_zmm.c ) target_compile_options(zen4_3 PRIVATE /arch:AVX2 /arch:AVX512) diff --git a/kernels/zen4/3/bli_zero_zmm.c b/kernels/zen4/3/bli_zero_zmm.c new file mode 100644 index 0000000000..47cae67c49 --- /dev/null +++ b/kernels/zen4/3/bli_zero_zmm.c @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#include "blis.h" +#include "bli_x86_asm_macros.h" +void bli_zero_zmm() +{ + + BEGIN_ASM() + VXORPD(ZMM(16), ZMM(16), ZMM(16)) + VXORPD(ZMM(17), ZMM(17), ZMM(17)) + VXORPD(ZMM(18), ZMM(18), ZMM(18)) + VXORPD(ZMM(19), ZMM(19), ZMM(19)) + VXORPD(ZMM(20), ZMM(20), ZMM(20)) + VXORPD(ZMM(21), ZMM(21), ZMM(21)) + VXORPD(ZMM(22), ZMM(22), ZMM(22)) + VXORPD(ZMM(23), ZMM(23), ZMM(23)) + VXORPD(ZMM(24), ZMM(24), ZMM(24)) + VXORPD(ZMM(25), ZMM(25), ZMM(25)) + VXORPD(ZMM(26), ZMM(26), ZMM(26)) + VXORPD(ZMM(27), ZMM(27), ZMM(27)) + VXORPD(ZMM(28), ZMM(28), ZMM(28)) + VXORPD(ZMM(29), ZMM(29), ZMM(29)) + VXORPD(ZMM(30), ZMM(30), ZMM(30)) + VXORPD(ZMM(31), ZMM(31), ZMM(31)) + + END_ASM (::: + "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", + "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", + "zmm29", "zmm30", "zmm31", "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h index d8ec5e6d7d..1757ae3fae 100644 --- a/kernels/zen4/bli_kernels_zen4.h +++ b/kernels/zen4/bli_kernels_zen4.h @@ -208,3 +208,6 @@ bool bli_cntx_gemmsup_thresh_is_met_zen4 obj_t* c, cntx_t* cntx ); + +// function for resetting zmm registers after L3 apis +void bli_zero_zmm(); From 834bf604c18780b3fdaa50b6071b359fca1accf1 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Thu, 26 Oct 2023 10:31:12 -0400 Subject: [PATCH 171/226] Option to use shared library for BLIS tests Current BLIS makefile always uses the static 
library on Linux for all BLIS test programs. This commit adds the option to use the shared library instead by specifying e.g. make checkblis USE_SHARED=yes Executables are generated in different sub-directories for static and shared libraries. AMD-Internal: [CPUPL-4107] Change-Id: I3ab5d505cfbc5f6ef47aa28fcbb846c52d56c3f2 --- Makefile | 40 ++++++++++++++++++++++------------------ common.mk | 38 +++++++++++++++++++++++--------------- vendor/testcpp/Makefile | 14 ++++++++------ 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index d0537efa8c..8f504e5f3b 100644 --- a/Makefile +++ b/Makefile @@ -320,6 +320,7 @@ BLASTEST_INPUT_PATH := $(DIST_PATH)/$(BLASTEST_DIR)/input # The location of the BLAS test suite object directory. BASE_OBJ_BLASTEST_PATH := $(BASE_OBJ_PATH)/$(BLASTEST_DIR) +BASE_EXE_BLASTEST_PATH := $(BASE_OBJ_BLASTEST_PATH)/$(MK_USE_LIB) # The locations of the BLAS test suite source code (f2c and drivers). BLASTEST_F2C_SRC_PATH := $(DIST_PATH)/$(BLASTEST_DIR)/f2c @@ -347,7 +348,7 @@ BLASTEST_DRV_BASES := $(basename $(notdir $(BLASTEST_DRV_OBJS))) # The binary executable driver names. BLASTEST_DRV_BINS := $(addsuffix .x,$(BLASTEST_DRV_BASES)) -BLASTEST_DRV_BIN_PATHS := $(addprefix $(BASE_OBJ_BLASTEST_PATH)/,$(BLASTEST_DRV_BINS)) +BLASTEST_DRV_BIN_PATHS := $(addprefix $(BASE_EXE_BLASTEST_PATH)/,$(BLASTEST_DRV_BINS)) # Binary executable driver "run-" names BLASTEST_DRV_BINS_R := $(addprefix run-,$(BLASTEST_DRV_BASES)) @@ -393,6 +394,7 @@ TESTSUITE_SALT_OPS_PATH := $(DIST_PATH)/$(TESTSUITE_DIR)/$(TESTSUITE_SALT_OPS) # directory. 
TESTSUITE_SRC_PATH := $(DIST_PATH)/$(TESTSUITE_DIR)/src BASE_OBJ_TESTSUITE_PATH := $(BASE_OBJ_PATH)/$(TESTSUITE_DIR) +BASE_EXE_TESTSUITE_PATH := $(BASE_OBJ_PATH)/$(TESTSUITE_DIR)/$(MK_USE_LIB) # Convert source file paths to object file paths by replacing the base source # directories with the base object directories, and also replacing the source @@ -414,7 +416,7 @@ MK_TESTSUITE_OBJS := $(sort \ # unusual environments (e.g. ARM) can run the testsuite through some other # binary. See .travis.yml for details on how the variable is employed in # practice. -TESTSUITE_BIN := test_$(LIBBLIS).x +TESTSUITE_BIN := $(BASE_EXE_TESTSUITE_PATH)/test_$(LIBBLIS).x TESTSUITE_WRAPPER ?= # The location of the script that checks the BLIS testsuite output. @@ -850,7 +852,8 @@ endif # first argument: the base name of the BLAS test driver. define make-blat-rule -$(BASE_OBJ_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) +$(BASE_EXE_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) + @mkdir -p $(BASE_EXE_BLASTEST_PATH) ifeq ($(ENABLE_VERBOSE),yes) $(LINKER) $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $$@ else @@ -864,12 +867,12 @@ $(foreach name, $(BLASTEST_DRV_BASES), $(eval $(call make-blat-rule,$(name)))) # A rule to run ?blat1.x driver files. define make-run-blat1-rule -run-$(1): $(BASE_OBJ_BLASTEST_PATH)/$(1).x +run-$(1): $(BASE_EXE_BLASTEST_PATH)/$(1).x ifeq ($(ENABLE_VERBOSE),yes) - $(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x > out.$(1) + $(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x > out.$(1) else @echo "Running $(1).x > 'out.$(1)'" - @$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x > out.$(1) + @$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x > out.$(1) endif endef @@ -878,12 +881,12 @@ $(foreach name, $(BLASTEST_DRV1_BASES), $(eval $(call make-run-blat1-rule,$(name # A rule to run ?blat2.x and ?blat3.x driver files. 
define make-run-blat23-rule -run-$(1): $(BASE_OBJ_BLASTEST_PATH)/$(1).x +run-$(1): $(BASE_EXE_BLASTEST_PATH)/$(1).x ifeq ($(ENABLE_VERBOSE),yes) - $(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in + $(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in else @echo "Running $(1).x < '$(BLASTEST_INPUT_PATH)/$(1).in' (output to 'out.$(1)')" - @$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in + @$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in endif endef @@ -926,6 +929,7 @@ endif # Testsuite binary rule. $(TESTSUITE_BIN): $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK) + @mkdir -p $(BASE_EXE_TESTSUITE_PATH) ifeq ($(ENABLE_VERBOSE),yes) $(LINKER) $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ else @@ -936,13 +940,13 @@ endif # A rule to run the testsuite using the normal input.* files. testsuite-run: testsuite-bin ifeq ($(ENABLE_VERBOSE),yes) - $(TESTSUITE_WRAPPER) ./$(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \ + $(TESTSUITE_WRAPPER) $(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \ -o $(TESTSUITE_CONF_OPS_PATH) \ > $(TESTSUITE_OUT_FILE) else @echo "Running $(TESTSUITE_BIN) with output redirected to '$(TESTSUITE_OUT_FILE)'" - @$(TESTSUITE_WRAPPER) ./$(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \ + @$(TESTSUITE_WRAPPER) $(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \ -o $(TESTSUITE_CONF_OPS_PATH) \ > $(TESTSUITE_OUT_FILE) endif @@ -1285,7 +1289,7 @@ ifeq ($(IS_CONFIGURED),yes) ifeq ($(ENABLE_VERBOSE),yes) - $(RM_F) $(BLASTEST_F2C_OBJS) $(BLASTEST_DRV_OBJS) - $(RM_F) $(BLASTEST_F2C_LIB) - - $(RM_F) $(BLASTEST_DRV_BIN_PATHS) + - $(RM_RF) $(BASE_OBJ_BLASTEST_PATH)/{shared,static} - $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES)) else @echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)" @@ -1293,7 +1297,7 @@ else @echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)" @- $(RM_F) $(BLASTEST_F2C_LIB) @echo "Removing 
binaries from $(BASE_OBJ_BLASTEST_PATH)" - @- $(RM_F) $(BLASTEST_DRV_BIN_PATHS) + @- $(RM_RF) $(BASE_OBJ_BLASTEST_PATH)/{shared,static} @echo "Removing driver output files 'out.*'" @- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES)) endif # ENABLE_VERBOSE @@ -1328,13 +1332,13 @@ cleanblistesttop: ifeq ($(IS_CONFIGURED),yes) ifeq ($(ENABLE_VERBOSE),yes) - $(RM_F) $(MK_TESTSUITE_OBJS) - - $(RM_F) $(TESTSUITE_BIN) + - $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static} - $(RM_F) $(TESTSUITE_OUT_FILE) else @echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)" @- $(RM_F) $(MK_TESTSUITE_OBJS) @echo "Removing binary $(TESTSUITE_BIN)" - @- $(RM_F) $(TESTSUITE_BIN) + @- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static} @echo "Removing $(TESTSUITE_OUT_FILE)" @- $(RM_F) $(TESTSUITE_OUT_FILE) endif # ENABLE_VERBOSE @@ -1344,13 +1348,13 @@ cleanblistestdir: ifeq ($(IS_CONFIGURED),yes) ifeq ($(ENABLE_VERBOSE),yes) - $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F) - - $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN) + - $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static} - $(MAKE) -C $(VEND_TESTCPP_DIR) clean else @echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)" @- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F) - @echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)" - @- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN) + @echo "Removing binary $(TESTSUITE_BIN)" + @- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static} @$(MAKE) -C $(VEND_TESTCPP_DIR) clean endif # ENABLE_VERBOSE endif # IS_CONFIGURED diff --git a/common.mk b/common.mk index 4c55588b05..56820d06f2 100644 --- a/common.mk +++ b/common.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -599,27 +599,35 @@ SOFLAGS += -Wl,-soname,$(LIBBLIS_SONAME) endif endif +# Decide whether to use static or shared library on Linux and OS X +MK_USE_LIB=static +ifeq ($(MK_ENABLE_STATIC),no) + MK_USE_LIB=shared +endif +ifeq ($(USE_SHARED),yes) + MK_USE_LIB=shared +endif + # Decide which library to link to for things like the testsuite and BLIS test # drivers. We default to the static library, unless only the shared library was # enabled, in which case we use the shared library. LIBBLIS_L := $(LIBBLIS_A) LIBBLIS_LINK := $(LIBBLIS_A_PATH) ifeq ($(MK_ENABLE_SHARED),yes) -ifeq ($(MK_ENABLE_STATIC),no) -LIBBLIS_L := $(LIBBLIS_SO) -LIBBLIS_LINK := $(LIBBLIS_SO_PATH) -ifeq ($(IS_WIN),no) -# For Linux and OS X: set rpath property of shared object. -LDFLAGS += -Wl,-rpath,$(BASE_LIB_PATH) + ifeq ($(MK_USE_LIB),shared) + LIBBLIS_L := $(LIBBLIS_SO) + LIBBLIS_LINK := $(LIBBLIS_SO_PATH) + ifeq ($(IS_WIN),no) + # For Linux and OS X: set rpath property of shared object. + LDFLAGS += -Wl,-rpath,$(BASE_LIB_PATH) + endif + endif + # On windows, use the shared library even if static is created. + ifeq ($(IS_WIN),yes) + LIBBLIS_L := $(LIBBLIS_SO) + LIBBLIS_LINK := $(LIBBLIS_SO_PATH) + endif endif -endif -# On windows, use the shared library even if static is created. -ifeq ($(IS_WIN),yes) -LIBBLIS_L := $(LIBBLIS_SO) -LIBBLIS_LINK := $(LIBBLIS_SO_PATH) -endif -endif - # # --- Include makefile definitions file ---------------------------------------- diff --git a/vendor/testcpp/Makefile b/vendor/testcpp/Makefile index 9a5a466f59..0f6b5f9cc3 100644 --- a/vendor/testcpp/Makefile +++ b/vendor/testcpp/Makefile @@ -3,7 +3,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2017 - 2021, Advanced Micro Devices, Inc. +# Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -142,8 +142,7 @@ LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) # all: blis - -blis: test_asum_blis.x \ +CPPEXES := test_asum_blis.x \ test_axpy_blis.x \ test_copy_blis.x \ test_dot_blis.x \ @@ -183,8 +182,10 @@ blis: test_asum_blis.x \ test_trmm_blis.x \ test_trsm_blis.x \ test_trsv_blis.x - +CPPEXES := $(addprefix $(MK_USE_LIB)/,$(CPPEXES)) + +blis: $(CPPEXES) # --Object file rules -- @@ -197,7 +198,8 @@ test_%_blis.o: test_%.cc # -- Executable file rules -- -test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK) +$(MK_USE_LIB)/test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK) + @mkdir -p ./$(MK_USE_LIB) @$(LINKER) $^ $(LIBBLIS_LINK) $(LDFLAGS) -o $@ ./$@ @@ -206,5 +208,5 @@ test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK) clean: cleanx cleanx: - - $(RM_F) *.o *.x + - $(RM_F) ./*.o ./{shared,static}/*.x From 248dc2af9a518ced89c726b0f8ff60e19a4173ac Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Thu, 26 Oct 2023 09:59:07 -0400 Subject: [PATCH 172/226] Implement AOCL_ENABLE_INSTRUCTIONS environment variable Add AOCL_ENABLE_INSTRUCTIONS environment variable as an alternative to BLIS_ARCH_TYPE. The details are: 1. AOCL_ENABLE_INSTRUCTIONS and BLIS_ARCH_TYPE env vars are both supported, with BLIS_ARCH_TYPE taking precedence if both are set. 2. Values of "avx2" and "avx512" are aliases for "zen3" and "zen4" code paths respectively in AMD focused builds, or for "haswell" and "skx" respectively in Intel focused builds. These names are not case-sensitive. 3. BLIS_ARCH_TYPE specifies the code path to use. If this is unsupported, e.g. zen4 code path on a Milan or earlier system, that code path is still executed, likely resulting in an illegal instruction error. 4. By contrast, AOCL_ENABLE_INSTRUCTIONS will check ISA support on the system (for AVX2 and AVX512), and try a "lower" ISA option if the desired one is not supported, i.e.
AVX512->AVX2, AVX2->generic. 5. Appropriate messages are printed if BLIS_ARCH_DEBUG=1 is set. AMD-Internal: [CPUPL-4105] Change-Id: Ia941b41d4b7d11f5589d7c5e16f607618baed315 --- frame/base/bli_arch.c | 166 +++++++++++++++++++++++++++++++++++++----- frame/base/bli_env.c | 38 ++++++++++ 2 files changed, 186 insertions(+), 18 deletions(-) diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 3792264e9a..1fc9ef43c1 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -62,6 +62,9 @@ static model_t model_id = -1; // bli_arch_set_id() and bli_arch_check_id() static dim_t __attribute__ ((unused)) req_id = -1; +// Keep track if AOCL_ENABLE_INSTRUCTIONS environment variable was set. +static bool __attribute__ ((unused)) aocl_e_i = FALSE; + arch_t bli_arch_query_id( void ) { bli_arch_set_id_once(); @@ -136,6 +139,19 @@ void bli_arch_set_id( void ) // configure command in bli_config.h, with the default name of BLIS_ARCH_TYPE req_id = bli_env_get_var_arch_type( __blis_arch_type_name, -1 ); + // If "__blis_arch_type_name" environment variable was not set, check + // AOCL generic environment variable AOCL_ENABLE_INSTRUCTIONS. For simplicity + // we refer to either of these options below as "BLIS_ARCH_TYPE" and only + // distinguish between them where necessary. + if ( req_id == -1 ) + { + req_id = bli_env_get_var_arch_type( "AOCL_ENABLE_INSTRUCTIONS", -1 ); + if ( req_id != -1 ) + { + aocl_e_i = TRUE; + } + } + #ifndef BLIS_CONFIGURETIME_CPUID if ( req_id != -1 ) { @@ -316,6 +332,10 @@ void bli_arch_check_id( void ) { bli_arch_set_id_once(); + bool arch_reset = FALSE; + arch_t orig_arch_id= req_id; + model_t orig_model_id = model_id; + // Check arch value against configured options. Only needed // if user has set it. This function will also do the // logging of chosen arch and model (if desired). @@ -338,19 +358,100 @@ void bli_arch_check_id( void ) // called, so ensure cntx has been initialized here. 
bli_gks_init_once(); - // At this point, we know that req_id is in the valid range, but we - // don't yet know if it refers to a context that was actually - // initialized. Query the address of an internal context data structure - // corresponding to req_id. This pointer will be NULL if the associated - // subconfig is not available. - cntx_t** req_cntx = bli_gks_lookup_id( req_id ); - - // This function checks the context pointer and aborts with a useful - // error message if the pointer is found to be NULL. - if ( bli_error_checking_is_enabled() ) + bool test_arch = TRUE; + while (test_arch) { - err_t e_val = bli_check_initialized_gks_cntx( req_cntx ); - bli_check_error_code( e_val ); + + // At this point, we know that req_id is in the valid range, but we + // don't yet know if it refers to a context that was actually + // initialized. Query the address of an internal context data structure + // corresponding to req_id. This pointer will be NULL if the associated + // subconfig is not available. + cntx_t** req_cntx = bli_gks_lookup_id( req_id ); + + // This function checks the context pointer and aborts with a useful + // error message if the pointer is found to be NULL. + if ( bli_error_checking_is_enabled() ) + { + err_t e_val = bli_check_initialized_gks_cntx( req_cntx ); + bli_check_error_code( e_val ); + } + + // If BLIS_ARCH_TYPE (or renamed version of this environment variable) + // was set, we always use this value of req_id to set arch_id. + // However, if AOCL_ENABLE_INSTRUCTIONS was set instead, we check for + // ISA compatibility and switch to a supported option if necessary. + if ( aocl_e_i ) + { +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + + // If AVX2 test fails here we assume either: + // 1. Config was either zen, zen2, zen3, zen4, haswell or skx, + // so there is no fallback code path, hence error checking + // above will fail. + // 2. 
Config was amdzen, intel64 or x86_64, and will have + // generic code path. + if ( !bli_cpuid_is_avx2fma3_supported() ) + { + switch (req_id) + { + case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN3: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_EXCAVATOR: + case BLIS_ARCH_SKX: + case BLIS_ARCH_HASWELL: + arch_reset = TRUE; + req_id = BLIS_ARCH_GENERIC; + model_id = BLIS_MODEL_DEFAULT; + continue; + break; + } + } + // If AVX512 test fails here we assume either: + // 1. Config was either zen4 or skx, so there is + // no fallback code path, hence error checking + // above will fail. + // 2. Config was amdzen, intel64 or x86_64, and will have + // appropriate avx2 code path to try. + if ( !bli_cpuid_is_avx512_supported() ) + { + switch (req_id) + { + case BLIS_ARCH_ZEN4: + arch_reset = TRUE; + req_id = BLIS_ARCH_ZEN3; + model_id = BLIS_MODEL_DEFAULT; + continue; + break; + case BLIS_ARCH_SKX: + arch_reset = TRUE; + req_id = BLIS_ARCH_HASWELL; + model_id = BLIS_MODEL_DEFAULT; + continue; + break; + } + } + // If both tests above pass, we accept req_id choice. + test_arch = FALSE; + + // Note: Pre-AVX2 systems from AMD and Intel, and Intel KNL, + // have not been included in these tests, and thus could + // continue to give illegal instruction errors on other + // platforms, just as if BLIS_ARCH_TYPE was set to the + // same value. +#else + // Non-x86 platforms just accept value given for now. + // Similar logic to x86 if block could be implemented + // here if desired. 
+ test_arch = FALSE; +#endif + } + else + { + test_arch = FALSE; + } } // Finally, we can be confident that req_id (1) is in range and (2) @@ -361,19 +462,48 @@ void bli_arch_check_id( void ) #endif - if ( bli_arch_get_logging() ) { - if ( model_id == BLIS_MODEL_DEFAULT ) + if ( arch_reset ) + { + if ( orig_model_id == BLIS_MODEL_DEFAULT ) + { + fprintf( stderr, "libblis: Sub-configuration '%s' is not supported on this system.\nlibblis: Switching to sub-configuration '%s'.\n", + bli_arch_string( orig_arch_id ), bli_arch_string( arch_id ) ); + } + else + { + fprintf( stderr, "libblis: Sub-configuration '%s', model '%s' is not supported on this system.\nlibblis: Switching to sub-configuration '%s', model '%s'.\n", + bli_arch_string( orig_arch_id ), bli_model_string( orig_model_id ), bli_arch_string( arch_id ), bli_model_string( model_id ) ); + } + } + else { - fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n", - bli_arch_string( arch_id ) ); + if ( model_id == BLIS_MODEL_DEFAULT ) + { + fprintf( stderr, "libblis: Selecting sub-configuration '%s'.\n", + bli_arch_string( arch_id ) ); + } + else + { + fprintf( stderr, "libblis: Selecting sub-configuration '%s', model '%s'.\n", + bli_arch_string( arch_id ), bli_model_string( model_id ) ); + } + } +#if 0 + if ( orig_model_id == BLIS_MODEL_DEFAULT ) + { + fprintf( stderr, "libblis: Actual hardware '%s'.\n", + bli_arch_string( actual_arch_id ) ); + } else { - fprintf( stderr, "libblis: selecting sub-configuration '%s', model '%s'.\n", - bli_arch_string( arch_id ), bli_model_string( model_id ) ); + fprintf( stderr, "libblis: Actual hardware '%s', model '%s'.\n", + bli_arch_string( actual_arch_id ), bli_model_string( actual_model_id ) ); + } +#endif } //printf( "blis_arch_check_id(): arch_id, model_id = %u, %u\n", arch_id, model_id ); diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c index 2585a16ce7..d2e7d74d8e 100644 --- a/frame/base/bli_env.c +++ b/frame/base/bli_env.c @@ -163,6 +163,44 @@ gint_t 
bli_env_get_var_arch_type( const char* env, gint_t fallback ) { r_val = BLIS_ARCH_BULLDOZER; } + // Some aliases for mapping AMD and Intel ISA + // names to a suitable sub-configuration. +#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) || defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_ZEN2) || defined(BLIS_FAMILY_ZEN) + else if (strcmp(str, "avx512") == 0) + { + r_val = BLIS_ARCH_ZEN4; + } +#endif +#if defined(BLIS_FAMILY_INTEL64) || defined(BLIS_FAMILY_SKX) || defined(BLIS_FAMILY_HASWELL) + else if (strcmp(str, "avx512") == 0) + { + r_val = BLIS_ARCH_SKX; + } +#endif +#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) || defined(BLIS_FAMILY_ZEN4) ||defined(BLIS_FAMILY_ZEN3) + else if (strcmp(str, "avx2") == 0) + { + r_val = BLIS_ARCH_ZEN3; + } +#endif +#if defined(BLIS_FAMILY_ZEN2) + else if (strcmp(str, "avx2") == 0) + { + r_val = BLIS_ARCH_ZEN2; + } +#endif +#if defined(BLIS_FAMILY_ZEN) + else if (strcmp(str, "avx2") == 0) + { + r_val = BLIS_ARCH_ZEN; + } +#endif +#if defined(BLIS_FAMILY_INTEL64) || defined(BLIS_FAMILY_SKX) || defined(BLIS_FAMILY_HASWELL) + else if (strcmp(str, "avx2") == 0) + { + r_val = BLIS_ARCH_HASWELL; + } +#endif // ARM else if (strcmp(str, "thunderx2") == 0) { From b3391ef5da79150b2657d48c693f77321b98c384 Mon Sep 17 00:00:00 2001 From: Nallani Bhaskar Date: Sun, 29 Oct 2023 22:15:21 +0530 Subject: [PATCH 173/226] Updated ERF threshold and packa changes in bf16 Description: 1. Updated ERF function threshold from 3.91920590400 to 3.553 to match with the reference erf float implementation which reduced errors at the borders and also clipped the output to 1.0 2. Updated packa function call with pack function ptr in bf16 api to avoid compilation issues for non avx512bf16 archs 3.
Updated lpgemm bench [AMD-Internal: SWLCSG-2423 ] Change-Id: Id432c0669521285e6e6a151739d9a72a7340381d --- addon/aocl_gemm/config/lpgemm_func_map.h | 2 +- addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 4 ++-- addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c | 2 +- addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h | 2 +- bench/bench_aocl_gemm/bench_lpgemm.c | 1 + kernels/zen/lpgemm/math_utils_avx2.h | 6 ++++-- kernels/zen4/lpgemm/math_utils_avx512.h | 4 +++- 7 files changed, 13 insertions(+), 8 deletions(-) diff --git a/addon/aocl_gemm/config/lpgemm_func_map.h b/addon/aocl_gemm/config/lpgemm_func_map.h index c54a6e28a0..ab2e153b91 100644 --- a/addon/aocl_gemm/config/lpgemm_func_map.h +++ b/addon/aocl_gemm/config/lpgemm_func_map.h @@ -140,7 +140,7 @@ #define LPGEMM_PACKA_FUNC_MAP_AVX2 \ PAMACRO(U8S8S16OS16, NULL) \ PAMACRO(U8S8S32OS32, NULL) \ - PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \ + PAMACRO(BF16BF16F32OF32, NULL) \ PAMACRO(S8S8S32OS32, NULL) \ PAMACRO(S8S8S16OS16, NULL) \ diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index 2c9e188ea1..f781e70daf 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -261,7 +261,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) if ( ( jc_packb_end > jc_packb_start ) && ( jc_packb_start < ( jc + nc0 ) ) ) { - ( ( packb_bf16 )lcntx->packb_fun_ptr ) + ( ( pack_bf16 )lcntx->packb_fun_ptr ) ( pack_b_buffer_bf16 + ( jc_packb_start * kc0_updated ), ( b + ( rs_b * pc ) + ( cs_b * jc ) + @@ -338,7 +338,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32) pack_a_buffer_bf16 = ( bfloat16* ) bli_mem_buffer( &mem_a ); - ( packa_mr16_bf16bf16f32of32) + ( ( pack_bf16 )lcntx->packa_fun_ptr ) ( pack_a_buffer_bf16, ( a + ( rs_a * ic ) + ( cs_a * pc )), rs_a, cs_a, diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c 
b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c index 8774f2ea95..40dfa051bd 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c @@ -149,7 +149,7 @@ void reorderb_nr64_bf16bf16f32of32 // st = ( jc_cur_loop * k ) // + ( n_sub_updated * pc ) // + ( NC' * kc0_updated) - ( ( packb_bf16 )lcntx->packb_fun_ptr ) + ( ( pack_bf16 )lcntx->packb_fun_ptr ) ( ( ( bfloat16* )b_reorder->storage.aligned_buffer ) + ( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) + diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h index cec9195f61..92f53f36ab 100644 --- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h +++ b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h @@ -47,7 +47,7 @@ BLIS_INLINE dim_t get_packb_bf16bf16f32of32_min_NR() return 16; } -typedef void (*packb_bf16) +typedef void (*pack_bf16) ( bfloat16*, const bfloat16*, diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index f20c819c1f..662f306122 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -1157,6 +1157,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \ { \ alpha = 2; \ beta = 9; \ + n_repeats = 1; \ } \ \ aocl_post_op* post_op = NULL; \ diff --git a/kernels/zen/lpgemm/math_utils_avx2.h b/kernels/zen/lpgemm/math_utils_avx2.h index bdd1dec86a..5f503fa3e7 100644 --- a/kernels/zen/lpgemm/math_utils_avx2.h +++ b/kernels/zen/lpgemm/math_utils_avx2.h @@ -112,7 +112,8 @@ \ POLY_EVAL_HORNER_16_0_AVX2(r,x); \ \ - x = _mm256_blendv_ps (x, _mm256_set1_ps(1), _mm256_cmp_ps (_mm256_set1_ps(3.9192059040069580078125f), r, 1)); \ + x = _mm256_blendv_ps (x, _mm256_set1_ps(1), _mm256_cmp_ps (_mm256_set1_ps(3.553f), r, 1)); \ + x = _mm256_blendv_ps (x, _mm256_set1_ps(1), _mm256_cmp_ps (_mm256_set1_ps(1.0f), x, 1)); \ x_erf = _mm256_or_ps(_mm256_and_ps (x_erf, 
(__m256)_mm256_set1_epi32(~(0x7FFFFFFF))), x); //Trignometric EXP, TANH and ERF functions for SSE @@ -160,7 +161,8 @@ \ POLY_EVAL_HORNER_16_0_SSE(r,x); \ \ - x = _mm_blendv_ps (x, _mm_set1_ps(1), _mm_cmp_ps (_mm_set1_ps(3.9192059040069580078125f), r, 1)); \ + x = _mm_blendv_ps (x, _mm_set1_ps(1), _mm_cmp_ps (_mm_set1_ps(3.553f), r, 1)); \ + x = _mm_blendv_ps (x, _mm_set1_ps(1), _mm_cmp_ps (_mm_set1_ps(1.0f), x, 1)); \ x_erf = _mm_or_ps(_mm_and_ps (x_erf, (__m128)_mm_set1_epi32(~(0x7FFFFFFF))), x); #endif // AOCL_LPGEMM_MATH_UTILS_AVX2_H diff --git a/kernels/zen4/lpgemm/math_utils_avx512.h b/kernels/zen4/lpgemm/math_utils_avx512.h index 6221827c75..dddfd58825 100644 --- a/kernels/zen4/lpgemm/math_utils_avx512.h +++ b/kernels/zen4/lpgemm/math_utils_avx512.h @@ -113,7 +113,9 @@ POLY_EVAL_HORNER_16_0_AVX512(r,x); \ \ x = (__m512)_mm512_mask_xor_epi32 ((__m512i)_mm512_set1_ps(1), _mm512_cmpnle_ps_mask \ - ( _mm512_set1_ps(3.9192059040069580078125f), r), (__m512i)x, _mm512_set1_epi32(0)); \ + ( _mm512_set1_ps(3.553f), r), (__m512i)x, _mm512_set1_epi32(0)); \ + x = (__m512)_mm512_mask_xor_epi32 ((__m512i)_mm512_set1_ps(1), _mm512_cmpnle_ps_mask \ + ( _mm512_set1_ps(1.0f), x), (__m512i)x, _mm512_set1_epi32(0)); \ x_erf = (__m512)_mm512_or_epi32(_mm512_and_epi32 ((__m512i)x_erf, _mm512_set1_epi32(~(0x7FFFFFFF))), (__m512i)x); #endif // AOCL_LPGEMM_MATH_UTILS_AVX512_H From d1844678f4cb31ddf721953095b4eb6a0861e4da Mon Sep 17 00:00:00 2001 From: mkadavil Date: Tue, 31 Oct 2023 07:50:00 +0530 Subject: [PATCH 174/226] LPGEMM 8s8s16ou8 fixes for incorrect zero point addition. -The zero point data type is different based on the downscale data type. For int8_t downscale type, zero point type is int8_t whereas for uint8_t downscale type, it is uint8_t. During downscale post-op, the micro-kernels upscale the zero point from its data type (int8_t or uint8_t) to that of the accumulation data type and then perform the zero point addition.
The accumulated output is then stored as downscaled type in a later storage phase. For the 8s8s16 micro-kernels, the upscaling to int16_t (accumulation type) is always performed assuming the zero point is int8_t using the _mm256_cvtepi8_epi16 instruction. However this will result in incorrect upscaled zero point values if the downscale type is uint8_t and the associated zero point type is also uint8_t. This issue is corrected by switching between the correct upscale instruction based on the zero point type. AMD-Internal: [SWLCSG-2500] Change-Id: I92eed4aed686c447d29312836b9e551d6dd4b076 --- .../s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c | 21 +++- .../s8s8s16/lpgemm_s8_m_fringe_amd256.c | 69 +++++++++-- .../s8s8s16/lpgemm_s8_mn_fringe_amd256.c | 111 ++++++++++++++---- .../s8s8s16/lpgemm_s8_n_fringe_amd256.c | 37 ++++-- .../u8s8s16/lpgemm_6x32rowmajor_amd256.c | 21 +++- .../lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c | 69 +++++++++-- .../lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c | 111 ++++++++++++++---- .../lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c | 37 ++++-- .../lpgemm/u8s8s16/lpgemm_s16_kern_macros.h | 2 +- 9 files changed, 399 insertions(+), 79 deletions(-) diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c index 10693f5f53..7893af7437 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c @@ -776,10 +776,19 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). 
- __m128i zero_point_0 = + __m128i _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + __m256i zero_point_0 = _mm256_setzero_si256(); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -798,10 +807,18 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 6 rows. CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c index e6825cb2eb..af112831d9 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c @@ -521,7 +521,8 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -535,10 +536,18 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). 
- zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -555,10 +564,18 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) @@ -930,7 +947,8 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -944,10 +962,18 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). 
- zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -962,10 +988,18 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) @@ -1229,7 +1263,8 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1243,10 +1278,18 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). 
- zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1260,10 +1303,18 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c index a9a3e56eb5..77f3553f67 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c @@ -384,7 +384,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -398,10 +399,18 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). 
- zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -816,7 +825,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -828,11 +838,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1135,7 +1158,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1149,10 +1173,18 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 2 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1439,7 +1471,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -1451,11 +1484,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i 
const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1684,7 +1730,8 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1698,10 +1745,18 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 2 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1927,7 +1982,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -1939,11 +1995,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 2 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c index e904613d8e..3b6d21bb37 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c @@ -505,7 +505,8 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -519,10 +520,18 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1127,7 +1136,8 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -1139,11 +1149,24 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c index 286d6422b7..ba577f3b25 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c @@ -752,10 +752,19 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) post_ops_attr.post_op_c_j + ( 1 * 8 ) ); // Load zero points (2 byte values). - __m128i zero_point_0 = + __m128i _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + __m256i zero_point_0 = _mm256_setzero_si256(); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -774,10 +783,18 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c index be65426b31..0fb7a297bb 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c @@ -502,7 +502,8 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -516,10 +517,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -536,10 +545,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 4 rows. 
CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) @@ -914,7 +931,8 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -928,10 +946,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -946,10 +972,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 4 rows. 
CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) @@ -1212,7 +1246,8 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1226,10 +1261,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1243,10 +1286,18 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) (float *)post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + (3 * 8)); - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale next 16 columns of the 4 rows. 
CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c index dc5108386b..a9a8925eaa 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c @@ -360,7 +360,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -374,10 +375,18 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 4 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -789,7 +798,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -801,11 +811,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1113,7 +1136,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1127,10 +1151,18 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 2 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1417,7 +1449,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -1429,11 +1462,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* 
)post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1666,7 +1712,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -1680,10 +1727,18 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 2 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1913,7 +1968,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -1925,11 +1981,24 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 2 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c index c4182324d8..a6a167f95b 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c @@ -472,7 +472,8 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; /* Load the scale vector values into the register*/ @@ -486,10 +487,18 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) post_ops_attr.post_op_c_j + (1 * 8)); // Load zero points (2 byte values). - zero_point_0 = + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )( ( int8_t* )post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); + if ( post_ops_attr.c_stor_type == S8 ) + { + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) @@ -1087,7 +1096,8 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) __m256i temp_32[2]; __m256 temp_float[2]; __m256 scale_1, scale_2; - __m128i zero_point_0; + __m128i _zero_point_0; + __m256i zero_point_0 = _mm256_setzero_si256(); __m256 res_1, res_2; float float_buf[16]; @@ -1099,11 +1109,24 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) scale_1 = _mm256_loadu_ps(float_buf + (0 * 8)); scale_2 = _mm256_loadu_ps(float_buf + (1 * 8)); - int8_t zero_point_buf[16]; + if ( post_ops_attr.c_stor_type == S8 ) + { + int8_t zero_point_buf[16]; + + memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepi8_epi16( _zero_point_0 ); + } + else if ( post_ops_attr.c_stor_type == U8 ) + { + uint8_t zero_point_buf[16]; - memcpy( zero_point_buf, ( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( int8_t ) ) ); - zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + memcpy( zero_point_buf, ( ( uint8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( uint8_t ) ) ); + _zero_point_0 = _mm_loadu_si128( ( __m128i const* )zero_point_buf ); + zero_point_0 = _mm256_cvtepu8_epi16( _zero_point_0 ); + } // Scale first 16 columns of the 6 rows. 
CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2, zero_point_0) diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h index cb04b2c8c3..e2b0ebd86d 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h @@ -170,7 +170,7 @@ reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \ \ /* Zero point addition.*/ \ - reg = _mm256_add_epi16( reg, _mm256_cvtepi8_epi16( zero_point_0 ) ); \ + reg = _mm256_add_epi16( reg, zero_point_0 ); \ // Downscale store macro helper #define CVT_STORE_S16_SU8_HELPER(reg, m_ind, n_ind, C_type) \ From 106342f402b11feee051667d71333f2608911eea Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Tue, 31 Oct 2023 16:50:32 +0530 Subject: [PATCH 175/226] ZGEMV optimization for special cases in beta - Avoiding scaling of y vector by beta when beta is 1. AMD-Internal: [CPUPL-3829] Change-Id: I9cf46f44c5f1c2da3653937ff035594b4046b4a1 --- frame/2/gemv/bli_gemv_unf_var2_amd.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/frame/2/gemv/bli_gemv_unf_var2_amd.c b/frame/2/gemv/bli_gemv_unf_var2_amd.c index 554c2531fd..1d9bd0deef 100644 --- a/frame/2/gemv/bli_gemv_unf_var2_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c @@ -814,15 +814,21 @@ void bli_zgemv_unf_var2 } else { - // Invoke the ZSCALV function using the function pointer - scalv_kr_ptr - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y_buf, buf_incy, - cntx - ); + /* + Invoke the ZSCALV function using the function + pointer only when beta is not 1. 
+ */ + if(!PASTEMAC(z, eq1)(*beta)) + { + scalv_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y_buf, buf_incy, + cntx + ); + } } // If alpha is zero(0), we only need to scalv y and return From ef545b928ee5951d01515128643d3f5bd5ca13cf Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Wed, 11 Oct 2023 14:33:21 +0530 Subject: [PATCH 176/226] Bugfix : Changing fuse factor for the call to vectorized SAXPYF kernel - The call to the bli_saxpyf_zen_int_6( ... ) is explicitly present in the bli_gemv_unf_var2_amd.c file, as part of the bli_sgemv_unf_var2( ... ) function. This was changed to bli_saxpyf_zen_int_5( ... )( thereby changing the fuse factor from 6 to 5 ), in accordance to the function pointer present in the zen3 and zen4 context files. - Changed the accumulator type to double from float, inside the fringe loop for unit-strides(vectorized path) and non-unit strides (scalar code). AMD-Internal: [CPUPL-4028] Change-Id: Iab1a0318f461cba9a7041093c6865ae8396d231e --- frame/2/gemv/bli_gemv_unf_var2_amd.c | 4 ++-- kernels/zen/1f/bli_axpyf_zen_int_6.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/frame/2/gemv/bli_gemv_unf_var2_amd.c b/frame/2/gemv/bli_gemv_unf_var2_amd.c index 1d9bd0deef..a0f5054195 100644 --- a/frame/2/gemv/bli_gemv_unf_var2_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c @@ -617,7 +617,7 @@ void bli_sgemv_unf_var2 } /* Query the context for the kernel function pointer and fusing factor. */ - b_fuse = 6; + b_fuse = 5; for ( i = 0; i < n_iter; i += f ) { @@ -628,7 +628,7 @@ void bli_sgemv_unf_var2 y1 = y + (0 )*incy; /* y = y + alpha * A1 * x1; */ - bli_saxpyf_zen_int_6 + bli_saxpyf_zen_int_5 ( conja, conjx, diff --git a/kernels/zen/1f/bli_axpyf_zen_int_6.c b/kernels/zen/1f/bli_axpyf_zen_int_6.c index 6da5d99e6d..27cf3b7d89 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_6.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_6.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -185,7 +185,7 @@ void bli_saxpyf_zen_int_6 // If there are leftover iterations, perform them with scalar code. for ( ; (i + 0) < m ; ++i ) { - float y0c = *y0; + double y0c = *y0; const float a0c = *a0; const float a1c = *(a0+ 1*lda); @@ -211,7 +211,7 @@ void bli_saxpyf_zen_int_6 { for ( i = 0; (i + 0) < m ; ++i ) { - float y0c = *y0; + double y0c = *y0; const float a0c = *a0; const float a1c = *(a0+ 1*lda); const float a2c = *(a0+ 2*lda); From f8f4343b5572f5c36caf265619b28ee9272efcf5 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Tue, 31 Oct 2023 01:57:01 +0530 Subject: [PATCH 177/226] Updated cntx with packA function pointer for AVX512_VNNI support Details: - Modified bench to support testing for sizes where matrix strides are larger than the corresponding dimensions. - Modified early-return checks in all interface APIs to check validity of strides in relation to the corresponding dimension rather than checking if strides are equal to dimensions. 
Change-Id: I382529b636a4acc75f6d93d997af22a168a7bfc4 --- addon/aocl_gemm/aocl_gemm_s8s8s16os16.c | 14 ++- addon/aocl_gemm/aocl_gemm_s8s8s16os8.c | 14 ++- addon/aocl_gemm/aocl_gemm_s8s8s32os32.c | 14 ++- addon/aocl_gemm/aocl_gemm_s8s8s32os8.c | 14 ++- addon/aocl_gemm/aocl_gemm_u8s8s16os16.c | 14 ++- addon/aocl_gemm/aocl_gemm_u8s8s16os8.c | 14 ++- addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c | 14 ++- addon/aocl_gemm/aocl_gemm_u8s8s32os32.c | 14 ++- addon/aocl_gemm/aocl_gemm_u8s8s32os8.c | 14 ++- addon/aocl_gemm/config/lpgemm_func_map.h | 2 +- bench/bench_aocl_gemm/bench_lpgemm.c | 103 +++++++++++++++++------ 11 files changed, 187 insertions(+), 44 deletions(-) diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c index f8f22c215d..2d4186305a 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) return; // Only row major supported. } - // Row major input expected with leading dimensions equal to row stride. - if ((lda != k) || (ldb != n) || (ldc != n)) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c index f1a640bcd7..6afd6bdd91 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) return; // Only row major supported. 
} - // Row major input expected with leading dimensions equal to row stride. - if ((lda != k) || (ldb != n) || (ldc != n)) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c index 9f4f565974..fa9a58ab2a 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) return; // Only row major supported. } - // Row major input expected with leading dimensions equal to row stride. - if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) ) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c index e3562170e3..90c54e2d3e 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) return; // Only row major supported. 
} - // Row major input expected with leading dimensions equal to row stride. - if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) ) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c index 970200cf9e..c31ba7a855 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) return; // Only row major supported. } - // Row major input expected with leading dimensions equal to row stride. - if ((lda != k) || (ldb != n) || (ldc != n)) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c index f40a558b5d..5869eb79c2 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) return; // Only row major supported. 
} - // Row major input expected with leading dimensions equal to row stride. - if ((lda != k) || (ldb != n) || (ldc != n)) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c index 80b1619ce1..325160ffac 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) return; // Only row major supported. } - // Row major input expected with leading dimensions equal to row stride. - if ((lda != k) || (ldb != n) || (ldc != n)) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c index 1f44770ec8..95291c1aef 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) return; // Only row major supported. 
} - // Row major input expected with leading dimensions equal to row stride. - if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) ) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. } diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c index dad9c56ab9..10f8208808 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c @@ -88,8 +88,18 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) return; // Only row major supported. } - // Row major input expected with leading dimensions equal to row stride. - if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) ) + bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); + bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); + + // Row major input expected with leading dimensions >= row stride. + if ( ( is_row_major == TRUE ) && + ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) + { + return; // Error. + } + // Column major input expected with leading dimensions >= column stride. + else if ( ( is_column_major == TRUE ) && + ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) { return; // Error. 
} diff --git a/addon/aocl_gemm/config/lpgemm_func_map.h b/addon/aocl_gemm/config/lpgemm_func_map.h index ab2e153b91..875a211985 100644 --- a/addon/aocl_gemm/config/lpgemm_func_map.h +++ b/addon/aocl_gemm/config/lpgemm_func_map.h @@ -56,7 +56,7 @@ #define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI_BF16 \ PAMACRO(U8S8S16OS16, NULL) \ PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \ - PAMACRO(BF16BF16F32OF32, NULL) \ + PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \ PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \ PAMACRO(S8S8S16OS16, NULL) \ diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 662f306122..5b6fddb0f4 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -317,7 +317,8 @@ void mat_mul_bench_driver_ ## BLAS_SFX \ { \ if ( bench_mode == 'a' ) \ { \ - GEN_FUNC_NAME(fill_array_,C_type)( c, ( m * n ) ); \ + int32_t size_C = ( ( stor_order == 'r') || ( stor_order == 'R' ) )? m * ldc : n * ldc; \ + GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ } \ \ struct timespec tstart={0,0}, tend={0,0}; \ @@ -1106,6 +1107,21 @@ void mat_mul_bench_main_ ## BLAS_SFX \ char* post_ops_str \ ) \ { \ + if( ( stor_order != 'r' ) && ( stor_order != 'R' ) ) \ + { \ + printf("The stor_order(1st arg in input.txt) is not valid\n"); \ + return; \ + } \ + if( ( transa != 'n' ) && ( transa != 'N' ) ) \ + { \ + printf("The transa(2nd arg in input.txt) is not valid\n"); \ + return; \ + } \ + if( ( transb != 'n' ) && ( transb != 'N' ) ) \ + { \ + printf("The transb (3rd arg in input.txt) is not valid\n"); \ + return; \ + } \ /* Reorder and pack of A matrix is not supported */ \ if( ( op_a != 'N' ) && ( op_a != 'n' ) ) \ { \ @@ -1125,25 +1141,31 @@ void mat_mul_bench_main_ ## BLAS_SFX \ n_repeats = global_n_repeat; \ } \ \ + /* sizes are hardcoded since all datatypes other than bf16 only support + row major and no-transpose cases. 
In future, when we support transpose support + for all datatypes, these needs to be modified. */ \ + int32_t size_A = m * stride_a; \ + int32_t size_B = k * stride_b; \ + int32_t size_C = m * stride_c; \ /* Get 64 byte aligned memory.*/ \ err_t bli_errors = BLIS_SUCCESS; \ - A_type* a = ( A_type* ) bli_malloc_user( sizeof( A_type ) * m * k, &bli_errors ); \ + A_type* a = ( A_type* ) bli_malloc_user( sizeof( A_type ) * size_A, &bli_errors ); \ \ - B_type* b = ( B_type* ) bli_malloc_user( sizeof( B_type ) * n * k, &bli_errors ); \ + B_type* b = ( B_type* ) bli_malloc_user( sizeof( B_type ) * size_B, &bli_errors ); \ \ - C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ - memset( ( void* ) c, 0, sizeof( C_type ) * m * n ); \ + C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ + memset( ( void* ) c, 0, sizeof( C_type ) * size_C ); \ \ - C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ - memset( ( void* ) c_ref, 0, sizeof( C_type ) * m * n ); \ + C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ + memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ \ - GEN_FUNC_NAME(fill_array_,A_type)( a, ( m * k ) ); \ - GEN_FUNC_NAME(fill_array_,B_type)( b, ( k * n ) ); \ + GEN_FUNC_NAME(fill_array_,A_type)( a, ( size_A ) ); \ + GEN_FUNC_NAME(fill_array_,B_type)( b, ( size_B ) ); \ \ if ( bench_mode == 'a' ) \ { \ - GEN_FUNC_NAME(fill_array_,C_type)( c, ( m * n ) ); \ - GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( m * n ) ); \ + GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ + GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( size_C ) ); \ } \ \ C_type alpha = 0; \ @@ -1274,10 +1296,26 @@ void mat_mul_bench_main_ ## BLAS_SFX \ char* post_ops_str \ ) \ { \ + if( ( stor_order != 'r' ) && ( stor_order != 'R' ) && ( stor_order != 'c' ) && ( stor_order != 'C' ) ) \ + { \ + printf("The stor_order(1st arg in input.txt) is not valid\n"); \ 
+ return; \ + } \ + if( ( transa != 'n' ) && ( transa != 'N' ) && ( transa != 't' ) && (transa != 'T' ) ) \ + { \ + printf("The transa ( 2nd arg in input.txt) is not valid\n"); \ + return; \ + } \ + if( ( transb != 'n' ) && ( transb != 'N' ) && ( transb != 't' ) && (transb != 'T' ) ) \ + { \ + printf("The transb ( 3nd arg in input.txt) is not valid\n"); \ + return; \ + } \ /* Reorder is not supported for A matrix*/ \ if( ( op_a != 'p' ) && ( op_a != 'P' ) && ( op_a != 'n' ) && ( op_a != 'N' ) ) \ { \ printf("The op_a (4th arg in input.txt) is not valid\n"); \ + return; \ } \ if ( ( op_b != 'p' ) && ( op_b != 'P' ) && ( op_b != 'r' ) && ( op_b != 'R' ) && ( op_b != 'N' ) && ( op_b != 'n' ) ) \ { \ @@ -1291,35 +1329,50 @@ void mat_mul_bench_main_ ## BLAS_SFX \ n_repeats = global_n_repeat; \ } \ \ + int32_t size_A = 0; \ + int32_t size_B = 0; \ + int32_t size_C = 0; \ + if( ( stor_order == 'r' ) || ( stor_order == 'R' ) ) \ + { \ + size_A = ( ( transa == 'n' ) || ( transa == 'N' ) ) ? m * stride_a : k * stride_a; \ + size_B = ( ( transb == 'n' ) || ( transb == 'N' ) ) ? k * stride_b : n * stride_b; \ + size_C = m * stride_c; \ + } \ + else \ + { \ + size_A = ( ( transa == 'n' ) || ( transa == 'N' ) ) ? k * stride_a : m * stride_a; \ + size_B = ( ( transb == 'n' ) || ( transb == 'N' ) ) ? 
n * stride_b : k * stride_b; \ + size_C = n * stride_c; \ + } \ err_t bli_errors = BLIS_SUCCESS; \ /* Get 64 byte aligned memory.*/ \ - bfloat16* a = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * m * k, &bli_errors ); \ - float *a_float = bli_malloc_user( m * k * sizeof( float ), &bli_errors); \ - for ( int32_t i = 0; i < m*k; ++i ) \ + bfloat16* a = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * size_A, &bli_errors ); \ + float *a_float = bli_malloc_user( size_A * sizeof( float ), &bli_errors); \ + for ( int32_t i = 0; i < size_A; ++i ) \ { \ a_float[i] = ( float ) ( i % 5 ); \ } \ \ - convert_float_arr_to_bf16( a_float, a, m * k ); \ + convert_float_arr_to_bf16( a_float, a, size_A ); \ \ - bfloat16* b = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * n * k, &bli_errors ); \ - float *b_float = bli_malloc_user( k * n * sizeof( float ), &bli_errors); \ - for ( int32_t i = 0; i < k*n; ++i ) \ + bfloat16* b = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * size_B, &bli_errors ); \ + float *b_float = bli_malloc_user( size_B * sizeof( float ), &bli_errors); \ + for ( int32_t i = 0; i < size_B; ++i ) \ { \ b_float[i] = ( float ) ( i % 5 );\ } \ - convert_float_arr_to_bf16( b_float, b, k * n ); \ + convert_float_arr_to_bf16( b_float, b, size_B ); \ \ - C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ - memset( ( void* ) c, 0, sizeof( C_type ) * m * n ); \ + C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ + memset( ( void* ) c, 0, sizeof( C_type ) * size_C ); \ \ - C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n, &bli_errors ); \ - memset( ( void* ) c_ref, 0, sizeof( C_type ) * m * n ); \ + C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ + memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ \ if ( bench_mode == 'a' ) \ { \ - GEN_FUNC_NAME(fill_array_,C_type)( c, ( m * n ) ); \ - GEN_FUNC_NAME(fill_array_,C_type)( 
c_ref, ( m * n ) ); \ + GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ + GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( size_C ) ); \ } \ \ float alpha = 0.0f; \ From d8b8f68066e4a1b9f250e794a99b9a2944f97f5e Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Tue, 31 Oct 2023 09:20:36 -0400 Subject: [PATCH 178/226] Improvements to xerbla functionality (2) Improvements to functionality introduced in commit 6d0444497f: - Call to bli_init_auto() before calling PASTEBLACHK macro in gemv caused significant runtime overhead. Initialize stored info_value directly. - Add similar code in frame/compat/f2c routines. AMD-Internal: [CPUPL-3520] Change-Id: I2df201aed7dbceb4cbe66d6c81b5a03e8092de89 --- frame/compat/bla_gemv_amd.c | 23 +++++++++++++++-------- frame/compat/f2c/bla_gbmv.c | 24 ++++++++++++++++++++++++ frame/compat/f2c/bla_hbmv.c | 14 ++++++++++++++ frame/compat/f2c/bla_hpmv.c | 14 ++++++++++++++ frame/compat/f2c/bla_hpr.c | 14 ++++++++++++++ frame/compat/f2c/bla_hpr2.c | 14 ++++++++++++++ frame/compat/f2c/bla_sbmv.c | 14 ++++++++++++++ frame/compat/f2c/bla_spmv.c | 14 ++++++++++++++ frame/compat/f2c/bla_spr.c | 14 ++++++++++++++ frame/compat/f2c/bla_spr2.c | 14 ++++++++++++++ frame/compat/f2c/bla_tbmv.c | 24 ++++++++++++++++++++++++ frame/compat/f2c/bla_tbsv.c | 24 ++++++++++++++++++++++++ frame/compat/f2c/bla_tpmv.c | 24 ++++++++++++++++++++++++ frame/compat/f2c/bla_tpsv.c | 24 ++++++++++++++++++++++++ 14 files changed, 247 insertions(+), 8 deletions(-) diff --git a/frame/compat/bla_gemv_amd.c b/frame/compat/bla_gemv_amd.c index 4077711e57..b3a6112af2 100644 --- a/frame/compat/bla_gemv_amd.c +++ b/frame/compat/bla_gemv_amd.c @@ -35,6 +35,9 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; // // Define BLAS-to-BLIS interfaces. 
@@ -187,8 +190,9 @@ void dgemv_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - /* Initialize BLIS. */ - bli_init_auto(); + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) @@ -391,8 +395,9 @@ void sgemv_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - /* Initialize BLIS. */ - bli_init_auto(); + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) @@ -590,8 +595,9 @@ void cgemv_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - /* Initialize BLIS. */ - bli_init_auto(); + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) @@ -831,8 +837,9 @@ void zgemv_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - /* Initialize BLIS. */ - bli_init_auto(); + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); /* Perform BLAS parameter checking. */ PASTEBLACHK(gemv) diff --git a/frame/compat/f2c/bla_gbmv.c b/frame/compat/f2c/bla_gbmv.c index d6c4076fd8..0ff036f421 100644 --- a/frame/compat/f2c/bla_gbmv.c +++ b/frame/compat/f2c/bla_gbmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. 
+// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* cgbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -202,6 +206,11 @@ int PASTEF77S(c,gbmv)(const bla_character *trans, const bla_integer *m, const bl --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1) @@ -635,6 +644,11 @@ int PASTEF77S(d,gbmv)(const bla_character *trans, const bla_integer *m, const bl --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1) @@ -992,6 +1006,11 @@ int PASTEF77S(s,gbmv)(const bla_character *trans, const bla_integer *m, const bl --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1) @@ -1358,6 +1377,11 @@ int PASTEF77S(z,gbmv)(const bla_character *trans, const bla_integer *m, const bl --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", ( ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1) diff --git a/frame/compat/f2c/bla_hbmv.c b/frame/compat/f2c/bla_hbmv.c index f07e80f394..526314e0e7 100644 --- a/frame/compat/f2c/bla_hbmv.c +++ b/frame/compat/f2c/bla_hbmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* chbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -203,6 +207,11 @@ int PASTEF77S(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -650,6 +659,11 @@ int PASTEF77S(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_hpmv.c b/frame/compat/f2c/bla_hpmv.c index 9743aaf835..97384b173a 100644 --- a/frame/compat/f2c/bla_hpmv.c +++ b/frame/compat/f2c/bla_hpmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* chpmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -167,6 +171,11 @@ int PASTEF77S(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -566,6 +575,11 @@ int PASTEF77S(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_hpr.c b/frame/compat/f2c/bla_hpr.c index f3f591a8cc..85e743c5e3 100644 --- a/frame/compat/f2c/bla_hpr.c +++ b/frame/compat/f2c/bla_hpr.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* chpr.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -154,6 +158,11 @@ int PASTEF77S(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_ --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -467,6 +476,11 @@ int PASTEF77S(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_ --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_hpr2.c b/frame/compat/f2c/bla_hpr2.c index 75d0c54169..2e7c6f17ac 100644 --- a/frame/compat/f2c/bla_hpr2.c +++ b/frame/compat/f2c/bla_hpr2.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* chpr2.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -166,6 +170,11 @@ int PASTEF77S(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -555,6 +564,11 @@ int PASTEF77S(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_sbmv.c b/frame/compat/f2c/bla_sbmv.c index c30c976b2d..5abd25ebe3 100644 --- a/frame/compat/f2c/bla_sbmv.c +++ b/frame/compat/f2c/bla_sbmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* dsbmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -196,6 +200,11 @@ int PASTEF77S(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -548,6 +557,11 @@ int PASTEF77S(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla --y; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_spmv.c b/frame/compat/f2c/bla_spmv.c index 64cb020828..90cdd2f515 100644 --- a/frame/compat/f2c/bla_spmv.c +++ b/frame/compat/f2c/bla_spmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* dspmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -159,6 +163,11 @@ int PASTEF77S(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -461,6 +470,11 @@ int PASTEF77S(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_spr.c b/frame/compat/f2c/bla_spr.c index fd99de5508..3cf360ae1f 100644 --- a/frame/compat/f2c/bla_spr.c +++ b/frame/compat/f2c/bla_spr.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* dspr.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -145,6 +149,11 @@ int PASTEF77S(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_ --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -373,6 +382,11 @@ int PASTEF77S(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_ --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_spr2.c b/frame/compat/f2c/bla_spr2.c index a67ef4800d..22a88955d5 100644 --- a/frame/compat/f2c/bla_spr2.c +++ b/frame/compat/f2c/bla_spr2.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* dspr2.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -157,6 +161,11 @@ int PASTEF77S(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -417,6 +426,11 @@ int PASTEF77S(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_tbmv.c b/frame/compat/f2c/bla_tbmv.c index 6c0454c9a8..49feae27d1 100644 --- a/frame/compat/f2c/bla_tbmv.c +++ b/frame/compat/f2c/bla_tbmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* ctbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -208,6 +212,11 @@ int PASTEF77S(c,tbmv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -775,6 +784,11 @@ int PASTEF77S(d,tbmv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1187,6 +1201,11 @@ int PASTEF77S(s,tbmv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1603,6 +1622,11 @@ int PASTEF77S(z,tbmv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_tbsv.c b/frame/compat/f2c/bla_tbsv.c index f7e9e804bd..101adbe38e 100644 --- a/frame/compat/f2c/bla_tbsv.c +++ b/frame/compat/f2c/bla_tbsv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* ctbsv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -212,6 +216,11 @@ int PASTEF77S(c,tbsv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -789,6 +798,11 @@ int PASTEF77S(d,tbsv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1221,6 +1235,11 @@ int PASTEF77S(s,tbsv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1658,6 +1677,11 @@ int PASTEF77S(z,tbsv)(const bla_character *uplo, const bla_character *trans, con --x; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_tpmv.c b/frame/compat/f2c/bla_tpmv.c index 27ba219c93..aca9f0f4c0 100644 --- a/frame/compat/f2c/bla_tpmv.c +++ b/frame/compat/f2c/bla_tpmv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* ctpmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -166,6 +170,11 @@ int PASTEF77S(c,tpmv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -663,6 +672,11 @@ int PASTEF77S(d,tpmv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1012,6 +1026,11 @@ int PASTEF77S(s,tpmv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1366,6 +1385,11 @@ int PASTEF77S(z,tpmv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { diff --git a/frame/compat/f2c/bla_tpsv.c b/frame/compat/f2c/bla_tpsv.c index 1c7b22271d..1dae05bfd2 100644 --- a/frame/compat/f2c/bla_tpsv.c +++ b/frame/compat/f2c/bla_tpsv.c @@ -35,6 +35,10 @@ #include "blis.h" +// Make thread settings local to each thread calling BLIS routines. +// (The definition resides in bli_rntm.c.) +extern BLIS_THREAD_LOCAL rntm_t tl_rntm; + /* ctpsv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) @@ -169,6 +173,11 @@ int PASTEF77S(c,tpsv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -658,6 +667,11 @@ int PASTEF77S(d,tpsv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1010,6 +1024,11 @@ int PASTEF77S(s,tpsv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { @@ -1368,6 +1387,11 @@ int PASTEF77S(z,tpsv)(const bla_character *uplo, const bla_character *trans, con --ap; /* Function Body */ + + // Initialize info_value to 0 + gint_t info_value = 0; + bli_rntm_set_info_value_only( info_value, &tl_rntm ); + info = 0; if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", ( ftnlen)1, (ftnlen)1)) { From 84faccdd7df25c733fdca3ee7c313cb744448db0 Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Fri, 3 Nov 2023 11:32:54 +0530 Subject: [PATCH 179/226] Enabling the vectorized path for SNRM2_ - Enabled the vectorized AVX-2 code-path for SNRM2_. The framework queries the architecture ID and calls the vectorized kernel based on the architecture support. - In case of not having the architecture support, we use the default path based on the sumsqv method. AMD-Internal: [CPUPL-3277] Change-Id: Ic60c0782dec0b7eb09fac21818eb625e57b1d14f --- frame/util/bli_util_unb_var1.c | 74 ++++++++++--------- .../testsuite/util/nrm2/nrm2_corner_cases.cpp | 2 +- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 9913a94ee6..460ec76f94 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -814,41 +814,47 @@ void bli_snormfv_unb_var1 return; } - /* Disable AVX2 codepath. 
- if( bli_cpuid_is_avx2fma3_supported() == TRUE ) - { - bli_snorm2fv_unb_var1_avx2( n, x, incx, norm, cntx ); - } - else*/ + // Querying the architecture ID to deploy the appropriate kernel + arch_t id = bli_arch_query_id(); + switch ( id ) { - float* zero = bli_s0; - float* one = bli_s1; - float scale; - float sumsq; - float sqrt_sumsq; - - // Initialize scale and sumsq to begin the summation. - bli_sscopys( *zero, scale ); - bli_sscopys( *one, sumsq ); - - // Compute the sum of the squares of the vector. - bli_ssumsqv_unb_var1 - ( - n, - x, - incx, - &scale, - &sumsq, - cntx, - rntm - ); - - // Compute: norm = scale * sqrt( sumsq ) - bli_ssqrt2s( sumsq, sqrt_sumsq ); - bli_sscals( scale, sqrt_sumsq ); - - // Store the final value to the output variable. - bli_scopys( sqrt_sumsq, *norm ); + case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN3: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN: +#ifdef BLIS_KERNELS_ZEN + bli_snorm2fv_unb_var1_avx2( n, x, incx, norm, cntx ); + break; +#endif + default:; + float* zero = bli_s0; + float* one = bli_s1; + float scale; + float sumsq; + float sqrt_sumsq; + + // Initialize scale and sumsq to begin the summation. + bli_sscopys( *zero, scale ); + bli_sscopys( *one, sumsq ); + + // Compute the sum of the squares of the vector. + bli_ssumsqv_unb_var1 + ( + n, + x, + incx, + &scale, + &sumsq, + cntx, + rntm + ); + + // Compute: norm = scale * sqrt( sumsq ) + bli_ssqrt2s( sumsq, sqrt_sumsq ); + bli_sscals( scale, sqrt_sumsq ); + + // Store the final value to the output variable. + bli_scopys( sqrt_sumsq, *norm ); } } diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp index 3134c88897..899fb01025 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp @@ -89,7 +89,7 @@ TYPED_TEST( nrm2_EIC, zero_incx_MT ) { // For incx=0, nrm2 iterates through the first element n-times. 
// So, we initialize x[0] with a different value than the rest // of the elements. - x[0] = T{10.0}*x[0]; + x[0] = T{2.0}*x[0]; RT blis_norm = nrm2(n, x.data(), incx); RT ref_norm = testinghelpers::ref_nrm2(n, x.data(), incx); computediff(blis_norm, ref_norm); From c3d1a3878ca07bcc2660db8d2e87675e5bb9ca8f Mon Sep 17 00:00:00 2001 From: Eashan Dash Date: Tue, 17 Oct 2023 14:48:16 +0530 Subject: [PATCH 180/226] Parallelized Pack and Compute Extension APIs 1. OpenMP based multi-threading parallelism is added for BLAS extension APIs of Pack and Compute 2. Both pack and compute APIs are parallelized. 3. Multi-threading of pack and compute APIs done with different number of threads can lead to inconsistent results due to output difference of the full packed matrix buffer when packed with different number of threads. 4. In multi-threaded execution, we ensure output of packed buffer is exactly the same as in single threaded execution. 5. Similarly for compute API, read of packed buffer in multi-threaded execution is exactly the same as in single-threaded execution. 6. Routines are added to compute the offsets for thread workload distribution for MT execution. 1. The offsets are calculated in such a way that it resembles the reorder buffer traversal in single threaded reordering. 2. The panel boundaries (KCxNC) remain as it is accessed in single thread, and as a consequence a thread with jc_start inside the panel cannot consider NC range for reorder. 3. It has to work with NC' < NC, and the offset is calculated using prev NC panels spanning k dim + cur NC panel spanning pc loop cur iteration + (NC - NC') spanning current kc0 (<= KC). 7. Routines to ensure the same are added for MT execution 1. frame/base/bli_pack_compute_utils.c 2.
frame/base/bli_pack_compute_utils.h AMD-Internal: [CPUPL-3560] Change-Id: I0dad33e0062519de807c32f6071e61fba976d9ac --- bench/bench_gemm_pack_compute.c | 72 ++++++++++++- frame/1m/packm/bli_pack_full.c | 66 ++++++++++-- frame/3/bli_l3_compute.c | 40 +++++--- frame/base/CMakeLists.txt | 3 +- frame/base/bli_pack_compute_utils.c | 112 +++++++++++++++++++++ frame/base/bli_pack_compute_utils.h | 65 ++++++++++++ frame/include/blis.h | 2 +- frame/thread/bli_l3_compute_decor_openmp.c | 4 - frame/thread/bli_l3_compute_decor_single.c | 2 +- frame/thread/bli_pack_full_decor_openmp.c | 6 +- frame/thread/bli_pack_full_decor_single.c | 4 +- 11 files changed, 334 insertions(+), 42 deletions(-) create mode 100644 frame/base/bli_pack_compute_utils.c create mode 100644 frame/base/bli_pack_compute_utils.h diff --git a/bench/bench_gemm_pack_compute.c b/bench/bench_gemm_pack_compute.c index 2394f608b2..e2f218846e 100755 --- a/bench/bench_gemm_pack_compute.c +++ b/bench/bench_gemm_pack_compute.c @@ -278,7 +278,6 @@ int main( int argc, char** argv ) bli_printm( "b", &b, "%4.6f", "" ); bli_printm( "c", &c, "%4.6f", "" ); #endif - dtime = bli_clock(); #ifdef BLIS @@ -374,6 +373,8 @@ int main( int argc, char** argv ) ap, lda, aBuffer ); + dtime = bli_clock(); + cblas_sgemm_compute( cblas_order, CblasPacked, cblas_transb, @@ -385,6 +386,8 @@ int main( int argc, char** argv ) *betap, cp, ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(aBuffer); } else if ( !packA && packB ) @@ -406,6 +409,8 @@ int main( int argc, char** argv ) bp, ldb, bBuffer ); + dtime = bli_clock(); + cblas_sgemm_compute( cblas_order, cblas_transa, CblasPacked, @@ -417,6 +422,9 @@ int main( int argc, char** argv ) *betap, cp, ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + + bli_free_user(bBuffer); } else if ( packA && packB ) @@ -454,6 +462,8 @@ int main( int argc, char** argv ) bp, ldb, bBuffer ); + dtime = bli_clock(); + cblas_sgemm_compute( cblas_order, CblasPacked, 
CblasPacked, @@ -465,12 +475,17 @@ int main( int argc, char** argv ) *betap, cp, ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(aBuffer); bli_free_user(bBuffer); } else { // Neither A nor B is pre-packed. + + dtime = bli_clock(); + cblas_sgemm_compute( cblas_order, cblas_transa, cblas_transb, @@ -481,6 +496,8 @@ int main( int argc, char** argv ) bp, ldb, *betap, cp, ldc ); + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); } #else // -- BLAS API -- float* aBuffer; @@ -505,6 +522,8 @@ int main( int argc, char** argv ) (f77_int*)&lda, aBuffer ); + dtime = bli_clock(); + sgemm_compute_( &f77_packed, &f77_transb, &mm, @@ -515,6 +534,8 @@ int main( int argc, char** argv ) betap, cp, (f77_int*)&ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user( aBuffer ); } else if ( !packA && packB ) @@ -536,6 +557,8 @@ int main( int argc, char** argv ) (f77_int*)&ldb, bBuffer ); + dtime = bli_clock(); + sgemm_compute_( &f77_transa, &f77_packed, &mm, @@ -546,6 +569,8 @@ int main( int argc, char** argv ) betap, cp, (f77_int*)&ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user( bBuffer ); } else if ( packA && packB ) @@ -585,6 +610,8 @@ int main( int argc, char** argv ) (f77_int*)&ldb, bBuffer ); + dtime = bli_clock(); + sgemm_compute_( &f77_packed, &f77_packed, &mm, @@ -595,12 +622,17 @@ int main( int argc, char** argv ) betap, cp, (f77_int*)&ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(aBuffer); bli_free_user(bBuffer); } else { // Neither A nor B is reordered. 
+ + dtime = bli_clock(); + sgemm_compute_( &f77_transa, &f77_transb, &mm, @@ -610,6 +642,8 @@ int main( int argc, char** argv ) bp, (f77_int*)&ldb, betap, cp, (f77_int*)&ldc ); + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); } #endif } @@ -649,6 +683,8 @@ int main( int argc, char** argv ) ap, lda, aBuffer ); + dtime = bli_clock(); + cblas_dgemm_compute( cblas_order, CblasPacked, cblas_transb, @@ -660,6 +696,8 @@ int main( int argc, char** argv ) *betap, cp, ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(aBuffer); } else if ( !packA && packB ) @@ -680,6 +718,8 @@ int main( int argc, char** argv ) bp, ldb, bBuffer ); + dtime = bli_clock(); + cblas_dgemm_compute( cblas_order, cblas_transa, CblasPacked, @@ -691,6 +731,8 @@ int main( int argc, char** argv ) *betap, cp, ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(bBuffer); } else if ( packA && packB ) @@ -728,6 +770,8 @@ int main( int argc, char** argv ) bp, ldb, bBuffer ); + dtime = bli_clock(); + cblas_dgemm_compute( cblas_order, CblasPacked, CblasPacked, @@ -739,12 +783,17 @@ int main( int argc, char** argv ) *betap, cp, ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(aBuffer); bli_free_user(bBuffer); } else { // Neither A nor B is pre-packed. 
+ + dtime = bli_clock(); + cblas_dgemm_compute( cblas_order, cblas_transa, cblas_transb, @@ -755,6 +804,8 @@ int main( int argc, char** argv ) bp, ldb, *betap, cp, ldc ); + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); } #else // -- BLAS API -- @@ -780,6 +831,8 @@ int main( int argc, char** argv ) (f77_int*)&lda, aBuffer ); + dtime = bli_clock(); + dgemm_compute_( &f77_packed, &f77_transb, &mm, @@ -790,6 +843,8 @@ int main( int argc, char** argv ) betap, cp, (f77_int*)&ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user( aBuffer ); } else if ( !packA && packB ) @@ -811,6 +866,8 @@ int main( int argc, char** argv ) (f77_int*)&ldb, bBuffer ); + dtime = bli_clock(); + dgemm_compute_( &f77_transa, &f77_packed, &mm, @@ -821,6 +878,8 @@ int main( int argc, char** argv ) betap, cp, (f77_int*)&ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user( bBuffer ); } else if ( packA && packB ) @@ -858,6 +917,8 @@ int main( int argc, char** argv ) (f77_int*)&ldb, bBuffer ); + dtime = bli_clock(); + dgemm_compute_( &f77_packed, &f77_packed, &mm, @@ -868,12 +929,17 @@ int main( int argc, char** argv ) betap, cp, (f77_int*)&ldc ); + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + bli_free_user(aBuffer); bli_free_user(bBuffer); } else { // Neither A nor B is reordered. 
+ + dtime = bli_clock(); + dgemm_compute_( &f77_transa, &f77_transb, &mm, @@ -883,6 +949,8 @@ int main( int argc, char** argv ) bp, (f77_int*)&ldb, betap, cp, (f77_int*)&ldc ); + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); } #endif } @@ -891,8 +959,6 @@ int main( int argc, char** argv ) #ifdef PRINT bli_printm( "c compute", &c, "%4.6f", "" ); #endif - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); } gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); diff --git a/frame/1m/packm/bli_pack_full.c b/frame/1m/packm/bli_pack_full.c index a6f30b0253..682fd243df 100644 --- a/frame/1m/packm/bli_pack_full.c +++ b/frame/1m/packm/bli_pack_full.c @@ -33,6 +33,7 @@ */ #include "blis.h" +#include "../../base/bli_pack_compute_utils.h" void bli_pack_full_init ( @@ -238,11 +239,6 @@ void PASTEMAC(ch,tfuncname) \ /* Compute the JC loop thread range for the current thread. */ \ dim_t jc_start, jc_end; \ bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ - const dim_t n_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ \ inc_t rs_b_use, cs_b_use, ps_b_use; \ \ @@ -251,12 +247,59 @@ void PASTEMAC(ch,tfuncname) \ for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ { \ /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ + dim_t nc_cur = ( NC <= ( jc_end - jj ) ? NC : ( jc_end - jj ) ); \ +\ + dim_t jc_cur_loop = jj;\ + dim_t jc_cur_loop_rem = 0;\ + dim_t n_sub_updated = 0;\ \ - const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \ + /* This function returns the offsets that are computed for */ \ + /* thread workload distribution in MT execution. 
*/ \ + get_B_panel_reordered_start_offset_width \ + ( \ + jj, n, NC, NR, \ + &jc_cur_loop, &jc_cur_loop_rem, \ + &nc_cur, &n_sub_updated \ + ); \ +\ + /* The offsets are calculated in such a way that it resembles */ \ + /* the reorder buffer traversal in single threaded reordering. */ \ + /* The panel boundaries (KCxNC) remain as it is accessed in */ \ + /* single thread, and as a consequence a thread with jc_start */ \ + /* inside the panel cannot consider NC range for reorder. It */ \ + /* has to work with NC' < NC, and the offset is calulated using */ \ + /* prev NC panels spanning k dim + cur NC panel spaning pc loop */ \ + /* cur iteration + (NC - NC') spanning current kc0 (<= KC). */ \ + /* */ \ + /* Eg: Consider the following reordered buffer diagram: */ \ + /* t1 t2 */ \ + /* | | */ \ + /* | |..NC..| */ \ + /* | | | */ \ + /* |.NC. |.NC. |NC'|NC" */ \ + /* pc=0-+-----+-----+---+--+ */ \ + /* KC| | | | | */ \ + /* | 1 | 3 | 5 | */ \ + /* pc=KC-+-----+-----+---st-+ */ \ + /* KC| | | | | */ \ + /* | 2 | 4 | 6 | 7| */ \ + /* pc=k=2KC-+-----+-----+---+--+ */ \ + /* |jc=0 |jc=NC|jc=2NC| */ \ + /* */ \ + /* The numbers 1,2..6,7 denotes the order in which reordered */ \ + /* KCxNC blocks are stored in memory, ie: block 1 followed by 2 */ \ + /* followed by 3, etc. Given two threads t1 and t2, and t2 needs */ \ + /* to acces point st in the reorder buffer to write the data: */ \ + /* The offset calulation logic will be: */ \ + /* jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC, */ \ + /* n_sub_updated = NC, k = 2KC, kc0_updated = KC */ \ + /* */ \ + /* st = ( jc_cur_loop * k ) */ \ + /* + ( n_sub_updated * pc ) */ \ + /* + ( NC' * kc0_updated) */ \ \ ctype* restrict b_jc = src + jj * jcstep_b; \ - ctype* restrict b_jc_use = dest + jj * jcstep_b_use; \ + ctype* restrict b_jc_use = dest + jc_cur_loop * jcstep_b_use; \ \ /* Compute the PC loop thread range for the current thread. 
*/ \ const dim_t pc_start = 0, pc_end = k; \ @@ -271,10 +314,10 @@ void PASTEMAC(ch,tfuncname) \ for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ { \ /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ + const dim_t kc_cur = ( KC <= ( pc_end - pp ) ? KC : pc_left ); \ \ ctype* restrict b_pc = b_jc + pp * pcstep_b; \ - ctype* restrict b_pc_use = b_jc_use + pp * pcstep_b_use; \ + ctype* restrict b_pc_use = b_jc_use + pp * n_sub_updated + jc_cur_loop_rem * kc_cur; \ \ /* Packing is parallelized only at JC loop */ \ thread_pb = &BLIS_GEMM_SINGLE_THREADED; \ @@ -307,6 +350,9 @@ void PASTEMAC(ch,tfuncname) \ ); \ \ } \ +\ + adjust_B_panel_reordered_jc( &jj, jc_cur_loop ); \ +\ } \ \ } \ diff --git a/frame/3/bli_l3_compute.c b/frame/3/bli_l3_compute.c index c7c48a8f49..e9925e48b7 100644 --- a/frame/3/bli_l3_compute.c +++ b/frame/3/bli_l3_compute.c @@ -33,6 +33,7 @@ */ #include "blis.h" +#include "../base/bli_pack_compute_utils.h" void bli_gemm_compute_init ( @@ -77,9 +78,6 @@ void bli_gemm_compute_init // bli_nthreads_optimum(a, b, c, BLIS_GEMM, rntm ); #endif - // Explicitly set n_threads=1 and update rntm since only ST supported. - dim_t n_threads = 1; - bli_rntm_set_num_threads( n_threads, rntm ); bli_rntm_set_ways_from_rntm_sup ( bli_obj_length( c ), @@ -362,22 +360,32 @@ void PASTEMAC( ch, varname ) \ /* Compute the JC loop thread range for the current thread. */ \ dim_t jc_start, jc_end; \ bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ - const dim_t n_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ \ /* Loop over the n dimension (NC rows/columns at a time). 
*/ \ /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ { \ /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ - const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \ + dim_t nc_cur = ( NC <= ( jc_end - jj ) ? NC : ( jc_end - jj ) ); \ +\ + /* For MT correctness- to ensure full packing order of packed buffer */ \ + /* for Single and Multi Threaded executions are same. */ \ + dim_t jc_cur_loop = jj;\ + dim_t jc_cur_loop_rem = 0;\ + dim_t n_sub_updated = 0;\ +\ + if ( packedb ) \ + { \ + get_B_panel_reordered_start_offset_width \ + ( \ + jj, n, NC, NR, \ + &jc_cur_loop, &jc_cur_loop_rem, \ + &nc_cur, &n_sub_updated \ + ); \ + } \ \ ctype* restrict b_jc = b_00 + jj * jcstep_b; \ - ctype* restrict b_jc_use = b_00 + jj * jcstep_b_use; \ + ctype* restrict b_jc_use = b_00 + jc_cur_loop * jcstep_b_use; \ ctype* restrict c_jc = c_00 + jj * jcstep_c; \ \ /* Grow the thrinfo_t tree. */ \ @@ -398,7 +406,7 @@ void PASTEMAC( ch, varname ) \ for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ { \ /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ + const dim_t kc_cur = ( KC <= ( pc_end - pp ) ? KC : pc_left ); \ const inc_t icstep_a_use = kc_cur; \ \ ctype* restrict a_pc = a_00 + pp * pcstep_a; \ @@ -440,7 +448,7 @@ void PASTEMAC( ch, varname ) \ rs_b_use = NR; \ cs_b_use = 1; \ ps_b_use = kc_cur * NR; \ - b_pc_use = b_jc_use + pp * pcstep_b_use; \ + b_pc_use = b_jc_use + pp * n_sub_updated + jc_cur_loop_rem * kc_cur; \ } else \ { \ PASTEMAC(ch,packm_sup_b) \ @@ -615,6 +623,10 @@ void PASTEMAC( ch, varname ) \ that matrix is packed within the pc loop of this variant). 
*/ \ if ( packb ) bli_thread_barrier( thread_pb ); \ } \ + if ( packedb ) \ + { \ + adjust_B_panel_reordered_jc( &jj, jc_cur_loop ); \ + } \ } \ \ /* Release any memory that was acquired for packing matrices A and B. */ \ @@ -634,4 +646,4 @@ void PASTEMAC( ch, varname ) \ ); \ } -INSERT_GENTFUNC_BASIC0_SD( gemm_compute ) \ No newline at end of file +INSERT_GENTFUNC_BASIC0_SD( gemm_compute ) diff --git a/frame/base/CMakeLists.txt b/frame/base/CMakeLists.txt index a64e57ec45..4e7bd4abdb 100644 --- a/frame/base/CMakeLists.txt +++ b/frame/base/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.## target_sources("${PROJECT_NAME}" PUBLIC @@ -39,6 +39,7 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_setri.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_string.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_winsys.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_compute_utils.c ) #Add all subdirectories diff --git a/frame/base/bli_pack_compute_utils.c b/frame/base/bli_pack_compute_utils.c new file mode 100644 index 0000000000..ce75efc744 --- /dev/null +++ b/frame/base/bli_pack_compute_utils.c @@ -0,0 +1,112 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#include "blis.h" + +// Utility function to compute the offset for K dimension traversal +// such that it is a multiple of NR. +dim_t get_Bpanel_width_for_kdim_traversal + ( + dim_t jc, + dim_t n, + dim_t NC, + dim_t NR + ) +{ + dim_t n_mod_NR = n % NR; + dim_t n_sub_updated = NC; + + if ( ( n % NC ) != 0 ) + { + // Only applicable to final NC part of jc loop where jc + remaining + // elements is less than NC; or when n < NC in which case panel width + // is at most n. 
+ dim_t n_last_loop = ( n / NC ) * NC; + if ( jc >= n_last_loop ) + { + n_sub_updated = n - n_last_loop; + if ( n_mod_NR != 0 ) + { + n_sub_updated += ( NR - n_mod_NR ); + } + } + } + + return n_sub_updated; +} + +void get_B_panel_reordered_start_offset_width + ( + dim_t jc, + dim_t n, + dim_t NC, + dim_t NR, + dim_t* panel_start, + dim_t* panel_offset, + dim_t* panel_width, + dim_t* panel_width_kdim_trav + ) +{ + // Since n dimension is split across threads in units of NR blocks, + // it could happen that B matrix chunk for a thread may be part of + // two separate NCxKC panels. In this case nc0 is updated such that + // the jr loop only accesses the remaining portion of current NCxKC + // panel, with the next jc iteration taking care of the other panel. + // This ensures that jr loop does not cross panel boundaries. + ( *panel_start ) = ( jc / NC ) * NC; + ( *panel_offset ) = jc - ( *panel_start ); + + // Check if jc + current_panel_width (nc0) crosses panel boundaries. + if ( ( jc + ( *panel_width ) ) > ( ( *panel_start ) + NC ) ) + { + ( *panel_width ) = NC - ( *panel_offset ); + } + + ( *panel_width_kdim_trav ) = get_Bpanel_width_for_kdim_traversal + ( + jc, n, NC, NR + ); +} + +void adjust_B_panel_reordered_jc( dim_t* jc, dim_t panel_start ) +{ + // Since n dimension is split across threads in units of NR blocks, + // it could happen that B matrix chunk for a thread may be part of + // two separate NCxKC panels. In this case jc is reset to immediate + // previous panel offset so that in the next iteration, the + // following panel belonging to the B chunk is accessed. This + // ensures that jr loop does not cross panel boundaries. 
+ ( *jc ) = panel_start; +} + + diff --git a/frame/base/bli_pack_compute_utils.h b/frame/base/bli_pack_compute_utils.h new file mode 100644 index 0000000000..56928e45e4 --- /dev/null +++ b/frame/base/bli_pack_compute_utils.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +#include "blis.h" + +#ifndef PACK_COMPUTE_UTILS_H +#define PACK_COMPUTE_UTILS_H + +dim_t get_Bpanel_width_for_kdim_traversal + ( + dim_t jc, + dim_t n, + dim_t NC, + dim_t NR + ); + +void get_B_panel_reordered_start_offset_width + ( + dim_t jc, + dim_t n, + dim_t NC, + dim_t NR, + dim_t* panel_start, + dim_t* panel_offset, + dim_t* panel_width, + dim_t* panel_width_kdim_trav + ); + +void adjust_B_panel_reordered_jc( dim_t* jc, dim_t panel_start ); + +#endif //PACK_COMPUTE_UTILS_H + + + + diff --git a/frame/include/blis.h b/frame/include/blis.h index 1910f6fe98..d2856d2aa3 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -6,7 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP - Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/thread/bli_l3_compute_decor_openmp.c b/frame/thread/bli_l3_compute_decor_openmp.c index 4219e76c8e..ab4dffa872 100644 --- a/frame/thread/bli_l3_compute_decor_openmp.c +++ b/frame/thread/bli_l3_compute_decor_openmp.c @@ -32,10 +32,6 @@ */ -// @note: Presently MT is not supported, so n_threads have been explicitly -// initialized to 1 while intializing. Thus, even if BLIS is build with OpenMP -// support, the compute APIs work as an ST implementation. 
- #include "blis.h" #ifdef BLIS_ENABLE_OPENMP diff --git a/frame/thread/bli_l3_compute_decor_single.c b/frame/thread/bli_l3_compute_decor_single.c index cadcd413cf..8995691428 100644 --- a/frame/thread/bli_l3_compute_decor_single.c +++ b/frame/thread/bli_l3_compute_decor_single.c @@ -34,7 +34,7 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING +#if !defined (BLIS_ENABLE_MULTITHREADING) || defined (BLIS_ENABLE_PTHREADS) err_t bli_l3_compute_thread_decorator ( diff --git a/frame/thread/bli_pack_full_decor_openmp.c b/frame/thread/bli_pack_full_decor_openmp.c index 5d5034d193..a6f94afbb6 100644 --- a/frame/thread/bli_pack_full_decor_openmp.c +++ b/frame/thread/bli_pack_full_decor_openmp.c @@ -54,11 +54,7 @@ void bli_pack_full_thread_decorator /* Ensure n_threads is always greater than or equal to 1 */ /* Passing BLIS_IC_NT and BLIS_JC_NT for pack can lead to n_threads */ /* becoming negative. In that case, packing is done using 1 thread */ - // n_threads = ( n_threads > 0 ) ? n_threads : 1; - - // Explicitly setting n_threads = 1 to force packing with only a single - // thread. - n_threads = 1; + n_threads = ( n_threads > 0 ) ? n_threads : 1; _Pragma( "omp parallel num_threads(n_threads)" ) { diff --git a/frame/thread/bli_pack_full_decor_single.c b/frame/thread/bli_pack_full_decor_single.c index d88b35019a..b946a0326d 100644 --- a/frame/thread/bli_pack_full_decor_single.c +++ b/frame/thread/bli_pack_full_decor_single.c @@ -34,9 +34,7 @@ #include "blis.h" -#ifndef BLIS_ENABLE_OPENMP - -#define SKIP_THRINFO_TREE +#if !defined (BLIS_ENABLE_MULTITHREADING) || defined (BLIS_ENABLE_PTHREADS) void* bli_pack_full_thread_entry( void* data_void ) { return NULL; } From 8885510db21314513f622b33112077ee4a1a0941 Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Thu, 26 Oct 2023 12:26:22 +0530 Subject: [PATCH 181/226] Fix for Missing Symbols for gemm_pack_get_size - Symbols for gemm_pack_get_size were not being exported properly when BLIS was built as a shared library. 
- Correctly assigned the BLIS_EXPORT_BLAS macro to ?gemm_pack_get_size_ function declaration. - Added missing gemm_pack and gemm_pack_get_size macros to bli_macro_defs.h file. - Removed an unnecessary BLIS_EXPORT_BLAS macro from dgemm_compute function definition. - Updated bli_util_api_wrap with no underscore API wrappers for pack and compute set of BLAS Extension APIs: 1. ?gemm_pack_get_size 2. ?gemm_pack 3. ?gemm_compute AMD-Internal: [CPUPL-4083] Change-Id: I78cd7642c2fcbfdf02676e654a377ad2aa5295c1 --- frame/compat/bla_gemm_compute.c | 2 +- frame/compat/bla_gemm_pack.h | 76 +++++++----------- frame/compat/bla_gemm_pack_get_size.h | 52 +++++-------- frame/include/bli_macro_defs.h | 8 ++ frame/util/bli_util_api_wrap.c | 108 ++++++++++++++++++++++++++ frame/util/bli_util_api_wrap.h | 48 ++++++++++++ 6 files changed, 211 insertions(+), 83 deletions(-) diff --git a/frame/compat/bla_gemm_compute.c b/frame/compat/bla_gemm_compute.c index e68aa68df0..0778172bbf 100644 --- a/frame/compat/bla_gemm_compute.c +++ b/frame/compat/bla_gemm_compute.c @@ -256,7 +256,7 @@ void dgemm_compute_blis_impl } #ifdef BLIS_ENABLE_BLAS -BLIS_EXPORT_BLAS void dgemm_compute_ +void dgemm_compute_ ( const f77_char* transa, const f77_char* transb, diff --git a/frame/compat/bla_gemm_pack.h b/frame/compat/bla_gemm_pack.h index 1694ef0e4f..1621bfc70a 100644 --- a/frame/compat/bla_gemm_pack.h +++ b/frame/compat/bla_gemm_pack.h @@ -42,54 +42,32 @@ // Currently we are not adding blis interfaces - these BLAS interfaces will be available by default -#ifdef BLIS_ENABLE_BLAS -BLIS_EXPORT_BLAS void dgemm_pack_ - ( - const f77_char* identifier, - const f77_char* trans, - const f77_int* m, - const f77_int* n, - const f77_int* k, - const double* alpha, - const double* src, const f77_int* pld, - double* dest +#undef GENTPROTRO +#define GENTPROTRO( ftype, ch, blasname ) \ +\ +IF_BLIS_ENABLE_BLAS(\ +BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ + ( \ + const f77_char* identifier, \ + const f77_char* trans, \ + 
const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* src, const f77_int* pld, \ + ftype* dest \ + ); \ +)\ +BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \ + ( \ + const f77_char* identifier, \ + const f77_char* trans, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* src, const f77_int* pld, \ + ftype* dest \ ); -#endif -BLIS_EXPORT_BLAS void dgemm_pack_blis_impl - ( - const f77_char* identifier, - const f77_char* trans, - const f77_int* m, - const f77_int* n, - const f77_int* k, - const double* alpha, - const double* src, const f77_int* pld, - double* dest - ); - -#ifdef BLIS_ENABLE_BLAS -BLIS_EXPORT_BLAS void sgemm_pack_ - ( - const f77_char* identifier, - const f77_char* trans, - const f77_int* m, - const f77_int* n, - const f77_int* k, - const float* alpha, - const float* src, const f77_int* pld, - float* dest - ); -#endif - -BLIS_EXPORT_BLAS void sgemm_pack_blis_impl - ( - const f77_char* identifier, - const f77_char* trans, - const f77_int* m, - const f77_int* n, - const f77_int* k, - const float* alpha, - const float* src, const f77_int* pld, - float* dest - ); +INSERT_GENTPROTRO_BLAS( gemm_pack ) \ No newline at end of file diff --git a/frame/compat/bla_gemm_pack_get_size.h b/frame/compat/bla_gemm_pack_get_size.h index 67e389210c..42c4a22072 100644 --- a/frame/compat/bla_gemm_pack_get_size.h +++ b/frame/compat/bla_gemm_pack_get_size.h @@ -40,38 +40,24 @@ /* BLAS Extensions */ /* returns number of bytes */ -#ifdef BLIS_ENABLE_BLAS -f77_int dgemm_pack_get_size_ - ( - const f77_char* identifier, - const f77_int* pm, - const f77_int* pn, - const f77_int* pk +#undef GENTPROTRO +#define GENTPROTRO( ftype, ch, blasname ) \ +\ +IF_BLIS_ENABLE_BLAS(\ +BLIS_EXPORT_BLAS f77_int PASTEF77(ch,blasname) \ + ( \ + const f77_char* identifier, \ + const f77_int* pm, \ + const f77_int* pn, \ + const f77_int* pk \ + ); \ +)\ +BLIS_EXPORT_BLAS f77_int 
PASTEF77S(ch,blasname) \ + ( \ + const f77_char* identifier, \ + const f77_int* pm, \ + const f77_int* pn, \ + const f77_int* pk \ ); -#endif -BLIS_EXPORT_BLAS f77_int dgemm_pack_get_size_blis_impl - ( - const f77_char* identifier, - const f77_int* pm, - const f77_int* pn, - const f77_int* pk - ); - -#ifdef BLIS_ENABLE_BLAS -f77_int sgemm_pack_get_size_ - ( - const f77_char* identifier, - const f77_int* pm, - const f77_int* pn, - const f77_int* pk - ); -#endif - -BLIS_EXPORT_BLAS f77_int sgemm_pack_get_size_blis_impl - ( - const f77_char* identifier, - const f77_int* pm, - const f77_int* pn, - const f77_int* pk - ); +INSERT_GENTPROTRO_BLAS( gemm_pack_get_size ) \ No newline at end of file diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 7946be6c75..fcbe6998ad 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -289,6 +289,10 @@ #define zgemm_batch_ zgemm_batch #define sgemm_compute_ sgemm_compute #define dgemm_compute_ dgemm_compute +#define sgemm_pack_get_size_ sgemm_pack_get_size +#define dgemm_pack_get_size_ dgemm_pack_get_size +#define sgemm_pack_ sgemm_pack +#define dgemm_pack_ dgemm_pack #define saxpby_ saxpby #define daxpby_ daxpby #define caxpby_ caxpby @@ -394,6 +398,8 @@ #define dgemm DGEMM #define dgemm_batch DGEMM_BATCH #define dgemm_compute DGEMM_COMPUTE +#define dgemm_pack_get_size DGEMM_PACK_GET_SIZE +#define dgemm_pack DGEMM_PACK #define dgemmt DGEMMT #define dgemv DGEMV #define dger DGER @@ -468,6 +474,8 @@ #define sgemm SGEMM #define sgemm_batch SGEMM_BATCH #define sgemm_compute SGEMM_COMPUTE +#define sgemm_pack_get_size SGEMM_PACK_GET_SIZE +#define sgemm_pack SGEMM_PACK #define sgemmt SGEMMT #define sgemv SGEMV #define sger SGER diff --git a/frame/util/bli_util_api_wrap.c b/frame/util/bli_util_api_wrap.c index 21ccd2fb1c..b77adad865 100644 --- a/frame/util/bli_util_api_wrap.c +++ b/frame/util/bli_util_api_wrap.c @@ -2574,6 +2574,60 @@ void DGEMM_BATCH_( const f77_char* 
transa_array, const f77_char* transb_array,co dgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } +f77_int DGEMM_PACK_GET_SIZE(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk) +{ + return dgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} + +f77_int dgemm_pack_get_size(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk) +{ + return dgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} + +f77_int DGEMM_PACK_GET_SIZE_(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk) +{ + return dgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} + +void DGEMM_PACK( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const double* alpha, const double* src, const f77_int* pld, double* dest ) +{ + dgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void dgemm_pack( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const double* alpha, const double* src, const f77_int* pld, double* dest ) +{ + dgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void DGEMM_PACK_( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const double* alpha, const double* src, const f77_int* pld, double* dest ) +{ + dgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void DGEMM_COMPUTE( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc ) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + 
dgemm_compute_blis_impl( transa, transb, m, n, k, a, &rs_a, lda, b, &rs_b, ldb, beta, c, &rs_c, ldc ); +} + +void dgemm_compute( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc ) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + dgemm_compute_blis_impl( transa, transb, m, n, k, a, &rs_a, lda, b, &rs_b, ldb, beta, c, &rs_c, ldc ); +} + +void DGEMM_COMPUTE_( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc ) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + dgemm_compute_blis_impl( transa, transb, m, n, k, a, &rs_a, lda, b, &rs_b, ldb, beta, c, &rs_c, ldc ); +} + void DGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const double* alpha, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc) { dgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -2981,6 +3035,60 @@ void SGEMM_BATCH_(const f77_char* transa_array, const f77_char* transb_array,con sgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size); } +f77_int SGEMM_PACK_GET_SIZE(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk) +{ + return sgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} + +f77_int sgemm_pack_get_size(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk) +{ + return sgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} + +f77_int 
SGEMM_PACK_GET_SIZE_(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk) +{ + return sgemm_pack_get_size_blis_impl( identifier, pm, pn, pk ); +} + +void SGEMM_PACK( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const float* alpha, const float* src, const f77_int* pld, float* dest ) +{ + sgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void sgemm_pack( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const float* alpha, const float* src, const f77_int* pld, float* dest ) +{ + sgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void SGEMM_PACK_( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const float* alpha, const float* src, const f77_int* pld, float* dest ) +{ + sgemm_pack_blis_impl( identifier, trans, mm, nn, kk, alpha, src, pld, dest ); +} + +void SGEMM_COMPUTE( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc ) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + sgemm_compute_blis_impl( transa, transb, m, n, k, a, &rs_a, lda, b, &rs_b, ldb, beta, c, &rs_c, ldc ); +} + +void sgemm_compute( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc ) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + sgemm_compute_blis_impl( transa, transb, m, n, k, a, &rs_a, lda, b, &rs_b, ldb, beta, c, &rs_c, ldc ); +} + +void SGEMM_COMPUTE_( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, 
const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc ) +{ + f77_int rs_a = 1; + f77_int rs_b = 1; + f77_int rs_c = 1; + sgemm_compute_blis_impl( transa, transb, m, n, k, a, &rs_a, lda, b, &rs_b, ldb, beta, c, &rs_c, ldc ); +} + void SGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const float* alpha, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc) { sgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); diff --git a/frame/util/bli_util_api_wrap.h b/frame/util/bli_util_api_wrap.h index 7f458316d0..15e541cb9a 100644 --- a/frame/util/bli_util_api_wrap.h +++ b/frame/util/bli_util_api_wrap.h @@ -1560,6 +1560,30 @@ BLIS_EXPORT_BLIS void DGEMM_BATCH_( const f77_char* transa_array, const f77_cha +BLIS_EXPORT_BLIS f77_int DGEMM_PACK_GET_SIZE(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk); + +BLIS_EXPORT_BLIS f77_int dgemm_pack_get_size(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk); + +BLIS_EXPORT_BLIS f77_int DGEMM_PACK_GET_SIZE_(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk); + + + +BLIS_EXPORT_BLIS void DGEMM_PACK( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const double* alpha, const double* src, const f77_int* pld, double* dest ); + +BLIS_EXPORT_BLIS void dgemm_pack( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const double* alpha, const double* src, const f77_int* pld, double* dest ); + +BLIS_EXPORT_BLIS void DGEMM_PACK_( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const double* alpha, const double* src, const f77_int* pld, double* dest 
); + + + +BLIS_EXPORT_BLIS void DGEMM_COMPUTE( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc ); + +BLIS_EXPORT_BLIS void dgemm_compute( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc ); + +BLIS_EXPORT_BLIS void DGEMM_COMPUTE_( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc ); + + + BLIS_EXPORT_BLIS void DGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const double* alpha, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc); BLIS_EXPORT_BLIS void dgemmt( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const double* alpha, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc); @@ -1584,6 +1608,30 @@ BLIS_EXPORT_BLIS void SGEMM_BATCH_(const f77_char* transa_array, const f77_char +BLIS_EXPORT_BLIS f77_int SGEMM_PACK_GET_SIZE(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk); + +BLIS_EXPORT_BLIS f77_int sgemm_pack_get_size(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk); + +BLIS_EXPORT_BLIS f77_int SGEMM_PACK_GET_SIZE_(const f77_char* identifier, const f77_int* pm, const f77_int* pn, const f77_int* pk); + + + +BLIS_EXPORT_BLIS void SGEMM_PACK( const f77_char* identifier, const f77_char* trans, const f77_int* 
mm, const f77_int* nn, const f77_int* kk, const float* alpha, const float* src, const f77_int* pld, float* dest ); + +BLIS_EXPORT_BLIS void sgemm_pack( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const float* alpha, const float* src, const f77_int* pld, float* dest ); + +BLIS_EXPORT_BLIS void SGEMM_PACK_( const f77_char* identifier, const f77_char* trans, const f77_int* mm, const f77_int* nn, const f77_int* kk, const float* alpha, const float* src, const f77_int* pld, float* dest ); + + + +BLIS_EXPORT_BLIS void SGEMM_COMPUTE( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc ); + +BLIS_EXPORT_BLIS void sgemm_compute( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc ); + +BLIS_EXPORT_BLIS void SGEMM_COMPUTE_( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc ); + + + BLIS_EXPORT_BLIS void SGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const float* alpha, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc); BLIS_EXPORT_BLIS void sgemmt( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const float* alpha, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc); From 44dfc7a5153800ab4bc0e99f940ef1e13c9f9d2b Mon Sep 17 00:00:00 2001 From: Arnav 
Sharma Date: Fri, 27 Oct 2023 12:27:45 +0530 Subject: [PATCH 182/226] Fix for gemm_compute BLAS Check - BLAS compute checks updated to properly check for rs_c and cs_c. - Updated BLAS compute checks to skip validity check if m==1 or n==1. For the same reason, added a check just before to validate rs_c and cs_c are greater than or equal to 1. - Added tiny size tests to gtestsuite as a sanity check. - Also updated the Invalid Input Tests to test for the updated checks. AMD-Internal: [CPUPL-4140] Change-Id: I984339ec7909778b58409ffcdbeed4ee33f28cfb --- frame/compat/check/bla_gemm_compute_check.h | 7 +++++- .../gemm_compute/dgemm_compute_generic.cpp | 25 +++++++++++++++++++ .../gemm_compute/gemm_compute_IIT_ERS.cpp | 15 +++++++++++ .../gemm_compute/sgemm_compute_generic.cpp | 25 +++++++++++++++++++ 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/frame/compat/check/bla_gemm_compute_check.h b/frame/compat/check/bla_gemm_compute_check.h index 1e24168110..4264462af6 100644 --- a/frame/compat/check/bla_gemm_compute_check.h +++ b/frame/compat/check/bla_gemm_compute_check.h @@ -69,7 +69,12 @@ info = 7; \ else if ( !packb && *ldb < bli_max( 1, nrowb ) ) /* ldb is ignored when B is packed. */ \ info = 9; \ - else if ( ( *rs_c == 1 && *cs_c < bli_max( 1, *m ) ) || ( *cs_c == 1 && *rs_c < bli_max( 1, *n ) ) ) \ + else if ( *rs_c < 1 || *cs_c < 1 ) \ + info = 12; \ + else if ( /* Skip check for validity of strides when m==1 or n==1. 
*/ \ + ( *m != 1 && *n != 1 ) && \ + ( ( *rs_c == 1 && *cs_c < bli_max( 1, *m ) ) || \ + ( *cs_c == 1 && *rs_c < bli_max( 1, *n ) ) ) ) \ info = 12; \ \ if ( info != 0 ) \ diff --git a/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp b/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp index 82b89b7191..e26b8e9624 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp @@ -161,6 +161,31 @@ INSTANTIATE_TEST_SUITE_P( ::DGemmComputeTestPrint() ); +INSTANTIATE_TEST_SUITE_P( + TinySizes, + DGemmComputeTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n', 't', 'c'), // transa + ::testing::Values('n', 't', 'c'), // transb + ::testing::Values('u', 'p'), // packa + ::testing::Values('u', 'p'), // packb + ::testing::Range(gtint_t(1), gtint_t(3), 1), // m + ::testing::Range(gtint_t(1), gtint_t(3), 1), // n + ::testing::Range(gtint_t(1), gtint_t(3), 1), // k + ::testing::Values(0.0, 1.0, -1.2, 2.1), // alpha + ::testing::Values(0.0, 1.0, -1.2, 2.1), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::DGemmComputeTestPrint() + ); + INSTANTIATE_TEST_SUITE_P( DimensionsGtBlocksizes, // Dimensions > SUP Blocksizes DGemmComputeTest, diff --git a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp index c70a048bca..89c439c6ef 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp +++ b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp @@ -166,6 +166,21 @@ TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_ldb) computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); } +// When info == 12 
+TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_ldc_lt_zero) +{ + using T = TypeParam; + // Defining the C matrix with values for debugging purposes + std::vector c = testinghelpers::get_random_matrix(-10, 10, STORAGE, 'N', N, N, LDC, 'f'); + + // Copy so that we check that the elements of C are not modified. + std::vector c_ref(c); + // Call BLIS Gemm with an invalid value for ldc. + gemm_compute( STORAGE, TRANS, TRANS, 'U', 'U', M, N, K, nullptr, nullptr, LDA, nullptr, LDB, nullptr, nullptr, -1 ); + // Use bitwise comparison (no threshold). + computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); +} + // When info == 12 TYPED_TEST(GEMM_Compute_IIT_ERS_Test, invalid_ldc) { diff --git a/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp b/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp index e261f65835..a75ac16916 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp @@ -163,6 +163,31 @@ INSTANTIATE_TEST_SUITE_P( ::SGemmComputeTestPrint() ); +INSTANTIATE_TEST_SUITE_P( + TinySizes, + SGemmComputeTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n', 't', 'c'), // transa + ::testing::Values('n', 't', 'c'), // transb + ::testing::Values('u', 'p'), // packa + ::testing::Values('u', 'p'), // packb + ::testing::Range(gtint_t(1), gtint_t(3), 1), // m + ::testing::Range(gtint_t(1), gtint_t(3), 1), // n + ::testing::Range(gtint_t(1), gtint_t(3), 1), // k + ::testing::Values(0.0, 1.0, -1.2, 2.1), // alpha + ::testing::Values(0.0, 1.0, -1.2, 2.1), // beta + ::testing::Values(gtint_t(0)), // increment to the leading dim of a + ::testing::Values(gtint_t(0)), // increment to the leading dim of b + ::testing::Values(gtint_t(0)) // increment to the leading dim of c + ), + ::SGemmComputeTestPrint() + ); + INSTANTIATE_TEST_SUITE_P( DimensionsGtBlocksizes, // Dimensions > SUP 
Blocksizes SGemmComputeTest, From ffa8f584bec77bdb4161d435263463e7ec73daa2 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Tue, 31 Oct 2023 15:52:18 +0530 Subject: [PATCH 183/226] Added ZTRSM AVX512 native path kernels - Added 4x12 ZGEMM row-preferred kernel. - Added 4x12 ZTRSM row-preferred lower and upper kernels using AVX512 ISA. - These kernels are used for ZTRSM only, zgemm still uses 12x4 kernel. - Kernels support row/col/gen storage. - Kernels support A prefetch, B prefetch, A_next prefetch, B_next prefetch and c prefetch. - B prefetch, B_next prefetch and C prefetch are enabled by default. - Updated CMakeLists.txt with ZGEMM kernels for windows build. AMD-Internal: [CPUPL-3781] Change-Id: I0fb4b2ec2f4bd66db6499c25f12bcc4bdb09804a --- config/zen4/bli_cntx_init_zen4.c | 14 +- kernels/zen4/3/CMakeLists.txt | 3 + kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c | 565 ++++++++++++++++++ kernels/zen4/3/bli_zgemmtrsm_l_4x12.c | 705 ++++++++++++++++++++++ kernels/zen4/3/bli_zgemmtrsm_u_4x12.c | 715 +++++++++++++++++++++++ kernels/zen4/bli_kernels_zen4.h | 4 + 6 files changed, 1999 insertions(+), 7 deletions(-) create mode 100644 kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c create mode 100644 kernels/zen4/3/bli_zgemmtrsm_l_4x12.c create mode 100644 kernels/zen4/3/bli_zgemmtrsm_u_4x12.c diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index c7f25fa5c3..cc836d6292 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -90,16 +90,16 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // Different GEMM kernels are used for TRSM for zen4 architecture BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE, - BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_2x6, TRUE, + BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_4x12, TRUE, // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, 
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen4_asm_8x24, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_zen_asm_2x6, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_zen4_asm_4x12, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen4_asm_8x24, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_zen_asm_2x6, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_zen4_asm_4x12, TRUE, cntx ); @@ -244,11 +244,11 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // Using different cache block sizes for TRSM instead of common level-3 block sizes. // Tuning is done for double-precision only. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 2 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 6 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 24 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 8, 3, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 24, 8, 12 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 120, 144, 40 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 512 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 1536 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4008, 4080, 2004 ); // Update the context with the current architecture's register and cache // blocksizes for level-3 TRSM problems. 
diff --git a/kernels/zen4/3/CMakeLists.txt b/kernels/zen4/3/CMakeLists.txt index 6b03b08ec8..f92f01f2a3 100644 --- a/kernels/zen4/3/CMakeLists.txt +++ b/kernels/zen4/3/CMakeLists.txt @@ -11,6 +11,9 @@ add_library(zen4_3 ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_12x4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zero_zmm.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_4x12.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_l_4x12.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_u_4x12.c ) target_compile_options(zen4_3 PRIVATE /arch:AVX2 /arch:AVX512) diff --git a/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c b/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c new file mode 100644 index 0000000000..e8bdf4f503 --- /dev/null +++ b/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c @@ -0,0 +1,565 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_x86_asm_macros.h" + +#define A_L1_PREFETCH_DIST 6 +#define B_L1_PREFETCH_DIST 6 +#define TAIL_NITER 7 +// #define PREFETCH_A +#define PREFETCH_B +// #define PREFETCH_A_NEXT +#define PREFETCH_B_NEXT +#define PREFETCH_C // prefetch c in middle loop over 4 iterations of k + + +#ifdef PREFETCH_A + #define PREFETCH_A_L1(n) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*4*16 + 4*n*16)) +#else + #define PREFETCH_A_L1(n) +#endif + +#ifdef PREFETCH_B + #define PREFETCH_B_L1(n, k) \ + PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*12*16 + (12*n+(4*k))*16)) +#else + #define PREFETCH_B_L1(n, k) +#endif + + +/* + * A Registers: ZMM3, ZMM4, ZMM29, ZMM30 + * B Registers: ZMM0, ZMM1, ZMM2 + * C Registers: ZMM[5-28] + */ + +#define LOOP_ALIGN ALIGN32 + +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n)\ + VBROADCASTSD(ZMM(3), MEM(RAX, (8*n+2)*8)) \ + VFMADD231PD(ZMM(5) , ZMM(0), ZMM(29)) \ + VFMADD231PD(ZMM(6) , ZMM(1), ZMM(29)) \ + VFMADD231PD(ZMM(7) , ZMM(2), ZMM(29)) \ + VBROADCASTSD(ZMM(4), MEM(RAX, (8*n+3)*8)) \ + VFMADD231PD(ZMM(8) , ZMM(0), ZMM(30)) \ + VFMADD231PD(ZMM(9) , ZMM(1), ZMM(30)) \ + VFMADD231PD(ZMM(10), ZMM(2), ZMM(30)) \ + \ + PREFETCH_B_L1(n, 0)\ + VBROADCASTSD(ZMM(29), MEM(RAX, (8*n+4)*8)) \ + VFMADD231PD(ZMM(11), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(12), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(13), ZMM(2), ZMM(3)) \ + VBROADCASTSD(ZMM(30), MEM(RAX, (8*n+5)*8)) \ + VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(15), ZMM(1),
ZMM(4)) \ + VFMADD231PD(ZMM(16), ZMM(2), ZMM(4)) \ + \ + PREFETCH_B_L1(n, 1)\ + VBROADCASTSD(ZMM(3), MEM(RAX, (8*n+6)*8)) \ + VFMADD231PD(ZMM(17), ZMM(0), ZMM(29)) \ + VFMADD231PD(ZMM(18), ZMM(1), ZMM(29)) \ + VFMADD231PD(ZMM(19), ZMM(2), ZMM(29)) \ + VBROADCASTSD(ZMM(4), MEM(RAX, (8*n+7)*8)) \ + VFMADD231PD(ZMM(20), ZMM(0), ZMM(30)) \ + VFMADD231PD(ZMM(21), ZMM(1), ZMM(30)) \ + VFMADD231PD(ZMM(22), ZMM(2), ZMM(30)) \ + \ + PREFETCH_B_L1(n, 2)\ + VBROADCASTSD(ZMM(29), MEM(RAX, (8*n+8)*8)) \ + VFMADD231PD(ZMM(23), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(24), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(25), ZMM(2), ZMM(3)) \ + VBROADCASTSD(ZMM(30), MEM(RAX, (8*n+9)*8)) \ + VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \ + VFMADD231PD(ZMM(28), ZMM(2), ZMM(4)) \ + \ + VMOVAPD(ZMM(0), MEM(RBX, (12*n+0)*16)) \ + VMOVAPD(ZMM(1), MEM(RBX, (12*n+4)*16)) \ + VMOVAPD(ZMM(2), MEM(RBX, (12*n+8)*16)) + +#define SCALE_REG(a, b, c) \ + VPERMILPD(ZMM(3), a, IMM(0x55)) \ + VMULPD(a, a, b) \ + VMULPD(ZMM(3), ZMM(3), c) \ + VFMADDSUB132PD(a, ZMM(3), ZMM(31)) \ + +#define STORE_C_ROW(R1, R2, R3) \ + VMOVUPD(ZMM(0), MEM(RCX)) \ + SCALE_REG(ZMM(0), ZMM(1), ZMM(2)) \ + VADDPD(ZMM(0), ZMM(0), ZMM(R1)) \ + VMOVUPD(MEM(RCX), ZMM(0)) \ + \ + VMOVUPD(ZMM(0), MEM(RCX, R10, 4)) \ + SCALE_REG(ZMM(0), ZMM(1), ZMM(2)) \ + VADDPD(ZMM(0), ZMM(0), ZMM(R2)) \ + VMOVUPD(MEM(RCX, R10, 4), ZMM(0)) \ + \ + VMOVUPD(ZMM(0), MEM(RCX, R10, 8)) \ + SCALE_REG(ZMM(0), ZMM(1), ZMM(2)) \ + VADDPD(ZMM(0), ZMM(0), ZMM(R3)) \ + VMOVUPD(MEM(RCX, R10, 8), ZMM(0)) \ + +#define LOAD_ROW_GEN() \ + VMOVUPD(XMM(0), MEM(RDX)) \ + VMOVUPD(XMM(27), MEM(RDX, R10, 1)) \ + VMOVUPD(XMM(28), MEM(RDX, R10, 2)) \ + VMOVUPD(XMM(29), MEM(RDX, R11, 1)) \ + VINSERTF64X2(ZMM(0), ZMM(0), XMM(27), IMM(0x1)) \ + VINSERTF64X2(ZMM(0), ZMM(0), XMM(28), IMM(0x2)) \ + VINSERTF64X2(ZMM(0), ZMM(0), XMM(29), IMM(0x3)) \ + +#define STORE_ROW_GEN() \ + VEXTRACTF64X2(XMM(27), ZMM(0), IMM(0x1)) \ + VEXTRACTF64X2(XMM(28), ZMM(0), 
IMM(0x2)) \ + VEXTRACTF64X2(XMM(29), ZMM(0), IMM(0x3)) \ + VMOVUPD(MEM(RDX) , XMM(0)) \ + VMOVUPD(MEM(RDX, R10, 1), XMM(27)) \ + VMOVUPD(MEM(RDX, R10, 2), XMM(28)) \ + VMOVUPD(MEM(RDX, R11, 1), XMM(29)) \ + +#define STORE_C_COL_GEN(R1, R2, R3) \ + MOV(RDX, RCX) \ + LEA(RCX, MEM(RCX, R12, 1)) \ + LOAD_ROW_GEN() \ + SCALE_REG(ZMM(0), ZMM(1), ZMM(2)) \ + VADDPD(ZMM(0), ZMM(0), ZMM(R1)) \ + STORE_ROW_GEN() \ + LEA(RDX, MEM(RDX, R10, 4)) \ + \ + LOAD_ROW_GEN() \ + SCALE_REG(ZMM(0), ZMM(1), ZMM(2)) \ + VADDPD(ZMM(0), ZMM(0), ZMM(R2)) \ + STORE_ROW_GEN() \ + LEA(RDX, MEM(RDX, R10, 4)) \ + \ + LOAD_ROW_GEN() \ + SCALE_REG(ZMM(0), ZMM(1), ZMM(2)) \ + VADDPD(ZMM(0), ZMM(0), ZMM(R3)) \ + STORE_ROW_GEN() \ + +/**********************************************************/ +/* Kernel : bli_zgemm_zen4_asm_4x12 */ +/* It performs C = C * beta + alpha * A * B */ +/* It is row preferred kernel, A and B are packed */ +/* C could be Row/Col/Gen Stored Matrix */ +/* Registers are allocated as below */ +/* Broadcast A : ZMM(3, 4, 29, 30) */ +/* load B : ZMM(0, 1, 2) */ +/* Accumulation of B(real,imag)*Areal : */ +/* ZMM(5-7 , 11-13, 17-19, 23-25) */ +/* Accumulation of B(real,imag)*Aimag : */ +/* ZMM(8-10, 14-16, 20-22, 26-28) */ +/* Computation of A(real,imag)*B(real,imag): */ +/* ZMM(5-7 , 11-13, 17-19, 23-25) */ +/**********************************************************/ +void bli_zgemm_zen4_asm_4x12( + dim_t k_, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + const int64_t k = k_; + /*rowstride * size of one dcomplex element*/ + const int64_t rs_c = rs_c_*16; + /*colstride * size of one dcomplex element*/ + const int64_t cs_c = cs_c_*16; + + + char beta_mul_type = BLIS_MUL_DEFAULT; + if(beta->imag == 0.0 && beta->real == 0.0 ) + { + beta_mul_type = BLIS_MUL_ZERO; + } + double one = 
1; // used for FMADDSUB instruction + double *one_addr = &one; + + BEGIN_ASM() + + VXORPD(XMM(5) , XMM(5) , XMM(5) ) + VXORPD(XMM(6) , XMM(6) , XMM(6) ) + VXORPD(XMM(7) , XMM(7) , XMM(7) ) + VXORPD(XMM(8) , XMM(8) , XMM(8) ) + VXORPD(XMM(9) , XMM(9) , XMM(9) ) + VXORPD(XMM(10), XMM(10), XMM(10)) + VXORPD(XMM(11), XMM(11), XMM(11)) + VXORPD(XMM(12), XMM(12), XMM(12)) + VXORPD(XMM(13), XMM(13), XMM(13)) + VXORPD(XMM(14), XMM(14), XMM(14)) + VXORPD(XMM(15), XMM(15), XMM(15)) + VXORPD(XMM(16), XMM(16), XMM(16)) + VXORPD(XMM(17), XMM(17), XMM(17)) + VXORPD(XMM(18), XMM(18), XMM(18)) + VXORPD(XMM(19), XMM(19), XMM(19)) + VXORPD(XMM(20), XMM(20), XMM(20)) + VXORPD(XMM(21), XMM(21), XMM(21)) + VXORPD(XMM(22), XMM(22), XMM(22)) + VXORPD(XMM(23), XMM(23), XMM(23)) + VXORPD(XMM(24), XMM(24), XMM(24)) + VXORPD(XMM(25), XMM(25), XMM(25)) + VXORPD(XMM(26), XMM(26), XMM(26)) + VXORPD(XMM(27), XMM(27), XMM(27)) + VXORPD(XMM(28), XMM(28), XMM(28)) + + MOV(RSI, VAR(k)) //loop index + MOV(RAX, VAR(a)) //load address of a + MOV(RBX, VAR(b)) //load address of b + MOV(RCX, VAR(c)) //load address of c + + #ifdef PREFETCH_C + LEA(R9, MEM(RCX, 63)) // c for prefetch, first cache line + LEA(R8, MEM(R9, 128)) // c for prefetch, second cache line + #endif + + + VMOVAPD(ZMM(0), MEM(RBX, 0*16)) //pre-load b + VMOVAPD(ZMM(1), MEM(RBX, 4*16)) //pre-load b + VMOVAPD(ZMM(2), MEM(RBX, 8*16)) //pre-load b + VBROADCASTSD(ZMM(29), MEM(RAX, 0)) + VBROADCASTSD(ZMM(30), MEM(RAX, 8)) + LEA(RBX, MEM(RBX, 12*16)) //adjust b for pre-load + + MOV(R12, VAR(rs_c)) + MOV(R10, VAR(cs_c)) + + #if defined PREFETCH_A_NEXT || defined PREFETCH_B_NEXT + MOV(RDI, RSI) + IMUL(RDI, IMM(16*4)) // rdi = k * 16*4 + #endif + + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(RAX, RDI, 1)) // r14(a_next) = A + (k*16*4) + #endif + + #ifdef PREFETCH_B_NEXT + IMUL(RDI, IMM(3)) // rdi = k * 16*12 + LEA(R15, MEM(RBX, RDI, 1)) // r15(b_next) = B + (k*16*12) + #endif + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + 
/************************************************************/ + /* Operation: */ + /* SUBITER = (Ar, Ai)*(Br, Bi) = Ar*(Br, Bi) , Ai*(Br, Bi) */ + /* C_PREFETCH loop count: */ + /* LOOP1: k/4 - TAIL_NITER - 4 */ + /* LOOP2: 4 */ + /* LOOP4: TAIL_NITER */ + /* TAIL_LOOP: k%4 */ + /* */ + /* No prefetch loop count: */ + /* LOOP1: k/4 */ + /* TAIL_LOOP: k%4 */ + /************************************************************/ + #ifdef PREFETCH_C + /* prefetch c over 4 iterations of k*/ + SUB(RDI, IMM(4+TAIL_NITER)) + #endif + JLE(K_PREFETCH_C) + + LOOP_ALIGN + LABEL(LOOP1) + #ifdef PREFETCH_A_NEXT + PREFETCH(1, MEM(R14)) + #endif + SUBITER(0) + #ifdef PREFETCH_B_NEXT + PREFETCH(1, MEM(R15)) + #endif + SUBITER(1) + #ifdef PREFETCH_A_NEXT + PREFETCH(2, MEM(R14, 64)) + #endif + SUB(RDI, IMM(1)) + SUBITER(2) + #ifdef PREFETCH_B_NEXT + PREFETCH(2, MEM(R15, 64)) + #endif + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(R14,128)) + #endif + #ifdef PREFETCH_B_NEXT + LEA(R15, MEM(R15,64)) + #endif + + JNZ(LOOP1) + + LABEL(K_PREFETCH_C) + +#ifdef PREFETCH_C + ADD(RDI, IMM(4)) + JLE(K_TAIL_NITER) + + LOOP_ALIGN + LABEL(LOOP2) + SUBITER(0) + PREFETCH(0, MEM(R9)) + SUBITER(1) + PREFETCH(0, MEM(R9, 64)) + SUB(RDI, IMM(1)) + PREFETCH(0, MEM(R9,128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + LEA(R9, MEM(R9,R12,1)) + JNZ(LOOP2) + + LABEL(K_TAIL_NITER) + + ADD(RDI, IMM(0+TAIL_NITER)) + JLE(TAIL) + + LOOP_ALIGN + LABEL(LOOP4) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + + JNZ(LOOP4) + +#endif //PREFETCH_C + + LABEL(TAIL) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + SUB(RSI, IMM(1)) + SUBITER(0) + LEA(RAX, MEM(RAX,4*16)) + LEA(RBX, MEM(RBX,12*16)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + + VPERMILPD(ZMM8 , ZMM8 , IMM(0x55)) + VPERMILPD(ZMM9 , ZMM9 , IMM(0x55)) + 
VPERMILPD(ZMM10, ZMM10, IMM(0x55)) + VPERMILPD(ZMM14, ZMM14, IMM(0x55)) + VPERMILPD(ZMM15, ZMM15, IMM(0x55)) + VPERMILPD(ZMM16, ZMM16, IMM(0x55)) + VPERMILPD(ZMM20, ZMM20, IMM(0x55)) + VPERMILPD(ZMM21, ZMM21, IMM(0x55)) + VPERMILPD(ZMM22, ZMM22, IMM(0x55)) + VPERMILPD(ZMM26, ZMM26, IMM(0x55)) + VPERMILPD(ZMM27, ZMM27, IMM(0x55)) + VPERMILPD(ZMM28, ZMM28, IMM(0x55)) + + MOV(R8, VAR(one_addr)) + VBROADCASTSD(ZMM(31), MEM(R8)) + VFMADDSUB132PD(ZMM(5) , ZMM(8) , ZMM(31)) + VFMADDSUB132PD(ZMM(6) , ZMM(9) , ZMM(31)) + VFMADDSUB132PD(ZMM(7) , ZMM(10), ZMM(31)) + + VFMADDSUB132PD(ZMM(11), ZMM(14), ZMM(31)) + VFMADDSUB132PD(ZMM(12), ZMM(15), ZMM(31)) + VFMADDSUB132PD(ZMM(13), ZMM(16), ZMM(31)) + + VFMADDSUB132PD(ZMM(17), ZMM(20), ZMM(31)) + VFMADDSUB132PD(ZMM(18), ZMM(21), ZMM(31)) + VFMADDSUB132PD(ZMM(19), ZMM(22), ZMM(31)) + + VFMADDSUB132PD(ZMM(23), ZMM(26), ZMM(31)) + VFMADDSUB132PD(ZMM(24), ZMM(27), ZMM(31)) + VFMADDSUB132PD(ZMM(25), ZMM(28), ZMM(31)) + + MOV(RAX, VAR(alpha)) + VBROADCASTSD(ZMM(0), MEM(RAX)) + VBROADCASTSD(ZMM(1), MEM(RAX, 8)) + + SCALE_REG(ZMM(5) , ZMM(0), ZMM(1)) + SCALE_REG(ZMM(6) , ZMM(0), ZMM(1)) + SCALE_REG(ZMM(7) , ZMM(0), ZMM(1)) + + SCALE_REG(ZMM(11), ZMM(0), ZMM(1)) + SCALE_REG(ZMM(12), ZMM(0), ZMM(1)) + SCALE_REG(ZMM(13), ZMM(0), ZMM(1)) + + SCALE_REG(ZMM(17), ZMM(0), ZMM(1)) + SCALE_REG(ZMM(18), ZMM(0), ZMM(1)) + SCALE_REG(ZMM(19), ZMM(0), ZMM(1)) + + SCALE_REG(ZMM(23), ZMM(0), ZMM(1)) + SCALE_REG(ZMM(24), ZMM(0), ZMM(1)) + SCALE_REG(ZMM(25), ZMM(0), ZMM(1)) + + MOV(RBX, VAR(beta)) + VBROADCASTSD(ZMM(1), MEM(RBX)) + VBROADCASTSD(ZMM(2), MEM(RBX, 8)) + + + MOV(AL, VAR(beta_mul_type)) + CMP(AL, IMM(0)) + JE(.ZBETAZERO) + + CMP(R10, IMM(16)) //CS == 1 IMPLIES ROW STORED + JNZ(.ZCOLSTORED) + + LABEL(.ZROWSTORED) + STORE_C_ROW(5 , 6 , 7 ) ADD(RCX, R12) + STORE_C_ROW(11, 12, 13) ADD(RCX, R12) + STORE_C_ROW(17, 18, 19) ADD(RCX, R12) + STORE_C_ROW(23, 24, 25) + JMP(.ZDONE) + + LABEL(.ZCOLSTORED) + LEA(R11, MEM(R10, R10, 2)) + STORE_C_COL_GEN(5, 6, 
7) + STORE_C_COL_GEN(11, 12, 13) + STORE_C_COL_GEN(17, 18, 19) + STORE_C_COL_GEN(23, 24, 25) + JMP(.ZDONE) + + LABEL(.ZBETAZERO) + CMP(R10, IMM(16)) + JZ(.ZROWSTORBZ) + + LABEL(.ZCOLSTORBZ) + LEA(R11, MEM(R10, R10, 2)) + MOV(RDX, RCX) + ADD(RCX, R12) + VMOVUPD(ZMM(0), ZMM(5)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(6)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(7)) STORE_ROW_GEN() + + MOV(RDX, RCX) + LEA(RCX, MEM(RCX, R12, 1)) + VMOVUPD(ZMM(0), ZMM(11)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(12)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(13)) STORE_ROW_GEN() + + MOV(RDX, RCX) + LEA(RCX, MEM(RCX, R12, 1)) + VMOVUPD(ZMM(0), ZMM(17)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(18)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(19)) STORE_ROW_GEN() + + MOV(RDX, RCX) + VMOVUPD(ZMM(0), ZMM(23)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(24)) STORE_ROW_GEN() + LEA(RDX, MEM(RDX, R10, 4)) + VMOVUPD(ZMM(0), ZMM(25)) STORE_ROW_GEN() + + JMP(.ZDONE) + + + LABEL(.ZROWSTORBZ) + VMOVUPD(MEM(RCX ), ZMM(5)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(6)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(7)) + LEA(RCX, MEM(RCX, R12, 1)) + + VMOVUPD(MEM(RCX ), ZMM(11)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(12)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(13)) + LEA(RCX, MEM(RCX, R12, 1)) + + VMOVUPD(MEM(RCX ), ZMM(17)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(18)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(19)) + LEA(RCX, MEM(RCX, R12, 1)) + + VMOVUPD(MEM(RCX ), ZMM(23)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(24)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(25)) + + LABEL(.ZDONE) + + VZEROUPPER() + + END_ASM + ( + : // output operands (none) + : // input operands + [beta_mul_type] "m" (beta_mul_type), + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [one_addr] "m" (one_addr) + : // register clobber list 
+ "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", + "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", + "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", + "xmm14", "xmm15", "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", + "xmm21", "xmm22", "xmm23", "xmm24", "xmm25", "xmm26", + "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c b/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c new file mode 100644 index 0000000000..5341bf4851 --- /dev/null +++ b/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c @@ -0,0 +1,705 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_x86_asm_macros.h" + +#define A_L1_PREFETCH_DIST 6 +#define B_L1_PREFETCH_DIST 6 +#define TAIL_NITER 7 +// #define PREFETCH_A +#define PREFETCH_B +// #define PREFETCH_A_NEXT +#define PREFETCH_B_NEXT +#define PREFETCH_C // prefetch c in middle loop over 4 iterations of k + + +#ifdef PREFETCH_A + #define PREFETCH_A_L1(n) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*4*16 + 4*n*16)) +#else + #define PREFETCH_A_L1(n) +#endif + +#ifdef PREFETCH_B + #define PREFETCH_B_L1(n, k) \ + PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*12*16 + (12*n+(4*k))*16)) +#else + #define PREFETCH_B_L1(n, k) +#endif + + +/* + * A Registers: ZMM3, ZMM4, ZMM29, ZMM30 + * B Registers: ZMM0, ZMM1, ZMM2 + * C Registers: ZMM[5-28] + */ + +#define LOOP_ALIGN ALIGN32 + +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n)\ + VBROADCASTSD(ZMM(3), MEM(RAX, (8*n+2)*8)) \ + VFMADD231PD(ZMM(5) , ZMM(0), ZMM(29)) \ + VFMADD231PD(ZMM(6) , ZMM(1), ZMM(29)) \ + VFMADD231PD(ZMM(7) , ZMM(2), ZMM(29)) \ + VBROADCASTSD(ZMM(4), MEM(RAX, (8*n+3)*8)) \ + VFMADD231PD(ZMM(8) , ZMM(0), ZMM(30)) \ + VFMADD231PD(ZMM(9) , ZMM(1), ZMM(30)) \ + VFMADD231PD(ZMM(10), ZMM(2), ZMM(30)) \ + \ + PREFETCH_B_L1(n, 0)\ + VBROADCASTSD(ZMM(29), MEM(RAX,
(8*n+4)*8)) \ + VFMADD231PD(ZMM(11), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(12), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(13), ZMM(2), ZMM(3)) \ + VBROADCASTSD(ZMM(30), MEM(RAX, (8*n+5)*8)) \ + VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \ + VFMADD231PD(ZMM(16), ZMM(2), ZMM(4)) \ + \ + PREFETCH_B_L1(n, 1)\ + VBROADCASTSD(ZMM(3), MEM(RAX, (8*n+6)*8)) \ + VFMADD231PD(ZMM(17), ZMM(0), ZMM(29)) \ + VFMADD231PD(ZMM(18), ZMM(1), ZMM(29)) \ + VFMADD231PD(ZMM(19), ZMM(2), ZMM(29)) \ + VBROADCASTSD(ZMM(4), MEM(RAX, (8*n+7)*8)) \ + VFMADD231PD(ZMM(20), ZMM(0), ZMM(30)) \ + VFMADD231PD(ZMM(21), ZMM(1), ZMM(30)) \ + VFMADD231PD(ZMM(22), ZMM(2), ZMM(30)) \ + \ + PREFETCH_B_L1(n, 2)\ + VBROADCASTSD(ZMM(29), MEM(RAX, (8*n+8)*8)) \ + VFMADD231PD(ZMM(23), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(24), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(25), ZMM(2), ZMM(3)) \ + VBROADCASTSD(ZMM(30), MEM(RAX, (8*n+9)*8)) \ + VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \ + VFMADD231PD(ZMM(28), ZMM(2), ZMM(4)) \ + \ + VMOVAPD(ZMM(0), MEM(RBX, (12*n+0)*16)) \ + VMOVAPD(ZMM(1), MEM(RBX, (12*n+4)*16)) \ + VMOVAPD(ZMM(2), MEM(RBX, (12*n+8)*16)) + +#define SCALE_REG(a, b, c, out) \ + VPERMILPD(ZMM(3), a, IMM(0x55)) \ + VMULPD(out, a, b) \ + VMULPD(ZMM(3), ZMM(3), c) \ + VFMADDSUB132PD(out, ZMM(3), ZMM(31)) \ + +#define DIVIDE_COMPLEX(R1, c, d, csq_dsq) \ + VPERMILPD(ZMM(3), R1, IMM(0x55)) \ + VMULPD(R1, R1, c) \ + VMULPD(ZMM(3), ZMM(3), d) \ + VMULPD(ZMM(3), ZMM(3), ZMM(2)) \ + VFMADDSUB132PD(R1, ZMM(3), ZMM(31)) \ + VDIVPD(R1, R1, csq_dsq) \ + +#define STORE_REG_GEN(reg) \ + VEXTRACTF64X2(XMM(27), ZMM(reg), IMM(0x1)) \ + VEXTRACTF64X2(XMM(28), ZMM(reg), IMM(0x2)) \ + VEXTRACTF64X2(XMM(29), ZMM(reg), IMM(0x3)) \ + VMOVUPD(MEM(RDX) , XMM(reg)) \ + VMOVUPD(MEM(RDX, R10, 1), XMM(27)) \ + VMOVUPD(MEM(RDX, R10, 2), XMM(28)) \ + VMOVUPD(MEM(RDX, R11, 1), XMM(29)) \ + + +/**********************************************************/ +/* Kernel : 
bli_zgemmtrsm_l_zen4_asm_4x12 */ +/* It performs C = C * beta + alpha * A * B */ +/* It is row preferred kernel, A and B are packed */ +/* C could be Row/Col/Gen Stored Matrix */ +/* Registers are allocated as below */ +/* Broadcast A : ZMM(3, 4, 29, 30) */ +/* load B : ZMM(0, 1, 2) */ +/* Accumulation of B(real,imag)*Areal : */ +/* ZMM(5-7 , 11-13, 17-19, 23-25) */ +/* Accumulation of B(real,imag)*Aimag : */ +/* ZMM(8-10, 14-16, 20-22, 26-28) */ +/* Computation of A(real,imag)*B(real,imag): */ +/* ZMM(5-7 , 11-13, 17-19, 23-25) */ +/**********************************************************/ +void bli_zgemmtrsm_l_zen4_asm_4x12( + dim_t k_, + dcomplex* restrict alpha, + dcomplex* restrict a10, + dcomplex* restrict a11, + dcomplex* restrict b01, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + const int64_t k = k_; + /*rowstride * size of one dcomplex element*/ + const int64_t rs_c = rs_c_*16; + /*colstride * size of one dcomplex element*/ + const int64_t cs_c = cs_c_*16; + double one = 1; // used for FMADDSUB instruction + double neg_one = -1; // used for complex division + double *one_addr = &one; + double *neg_one_addr = &neg_one; + + BEGIN_ASM() + + VXORPD(XMM(5) , XMM(5) , XMM(5) ) + VXORPD(XMM(6) , XMM(6) , XMM(6) ) + VXORPD(XMM(7) , XMM(7) , XMM(7) ) + VXORPD(XMM(8) , XMM(8) , XMM(8) ) + VXORPD(XMM(9) , XMM(9) , XMM(9) ) + VXORPD(XMM(10), XMM(10), XMM(10)) + VXORPD(XMM(11), XMM(11), XMM(11)) + VXORPD(XMM(12), XMM(12), XMM(12)) + VXORPD(XMM(13), XMM(13), XMM(13)) + VXORPD(XMM(14), XMM(14), XMM(14)) + VXORPD(XMM(15), XMM(15), XMM(15)) + VXORPD(XMM(16), XMM(16), XMM(16)) + VXORPD(XMM(17), XMM(17), XMM(17)) + VXORPD(XMM(18), XMM(18), XMM(18)) + VXORPD(XMM(19), XMM(19), XMM(19)) + VXORPD(XMM(20), XMM(20), XMM(20)) + VXORPD(XMM(21), XMM(21), XMM(21)) + VXORPD(XMM(22), XMM(22), XMM(22)) + VXORPD(XMM(23), XMM(23), XMM(23)) + 
VXORPD(XMM(24), XMM(24), XMM(24)) + VXORPD(XMM(25), XMM(25), XMM(25)) + VXORPD(XMM(26), XMM(26), XMM(26)) + VXORPD(XMM(27), XMM(27), XMM(27)) + VXORPD(XMM(28), XMM(28), XMM(28)) + + MOV(RSI, VAR(k)) //loop index + MOV(RAX, VAR(a10)) //load address of a + MOV(RBX, VAR(b01)) //load address of b + MOV(RCX, VAR(b11)) //load address of c + MOV(R9, VAR(c11)) //load address of c + MOV(R11, VAR(neg_one_addr)) + + #ifdef PREFETCH_C + LEA(R9, MEM(R9, 63)) // c for prefetch, first cache line + LEA(R8, MEM(R9, 128)) // c for prefetch, second cache line + #endif + + + VMOVAPD(ZMM(0), MEM(RBX, 0*16)) //pre-load b + VMOVAPD(ZMM(1), MEM(RBX, 4*16)) //pre-load b + VMOVAPD(ZMM(2), MEM(RBX, 8*16)) //pre-load b + VBROADCASTSD(ZMM(29), MEM(RAX, 0)) + VBROADCASTSD(ZMM(30), MEM(RAX, 8)) + LEA(RBX, MEM(RBX, 12*16)) //adjust b for pre-load + + MOV(R12, VAR(rs_c)) + MOV(R10, VAR(cs_c)) + + #if defined PREFETCH_A_NEXT || defined PREFETCH_B_NEXT + MOV(RDI, RSI) + IMUL(RDI, IMM(16*4)) // rdi = k * 16*4 + #endif + + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(RAX, RDI, 1)) // r14(a_next) = A + (k*16*4) + #endif + + #ifdef PREFETCH_B_NEXT + IMUL(RDI, IMM(3)) // rdi = k * 16*12 + LEA(R15, MEM(RBX, RDI, 1)) // r15(b_next) = B + (k*16*12) + #endif + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + /************************************************************/ + /* Operation: */ + /* SUBITER = (Ar, Ai)*(Br, Bi) = Ar*(Br, Bi) , Ai*(Br, Bi) */ + /* C_PREFETCH loop count: */ + /* LOOP1: k/4 - TAIL_NITER - 4 */ + /* LOOP2: 4 */ + /* LOOP4: TAIL_NITER */ + /* TAIL_LOOP: k%4 */ + /* */ + /* No prefetch loop count: */ + /* LOOP1: k/4 */ + /* TAIL_LOOP: k%4 */ + /************************************************************/ + #ifdef PREFETCH_C + /* prefetch c over 4 iterations of k*/ + SUB(RDI, IMM(4+TAIL_NITER)) + #endif + JLE(K_PREFETCH_C) + + LOOP_ALIGN + LABEL(LOOP1) + #ifdef PREFETCH_A_NEXT + PREFETCH(1, MEM(R14)) + #endif + SUBITER(0) + #ifdef PREFETCH_B_NEXT + PREFETCH(1, MEM(R15)) + #endif + 
SUBITER(1) + #ifdef PREFETCH_A_NEXT + PREFETCH(2, MEM(R14, 64)) + #endif + SUB(RDI, IMM(1)) + SUBITER(2) + #ifdef PREFETCH_B_NEXT + PREFETCH(2, MEM(R15, 64)) + #endif + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(R14,128)) + #endif + #ifdef PREFETCH_B_NEXT + LEA(R15, MEM(R15,64)) + #endif + + JNZ(LOOP1) + + LABEL(K_PREFETCH_C) + +#ifdef PREFETCH_C + ADD(RDI, IMM(4)) + JLE(K_TAIL_NITER) + + LOOP_ALIGN + LABEL(LOOP2) + SUBITER(0) + PREFETCH(0, MEM(R9)) + SUBITER(1) + PREFETCH(0, MEM(R9, 64)) + SUB(RDI, IMM(1)) + PREFETCH(0, MEM(R9,128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + LEA(R9, MEM(R9,R12,1)) + JNZ(LOOP2) + + LABEL(K_TAIL_NITER) + + ADD(RDI, IMM(0+TAIL_NITER)) + JLE(TAIL) + + LOOP_ALIGN + LABEL(LOOP4) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + + JNZ(LOOP4) + +#endif //PREFETCH_C + + LABEL(TAIL) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + SUB(RSI, IMM(1)) + SUBITER(0) + LEA(RAX, MEM(RAX,4*16)) + LEA(RBX, MEM(RBX,12*16)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + + /******************************************************/ + /* Permute imag component register. 
Shuffle even */ + /* and odd components */ + /* SRC: ZMM8 =(Ai0*Br0, Ai0*Bi0, Ai0*Br1, Ai0*Bi1, ..)*/ + /* DST: ZMM8 =(Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1, ..)*/ + /******************************************************/ + VPERMILPD(ZMM8 , ZMM8 , IMM(0x55)) + VPERMILPD(ZMM9 , ZMM9 , IMM(0x55)) + VPERMILPD(ZMM10, ZMM10, IMM(0x55)) + VPERMILPD(ZMM14, ZMM14, IMM(0x55)) + VPERMILPD(ZMM15, ZMM15, IMM(0x55)) + VPERMILPD(ZMM16, ZMM16, IMM(0x55)) + VPERMILPD(ZMM20, ZMM20, IMM(0x55)) + VPERMILPD(ZMM21, ZMM21, IMM(0x55)) + VPERMILPD(ZMM22, ZMM22, IMM(0x55)) + VPERMILPD(ZMM26, ZMM26, IMM(0x55)) + VPERMILPD(ZMM27, ZMM27, IMM(0x55)) + VPERMILPD(ZMM28, ZMM28, IMM(0x55)) + + /*******************************************************/ + /* SRC: ZMM5 = (Ar0*Br0, Ar0*Bi0, Ar0*Br1, Ar0*Bi1, ..)*/ + /* SRC: ZMM8 = (Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1, ..)*/ + /* DST: ZMM8 =(Ar0*Br0-Ai0*Bi0, Ai0*Br0+Ar0*Bi0, */ + /* Ar0*Br1-Ai0*Bi1, Ai0*Br1+Ar0*Bi1, ..) */ + /*******************************************************/ + MOV(R8, VAR(one_addr)) + VBROADCASTSD(ZMM(31), MEM(R8)) + VFMADDSUB132PD(ZMM(5) , ZMM(8) , ZMM(31)) + VFMADDSUB132PD(ZMM(6) , ZMM(9) , ZMM(31)) + VFMADDSUB132PD(ZMM(7) , ZMM(10), ZMM(31)) + + VFMADDSUB132PD(ZMM(11), ZMM(14), ZMM(31)) + VFMADDSUB132PD(ZMM(12), ZMM(15), ZMM(31)) + VFMADDSUB132PD(ZMM(13), ZMM(16), ZMM(31)) + + VFMADDSUB132PD(ZMM(17), ZMM(20), ZMM(31)) + VFMADDSUB132PD(ZMM(18), ZMM(21), ZMM(31)) + VFMADDSUB132PD(ZMM(19), ZMM(22), ZMM(31)) + + VFMADDSUB132PD(ZMM(23), ZMM(26), ZMM(31)) + VFMADDSUB132PD(ZMM(24), ZMM(27), ZMM(31)) + VFMADDSUB132PD(ZMM(25), ZMM(28), ZMM(31)) + + MOV(RAX, VAR(alpha)) + VBROADCASTSD(ZMM(0), MEM(RAX)) + VBROADCASTSD(ZMM(1), MEM(RAX, 8)) + MOV(RDX, RCX) + MOV(RDI, IMM(12*16)) + + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + ADD(RDX, RDI) + + /*****************************/ + /* gemm_output -= C * alpha */ + /*****************************/ + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), 
ZMM(14)) + VSUBPD(ZMM(5), ZMM(14), ZMM(5)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(6), ZMM(15), ZMM(6)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(7), ZMM(16), ZMM(7)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + ADD(RDX, RDI) + + + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), ZMM(14)) + VSUBPD(ZMM(11), ZMM(14), ZMM(11)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(12), ZMM(15), ZMM(12)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(13), ZMM(16), ZMM(13)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + ADD(RDX, RDI) + + + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), ZMM(14)) + VSUBPD(ZMM(17), ZMM(14), ZMM(17)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(18), ZMM(15), ZMM(18)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(19), ZMM(16), ZMM(19)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + + + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), ZMM(14)) + VSUBPD(ZMM(23), ZMM(14), ZMM(23)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(24), ZMM(15), ZMM(24)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(25), ZMM(16), ZMM(25)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + + + //REGION - TRSM + + MOV(RAX, VAR(a11)) + //iteration 0 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (0+0*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (0+0*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + /*****************/ + /* C = C * A11 */ + /*****************/ + SCALE_REG(ZMM(5), ZMM(0), ZMM(1), ZMM(5)) + SCALE_REG(ZMM(6), ZMM(0), ZMM(1), ZMM(6)) + SCALE_REG(ZMM(7), ZMM(0), ZMM(1), ZMM(7)) + #else + /**************************************************************/ + /* C = C / A11 */ + /* */ + /* Let C / A11 = (a + ib) / (c + id) 
= */ + /* ((ac + bd) / (c^2 + d^2)) + i ((bc - ad) / (c^2+d^2)) */ + /**************************************************************/ + VBROADCASTSD(ZMM(2), MEM(R11)) // -1 + VMULPD(ZMM(8), ZMM(0), ZMM(0)) // c*c + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) // c*c + d*d + + DIVIDE_COMPLEX(ZMM(5), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(6), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(7), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(5)) + VMOVUPD(MEM(RCX, 4*16), ZMM(6)) + VMOVUPD(MEM(RCX, 8*16), ZMM(7)) + ADD(RCX, RDI) + + //iteration 1 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (1+0*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (1+0*4)*16+8)) + SCALE_REG(ZMM(5), ZMM(0), ZMM(1), ZMM(14)) + SCALE_REG(ZMM(6), ZMM(0), ZMM(1), ZMM(15)) + SCALE_REG(ZMM(7), ZMM(0), ZMM(1), ZMM(16)) + + VSUBPD(ZMM(11), ZMM(11), ZMM(14)) + VSUBPD(ZMM(12), ZMM(12), ZMM(15)) + VSUBPD(ZMM(13), ZMM(13), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (1+1*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (1+1*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(11), ZMM(0), ZMM(1), ZMM(11)) + SCALE_REG(ZMM(12), ZMM(0), ZMM(1), ZMM(12)) + SCALE_REG(ZMM(13), ZMM(0), ZMM(1), ZMM(13)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + DIVIDE_COMPLEX(ZMM(11), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(12), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(13), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(11)) + VMOVUPD(MEM(RCX, 4*16), ZMM(12)) + VMOVUPD(MEM(RCX, 8*16), ZMM(13)) + ADD(RCX, RDI) + + //iteration 2 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (2+0*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (2+0*4)*16+8)) + SCALE_REG(ZMM(5), ZMM(0), ZMM(1), ZMM(14)) + SCALE_REG(ZMM(6), ZMM(0), ZMM(1), ZMM(15)) + SCALE_REG(ZMM(7), ZMM(0), ZMM(1), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (2+1*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (2+1*4)*16+8)) + SCALE_REG(ZMM(11), 
ZMM(0), ZMM(1), ZMM(20)) + SCALE_REG(ZMM(12), ZMM(0), ZMM(1), ZMM(21)) + SCALE_REG(ZMM(13), ZMM(0), ZMM(1), ZMM(22)) + VADDPD(ZMM(14), ZMM(14), ZMM(20)) + VADDPD(ZMM(15), ZMM(15), ZMM(21)) + VADDPD(ZMM(16), ZMM(16), ZMM(22)) + + VSUBPD(ZMM(17), ZMM(17), ZMM(14)) + VSUBPD(ZMM(18), ZMM(18), ZMM(15)) + VSUBPD(ZMM(19), ZMM(19), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (2+2*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (2+2*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(17), ZMM(0), ZMM(1), ZMM(17)) + SCALE_REG(ZMM(18), ZMM(0), ZMM(1), ZMM(18)) + SCALE_REG(ZMM(19), ZMM(0), ZMM(1), ZMM(19)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + DIVIDE_COMPLEX(ZMM(17), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(18), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(19), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(17)) + VMOVUPD(MEM(RCX, 4*16), ZMM(18)) + VMOVUPD(MEM(RCX, 8*16), ZMM(19)) + ADD(RCX, RDI) + + //iteration 3 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (3+0*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (3+0*4)*16+8)) + SCALE_REG(ZMM(5), ZMM(0), ZMM(1), ZMM(14)) + SCALE_REG(ZMM(6), ZMM(0), ZMM(1), ZMM(15)) + SCALE_REG(ZMM(7), ZMM(0), ZMM(1), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (3+1*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (3+1*4)*16+8)) + SCALE_REG(ZMM(11), ZMM(0), ZMM(1), ZMM(20)) + SCALE_REG(ZMM(12), ZMM(0), ZMM(1), ZMM(21)) + SCALE_REG(ZMM(13), ZMM(0), ZMM(1), ZMM(22)) + VADDPD(ZMM(14), ZMM(14), ZMM(20)) + VADDPD(ZMM(15), ZMM(15), ZMM(21)) + VADDPD(ZMM(16), ZMM(16), ZMM(22)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (3+2*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (3+2*4)*16+8)) + SCALE_REG(ZMM(17), ZMM(0), ZMM(1), ZMM(20)) + SCALE_REG(ZMM(18), ZMM(0), ZMM(1), ZMM(21)) + SCALE_REG(ZMM(19), ZMM(0), ZMM(1), ZMM(22)) + VADDPD(ZMM(14), ZMM(14), ZMM(20)) + VADDPD(ZMM(15), ZMM(15), ZMM(21)) + VADDPD(ZMM(16), ZMM(16), ZMM(22)) + + VSUBPD(ZMM(23), ZMM(23), 
ZMM(14)) + VSUBPD(ZMM(24), ZMM(24), ZMM(15)) + VSUBPD(ZMM(25), ZMM(25), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (3+3*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (3+3*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(23), ZMM(0), ZMM(1), ZMM(23)) + SCALE_REG(ZMM(24), ZMM(0), ZMM(1), ZMM(24)) + SCALE_REG(ZMM(25), ZMM(0), ZMM(1), ZMM(25)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + DIVIDE_COMPLEX(ZMM(23), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(24), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(25), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(23)) + VMOVUPD(MEM(RCX, 4*16), ZMM(24)) + VMOVUPD(MEM(RCX, 8*16), ZMM(25)) + +// ENDREGION - TRSM + + MOV(RCX, VAR(c11)) + CMP(R10, IMM(16)) //CS == 1 IMPLIES ROW STORED + JNZ(.ZCOLSTORED) + + LABEL(.ZROWSTORED) + VMOVUPD(MEM(RCX ), ZMM(5)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(6)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(7)) + ADD(RCX, R12) + + VMOVUPD(MEM(RCX ), ZMM(11)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(12)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(13)) + ADD(RCX, R12) + + VMOVUPD(MEM(RCX ), ZMM(17)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(18)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(19)) + ADD(RCX, R12) + + VMOVUPD(MEM(RCX ), ZMM(23)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(24)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(25)) + + JMP(.ZDONE) + + LABEL(.ZCOLSTORED) + LEA(R11, MEM(R10, R10, 2)) + MOV(RDX, RCX) + ADD(RCX, R12) + STORE_REG_GEN(5) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(6) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(7) + + MOV(RDX, RCX) + ADD(RCX, R12) + STORE_REG_GEN(11) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(12) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(13) + + MOV(RDX, RCX) + ADD(RCX, R12) + STORE_REG_GEN(17) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(18) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(19) + + MOV(RDX, RCX) + STORE_REG_GEN(23) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(24) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(25) + + LABEL(.ZDONE) + 
VZEROUPPER() + + END_ASM + ( + : // output operands (none) + : // input operands + [a10] "m" (a10), + [k] "m" (k), + [b01] "m" (b01), + [a11] "m" (a11), + [b11] "m" (b11), + [c11] "m" (c11), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [alpha] "m" (alpha), + [neg_one_addr] "m" (neg_one_addr), + [one_addr] "m" (one_addr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", + "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", + "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", + "xmm14", "xmm15", "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", + "xmm21", "xmm22", "xmm23", "xmm24", "xmm25", "xmm26", + "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c b/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c new file mode 100644 index 0000000000..bb2017f5bb --- /dev/null +++ b/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c @@ -0,0 +1,715 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc.All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_x86_asm_macros.h" + +#define A_L1_PREFETCH_DIST 6 +#define B_L1_PREFETCH_DIST 6 +#define TAIL_NITER 7 +// #define PREFETCH_A +#define PREFETCH_B +// #define PREFETCH_A_NEXT +#define PREFETCH_B_NEXT +#define PREFETCH_C // prefetch c in middle loop over 4 iterations of k + + +#ifdef PREFETCH_A + #define PREFETCH_A_L1(n) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*4*16 + 4*n*16)) +#else + #define PREFETCH_A_L1(n) +#endif + +#ifdef PREFETCH_B + #define PREFETCH_B_L1(n, k) \ + PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*12*16 + (12*n+(4*k))*16)) +#else + #define PREFETCH_B_L1(n, k) +#endif + + +/* + * A Registers: ZMM3, ZMM4, ZMM29, ZMM30 + * B Registers: ZMM0, ZMM1, ZMM2 + * C Registers: ZMM[5-28] + */ + +#define LOOP_ALIGN ALIGN32 + +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n)\ + VBROADCASTSD(ZMM(3), MEM(RAX, (8*n+2)*8)) \ + VFMADD231PD(ZMM(5) , ZMM(0), ZMM(29)) \ + VFMADD231PD(ZMM(6) , ZMM(1), ZMM(29)) \ + VFMADD231PD(ZMM(7) , ZMM(2), ZMM(29)) \ + VBROADCASTSD(ZMM(4), MEM(RAX, 
(8*n+3)*8)) \ + VFMADD231PD(ZMM(8) , ZMM(0), ZMM(30)) \ + VFMADD231PD(ZMM(9) , ZMM(1), ZMM(30)) \ + VFMADD231PD(ZMM(10), ZMM(2), ZMM(30)) \ + \ + PREFETCH_B_L1(n, 0)\ + VBROADCASTSD(ZMM(29), MEM(RAX, (8*n+4)*8)) \ + VFMADD231PD(ZMM(11), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(12), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(13), ZMM(2), ZMM(3)) \ + VBROADCASTSD(ZMM(30), MEM(RAX, (8*n+5)*8)) \ + VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \ + VFMADD231PD(ZMM(16), ZMM(2), ZMM(4)) \ + \ + PREFETCH_B_L1(n, 1)\ + VBROADCASTSD(ZMM(3), MEM(RAX, (8*n+6)*8)) \ + VFMADD231PD(ZMM(17), ZMM(0), ZMM(29)) \ + VFMADD231PD(ZMM(18), ZMM(1), ZMM(29)) \ + VFMADD231PD(ZMM(19), ZMM(2), ZMM(29)) \ + VBROADCASTSD(ZMM(4), MEM(RAX, (8*n+7)*8)) \ + VFMADD231PD(ZMM(20), ZMM(0), ZMM(30)) \ + VFMADD231PD(ZMM(21), ZMM(1), ZMM(30)) \ + VFMADD231PD(ZMM(22), ZMM(2), ZMM(30)) \ + \ + PREFETCH_B_L1(n, 2)\ + VBROADCASTSD(ZMM(29), MEM(RAX, (8*n+8)*8)) \ + VFMADD231PD(ZMM(23), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(24), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(25), ZMM(2), ZMM(3)) \ + VBROADCASTSD(ZMM(30), MEM(RAX, (8*n+9)*8)) \ + VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \ + VFMADD231PD(ZMM(28), ZMM(2), ZMM(4)) \ + \ + VMOVAPD(ZMM(0), MEM(RBX, (12*n+0)*16)) \ + VMOVAPD(ZMM(1), MEM(RBX, (12*n+4)*16)) \ + VMOVAPD(ZMM(2), MEM(RBX, (12*n+8)*16)) + +#define SCALE_REG(a, b, c, out) \ + VPERMILPD(ZMM(3), a, IMM(0x55)) \ + VMULPD(out, a, b) \ + VMULPD(ZMM(3), ZMM(3), c) \ + VFMADDSUB132PD(out, ZMM(3), ZMM(31)) \ + +#define DIVIDE_COMPLEX(R1, c, d, csq_dsq) \ + VPERMILPD(ZMM(3), R1, IMM(0x55)) \ + VMULPD(R1, R1, c) \ + VMULPD(ZMM(3), ZMM(3), d) \ + VMULPD(ZMM(3), ZMM(3), ZMM(2)) \ + VFMADDSUB132PD(R1, ZMM(3), ZMM(31)) \ + VDIVPD(R1, R1, csq_dsq) \ + +#define STORE_REG_GEN(reg) \ + VEXTRACTF64X2(XMM(27), ZMM(reg), IMM(0x1)) \ + VEXTRACTF64X2(XMM(28), ZMM(reg), IMM(0x2)) \ + VEXTRACTF64X2(XMM(29), ZMM(reg), IMM(0x3)) \ + VMOVUPD(MEM(RDX) , XMM(reg)) \ + 
VMOVUPD(MEM(RDX, R10, 1), XMM(27)) \ + VMOVUPD(MEM(RDX, R10, 2), XMM(28)) \ + VMOVUPD(MEM(RDX, R11, 1), XMM(29)) \ + + +/**********************************************************/ +/* Kernel : bli_zgemmtrsm_u_zen4_asm_4x12 */ +/* Solves A11 * X = alpha * B11 - A10 * B01; X -> B11, C11 */ +/* It is row preferred kernel, A and B are packed */ +/* C could be Row/Col/Gen Stored Matrix */ +/* Registers are allocated as below */ +/* Broadcast A : ZMM(3, 4, 29, 30) */ +/* load B : ZMM(0, 1, 2) */ +/* Accumulation of B(real,imag)*Areal : */ +/* ZMM(5-7 , 11-13, 17-19, 23-25) */ +/* Accumulation of B(real,imag)*Aimag : */ +/* ZMM(8-10, 14-16, 20-22, 26-28) */ +/* Computation of A(real,imag)*B(real,imag): */ +/* ZMM(5-7 , 11-13, 17-19, 23-25) */ +/**********************************************************/ +void bli_zgemmtrsm_u_zen4_asm_4x12( + dim_t k_, + dcomplex* restrict alpha, + dcomplex* restrict a10, + dcomplex* restrict a11, + dcomplex* restrict b01, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); + const int64_t k = k_; + /*rowstride * size of one dcomplex element*/ + const int64_t rs_c = rs_c_*16; + /*colstride * size of one dcomplex element*/ + const int64_t cs_c = cs_c_*16; + double one = 1; // used for FMADDSUB instruction + double neg_one = -1; // used for complex division + double *one_addr = &one; + double *neg_one_addr = &neg_one; + + BEGIN_ASM() + + VXORPD(XMM(5) , XMM(5) , XMM(5) ) + VXORPD(XMM(6) , XMM(6) , XMM(6) ) + VXORPD(XMM(7) , XMM(7) , XMM(7) ) + VXORPD(XMM(8) , XMM(8) , XMM(8) ) + VXORPD(XMM(9) , XMM(9) , XMM(9) ) + VXORPD(XMM(10), XMM(10), XMM(10)) + VXORPD(XMM(11), XMM(11), XMM(11)) + VXORPD(XMM(12), XMM(12), XMM(12)) + VXORPD(XMM(13), XMM(13), XMM(13)) + VXORPD(XMM(14), XMM(14), XMM(14)) + VXORPD(XMM(15), XMM(15), XMM(15)) + VXORPD(XMM(16), XMM(16), XMM(16)) + VXORPD(XMM(17), XMM(17), XMM(17)) + VXORPD(XMM(18), XMM(18), 
XMM(18)) + VXORPD(XMM(19), XMM(19), XMM(19)) + VXORPD(XMM(20), XMM(20), XMM(20)) + VXORPD(XMM(21), XMM(21), XMM(21)) + VXORPD(XMM(22), XMM(22), XMM(22)) + VXORPD(XMM(23), XMM(23), XMM(23)) + VXORPD(XMM(24), XMM(24), XMM(24)) + VXORPD(XMM(25), XMM(25), XMM(25)) + VXORPD(XMM(26), XMM(26), XMM(26)) + VXORPD(XMM(27), XMM(27), XMM(27)) + VXORPD(XMM(28), XMM(28), XMM(28)) + + MOV(RSI, VAR(k)) //loop index + MOV(RAX, VAR(a10)) //load address of a10 + MOV(RBX, VAR(b01)) //load address of b01 + MOV(RCX, VAR(b11)) //load address of b11 + MOV(R9, VAR(c11)) //load address of c11 + MOV(R11, VAR(neg_one_addr)) + + #ifdef PREFETCH_C + LEA(R9, MEM(R9, 63)) // c for prefetch, first cache line + LEA(R8, MEM(R9, 128)) // c for prefetch, second cache line + #endif + + + VMOVAPD(ZMM(0), MEM(RBX, 0*16)) //pre-load b + VMOVAPD(ZMM(1), MEM(RBX, 4*16)) //pre-load b + VMOVAPD(ZMM(2), MEM(RBX, 8*16)) //pre-load b + VBROADCASTSD(ZMM(29), MEM(RAX, 0)) + VBROADCASTSD(ZMM(30), MEM(RAX, 8)) + LEA(RBX, MEM(RBX, 12*16)) //adjust b for pre-load + + MOV(R12, VAR(rs_c)) + MOV(R10, VAR(cs_c)) + + #if defined PREFETCH_A_NEXT || defined PREFETCH_B_NEXT + MOV(RDI, RSI) + IMUL(RDI, IMM(16*4)) // rdi = k * 16*4 + #endif + + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(RAX, RDI, 1)) // r14(a_next) = A + (k*16*4) + #endif + + #ifdef PREFETCH_B_NEXT + IMUL(RDI, IMM(3)) // rdi = k * 16*12 + LEA(R15, MEM(RBX, RDI, 1)) // r15(b_next) = B + (k*16*12) + #endif + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + /************************************************************/ + /* Operation: */ + /* SUBITER = (Ar, Ai)*(Br, Bi) = Ar*(Br, Bi) , Ai*(Br, Bi) */ + /* C_PREFETCH loop count: */ + /* LOOP1: k/4 - TAIL_NITER - 4 */ + /* LOOP2: 4 */ + /* LOOP4: TAIL_NITER */ + /* TAIL_LOOP: k%4 */ + /* */ + /* No prefetch loop count: */ + /* LOOP1: k/4 */ + /* TAIL_LOOP: k%4 */ + /************************************************************/ + #ifdef PREFETCH_C + /* prefetch c over 4 iterations of k*/ + SUB(RDI, IMM(4+TAIL_NITER)) + 
#endif + JLE(K_PREFETCH_C) + + LOOP_ALIGN + LABEL(LOOP1) + #ifdef PREFETCH_A_NEXT + PREFETCH(1, MEM(R14)) + #endif + SUBITER(0) + #ifdef PREFETCH_B_NEXT + PREFETCH(1, MEM(R15)) + #endif + SUBITER(1) + #ifdef PREFETCH_A_NEXT + PREFETCH(2, MEM(R14, 64)) + #endif + SUB(RDI, IMM(1)) + SUBITER(2) + #ifdef PREFETCH_B_NEXT + PREFETCH(2, MEM(R15, 64)) + #endif + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + #ifdef PREFETCH_A_NEXT + LEA(R14, MEM(R14,128)) + #endif + #ifdef PREFETCH_B_NEXT + LEA(R15, MEM(R15,64)) + #endif + + JNZ(LOOP1) + + LABEL(K_PREFETCH_C) + +#ifdef PREFETCH_C + ADD(RDI, IMM(4)) + JLE(K_TAIL_NITER) + + LOOP_ALIGN + LABEL(LOOP2) + SUBITER(0) + PREFETCH(0, MEM(R9)) + SUBITER(1) + PREFETCH(0, MEM(R9, 64)) + SUB(RDI, IMM(1)) + PREFETCH(0, MEM(R9,128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + LEA(R9, MEM(R9,R12,1)) + JNZ(LOOP2) + + LABEL(K_TAIL_NITER) + + ADD(RDI, IMM(0+TAIL_NITER)) + JLE(TAIL) + + LOOP_ALIGN + LABEL(LOOP4) + + SUBITER(0) + SUBITER(1) + SUB(RDI, IMM(1)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,4*4*16)) + LEA(RBX, MEM(RBX,4*12*16)) + + JNZ(LOOP4) + +#endif //PREFETCH_C + + LABEL(TAIL) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + SUB(RSI, IMM(1)) + SUBITER(0) + LEA(RAX, MEM(RAX,4*16)) + LEA(RBX, MEM(RBX,12*16)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + + /******************************************************/ + /* Permute imag component register. 
Shuffle even */ + /* and odd components */ + /* SRC: ZMM8 =(Ai0*Br0, Ai0*Bi0, Ai0*Br1, Ai0*Bi1, ..)*/ + /* DST: ZMM8 =(Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1, ..)*/ + /******************************************************/ + VPERMILPD(ZMM8 , ZMM8 , IMM(0x55)) + VPERMILPD(ZMM9 , ZMM9 , IMM(0x55)) + VPERMILPD(ZMM10, ZMM10, IMM(0x55)) + VPERMILPD(ZMM14, ZMM14, IMM(0x55)) + VPERMILPD(ZMM15, ZMM15, IMM(0x55)) + VPERMILPD(ZMM16, ZMM16, IMM(0x55)) + VPERMILPD(ZMM20, ZMM20, IMM(0x55)) + VPERMILPD(ZMM21, ZMM21, IMM(0x55)) + VPERMILPD(ZMM22, ZMM22, IMM(0x55)) + VPERMILPD(ZMM26, ZMM26, IMM(0x55)) + VPERMILPD(ZMM27, ZMM27, IMM(0x55)) + VPERMILPD(ZMM28, ZMM28, IMM(0x55)) + + /*******************************************************/ + /* SRC: ZMM5 = (Ar0*Br0, Ar0*Bi0, Ar0*Br1, Ar0*Bi1, ..)*/ + /* SRC: ZMM8 = (Ai0*Bi0, Ai0*Br0, Ai0*Bi1, Ai0*Br1, ..)*/ + /* DST: ZMM8 =(Ar0*Br0-Ai0*Bi0, Ai0*Br0+Ar0*Bi0, */ + /* Ar0*Br1-Ai0*Bi1, Ai0*Br1+Ar0*Bi1, ..) */ + /*******************************************************/ + MOV(R8, VAR(one_addr)) + VBROADCASTSD(ZMM(31), MEM(R8)) + VFMADDSUB132PD(ZMM(5) , ZMM(8) , ZMM(31)) + VFMADDSUB132PD(ZMM(6) , ZMM(9) , ZMM(31)) + VFMADDSUB132PD(ZMM(7) , ZMM(10), ZMM(31)) + + VFMADDSUB132PD(ZMM(11), ZMM(14), ZMM(31)) + VFMADDSUB132PD(ZMM(12), ZMM(15), ZMM(31)) + VFMADDSUB132PD(ZMM(13), ZMM(16), ZMM(31)) + + VFMADDSUB132PD(ZMM(17), ZMM(20), ZMM(31)) + VFMADDSUB132PD(ZMM(18), ZMM(21), ZMM(31)) + VFMADDSUB132PD(ZMM(19), ZMM(22), ZMM(31)) + + VFMADDSUB132PD(ZMM(23), ZMM(26), ZMM(31)) + VFMADDSUB132PD(ZMM(24), ZMM(27), ZMM(31)) + VFMADDSUB132PD(ZMM(25), ZMM(28), ZMM(31)) + + MOV(RAX, VAR(alpha)) + VBROADCASTSD(ZMM(0), MEM(RAX)) + VBROADCASTSD(ZMM(1), MEM(RAX, 8)) + MOV(RDX, RCX) + MOV(RDI, IMM(12*16)) + + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + ADD(RDX, RDI) + + /*****************************/ + /* gemm_output -= C * alpha */ + /*****************************/ + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), 
ZMM(14)) + VSUBPD(ZMM(5), ZMM(14), ZMM(5)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(6), ZMM(15), ZMM(6)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(7), ZMM(16), ZMM(7)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + ADD(RDX, RDI) + + + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), ZMM(14)) + VSUBPD(ZMM(11), ZMM(14), ZMM(11)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(12), ZMM(15), ZMM(12)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(13), ZMM(16), ZMM(13)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + ADD(RDX, RDI) + + + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), ZMM(14)) + VSUBPD(ZMM(17), ZMM(14), ZMM(17)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(18), ZMM(15), ZMM(18)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(19), ZMM(16), ZMM(19)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + + + SCALE_REG(ZMM(14) , ZMM(0), ZMM(1), ZMM(14)) + VSUBPD(ZMM(23), ZMM(14), ZMM(23)) + VMOVUPD(ZMM(14), MEM(RDX, 0*16)) + + SCALE_REG(ZMM(15) , ZMM(0), ZMM(1), ZMM(15)) + VSUBPD(ZMM(24), ZMM(15), ZMM(24)) + VMOVUPD(ZMM(15), MEM(RDX, 4*16)) + + SCALE_REG(ZMM(16) , ZMM(0), ZMM(1), ZMM(16)) + VSUBPD(ZMM(25), ZMM(16), ZMM(25)) + VMOVUPD(ZMM(16), MEM(RDX, 8*16)) + + + //REGION - TRSM + + MOV(RAX, VAR(a11)) + LEA(RCX, MEM(RCX, RDI, 2)) + ADD(RCX, RDI) + //iteration 0 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (3+3*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (3+3*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(23), ZMM(0), ZMM(1), ZMM(23)) + SCALE_REG(ZMM(24), ZMM(0), ZMM(1), ZMM(24)) + SCALE_REG(ZMM(25), ZMM(0), ZMM(1), ZMM(25)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + DIVIDE_COMPLEX(ZMM(23), ZMM(0), 
ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(24), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(25), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(23)) + VMOVUPD(MEM(RCX, 4*16), ZMM(24)) + VMOVUPD(MEM(RCX, 8*16), ZMM(25)) + SUB(RCX, RDI) + + //iteration 1 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (2+3*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (2+3*4)*16+8)) + SCALE_REG(ZMM(23), ZMM(0), ZMM(1), ZMM(14)) + SCALE_REG(ZMM(24), ZMM(0), ZMM(1), ZMM(15)) + SCALE_REG(ZMM(25), ZMM(0), ZMM(1), ZMM(16)) + + VSUBPD(ZMM(17), ZMM(17), ZMM(14)) + VSUBPD(ZMM(18), ZMM(18), ZMM(15)) + VSUBPD(ZMM(19), ZMM(19), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (2+2*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (2+2*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(17), ZMM(0), ZMM(1), ZMM(17)) + SCALE_REG(ZMM(18), ZMM(0), ZMM(1), ZMM(18)) + SCALE_REG(ZMM(19), ZMM(0), ZMM(1), ZMM(19)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + DIVIDE_COMPLEX(ZMM(17), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(18), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(19), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(17)) + VMOVUPD(MEM(RCX, 4*16), ZMM(18)) + VMOVUPD(MEM(RCX, 8*16), ZMM(19)) + SUB(RCX, RDI) + + //iteration 2 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (1+3*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (1+3*4)*16+8)) + SCALE_REG(ZMM(23), ZMM(0), ZMM(1), ZMM(14)) + SCALE_REG(ZMM(24), ZMM(0), ZMM(1), ZMM(15)) + SCALE_REG(ZMM(25), ZMM(0), ZMM(1), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (1+2*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (1+2*4)*16+8)) + SCALE_REG(ZMM(17), ZMM(0), ZMM(1), ZMM(20)) + SCALE_REG(ZMM(18), ZMM(0), ZMM(1), ZMM(21)) + SCALE_REG(ZMM(19), ZMM(0), ZMM(1), ZMM(22)) + VADDPD(ZMM(14), ZMM(14), ZMM(20)) + VADDPD(ZMM(15), ZMM(15), ZMM(21)) + VADDPD(ZMM(16), ZMM(16), ZMM(22)) + + VSUBPD(ZMM(11), ZMM(11), ZMM(14)) + VSUBPD(ZMM(12), 
ZMM(12), ZMM(15)) + VSUBPD(ZMM(13), ZMM(13), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (1+1*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (1+1*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(11), ZMM(0), ZMM(1), ZMM(11)) + SCALE_REG(ZMM(12), ZMM(0), ZMM(1), ZMM(12)) + SCALE_REG(ZMM(13), ZMM(0), ZMM(1), ZMM(13)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + DIVIDE_COMPLEX(ZMM(11), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(12), ZMM(0), ZMM(1), ZMM(8)) + DIVIDE_COMPLEX(ZMM(13), ZMM(0), ZMM(1), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(11)) + VMOVUPD(MEM(RCX, 4*16), ZMM(12)) + VMOVUPD(MEM(RCX, 8*16), ZMM(13)) + SUB(RCX, RDI) + + //iteration 3 ----------------------------------- + VBROADCASTSD(ZMM(0), MEM(RAX, (0+3*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (0+3*4)*16+8)) + SCALE_REG(ZMM(23), ZMM(0), ZMM(1), ZMM(14)) + SCALE_REG(ZMM(24), ZMM(0), ZMM(1), ZMM(15)) + SCALE_REG(ZMM(25), ZMM(0), ZMM(1), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (0+2*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (0+2*4)*16+8)) + SCALE_REG(ZMM(17), ZMM(0), ZMM(1), ZMM(20)) + SCALE_REG(ZMM(18), ZMM(0), ZMM(1), ZMM(21)) + SCALE_REG(ZMM(19), ZMM(0), ZMM(1), ZMM(22)) + VADDPD(ZMM(14), ZMM(14), ZMM(20)) + VADDPD(ZMM(15), ZMM(15), ZMM(21)) + VADDPD(ZMM(16), ZMM(16), ZMM(22)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (0+1*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (0+1*4)*16+8)) + SCALE_REG(ZMM(11), ZMM(0), ZMM(1), ZMM(20)) + SCALE_REG(ZMM(12), ZMM(0), ZMM(1), ZMM(21)) + SCALE_REG(ZMM(13), ZMM(0), ZMM(1), ZMM(22)) + VADDPD(ZMM(14), ZMM(14), ZMM(20)) + VADDPD(ZMM(15), ZMM(15), ZMM(21)) + VADDPD(ZMM(16), ZMM(16), ZMM(22)) + + VSUBPD(ZMM(5), ZMM(5), ZMM(14)) + VSUBPD(ZMM(6), ZMM(6), ZMM(15)) + VSUBPD(ZMM(7), ZMM(7), ZMM(16)) + + VBROADCASTSD(ZMM(0), MEM(RAX, (0+0*4)*16+0)) + VBROADCASTSD(ZMM(1), MEM(RAX, (0+0*4)*16+8)) + #ifdef BLIS_ENABLE_TRSM_PREINVERSION + SCALE_REG(ZMM(5), ZMM(0), ZMM(1), ZMM(5)) + SCALE_REG(ZMM(6), 
ZMM(0), ZMM(1), ZMM(6)) + SCALE_REG(ZMM(7), ZMM(0), ZMM(1), ZMM(7)) + #else + VBROADCASTSD(ZMM(2), MEM(R11)) + VMULPD(ZMM(8), ZMM(0), ZMM(0)) + VFMADD231PD(ZMM(8), ZMM(1), ZMM(1)) + + VPERMILPD(ZMM(3), ZMM(5), IMM(0x55)) + VMULPD(ZMM(5), ZMM(5), ZMM(0)) + VMULPD(ZMM(3), ZMM(3), ZMM(1)) + VMULPD(ZMM(3), ZMM(3), ZMM(2)) + VFMADDSUB132PD(ZMM(5), ZMM(3), ZMM(31)) + VDIVPD(ZMM(5), ZMM(5), ZMM(8)) + + VPERMILPD(ZMM(3), ZMM(6), IMM(0x55)) + VMULPD(ZMM(6), ZMM(6), ZMM(0)) + VMULPD(ZMM(3), ZMM(3), ZMM(1)) + VMULPD(ZMM(3), ZMM(3), ZMM(2)) + VFMADDSUB132PD(ZMM(6), ZMM(3), ZMM(31)) + VDIVPD(ZMM(6), ZMM(6), ZMM(8)) + + VPERMILPD(ZMM(3), ZMM(7), IMM(0x55)) + VMULPD(ZMM(7), ZMM(7), ZMM(0)) + VMULPD(ZMM(3), ZMM(3), ZMM(1)) + VMULPD(ZMM(3), ZMM(3), ZMM(2)) + VFMADDSUB132PD(ZMM(7), ZMM(3), ZMM(31)) + VDIVPD(ZMM(7), ZMM(7), ZMM(8)) + #endif + VMOVUPD(MEM(RCX, 0*16), ZMM(5)) + VMOVUPD(MEM(RCX, 4*16), ZMM(6)) + VMOVUPD(MEM(RCX, 8*16), ZMM(7)) + +// ENDREGION - TRSM + + MOV(RCX, VAR(c11)) + CMP(R10, IMM(16)) //CS == 1 IMPLIES ROW STORED + JNZ(.ZCOLSTORED) + + LABEL(.ZROWSTORED) + VMOVUPD(MEM(RCX ), ZMM(5)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(6)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(7)) + ADD(RCX, R12) + + VMOVUPD(MEM(RCX ), ZMM(11)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(12)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(13)) + ADD(RCX, R12) + + VMOVUPD(MEM(RCX ), ZMM(17)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(18)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(19)) + ADD(RCX, R12) + + VMOVUPD(MEM(RCX ), ZMM(23)) + VMOVUPD(MEM(RCX, R10, 4), ZMM(24)) + VMOVUPD(MEM(RCX, R10, 8), ZMM(25)) + + JMP(.ZDONE) + + LABEL(.ZCOLSTORED) + LEA(R11, MEM(R10, R10, 2)) + MOV(RDX, RCX) + ADD(RCX, R12) + STORE_REG_GEN(5) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(6) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(7) + + MOV(RDX, RCX) + ADD(RCX, R12) + STORE_REG_GEN(11) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(12) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(13) + + MOV(RDX, RCX) + ADD(RCX, R12) + STORE_REG_GEN(17) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(18) 
LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(19) + + MOV(RDX, RCX) + STORE_REG_GEN(23) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(24) LEA(RDX, MEM(RDX, R10, 4)) + STORE_REG_GEN(25) + + LABEL(.ZDONE) + VZEROUPPER() + + END_ASM + ( + : // output operands (none) + : // input operands + [a10] "m" (a10), + [k] "m" (k), + [b01] "m" (b01), + [a11] "m" (a11), + [b11] "m" (b11), + [c11] "m" (c11), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [alpha] "m" (alpha), + [neg_one_addr] "m" (neg_one_addr), + [one_addr] "m" (one_addr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", + "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", + "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", + "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", + "xmm14", "xmm15", "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", + "xmm21", "xmm22", "xmm23", "xmm24", "xmm25", "xmm26", + "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", + "memory" + ) +} \ No newline at end of file diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h index 1757ae3fae..4a25b78572 100644 --- a/kernels/zen4/bli_kernels_zen4.h +++ b/kernels/zen4/bli_kernels_zen4.h @@ -55,6 +55,8 @@ GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_16x14) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_16x14) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen4_asm_8x24) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen4_asm_8x24) +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_l_zen4_asm_4x12) +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_u_zen4_asm_4x12) //packing kernels PACKM_KER_PROT( double, d, packm_zen4_asm_16xk ) @@ -69,6 +71,8 @@ PACKM_KER_PROT( dcomplex, z, packm_zen4_asm_4xk ) GEMM_UKR_PROT( double, d, 
gemm_zen4_asm_32x6 ) GEMM_UKR_PROT( double, d, gemm_zen4_asm_8x24 ) GEMM_UKR_PROT( dcomplex, z, gemm_zen4_asm_12x4 ) +GEMM_UKR_PROT( dcomplex, z, gemm_zen4_asm_4x12 ) + //sgemm rv sup GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x64m_avx512 ) From dd1cf230905e1bb1f3d99a31925d09db8490b2f3 Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Thu, 26 Oct 2023 14:35:05 +0530 Subject: [PATCH 184/226] Gtestsuite Update for Pack and Compute Extension APIs - Pack and compute are now compared against GEMM operation of reference library when MKL is not used as a reference. - For the case where both A and B are unpacked, the reference GEMM is invoked with a unit-alpha scalar. - If MKL is used as reference, then these APIs are compared against pack and compute operations of MKL. - Updated description in ref_gemm_compute.cpp to reflect this behavior. AMD-Internal: [CPUPL-4084] Change-Id: Id0521c9cad8743a7ae471a7f3c547ceb67191f86 --- .../src/level3/ref_gemm_compute.cpp | 68 +++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp index 2b15ffea2b..dd069fcd8a 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp @@ -53,11 +53,20 @@ * Alpha and beta are scalars, and A, B and C are matrices, with A * an m by k matrix, B a k by n matrix and C an m by n matrix, * where either A or B or both may be scaled by alpha and reordered. + * + * NOTE: + * - For MKL comparing against pack and compute APIs. + * - For all other reference libraries (except MKL), we compare the result of + * BLIS pack and compute against the GEMM operation of the reference library. + * In case when both A & B are unpacked, we do not invoke xgemm_pack() thus, + * not computing alpha * X operation. So to handle this case, we pass + * unit-alpha to the reference GEMM. 
* ========================================================================== */ namespace testinghelpers { +#ifdef REF_IS_MKL template void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb, gtint_t m, gtint_t n, gtint_t k, T alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T beta, T* cp, gtint_t ldc) @@ -103,10 +112,10 @@ void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb } else { - throw std::runtime_error("Error in ref_gemm.cpp: Invalid typename is passed function template."); + throw std::runtime_error("Error in ref_gemm_compute.cpp: Invalid typename is passed function template."); } if( !ref_cblas_gemm_compute ) { - throw std::runtime_error("Error in ref_gemm.cpp: Function pointer == 0 -- symbol not found."); + throw std::runtime_error("Error in ref_gemm_compute.cpp: Function pointer == 0 -- symbol not found."); } err_t err = BLIS_SUCCESS; @@ -161,7 +170,7 @@ void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb ref_cblas_gemm_compute( cblas_order, cblas_packed, cblas_transb, m, n, k, aBuffer, lda, bp, ldb, beta, cp, ldc ); - + bli_free_user( aBuffer ); } else if ( ( pckb == 'P' || pckb == 'p' ) ) @@ -181,7 +190,7 @@ void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb ref_cblas_gemm_compute( cblas_order, cblas_transa, cblas_packed, m, n, k, ap, lda, bBuffer, ldb, beta, cp, ldc ); - + bli_free_user( bBuffer ); } else @@ -190,6 +199,57 @@ void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb m, n, k, ap, lda, bp, ldb, beta, cp, ldc ); } } +#else +template +void ref_gemm_compute(char storage, char trnsa, char trnsb, char pcka, char pckb, gtint_t m, gtint_t n, gtint_t k, T alpha, + T* ap, gtint_t lda, T* bp, gtint_t ldb, T beta, T* cp, gtint_t ldc) +{ + // throw std::runtime_error("Error in ref_gemm_compute.cpp: Reference is only defined for MKL. 
Please use MKL as reference library."); + enum CBLAS_ORDER cblas_order; + enum CBLAS_TRANSPOSE cblas_transa; + enum CBLAS_TRANSPOSE cblas_transb; + + char_to_cblas_order( storage, &cblas_order ); + char_to_cblas_trans( trnsa, &cblas_transa ); + char_to_cblas_trans( trnsb, &cblas_transb ); + + using scalar_t = std::conditional_t::is_complex, T&, T>; + typedef void (*Fptr_ref_cblas_gemm)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE, + const f77_int, const f77_int, const f77_int, const scalar_t, const T*, f77_int, + const T*, f77_int, const scalar_t, T*, f77_int); + Fptr_ref_cblas_gemm ref_cblas_gemm; + + // Call C function + /* Check the typename T passed to this function template and call respective function.*/ + if (typeid(T) == typeid(float)) + { + ref_cblas_gemm = (Fptr_ref_cblas_gemm)refCBLASModule.loadSymbol("cblas_sgemm"); + } + else if (typeid(T) == typeid(double)) + { + ref_cblas_gemm = (Fptr_ref_cblas_gemm)refCBLASModule.loadSymbol("cblas_dgemm"); + } + else + { + throw std::runtime_error("Error in ref_gemm.cpp: Invalid typename is passed function template."); + } + if( !ref_cblas_gemm ) { + throw std::runtime_error("Error in ref_gemm.cpp: Function pointer == 0 -- symbol not found."); + } + + if ( ( pcka == 'U' or pcka == 'u' ) && ( pckb == 'U' or pckb == 'u' ) ) + { + T unit_alpha = 1.0; + ref_cblas_gemm( cblas_order, cblas_transa, cblas_transb, + m, n, k, unit_alpha, ap, lda, bp, ldb, beta, cp, ldc ); + } + else + { + ref_cblas_gemm( cblas_order, cblas_transa, cblas_transb, + m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc ); + } +} +#endif // Explicit template instantiations template void ref_gemm_compute(char, char, char, char, char, gtint_t, gtint_t, gtint_t, float, From bed6bd4941e2ea7d558e9b6a6d0aebac26b1a30c Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Mon, 30 Oct 2023 14:35:29 +0530 Subject: [PATCH 185/226] Modified DSCALV AOCL dynamic - AOCL dynamic logic that determines the number of threads to be launched has been 
modified. AMD-Internal: [CPUPL-3956] Change-Id: Ia6c052515bd24e93660f020a7d0894fc75a229fc --- frame/base/bli_rntm.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index ce9a38798d..91d3b5753e 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -1606,14 +1606,22 @@ static void aocl_dscalv_dynamic case BLIS_ARCH_ZEN2: case BLIS_ARCH_ZEN3: - if ( n_elem <= 10000 ) + if ( n_elem <= 30000) *nt_ideal = 1; - else if (n_elem <= 20000) + else if (n_elem <= 100000) *nt_ideal = 2; - else if (n_elem <= 50000) - *nt_ideal = 4; - else + else if (n_elem <= 500000) *nt_ideal = 8; + else if (n_elem <= 4000000) + *nt_ideal = 12; + else if (n_elem <= 2500000) + *nt_ideal = 16; + else if(n_elem <= 7000000) + *nt_ideal = 24; + else if(n_elem <= 10000000) + *nt_ideal = 32; + else + *nt_ideal = 64; break; From 0de10cc86c93185f77dd5a60999c78bf8aa72c13 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Mon, 30 Oct 2023 10:45:03 +0530 Subject: [PATCH 186/226] Added k=1 avx512 dgemm kernel. - This commit implements an avx512 dgemm kernel for k=1 cases, which gets called for the zen4 codepath. - Added architecture check for k=1 kernel in dgemm code path to pick the correct kernel based on cpu architecture, since blis now has both avx2 and avx512 dgemm kernels for the k=1 case. - Previously in the dgemm path the bli_dgemm_8x6_avx2_k1_nn kernel was being called irrespective of architecture type. - Added architecture check before calling the kernel for the case where k=1, so this kernel is invoked only for the respective architectures.
AMD-Internal: [CPUPL-4017] Change-Id: I418bbc933b41db41d323b331c6d89893868a6971 --- frame/compat/bla_gemm_amd.c | 46 +- kernels/zen/3/bli_dgemm_avx2_k1.c | 6 +- kernels/zen/bli_kernels_zen.h | 4 +- kernels/zen4/3/CMakeLists.txt | 1 + kernels/zen4/3/bli_dgemm_avx512_k1.c | 6556 ++++++++++++++++++++++++++ kernels/zen4/bli_kernels_zen4.h | 12 + 6 files changed, 6608 insertions(+), 17 deletions(-) create mode 100644 kernels/zen4/3/bli_dgemm_avx512_k1.c diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index 9d92763666..22338c3247 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -588,18 +588,40 @@ void dgemm_blis_impl if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb)) { - bli_dgemm_8x6_avx2_k1_nn( m0, n0, k0, - (double*)alpha, - (double*)a, *lda, - (double*)b, *ldb, - (double*)beta, - c, *ldc - ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS */ - bli_finalize_auto(); - return; + err_t ret = BLIS_FAILURE; + arch_t arch_id = bli_arch_query_id(); + if(arch_id == BLIS_ARCH_ZEN || + arch_id == BLIS_ARCH_ZEN2 || + arch_id == BLIS_ARCH_ZEN3 ) + { + ret = bli_dgemm_8x6_avx2_k1_nn( m0, n0, k0, + (double*)alpha, + (double*)a, *lda, + (double*)b, *ldb, + (double*)beta, + c, *ldc + ); + } +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) + else if( arch_id == BLIS_ARCH_ZEN4 ) + { + ret = bli_dgemm_24x8_avx512_k1_nn( m0, n0, k0, + (double*)alpha, + (double*)a, *lda, + (double*)b, *ldb, + (double*)beta, + c, *ldc + ); + } +#endif + if(ret == BLIS_SUCCESS) + { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS */ + bli_finalize_auto(); + return; + } } if (n0 == 1) diff --git a/kernels/zen/3/bli_dgemm_avx2_k1.c b/kernels/zen/3/bli_dgemm_avx2_k1.c index b225fdad1a..1e7367002a 100644 --- 
a/kernels/zen/3/bli_dgemm_avx2_k1.c +++ b/kernels/zen/3/bli_dgemm_avx2_k1.c @@ -40,7 +40,7 @@ #define D_MR 8 #define D_NR 6 -void bli_dgemm_8x6_avx2_k1_nn +err_t bli_dgemm_8x6_avx2_k1_nn ( dim_t m, dim_t n, @@ -58,7 +58,7 @@ void bli_dgemm_8x6_avx2_k1_nn alpha_val = *alpha; if((m == 0) || (n == 0) || (((alpha_val == 0.0) || (k == 0)) && (beta_val == 1.0))){ - return; + return BLIS_FAILURE; } dim_t m_remainder = (m % D_MR); @@ -1090,5 +1090,5 @@ void bli_dgemm_8x6_avx2_k1_nn } n_remainder = n_remainder - 2; } - return; + return BLIS_SUCCESS; } diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 1266d33ea6..9e2cf7e24d 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -318,7 +318,7 @@ err_t bli_zgemm_small_At cntl_t* cntl ); -void bli_dgemm_8x6_avx2_k1_nn +err_t bli_dgemm_8x6_avx2_k1_nn ( dim_t m, dim_t n, @@ -445,4 +445,4 @@ void bli_dznorm2fv_unb_var1_avx2 GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_2x6) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_l_zen_asm_2x6) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_u_zen_asm_2x6) \ No newline at end of file +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsm_u_zen_asm_2x6) diff --git a/kernels/zen4/3/CMakeLists.txt b/kernels/zen4/3/CMakeLists.txt index f92f01f2a3..6573f85ed8 100644 --- a/kernels/zen4/3/CMakeLists.txt +++ b/kernels/zen4/3/CMakeLists.txt @@ -14,6 +14,7 @@ add_library(zen4_3 ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_4x12.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_l_4x12.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_u_4x12.c + ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_avx512_k1.c ) target_compile_options(zen4_3 PRIVATE /arch:AVX2 /arch:AVX512) diff --git a/kernels/zen4/3/bli_dgemm_avx512_k1.c b/kernels/zen4/3/bli_dgemm_avx512_k1.c new file mode 100644 index 0000000000..e3c15c78c5 --- /dev/null +++ b/kernels/zen4/3/bli_dgemm_avx512_k1.c @@ -0,0 +1,6556 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include +#include "blis.h" +#include "immintrin.h" + + +#define D_MR 24 +#define D_NR 8 + +err_t bli_dgemm_24x8_avx512_k1_nn +( + dim_t m, + dim_t n, + dim_t k, + double* alpha, + double* a, const inc_t lda, + double* b, const inc_t ldb, + double* beta, + double* c, const inc_t ldc +) +{ + err_t ret_status = BLIS_FAILURE; + double alpha_val, beta_val; + + beta_val = *beta; + alpha_val = *alpha; + + dim_t m_remainder = (m % D_MR); + dim_t n_remainder = (n % D_NR); + + //scratch registers + __m512d zmm0, zmm1, zmm2, zmm3; + __m512d zmm4, zmm5, zmm6, zmm7; + __m512d zmm8, zmm9, zmm10, zmm11; + __m512d zmm12, zmm13, zmm14, zmm15; + __m512d zmm16, zmm17, zmm18, zmm19; + __m512d zmm20, zmm21, zmm22, zmm23; + __m512d zmm24, zmm25, zmm26, zmm27; + __m512d zmm28, zmm29, zmm30, zmm31; + + if(alpha_val != 0.0 && beta_val != 0.0) + { + /* Compute C = alpha*A*B + beta*c */ + for(dim_t j = 0; (j + (D_NR-1) < n ); j += D_NR) + { + double* temp_b = b + j*ldb; + double* temp_a = a; + double* temp_c = c + j*ldc; + + for(dim_t i = 0; i < ( m - D_MR+1); i += D_MR) + { + //Clear out vector registers to hold fma result. + //zmm6 to zmm29 holds fma result. + //zmm0, zmm1, zmm2 are used to load 24 elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. 
+ zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm28 = _mm512_setzero_pd(); + zmm29 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. 
+ zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + //Compute A*B. + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + //Compute A*B. + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + zmm29 = _mm512_fmadd_pd(zmm2, zmm31, zmm29); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. 
+ //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + zmm28 = _mm512_mul_pd(zmm0, zmm28); + zmm29 = _mm512_mul_pd(zmm0, zmm29); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. + zmm3 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm4 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm5 = _mm512_loadu_pd((double const *)(temp_c + ldc + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm9 = _mm512_fmadd_pd(zmm3, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm4, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm5, zmm31, zmm11); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. + zmm3 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm4 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm5 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm15 = _mm512_fmadd_pd(zmm3, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm4, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm5, zmm31, zmm17); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. + zmm3 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm4 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 8)); + zmm5 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm21 = _mm512_fmadd_pd(zmm3, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm4, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm5, zmm31, zmm23); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm31, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm31, zmm26); + + //zmm0, zmm1, zmm2 are used to load 24 elements from + //matrix C. + zmm3 = _mm512_loadu_pd((double const *)(temp_c + ldc * 7)); + zmm4 = _mm512_loadu_pd((double const *)(temp_c + ldc * 7 + 8)); + zmm5 = _mm512_loadu_pd((double const *)(temp_c + ldc * 7 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm27 = _mm512_fmadd_pd(zmm3, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm4, zmm31, zmm28); + zmm29 = _mm512_fmadd_pd(zmm5, zmm31, zmm29); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm29. + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + 
_mm512_storeu_pd((double *)(temp_c + ldc*5 + 16), zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 16), zmm26); + //C matrix 8th column + _mm512_storeu_pd((double *)(temp_c + ldc*7), zmm27); + _mm512_storeu_pd((double *)(temp_c + ldc*7 + 8), zmm28); + _mm512_storeu_pd((double *)(temp_c + ldc*7 + 16), zmm29); + + //Update temp_c and temp_a pointer to + //respective offset. + temp_c += D_MR; + temp_a += D_MR; + } + + dim_t m_rem = m_remainder; + //Handles the edge case for m_remainder from 17 to 23. + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm29 holds fma result. + //zmm0, zmm1, zmm2 are used to load elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm28 = _mm512_setzero_pd(); + zmm29 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with >16x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + //Compute A*B. 
+ zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + //Compute A*B. + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + zmm29 = _mm512_fmadd_pd(zmm2, zmm31, zmm29); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + zmm28 = _mm512_mul_pd(zmm0, zmm28); + zmm29 = _mm512_mul_pd(zmm0, zmm29); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 2 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 3 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 4 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 5 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 6 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm31, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm31, zmm26); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 7)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 7 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 7 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + zmm29 = _mm512_fmadd_pd(zmm2, zmm31, zmm29); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm29. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*5 + 16), k0, zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*6 + 16), k0, zmm26); + //C matrix 8th column + _mm512_storeu_pd((double *)(temp_c + ldc*7), zmm27); + _mm512_storeu_pd((double *)(temp_c + ldc*7 + 8), zmm28); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*7 + 16), k0, zmm29); + } + //Handles the edge cases where m_remainder is from 9 to 16 + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm28 holds fma result. 
+ //zmm0, zmm1 are used to load elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm28 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. 
+ zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + zmm28 = _mm512_mul_pd(zmm0, zmm28); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + //zmm0, zmm1 are used to load 24 elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 2 + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 3 + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 4 + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 5 + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 6 + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm31, zmm25); + + //zmm0, zmm1 are used to load 24 elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 7)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 7 + 8)); + //Compute C * Beta + fma result(AB*Alpha) + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm28. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5 + 8), k0, zmm22); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6 + 8), k0, zmm25); + //C matrix 8th column + _mm512_storeu_pd((double *)(temp_c + ldc*7), zmm27); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*7 + 8), k0, zmm28); + } + //Handles the edge case where m_remainder is from 1 to 8 + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm27 holds fma result. + //zmm0 are used to load 8 elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with >1x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + //zmm0 used to load 8 elements from + //matrix C. 
+ zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + //Compute C * Beta + fma result(AB*Alpha) + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + //Compute C * Beta + fma result(AB*Alpha) + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 2)); + //Compute C * Beta + fma result(AB*Alpha) + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 3)); + //Compute C * Beta + fma result(AB*Alpha) + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 4)); + //Compute C * Beta + fma result(AB*Alpha) + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 5)); + //Compute C * Beta + fma result(AB*Alpha) + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 6)); + //Compute C * Beta + fma result(AB*Alpha) + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + + //zmm0 used to load 8 elements from + //matrix C. + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 7)); + //Compute C * Beta + fma result(AB*Alpha) + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + + //Store the result back to Matrix C. 
+ _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + //C matrix 2nd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + //C matrix 3rd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + //C matrix 4th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + //C matrix 5th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + //C matrix 6th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5), k0, zmm21); + //C matrix 7th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6), k0, zmm24); + //C matrix 8th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*7), k0, zmm27); + } + } + + switch(n_remainder) + { + case 7: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + //Clear out vector registers to hold fma result. + //zmm6 to zmm26 holds fma result. + //zmm0, zmm1, zmm2 are used to load 24 elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with 24x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. 
+ zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. 
+ zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6 + 16)); + //Compute C * Beta + fma result(AB*Alpha) + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm31, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm31, zmm26); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm26. + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + 
_mm512_storeu_pd((double *)(temp_c + ldc*5 + 16), zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 16), zmm26); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + //Handles the edge case where m_remainder is from 17 to 23 + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm26 holds fma result. + //zmm0, zmm1, zmm2 are used to load elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with (>16)x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. 
+ //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. 
+ //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 2 + 16)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 3 + 16)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 4 + 16)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 5 + 16)); + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + //zmm0, zmm1, zmm2 are used to load elements from + //matrix C. 
+ //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 6 + 16)); + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm31, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm31, zmm26); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm26. + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*5 + 16), k0, zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*6 + 16), k0, zmm26); + + } + //Handles the edge case where m_remadiner is from 9 to 16. 
+ else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with (>8)x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. 
+ zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + //Compute A*B. + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + //zmm0, zmm1 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + //zmm0, zmm1 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + //zmm0, zmm1 are used to load elements from + //matrix C. 
+ //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 2 + 8)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + + //zmm0, zmm1 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 3 + 8)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + //zmm0, zmm1 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 4 + 8)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + + //zmm0, zmm1 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 5 + 8)); + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + //zmm0, zmm1 are used to load elements from + //matrix C. + //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 6)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 6 + 8)); + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm31, zmm25); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm25. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5 + 8), k0, zmm22); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6 + 8), k0, zmm25); + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with (>1)x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + //Broadcast Beta into zmm31 + zmm31 = _mm512_set1_pd(beta_val); + //zmm0 are used to load elements from + //matrix C. 
+ //Compute C * Beta + fma result(AB*Alpha) + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 2)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 3)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 4)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 5)); + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 6)); + zmm24 = _mm512_fmadd_pd(zmm0, zmm31, zmm24); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm24. + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + //C matrix 2nd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + //C matrix 3rd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + //C matrix 4th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + //C matrix 5th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + //C matrix 6th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5), k0, zmm21); + //C matrix 7th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6), k0, zmm24); + } + break; + } + case 6: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = 
_mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = 
_mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 
ldc * 3 + 16)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 16)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 16)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 16), zmm23); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m 
& 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = 
_mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const 
*)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 3 + 16)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 4 + 16)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 5 + 16)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + + _mm512_storeu_pd((double *)(temp_c + 
ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*5 + 16), k0, zmm23); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + + zmm9 = 
_mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 2 + 8)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 3 + 8)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 4 + 8)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 5)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 5 + 8)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5 + 8), k0, zmm22); + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = 
_mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 2)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 3)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm0 = 
_mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 4)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 5)); + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5), k0, zmm21); + } + break; + } + case 5: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x5 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 
8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 16)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 16)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + 
_mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x5 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases.
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, 
k0, (double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 3 + 16)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 4 + 16)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm31, zmm20); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + 
_mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 2 + 8)); + + zmm12 = 
_mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 3 + 8)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 4)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 4 + 8)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm31, zmm19); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 2)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 3)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 4)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm31, zmm18); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + } + break; + } + case 4: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { 
+ zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); 
+ zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 16)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + 
_mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = 
_mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3 + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 3 + 16)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = 
_mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm0 
= _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 2 + 8)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 3)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 3 + 8)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 2)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 3)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + } + break; + } + case 3: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + /* + a. 
Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_loadu_pd((double const 
*)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2 + 8)); + 
zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc * 2 + 16)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm31, zmm14); + + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x3 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc * 2)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc * 2 + 8)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm31, zmm13); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> 
(0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x3 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc * 2)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm31, zmm12); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + } + break; + } + case 2: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = 
_mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + 
_mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + 
+ zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + ldc + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + ldc + 16)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c + ldc )); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + ldc + 8)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c + ldc )); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + } + break; + } + case 1: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_loadu_pd((double const *)(temp_c + 8)); + zmm2 = _mm512_mask_loadu_pd(zmm2, k0, (double const *)(temp_c + 16)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm31, zmm8); + + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_loadu_pd((double const *)(temp_c)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_c + 8)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm31, zmm7); + + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + + zmm31 = _mm512_set1_pd(beta_val); + + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_c)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm31, zmm6); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + } + break; + } + default: + { + break; + } + } + ret_status = BLIS_SUCCESS; + } + else if(alpha_val != 0.0 && beta_val == 0.0) + { + /* Compute C = alpha*A*B + beta*c */ + for(dim_t j = 0; (j + (D_NR-1) < n ); j += D_NR) + { + double* temp_b = b + j*ldb; + double* temp_a = a; + double* temp_c = c + j*ldc; + + for(dim_t i = 0; i < ( m - D_MR+1); i += D_MR) + { + //Clear out vector registers to hold fma result. + //zmm6 to zmm29 holds fma result. + //zmm0, zmm1, zmm2 are used to load 24 elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm28 = _mm512_setzero_pd(); + zmm29 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with 24x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. 
+ zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + //Compute A*B. + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + //Compute A*B. + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + zmm29 = _mm512_fmadd_pd(zmm2, zmm31, zmm29); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + zmm28 = _mm512_mul_pd(zmm0, zmm28); + zmm29 = _mm512_mul_pd(zmm0, zmm29); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm29. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 16), zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 16), zmm26); + //C matrix 8th column + _mm512_storeu_pd((double *)(temp_c + ldc*7), zmm27); + _mm512_storeu_pd((double *)(temp_c + ldc*7 + 8), zmm28); + _mm512_storeu_pd((double *)(temp_c + ldc*7 + 16), zmm29); + + //Update temp_c and temp_a pointer to + //respective offset. + temp_c += D_MR; + temp_a += D_MR; + } + + dim_t m_rem = m_remainder; + //Handles the edge case for m_remainder from 17 to 23. + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm29 holds fma result. 
+ //zmm0, zmm1, zmm2 are used to load elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm28 = _mm512_setzero_pd(); + zmm29 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. 
+ zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + //Compute A*B. + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + //Compute A*B. + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + zmm29 = _mm512_fmadd_pd(zmm2, zmm31, zmm29); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. 
+ //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + zmm28 = _mm512_mul_pd(zmm0, zmm28); + zmm29 = _mm512_mul_pd(zmm0, zmm29); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm29. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*5 + 16), k0, zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*6 + 16), k0, zmm26); + //C matrix 8th column + _mm512_storeu_pd((double *)(temp_c + ldc*7), zmm27); + _mm512_storeu_pd((double *)(temp_c + ldc*7 + 8), zmm28); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*7 + 16), k0, zmm29); + } + //Handles the edge cases where m_remainder is from 9 to 16 + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm28 holds fma result. 
+ //zmm0, zmm1 are used to load elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm28 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. 
+ zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + zmm28 = _mm512_fmadd_pd(zmm1, zmm31, zmm28); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + zmm28 = _mm512_mul_pd(zmm0, zmm28); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm28. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5 + 8), k0, zmm22); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6 + 8), k0, zmm25); + //C matrix 8th column + _mm512_storeu_pd((double *)(temp_c + ldc*7), zmm27); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*7 + 8), k0, zmm28); + } + //Handles the edge case where m_remainder is from 1 to 8 + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm27 holds fma result. + //zmm0 are used to load 8 elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm27 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with >1x8 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 7)); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm27 = _mm512_fmadd_pd(zmm0, zmm31, zmm27); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm27 = _mm512_mul_pd(zmm0, zmm27); + + //Store the result back to Matrix C. 
+ _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + //C matrix 2nd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + //C matrix 3rd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + //C matrix 4th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + //C matrix 5th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + //C matrix 6th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5), k0, zmm21); + //C matrix 7th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6), k0, zmm24); + //C matrix 8th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*7), k0, zmm27); + } + } + + switch(n_remainder) + { + case 7: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + //Clear out vector registers to hold fma result. + //zmm6 to zmm26 holds fma result. + //zmm0, zmm1, zmm2 are used to load 24 elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with 24x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. 
+ zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm26. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 16), zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 16), zmm26); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + //Handles the edge case where m_remainder is from 17 to 23 + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + //Clear out vector registers to hold fma result. + //zmm6 to zmm26 holds fma result. + //zmm0, zmm1, zmm2 are used to load elements from + //A matrix. + //zmm30 and zmm31 are alternatively used to broadcast element + //from B matrix. 
+ zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm26 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with (>16)x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + //Broadcast element from B matrix in zmm30 + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + //Broadcast element from next column of B matrix in zmm31 + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. 
+ zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + zmm26 = _mm512_fmadd_pd(zmm2, zmm30, zmm26); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + zmm26 = _mm512_mul_pd(zmm0, zmm26); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm26. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*5 + 16), k0, zmm23); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_storeu_pd((double *)(temp_c + ldc*6 + 8), zmm25); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*6 + 16), k0, zmm26); + + } + //Handles the edge case where m_remadiner is from 9 to 16. 
+ else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm25 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with (>8)x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + //Compute A*B. + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + //Compute A*B. + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + //Compute A*B. + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + //Compute A*B. + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + //Compute A*B. 
+ zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + //Compute A*B. + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + //Compute A*B. + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + zmm25 = _mm512_fmadd_pd(zmm1, zmm30, zmm25); + + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + zmm25 = _mm512_mul_pd(zmm0, zmm25); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm25. 
+ _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + //C matrix 2nd column + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + //C matrix 3rd column + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + //C matrix 4th column + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + //C matrix 5th column + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + //C matrix 6th column + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5 + 8), k0, zmm22); + //C matrix 7th column + _mm512_storeu_pd((double *)(temp_c + ldc*6), zmm24); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6 + 8), k0, zmm25); + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm24 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with (>1)x7 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 6)); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm24 = _mm512_fmadd_pd(zmm0, zmm30, zmm24); + //Broadcast Alpha into zmm0 + zmm0 = _mm512_set1_pd(alpha_val); + //Scale fma result with Alpha. + //Alpha * AB + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm24 = _mm512_mul_pd(zmm0, zmm24); + + //Store the result back to Matrix C. + //Result is available in zmm6 to zmm24. 
+ _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + //C matrix 2nd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + //C matrix 3rd column + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + //C matrix 4th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + //C matrix 5th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + //C matrix 6th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5), k0, zmm21); + //C matrix 7th column + _mm512_mask_storeu_pd((double *)(temp_c + ldc*6), k0, zmm24); + } + break; + } + case 6: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = 
_mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 16), zmm23); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm23 = _mm512_setzero_pd(); + zmm2 = 
_mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + zmm23 = _mm512_fmadd_pd(zmm2, zmm31, zmm23); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 
= _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = _mm512_mul_pd(zmm0, zmm22); + zmm23 = _mm512_mul_pd(zmm0, zmm23); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_storeu_pd((double *)(temp_c + ldc*5 + 8), zmm22); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*5 + 16), k0, zmm23); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = 
_mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm22 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + zmm22 = _mm512_fmadd_pd(zmm1, zmm31, zmm22); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + + zmm21 = _mm512_mul_pd(zmm0, zmm21); + zmm22 = 
_mm512_mul_pd(zmm0, zmm22); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + + _mm512_storeu_pd((double *)(temp_c + ldc*5), zmm21); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5 + 8), k0, zmm22); + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm21 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 5)); + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm21 = _mm512_fmadd_pd(zmm0, zmm31, zmm21); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm21 = _mm512_mul_pd(zmm0, zmm21); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*5), k0, zmm21); + } + break; + } + case 5: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 
= _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x5 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = 
_mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 16), zmm20); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm20 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. 
Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + zmm20 = _mm512_fmadd_pd(zmm2, zmm30, zmm20); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + zmm20 = _mm512_mul_pd(zmm0, zmm20); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double 
*)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_storeu_pd((double *)(temp_c + ldc*4 + 8), zmm19); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc*4 + 16), k0, zmm20); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm19 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + zmm19 = _mm512_fmadd_pd(zmm1, zmm30, zmm19); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + + zmm18 = _mm512_mul_pd(zmm0, zmm18); + zmm19 = _mm512_mul_pd(zmm0, zmm19); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + + _mm512_storeu_pd((double *)(temp_c + ldc*4), zmm18); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4 + 8), k0, zmm19); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff 
>> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm18 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 4)); + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm18 = _mm512_fmadd_pd(zmm0, zmm30, zmm18); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm18 = _mm512_mul_pd(zmm0, zmm18); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*4), k0, zmm18); + } + break; + } + case 4: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = 
_mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + 
zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 16), zmm17); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm17 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + zmm17 = _mm512_fmadd_pd(zmm2, zmm31, zmm17); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + zmm15 = _mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + zmm17 = _mm512_mul_pd(zmm0, zmm17); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc 
* 2 + 16), k0, zmm14); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_storeu_pd((double *)(temp_c + ldc * 3 + 8), zmm16); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 3 + 16), k0, zmm17); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm16 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + zmm16 = _mm512_fmadd_pd(zmm1, zmm31, zmm16); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + zmm15 = 
_mm512_mul_pd(zmm0, zmm15); + zmm16 = _mm512_mul_pd(zmm0, zmm16); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + _mm512_storeu_pd((double *)(temp_c + ldc*3), zmm15); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 3 + 8), k0, zmm16); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm15 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x4 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 3)); + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm15 = _mm512_fmadd_pd(zmm0, zmm31, zmm15); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm15 = _mm512_mul_pd(zmm0, zmm15); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc*3), k0, zmm15); + } + break; + } + case 3: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 16), zmm14); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = 
_mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm14 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 8x6 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + zmm14 = _mm512_fmadd_pd(zmm2, zmm30, zmm14); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + zmm14 = _mm512_mul_pd(zmm0, zmm14); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double 
*)(temp_c + ldc + 16), k0, zmm11); + + _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_storeu_pd((double *)(temp_c + ldc * 2 + 8), zmm13); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc * 2 + 16), k0, zmm14); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm13 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x3 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + zmm13 = _mm512_fmadd_pd(zmm1, zmm30, zmm13); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + zmm12 = _mm512_mul_pd(zmm0, zmm12); + zmm13 = _mm512_mul_pd(zmm0, zmm13); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + 
+ _mm512_storeu_pd((double *)(temp_c + ldc * 2), zmm12); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2 + 8), k0, zmm13); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm12 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x3 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 2)); + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm12 = _mm512_fmadd_pd(zmm0, zmm30, zmm12); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm12 = _mm512_mul_pd(zmm0, zmm12); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc * 2), k0, zmm12); + } + break; + } + case 2: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with 24x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + _mm_prefetch((char*)( temp_a + 192), _MM_HINT_T0); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_storeu_pd((double *)(temp_c + ldc + 16), zmm11); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm11 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. 
This loop operates with >16x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + zmm11 = _mm512_fmadd_pd(zmm2, zmm31, zmm11); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + zmm11 = _mm512_mul_pd(zmm0, zmm11); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_storeu_pd((double *)(temp_c + ldc + 8), zmm10); + _mm512_mask_storeu_pd ((double *)(temp_c + ldc + 16), k0, zmm11); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm10 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + zmm10 = _mm512_fmadd_pd(zmm1, zmm31, zmm10); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + zmm9 = _mm512_mul_pd(zmm0, zmm9); + zmm10 = _mm512_mul_pd(zmm0, zmm10); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + + _mm512_storeu_pd((double *)(temp_c + ldc), zmm9); + _mm512_mask_storeu_pd((double *)(temp_c + ldc + 8), k0, zmm10); + + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm9 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x2 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm31 = _mm512_set1_pd(*(double const *)(temp_b + ldb * 1)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm9 = _mm512_fmadd_pd(zmm0, zmm31, zmm9); + + zmm0 = _mm512_set1_pd(alpha_val); + + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm9 = _mm512_mul_pd(zmm0, zmm9); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + ldc), k0, zmm9); + } + break; + } + case 1: + { + double* temp_b = b + (n - n_remainder)*ldb; + double* temp_a = a; + double* temp_c = c + (n - n_remainder)*ldc; + for(dim_t i = 0;i < (m-D_MR+1);i=i+D_MR) + { + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with 24x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_loadu_pd((double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm0 = _mm512_set1_pd(alpha_val); + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_storeu_pd((double *)(temp_c + 16), zmm8); + + temp_c += D_MR; + temp_a += D_MR; + } + dim_t m_rem = m_remainder; + if(m_rem > 16) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm8 = _mm512_setzero_pd(); + zmm2 = _mm512_setzero_pd(); + + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >16x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. 
+ */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_loadu_pd((double const *)(temp_a + 8)); + zmm2 = _mm512_mask_loadu_pd (zmm2, k0, (double const *)(temp_a + 16)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + zmm8 = _mm512_fmadd_pd(zmm2, zmm30, zmm8); + + zmm0 = _mm512_set1_pd(alpha_val); + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + zmm8 = _mm512_mul_pd(zmm0, zmm8); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_storeu_pd((double *)(temp_c + 8), zmm7); + _mm512_mask_storeu_pd ((double *)(temp_c + 16), k0, zmm8); + + } + else if(m_rem > 8) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm7 = _mm512_setzero_pd(); + zmm1 = _mm512_setzero_pd(); + /* + a. Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >8x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_loadu_pd((double const *)(temp_a)); + zmm1 = _mm512_mask_loadu_pd(zmm1, k0, (double const *)(temp_a + 8)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + zmm7 = _mm512_fmadd_pd(zmm1, zmm30, zmm7); + + zmm0 = _mm512_set1_pd(alpha_val); + zmm6 = _mm512_mul_pd(zmm0, zmm6); + zmm7 = _mm512_mul_pd(zmm0, zmm7); + + _mm512_storeu_pd((double *)(temp_c), zmm6); + _mm512_mask_storeu_pd((double *)(temp_c + 8), k0, zmm7); + } + else if(m_rem > 0) + { + uint8_t mask = (0xff >> (0x8 - (m & 7))); // calculate mask based on m_remainder + if (mask == 0) mask = 0xff; + __mmask8 k0 = _load_mask8(&mask); + zmm6 = _mm512_setzero_pd(); + zmm0 = _mm512_setzero_pd(); + /* + a. 
Perform alpha*A*B using temp_a, temp_b and alpha_val, + where alpha_val is not zero. + b. This loop operates with >1x1 block size + along n dimension for every D_NR columns of temp_b where + computing all D_MR rows of temp_a. + c. Same approach is used in remaining fringe cases. + */ + zmm0 = _mm512_mask_loadu_pd(zmm0, k0, (double const *)(temp_a)); + + zmm30 = _mm512_set1_pd(*(double const *)(temp_b)); + zmm6 = _mm512_fmadd_pd(zmm0, zmm30, zmm6); + + zmm0 = _mm512_set1_pd(alpha_val); + zmm6 = _mm512_mul_pd(zmm0, zmm6); + + _mm512_mask_storeu_pd((double *)(temp_c), k0, zmm6); + } + break; + } + default: + { + break; + } + } + ret_status = BLIS_SUCCESS; + } + else + { + ;//return failure; + } + return ret_status; + +} diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h index 4a25b78572..40ef3c8830 100644 --- a/kernels/zen4/bli_kernels_zen4.h +++ b/kernels/zen4/bli_kernels_zen4.h @@ -204,6 +204,18 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x3 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 ) +err_t bli_dgemm_24x8_avx512_k1_nn + ( + dim_t m, + dim_t n, + dim_t k, + double* alpha, + double* a, const inc_t lda, + double* b, const inc_t ldb, + double* beta, + double* c, const inc_t ldc + ); + // threshold functions bool bli_cntx_gemmsup_thresh_is_met_zen4 ( From 06f23c4fd4ee045fc17b67597a6e041dce91dcd3 Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Wed, 1 Nov 2023 23:54:29 +0530 Subject: [PATCH 187/226] Bugfix : Functional correctness of DNRM2_ and DZNRM2_ APIs - Updated the final reduction of partial sums( AVX-2 code section ) to use scalar accumulation entirely, instead of using the _mm256_hadd_pd( ... ) intrinsic. This will in turn change the associativity in the reduction step. - Reverted to using scalar code on the fringe cases in AVX-2 kernel for DNRM2 and DZNRM2, for improving functional correctness. 
AMD-Internal: [CPUPL-4049] Change-Id: I9d320b39d23a0cbcc77fb24d951fced778ea5ea5 --- kernels/zen/1/bli_norm2_zen_int.c | 281 ++++++++---------------------- 1 file changed, 75 insertions(+), 206 deletions(-) diff --git a/kernels/zen/1/bli_norm2_zen_int.c b/kernels/zen/1/bli_norm2_zen_int.c index c2d0ebe7cd..82addbd005 100644 --- a/kernels/zen/1/bli_norm2_zen_int.c +++ b/kernels/zen/1/bli_norm2_zen_int.c @@ -2066,8 +2066,6 @@ void bli_dnorm2fv_unb_var1_avx2 double *xt = x; - dim_t n_rem = n % 4; - // Compute the sum of squares on 3 accumulators to avoid overflow // and underflow, depending on the vector element value. // Accumulator for small values; using scaling to avoid underflow. @@ -2090,108 +2088,6 @@ void bli_dnorm2fv_unb_var1_avx2 if( incx == 1 ) { - // Attending to the fringe case requiring SSE code section. - if ( n_rem >= 2 ) - { - // Clearing the upper 128-bit lanes if and when required. - // This ensures that the AVX-SSE transition penalty is avoided. - _mm256_zeroupper(); - - // Partial sums used for scaling, and registers to store thresholds - // and scaling factors - v2df_t sum_med_vec, sum_big_vec, sum_sml_vec; - v2df_t thres_sml_vec, thres_big_vec; - v2df_t scale_sml_vec, scale_big_vec; - - // Vectors used for intermediate arithmetic and absolute value - v2df_t temp, zerov; - - sum_med_vec.v = _mm_setzero_pd(); - sum_big_vec.v = _mm_setzero_pd(); - sum_sml_vec.v = _mm_setzero_pd(); - - temp.v = _mm_set1_pd( -0.0 ); - thres_big_vec.v = _mm_loaddup_pd( &thres_big ); - thres_sml_vec.v = _mm_loaddup_pd( &thres_sml ); - - // Vectors used for loading from memory and setting masks - v2df_t x0v, mask_vec; - - v2df_t med_blend, non_med_blend; - - x0v.v = _mm_loadu_pd( xt ); - - // Getting the abs of the vector elements. - x0v.v = _mm_andnot_pd( temp.v, x0v.v ); - - // Check if any of the values is a NaN and if so, return. 
- mask_vec.v = _mm_cmp_pd( x0v.v, x0v.v, _CMP_UNORD_Q ); - - // Checking for the presence of atleast one NaN - if ( bli_horizontal_or_df_128( mask_vec.v ) ) - { - *norm = NAN; - return; - } - - mask_vec.v = CMP128_df( x0v.v, thres_sml_vec.v, thres_big_vec.v ); - - if ( !bli_horizontal_or_df_128( mask_vec.v ) ) - { - // Scaling is not necessary; only medium values. - sum_med_vec.v = _mm_fmadd_pd( x0v.v, x0v.v, sum_med_vec.v ); - } - else - { - // Mask vector which indicate whether xi > thres_big. - mask_vec.v = _mm_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); - zerov.v = _mm_setzero_pd(); - - if ( bli_horizontal_or_df_128( mask_vec.v ) ) - { - scale_big_vec.v = _mm_loaddup_pd( &scale_big ); - isbig = true; - - // Fill sum_med vector without scaling. - med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); - sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); - - // Fill sum_big vector using scaling. - zerov.v = _mm_setzero_pd(); - non_med_blend.v = _mm_blendv_pd( zerov.v, scale_big_vec.v, mask_vec.v ); - non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); - sum_big_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_big_vec.v ); - } - else - { - scale_sml_vec.v = _mm_loaddup_pd( &scale_sml ); - // Mask vector which indicates whether xi > thres_small. - mask_vec.v = _mm_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); - // Fill sum_med vector without scaling. - med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); - sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); - - // Accumulate small values only if there have not been any big values so far. - if ( !isbig ) - { - // Fill sum_sml vector using scaling. 
- zerov.v = _mm_setzero_pd(); - non_med_blend.v = _mm_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec.v ); - non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); - sum_sml_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_sml_vec.v ); - } - } - } - - // Final accumulation on the appropriate scalars - sum_sml += sum_sml_vec.v[0] + sum_sml_vec.v[1]; - sum_med += sum_med_vec.v[0] + sum_med_vec.v[1]; - sum_big += sum_big_vec.v[0] + sum_big_vec.v[1]; - - xt += 2; - i += 2; - } - // AVX-2 code-section // Partial sums used for scaling. v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0; @@ -2233,6 +2129,8 @@ void bli_dnorm2fv_unb_var1_avx2 if ( bli_horizontal_or_df( mask_vec0.v ) || bli_horizontal_or_df( mask_vec1.v ) ) { *norm = NAN; + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } @@ -2352,6 +2250,8 @@ void bli_dnorm2fv_unb_var1_avx2 if ( bli_horizontal_or_df( mask_vec0.v ) ) { *norm = NAN; + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } @@ -2410,24 +2310,25 @@ void bli_dnorm2fv_unb_var1_avx2 sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v ); sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v ); - sum_sml_vec0.v = _mm256_hadd_pd( sum_sml_vec0.v, sum_sml_vec0.v ); - sum_med_vec0.v = _mm256_hadd_pd( sum_med_vec0.v, sum_med_vec0.v ); - sum_big_vec0.v = _mm256_hadd_pd( sum_big_vec0.v, sum_big_vec0.v ); - - sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[2]; - sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[2]; - sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[2]; + sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1] + + sum_sml_vec0.v[2] + sum_sml_vec0.v[3]; + sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1] + + sum_med_vec0.v[2] + sum_med_vec0.v[3]; + sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1] + + sum_big_vec0.v[2] + sum_big_vec0.v[3]; } // Dealing with fringe cases + double abs_chi; for( ; i < n; i += 1 ) { - double abs_chi; abs_chi = bli_fabs( *xt ); // Any thread encountering a NAN sets the sum_med accumalator to 
NAN if ( bli_isnan( abs_chi ) ) { *norm = NAN; + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } // Most likely case: medium values, not over/under-flow. @@ -2512,6 +2413,8 @@ void bli_dznorm2fv_unb_var1_avx2 cntx_t* cntx ) { + AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 ); + double sumsq = 0; dcomplex *xt = x; @@ -2578,6 +2481,8 @@ void bli_dznorm2fv_unb_var1_avx2 if ( bli_horizontal_or_df( mask_vec0.v ) || bli_horizontal_or_df( mask_vec1.v ) ) { *norm = NAN; + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } @@ -2697,6 +2602,8 @@ void bli_dznorm2fv_unb_var1_avx2 if ( bli_horizontal_or_df( mask_vec0.v ) ) { *norm = NAN; + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } @@ -2755,113 +2662,75 @@ void bli_dznorm2fv_unb_var1_avx2 sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v ); sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v ); - sum_sml_vec0.v = _mm256_hadd_pd( sum_sml_vec0.v, sum_sml_vec0.v ); - sum_med_vec0.v = _mm256_hadd_pd( sum_med_vec0.v, sum_med_vec0.v ); - sum_big_vec0.v = _mm256_hadd_pd( sum_big_vec0.v, sum_big_vec0.v ); - - // Final accumulation on the appropriate scalars - sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[2]; - sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[2]; - sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[2]; + sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1] + + sum_sml_vec0.v[2] + sum_sml_vec0.v[3]; + sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1] + + sum_med_vec0.v[2] + sum_med_vec0.v[3]; + sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1] + + sum_big_vec0.v[2] + sum_big_vec0.v[3]; } - - // Clearing the upper 128-bit lanes if and when required. - // This ensures that the AVX-SSE transition penalty is avoided. - _mm256_zeroupper(); - - // Dealing with fringe cases using SSE instructions and 128-bit registers. - // This is because each element of dcomplex type is 128 bits in size, thereby - // giving scope for this optimization. 
- for( ; i < n; i += 1 ) + // Scalar loop to handle the fringe cases + double chi_r, chi_i; + double abs_chi; + for ( ; i < n; i++) { - v2df_t sum_med_vec, sum_big_vec, sum_sml_vec; - v2df_t thres_sml_vec, thres_big_vec; - v2df_t scale_sml_vec, scale_big_vec; - - v2df_t temp, zerov; - - sum_med_vec.v = _mm_setzero_pd(); - sum_big_vec.v = _mm_setzero_pd(); - sum_sml_vec.v = _mm_setzero_pd(); - - temp.v = _mm_set1_pd( -0.0 ); - thres_big_vec.v = _mm_loaddup_pd( &thres_big ); - thres_sml_vec.v = _mm_loaddup_pd( &thres_sml ); - - // Vectors used for loading from memory and setting masks - v2df_t x0v, mask_vec; - - v2df_t med_blend, non_med_blend; - - x0v.v = _mm_loadu_pd( ( const double * )xt ); - - // Getting the abs of the vector elements. - x0v.v = _mm_andnot_pd( temp.v, x0v.v ); - - // Check if any of the values is a NaN and if so, return. - mask_vec.v = _mm_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q); + // Get real and imaginary component of the vector element. + bli_zdgets(*xt, chi_r, chi_i); - // Checking for the presence of atleast one NaN - if ( bli_horizontal_or_df_128( mask_vec.v ) ) + // Start with accumulating the real component of the vector element. + abs_chi = bli_fabs( chi_r ); + // If any of the elements is NaN, then return NaN as a result. + if ( bli_isnan( abs_chi ) ) { - *norm = NAN; + *norm = abs_chi; + + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; } - - mask_vec.v = CMP128_df( x0v.v, thres_sml_vec.v, thres_big_vec.v ); - - if ( !bli_horizontal_or_df_128( mask_vec.v ) ) + // Most likely case: medium values, not over/under-flow. + if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) { - // Scaling is not necessary; only medium values. - sum_med_vec.v = _mm_fmadd_pd( x0v.v, x0v.v, sum_med_vec.v ); + sum_med += abs_chi * abs_chi; } - else + // Case where there could be an overflow. Scaling is required. + else if ( abs_chi > thres_big ) { - // Mask vector which indicate whether xi > thres_big. 
- mask_vec.v = _mm_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ ); - zerov.v = _mm_setzero_pd(); - - if ( bli_horizontal_or_df_128( mask_vec.v ) ) - { - scale_big_vec.v = _mm_loaddup_pd( &scale_big ); - isbig = true; - - // Fill sum_med vector without scaling. - med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); - sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); + sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + isbig = true; + } + // Case where there could be an underflow. Scaling is required. + else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + { + sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); + } - // Fill sum_big vector using scaling. - zerov.v = _mm_setzero_pd(); - non_med_blend.v = _mm_blendv_pd( zerov.v, scale_big_vec.v, mask_vec.v ); - non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); - sum_big_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_big_vec.v ); - } - else - { - scale_sml_vec.v = _mm_loaddup_pd( &scale_sml ); - // Mask vector which indicates whether xi > thres_small. - mask_vec.v = _mm_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ ); - // Fill sum_med vector without scaling. - med_blend.v = _mm_blendv_pd( x0v.v, zerov.v, mask_vec.v ); - sum_med_vec.v = _mm_fmadd_pd( med_blend.v, med_blend.v, sum_med_vec.v ); + // Accumulate the imaginary component of the vector element. + abs_chi = bli_fabs( chi_i ); + // If any of the elements is NaN, then return NaN as a result. + if ( bli_isnan( abs_chi ) ) + { + *norm = abs_chi; - // Accumulate small values only if there have not been any big values so far. - if ( !isbig ) - { - // Fill sum_sml vector using scaling. 
- zerov.v = _mm_setzero_pd(); - non_med_blend.v = _mm_blendv_pd( zerov.v, scale_sml_vec.v, mask_vec.v ); - non_med_blend.v = _mm_mul_pd( x0v.v, non_med_blend.v ); - sum_sml_vec.v = _mm_fmadd_pd( non_med_blend.v, non_med_blend.v, sum_sml_vec.v ); - } - } + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); + return; + } + // Most likely case: medium values, not over/under-flow. + if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) + { + sum_med += abs_chi * abs_chi; + } + // Case where there could be an overflow. Scaling is required. + else if ( abs_chi > thres_big ) + { + sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + isbig = true; + } + // Case where there could be an underflow. Scaling is required. + else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + { + sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); } - - // Final accumulation on the appropriate scalars - sum_sml += sum_sml_vec.v[0] + sum_sml_vec.v[1]; - sum_med += sum_med_vec.v[0] + sum_med_vec.v[1]; - sum_big += sum_big_vec.v[0] + sum_big_vec.v[1]; xt += incx; } From fa355c0049dc22030a9ac1eca8dbe9b6d5a9793d Mon Sep 17 00:00:00 2001 From: mangala v Date: Mon, 6 Nov 2023 21:10:24 +0530 Subject: [PATCH 188/226] Removed warning during compilation of gemv api for non-zen config - When configured for haswell config "Warning unused variable 'zero'" was thrown during compilation.
- Removed zero variable which is not being used AMD-Internal: [CPUPL-3973] Change-Id: I45a1f16b4c50307b07148bba63ca5332c48648b8 --- frame/2/gemv/bli_gemv_unf_var2.c | 1 - 1 file changed, 1 deletion(-) diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index 69add27de7..701d414b4b 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -62,7 +62,6 @@ void PASTEMAC(ch,varname) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* zero = PASTEMAC(ch,0); \ ctype* A1; \ ctype* x1; \ ctype* y1; \ From 5f9c8c6929f899467f9daf621a437a744db37a7d Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Thu, 26 Oct 2023 16:47:24 +0530 Subject: [PATCH 189/226] Bugfix : Fallback mechanism in SNRM2 and SCNRM2 kernels if packing fails - Abstracted packing from the vectorized kernels for SNRM2 and SCNRM2 to a layer higher. - Added a scalar loop to handle compute in case of non-unit strides. This loop ensures functionality in case packing fails at the framework level. AMD-Internal: [CPUPL-3633] Change-Id: I555aea519d7434d43c541bb0f661f81105135b98 --- frame/util/bli_util_unb_var1.c | 142 ++++++- kernels/zen/1/bli_norm2_zen_int.c | 627 +++++------------------------- 2 files changed, 221 insertions(+), 548 deletions(-) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 460ec76f94..dbabc5a345 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -318,15 +318,76 @@ void bli_cnormfv_unb_var1 rntm_t* rntm ) { + scomplex *x_buf = x; + inc_t incx_buf = incx; + + // Querying the architecture ID to deploy the appropriate kernel arch_t id = bli_arch_query_id(); - switch (id) + switch ( id ) { case BLIS_ARCH_ZEN4: case BLIS_ARCH_ZEN3: case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN:; #ifdef BLIS_KERNELS_ZEN - bli_scnorm2fv_unb_var1_avx2( n, x, incx, norm, cntx ); + // Memory pool declarations for packing vector X. 
+ // Initialize mem pool buffer to NULL and size to 0. + // "buf" and "size" fields are assigned once memory + // is allocated from the pool in bli_pba_acquire_m(). + // This will ensure bli_mem_is_alloc() will be passed on + // an allocated memory if created or a NULL. + mem_t mem_buf_X = { 0 }; + rntm_t rntm_l; + // Packing for non-unit strided vector x. + if ( incx != 1 ) + { + // In order to get the buffer from pool via rntm access to memory broker + // is needed. Following are initializations for rntm. + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } + bli_rntm_set_num_threads_only( 1, &rntm_l ); + bli_pba_rntm_set_pba( &rntm_l ); + + // Calculate the size required for "n" scomplex elements in vector x. + size_t buffer_size = n * sizeof( scomplex ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" ); + #endif + + // Acquire a Buffer(n*size(scomplex)) from the memory broker + // and save the associated mem_t entry to mem_buf_X. + bli_pba_acquire_m + ( + &rntm_l, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_buf_X + ); + + // Continue packing X if buffer memory is allocated. + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + x_buf = bli_mem_buffer( &mem_buf_X ); + // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. + for ( dim_t x_index = 0; x_index < n; x_index++ ) + { + *( x_buf + x_index ) = *( x + ( x_index * incx ) ); + } + incx_buf = 1; + } + } + + bli_scnorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx ); + + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool. 
+ bli_pba_release( &rntm_l , &mem_buf_X ); + } break; #endif default:; @@ -345,8 +406,8 @@ void bli_cnormfv_unb_var1 bli_csumsqv_unb_var1 ( n, - x, - incx, + x_buf, + incx_buf, &scale, &sumsq, cntx, @@ -814,6 +875,9 @@ void bli_snormfv_unb_var1 return; } + float *x_buf = x; + inc_t incx_buf = incx; + // Querying the architecture ID to deploy the appropriate kernel arch_t id = bli_arch_query_id(); switch ( id ) @@ -821,9 +885,69 @@ void bli_snormfv_unb_var1 case BLIS_ARCH_ZEN4: case BLIS_ARCH_ZEN3: case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN:; #ifdef BLIS_KERNELS_ZEN - bli_snorm2fv_unb_var1_avx2( n, x, incx, norm, cntx ); + // Memory pool declarations for packing vector X. + // Initialize mem pool buffer to NULL and size to 0. + // "buf" and "size" fields are assigned once memory + // is allocated from the pool in bli_pba_acquire_m(). + // This will ensure bli_mem_is_alloc() will be passed on + // an allocated memory if created or a NULL. + mem_t mem_buf_X = { 0 }; + rntm_t rntm_l; + // Packing for non-unit strided vector x. + if ( incx != 1 ) + { + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } + + // In order to get the buffer from pool via rntm access to memory broker + // is needed. Following are initializations for rntm. + bli_rntm_set_num_threads_only( 1, &rntm_l ); + bli_pba_rntm_set_pba( &rntm_l ); + + // Calculate the size required for "n" float elements in vector x. + size_t buffer_size = n * sizeof( float ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" ); + #endif + + // Acquire a Buffer(n*size(float)) from the memory broker + // and save the associated mem_t entry to mem_buf_X. 
+ bli_pba_acquire_m + ( + &rntm_l, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_buf_X + ); + + // Continue packing X if buffer memory is allocated. + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + x_buf = bli_mem_buffer( &mem_buf_X ); + // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. + for ( dim_t x_index = 0; x_index < n; x_index++ ) + { + *( x_buf + x_index ) = *( x + ( x_index * incx ) ); + } + incx_buf = 1; + } + } + + bli_snorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx ); + + if ( bli_mem_is_alloc( &mem_buf_X ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool. + bli_pba_release( &rntm_l , &mem_buf_X ); + } break; #endif default:; @@ -841,8 +965,8 @@ void bli_snormfv_unb_var1 bli_ssumsqv_unb_var1 ( n, - x, - incx, + x_buf, + incx_buf, &scale, &sumsq, cntx, diff --git a/kernels/zen/1/bli_norm2_zen_int.c b/kernels/zen/1/bli_norm2_zen_int.c index 82addbd005..aa13f72061 100644 --- a/kernels/zen/1/bli_norm2_zen_int.c +++ b/kernels/zen/1/bli_norm2_zen_int.c @@ -109,57 +109,8 @@ void bli_snorm2fv_unb_var1_avx2 float sumsq = 0.0f; dim_t i = 0; - dim_t n_remainder = 0; - float *x_buf = x; - - // Memory pool declarations for packing vector X. - // Initialize mem pool buffer to NULL and size to 0. - // "buf" and "size" fields are assigned once memory - // is allocated from the pool in bli_pba_acquire_m(). - // This will ensure bli_mem_is_alloc() will be passed on - // an allocated memory if created or a NULL. - mem_t mem_bufX = {0}; - rntm_t rntm; - - // Packing for non-unit strided vector x. - if ( incx != 1 ) - { - // In order to get the buffer from pool via rntm access to memory broker - //is needed. Following are initializations for rntm. - bli_rntm_init_from_global( &rntm ); - bli_rntm_set_num_threads_only( 1, &rntm ); - bli_pba_rntm_set_pba( &rntm ); - - // Calculate the size required for "n" float elements in vector x. 
- size_t buffer_size = n * sizeof( float ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" ); - #endif - - // Acquire a Buffer(n*size(float)) from the memory broker - // and save the associated mem_t entry to mem_bufX. - bli_pba_acquire_m - ( - &rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX - ); - - // Continue packing X if buffer memory is allocated. - if ( ( bli_mem_is_alloc( &mem_bufX ) ) ) - { - x_buf = bli_mem_buffer( &mem_bufX ); - // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. - for ( dim_t x_index = 0; x_index < n; x_index++ ) - { - *( x_buf + x_index ) = *( x + ( x_index * incx ) ); - } - } - } - - float *xt = x_buf; + float *xt = x; // Compute the sum of squares on 3 accumulators to avoid overflow // and underflow, depending on the vector element value. @@ -180,7 +131,7 @@ void bli_snorm2fv_unb_var1_avx2 float abs_chi; bool isbig = false; - if ( n >= 64 ) + if ( ( n >= 64 ) && ( incx == 1 ) ) { // Constants used for comparisons. v8sf_t temp, thres_sml_vec, thres_big_vec, zerov; @@ -229,62 +180,11 @@ void bli_snorm2fv_unb_var1_avx2 mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q); mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q); mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_sf( mask_vec0.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec1.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec2.v ) ) + if ( bli_horizontal_or_sf( mask_vec0.v ) || bli_horizontal_or_sf( mask_vec1.v ) + || bli_horizontal_or_sf( mask_vec2.v ) || bli_horizontal_or_sf( mask_vec3.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec3.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -492,47 +392,10 @@ void bli_snorm2fv_unb_var1_avx2 mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q); mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q); mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_sf( mask_vec0.v ) ) + if ( bli_horizontal_or_sf( mask_vec0.v ) || bli_horizontal_or_sf( mask_vec1.v ) + || bli_horizontal_or_sf( mask_vec2.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec1.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec2.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -695,32 +558,9 @@ void bli_snorm2fv_unb_var1_avx2 // Check if any of the values is a NaN and if so, return. mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q); mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_sf( mask_vec0.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec1.v ) ) + if ( bli_horizontal_or_sf( mask_vec0.v ) || bli_horizontal_or_sf( mask_vec1.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -837,14 +677,6 @@ void bli_snorm2fv_unb_var1_avx2 if ( bli_horizontal_or_sf( mask_vec0.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -918,75 +750,35 @@ void bli_snorm2fv_unb_var1_avx2 sum_big = horizontal_add_sf(sum_big_vec0.v); } - n_remainder = n - i; - bool hasInf = false; - - if ( ( n_remainder > 0 ) ) + // Put first the most likely to happen to avoid evaluations on if statements. + for ( ; i < n; i++) { - // Put first the most likely to happen to avoid evaluations on if statements. - for (i = 0; i < n_remainder; i++) + abs_chi = bli_fabs( *xt ); + // If any of the elements is NaN, then return NaN as a result. + if ( bli_isnan( abs_chi ) ) { - abs_chi = bli_fabs( *xt ); - // If any of the elements is NaN, then return NaN as a result. - if ( bli_isnan( abs_chi ) ) - { - *norm = abs_chi; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } + *norm = abs_chi; - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - // Else, if any of the elements is an Inf, then return +Inf as a result. - if ( bli_isinf( abs_chi ) ) - { - *norm = abs_chi; - // Instead of returning immediately, use this flag - // to denote that there is an Inf element in the vector. - // That is used to avoid cases where there is a NaN which comes - // after an Inf. - hasInf = true; - } - // Most likely case: medium values, not over/under-flow. 
- if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) - { - sum_med += abs_chi * abs_chi; - } - // Case where there could be an overflow. Scaling is required. - else if ( abs_chi > thres_big ) - { - sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); - isbig = true; - } - // Case where there could be an underflow. Scaling is required. - else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) - { - sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); - } - xt++; + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); + return; } - } - // Early return if there is an Inf. - if ( hasInf ) - { - - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) + // Most likely case: medium values, not over/under-flow. + if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); + sum_med += abs_chi * abs_chi; } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; + // Case where there could be an overflow. Scaling is required. + else if ( abs_chi > thres_big ) + { + sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + isbig = true; + } + // Case where there could be an underflow. Scaling is required. + else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + { + sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); + } + xt += incx; } // Combine accumulators. @@ -1036,15 +828,6 @@ void bli_snorm2fv_unb_var1_avx2 *norm = scale * sqrtf( sumsq ); - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -1063,57 +846,8 @@ void bli_scnorm2fv_unb_var1_avx2 float sumsq = 0.0f; dim_t i = 0; - dim_t n_remainder = 0; - scomplex *x_buf = x; - - // Memory pool declarations for packing vector X. - // Initialize mem pool buffer to NULL and size to 0. - // "buf" and "size" fields are assigned once memory - // is allocated from the pool in bli_pba_acquire_m(). - // This will ensure bli_mem_is_alloc() will be passed on - // an allocated memory if created or a NULL. - mem_t mem_bufX = {0}; - rntm_t rntm; - - // Packing for non-unit strided vector x. - if ( incx != 1 ) - { - // In order to get the buffer from pool via rntm access to memory broker - //is needed. Following are initializations for rntm. - bli_rntm_init_from_global( &rntm ); - bli_rntm_set_num_threads_only( 1, &rntm ); - bli_pba_rntm_set_pba( &rntm ); - - // Calculate the size required for "n" scomplex elements in vector x. - size_t buffer_size = n * sizeof( scomplex ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" ); - #endif - - // Acquire a Buffer(n*size(scomplex)) from the memory broker - // and save the associated mem_t entry to mem_bufX. - bli_pba_acquire_m - ( - &rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX - ); - - // Continue packing X if buffer memory is allocated. - if ( ( bli_mem_is_alloc( &mem_bufX ) ) ) - { - x_buf = bli_mem_buffer( &mem_bufX ); - // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride. - for ( dim_t x_index = 0; x_index < n; x_index++ ) - { - *( x_buf + x_index ) = *( x + ( x_index * incx ) ); - } - } - } - - scomplex *xt = x_buf; + scomplex *xt = x; // Compute the sum of squares on 3 accumulators to avoid overflow // and underflow, depending on the vector element value. 
@@ -1134,7 +868,7 @@ void bli_scnorm2fv_unb_var1_avx2 float abs_chi; bool isbig = false; - if ( n >= 64 ) + if ( ( n >= 64 ) && ( incx == 1 ) ) { // Constants used for comparisons. v8sf_t temp, thres_sml_vec, thres_big_vec, zerov; @@ -1183,62 +917,10 @@ void bli_scnorm2fv_unb_var1_avx2 mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q); mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q); mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_sf( mask_vec0.v ) ) + if ( bli_horizontal_or_sf( mask_vec0.v ) || bli_horizontal_or_sf( mask_vec1.v ) + || bli_horizontal_or_sf( mask_vec2.v ) || bli_horizontal_or_sf( mask_vec3.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec1.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec2.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec3.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -1447,47 +1129,10 @@ void bli_scnorm2fv_unb_var1_avx2 mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q); mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q); mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_sf( mask_vec0.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec1.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec2.v ) ) + if ( bli_horizontal_or_sf( mask_vec0.v ) || bli_horizontal_or_sf( mask_vec1.v ) + || bli_horizontal_or_sf( mask_vec2.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. 
- bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -1650,32 +1295,9 @@ void bli_scnorm2fv_unb_var1_avx2 // Check if any of the values is a NaN and if so, return. mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q); mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q); - if ( bli_horizontal_or_sf( mask_vec0.v ) ) + if ( bli_horizontal_or_sf( mask_vec0.v ) || bli_horizontal_or_sf( mask_vec1.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - if ( bli_horizontal_or_sf( mask_vec1.v ) ) - { - *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -1791,14 +1413,6 @@ void bli_scnorm2fv_unb_var1_avx2 if ( bli_horizontal_or_sf( mask_vec0.v ) ) { *norm = NAN; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; @@ -1872,122 +1486,66 @@ void bli_scnorm2fv_unb_var1_avx2 sum_big = horizontal_add_sf(sum_big_vec0.v); } - n_remainder = n - i; - bool hasInf = false; double chi_r, chi_i; - if ( ( n_remainder > 0 ) ) + // Put first the most likely to happen to avoid evaluations on if statements. + for ( ; i < n; i++) { - // Put first the most likely to happen to avoid evaluations on if statements. 
- for (i = 0; i < n_remainder; i++) + // Get real and imaginary component of the vector element. + bli_csgets(*xt, chi_r, chi_i); + // Start with accumulating the real component of the vector element. + abs_chi = bli_fabs( chi_r ); + // If any of the elements is NaN, then return NaN as a result. + if ( bli_isnan( abs_chi ) ) { - // Get real and imaginary component of the vector element. - bli_csgets(*xt, chi_r, chi_i); - // Start with accumulating the real component of the vector element. - abs_chi = bli_fabs( chi_r ); - // If any of the elements is NaN, then return NaN as a result. - if ( bli_isnan( abs_chi ) ) - { - *norm = abs_chi; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - // Else, if any of the elements is an Inf, then return +Inf as a result. - if ( bli_isinf( abs_chi ) ) - { - *norm = abs_chi; - // Instead of returning immediately, use this flag - // to denote that there is an Inf element in the vector. - // That is used to avoid cases where there is a NaN which comes - // after an Inf. - hasInf = true; - } - // Most likely case: medium values, not over/under-flow. - if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) - { - sum_med += abs_chi * abs_chi; - } - // Case where there could be an overflow. Scaling is required. - else if ( abs_chi > thres_big ) - { - sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); - isbig = true; - } - // Case where there could be an underflow. Scaling is required. - else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) - { - sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); - } - // Accumulate the imaginary component of the vector element. 
- abs_chi = bli_fabs( chi_i ); - // If any of the elements is NaN, then return NaN as a result. - if ( bli_isnan( abs_chi ) ) - { - *norm = abs_chi; - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } + *norm = abs_chi; - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; - } - // Else, if any of the elements is an Inf, then return +Inf as a result. - if ( bli_isinf( abs_chi ) ) - { - *norm = abs_chi; - // Instead of returning immediately, use this flag - // to denote that there is an Inf element in the vector. - // That is used to avoid cases where there is a NaN which comes - // after an Inf. - hasInf = true; - } - // Most likely case: medium values, not over/under-flow. - if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) - { - sum_med += abs_chi * abs_chi; - } - // Case where there could be an overflow. Scaling is required. - else if ( abs_chi > thres_big ) - { - sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); - isbig = true; - } - // Case where there could be an underflow. Scaling is required. - else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) - { - sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); - } + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); + return; + } + // Most likely case: medium values, not over/under-flow. + if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) + { + sum_med += abs_chi * abs_chi; + } + // Case where there could be an overflow. Scaling is required. + else if ( abs_chi > thres_big ) + { + sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + isbig = true; + } + // Case where there could be an underflow. Scaling is required. 
+ else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + { + sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); + } + // Accumulate the imaginary component of the vector element. + abs_chi = bli_fabs( chi_i ); + // If any of the elements is NaN, then return NaN as a result. + if ( bli_isnan( abs_chi ) ) + { + *norm = abs_chi; - xt++; + AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); + return; } - } - // Early return if there is an Inf. - if ( hasInf ) - { - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) + // Most likely case: medium values, not over/under-flow. + if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) ) { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); + sum_med += abs_chi * abs_chi; + } + // Case where there could be an overflow. Scaling is required. + else if ( abs_chi > thres_big ) + { + sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big ); + isbig = true; + } + // Case where there could be an underflow. Scaling is required. + else if ( ( !isbig ) && ( abs_chi < thres_sml ) ) + { + sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml ); } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); - return; + xt += incx; } // Combine accumulators. @@ -2037,15 +1595,6 @@ void bli_scnorm2fv_unb_var1_avx2 *norm = scale * sqrtf( sumsq ); - if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool. - bli_pba_release( &rntm , &mem_bufX ); - } - AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 ); return; From 008b77e94d10f9f474ee499fa55ea38f848326d5 Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Mon, 6 Nov 2023 16:01:54 +0530 Subject: [PATCH 190/226] BLAS Compliance: SCALV Early Returns - According to BLAS Standards, SCALV should return when incx .le. 0. 
- To make SCALV compliant to this, added an early return inside the BLAS layer, for the cases where incx <= 0. - Also, added early return for the case where alpha is a unit scalar. AMD-Internal: [CPUPL-3562] Change-Id: Id474fdd6ed9232226f5c5381d0398f43384e4a49 --- frame/compat/bla_scal_amd.c | 86 ++++++++++++------------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index 3041f3bbe6..00aa8b9c2e 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -82,15 +82,26 @@ void PASTEF772S(chx,cha,blasname) \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ \ - bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \ + /* If the input increments are less than or equal to zero, return. */ \ + if ( (*incx) <= 0 ) { \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + return ; \ + } else { \ + incx0 = ( inc_t )(*incx); \ + x0 = (x); \ + } \ \ /* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS. that is, we just always sub-optimally implement those cases by casting alpha to ctype_x (potentially the complex domain) and using the homogeneous datatype instance according to that type. */ \ PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \ +\ + /* If alpha is a unit scalar, return early. */ \ + if ( PASTEMAC(c, eq1)(alpha_cast) ) { \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + return ; \ + } \ \ /* Call BLIS interface. */ \ PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ @@ -142,42 +153,18 @@ void sscal_blis_impl /* Convert/typecast negative values of n to zero. */ if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); + else n0 = ( dim_t )(*n); - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. 
*/ - if ( *incx < 0 ) + /* If the input increments are less than or equal to zero, return. */ + if ( (*incx) <= 0 ) { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return ; } else { - x0 = (x); - incx0 = ( inc_t )(*incx); - } - - /* - According to the BLAS definition, return early when incx <= 0 - */ - if (incx0 <= 0) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; + x0 = (x); + incx0 = ( inc_t )(*incx); } cntx_t *cntx = NULL; @@ -263,39 +250,22 @@ void dscal_blis_impl Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception Return early when alpha pointer is NULL - BLIS exception */ - if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha) || (*incx) <= 0) + if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha)) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); return; } - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. - * This check is redundant and can be safely removed - */ - if ( *incx < 0 ) + /* If the input increments are less than or equal to zero, return. 
*/ + if ( (*incx) <= 0 ) { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (x) + (n_elem-1)*(-*incx); - incx0 = ( inc_t )(*incx); - + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return ; } else { - x0 = (x); - incx0 = ( inc_t )(*incx); + x0 = (x); + incx0 = ( inc_t )(*incx); } // Definition of function pointer From 75356d45e54285abcb68d7701383b46b4ae699be Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Wed, 8 Nov 2023 09:05:44 +0530 Subject: [PATCH 191/226] DGEMM improvement for very tiny sizes less than 24. - This commit helps improving performance for very small input by reducing framework check and routing all such inputs to bli_dgemm_tiny_6x8_kernel. It forces single threaded computation for such sizes. - It invokes bli_dgemm_tiny_6x8_kernel for ZEN, ZEN2, ZEN3 and ZEN4 code path. Except for the case AOCL_ENABLE_INSTRUCTIONS environment variable is set to avx512. In that case, such a small inputs are routed to bli_dgemm_tiny_24x8_kernel avx512 kernel. 
AMD-Internal: [CPUPL-1701] Change-Id: Idf59f4a8ee76ee8f2514a33be2b618e3ce02383e --- frame/base/bli_arch.c | 6 +++++ frame/base/bli_arch.h | 1 + kernels/zen/3/bli_gemm_tiny.c | 51 +++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 1fc9ef43c1..b1b800ce9d 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -65,6 +65,12 @@ static dim_t __attribute__ ((unused)) req_id = -1; // Keep track if AOCL_ENABLE_INSTRUCTIONS environment variable was set. static bool __attribute__ ((unused)) aocl_e_i = FALSE; +bool bli_aocl_enable_instruction_query( void ) +{ + // Return whether the AOCL_ENABLE_INSTRUCTIONS environment variable is set or not. + return aocl_e_i; +} + arch_t bli_arch_query_id( void ) { bli_arch_set_id_once(); diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h index b36c669fd5..e944fb964e 100644 --- a/frame/base/bli_arch.h +++ b/frame/base/bli_arch.h @@ -36,6 +36,7 @@ #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); +BLIS_EXPORT_BLIS bool bli_aocl_enable_instruction_query( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); diff --git a/kernels/zen/3/bli_gemm_tiny.c b/kernels/zen/3/bli_gemm_tiny.c index 667f776a18..bf6ffa5cc2 100644 --- a/kernels/zen/3/bli_gemm_tiny.c +++ b/kernels/zen/3/bli_gemm_tiny.c @@ -515,6 +515,57 @@ err_t bli_dgemm_tiny ) { arch_t arch_id = get_arch_id(); + //for the below tiny sizes of matrix, we force it to be ST compute. 
+ if( + m <= 24 && n <= 24 && k <= 20 && + (BLIS_ARCH_ZEN == arch_id || + BLIS_ARCH_ZEN2 == arch_id || + BLIS_ARCH_ZEN3 == arch_id || + BLIS_ARCH_ZEN4 == arch_id) + ) + { + bool ret = bli_aocl_enable_instruction_query(); + if((ret == FALSE) || + (arch_id != BLIS_ARCH_ZEN4) + ) + { + return bli_dgemm_tiny_6x8_kernel + ( + 1 * (transa == BLIS_CONJ_NO_TRANSPOSE), + 1 * (transb == BLIS_CONJ_NO_TRANSPOSE), + transa, + transb, + m, + n, + k, + alpha, + a, rs_a0, cs_a0, + b, rs_b0, cs_b0, + beta, + c, rs_c0, cs_c0 + ); + } +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) + else if(BLIS_ARCH_ZEN4 == arch_id) + { + return bli_dgemm_tiny_24x8_kernel + ( + 1 * (transa == BLIS_CONJ_NO_TRANSPOSE), + 1 * (transb == BLIS_CONJ_NO_TRANSPOSE), + transa, + transb, + m, + n, + k, + alpha, + a, rs_a0, cs_a0, + b, rs_b0, cs_b0, + beta, + c, rs_c0, cs_c0 + ); + } +#endif + } if(FALSE == bli_thread_get_is_parallel()) { if( From 9500cbee636d89632e2d9bb3c28394d646e28b39 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 1 Nov 2023 04:55:30 -0400 Subject: [PATCH 192/226] Code cleanup: spelling corrections Corrections for some spelling mistakes in comments. 
AMD-Internal: [CPUPL-3519] Change-Id: I9a82518cde6476bc77fc3861a4b9f8729c6380ba --- addon/gemmd/thread/bao_l3_decor_openmp.c | 2 +- aocl_dtl/aocldtl.c | 6 +- aocl_dtl/aoclos.c | 8 +- frame/3/gemmt/bli_gemmt_sup_var1n2m.c | 2 +- frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c | 2 +- frame/thread/bli_l3_compute_decor_openmp.c | 4 +- frame/thread/bli_l3_decor_openmp.c | 2 +- frame/thread/bli_l3_sup_decor_openmp.c | 2 +- gtestsuite/testsuite/level3/gemm/test_gemm.h | 4 +- kernels/zen/3/bli_gemm_small.c | 96 +++++++++---------- .../s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c | 24 ++--- .../s8s8s16/lpgemm_s8_m_fringe_amd256.c | 28 +++--- .../s8s8s16/lpgemm_s8_mn_fringe_amd256.c | 56 +++++------ .../s8s8s16/lpgemm_s8_n_fringe_amd256.c | 46 ++++----- .../u8s8s16/lpgemm_6x32rowmajor_amd256.c | 24 ++--- .../lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c | 28 +++--- .../lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c | 56 +++++------ .../lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c | 46 ++++----- kernels/zen4/1/bli_amaxv_zen_int_avx512.c | 10 +- kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c | 2 +- kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c | 2 +- kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c | 2 +- kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c | 2 +- kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c | 2 +- kernels/zen4/3/bli_trsm_small_AVX512.c | 10 +- .../zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c | 8 +- sandbox/gemmlike/thread/bls_l3_decor_openmp.c | 2 +- sandbox/ref99/old/thread/blx_gemm_thread.c | 2 +- test/Makefile | 2 +- test/test_gemm_batch.c | 2 +- 30 files changed, 241 insertions(+), 241 deletions(-) diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/gemmd/thread/bao_l3_decor_openmp.c index 1aca8de275..d77b0d8a7a 100644 --- a/addon/gemmd/thread/bao_l3_decor_openmp.c +++ b/addon/gemmd/thread/bao_l3_decor_openmp.c @@ -93,7 +93,7 @@ void bao_l3_thread_decorator // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); - // Check for a somewhat obscure OpenMP thread-mistmatch issue. 
+ // Check for a somewhat obscure OpenMP thread-mismatch issue. // NOTE: This calls the same function used for the conventional/large // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); diff --git a/aocl_dtl/aocldtl.c b/aocl_dtl/aocldtl.c index a9b3db1786..6faa1e4b51 100644 --- a/aocl_dtl/aocldtl.c +++ b/aocl_dtl/aocldtl.c @@ -539,11 +539,11 @@ uint64 AOCL_DTL_get_time_spent(void) #ifdef AOCL_DTL_AUTO_TRACE_ENABLE /* - Disable intrumentation for these functions as they will also be - called from compiler generated instumation code to trace + Disable instrumentation for these functions as they will also be + called from compiler generated instrumentation code to trace function execution. - It needs to be part of declration in the C file so can't be + It needs to be part of declaration in the C file so can't be moved to header file. WARNING: These functions are automatically invoked. however any function diff --git a/aocl_dtl/aoclos.c b/aocl_dtl/aoclos.c index 896b1c89b3..6cbf075487 100644 --- a/aocl_dtl/aoclos.c +++ b/aocl_dtl/aoclos.c @@ -20,18 +20,18 @@ #endif // BLIS TODO: This is workaround to check if BLIS is built with -// openmp support. Ideally we dont' want any library +// openmp support. Ideally we don't want any library // specific code in dtl. #include #if defined(__linux__) /* - Disable intrumentation for these functions as they will also be - called from compiler generated instumation code to trace + Disable instrumentation for these functions as they will also be + called from compiler generated instrumentation code to trace function execution. - It needs to be part of declration in the C file so can't be + It needs to be part of declaration in the C file so can't be moved to header file. 
*/ diff --git a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c index 382ca6f67d..270c1b7c7d 100644 --- a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c +++ b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c @@ -1873,7 +1873,7 @@ void PASTEMACT(ch,opname,uplo,varname) \ m_off_cblock += mr_cur; \ } \ \ - /* Invoke the gemmsup millikerneli for remaining rectangular part. */ \ + /* Invoke the gemmsup millikernel for remaining rectangular part. */ \ gemmsup_ker \ ( \ conja, \ diff --git a/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c b/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c index 0b5176a6ab..0e73471c81 100644 --- a/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c +++ b/frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c @@ -2073,7 +2073,7 @@ void PASTEMACT(ch,opname,uplo,varname) \ m_off_cblock += mr_cur; \ } \ \ - /* Invoke the gemmsup millikerneli for remaining rectangular part. */ \ + /* Invoke the gemmsup millikernel for remaining rectangular part. */ \ gemmsup_ker \ ( \ conja, \ diff --git a/frame/thread/bli_l3_compute_decor_openmp.c b/frame/thread/bli_l3_compute_decor_openmp.c index ab4dffa872..6841e0e1c4 100644 --- a/frame/thread/bli_l3_compute_decor_openmp.c +++ b/frame/thread/bli_l3_compute_decor_openmp.c @@ -83,7 +83,7 @@ err_t bli_l3_compute_thread_decorator // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); - // Check for a somewhat obscure OpenMP thread-mistmatch issue. + // Check for a somewhat obscure OpenMP thread-mismatch issue. // NOTE: This calls the same function used for the conventional/large // code path. 
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); @@ -126,4 +126,4 @@ err_t bli_l3_compute_thread_decorator return BLIS_SUCCESS; } -#endif \ No newline at end of file +#endif diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index f505a94f7a..761321fc60 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -110,7 +110,7 @@ void bli_l3_thread_decorator // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); - // Check for a somewhat obscure OpenMP thread-mistmatch issue. + // Check for a somewhat obscure OpenMP thread-mismatch issue. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index 1db9514fd4..caa3c1dd37 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -93,7 +93,7 @@ err_t bli_l3_sup_thread_decorator // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); - // Check for a somewhat obscure OpenMP thread-mistmatch issue. + // Check for a somewhat obscure OpenMP thread-mismatch issue. // NOTE: This calls the same function used for the conventional/large // code path. 
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); diff --git a/gtestsuite/testsuite/level3/gemm/test_gemm.h b/gtestsuite/testsuite/level3/gemm/test_gemm.h index 862d47b168..147bcdab50 100644 --- a/gtestsuite/testsuite/level3/gemm/test_gemm.h +++ b/gtestsuite/testsuite/level3/gemm/test_gemm.h @@ -78,7 +78,7 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); } -// Test body used for exception value testing, by iducing an exception value +// Test body used for exception value testing, by inducing an exception value // in the index that is passed for each of the matrices. /* (ai, aj) is the index with corresponding exception value aexval in matrix A. @@ -135,4 +135,4 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh, true ); -} \ No newline at end of file +} diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index 4d33771b65..be9f6aa8d2 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -2532,7 +2532,7 @@ err_t bli_dgemm_small ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 12, maskVec); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); @@ -2545,7 +2545,7 @@ err_t bli_dgemm_small ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(ttC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); @@ -2558,7 +2558,7 @@ err_t bli_dgemm_small ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(ttC + 12, maskVec); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); @@ -2566,7 +2566,7 @@ err_t bli_dgemm_small _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 12, maskVec, ymm7); tC += ldc; @@ -2574,7 +2574,7 @@ err_t bli_dgemm_small _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 12, maskVec, ymm11); tC += ldc; @@ -2582,7 +2582,7 @@ err_t bli_dgemm_small _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 12, maskVec, ymm15); } n_remainder = N - col_idx; @@ -2660,7 +2660,7 @@ err_t bli_dgemm_small ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(tC + 12, maskVec); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); @@ -2674,7 +2674,7 @@ err_t bli_dgemm_small ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(ttC + 12, maskVec); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); @@ -2683,7 +2683,7 @@ err_t bli_dgemm_small _mm256_storeu_pd(tC + 0, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 12, maskVec, ymm11); tC += ldc; @@ -2691,7 +2691,7 @@ err_t bli_dgemm_small _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 12, maskVec, ymm15); col_idx += 2; } @@ -2755,7 +2755,7 @@ err_t bli_dgemm_small ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(tC + 12, maskVec); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); @@ -2764,7 +2764,7 @@ err_t bli_dgemm_small _mm256_storeu_pd(tC + 0, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 12, maskVec, ymm15); } } @@ -2847,7 +2847,7 @@ err_t bli_dgemm_small ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 8, maskVec); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); @@ -2859,7 +2859,7 @@ err_t bli_dgemm_small ymm2 = _mm256_loadu_pd(ttC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(ttC + 8, maskVec); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); @@ -2871,7 +2871,7 @@ err_t bli_dgemm_small ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); @@ -2879,21 +2879,21 @@ err_t bli_dgemm_small } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 8, maskVec, ymm6); tC += ldc; _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 8, maskVec, ymm10); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 8, maskVec, ymm14); } n_remainder = N - col_idx; @@ -2962,7 +2962,7 @@ err_t bli_dgemm_small ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 8, maskVec); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); @@ -2975,7 +2975,7 @@ err_t bli_dgemm_small ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); @@ -2983,14 +2983,14 @@ err_t bli_dgemm_small } _mm256_storeu_pd(tC + 0, ymm8); _mm256_storeu_pd(tC + 4, ymm9); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 8, maskVec, ymm10); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 8, maskVec, ymm14); col_idx += 2; @@ -3050,7 +3050,7 @@ err_t bli_dgemm_small ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 8, maskVec); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); @@ -3058,7 +3058,7 @@ err_t bli_dgemm_small } _mm256_storeu_pd(tC + 0, ymm12); _mm256_storeu_pd(tC + 4, ymm13); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 8, maskVec, ymm14); } } @@ -3135,7 +3135,7 @@ err_t bli_dgemm_small // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 4, maskVec); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); @@ -3144,7 +3144,7 @@ err_t bli_dgemm_small double *ttC = tC +ldc; ymm2 = _mm256_loadu_pd(ttC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); @@ -3153,25 +3153,25 @@ err_t bli_dgemm_small ttC += ldc; ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(ttC + 4, maskVec); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); } _mm256_storeu_pd(tC, ymm4); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 4, maskVec, ymm5); tC += ldc; _mm256_storeu_pd(tC, ymm8); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 4, maskVec, ymm9); tC += ldc; _mm256_storeu_pd(tC, ymm12); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 4, maskVec, ymm13); } n_remainder = N - col_idx; @@ -3231,7 +3231,7 @@ err_t bli_dgemm_small // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC + 0); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 4, maskVec); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); @@ -3241,20 +3241,20 @@ err_t bli_dgemm_small // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); } _mm256_storeu_pd(tC + 0, ymm8); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 4, maskVec, ymm9); tC += ldc; _mm256_storeu_pd(tC, ymm12); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 4, maskVec, ymm13); col_idx += 2; @@ -3305,13 +3305,13 @@ err_t bli_dgemm_small // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC + 0); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC + 4, maskVec); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); } _mm256_storeu_pd(tC + 0, ymm12); - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC + 4, maskVec, ymm13); } } @@ -3362,34 +3362,34 @@ err_t bli_dgemm_small if(is_beta_non_zero) { - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC, maskVec); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); double* ttC = tC + ldc; - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(ttC, maskVec); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ttC += ldc; - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(ttC, maskVec); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); } - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC, maskVec, ymm4); tC += ldc; - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC, maskVec, ymm5); tC += ldc; - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC, maskVec, ymm6); } n_remainder = N - col_idx; @@ -3434,23 +3434,23 @@ err_t bli_dgemm_small if(is_beta_non_zero) { - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(tC, maskVec); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); double* ttC = tC + ldc; - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. ymm2 = _mm256_maskload_pd(ttC, maskVec); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); } - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC, maskVec, ymm4); tC += ldc; - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC, maskVec, ymm5); col_idx += 2; @@ -3492,13 +3492,13 @@ err_t bli_dgemm_small if(is_beta_non_zero) { - // Masked load the relevant remaider elements of C matrix + // Masked load the relevant remainder elements of C matrix // Scale by beta. 
ymm2 = _mm256_maskload_pd(tC, maskVec); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); } - // Masked store the relevant remaider elements of C matrix + // Masked store the relevant remainder elements of C matrix _mm256_maskstore_pd(tC, maskVec, ymm4); } } diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c index 7893af7437..c102a89dea 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c @@ -151,7 +151,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) __m256i b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1))); - // Seperate register for intermediate op + // Separate register for intermediate op __m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -168,7 +168,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -185,7 +185,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -201,7 +201,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -218,7 +218,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -236,7 +236,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -262,7 +262,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op __m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -278,7 +278,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -294,7 +294,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -311,7 +311,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -327,7 +327,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -343,7 +343,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c index af112831d9..8d5a99968c 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c @@ -104,7 +104,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -119,7 +119,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -134,7 +134,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -143,7 +143,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0); c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -167,7 +167,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register 
for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -182,7 +182,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -197,7 +197,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -212,7 +212,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -697,7 +697,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -706,7 +706,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0); c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); 
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -729,7 +729,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -744,7 +744,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -1090,7 +1090,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0))); b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -1113,7 +1113,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c index 77f3553f67..9e2355a711 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c @@ -88,7 +88,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 
FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -101,7 +101,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -114,7 +114,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -127,7 +127,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -148,7 +148,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -161,7 +161,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -174,7 +174,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -187,7 +187,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -513,7 +513,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -526,7 +526,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -539,7 +539,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -552,7 +552,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -573,7 +573,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -586,7 +586,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -599,7 +599,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -612,7 +612,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -969,7 +969,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -982,7 +982,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1002,7 +1002,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1015,7 +1015,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -1270,7 +1270,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1283,7 +1283,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 4. @@ -1303,7 +1303,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1316,7 +1316,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1594,7 +1594,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16) b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -1614,7 +1614,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1836,7 +1836,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1856,7 +1856,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c index 3b6d21bb37..36cad252a6 100644 --- a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c @@ -102,7 +102,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -115,7 +115,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -128,7 +128,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -141,7 +141,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -154,7 +154,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -167,7 +167,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -188,7 +188,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -201,7 +201,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -214,7 +214,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -227,7 +227,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -240,7 +240,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -253,7 +253,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -714,7 +714,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -727,7 +727,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -740,7 +740,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -753,7 +753,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -766,7 +766,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -779,7 +779,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -812,7 +812,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -825,7 +825,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -838,7 +838,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -851,7 +851,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -864,7 +864,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16) //convert signed int8 to uint8 for u8s8s16 FMA ops a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 ); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c index ba577f3b25..820f1500b7 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c @@ -144,7 +144,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) __m256i b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1))); - // Seperate register for intermediate op + // Separate register for intermediate op __m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -158,7 +158,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -172,7 +172,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. // c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31] @@ -185,7 +185,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -199,7 +199,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -214,7 +214,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -237,7 +237,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) uint8_t a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); __m256i a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op __m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -250,7 +250,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -263,7 +263,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -277,7 +277,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -290,7 +290,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -303,7 +303,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32) a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c index 0fb7a297bb..07e6ca05bd 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c @@ -95,7 +95,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) // Broadcast a[1,kr:kr+2]. a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -107,7 +107,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) // Broadcast a[2,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -119,7 +119,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) // Broadcast a[3,kr:kr+2]. 
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -128,7 +128,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0); c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -149,7 +149,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -161,7 +161,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_1 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -173,7 +173,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -185,7 +185,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32) a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2))); a_int32_1 = _mm256_set1_epi8(a_kfringe); - // Seperate register for 
intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -687,7 +687,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) // Broadcast a[1,kr:kr+2]. a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -696,7 +696,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0); c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -716,7 +716,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -728,7 +728,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_1 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0); inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1); @@ -1080,7 +1080,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0))); b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] 
= _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); @@ -1100,7 +1100,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0); inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1); diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c index a9a8925eaa..e1e2bb73e8 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c @@ -82,7 +82,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) // Broadcast a[0,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -92,7 +92,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) // Broadcast a[1,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -102,7 +102,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) // Broadcast a[2,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -112,7 +112,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) // Broadcast a[3,kr:kr+2]. 
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -130,7 +130,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -140,7 +140,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -150,7 +150,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -160,7 +160,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16) a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -497,7 +497,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) // Broadcast a[0,kr:kr+2]. 
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -507,7 +507,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) // Broadcast a[1,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -517,7 +517,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) // Broadcast a[2,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -527,7 +527,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) // Broadcast a[3,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -545,7 +545,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -555,7 +555,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -565,7 +565,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -575,7 +575,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16) a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -957,7 +957,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) // Broadcast a[0,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -967,7 +967,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) // Broadcast a[1,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -984,7 +984,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -994,7 +994,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1253,7 +1253,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) // Broadcast a[0,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1263,7 +1263,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) // Broadcast a[1,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 4. @@ -1280,7 +1280,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -1290,7 +1290,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1582,7 +1582,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1599,7 +1599,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1824,7 +1824,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) // Broadcast a[0,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -1841,7 +1841,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c index a6a167f95b..fa6828127c 100644 --- a/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c +++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c @@ -96,7 +96,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) // Broadcast a[0,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -106,7 +106,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) // Broadcast a[1,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -116,7 +116,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) // Broadcast a[2,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -126,7 +126,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) // Broadcast a[3,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -136,7 +136,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) // Broadcast a[4,kr:kr+2]. 
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -146,7 +146,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) // Broadcast a[5,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -164,7 +164,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -174,7 +174,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -184,7 +184,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -194,7 +194,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -204,7 +204,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -214,7 +214,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16) a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -692,7 +692,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) // Broadcast a[0,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -702,7 +702,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) // Broadcast a[1,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -712,7 +712,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) // Broadcast a[2,kr:kr+2]. 
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -722,7 +722,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) // Broadcast a[3,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -732,7 +732,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) // Broadcast a[4,kr:kr+2]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -742,7 +742,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) // Broadcast a[5,kr:kr+4]. a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset))); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -769,7 +769,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. 
@@ -779,7 +779,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -789,7 +789,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -799,7 +799,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. @@ -809,7 +809,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16) a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2))); a_int32_0 = _mm256_set1_epi8(a_kfringe); - // Seperate register for intermediate op + // Separate register for intermediate op inter_vec = _mm256_maddubs_epi16(a_int32_0, b0); // Perform column direction mat-mul with k = 2. diff --git a/kernels/zen4/1/bli_amaxv_zen_int_avx512.c b/kernels/zen4/1/bli_amaxv_zen_int_avx512.c index 85c3f0d356..07ca37bac8 100644 --- a/kernels/zen4/1/bli_amaxv_zen_int_avx512.c +++ b/kernels/zen4/1/bli_amaxv_zen_int_avx512.c @@ -278,7 +278,7 @@ void bli_samaxv_zen_int_avx512( mask.v = _mm512_sub_ps(max_vec_1.v, x_vec_1.v); // Type cast mask from IEEE754 (float) to integer type // This operation will not need a new register, its just to convince - // the compiler. 
But its accounted as seperate register in the + // the compiler. But its accounted as separate register in the // above calculations intMask = _mm512_castps_si512(mask.v); // Extract the signbit and build the mask. @@ -312,7 +312,7 @@ void bli_samaxv_zen_int_avx512( mask.v = _mm512_sub_ps(max_vec_2.v, x_vec_2.v); // Type cast mask from IEEE754 (float) to integer type // This operation will not need a new register, its just to convince - // the compiler. But its accounted as seperate register in the + // the compiler. But its accounted as separate register in the // above calculations intMask = _mm512_castps_si512(mask.v); // Extract the signbit and build the mask. @@ -345,7 +345,7 @@ void bli_samaxv_zen_int_avx512( mask.v = _mm512_sub_ps(max_vec_3.v, x_vec_3.v); // Type cast mask from IEEE754 (float) to integer type // This operation will not need a new register, its just to convince - // the compiler. But its accounted as seperate register in the + // the compiler. But its accounted as separate register in the // above calculations intMask = _mm512_castps_si512(mask.v); // Extract the signbit and build the mask. @@ -397,7 +397,7 @@ void bli_samaxv_zen_int_avx512( mask.v = _mm512_sub_ps(max_vec_2.v, max_vec_3.v); // Type cast mask from IEEE754 (float) to integer type // This operation will not need a new register, its just to convince - // the compiler. But its accounted as seperate register in the + // the compiler. But its accounted as separate register in the // above calculations intMask = _mm512_castps_si512(mask.v); // Extract the signbit and build the mask. @@ -423,7 +423,7 @@ void bli_samaxv_zen_int_avx512( mask.v = _mm512_sub_ps(max_vec_1.v, max_vec_2.v); // Type cast mask from IEEE754 (float) to integer type // This operation will not need a new register, its just to convince - // the compiler. But its accounted as seperate register in the + // the compiler. 
But its accounted as separate register in the // above calculations intMask = _mm512_castps_si512(mask.v); // Extract the signbit and build the mask. diff --git a/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c b/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c index 4a1c416a5d..887f27889c 100644 --- a/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c +++ b/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c @@ -218,7 +218,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) = /* * number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31 * number of registers used for load B = 24/8 = 3 zmm0 to zmm2 - * number of regusters used for broadcast A = 2 zmm6 and zmm7 + * number of registers used for broadcast A = 2 zmm6 and zmm7 */ void bli_dgemm_zen4_asm_8x24( dim_t k_, diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c b/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c index 001bcd910c..d5a10aa209 100644 --- a/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c +++ b/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c @@ -156,7 +156,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) = /* * number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31 * number of registers used for load B = 24/8 = 3 zmm0 to zmm2 - * number of regusters used for broadcast A = 2 zmm6 and zmm7 + * number of registers used for broadcast A = 2 zmm6 and zmm7 */ void bli_dgemmtrsm_l_zen4_asm_8x24 ( diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c index 08edcb574f..5826b5e55c 100644 --- a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c @@ -411,7 +411,7 @@ void bli_dgemmtrsm_l_zen_asm_16x14 /* C prefetch Loop Note: This loop runs 14 times, - These 14 iterations are done seperately so that c11 can be prefetched here. + These 14 iterations are done separately so that c11 can be prefetched here. 
*/ ADD(R11, RSI) ADD(IMM(14), RSI) diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c b/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c index b620410c89..e9dae78ba7 100644 --- a/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c +++ b/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c @@ -156,7 +156,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) = /* * number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31 * number of registers used for load B = 24/8 = 3 zmm0 to zmm2 - * number of regusters used for broadcast A = 2 zmm6 and zmm7 + * number of registers used for broadcast A = 2 zmm6 and zmm7 */ void bli_dgemmtrsm_u_zen4_asm_8x24 ( diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c index 401c6e7d23..d15a52823f 100644 --- a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c +++ b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c @@ -407,7 +407,7 @@ void bli_dgemmtrsm_u_zen_asm_16x14 /* C prefetch Loop Note: This loop runs 14 times, - These 14 iterations are done seperately so that c11 can be prefetched here. + These 14 iterations are done separately so that c11 can be prefetched here. 
*/ ADD(R11, RSI) ADD(IMM(14), RSI) diff --git a/kernels/zen4/3/bli_trsm_small_AVX512.c b/kernels/zen4/3/bli_trsm_small_AVX512.c index 93490d5a65..3d10c3a9e4 100644 --- a/kernels/zen4/3/bli_trsm_small_AVX512.c +++ b/kernels/zen4/3/bli_trsm_small_AVX512.c @@ -729,7 +729,7 @@ err_t bli_trsm_small_mt_AVX512 // region - GEMM DTRSM for right variants #define BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \ - /*K loop is broken into two seperate loops + /*K loop is broken into two separate loops each loop computes k/2 iterations */ \ \ int itr = (k_iter / 2); /*itr count for first loop*/\ @@ -900,7 +900,7 @@ err_t bli_trsm_small_mt_AVX512 */ #define BLIS_DTRSM_SMALL_GEMM_8nx4m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \ - /*K loop is broken into two seperate loops + /*K loop is broken into two separate loops each loop computes k/2 iterations */ \ \ int itr = (k_iter / 2); /*itr count for first loop*/\ @@ -979,7 +979,7 @@ err_t bli_trsm_small_mt_AVX512 #define BLIS_DTRSM_SMALL_GEMM_8nx3m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \ - /*K loop is broken into two seperate loops + /*K loop is broken into two separate loops each loop computes k/2 iterations */ \ \ int itr = (k_iter / 2); /*itr count for first loop*/\ @@ -1062,7 +1062,7 @@ err_t bli_trsm_small_mt_AVX512 ymm16 = _mm256_add_pd(ymm16, ymm31); #define BLIS_DTRSM_SMALL_GEMM_8nx2m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \ - /*K loop is broken into two seperate loops + /*K loop is broken into two separate loops each loop computes k/2 iterations */ \ \ int itr = (k_iter / 2); /*itr count for first loop*/\ @@ -1142,7 +1142,7 @@ err_t bli_trsm_small_mt_AVX512 ymm16 = _mm256_add_pd(ymm16, ymm31); #define BLIS_DTRSM_SMALL_GEMM_8nx1m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \ - /*K loop is broken into two seperate loops + /*K loop is broken into two separate loops each loop computes k/2 iterations */ \ \ int itr = (k_iter / 2); /*itr count for first loop*/\ diff --git 
a/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c b/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c index 117a6cb564..4fc04901ca 100644 --- a/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c +++ b/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c @@ -916,7 +916,7 @@ void bli_zgemmsup_cv_zen4_asm_12x4m const double *v = &value; // Assigning the type of alpha and beta scaling - // In order to facilitate handling special cases seperately + // In order to facilitate handling special cases separately char alpha_mul_type = BLIS_MUL_DEFAULT; char beta_mul_type = BLIS_MUL_DEFAULT; @@ -1400,7 +1400,7 @@ void bli_zgemmsup_cv_zen4_asm_12x3m const double *v = &value; // Assigning the type of alpha and beta scaling - // In order to facilitate handling special cases seperately + // In order to facilitate handling special cases separately char alpha_mul_type = BLIS_MUL_DEFAULT; char beta_mul_type = BLIS_MUL_DEFAULT; @@ -1819,7 +1819,7 @@ void bli_zgemmsup_cv_zen4_asm_12x2m const double *v = &value; // Assigning the type of alpha and beta scaling - // In order to facilitate handling special cases seperately + // In order to facilitate handling special cases separately char alpha_mul_type = BLIS_MUL_DEFAULT; char beta_mul_type = BLIS_MUL_DEFAULT; @@ -2224,7 +2224,7 @@ void bli_zgemmsup_cv_zen4_asm_12x1m */ // Assigning the type of alpha and beta scaling - // In order to facilitate handling special cases seperately + // In order to facilitate handling special cases separately char alpha_mul_type = BLIS_MUL_DEFAULT; char beta_mul_type = BLIS_MUL_DEFAULT; diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c index bf0d4d8bcd..0086a48e8f 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c @@ -92,7 +92,7 @@ void bls_l3_thread_decorator // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); - // Check for a somewhat obscure OpenMP thread-mistmatch issue. 
+ // Check for a somewhat obscure OpenMP thread-mismatch issue. // NOTE: This calls the same function used for the conventional/large // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); diff --git a/sandbox/ref99/old/thread/blx_gemm_thread.c b/sandbox/ref99/old/thread/blx_gemm_thread.c index b5657aa4f2..3ae0da4dde 100644 --- a/sandbox/ref99/old/thread/blx_gemm_thread.c +++ b/sandbox/ref99/old/thread/blx_gemm_thread.c @@ -164,7 +164,7 @@ void blx_gemm_thread // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); - // Check for a somewhat obscure OpenMP thread-mistmatch issue. + // Check for a somewhat obscure OpenMP thread-mismatch issue. //bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the diff --git a/test/Makefile b/test/Makefile index 3370ce7157..5d1958b876 100644 --- a/test/Makefile +++ b/test/Makefile @@ -160,7 +160,7 @@ TEST_OPS := dotv axpyv \ gemm hemm herk her2k trmm trsm \ # Include extension API's added by AMD in operations list -# Keeping it seperate in case it needs to be guarded by a variable +# Keeping it separate in case it needs to be guarded by a variable TEST_OPS := $(TEST_OPS) axpbyv cabs1 copyv gemm3m gemm_batch \ gemmt imatcopy omatadd omatcopy omatcopy2 \ diff --git a/test/test_gemm_batch.c b/test/test_gemm_batch.c index 5660e4150e..cf69a9277a 100644 --- a/test/test_gemm_batch.c +++ b/test/test_gemm_batch.c @@ -45,7 +45,7 @@ /* Format for FILE input * For each input set, first line contains 'storage scheme' - * and 'group count' seperated by space. + * and 'group count' separated by space. 
* Following 'group_count' number of lines contains all the parameters of * each group separated by space in each line in the following order: * tA tB m n k lda ldb ldc alpha_r alpha_i beta_r beta_i group_size From d2ad26852598a823c2fce80136d1e90da469aaf2 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 8 Nov 2023 16:28:01 -0500 Subject: [PATCH 193/226] Compilation error when using pthreads on FreeBSD We are using pthread_self to get a thread id for use in the DTL tracing functionality to name individual output files per thread. This is not an appropriate use of pthread_self as its return type (pthread_t) is an opaque type that can vary between implementations. On Linux we haven't had a problem, as pthread_t is an unsigned long int. However on FreeBSD it is a pointer to an empty struct. The difference between this and the int type we used for its value within the BLIS code was causing a compile error. The best long-term solution would be for pthread builds to maintain their own internal thread id. A mechanism to implement this has not yet been identified. In the meantime, we make the following changes as a stopgap: - Explicitly cast from pthread_t return value to our BLIS internal data type AOCL_TID. - Make AOCL_TID a long int rather than pid_t (i.e. an int) in pthread builds to match the sizes expected on both Linux and FreeBSD. AMD-Internal: [CPUPL-4167] Change-Id: Ia07ee8f97273cc3bab46f6bca1eeb7954320415b --- aocl_dtl/aoclflist.c | 16 ++++++++++++++-- aocl_dtl/aoclos.c | 14 +++++++++++--- aocl_dtl/aocltpdef.h | 7 +++++-- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/aocl_dtl/aoclflist.c b/aocl_dtl/aoclflist.c index 5d44fdba87..15b58c9e80 100644 --- a/aocl_dtl/aoclflist.c +++ b/aocl_dtl/aoclflist.c @@ -5,10 +5,11 @@ * each thread. This is used to log the data * to correct file as per the current thread id. * - * Copyright (C) 2020, Advanced Micro Devices, Inc + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
* *==================================================================*/ +#include "blis.h" #include "aocltpdef.h" #include "aocldtl.h" #include "aoclfal.h" @@ -63,7 +64,11 @@ AOCL_FLIST_Node * AOCL_FLIST_GetNode(AOCL_FLIST_Node *plist, AOCL_TID tid) { if (temp->fp == NULL) { +#ifdef BLIS_ENABLE_PTHREADS + AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %ld", tid); +#else AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %d", tid); +#endif } return temp; } @@ -92,7 +97,11 @@ AOCL_FAL_FILE *AOCL_FLIST_GetFile(AOCL_FLIST_Node *plist, AOCL_TID tid) { if (temp->fp == NULL) { +#ifdef BLIS_ENABLE_PTHREADS + AOCL_DEBUGPRINT("File associated with this thread id %ld does not exists or closed", tid); +#else AOCL_DEBUGPRINT("File associated with this thread id %d does not exists or closed", tid); +#endif } return temp->fp; } @@ -118,8 +127,11 @@ AOCL_FAL_FILE *AOCL_FLIST_AddFile(const int8 *pchFilePrefix, AOCL_FLIST_Node **p } /* We don't have exiting file, lets try to open new one */ +#ifdef BLIS_ENABLE_PTHREADS + sprintf(pchFileName, "P%d_T%lu_%s", AOCL_getpid(), tid, pchFilePrefix); +#else sprintf(pchFileName, "P%d_T%u_%s", AOCL_getpid(), tid, pchFilePrefix); - +#endif file = AOCL_FAL_Open(pchFileName, "wb"); if (file == NULL) { diff --git a/aocl_dtl/aoclos.c b/aocl_dtl/aoclos.c index 6cbf075487..2e74091f55 100644 --- a/aocl_dtl/aoclos.c +++ b/aocl_dtl/aoclos.c @@ -3,9 +3,10 @@ * * Description : Abstraction for os services used by DTL. * - * Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
* *==================================================================*/ +#include "blis.h" #include "aocltpdef.h" #include "aocldtl.h" #include "aoclfal.h" @@ -47,7 +48,10 @@ AOCL_TID AOCL_gettid(void) return omp_get_thread_num(); #else #ifdef BLIS_ENABLE_PTHREADS - return pthread_self(); + // pthread_self is not suitable for this purpose and may be replaced + // in a later release with something else. It returns a value of type + // pthread_t, which on linux is an unsigned long int. + return (AOCL_TID) pthread_self(); #else return 0; #endif @@ -89,7 +93,11 @@ AOCL_TID AOCL_gettid(void) return omp_get_thread_num(); #else #ifdef BLIS_ENABLE_PTHREADS - return pthread_self(); + // pthread_self is not suitable for this purpose and may be replaced + // in a later release with something else. It returns a value of type + // pthread_t, whose type may depend upon the operating system. On + // freeBSD it is a pointer to an empty struct. + return (AOCL_TID) pthread_self(); #else return 0; #endif diff --git a/aocl_dtl/aocltpdef.h b/aocl_dtl/aocltpdef.h index d842fffbac..0036a6aea2 100644 --- a/aocl_dtl/aocltpdef.h +++ b/aocl_dtl/aocltpdef.h @@ -4,7 +4,7 @@ * * Description : Abstraction for various datatypes used by DTL. * - * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ #ifndef AOCL_TYPEDEF_H_ @@ -35,8 +35,11 @@ typedef signed long int int32; typedef short int int16; typedef Void *AOCL_HANDLE; +#ifdef BLIS_ENABLE_PTHREADS +typedef long int AOCL_TID; +#else typedef pid_t AOCL_TID; - +#endif #endif /*AOCL_TYPEDEF_H_ */ /* --------------- End of aocltpdef.h ----------------- */ From ed052c6c44cfed69511054ab587b147f421f6dc1 Mon Sep 17 00:00:00 2001 From: mkadavil Date: Wed, 8 Nov 2023 01:27:32 +0530 Subject: [PATCH 194/226] Smart Threading for LPGEMM 8s8s<16|32>os<8|16|32> API. 
The LPGEMM micro-kernel operates on blocks of dimension MRxKC and KCxNR. Current LPGEMM design involves using all the available threads for computing the output. If the number of threads assigned along ic or jc direction is more than M/MR or N/NR blocks respectively, it could result in threads sleeping due to the lack of MR or NR blocks. This scenario is now handled by reducing the number of threads if there are threads without any work (MR or NR blocks). AMD-Internal: [SWLCSG-2354, SWLCSG-2389, SWLCSG-2267] Change-Id: I74819337c7a0d3ab05ea0e18bb42780f977ea8f6 --- .../threading/lpgemm_thread_decor_openmp.c | 287 ++++++++++-------- 1 file changed, 157 insertions(+), 130 deletions(-) diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c index a9f9d2a236..ef798aa023 100644 --- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c +++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c @@ -268,7 +268,7 @@ BLIS_INLINE void lpgemm_adjust_ic_jc_ways } } -BLIS_INLINE void lpgemm_u8s8s16o16_get_threading +BLIS_INLINE void lpgemm_s16o16_get_threading ( dim_t* n_threads, dim_t* ic_ways, @@ -276,7 +276,8 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading dim_t m, dim_t n, dim_t k, - rntm_t* rntm_g + rntm_t* rntm_g, + AOCL_OPERATION_TYPE op_type ) { *n_threads = bli_rntm_num_threads( rntm_g ); @@ -295,19 +296,47 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading else if ( ( *n_threads ) > 1 ) { - dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S16OS16 ); + dim_t NR = lpgemm_get_block_size_NR_global_cntx( op_type ); + dim_t MR = lpgemm_get_block_size_MR_global_cntx( op_type ); + dim_t mr_blks = ( m + MR - 1 ) / MR; + dim_t nr_blks = ( n + NR - 1 ) / NR; if ( n <= NR ) { - // If n is less than micro panel dimension, allocating all threads - // to ic resulted in gains. - ( *ic_ways ) = ( *n_threads ); + ( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ?
mr_blks : ( *n_threads ); ( *jc_ways ) = 1; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( m <= MR ) + { + ( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads ); + ( *ic_ways ) = 1; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); } else { // If BLIS_NUM_THREADS are set, generate jc,ic from the same. bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways ); + if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) ) + { + ( *ic_ways ) = mr_blks; + ( *jc_ways ) = nr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( mr_blks < ( *ic_ways ) ) + { + ( *ic_ways ) = mr_blks; + dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) ); + ( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( nr_blks < ( *jc_ways ) ) + { + ( *jc_ways ) = nr_blks; + dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) ); + ( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? 
rem_ic_ways : mr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } } } else @@ -320,7 +349,7 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading } } -BLIS_INLINE void lpgemm_u8s8s32o32_get_threading +BLIS_INLINE void lpgemm_u8s8s16o16_get_threading ( dim_t* n_threads, dim_t* ic_ways, @@ -330,6 +359,47 @@ BLIS_INLINE void lpgemm_u8s8s32o32_get_threading dim_t k, rntm_t* rntm_g ) +{ + lpgemm_s16o16_get_threading + ( + n_threads, + ic_ways, jc_ways, + m, n, k, rntm_g, + U8S8S16OS16 + ); +} + +BLIS_INLINE void lpgemm_s8s8s16o16_get_threading + ( + dim_t* n_threads, + dim_t* ic_ways, + dim_t* jc_ways, + dim_t m, + dim_t n, + dim_t k, + rntm_t* rntm_g + ) +{ + lpgemm_s16o16_get_threading + ( + n_threads, + ic_ways, jc_ways, + m, n, k, rntm_g, + S8S8S16OS16 + ); +} + +BLIS_INLINE void lpgemm_s32o32_get_threading + ( + dim_t* n_threads, + dim_t* ic_ways, + dim_t* jc_ways, + dim_t m, + dim_t n, + dim_t k, + rntm_t* rntm_g, + AOCL_OPERATION_TYPE op_type + ) { *n_threads = bli_rntm_num_threads( rntm_g ); *jc_ways = bli_rntm_jc_ways( rntm_g ); @@ -347,26 +417,55 @@ BLIS_INLINE void lpgemm_u8s8s32o32_get_threading else if ( ( *n_threads ) > 1 ) { - dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S32OS32 ); - dim_t MR = lpgemm_get_block_size_MR_global_cntx( U8S8S32OS32 ); + dim_t NR = lpgemm_get_block_size_NR_global_cntx( op_type ); + dim_t MR = lpgemm_get_block_size_MR_global_cntx( op_type ); + dim_t mr_blks = ( m + MR - 1 ) / MR; + dim_t nr_blks = ( n + NR - 1 ) / NR; if ( n <= NR ) { - // If n is less than micro panel dimension, allocating all threads - // to ic resulted in gains. - ( *ic_ways ) = ( *n_threads ); + ( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads ); ( *jc_ways ) = 1; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( m <= MR ) + { + ( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? 
nr_blks : ( *n_threads ); + ( *ic_ways ) = 1; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); } else { // If BLIS_NUM_THREADS are set, generate jc,ic from the same. bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways ); - - lpgemm_pnl_wrk_heur_adjust_ic_jc_ways - ( - MR, NR, m, n, - n_threads, ic_ways, jc_ways - ); + if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) ) + { + ( *ic_ways ) = mr_blks; + ( *jc_ways ) = nr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( mr_blks < ( *ic_ways ) ) + { + ( *ic_ways ) = mr_blks; + dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) ); + ( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( nr_blks < ( *jc_ways ) ) + { + ( *jc_ways ) = nr_blks; + dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) ); + ( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else + { + lpgemm_pnl_wrk_heur_adjust_ic_jc_ways + ( + MR, NR, m, n, + n_threads, ic_ways, jc_ways + ); + } } } else @@ -379,6 +478,46 @@ BLIS_INLINE void lpgemm_u8s8s32o32_get_threading } } +BLIS_INLINE void lpgemm_u8s8s32o32_get_threading + ( + dim_t* n_threads, + dim_t* ic_ways, + dim_t* jc_ways, + dim_t m, + dim_t n, + dim_t k, + rntm_t* rntm_g + ) +{ + lpgemm_s32o32_get_threading + ( + n_threads, + ic_ways, jc_ways, + m, n, k, rntm_g, + U8S8S32OS32 + ); +} + +BLIS_INLINE void lpgemm_s8s8s32o32_get_threading + ( + dim_t* n_threads, + dim_t* ic_ways, + dim_t* jc_ways, + dim_t m, + dim_t n, + dim_t k, + rntm_t* rntm_g + ) +{ + lpgemm_s32o32_get_threading + ( + n_threads, + ic_ways, jc_ways, + m, n, k, rntm_g, + S8S8S32OS32 + ); +} + BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading ( dim_t* n_threads, @@ -523,118 +662,6 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading } } -BLIS_INLINE void lpgemm_s8s8s32o32_get_threading - ( - dim_t* n_threads, - dim_t* 
ic_ways, - dim_t* jc_ways, - dim_t m, - dim_t n, - dim_t k, - rntm_t* rntm_g - ) -{ - *n_threads = bli_rntm_num_threads( rntm_g ); - *jc_ways = bli_rntm_jc_ways( rntm_g ); - *ic_ways = bli_rntm_ic_ways( rntm_g ); - - if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) ) - { - // If BLIS_IC_NT or JC_NT are set. - // Default cases. - *ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1; - *jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1; - - *n_threads = ( *jc_ways ) * ( *ic_ways ); - } - else if ( ( *n_threads ) > 1 ) - { - - dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S32OS32 ); - dim_t MR = lpgemm_get_block_size_MR_global_cntx( S8S8S32OS32 ); - - if ( n <= NR ) - { - // If n is less than micro panel dimension, allocating all threads - // to ic resulted in gains. - ( *ic_ways ) = ( *n_threads ); - ( *jc_ways ) = 1; - } - else - { - // If BLIS_NUM_THREADS are set, generate jc,ic from the same. - bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways ); - - lpgemm_pnl_wrk_heur_adjust_ic_jc_ways - ( - MR, NR, m, n, - n_threads, ic_ways, jc_ways - ); - } - } - else - { - // Setting all the values to 1 in case n_threads <= 1. This ensures - // the threading parameters are valid. - *n_threads = 1; - *jc_ways = 1; - *ic_ways = 1; - } -} - -BLIS_INLINE void lpgemm_s8s8s16o16_get_threading - ( - dim_t* n_threads, - dim_t* ic_ways, - dim_t* jc_ways, - dim_t m, - dim_t n, - dim_t k, - rntm_t* rntm_g - ) -{ - *n_threads = bli_rntm_num_threads( rntm_g ); - *jc_ways = bli_rntm_jc_ways( rntm_g ); - *ic_ways = bli_rntm_ic_ways( rntm_g ); - - if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) ) - { - // If BLIS_IC_NT or JC_NT are set. - // Default cases. - *ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1; - *jc_ways = ( ( *jc_ways ) > 0 ) ? 
( *jc_ways ) : 1; - - *n_threads = ( *jc_ways ) * ( *ic_ways ); - } - else if ( ( *n_threads ) > 1 ) - { - - dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S16OS16 ); - - if ( n <= NR ) - { - // If n is less than micro panel dimension, allocating all threads - // to ic resulted in gains. - ( *ic_ways ) = ( *n_threads ); - ( *jc_ways ) = 1; - } - else - { - // If BLIS_NUM_THREADS are set, generate jc,ic from the same. - bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways ); - } - } - else - { - // Setting all the values to 1 in case n_threads <= 1. This ensures - // the threading parameters are valid. - *n_threads = 1; - *jc_ways = 1; - *ic_ways = 1; - } -} - - #define GEN_LPGEMM_OPENMP_DECORATOR(A_type,B_type,C_type,LPGEMM_SFX) \ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \ ( \ From 0c12b72651e4d295d25096b28106268cc6e16d56 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Mon, 6 Nov 2023 05:42:56 +0530 Subject: [PATCH 195/226] LPGEMM bench enhancements Details: - Moved the downscale & postop options from command line to input file. - Now the format of the input file is as follows: dt_in dt_out stor transa transb op_a op_b m n k lda ldb ldc postops - In case of no-postops, 'none' has to be passed in the place of postops. - Removed duplication of mat_mul_bench_main function for bf16 APIs. - Added a function called print_matrix for each datatype which can help in printing matrices while debugging. - Added printing of ref, computed and diff values while reporting failure. - Added new functions for memory allocation and freeing. A different type of memory allocation is chosen based on the mode the bench is running in (performance or accuracy mode).
Change-Id: Ia7d740c53035bc76e578a03869590c9f04396b72 --- addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c | 10 - bench/bench_aocl_gemm/bench_input.txt | 4761 ++++++++++++++---- bench/bench_aocl_gemm/bench_lpgemm.c | 2480 +++++---- 3 files changed, 4960 insertions(+), 2291 deletions(-) diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c index 9e27ae4fc7..f258755e1e 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c @@ -92,16 +92,6 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - // Transpose is not supported for B matrix yet. - if ( ( is_row_major == TRUE ) && ( bli_is_trans( blis_transb ) ) ) - { - return; // Error. - } - else if ( ( is_column_major == TRUE ) && ( bli_is_trans( blis_transa ) ) ) - { - return; // Error. - } - // Check if strides are valid for Row major inputs. 
if ( ( is_row_major == TRUE ) && ( ( bli_is_notrans( blis_transa ) && ( lda < k ) ) || diff --git a/bench/bench_aocl_gemm/bench_input.txt b/bench/bench_aocl_gemm/bench_input.txt index 3bfd6fb350..fbde59de5a 100644 --- a/bench/bench_aocl_gemm/bench_input.txt +++ b/bench/bench_aocl_gemm/bench_input.txt @@ -1,979 +1,3782 @@ -u r n n n p 480 20 2050 2050 20 20 -u r n n n p 481 20 2050 2050 20 20 -u r n n n p 482 20 2050 2050 20 20 -u r n n n p 483 20 2050 2050 20 20 -u r n n n R 484 20 2050 2050 20 20 -u r n n n R 485 20 2050 2050 20 20 -u r n n n R 480 39 2050 2050 39 39 -u r n n n R 481 39 2050 2050 39 39 -u r n n n R 482 39 2050 2050 39 39 -u r n n n R 483 39 2050 2050 39 39 -u r n n n R 484 39 2050 2050 39 39 -u r n n n p 485 39 2050 2050 39 39 -u r n n n p 480 50 2050 2050 50 50 -u r n n n p 481 50 2050 2050 50 50 -u r n n n p 482 50 2050 2050 50 50 -u r n n n p 483 50 2050 2050 50 50 -u r n n n p 484 50 2050 2050 50 50 -u r n n n p 485 50 2050 2050 50 50 -u r n n n R 480 1108 2050 2050 1108 1108 -u r n n n R 481 1108 2050 2050 1108 1108 -u r n n n R 482 1108 2050 2050 1108 1108 -u r n n n R 483 1108 2050 2050 1108 1108 -u r n n n R 484 1108 2050 2050 1108 1108 -u r n n n R 485 1108 2050 2050 1108 1108 -u r n n n R 480 1127 2050 2050 1127 1127 -u r n n n R 481 1127 2050 2050 1127 1127 -u r n n n R 482 1127 2050 2050 1127 1127 -u r n n n R 483 1127 2050 2050 1127 1127 -u r n n n p 484 1127 2050 2050 1127 1127 -u r n n n p 485 1127 2050 2050 1127 1127 -u r n n n p 480 1138 2050 2050 1138 1138 -u r n n n p 481 1138 2050 2050 1138 1138 -u r n n n p 482 1138 2050 2050 1138 1138 -u r n n n p 483 1138 2050 2050 1138 1138 -u r n n n p 484 1138 2050 2050 1138 1138 -u r n n n p 485 1138 2050 2050 1138 1138 -u r n n n p 1 1 3 3 1 1 -u r n n n p 1 9 3 3 9 9 -u r n n n p 1 2048 3 3 2048 2048 -u r n n n p 1 2048 5192 5192 2048 2048 -u r n n n p 9 1 3 3 1 1 -u r n n n p 576 1 3500 3500 1 1 -u r n n n p 1 1 1 1 1 1 -u r n n n p 102 1088 1024 1024 1088 1088 -u r n n n p 102 2048 
1024 1024 2048 2048 -u r n n n p 485 656 1024 1024 656 656 -u r n n n p 483 656 1024 1024 656 656 -u r n n n p 81 128 3 3 128 128 -u r n n n p 1022 512 515 515 512 512 -u r n n n p 74 512 515 515 512 512 -u r n n n p 253 2048 515 515 2048 2048 -u r n n n p 8192 1040 515 515 1040 1040 -u r n n n p 10 1029 515 515 1029 1029 -u r n n n p 24 1040 2050 2050 1040 1040 -u r n n n p 1024 1029 2050 2050 1029 1029 -u r n n n p 480 660 2050 2050 660 660 -u r n n n p 481 660 2050 2050 660 660 -u r n n n p 482 660 2050 2050 660 660 -u r n n n p 483 660 2050 2050 660 660 -u r n n n p 484 660 2050 2050 660 660 -u r n n n p 485 660 2050 2050 660 660 -u r n n n p 480 679 2050 2050 679 679 -u r n n n p 481 679 2050 2050 679 679 -u r n n n p 482 679 2050 2050 679 679 -u r n n n p 483 679 2050 2050 679 679 -u r n n n p 484 679 2050 2050 679 679 -u r n n n p 485 679 2050 2050 679 679 -u r n n n p 480 690 2050 2050 690 690 -u r n n n p 481 690 2050 2050 690 690 -u r n n n p 482 690 2050 2050 690 690 -u r n n n p 483 690 2050 2050 690 690 -u r n n n p 484 690 2050 2050 690 690 -u r n n n p 485 690 2050 2050 690 690 -u r n n n p 480 660 2048 2048 660 660 -u r n n n p 481 660 2048 2048 660 660 -u r n n n p 482 660 2048 2048 660 660 -u r n n n p 483 660 2048 2048 660 660 -u r n n n p 484 660 2048 2048 660 660 -u r n n n p 485 660 2048 2048 660 660 -u r n n n p 480 679 2048 2048 679 679 -u r n n n p 481 679 2048 2048 679 679 -u r n n n p 482 679 2048 2048 679 679 -u r n n n p 483 679 2048 2048 679 679 -u r n n n p 484 679 2048 2048 679 679 -u r n n n p 485 679 2048 2048 679 679 -u r n n n p 480 690 2048 2048 690 690 -u r n n n p 481 690 2048 2048 690 690 -u r n n n p 482 690 2048 2048 690 690 -u r n n n p 483 690 2048 2048 690 690 -u r n n n p 484 690 2048 2048 690 690 -u r n n n p 485 690 2048 2048 690 690 -u r n n n p 480 656 1024 1024 656 656 -u r n n n p 480 128 3 3 128 128 -u r n n n p 1024 512 515 515 512 512 -u r n n n p 1024 2048 1024 1024 2048 2048 -u r n n n p 1024 2048 515 515 
2048 2048 -u r n n n p 1024 1040 515 515 1040 1040 -u r n n n p 5 1029 515 515 1029 1029 -u r n n n p 1024 1029 515 515 1029 1029 -u r n n n p 1024 1040 2050 2050 1040 1040 -u r n n n p 1029 1029 2050 2050 1029 1029 -u r n n n R 480 646 2050 2050 646 646 -u r n n n R 481 646 2050 2050 646 646 -u r n n n R 482 646 2050 2050 646 646 -u r n n n R 483 646 2050 2050 646 646 -u r n n n R 484 646 2050 2050 646 646 -u r n n n R 485 646 2050 2050 646 646 -u r n n n R 481 656 2050 2050 656 656 -u r n n n R 482 656 2050 2050 656 656 -u r n n n R 483 656 2050 2050 656 656 -u r n n n R 484 656 2050 2050 656 656 -u r n n n p 485 656 2050 2050 656 656 -u r n n n p 480 672 2050 2050 672 672 -u r n n n p 481 672 2050 2050 672 672 -u r n n n p 482 672 2050 2050 672 672 -u r n n n p 483 672 2050 2050 672 672 -u r n n n p 484 672 2050 2050 672 672 -u r n n n p 485 672 2050 2050 672 672 -u r n n n p 480 688 2050 2050 688 688 -u r n n n p 481 688 2050 2050 688 688 -u r n n n r 482 688 2050 2050 688 688 -u r n n n r 483 688 2050 2050 688 688 -u r n n n r 484 688 2050 2050 688 688 -u r n n n r 485 688 2050 2050 688 688 -u r n n n r 1024 512 64 64 512 512 -u r n n n r 16 256 512 512 256 256 -u r n n n r 480 640 512 512 640 640 -u r n n n r 64 768 512 512 768 768 -u r n n n r 128 128 128 128 128 128 -u r n n n r 1024 64 512 512 64 64 -u r n n n r 1024 256 32 32 256 256 -u r n n n r 1024 512 64 64 512 512 -u r n n n r 480 640 512 512 640 640 -u r n n n p 1024 32 256 256 32 32 -u r n n n P 1024 64 512 512 64 64 -u r n n n P 64 800 320 320 800 800 -u r n n n P 64 768 512 512 768 768 -u r n n n P 16 256 512 512 256 256 -u r n n n P 128 128 128 128 128 128 -u r n n n P 256 512 256 256 512 512 -u r n n n P 1024 1024 1024 1024 1024 1024 -u r n n n P 480 640 1024 1024 640 640 -u r n n n P 480 640 256 256 640 640 -u r n n n P 8 64 32 32 64 64 -u r n n n P 9 64 32 32 64 64 -u r n n n P 10 128 64 64 128 128 -u r n n n P 8 8 8 8 8 8 -u r n n n P 12 12 12 12 12 12 -u r n n n P 25 25 25 25 25 25 -u r n n 
n P 25 25 20 20 25 25 -u r n n n r 4096 256 5 5 256 256 -u r n n n r 3000 256 128 128 256 256 -u r n n n r 4096 1024 512 512 1024 1024 -u r n n n r 144 256 5 5 256 256 -u r n n n r 144 256 128 128 256 256 -u r n n n r 144 1024 512 512 1024 1024 -u r n n n r 480 688 256 256 688 688 -u r n n n r 480 640 512 512 640 640 -u r n n n r 480 640 1024 1024 640 640 -u r n n n r 64 800 320 320 800 800 -u r n n n r 64 768 512 512 768 768 -u r n n n r 16 256 512 512 256 256 -u r n n n r 128 128 128 128 128 128 -u r n n n r 256 512 256 256 512 512 -u r n n n r 1024 1024 1024 1024 1024 1024 -u r n n n r 1024 32 256 256 32 32 -u r n n n r 1024 64 512 512 64 64 -u r n n n r 1024 256 32 32 256 256 -u r n n n r 1024 512 64 64 512 512 -u r n n n r 512 32 256 256 32 32 -u r n n n r 512 768 512 512 768 768 -u r n n n r 512 256 32 32 256 256 -u r n n n r 512 512 64 64 512 512 -u r n n n r 512 256 768 768 256 256 -u r n n n r 768 768 1024 1024 768 768 -u r n n n r 768 768 768 768 768 768 -u r n n n r 2048 2048 2048 2048 2048 2048 -u r n n n r 4096 4096 4096 4096 4096 4096 -f c n n n p 2482 1127 2050 2482 2050 2482 -f c n n n p 2483 1127 2050 2483 2050 2483 -f c n n n p 2484 1127 2050 2484 2050 2484 -f c n n n p 2485 1127 2050 2485 2050 2485 -f c n n n p 480 1138 2050 480 2050 480 -f c n n n p 481 1138 2050 481 2050 481 -f c n n n p 482 1138 2050 482 2050 482 -f c n n n p 483 1138 2050 483 2050 483 -f c n n n p 484 1138 2050 484 2050 484 -f c n n n p 485 1138 2050 485 2050 485 -f c n n n p 1 1 3 1 3 1 -f c n n n p 1 9 3 1 3 1 -f c n n n p 1 2048 3 1 3 1 -f c n n n p 1 2048 5192 1 5192 1 -f c n n n p 9 1 3 9 3 9 -f c n n n p 576 1 3500 576 3500 576 -f c n n n p 1 1 1 1 1 1 -f c n n n p 102 1088 1024 102 1024 102 -b r n n n r 480 20 2050 2050 20 20 -b r n n n r 481 20 2050 2050 20 20 -b r n n n r 482 20 2050 2050 20 20 -b r n n n p 483 20 2050 2050 20 20 -b r n n n R 484 20 2050 2050 20 20 -b r n n n R 485 20 2050 2050 20 20 -b r n n n R 480 39 2050 2050 39 39 -b r n n n R 481 39 2050 2050 
39 39 -b r n n n R 482 39 2050 2050 39 39 -b r n n n R 483 39 2050 2050 39 39 -b r n n n R 484 39 2050 2050 39 39 -b r n n n p 485 39 2050 2050 39 39 -b r n n n p 480 50 2050 2050 50 50 -b r n n n p 481 50 2050 2050 50 50 -b r n n n p 482 50 2050 2050 50 50 -b r n n n p 483 50 2050 2050 50 50 -b r n n n p 484 50 2050 2050 50 50 -b r n n n p 485 50 2050 2050 50 50 -b r n n n R 480 1108 2050 2050 1108 1108 -b r n n n R 481 1108 2050 2050 1108 1108 -b r n n n R 482 1108 2050 2050 1108 1108 -b r n n n R 483 1108 2050 2050 1108 1108 -b r n n n R 484 1108 2050 2050 1108 1108 -b r n n n R 485 1108 2050 2050 1108 1108 -b r n n n R 480 1127 2050 2050 1127 1127 -b r n n n R 481 1127 2050 2050 1127 1127 -b r n n n R 482 1127 2050 2050 1127 1127 -b r n n n R 483 1127 2050 2050 1127 1127 -b r n n n p 484 1127 2050 2050 1127 1127 -b r n n n p 485 1127 2050 2050 1127 1127 -b r n n n p 480 1138 2050 2050 1138 1138 -b r n n n p 481 1138 2050 2050 1138 1138 -b r n n n p 482 1138 2050 2050 1138 1138 -b r n n n p 483 1138 2050 2050 1138 1138 -b r n n n p 484 1138 2050 2050 1138 1138 -b r n n n p 485 1138 2050 2050 1138 1138 -b r n n n p 1 1 3 3 1 1 -b r n n n p 1 9 3 3 9 9 -b r n n n p 1 2048 3 3 2048 2048 -b r n n n p 1 2048 5192 5192 2048 2048 -b r n n n p 9 1 3 3 1 1 -b r n n n p 576 1 3500 3500 1 1 -b r n n n p 1 1 1 1 1 1 -b r n n n p 102 1088 1024 1024 1088 1088 -b r n n n p 102 2048 1024 1024 2048 2048 -b r n n n p 485 656 1024 1024 656 656 -b r n n n p 483 656 1024 1024 656 656 -b r n n n p 81 128 3 3 128 128 -b r n n n p 1022 512 515 515 512 512 -b r n n n p 74 512 515 515 512 512 -b r n n n p 253 2048 515 515 2048 2048 -b r n n n p 8192 1040 515 515 1040 1040 -b r n n n p 10 1029 515 515 1029 1029 -b r n n n p 24 1040 2050 2050 1040 1040 -b r n n n p 1024 1029 2050 2050 1029 1029 -b r n n n p 480 660 2050 2050 660 660 -b r n n n p 481 660 2050 2050 660 660 -b r n n n p 482 660 2050 2050 660 660 -b r n n n p 483 660 2050 2050 660 660 -b r n n n p 484 660 2050 2050 660 660 -b 
r n n n p 485 660 2050 2050 660 660 -b r n n n p 480 679 2050 2050 679 679 -b r n n n p 481 679 2050 2050 679 679 -b r n n n p 482 679 2050 2050 679 679 -b r n n n p 483 679 2050 2050 679 679 -b r n n n p 484 679 2050 2050 679 679 -b r n n n p 485 679 2050 2050 679 679 -b r n n n p 480 690 2050 2050 690 690 -b r n n n p 481 690 2050 2050 690 690 -b r n n n p 482 690 2050 2050 690 690 -b r n n n p 483 690 2050 2050 690 690 -b r n n n p 484 690 2050 2050 690 690 -b r n n n p 485 690 2050 2050 690 690 -b r n n n p 480 660 2048 2048 660 660 -b r n n n p 481 660 2048 2048 660 660 -b r n n n p 482 660 2048 2048 660 660 -b r n n n p 483 660 2048 2048 660 660 -b r n n n p 484 660 2048 2048 660 660 -b r n n n p 485 660 2048 2048 660 660 -b r n n n p 480 679 2048 2048 679 679 -b r n n n p 481 679 2048 2048 679 679 -b r n n n p 482 679 2048 2048 679 679 -b r n n n p 483 679 2048 2048 679 679 -b r n n n p 484 679 2048 2048 679 679 -b r n n n p 485 679 2048 2048 679 679 -b r n n n p 480 690 2048 2048 690 690 -b r n n n p 481 690 2048 2048 690 690 -b r n n n p 482 690 2048 2048 690 690 -b r n n n p 483 690 2048 2048 690 690 -b r n n n p 484 690 2048 2048 690 690 -b r n n n p 485 690 2048 2048 690 690 -b r n n n p 480 656 1024 1024 656 656 -b r n n n p 480 128 3 3 128 128 -b r n n n p 1024 512 515 515 512 512 -b r n n n p 1024 2048 1024 1024 2048 2048 -b r n n n p 1024 2048 515 515 2048 2048 -b r n n n p 1024 1040 515 515 1040 1040 -b r n n n p 5 1029 515 515 1029 1029 -b r n n n p 1024 1029 515 515 1029 1029 -b r n n n p 1024 1040 2050 2050 1040 1040 -b r n n n p 1029 1029 2050 2050 1029 1029 -b r n n n R 480 646 2050 2050 646 646 -b r n n n R 481 646 2050 2050 646 646 -b r n n n R 482 646 2050 2050 646 646 -b r n n n R 483 646 2050 2050 646 646 -b r n n n R 484 646 2050 2050 646 646 -b r n n n R 485 646 2050 2050 646 646 -b r n n n R 481 656 2050 2050 656 656 -b r n n n R 482 656 2050 2050 656 656 -b r n n n R 483 656 2050 2050 656 656 -b r n n n R 484 656 2050 2050 656 656 -b 
r n n n p 485 656 2050 2050 656 656 -b r n n n p 480 672 2050 2050 672 672 -b r n n n p 481 672 2050 2050 672 672 -b r n n n p 482 672 2050 2050 672 672 -b r n n n p 483 672 2050 2050 672 672 -b r n n n p 484 672 2050 2050 672 672 -b r n n n p 485 672 2050 2050 672 672 -b r n n n p 480 688 2050 2050 688 688 -b r n n n p 481 688 2050 2050 688 688 -b r n n n r 482 688 2050 2050 688 688 -b r n n n r 483 688 2050 2050 688 688 -b r n n n r 484 688 2050 2050 688 688 -b r n n n r 485 688 2050 2050 688 688 -b r n n n r 1024 512 64 64 512 512 -b r n n n r 16 256 512 512 256 256 -b r n n n r 480 640 512 512 640 640 -b r n n n r 64 768 512 512 768 768 -b r n n n r 128 128 128 128 128 128 -b r n n n r 1024 64 512 512 64 64 -b r n n n r 1024 256 32 32 256 256 -b r n n n r 1024 512 64 64 512 512 -b r n n n r 480 640 512 512 640 640 -b r n n n p 1024 32 256 256 32 32 -b r n n n P 1024 64 512 512 64 64 -b r n n n P 64 800 320 320 800 800 -b r n n n P 64 768 512 512 768 768 -b r n n n P 16 256 512 512 256 256 -b r n n n P 128 128 128 128 128 128 -b r n n n P 256 512 256 256 512 512 -b r n n n P 1024 1024 1024 1024 1024 1024 -b r n n n P 480 640 1024 1024 640 640 -b r n n n P 480 640 256 256 640 640 -b r n n n P 8 64 32 32 64 64 -b r n n n P 9 64 32 32 64 64 -b r n n n P 10 128 64 64 128 128 -b r n n n P 8 8 8 8 8 8 -b r n n n P 12 12 12 12 12 12 -b r n n n P 25 25 25 25 25 25 -b r n n n P 25 25 20 20 25 25 -b c n n n p 485 39 2050 485 2050 485 -b c n n n p 480 50 2050 480 2050 480 -b c n n n p 481 50 2050 481 2050 481 -b c n n n p 482 50 2050 482 2050 482 -b c n n n p 483 50 2050 483 2050 483 -b c n n n p 484 50 2050 484 2050 484 -b c n n n p 485 50 2050 485 2050 485 -b c n n n p 484 1127 2050 484 2050 484 -b c n n n p 485 1127 2050 485 2050 485 -b c n n n p 480 1138 2050 480 2050 480 -b c n n n p 481 1138 2050 481 2050 481 -b c n n n p 482 1138 2050 482 2050 482 -b c n n n p 483 1138 2050 483 2050 483 -b c n n n p 484 1138 2050 484 2050 484 -b c n n n p 485 1138 2050 485 2050 485 
-b c n n n p 1 1 3 1 3 1 -b c n n n p 1 9 3 1 3 1 -b c n n n p 1 2048 3 1 3 1 -b c n n n p 1 2048 5192 1 5192 1 -b c n n n p 9 1 3 9 3 9 -b c n n n p 576 1 3500 576 3500 576 -b c n n n p 1 1 1 1 1 1 -b c n n n p 102 1088 1024 102 1024 102 -b c n n n p 102 2048 1024 102 1024 102 -b c n n n p 485 656 1024 485 1024 485 -b c n n n p 483 656 1024 483 1024 483 -b c n n n p 81 128 3 81 3 81 -b c n n n p 1022 512 515 1022 515 1022 -b c n n n p 74 512 515 74 515 74 -b c n n n p 253 2048 515 253 515 253 -b c n n n p 8192 1040 515 8192 515 8192 -b c n n n p 10 1029 515 10 515 10 -b c n n n p 24 1040 2050 24 2050 24 -b c n n n p 1024 1029 2050 1024 2050 1024 -b c n n n p 480 660 2050 480 2050 480 -b c n n n p 481 660 2050 481 2050 481 -b c n n n p 482 660 2050 482 2050 482 -b c n n n p 483 660 2050 483 2050 483 -b c n n n p 484 660 2050 484 2050 484 -b c n n n p 485 660 2050 485 2050 485 -b c n n n p 480 679 2050 480 2050 480 -b c n n n p 481 679 2050 481 2050 481 -b c n n n p 482 679 2050 482 2050 482 -b c n n n p 483 679 2050 483 2050 483 -b c n n n p 484 679 2050 484 2050 484 -b c n n n p 485 679 2050 485 2050 485 -b c n n n p 480 690 2050 480 2050 480 -b c n n n p 481 690 2050 481 2050 481 -b c n n n p 482 690 2050 482 2050 482 -b c n n n p 483 690 2050 483 2050 483 -b c n n n p 484 690 2050 484 2050 484 -b c n n n p 485 690 2050 485 2050 485 -b c n n n p 480 660 2048 480 2048 480 -b c n n n p 481 660 2048 481 2048 481 -b c n n n p 482 660 2048 482 2048 482 -b c n n n p 483 660 2048 483 2048 483 -b c n n n p 484 660 2048 484 2048 484 -b c n n n p 485 660 2048 485 2048 485 -b c n n n p 480 679 2048 480 2048 480 -b c n n n p 481 679 2048 481 2048 481 -b c n n n p 482 679 2048 482 2048 482 -b c n n n p 483 679 2048 483 2048 483 -b c n n n p 484 679 2048 484 2048 484 -b c n n n p 485 679 2048 485 2048 485 -b c n n n p 480 690 2048 480 2048 480 -b c n n n p 481 690 2048 481 2048 481 -b c n n n p 482 690 2048 482 2048 482 -b c n n n p 483 690 2048 483 2048 483 -b c n n n p 484 
690 2048 484 2048 484 -b c n n n p 485 690 2048 485 2048 485 -b c n n n p 480 656 1024 480 1024 480 -b c n n n p 480 128 3 480 3 480 -b c n n n p 1024 512 515 1024 515 1024 -b c n n n p 1024 2048 1024 1024 1024 1024 -b c n n n p 1024 2048 515 1024 515 1024 -b c p n n n 1024 1040 515 1024 515 1024 -b c p n n n 5 1029 515 5 515 5 -b c p n n n 1024 1029 515 1024 515 1024 -b c p n n n 1024 1040 2050 1024 2050 1024 -b c p n n n 1029 1029 2050 1029 2050 1029 -b c p n n n 485 656 2050 485 2050 485 -b c p n n n 480 672 2050 480 2050 480 -b c p n n n 481 672 2050 481 2050 481 -b c p n n n 482 672 2050 482 2050 482 -b c p n n n 483 672 2050 483 2050 483 -b c p n n n 484 672 2050 484 2050 484 -b c p n n n 485 672 2050 485 2050 485 -b c p n n n 480 688 2050 480 2050 480 -b c p n n n 481 688 2050 481 2050 481 -b c p n n n 1024 32 256 1024 256 1024 -b c P n n n 1024 64 512 1024 512 1024 -b c P n n n 64 800 320 64 320 64 -b c P n n n 64 768 512 64 512 64 -b c P n n n 16 256 512 16 512 16 -b c P n n n 128 128 128 128 128 128 -b c P n n n 256 512 256 256 256 256 -b c P n n n 1024 1024 1024 1024 1024 1024 -b c P n n n 480 640 1024 480 1024 480 -b c P n n n 480 640 256 480 256 480 -b c P n n n 8 64 32 8 32 8 -b c P n n n 9 64 32 9 32 9 -b c P n n n 10 128 64 10 64 10 -b c P n n n 8 8 8 8 8 8 -b c P n n n 12 12 12 12 12 12 -b c P n n n 25 25 25 25 25 25 -b c P n n n 25 25 20 25 20 25 -s r n n n r 480 20 2050 2050 20 20 -s r n n n r 481 20 2050 2050 20 20 -s r n n n r 482 20 2050 2050 20 20 -s r n n n p 483 20 2050 2050 20 20 -s r n n n R 484 20 2050 2050 20 20 -s r n n n R 485 20 2050 2050 20 20 -s r n n n R 480 39 2050 2050 39 39 -s r n n n R 481 39 2050 2050 39 39 -s r n n n R 482 39 2050 2050 39 39 -s r n n n R 483 39 2050 2050 39 39 -s r n n n R 484 39 2050 2050 39 39 -s r n n n p 485 39 2050 2050 39 39 -s r n n n p 480 50 2050 2050 50 50 -s r n n n p 481 50 2050 2050 50 50 -s r n n n p 482 50 2050 2050 50 50 -s r n n n p 483 50 2050 2050 50 50 -s r n n n p 484 50 2050 2050 50 50 
-s r n n n p 485 50 2050 2050 50 50 -s r n n n R 480 1108 2050 2050 1108 1108 -s r n n n R 481 1108 2050 2050 1108 1108 -s r n n n R 482 1108 2050 2050 1108 1108 -s r n n n R 483 1108 2050 2050 1108 1108 -s r n n n R 484 1108 2050 2050 1108 1108 -s r n n n R 485 1108 2050 2050 1108 1108 -s r n n n R 480 1127 2050 2050 1127 1127 -s r n n n R 481 1127 2050 2050 1127 1127 -s r n n n R 482 1127 2050 2050 1127 1127 -s r n n n R 483 1127 2050 2050 1127 1127 -s r n n n p 484 1127 2050 2050 1127 1127 -s r n n n p 485 1127 2050 2050 1127 1127 -s r n n n p 480 1138 2050 2050 1138 1138 -s r n n n p 481 1138 2050 2050 1138 1138 -s r n n n p 482 1138 2050 2050 1138 1138 -s r n n n p 483 1138 2050 2050 1138 1138 -s r n n n p 484 1138 2050 2050 1138 1138 -s r n n n p 485 1138 2050 2050 1138 1138 -s r n n n p 1 1 3 3 1 1 -s r n n n p 1 9 3 3 9 9 -s r n n n p 1 2048 3 3 2048 2048 -s r n n n p 1 2048 5192 5192 2048 2048 -s r n n n p 9 1 3 3 1 1 -s r n n n p 576 1 3500 3500 1 1 -s r n n n p 1 1 1 1 1 1 -s r n n n p 102 1088 1024 1024 1088 1088 -s r n n n p 102 2048 1024 1024 2048 2048 -s r n n n p 485 656 1024 1024 656 656 -s r n n n p 483 656 1024 1024 656 656 -s r n n n p 81 128 3 3 128 128 -s r n n n p 1022 512 515 515 512 512 -s r n n n p 74 512 515 515 512 512 -s r n n n p 253 2048 515 515 2048 2048 -s r n n n p 8192 1040 515 515 1040 1040 -s r n n n p 10 1029 515 515 1029 1029 -s r n n n p 24 1040 2050 2050 1040 1040 -s r n n n p 1024 1029 2050 2050 1029 1029 -s r n n n p 480 660 2050 2050 660 660 -s r n n n p 481 660 2050 2050 660 660 -s r n n n p 482 660 2050 2050 660 660 -s r n n n p 483 660 2050 2050 660 660 -s r n n n p 484 660 2050 2050 660 660 -s r n n n p 485 660 2050 2050 660 660 -s r n n n p 480 679 2050 2050 679 679 -s r n n n p 481 679 2050 2050 679 679 -s r n n n p 482 679 2050 2050 679 679 -s r n n n p 483 679 2050 2050 679 679 -s r n n n p 484 679 2050 2050 679 679 -s r n n n p 485 679 2050 2050 679 679 -s r n n n p 480 690 2050 2050 690 690 -s r n n n p 481 690 
2050 2050 690 690 -s r n n n p 482 690 2050 2050 690 690 -s r n n n p 483 690 2050 2050 690 690 -s r n n n p 484 690 2050 2050 690 690 -s r n n n p 485 690 2050 2050 690 690 -s r n n n p 480 660 2048 2048 660 660 -s r n n n p 481 660 2048 2048 660 660 -s r n n n p 482 660 2048 2048 660 660 -s r n n n p 483 660 2048 2048 660 660 -s r n n n p 484 660 2048 2048 660 660 -s r n n n p 485 660 2048 2048 660 660 -s r n n n p 480 679 2048 2048 679 679 -s r n n n p 481 679 2048 2048 679 679 -s r n n n p 482 679 2048 2048 679 679 -s r n n n p 483 679 2048 2048 679 679 -s r n n n p 484 679 2048 2048 679 679 -s r n n n p 485 679 2048 2048 679 679 -s r n n n p 480 690 2048 2048 690 690 -s r n n n p 481 690 2048 2048 690 690 -s r n n n p 482 690 2048 2048 690 690 -s r n n n p 483 690 2048 2048 690 690 -s r n n n p 484 690 2048 2048 690 690 -s r n n n p 485 690 2048 2048 690 690 -s r n n n p 480 656 1024 1024 656 656 -s r n n n p 480 128 3 3 128 128 -s r n n n p 1024 512 515 515 512 512 -s r n n n p 1024 2048 1024 1024 2048 2048 -s r n n n p 1024 2048 515 515 2048 2048 -s r n n n p 1024 1040 515 515 1040 1040 -s r n n n p 5 1029 515 515 1029 1029 -s r n n n p 1024 1029 515 515 1029 1029 -s r n n n p 1024 1040 2050 2050 1040 1040 -s r n n n p 1029 1029 2050 2050 1029 1029 -s r n n n R 480 646 2050 2050 646 646 -s r n n n R 481 646 2050 2050 646 646 -s r n n n R 482 646 2050 2050 646 646 -s r n n n R 483 646 2050 2050 646 646 -s r n n n R 484 646 2050 2050 646 646 -s r n n n R 485 646 2050 2050 646 646 -s r n n n R 481 656 2050 2050 656 656 -s r n n n R 482 656 2050 2050 656 656 -s r n n n R 483 656 2050 2050 656 656 -s r n n n R 484 656 2050 2050 656 656 -s r n n n p 485 656 2050 2050 656 656 -s r n n n p 480 672 2050 2050 672 672 -s r n n n p 481 672 2050 2050 672 672 -s r n n n p 482 672 2050 2050 672 672 -s r n n n p 483 672 2050 2050 672 672 -s r n n n p 484 672 2050 2050 672 672 -s r n n n p 485 672 2050 2050 672 672 -s r n n n p 480 688 2050 2050 688 688 -s r n n n p 481 688 
2050 2050 688 688 -s r n n n r 482 688 2050 2050 688 688 -s r n n n r 483 688 2050 2050 688 688 -s r n n n r 484 688 2050 2050 688 688 -s r n n n r 485 688 2050 2050 688 688 -s r n n n r 1024 512 64 64 512 512 -s r n n n r 16 256 512 512 256 256 -s r n n n r 480 640 512 512 640 640 -s r n n n r 64 768 512 512 768 768 -s r n n n r 128 128 128 128 128 128 -s r n n n r 1024 64 512 512 64 64 -s r n n n r 1024 256 32 32 256 256 -s r n n n r 1024 512 64 64 512 512 -s r n n n r 480 640 512 512 640 640 -s r n n n p 1024 32 256 256 32 32 -s r n n n P 1024 64 512 512 64 64 -s r n n n P 64 800 320 320 800 800 -s r n n n P 64 768 512 512 768 768 -s r n n n P 16 256 512 512 256 256 -s r n n n P 128 128 128 128 128 128 -s r n n n P 256 512 256 256 512 512 -s r n n n P 1024 1024 1024 1024 1024 1024 -s r n n n P 480 640 1024 1024 640 640 -s r n n n P 480 640 256 256 640 640 -s r n n n P 8 64 32 32 64 64 -s r n n n P 9 64 32 32 64 64 -s r n n n P 10 128 64 64 128 128 -s r n n n P 8 8 8 8 8 8 -s r n n n P 12 12 12 12 12 12 -s r n n n P 25 25 25 25 25 25 -s r n n n P 25 25 20 20 25 25 -i r n n n p 480 20 2050 2050 20 20 -i r n n n p 481 20 2050 2050 20 20 -i r n n n p 482 20 2050 2050 20 20 -i r n n n p 483 20 2050 2050 20 20 -i r n n n R 484 20 2050 2050 20 20 -i r n n n R 485 20 2050 2050 20 20 -i r n n n R 480 39 2050 2050 39 39 -i r n n n R 481 39 2050 2050 39 39 -i r n n n R 482 39 2050 2050 39 39 -i r n n n R 483 39 2050 2050 39 39 -i r n n n R 484 39 2050 2050 39 39 -i r n n n p 485 39 2050 2050 39 39 -i r n n n p 480 50 2050 2050 50 50 -i r n n n p 481 50 2050 2050 50 50 -i r n n n p 482 50 2050 2050 50 50 -i r n n n p 483 50 2050 2050 50 50 -i r n n n p 484 50 2050 2050 50 50 -i r n n n p 485 50 2050 2050 50 50 -i r n n n R 480 1108 2050 2050 1108 1108 -i r n n n R 481 1108 2050 2050 1108 1108 -i r n n n R 482 1108 2050 2050 1108 1108 -i r n n n R 483 1108 2050 2050 1108 1108 -i r n n n R 484 1108 2050 2050 1108 1108 -i r n n n R 485 1108 2050 2050 1108 1108 -i r n n n R 480 
1127 2050 2050 1127 1127 -i r n n n R 481 1127 2050 2050 1127 1127 -i r n n n R 482 1127 2050 2050 1127 1127 -i r n n n R 483 1127 2050 2050 1127 1127 -i r n n n p 484 1127 2050 2050 1127 1127 -i r n n n p 485 1127 2050 2050 1127 1127 -i r n n n p 480 1138 2050 2050 1138 1138 -i r n n n p 481 1138 2050 2050 1138 1138 -i r n n n p 482 1138 2050 2050 1138 1138 -i r n n n p 483 1138 2050 2050 1138 1138 -i r n n n p 484 1138 2050 2050 1138 1138 -i r n n n p 485 1138 2050 2050 1138 1138 -i r n n n p 1 1 3 3 1 1 -i r n n n p 1 9 3 3 9 9 -i r n n n p 1 2048 3 3 2048 2048 -i r n n n p 1 2048 5192 5192 2048 2048 -i r n n n p 9 1 3 3 1 1 -i r n n n p 576 1 3500 3500 1 1 -i r n n n p 1 1 1 1 1 1 -i r n n n p 102 1088 1024 1024 1088 1088 -i r n n n p 102 2048 1024 1024 2048 2048 -i r n n n p 485 656 1024 1024 656 656 -i r n n n p 483 656 1024 1024 656 656 -i r n n n p 81 128 3 3 128 128 -i r n n n p 1022 512 515 515 512 512 -i r n n n p 74 512 515 515 512 512 -i r n n n p 253 2048 515 515 2048 2048 -i r n n n p 8192 1040 515 515 1040 1040 -i r n n n p 10 1029 515 515 1029 1029 -i r n n n p 24 1040 2050 2050 1040 1040 -i r n n n p 1024 1029 2050 2050 1029 1029 -i r n n n p 480 660 2050 2050 660 660 -i r n n n p 481 660 2050 2050 660 660 -i r n n n p 482 660 2050 2050 660 660 -i r n n n p 483 660 2050 2050 660 660 -i r n n n p 484 660 2050 2050 660 660 -i r n n n p 485 660 2050 2050 660 660 -i r n n n p 480 679 2050 2050 679 679 -i r n n n p 481 679 2050 2050 679 679 -i r n n n p 482 679 2050 2050 679 679 -i r n n n p 483 679 2050 2050 679 679 -i r n n n p 484 679 2050 2050 679 679 -i r n n n p 485 679 2050 2050 679 679 -i r n n n p 480 690 2050 2050 690 690 -i r n n n p 481 690 2050 2050 690 690 -i r n n n p 482 690 2050 2050 690 690 -i r n n n p 483 690 2050 2050 690 690 -i r n n n p 484 690 2050 2050 690 690 -i r n n n p 485 690 2050 2050 690 690 -i r n n n p 480 660 2048 2048 660 660 -i r n n n p 481 660 2048 2048 660 660 -i r n n n p 482 660 2048 2048 660 660 -i r n n n p 
483 660 2048 2048 660 660 -i r n n n p 484 660 2048 2048 660 660 -i r n n n p 485 660 2048 2048 660 660 -i r n n n p 480 679 2048 2048 679 679 -i r n n n p 481 679 2048 2048 679 679 -i r n n n p 482 679 2048 2048 679 679 -i r n n n p 483 679 2048 2048 679 679 -i r n n n p 484 679 2048 2048 679 679 -i r n n n p 485 679 2048 2048 679 679 -i r n n n p 480 690 2048 2048 690 690 -i r n n n p 481 690 2048 2048 690 690 -i r n n n p 482 690 2048 2048 690 690 -i r n n n p 483 690 2048 2048 690 690 -i r n n n p 484 690 2048 2048 690 690 -i r n n n p 485 690 2048 2048 690 690 -i r n n n p 480 656 1024 1024 656 656 -i r n n n p 480 128 3 3 128 128 -i r n n n p 1024 512 515 515 512 512 -i r n n n p 1024 2048 1024 1024 2048 2048 -i r n n n p 1024 2048 515 515 2048 2048 -i r n n n p 1024 1040 515 515 1040 1040 -i r n n n p 5 1029 515 515 1029 1029 -i r n n n p 1024 1029 515 515 1029 1029 -i r n n n p 1024 1040 2050 2050 1040 1040 -i r n n n p 1029 1029 2050 2050 1029 1029 -i r n n n R 480 646 2050 2050 646 646 -i r n n n R 481 646 2050 2050 646 646 -i r n n n R 482 646 2050 2050 646 646 -i r n n n R 483 646 2050 2050 646 646 -i r n n n R 484 646 2050 2050 646 646 -i r n n n R 485 646 2050 2050 646 646 -i r n n n R 481 656 2050 2050 656 656 -i r n n n R 482 656 2050 2050 656 656 -i r n n n R 483 656 2050 2050 656 656 -i r n n n R 484 656 2050 2050 656 656 -i r n n n p 485 656 2050 2050 656 656 -i r n n n p 480 672 2050 2050 672 672 -i r n n n p 481 672 2050 2050 672 672 -i r n n n p 482 672 2050 2050 672 672 -i r n n n p 483 672 2050 2050 672 672 -i r n n n p 484 672 2050 2050 672 672 -i r n n n p 485 672 2050 2050 672 672 -i r n n n p 480 688 2050 2050 688 688 -i r n n n p 481 688 2050 2050 688 688 -i r n n n r 482 688 2050 2050 688 688 -i r n n n r 483 688 2050 2050 688 688 -i r n n n r 484 688 2050 2050 688 688 -i r n n n r 485 688 2050 2050 688 688 -i r n n n r 1024 512 64 64 512 512 -i r n n n r 16 256 512 512 256 256 -i r n n n r 480 640 512 512 640 640 -i r n n n r 64 768 
512 512 768 768 -i r n n n r 128 128 128 128 128 128 -i r n n n r 1024 64 512 512 64 64 -i r n n n r 1024 256 32 32 256 256 -i r n n n r 1024 512 64 64 512 512 -i r n n n r 480 640 512 512 640 640 -i r n n n p 1024 32 256 256 32 32 -i r n n n P 1024 64 512 512 64 64 -i r n n n P 64 800 320 320 800 800 -i r n n n P 64 768 512 512 768 768 -i r n n n P 16 256 512 512 256 256 -i r n n n P 128 128 128 128 128 128 -i r n n n P 256 512 256 256 512 512 -i r n n n P 1024 1024 1024 1024 1024 1024 -i r n n n P 480 640 1024 1024 640 640 -i r n n n P 480 640 256 256 640 640 -i r n n n P 8 64 32 32 64 64 -i r n n n P 9 64 32 32 64 64 -i r n n n P 10 128 64 64 128 128 -i r n n n P 8 8 8 8 8 8 -i r n n n P 12 12 12 12 12 12 -i r n n n P 25 25 25 25 25 25 -i r n n n P 25 25 20 20 25 25 -f r n n n p 480 20 2050 2050 20 20 -f r n n n p 481 20 2050 2050 20 20 -f r n n n p 482 20 2050 2050 20 20 -f r n n n p 483 20 2050 2050 20 20 -f r n n n R 484 20 2050 2050 20 20 -f r n n n R 485 20 2050 2050 20 20 -f r n n n R 480 39 2050 2050 39 39 -f r n n n R 481 39 2050 2050 39 39 -f r n n n R 482 39 2050 2050 39 39 -f r n n n R 483 39 2050 2050 39 39 -f r n n n R 484 39 2050 2050 39 39 -f r n n n p 485 39 2050 2050 39 39 -f r n n n p 480 50 2050 2050 50 50 -f r n n n p 481 50 2050 2050 50 50 -f r n n n p 482 50 2050 2050 50 50 -f r n n n p 483 50 2050 2050 50 50 -f r n n n p 484 50 2050 2050 50 50 -f r n n n p 485 50 2050 2050 50 50 -f r n n n R 480 1108 2050 2050 1108 1108 -f r n n n R 481 1108 2050 2050 1108 1108 -f r n n n R 482 1108 2050 2050 1108 1108 -f r n n n R 483 1108 2050 2050 1108 1108 -f r n n n R 484 1108 2050 2050 1108 1108 -f r n n n R 485 1108 2050 2050 1108 1108 -f r n n n R 480 1127 2050 2050 1127 1127 -f r n n n R 481 1127 2050 2050 1127 1127 -f r n n n R 482 1127 2050 2050 1127 1127 -f r n n n R 483 1127 2050 2050 1127 1127 -f r n n n p 484 1127 2050 2050 1127 1127 -f r n n n p 485 1127 2050 2050 1127 1127 -f r n n n p 480 1138 2050 2050 1138 1138 -f r n n n p 481 1138 
2050 2050 1138 1138 -f r n n n p 482 1138 2050 2050 1138 1138 -f r n n n p 483 1138 2050 2050 1138 1138 -f r n n n p 484 1138 2050 2050 1138 1138 -f r n n n p 485 1138 2050 2050 1138 1138 -f r n n n p 1 1 3 3 1 1 -f r n n n p 1 9 3 3 9 9 -f r n n n p 1 2048 3 3 2048 2048 -f r n n n p 1 2048 5192 5192 2048 2048 -f r n n n p 9 1 3 3 1 1 -f r n n n p 576 1 3500 3500 1 1 -f r n n n p 1 1 1 1 1 1 -f r n n n p 102 1088 1024 1024 1088 1088 -f r n n n p 102 2048 1024 1024 2048 2048 -f r n n n p 485 656 1024 1024 656 656 -f r n n n p 483 656 1024 1024 656 656 -f r n n n p 81 128 3 3 128 128 -f r n n n p 1022 512 515 515 512 512 -f r n n n p 74 512 515 515 512 512 -f r n n n p 253 2048 515 515 2048 2048 -f r n n n p 8192 1040 515 515 1040 1040 -f r n n n p 10 1029 515 515 1029 1029 -f r n n n p 24 1040 2050 2050 1040 1040 -f r n n n p 1024 1029 2050 2050 1029 1029 -f r n n n p 480 660 2050 2050 660 660 -f r n n n p 481 660 2050 2050 660 660 -f r n n n p 482 660 2050 2050 660 660 -f r n n n p 483 660 2050 2050 660 660 -f r n n n p 484 660 2050 2050 660 660 -f r n n n p 485 660 2050 2050 660 660 -f r n n n p 480 679 2050 2050 679 679 -f r n n n p 481 679 2050 2050 679 679 -f r n n n p 482 679 2050 2050 679 679 -f r n n n p 483 679 2050 2050 679 679 -f r n n n p 484 679 2050 2050 679 679 -f r n n n p 485 679 2050 2050 679 679 -f r n n n p 480 690 2050 2050 690 690 -f r n n n p 481 690 2050 2050 690 690 -f r n n n p 482 690 2050 2050 690 690 -f r n n n p 483 690 2050 2050 690 690 -f r n n n p 484 690 2050 2050 690 690 -f r n n n p 485 690 2050 2050 690 690 -f r n n n p 480 660 2048 2048 660 660 -f r n n n p 481 660 2048 2048 660 660 -f r n n n p 482 660 2048 2048 660 660 -f r n n n p 483 660 2048 2048 660 660 -f r n n n p 484 660 2048 2048 660 660 -f r n n n p 485 660 2048 2048 660 660 -f r n n n p 480 679 2048 2048 679 679 -f r n n n p 481 679 2048 2048 679 679 -f r n n n p 482 679 2048 2048 679 679 -f r n n n p 483 679 2048 2048 679 679 -f r n n n p 484 679 2048 2048 679 679 
-f r n n n p 485 679 2048 2048 679 679 -f r n n n p 480 690 2048 2048 690 690 -f r n n n p 481 690 2048 2048 690 690 -f r n n n p 482 690 2048 2048 690 690 -f r n n n p 483 690 2048 2048 690 690 -f r n n n p 484 690 2048 2048 690 690 -f r n n n p 485 690 2048 2048 690 690 -f r n n n p 480 656 1024 1024 656 656 -f r n n n p 480 128 3 3 128 128 -f r n n n p 1024 512 515 515 512 512 -f r n n n p 1024 2048 1024 1024 2048 2048 -f r n n n p 1024 2048 515 515 2048 2048 -f r n n n p 1024 1040 515 515 1040 1040 -f r n n n p 5 1029 515 515 1029 1029 -f r n n n p 1024 1029 515 515 1029 1029 -f r n n n p 1024 1040 2050 2050 1040 1040 -f r n n n p 1029 1029 2050 2050 1029 1029 -f r n n n R 480 646 2050 2050 646 646 -f r n n n R 481 646 2050 2050 646 646 -f r n n n R 482 646 2050 2050 646 646 -f r n n n R 483 646 2050 2050 646 646 -f r n n n R 484 646 2050 2050 646 646 -f r n n n R 485 646 2050 2050 646 646 -f r n n n R 481 656 2050 2050 656 656 -f r n n n R 482 656 2050 2050 656 656 -f r n n n R 483 656 2050 2050 656 656 -f r n n n R 484 656 2050 2050 656 656 -f r n n n p 485 656 2050 2050 656 656 -f r n n n p 480 672 2050 2050 672 672 -f r n n n p 481 672 2050 2050 672 672 -f r n n n p 482 672 2050 2050 672 672 -f r n n n p 483 672 2050 2050 672 672 -f r n n n p 484 672 2050 2050 672 672 -f r n n n p 485 672 2050 2050 672 672 -f r n n n p 480 688 2050 2050 688 688 -f r n n n p 481 688 2050 2050 688 688 -f r n n n r 482 688 2050 2050 688 688 -f r n n n r 483 688 2050 2050 688 688 -f r n n n r 484 688 2050 2050 688 688 -f r n n n r 485 688 2050 2050 688 688 -f r n n n r 1024 512 64 64 512 512 -f r n n n r 16 256 512 512 256 256 -f r n n n r 480 640 512 512 640 640 -f r n n n r 64 768 512 512 768 768 -f r n n n r 128 128 128 128 128 128 -f r n n n r 1024 64 512 512 64 64 -f r n n n r 1024 256 32 32 256 256 -f r n n n r 1024 512 64 64 512 512 -f r n n n r 480 640 512 512 640 640 -f r n n n p 1024 32 256 256 32 32 -f r n n n P 1024 64 512 512 64 64 -f r n n n P 64 800 320 320 800 
800 -f r n n n P 64 768 512 512 768 768 -f r n n n P 16 256 512 512 256 256 -f r n n n P 128 128 128 128 128 128 -f r n n n P 256 512 256 256 512 512 -f r n n n P 1024 1024 1024 1024 1024 1024 -f r n n n P 480 640 1024 1024 640 640 -f r n n n P 480 640 256 256 640 640 -f r n n n P 8 64 32 32 64 64 -f r n n n P 9 64 32 32 64 64 -f r n n n P 10 128 64 64 128 128 -f r n n n P 8 8 8 8 8 8 -f r n n n P 12 12 12 12 12 12 -f r n n n P 25 25 25 25 25 25 -f r n n n P 25 25 20 20 25 25 -i r n n n r 4096 256 5 5 256 256 -i r n n n r 3000 256 128 128 256 256 -i r n n n r 4096 1024 512 512 1024 1024 -i r n n n r 144 256 5 5 256 256 -i r n n n r 144 256 128 128 256 256 -i r n n n r 144 1024 512 512 1024 1024 -i r n n n r 480 688 256 256 688 688 -i r n n n r 480 640 512 512 640 640 -i r n n n r 480 640 1024 1024 640 640 -i r n n n r 64 800 320 320 800 800 -i r n n n r 64 768 512 512 768 768 -i r n n n r 16 256 512 512 256 256 -i r n n n r 128 128 128 128 128 128 -i r n n n r 256 512 256 256 512 512 -i r n n n r 1024 1024 1024 1024 1024 1024 -i r n n n r 1024 32 256 256 32 32 -i r n n n r 1024 64 512 512 64 64 -i r n n n r 1024 256 32 32 256 256 -i r n n n r 1024 512 64 64 512 512 -i r n n n r 512 32 256 256 32 32 -i r n n n r 512 768 512 512 768 768 -i r n n n r 512 256 32 32 256 256 -i r n n n r 512 512 64 64 512 512 -i r n n n r 512 256 768 768 256 256 -i r n n n r 768 768 1024 1024 768 768 -i r n n n r 768 768 768 768 768 768 -i r n n n r 2048 2048 2048 2048 2048 2048 -i r n n n r 4096 4096 4096 4096 4096 4096 -f r n n n r 4096 256 5 5 256 256 -f r n n n r 3000 256 128 128 256 256 -f r n n n r 4096 1024 512 512 1024 1024 -f r n n n r 144 256 5 5 256 256 -f r n n n r 144 256 128 128 256 256 -f r n n n r 144 1024 512 512 1024 1024 -f r n n n r 480 688 256 256 688 688 -f r n n n r 480 640 512 512 640 640 -f r n n n r 480 640 1024 1024 640 640 -f r n n n r 64 800 320 320 800 800 -f r n n n r 64 768 512 512 768 768 -f r n n n r 16 256 512 512 256 256 -f r n n n r 128 128 128 128 
128 128 -f r n n n r 256 512 256 256 512 512 -f r n n n r 1024 1024 1024 1024 1024 1024 -f r n n n r 1024 32 256 256 32 32 -f r n n n r 1024 64 512 512 64 64 -f r n n n r 1024 256 32 32 256 256 -f r n n n r 1024 512 64 64 512 512 -f r n n n r 512 32 256 256 32 32 -f r n n n r 512 768 512 512 768 768 -f r n n n r 512 256 32 32 256 256 -f r n n n r 512 512 64 64 512 512 -f r n n n r 512 256 768 768 256 256 -f r n n n r 768 768 1024 1024 768 768 -f r n n n r 768 768 768 768 768 768 -f r n n n r 2048 2048 2048 2048 2048 2048 -f r n n n r 4096 4096 4096 4096 4096 4096 -f r n n n r 2048 1024 1024 1024 1024 1024 -f r n n n r 2048 4096 1024 1024 4096 4096 -f r n n n r 2048 1024 4096 4096 1024 1024 -f r n n n r 2048 1024 2 2 1024 1024 -f r n n n r 128 1024 1024 1024 1024 1024 -f r n n n r 1536 768 768 768 768 768 -f r n n n r 1536 3072 768 768 3072 3072 -f r n n n r 1536 768 3072 3072 768 768 -f r n n n r 1536 768 2 2 768 768 -f r n n n r 128 768 768 768 768 768 -f r n n n r 1024 8 13 13 8 8 -f r n n n r 1024 4 8 8 4 4 -f r n n n r 1024 128 355 355 128 128 -f r n n n r 1024 64 128 128 64 64 -f r n n n r 1024 1 64 64 1 1 -f r n n n r 480 1 256 256 1 1 -f r n n n r 480 256 512 512 256 256 -f r n n n r 480 1024 845 845 1024 1024 -f r n n n r 480 512 1024 1024 512 512 -f r n n n r 10 17191 128 128 17191 17191 -f r n n n r 10 512 256 256 512 512 +u s32 r n n n p 480 20 2050 2050 20 20 none +u s8 r n n n p 480 20 2050 2050 20 20 none +u s32 s8 r n n n p 480 20 2050 2050 20 20 bias,relu,clip +u s8 s8 r n n n p 480 20 2050 2050 20 20 bias,relu,clip +u s32 r n n n p 481 20 2050 2050 20 20 none +u s8 r n n n p 481 20 2050 2050 20 20 none +u s32 s8 r n n n p 481 20 2050 2050 20 20 bias,relu,clip +u s8 s8 r n n n p 481 20 2050 2050 20 20 bias,relu,clip +u s32 r n n n p 482 20 2050 2050 20 20 none +u s8 r n n n p 482 20 2050 2050 20 20 none +u s32 s8 r n n n p 482 20 2050 2050 20 20 bias,relu,clip +u s8 s8 r n n n p 482 20 2050 2050 20 20 bias,relu,clip +u s32 r n n n p 483 20 2050 2050 
20 20 none +u s8 r n n n p 483 20 2050 2050 20 20 none +u s32 s8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +u s8 s8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +u s32 r n n n R 484 20 2050 2050 20 20 none +u s8 r n n n R 484 20 2050 2050 20 20 none +u s32 s8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +u s8 s8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +u s32 r n n n R 485 20 2050 2050 20 20 none +u s8 r n n n R 485 20 2050 2050 20 20 none +u s32 s8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +u s8 s8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +u s32 r n n n R 480 39 2050 2050 39 39 none +u s8 r n n n R 480 39 2050 2050 39 39 none +u s32 s8 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +u s8 s8 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +u s32 r n n n R 481 39 2050 2050 39 39 none +u s8 r n n n R 481 39 2050 2050 39 39 none +u s32 s8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +u s8 s8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +u s32 r n n n R 482 39 2050 2050 39 39 none +u s8 r n n n R 482 39 2050 2050 39 39 none +u s32 s8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +u s8 s8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +u s32 r n n n R 483 39 2050 2050 39 39 none +u s8 r n n n R 483 39 2050 2050 39 39 none +u s32 s8 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +u s8 s8 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +u s32 r n n n R 484 39 2050 2050 39 39 none +u s8 r n n n R 484 39 2050 2050 39 39 none +u s32 s8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +u s8 s8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +u s32 r n n n p 485 39 2050 2050 39 39 none +u s8 r n n n p 485 39 2050 2050 39 39 none +u s32 s8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +u s8 s8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +u s32 r n n n p 480 50 2050 2050 50 50 none +u s8 r n n n p 480 50 2050 2050 50 50 none +u s32 s8 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +u s8 s8 r n n n p 480 50 2050 2050 50 50 
bias,relu,clip +u s32 r n n n p 481 50 2050 2050 50 50 none +u s8 r n n n p 481 50 2050 2050 50 50 none +u s32 s8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +u s8 s8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +u s32 r n n n p 482 50 2050 2050 50 50 none +u s8 r n n n p 482 50 2050 2050 50 50 none +u s32 s8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +u s8 s8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +u s32 r n n n p 483 50 2050 2050 50 50 none +u s8 r n n n p 483 50 2050 2050 50 50 none +u s32 s8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +u s8 s8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +u s32 r n n n p 484 50 2050 2050 50 50 none +u s8 r n n n p 484 50 2050 2050 50 50 none +u s32 s8 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +u s8 s8 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +u s32 r n n n p 485 50 2050 2050 50 50 none +u s8 r n n n p 485 50 2050 2050 50 50 none +u s32 s8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +u s8 s8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +u s32 r n n n R 480 1108 2050 2050 1108 1108 none +u s8 r n n n R 480 1108 2050 2050 1108 1108 none +u s32 s8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +u s8 s8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +u s32 r n n n R 481 1108 2050 2050 1108 1108 none +u s8 r n n n R 481 1108 2050 2050 1108 1108 none +u s32 s8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +u s8 s8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +u s32 r n n n R 482 1108 2050 2050 1108 1108 none +u s8 r n n n R 482 1108 2050 2050 1108 1108 none +u s32 s8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +u s8 s8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +u s32 r n n n R 483 1108 2050 2050 1108 1108 none +u s8 r n n n R 483 1108 2050 2050 1108 1108 none +u s32 s8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +u s8 s8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +u s32 r n n n R 484 1108 2050 2050 1108 1108 none 
+u s8 r n n n R 484 1108 2050 2050 1108 1108 none +u s32 s8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +u s8 s8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +u s32 r n n n R 485 1108 2050 2050 1108 1108 none +u s8 r n n n R 485 1108 2050 2050 1108 1108 none +u s32 s8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +u s8 s8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +u s32 r n n n R 480 1127 2050 2050 1127 1127 none +u s8 r n n n R 480 1127 2050 2050 1127 1127 none +u s32 s8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +u s8 s8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +u s32 r n n n R 481 1127 2050 2050 1127 1127 none +u s8 r n n n R 481 1127 2050 2050 1127 1127 none +u s32 s8 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +u s8 s8 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +u s32 r n n n R 482 1127 2050 2050 1127 1127 none +u s8 r n n n R 482 1127 2050 2050 1127 1127 none +u s32 s8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +u s8 s8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +u s32 r n n n R 483 1127 2050 2050 1127 1127 none +u s8 r n n n R 483 1127 2050 2050 1127 1127 none +u s32 s8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +u s8 s8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +u s32 r n n n p 484 1127 2050 2050 1127 1127 none +u s8 r n n n p 484 1127 2050 2050 1127 1127 none +u s32 s8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +u s8 s8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +u s32 r n n n p 485 1127 2050 2050 1127 1127 none +u s8 r n n n p 485 1127 2050 2050 1127 1127 none +u s32 s8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +u s8 s8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +u s32 r n n n p 480 1138 2050 2050 1138 1138 none +u s8 r n n n p 480 1138 2050 2050 1138 1138 none +u s32 s8 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +u s8 s8 r n n n p 480 1138 2050 2050 1138 1138 
bias,relu,clip +u s32 r n n n p 481 1138 2050 2050 1138 1138 none +u s8 r n n n p 481 1138 2050 2050 1138 1138 none +u s32 s8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +u s8 s8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +u s32 r n n n p 482 1138 2050 2050 1138 1138 none +u s8 r n n n p 482 1138 2050 2050 1138 1138 none +u s32 s8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +u s8 s8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +u s32 r n n n p 483 1138 2050 2050 1138 1138 none +u s8 r n n n p 483 1138 2050 2050 1138 1138 none +u s32 s8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +u s8 s8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +u s32 r n n n p 484 1138 2050 2050 1138 1138 none +u s8 r n n n p 484 1138 2050 2050 1138 1138 none +u s32 s8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +u s8 s8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +u s32 r n n n p 485 1138 2050 2050 1138 1138 none +u s8 r n n n p 485 1138 2050 2050 1138 1138 none +u s32 s8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +u s8 s8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +u s32 r n n n p 1 1 3 3 1 1 none +u s8 r n n n p 1 1 3 3 1 1 none +u s32 s8 r n n n p 1 1 3 3 1 1 bias,relu,clip +u s8 s8 r n n n p 1 1 3 3 1 1 bias,relu,clip +u s32 r n n n p 1 9 3 3 9 9 none +u s8 r n n n p 1 9 3 3 9 9 none +u s32 s8 r n n n p 1 9 3 3 9 9 bias,relu,clip +u s8 s8 r n n n p 1 9 3 3 9 9 bias,relu,clip +u s32 r n n n p 1 2048 3 3 2048 2048 none +u s8 r n n n p 1 2048 3 3 2048 2048 none +u s32 s8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +u s8 s8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +u s32 r n n n p 1 2048 5192 5192 2048 2048 none +u s8 r n n n p 1 2048 5192 5192 2048 2048 none +u s32 s8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +u s8 s8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +u s32 r n n n p 9 1 3 3 1 1 none +u s8 r n n n p 9 1 3 3 1 1 none +u s32 s8 r n n n p 9 1 3 3 1 1 
bias,relu,clip +u s8 s8 r n n n p 9 1 3 3 1 1 bias,relu,clip +u s32 r n n n p 576 1 3500 3500 1 1 none +u s8 r n n n p 576 1 3500 3500 1 1 none +u s32 s8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +u s8 s8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +u s32 r n n n p 1 1 1 1 1 1 none +u s8 r n n n p 1 1 1 1 1 1 none +u s32 s8 r n n n p 1 1 1 1 1 1 bias,relu,clip +u s8 s8 r n n n p 1 1 1 1 1 1 bias,relu,clip +u s32 r n n n p 102 1088 1024 1024 1088 1088 none +u s8 r n n n p 102 1088 1024 1024 1088 1088 none +u s32 s8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +u s8 s8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +u s32 r n n n p 102 2048 1024 1024 2048 2048 none +u s8 r n n n p 102 2048 1024 1024 2048 2048 none +u s32 s8 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +u s8 s8 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +u s32 r n n n p 485 656 1024 1024 656 656 none +u s8 r n n n p 485 656 1024 1024 656 656 none +u s32 s8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +u s8 s8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +u s32 r n n n p 483 656 1024 1024 656 656 none +u s8 r n n n p 483 656 1024 1024 656 656 none +u s32 s8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +u s8 s8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +u s32 r n n n p 81 128 3 3 128 128 none +u s8 r n n n p 81 128 3 3 128 128 none +u s32 s8 r n n n p 81 128 3 3 128 128 bias,relu,clip +u s8 s8 r n n n p 81 128 3 3 128 128 bias,relu,clip +u s32 r n n n p 1022 512 515 515 512 512 none +u s8 r n n n p 1022 512 515 515 512 512 none +u s32 s8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +u s8 s8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +u s32 r n n n p 74 512 515 515 512 512 none +u s8 r n n n p 74 512 515 515 512 512 none +u s32 s8 r n n n p 74 512 515 515 512 512 bias,relu,clip +u s8 s8 r n n n p 74 512 515 515 512 512 bias,relu,clip +u s32 r n n n p 253 2048 515 515 2048 2048 none +u s8 r n n n p 253 2048 515 515 2048 2048 
none +u s32 s8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +u s8 s8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +u s32 r n n n p 8192 1040 515 515 1040 1040 none +u s8 r n n n p 8192 1040 515 515 1040 1040 none +u s32 s8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +u s8 s8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +u s32 r n n n p 10 1029 515 515 1029 1029 none +u s8 r n n n p 10 1029 515 515 1029 1029 none +u s32 s8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +u s8 s8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +u s32 r n n n p 24 1040 2050 2050 1040 1040 none +u s8 r n n n p 24 1040 2050 2050 1040 1040 none +u s32 s8 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +u s8 s8 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +u s32 r n n n p 1024 1029 2050 2050 1029 1029 none +u s8 r n n n p 1024 1029 2050 2050 1029 1029 none +u s32 s8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +u s8 s8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +u s32 r n n n p 480 660 2050 2050 660 660 none +u s8 r n n n p 480 660 2050 2050 660 660 none +u s32 s8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +u s8 s8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +u s32 r n n n p 481 660 2050 2050 660 660 none +u s8 r n n n p 481 660 2050 2050 660 660 none +u s32 s8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +u s8 s8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +u s32 r n n n p 482 660 2050 2050 660 660 none +u s8 r n n n p 482 660 2050 2050 660 660 none +u s32 s8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +u s8 s8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +u s32 r n n n p 483 660 2050 2050 660 660 none +u s8 r n n n p 483 660 2050 2050 660 660 none +u s32 s8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +u s8 s8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +u s32 r n n n p 484 660 2050 2050 660 660 none +u s8 r n n n p 484 660 2050 2050 660 660 none +u s32 s8 r n n 
n p 484 660 2050 2050 660 660 bias,relu,clip +u s8 s8 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +u s32 r n n n p 485 660 2050 2050 660 660 none +u s8 r n n n p 485 660 2050 2050 660 660 none +u s32 s8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +u s8 s8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +u s32 r n n n p 480 679 2050 2050 679 679 none +u s8 r n n n p 480 679 2050 2050 679 679 none +u s32 s8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +u s8 s8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +u s32 r n n n p 481 679 2050 2050 679 679 none +u s8 r n n n p 481 679 2050 2050 679 679 none +u s32 s8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +u s8 s8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +u s32 r n n n p 482 679 2050 2050 679 679 none +u s8 r n n n p 482 679 2050 2050 679 679 none +u s32 s8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +u s8 s8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +u s32 r n n n p 483 679 2050 2050 679 679 none +u s8 r n n n p 483 679 2050 2050 679 679 none +u s32 s8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +u s8 s8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +u s32 r n n n p 484 679 2050 2050 679 679 none +u s8 r n n n p 484 679 2050 2050 679 679 none +u s32 s8 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +u s8 s8 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +u s32 r n n n p 485 679 2050 2050 679 679 none +u s8 r n n n p 485 679 2050 2050 679 679 none +u s32 s8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +u s8 s8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +u s32 r n n n p 480 690 2050 2050 690 690 none +u s8 r n n n p 480 690 2050 2050 690 690 none +u s32 s8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +u s8 s8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +u s32 r n n n p 481 690 2050 2050 690 690 none +u s8 r n n n p 481 690 2050 2050 690 690 none +u s32 s8 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +u s8 s8 r 
n n n p 481 690 2050 2050 690 690 bias,relu,clip +u s32 r n n n p 482 690 2050 2050 690 690 none +u s8 r n n n p 482 690 2050 2050 690 690 none +u s32 s8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +u s8 s8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +u s32 r n n n p 483 690 2050 2050 690 690 none +u s8 r n n n p 483 690 2050 2050 690 690 none +u s32 s8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +u s8 s8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +u s32 r n n n p 484 690 2050 2050 690 690 none +u s8 r n n n p 484 690 2050 2050 690 690 none +u s32 s8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +u s8 s8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +u s32 r n n n p 485 690 2050 2050 690 690 none +u s8 r n n n p 485 690 2050 2050 690 690 none +u s32 s8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +u s8 s8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +u s32 r n n n p 480 660 2048 2048 660 660 none +u s8 r n n n p 480 660 2048 2048 660 660 none +u s32 s8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +u s8 s8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +u s32 r n n n p 481 660 2048 2048 660 660 none +u s8 r n n n p 481 660 2048 2048 660 660 none +u s32 s8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +u s8 s8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +u s32 r n n n p 482 660 2048 2048 660 660 none +u s8 r n n n p 482 660 2048 2048 660 660 none +u s32 s8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +u s8 s8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +u s32 r n n n p 483 660 2048 2048 660 660 none +u s8 r n n n p 483 660 2048 2048 660 660 none +u s32 s8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +u s8 s8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +u s32 r n n n p 484 660 2048 2048 660 660 none +u s8 r n n n p 484 660 2048 2048 660 660 none +u s32 s8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +u s8 s8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +u s32 
r n n n p 485 660 2048 2048 660 660 none +u s8 r n n n p 485 660 2048 2048 660 660 none +u s32 s8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +u s8 s8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +u s32 r n n n p 480 679 2048 2048 679 679 none +u s8 r n n n p 480 679 2048 2048 679 679 none +u s32 s8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +u s8 s8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +u s32 r n n n p 481 679 2048 2048 679 679 none +u s8 r n n n p 481 679 2048 2048 679 679 none +u s32 s8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +u s8 s8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +u s32 r n n n p 482 679 2048 2048 679 679 none +u s8 r n n n p 482 679 2048 2048 679 679 none +u s32 s8 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +u s8 s8 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +u s32 r n n n p 483 679 2048 2048 679 679 none +u s8 r n n n p 483 679 2048 2048 679 679 none +u s32 s8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +u s8 s8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +u s32 r n n n p 484 679 2048 2048 679 679 none +u s8 r n n n p 484 679 2048 2048 679 679 none +u s32 s8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +u s8 s8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +u s32 r n n n p 485 679 2048 2048 679 679 none +u s8 r n n n p 485 679 2048 2048 679 679 none +u s32 s8 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +u s8 s8 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +u s32 r n n n p 480 690 2048 2048 690 690 none +u s8 r n n n p 480 690 2048 2048 690 690 none +u s32 s8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +u s8 s8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +u s32 r n n n p 481 690 2048 2048 690 690 none +u s8 r n n n p 481 690 2048 2048 690 690 none +u s32 s8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +u s8 s8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +u s32 r n n n p 482 690 2048 2048 690 690 none +u s8 r n n n 
p 482 690 2048 2048 690 690 none +u s32 s8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +u s8 s8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +u s32 r n n n p 483 690 2048 2048 690 690 none +u s8 r n n n p 483 690 2048 2048 690 690 none +u s32 s8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +u s8 s8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +u s32 r n n n p 484 690 2048 2048 690 690 none +u s8 r n n n p 484 690 2048 2048 690 690 none +u s32 s8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +u s8 s8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +u s32 r n n n p 485 690 2048 2048 690 690 none +u s8 r n n n p 485 690 2048 2048 690 690 none +u s32 s8 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +u s8 s8 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +u s32 r n n n p 480 656 1024 1024 656 656 none +u s8 r n n n p 480 656 1024 1024 656 656 none +u s32 s8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +u s8 s8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +u s32 r n n n p 480 128 3 3 128 128 none +u s8 r n n n p 480 128 3 3 128 128 none +u s32 s8 r n n n p 480 128 3 3 128 128 bias,relu,clip +u s8 s8 r n n n p 480 128 3 3 128 128 bias,relu,clip +u s32 r n n n p 1024 512 515 515 512 512 none +u s8 r n n n p 1024 512 515 515 512 512 none +u s32 s8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +u s8 s8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +u s32 r n n n p 1024 2048 1024 1024 2048 2048 none +u s8 r n n n p 1024 2048 1024 1024 2048 2048 none +u s32 s8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +u s8 s8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +u s32 r n n n p 1024 2048 515 515 2048 2048 none +u s8 r n n n p 1024 2048 515 515 2048 2048 none +u s32 s8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +u s8 s8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +u s32 r n n n p 1024 1040 515 515 1040 1040 none +u s8 r n n n p 1024 1040 515 515 1040 1040 none +u s32 s8 r n n n p 
1024 1040 515 515 1040 1040 bias,relu,clip +u s8 s8 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +u s32 r n n n p 5 1029 515 515 1029 1029 none +u s8 r n n n p 5 1029 515 515 1029 1029 none +u s32 s8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +u s8 s8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +u s32 r n n n p 1024 1029 515 515 1029 1029 none +u s8 r n n n p 1024 1029 515 515 1029 1029 none +u s32 s8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +u s8 s8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +u s32 r n n n p 1024 1040 2050 2050 1040 1040 none +u s8 r n n n p 1024 1040 2050 2050 1040 1040 none +u s32 s8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +u s8 s8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +u s32 r n n n p 1029 1029 2050 2050 1029 1029 none +u s8 r n n n p 1029 1029 2050 2050 1029 1029 none +u s32 s8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +u s8 s8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +u s32 r n n n R 480 646 2050 2050 646 646 none +u s8 r n n n R 480 646 2050 2050 646 646 none +u s32 s8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +u s8 s8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +u s32 r n n n R 481 646 2050 2050 646 646 none +u s8 r n n n R 481 646 2050 2050 646 646 none +u s32 s8 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +u s8 s8 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +u s32 r n n n R 482 646 2050 2050 646 646 none +u s8 r n n n R 482 646 2050 2050 646 646 none +u s32 s8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +u s8 s8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +u s32 r n n n R 483 646 2050 2050 646 646 none +u s8 r n n n R 483 646 2050 2050 646 646 none +u s32 s8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +u s8 s8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +u s32 r n n n R 484 646 2050 2050 646 646 none +u s8 r n n n R 484 646 2050 2050 646 646 none +u s32 s8 r n n n R 484 646 2050 
2050 646 646 bias,relu,clip +u s8 s8 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +u s32 r n n n R 485 646 2050 2050 646 646 none +u s8 r n n n R 485 646 2050 2050 646 646 none +u s32 s8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +u s8 s8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +u s32 r n n n R 481 656 2050 2050 656 656 none +u s8 r n n n R 481 656 2050 2050 656 656 none +u s32 s8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +u s8 s8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +u s32 r n n n R 482 656 2050 2050 656 656 none +u s8 r n n n R 482 656 2050 2050 656 656 none +u s32 s8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +u s8 s8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +u s32 r n n n R 483 656 2050 2050 656 656 none +u s8 r n n n R 483 656 2050 2050 656 656 none +u s32 s8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +u s8 s8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +u s32 r n n n R 484 656 2050 2050 656 656 none +u s8 r n n n R 484 656 2050 2050 656 656 none +u s32 s8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +u s8 s8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +u s32 r n n n p 485 656 2050 2050 656 656 none +u s8 r n n n p 485 656 2050 2050 656 656 none +u s32 s8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +u s8 s8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +u s32 r n n n p 480 672 2050 2050 672 672 none +u s8 r n n n p 480 672 2050 2050 672 672 none +u s32 s8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +u s8 s8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +u s32 r n n n p 481 672 2050 2050 672 672 none +u s8 r n n n p 481 672 2050 2050 672 672 none +u s32 s8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +u s8 s8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +u s32 r n n n p 482 672 2050 2050 672 672 none +u s8 r n n n p 482 672 2050 2050 672 672 none +u s32 s8 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +u s8 s8 r n n n p 482 672 
2050 2050 672 672 bias,relu,clip +u s32 r n n n p 483 672 2050 2050 672 672 none +u s8 r n n n p 483 672 2050 2050 672 672 none +u s32 s8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +u s8 s8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +u s32 r n n n p 484 672 2050 2050 672 672 none +u s8 r n n n p 484 672 2050 2050 672 672 none +u s32 s8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +u s8 s8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +u s32 r n n n p 485 672 2050 2050 672 672 none +u s8 r n n n p 485 672 2050 2050 672 672 none +u s32 s8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +u s8 s8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +u s32 r n n n p 480 688 2050 2050 688 688 none +u s8 r n n n p 480 688 2050 2050 688 688 none +u s32 s8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +u s8 s8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +u s32 r n n n p 481 688 2050 2050 688 688 none +u s8 r n n n p 481 688 2050 2050 688 688 none +u s32 s8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +u s8 s8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +u s32 r n n n r 482 688 2050 2050 688 688 none +u s8 r n n n r 482 688 2050 2050 688 688 none +u s32 s8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +u s8 s8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +u s32 r n n n r 483 688 2050 2050 688 688 none +u s8 r n n n r 483 688 2050 2050 688 688 none +u s32 s8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +u s8 s8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +u s32 r n n n r 484 688 2050 2050 688 688 none +u s8 r n n n r 484 688 2050 2050 688 688 none +u s32 s8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +u s8 s8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +u s32 r n n n r 485 688 2050 2050 688 688 none +u s8 r n n n r 485 688 2050 2050 688 688 none +u s32 s8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +u s8 s8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +u s32 r n n n r 1024 
512 64 64 512 512 none +u s8 r n n n r 1024 512 64 64 512 512 none +u s32 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +u s8 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +u s32 r n n n r 16 256 512 512 256 256 none +u s8 r n n n r 16 256 512 512 256 256 none +u s32 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +u s8 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +u s32 r n n n r 480 640 512 512 640 640 none +u s8 r n n n r 480 640 512 512 640 640 none +u s32 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +u s8 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +u s32 r n n n r 64 768 512 512 768 768 none +u s8 r n n n r 64 768 512 512 768 768 none +u s32 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +u s8 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +u s32 r n n n r 128 128 128 128 128 128 none +u s8 r n n n r 128 128 128 128 128 128 none +u s32 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +u s8 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +u s32 r n n n r 1024 64 512 512 64 64 none +u s8 r n n n r 1024 64 512 512 64 64 none +u s32 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +u s8 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +u s32 r n n n r 1024 256 32 32 256 256 none +u s8 r n n n r 1024 256 32 32 256 256 none +u s32 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +u s8 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +u s32 r n n n r 1024 512 64 64 512 512 none +u s8 r n n n r 1024 512 64 64 512 512 none +u s32 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +u s8 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +u s32 r n n n r 480 640 512 512 640 640 none +u s8 r n n n r 480 640 512 512 640 640 none +u s32 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +u s8 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +u s32 r n n n p 1024 32 256 256 32 32 none +u s8 r n n n p 1024 32 256 256 32 32 none +u s32 s8 r n n n p 1024 32 256 256 32 32 bias,relu,clip +u s8 s8 r n n n p 1024 32 256 
256 32 32 bias,relu,clip +u s32 r n n n P 1024 64 512 512 64 64 none +u s8 r n n n P 1024 64 512 512 64 64 none +u s32 s8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +u s8 s8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +u s32 r n n n P 64 800 320 320 800 800 none +u s8 r n n n P 64 800 320 320 800 800 none +u s32 s8 r n n n P 64 800 320 320 800 800 bias,relu,clip +u s8 s8 r n n n P 64 800 320 320 800 800 bias,relu,clip +u s32 r n n n P 64 768 512 512 768 768 none +u s8 r n n n P 64 768 512 512 768 768 none +u s32 s8 r n n n P 64 768 512 512 768 768 bias,relu,clip +u s8 s8 r n n n P 64 768 512 512 768 768 bias,relu,clip +u s32 r n n n P 16 256 512 512 256 256 none +u s8 r n n n P 16 256 512 512 256 256 none +u s32 s8 r n n n P 16 256 512 512 256 256 bias,relu,clip +u s8 s8 r n n n P 16 256 512 512 256 256 bias,relu,clip +u s32 r n n n P 128 128 128 128 128 128 none +u s8 r n n n P 128 128 128 128 128 128 none +u s32 s8 r n n n P 128 128 128 128 128 128 bias,relu,clip +u s8 s8 r n n n P 128 128 128 128 128 128 bias,relu,clip +u s32 r n n n P 256 512 256 256 512 512 none +u s8 r n n n P 256 512 256 256 512 512 none +u s32 s8 r n n n P 256 512 256 256 512 512 bias,relu,clip +u s8 s8 r n n n P 256 512 256 256 512 512 bias,relu,clip +u s32 r n n n P 1024 1024 1024 1024 1024 1024 none +u s8 r n n n P 1024 1024 1024 1024 1024 1024 none +u s32 s8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +u s8 s8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +u s32 r n n n P 480 640 1024 1024 640 640 none +u s8 r n n n P 480 640 1024 1024 640 640 none +u s32 s8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +u s8 s8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +u s32 r n n n P 480 640 256 256 640 640 none +u s8 r n n n P 480 640 256 256 640 640 none +u s32 s8 r n n n P 480 640 256 256 640 640 bias,relu,clip +u s8 s8 r n n n P 480 640 256 256 640 640 bias,relu,clip +u s32 r n n n P 8 64 32 32 64 64 none +u s8 r n n n P 8 64 32 32 64 64 none +u s32 s8 r n 
n n P 8 64 32 32 64 64 bias,relu,clip +u s8 s8 r n n n P 8 64 32 32 64 64 bias,relu,clip +u s32 r n n n P 9 64 32 32 64 64 none +u s8 r n n n P 9 64 32 32 64 64 none +u s32 s8 r n n n P 9 64 32 32 64 64 bias,relu,clip +u s8 s8 r n n n P 9 64 32 32 64 64 bias,relu,clip +u s32 r n n n P 10 128 64 64 128 128 none +u s8 r n n n P 10 128 64 64 128 128 none +u s32 s8 r n n n P 10 128 64 64 128 128 bias,relu,clip +u s8 s8 r n n n P 10 128 64 64 128 128 bias,relu,clip +u s32 r n n n P 8 8 8 8 8 8 none +u s8 r n n n P 8 8 8 8 8 8 none +u s32 s8 r n n n P 8 8 8 8 8 8 bias,relu,clip +u s8 s8 r n n n P 8 8 8 8 8 8 bias,relu,clip +u s32 r n n n P 12 12 12 12 12 12 none +u s8 r n n n P 12 12 12 12 12 12 none +u s32 s8 r n n n P 12 12 12 12 12 12 bias,relu,clip +u s8 s8 r n n n P 12 12 12 12 12 12 bias,relu,clip +u s32 r n n n P 25 25 25 25 25 25 none +u s8 r n n n P 25 25 25 25 25 25 none +u s32 s8 r n n n P 25 25 25 25 25 25 bias,relu,clip +u s8 s8 r n n n P 25 25 25 25 25 25 bias,relu,clip +u s32 r n n n P 25 25 20 20 25 25 none +u s8 r n n n P 25 25 20 20 25 25 none +u s32 s8 r n n n P 25 25 20 20 25 25 bias,relu,clip +u s8 s8 r n n n P 25 25 20 20 25 25 bias,relu,clip +u s32 r n n n r 4096 256 5 5 256 256 none +u s8 r n n n r 4096 256 5 5 256 256 none +u s32 s8 r n n n r 4096 256 5 5 256 256 bias,relu,clip +u s8 s8 r n n n r 4096 256 5 5 256 256 bias,relu,clip +u s32 r n n n r 3000 256 128 128 256 256 none +u s8 r n n n r 3000 256 128 128 256 256 none +u s32 s8 r n n n r 3000 256 128 128 256 256 bias,relu,clip +u s8 s8 r n n n r 3000 256 128 128 256 256 bias,relu,clip +u s32 r n n n r 4096 1024 512 512 1024 1024 none +u s8 r n n n r 4096 1024 512 512 1024 1024 none +u s32 s8 r n n n r 4096 1024 512 512 1024 1024 bias,relu,clip +u s8 s8 r n n n r 4096 1024 512 512 1024 1024 bias,relu,clip +u s32 r n n n r 144 256 5 5 256 256 none +u s8 r n n n r 144 256 5 5 256 256 none +u s32 s8 r n n n r 144 256 5 5 256 256 bias,relu,clip +u s8 s8 r n n n r 144 256 5 5 256 256 
bias,relu,clip +u s32 r n n n r 144 256 128 128 256 256 none +u s8 r n n n r 144 256 128 128 256 256 none +u s32 s8 r n n n r 144 256 128 128 256 256 bias,relu,clip +u s8 s8 r n n n r 144 256 128 128 256 256 bias,relu,clip +u s32 r n n n r 144 1024 512 512 1024 1024 none +u s8 r n n n r 144 1024 512 512 1024 1024 none +u s32 s8 r n n n r 144 1024 512 512 1024 1024 bias,relu,clip +u s8 s8 r n n n r 144 1024 512 512 1024 1024 bias,relu,clip +u s32 r n n n r 480 688 256 256 688 688 none +u s8 r n n n r 480 688 256 256 688 688 none +u s32 s8 r n n n r 480 688 256 256 688 688 bias,relu,clip +u s8 s8 r n n n r 480 688 256 256 688 688 bias,relu,clip +u s32 r n n n r 480 640 512 512 640 640 none +u s8 r n n n r 480 640 512 512 640 640 none +u s32 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +u s8 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +u s32 r n n n r 480 640 1024 1024 640 640 none +u s8 r n n n r 480 640 1024 1024 640 640 none +u s32 s8 r n n n r 480 640 1024 1024 640 640 bias,relu,clip +u s8 s8 r n n n r 480 640 1024 1024 640 640 bias,relu,clip +u s32 r n n n r 64 800 320 320 800 800 none +u s8 r n n n r 64 800 320 320 800 800 none +u s32 s8 r n n n r 64 800 320 320 800 800 bias,relu,clip +u s8 s8 r n n n r 64 800 320 320 800 800 bias,relu,clip +u s32 r n n n r 64 768 512 512 768 768 none +u s8 r n n n r 64 768 512 512 768 768 none +u s32 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +u s8 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +u s32 r n n n r 16 256 512 512 256 256 none +u s8 r n n n r 16 256 512 512 256 256 none +u s32 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +u s8 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +u s32 r n n n r 128 128 128 128 128 128 none +u s8 r n n n r 128 128 128 128 128 128 none +u s32 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +u s8 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +u s32 r n n n r 256 512 256 256 512 512 none +u s8 r n n n r 256 512 256 256 512 512 none +u s32 s8 r n 
n n r 256 512 256 256 512 512 bias,relu,clip +u s8 s8 r n n n r 256 512 256 256 512 512 bias,relu,clip +u s32 r n n n r 1024 1024 1024 1024 1024 1024 none +u s8 r n n n r 1024 1024 1024 1024 1024 1024 none +u s32 s8 r n n n r 1024 1024 1024 1024 1024 1024 bias,relu,clip +u s8 s8 r n n n r 1024 1024 1024 1024 1024 1024 bias,relu,clip +u s32 r n n n r 1024 32 256 256 32 32 none +u s8 r n n n r 1024 32 256 256 32 32 none +u s32 s8 r n n n r 1024 32 256 256 32 32 bias,relu,clip +u s8 s8 r n n n r 1024 32 256 256 32 32 bias,relu,clip +u s32 r n n n r 1024 64 512 512 64 64 none +u s8 r n n n r 1024 64 512 512 64 64 none +u s32 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +u s8 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +u s32 r n n n r 1024 256 32 32 256 256 none +u s8 r n n n r 1024 256 32 32 256 256 none +u s32 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +u s8 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +u s32 r n n n r 1024 512 64 64 512 512 none +u s8 r n n n r 1024 512 64 64 512 512 none +u s32 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +u s8 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +u s32 r n n n r 512 32 256 256 32 32 none +u s8 r n n n r 512 32 256 256 32 32 none +u s32 s8 r n n n r 512 32 256 256 32 32 bias,relu,clip +u s8 s8 r n n n r 512 32 256 256 32 32 bias,relu,clip +u s32 r n n n r 512 768 512 512 768 768 none +u s8 r n n n r 512 768 512 512 768 768 none +u s32 s8 r n n n r 512 768 512 512 768 768 bias,relu,clip +u s8 s8 r n n n r 512 768 512 512 768 768 bias,relu,clip +u s32 r n n n r 512 256 32 32 256 256 none +u s8 r n n n r 512 256 32 32 256 256 none +u s32 s8 r n n n r 512 256 32 32 256 256 bias,relu,clip +u s8 s8 r n n n r 512 256 32 32 256 256 bias,relu,clip +u s32 r n n n r 512 512 64 64 512 512 none +u s8 r n n n r 512 512 64 64 512 512 none +u s32 s8 r n n n r 512 512 64 64 512 512 bias,relu,clip +u s8 s8 r n n n r 512 512 64 64 512 512 bias,relu,clip +u s32 r n n n r 512 256 768 768 256 256 none +u s8 
r n n n r 512 256 768 768 256 256 none +u s32 s8 r n n n r 512 256 768 768 256 256 bias,relu,clip +u s8 s8 r n n n r 512 256 768 768 256 256 bias,relu,clip +u s32 r n n n r 768 768 1024 1024 768 768 none +u s8 r n n n r 768 768 1024 1024 768 768 none +u s32 s8 r n n n r 768 768 1024 1024 768 768 bias,relu,clip +u s8 s8 r n n n r 768 768 1024 1024 768 768 bias,relu,clip +u s32 r n n n r 768 768 768 768 768 768 none +u s8 r n n n r 768 768 768 768 768 768 none +u s32 s8 r n n n r 768 768 768 768 768 768 bias,relu,clip +u s8 s8 r n n n r 768 768 768 768 768 768 bias,relu,clip +u s32 r n n n r 2048 2048 2048 2048 2048 2048 none +u s8 r n n n r 2048 2048 2048 2048 2048 2048 none +u s32 s8 r n n n r 2048 2048 2048 2048 2048 2048 bias,relu,clip +u s8 s8 r n n n r 2048 2048 2048 2048 2048 2048 bias,relu,clip +u s32 r n n n r 4096 4096 4096 4096 4096 4096 none +u s8 r n n n r 4096 4096 4096 4096 4096 4096 none +u s32 s8 r n n n r 4096 4096 4096 4096 4096 4096 bias,relu,clip +u s8 s8 r n n n r 4096 4096 4096 4096 4096 4096 bias,relu,clip +f f32 c n n n p 2482 1127 2050 2482 2050 2482 none +f f32 f32 c n n n p 2482 1127 2050 2482 2050 2482 bias,relu,clip +f f32 c n n n p 2483 1127 2050 2483 2050 2483 none +f f32 f32 c n n n p 2483 1127 2050 2483 2050 2483 bias,relu,clip +f f32 c n n n p 2484 1127 2050 2484 2050 2484 none +f f32 f32 c n n n p 2484 1127 2050 2484 2050 2484 bias,relu,clip +f f32 c n n n p 2485 1127 2050 2485 2050 2485 none +f f32 f32 c n n n p 2485 1127 2050 2485 2050 2485 bias,relu,clip +f f32 c n n n p 480 1138 2050 480 2050 480 none +f f32 f32 c n n n p 480 1138 2050 480 2050 480 bias,relu,clip +f f32 c n n n p 481 1138 2050 481 2050 481 none +f f32 f32 c n n n p 481 1138 2050 481 2050 481 bias,relu,clip +f f32 c n n n p 482 1138 2050 482 2050 482 none +f f32 f32 c n n n p 482 1138 2050 482 2050 482 bias,relu,clip +f f32 c n n n p 483 1138 2050 483 2050 483 none +f f32 f32 c n n n p 483 1138 2050 483 2050 483 bias,relu,clip +f f32 c n n n p 484 1138 2050 484 
2050 484 none +f f32 f32 c n n n p 484 1138 2050 484 2050 484 bias,relu,clip +f f32 c n n n p 485 1138 2050 485 2050 485 none +f f32 f32 c n n n p 485 1138 2050 485 2050 485 bias,relu,clip +f f32 c n n n p 1 1 3 1 3 1 none +f f32 f32 c n n n p 1 1 3 1 3 1 bias,relu,clip +f f32 c n n n p 1 9 3 1 3 1 none +f f32 f32 c n n n p 1 9 3 1 3 1 bias,relu,clip +f f32 c n n n p 1 2048 3 1 3 1 none +f f32 f32 c n n n p 1 2048 3 1 3 1 bias,relu,clip +f f32 c n n n p 1 2048 5192 1 5192 1 none +f f32 f32 c n n n p 1 2048 5192 1 5192 1 bias,relu,clip +f f32 c n n n p 9 1 3 9 3 9 none +f f32 f32 c n n n p 9 1 3 9 3 9 bias,relu,clip +f f32 c n n n p 576 1 3500 576 3500 576 none +f f32 f32 c n n n p 576 1 3500 576 3500 576 bias,relu,clip +f f32 c n n n p 1 1 1 1 1 1 none +f f32 f32 c n n n p 1 1 1 1 1 1 bias,relu,clip +f f32 c n n n p 102 1088 1024 102 1024 102 none +f f32 f32 c n n n p 102 1088 1024 102 1024 102 bias,relu,clip +b f32 r n n n r 480 20 2050 2050 20 20 none +b bf16 r n n n r 480 20 2050 2050 20 20 none +b f32 bf16 r n n n r 480 20 2050 2050 20 20 bias,relu,clip +b bf16 bf16 r n n n r 480 20 2050 2050 20 20 bias,relu,clip +b f32 r n n n r 481 20 2050 2050 20 20 none +b bf16 r n n n r 481 20 2050 2050 20 20 none +b f32 bf16 r n n n r 481 20 2050 2050 20 20 bias,relu,clip +b bf16 bf16 r n n n r 481 20 2050 2050 20 20 bias,relu,clip +b f32 r n n n r 482 20 2050 2050 20 20 none +b bf16 r n n n r 482 20 2050 2050 20 20 none +b f32 bf16 r n n n r 482 20 2050 2050 20 20 bias,relu,clip +b bf16 bf16 r n n n r 482 20 2050 2050 20 20 bias,relu,clip +b f32 r n n n p 483 20 2050 2050 20 20 none +b bf16 r n n n p 483 20 2050 2050 20 20 none +b f32 bf16 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +b bf16 bf16 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +b f32 r n n n R 484 20 2050 2050 20 20 none +b bf16 r n n n R 484 20 2050 2050 20 20 none +b f32 bf16 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +b bf16 bf16 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +b f32 r n n n R 
485 20 2050 2050 20 20 none +b bf16 r n n n R 485 20 2050 2050 20 20 none +b f32 bf16 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +b bf16 bf16 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +b f32 r n n n R 480 39 2050 2050 39 39 none +b bf16 r n n n R 480 39 2050 2050 39 39 none +b f32 bf16 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +b bf16 bf16 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +b f32 r n n n R 481 39 2050 2050 39 39 none +b bf16 r n n n R 481 39 2050 2050 39 39 none +b f32 bf16 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +b bf16 bf16 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +b f32 r n n n R 482 39 2050 2050 39 39 none +b bf16 r n n n R 482 39 2050 2050 39 39 none +b f32 bf16 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +b bf16 bf16 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +b f32 r n n n R 483 39 2050 2050 39 39 none +b bf16 r n n n R 483 39 2050 2050 39 39 none +b f32 bf16 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +b bf16 bf16 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +b f32 r n n n R 484 39 2050 2050 39 39 none +b bf16 r n n n R 484 39 2050 2050 39 39 none +b f32 bf16 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +b bf16 bf16 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +b f32 r n n n p 485 39 2050 2050 39 39 none +b bf16 r n n n p 485 39 2050 2050 39 39 none +b f32 bf16 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +b bf16 bf16 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +b f32 r n n n p 480 50 2050 2050 50 50 none +b bf16 r n n n p 480 50 2050 2050 50 50 none +b f32 bf16 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +b bf16 bf16 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +b f32 r n n n p 481 50 2050 2050 50 50 none +b bf16 r n n n p 481 50 2050 2050 50 50 none +b f32 bf16 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +b bf16 bf16 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +b f32 r n n n p 482 50 2050 2050 50 50 none +b bf16 r n n n p 482 50 2050 2050 50 50 none +b f32 bf16 r n n 
n p 482 50 2050 2050 50 50 bias,relu,clip +b bf16 bf16 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +b f32 r n n n p 483 50 2050 2050 50 50 none +b bf16 r n n n p 483 50 2050 2050 50 50 none +b f32 bf16 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +b bf16 bf16 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +b f32 r n n n p 484 50 2050 2050 50 50 none +b bf16 r n n n p 484 50 2050 2050 50 50 none +b f32 bf16 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +b bf16 bf16 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +b f32 r n n n p 485 50 2050 2050 50 50 none +b bf16 r n n n p 485 50 2050 2050 50 50 none +b f32 bf16 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +b bf16 bf16 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +b f32 r n n n R 480 1108 2050 2050 1108 1108 none +b bf16 r n n n R 480 1108 2050 2050 1108 1108 none +b f32 bf16 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +b bf16 bf16 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +b f32 r n n n R 481 1108 2050 2050 1108 1108 none +b bf16 r n n n R 481 1108 2050 2050 1108 1108 none +b f32 bf16 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +b bf16 bf16 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +b f32 r n n n R 482 1108 2050 2050 1108 1108 none +b bf16 r n n n R 482 1108 2050 2050 1108 1108 none +b f32 bf16 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +b bf16 bf16 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +b f32 r n n n R 483 1108 2050 2050 1108 1108 none +b bf16 r n n n R 483 1108 2050 2050 1108 1108 none +b f32 bf16 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +b bf16 bf16 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +b f32 r n n n R 484 1108 2050 2050 1108 1108 none +b bf16 r n n n R 484 1108 2050 2050 1108 1108 none +b f32 bf16 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +b bf16 bf16 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +b f32 r n n n R 485 1108 2050 2050 1108 1108 none +b bf16 r n n n R 485 1108 
2050 2050 1108 1108 none +b f32 bf16 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +b bf16 bf16 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +b f32 r n n n R 480 1127 2050 2050 1127 1127 none +b bf16 r n n n R 480 1127 2050 2050 1127 1127 none +b f32 bf16 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +b bf16 bf16 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +b f32 r n n n R 481 1127 2050 2050 1127 1127 none +b bf16 r n n n R 481 1127 2050 2050 1127 1127 none +b f32 bf16 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +b bf16 bf16 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +b f32 r n n n R 482 1127 2050 2050 1127 1127 none +b bf16 r n n n R 482 1127 2050 2050 1127 1127 none +b f32 bf16 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +b bf16 bf16 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +b f32 r n n n R 483 1127 2050 2050 1127 1127 none +b bf16 r n n n R 483 1127 2050 2050 1127 1127 none +b f32 bf16 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +b bf16 bf16 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +b f32 r n n n p 484 1127 2050 2050 1127 1127 none +b bf16 r n n n p 484 1127 2050 2050 1127 1127 none +b f32 bf16 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +b bf16 bf16 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +b f32 r n n n p 485 1127 2050 2050 1127 1127 none +b bf16 r n n n p 485 1127 2050 2050 1127 1127 none +b f32 bf16 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +b bf16 bf16 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +b f32 r n n n p 480 1138 2050 2050 1138 1138 none +b bf16 r n n n p 480 1138 2050 2050 1138 1138 none +b f32 bf16 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +b bf16 bf16 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +b f32 r n n n p 481 1138 2050 2050 1138 1138 none +b bf16 r n n n p 481 1138 2050 2050 1138 1138 none +b f32 bf16 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +b bf16 bf16 r n n n 
p 481 1138 2050 2050 1138 1138 bias,relu,clip +b f32 r n n n p 482 1138 2050 2050 1138 1138 none +b bf16 r n n n p 482 1138 2050 2050 1138 1138 none +b f32 bf16 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +b bf16 bf16 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +b f32 r n n n p 483 1138 2050 2050 1138 1138 none +b bf16 r n n n p 483 1138 2050 2050 1138 1138 none +b f32 bf16 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +b bf16 bf16 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +b f32 r n n n p 484 1138 2050 2050 1138 1138 none +b bf16 r n n n p 484 1138 2050 2050 1138 1138 none +b f32 bf16 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +b bf16 bf16 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +b f32 r n n n p 485 1138 2050 2050 1138 1138 none +b bf16 r n n n p 485 1138 2050 2050 1138 1138 none +b f32 bf16 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +b bf16 bf16 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +b f32 r n n n p 1 1 3 3 1 1 none +b bf16 r n n n p 1 1 3 3 1 1 none +b f32 bf16 r n n n p 1 1 3 3 1 1 bias,relu,clip +b bf16 bf16 r n n n p 1 1 3 3 1 1 bias,relu,clip +b f32 r n n n p 1 9 3 3 9 9 none +b bf16 r n n n p 1 9 3 3 9 9 none +b f32 bf16 r n n n p 1 9 3 3 9 9 bias,relu,clip +b bf16 bf16 r n n n p 1 9 3 3 9 9 bias,relu,clip +b f32 r n n n p 1 2048 3 3 2048 2048 none +b bf16 r n n n p 1 2048 3 3 2048 2048 none +b f32 bf16 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +b bf16 bf16 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +b f32 r n n n p 1 2048 5192 5192 2048 2048 none +b bf16 r n n n p 1 2048 5192 5192 2048 2048 none +b f32 bf16 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +b bf16 bf16 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +b f32 r n n n p 9 1 3 3 1 1 none +b bf16 r n n n p 9 1 3 3 1 1 none +b f32 bf16 r n n n p 9 1 3 3 1 1 bias,relu,clip +b bf16 bf16 r n n n p 9 1 3 3 1 1 bias,relu,clip +b f32 r n n n p 576 1 3500 3500 1 1 none +b bf16 r n n n p 576 1 3500 
3500 1 1 none +b f32 bf16 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +b bf16 bf16 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +b f32 r n n n p 1 1 1 1 1 1 none +b bf16 r n n n p 1 1 1 1 1 1 none +b f32 bf16 r n n n p 1 1 1 1 1 1 bias,relu,clip +b bf16 bf16 r n n n p 1 1 1 1 1 1 bias,relu,clip +b f32 r n n n p 102 1088 1024 1024 1088 1088 none +b bf16 r n n n p 102 1088 1024 1024 1088 1088 none +b f32 bf16 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +b bf16 bf16 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +b f32 r n n n p 102 2048 1024 1024 2048 2048 none +b bf16 r n n n p 102 2048 1024 1024 2048 2048 none +b f32 bf16 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +b bf16 bf16 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +b f32 r n n n p 485 656 1024 1024 656 656 none +b bf16 r n n n p 485 656 1024 1024 656 656 none +b f32 bf16 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +b bf16 bf16 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +b f32 r n n n p 483 656 1024 1024 656 656 none +b bf16 r n n n p 483 656 1024 1024 656 656 none +b f32 bf16 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +b bf16 bf16 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +b f32 r n n n p 81 128 3 3 128 128 none +b bf16 r n n n p 81 128 3 3 128 128 none +b f32 bf16 r n n n p 81 128 3 3 128 128 bias,relu,clip +b bf16 bf16 r n n n p 81 128 3 3 128 128 bias,relu,clip +b f32 r n n n p 1022 512 515 515 512 512 none +b bf16 r n n n p 1022 512 515 515 512 512 none +b f32 bf16 r n n n p 1022 512 515 515 512 512 bias,relu,clip +b bf16 bf16 r n n n p 1022 512 515 515 512 512 bias,relu,clip +b f32 r n n n p 74 512 515 515 512 512 none +b bf16 r n n n p 74 512 515 515 512 512 none +b f32 bf16 r n n n p 74 512 515 515 512 512 bias,relu,clip +b bf16 bf16 r n n n p 74 512 515 515 512 512 bias,relu,clip +b f32 r n n n p 253 2048 515 515 2048 2048 none +b bf16 r n n n p 253 2048 515 515 2048 2048 none +b f32 bf16 r n n n p 253 2048 515 515 2048 2048 
bias,relu,clip +b bf16 bf16 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +b f32 r n n n p 8192 1040 515 515 1040 1040 none +b bf16 r n n n p 8192 1040 515 515 1040 1040 none +b f32 bf16 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +b bf16 bf16 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +b f32 r n n n p 10 1029 515 515 1029 1029 none +b bf16 r n n n p 10 1029 515 515 1029 1029 none +b f32 bf16 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +b bf16 bf16 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +b f32 r n n n p 24 1040 2050 2050 1040 1040 none +b bf16 r n n n p 24 1040 2050 2050 1040 1040 none +b f32 bf16 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +b bf16 bf16 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +b f32 r n n n p 1024 1029 2050 2050 1029 1029 none +b bf16 r n n n p 1024 1029 2050 2050 1029 1029 none +b f32 bf16 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +b bf16 bf16 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +b f32 r n n n p 480 660 2050 2050 660 660 none +b bf16 r n n n p 480 660 2050 2050 660 660 none +b f32 bf16 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +b bf16 bf16 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +b f32 r n n n p 481 660 2050 2050 660 660 none +b bf16 r n n n p 481 660 2050 2050 660 660 none +b f32 bf16 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +b bf16 bf16 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +b f32 r n n n p 482 660 2050 2050 660 660 none +b bf16 r n n n p 482 660 2050 2050 660 660 none +b f32 bf16 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +b bf16 bf16 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +b f32 r n n n p 483 660 2050 2050 660 660 none +b bf16 r n n n p 483 660 2050 2050 660 660 none +b f32 bf16 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +b bf16 bf16 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +b f32 r n n n p 484 660 2050 2050 660 660 none +b bf16 r n n n p 484 660 2050 2050 660 660 
none +b f32 bf16 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +b bf16 bf16 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +b f32 r n n n p 485 660 2050 2050 660 660 none +b bf16 r n n n p 485 660 2050 2050 660 660 none +b f32 bf16 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +b bf16 bf16 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +b f32 r n n n p 480 679 2050 2050 679 679 none +b bf16 r n n n p 480 679 2050 2050 679 679 none +b f32 bf16 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +b bf16 bf16 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +b f32 r n n n p 481 679 2050 2050 679 679 none +b bf16 r n n n p 481 679 2050 2050 679 679 none +b f32 bf16 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +b bf16 bf16 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +b f32 r n n n p 482 679 2050 2050 679 679 none +b bf16 r n n n p 482 679 2050 2050 679 679 none +b f32 bf16 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +b bf16 bf16 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +b f32 r n n n p 483 679 2050 2050 679 679 none +b bf16 r n n n p 483 679 2050 2050 679 679 none +b f32 bf16 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +b bf16 bf16 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +b f32 r n n n p 484 679 2050 2050 679 679 none +b bf16 r n n n p 484 679 2050 2050 679 679 none +b f32 bf16 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +b bf16 bf16 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +b f32 r n n n p 485 679 2050 2050 679 679 none +b bf16 r n n n p 485 679 2050 2050 679 679 none +b f32 bf16 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +b bf16 bf16 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +b f32 r n n n p 480 690 2050 2050 690 690 none +b bf16 r n n n p 480 690 2050 2050 690 690 none +b f32 bf16 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +b bf16 bf16 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +b f32 r n n n p 481 690 2050 2050 690 690 none +b bf16 r n n n p 481 690 
2050 2050 690 690 none +b f32 bf16 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +b bf16 bf16 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +b f32 r n n n p 482 690 2050 2050 690 690 none +b bf16 r n n n p 482 690 2050 2050 690 690 none +b f32 bf16 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +b bf16 bf16 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +b f32 r n n n p 483 690 2050 2050 690 690 none +b bf16 r n n n p 483 690 2050 2050 690 690 none +b f32 bf16 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +b bf16 bf16 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +b f32 r n n n p 484 690 2050 2050 690 690 none +b bf16 r n n n p 484 690 2050 2050 690 690 none +b f32 bf16 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +b bf16 bf16 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +b f32 r n n n p 485 690 2050 2050 690 690 none +b bf16 r n n n p 485 690 2050 2050 690 690 none +b f32 bf16 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +b bf16 bf16 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +b f32 r n n n p 480 660 2048 2048 660 660 none +b bf16 r n n n p 480 660 2048 2048 660 660 none +b f32 bf16 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +b bf16 bf16 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +b f32 r n n n p 481 660 2048 2048 660 660 none +b bf16 r n n n p 481 660 2048 2048 660 660 none +b f32 bf16 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +b bf16 bf16 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +b f32 r n n n p 482 660 2048 2048 660 660 none +b bf16 r n n n p 482 660 2048 2048 660 660 none +b f32 bf16 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +b bf16 bf16 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +b f32 r n n n p 483 660 2048 2048 660 660 none +b bf16 r n n n p 483 660 2048 2048 660 660 none +b f32 bf16 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +b bf16 bf16 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +b f32 r n n n p 484 660 2048 2048 660 660 none +b bf16 r 
n n n p 484 660 2048 2048 660 660 none +b f32 bf16 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +b bf16 bf16 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +b f32 r n n n p 485 660 2048 2048 660 660 none +b bf16 r n n n p 485 660 2048 2048 660 660 none +b f32 bf16 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +b bf16 bf16 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +b f32 r n n n p 480 679 2048 2048 679 679 none +b bf16 r n n n p 480 679 2048 2048 679 679 none +b f32 bf16 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +b bf16 bf16 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +b f32 r n n n p 481 679 2048 2048 679 679 none +b bf16 r n n n p 481 679 2048 2048 679 679 none +b f32 bf16 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +b bf16 bf16 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +b f32 r n n n p 482 679 2048 2048 679 679 none +b bf16 r n n n p 482 679 2048 2048 679 679 none +b f32 bf16 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +b bf16 bf16 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +b f32 r n n n p 483 679 2048 2048 679 679 none +b bf16 r n n n p 483 679 2048 2048 679 679 none +b f32 bf16 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +b bf16 bf16 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +b f32 r n n n p 484 679 2048 2048 679 679 none +b bf16 r n n n p 484 679 2048 2048 679 679 none +b f32 bf16 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +b bf16 bf16 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +b f32 r n n n p 485 679 2048 2048 679 679 none +b bf16 r n n n p 485 679 2048 2048 679 679 none +b f32 bf16 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +b bf16 bf16 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +b f32 r n n n p 480 690 2048 2048 690 690 none +b bf16 r n n n p 480 690 2048 2048 690 690 none +b f32 bf16 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +b bf16 bf16 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +b f32 r n n n p 481 690 2048 2048 690 
690 none +b bf16 r n n n p 481 690 2048 2048 690 690 none +b f32 bf16 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +b bf16 bf16 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +b f32 r n n n p 482 690 2048 2048 690 690 none +b bf16 r n n n p 482 690 2048 2048 690 690 none +b f32 bf16 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +b bf16 bf16 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +b f32 r n n n p 483 690 2048 2048 690 690 none +b bf16 r n n n p 483 690 2048 2048 690 690 none +b f32 bf16 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +b bf16 bf16 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +b f32 r n n n p 484 690 2048 2048 690 690 none +b bf16 r n n n p 484 690 2048 2048 690 690 none +b f32 bf16 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +b bf16 bf16 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +b f32 r n n n p 485 690 2048 2048 690 690 none +b bf16 r n n n p 485 690 2048 2048 690 690 none +b f32 bf16 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +b bf16 bf16 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +b f32 r n n n p 480 656 1024 1024 656 656 none +b bf16 r n n n p 480 656 1024 1024 656 656 none +b f32 bf16 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +b bf16 bf16 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +b f32 r n n n p 480 128 3 3 128 128 none +b bf16 r n n n p 480 128 3 3 128 128 none +b f32 bf16 r n n n p 480 128 3 3 128 128 bias,relu,clip +b bf16 bf16 r n n n p 480 128 3 3 128 128 bias,relu,clip +b f32 r n n n p 1024 512 515 515 512 512 none +b bf16 r n n n p 1024 512 515 515 512 512 none +b f32 bf16 r n n n p 1024 512 515 515 512 512 bias,relu,clip +b bf16 bf16 r n n n p 1024 512 515 515 512 512 bias,relu,clip +b f32 r n n n p 1024 2048 1024 1024 2048 2048 none +b bf16 r n n n p 1024 2048 1024 1024 2048 2048 none +b f32 bf16 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +b bf16 bf16 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +b f32 r n n n p 1024 2048 515 515 
2048 2048 none +b bf16 r n n n p 1024 2048 515 515 2048 2048 none +b f32 bf16 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +b bf16 bf16 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +b f32 r n n n p 1024 1040 515 515 1040 1040 none +b bf16 r n n n p 1024 1040 515 515 1040 1040 none +b f32 bf16 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +b bf16 bf16 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +b f32 r n n n p 5 1029 515 515 1029 1029 none +b bf16 r n n n p 5 1029 515 515 1029 1029 none +b f32 bf16 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +b bf16 bf16 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +b f32 r n n n p 1024 1029 515 515 1029 1029 none +b bf16 r n n n p 1024 1029 515 515 1029 1029 none +b f32 bf16 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +b bf16 bf16 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +b f32 r n n n p 1024 1040 2050 2050 1040 1040 none +b bf16 r n n n p 1024 1040 2050 2050 1040 1040 none +b f32 bf16 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +b bf16 bf16 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +b f32 r n n n p 1029 1029 2050 2050 1029 1029 none +b bf16 r n n n p 1029 1029 2050 2050 1029 1029 none +b f32 bf16 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +b bf16 bf16 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +b f32 r n n n R 480 646 2050 2050 646 646 none +b bf16 r n n n R 480 646 2050 2050 646 646 none +b f32 bf16 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +b bf16 bf16 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +b f32 r n n n R 481 646 2050 2050 646 646 none +b bf16 r n n n R 481 646 2050 2050 646 646 none +b f32 bf16 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +b bf16 bf16 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +b f32 r n n n R 482 646 2050 2050 646 646 none +b bf16 r n n n R 482 646 2050 2050 646 646 none +b f32 bf16 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +b bf16 bf16 r n n n R 482 646 
2050 2050 646 646 bias,relu,clip +b f32 r n n n R 483 646 2050 2050 646 646 none +b bf16 r n n n R 483 646 2050 2050 646 646 none +b f32 bf16 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +b bf16 bf16 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +b f32 r n n n R 484 646 2050 2050 646 646 none +b bf16 r n n n R 484 646 2050 2050 646 646 none +b f32 bf16 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +b bf16 bf16 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +b f32 r n n n R 485 646 2050 2050 646 646 none +b bf16 r n n n R 485 646 2050 2050 646 646 none +b f32 bf16 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +b bf16 bf16 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +b f32 r n n n R 481 656 2050 2050 656 656 none +b bf16 r n n n R 481 656 2050 2050 656 656 none +b f32 bf16 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +b bf16 bf16 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +b f32 r n n n R 482 656 2050 2050 656 656 none +b bf16 r n n n R 482 656 2050 2050 656 656 none +b f32 bf16 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +b bf16 bf16 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +b f32 r n n n R 483 656 2050 2050 656 656 none +b bf16 r n n n R 483 656 2050 2050 656 656 none +b f32 bf16 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +b bf16 bf16 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +b f32 r n n n R 484 656 2050 2050 656 656 none +b bf16 r n n n R 484 656 2050 2050 656 656 none +b f32 bf16 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +b bf16 bf16 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +b f32 r n n n p 485 656 2050 2050 656 656 none +b bf16 r n n n p 485 656 2050 2050 656 656 none +b f32 bf16 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +b bf16 bf16 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +b f32 r n n n p 480 672 2050 2050 672 672 none +b bf16 r n n n p 480 672 2050 2050 672 672 none +b f32 bf16 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +b bf16 bf16 r 
n n n p 480 672 2050 2050 672 672 bias,relu,clip +b f32 r n n n p 481 672 2050 2050 672 672 none +b bf16 r n n n p 481 672 2050 2050 672 672 none +b f32 bf16 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +b bf16 bf16 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +b f32 r n n n p 482 672 2050 2050 672 672 none +b bf16 r n n n p 482 672 2050 2050 672 672 none +b f32 bf16 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +b bf16 bf16 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +b f32 r n n n p 483 672 2050 2050 672 672 none +b bf16 r n n n p 483 672 2050 2050 672 672 none +b f32 bf16 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +b bf16 bf16 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +b f32 r n n n p 484 672 2050 2050 672 672 none +b bf16 r n n n p 484 672 2050 2050 672 672 none +b f32 bf16 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +b bf16 bf16 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +b f32 r n n n p 485 672 2050 2050 672 672 none +b bf16 r n n n p 485 672 2050 2050 672 672 none +b f32 bf16 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +b bf16 bf16 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +b f32 r n n n p 480 688 2050 2050 688 688 none +b bf16 r n n n p 480 688 2050 2050 688 688 none +b f32 bf16 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +b bf16 bf16 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +b f32 r n n n p 481 688 2050 2050 688 688 none +b bf16 r n n n p 481 688 2050 2050 688 688 none +b f32 bf16 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +b bf16 bf16 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +b f32 r n n n r 482 688 2050 2050 688 688 none +b bf16 r n n n r 482 688 2050 2050 688 688 none +b f32 bf16 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +b bf16 bf16 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +b f32 r n n n r 483 688 2050 2050 688 688 none +b bf16 r n n n r 483 688 2050 2050 688 688 none +b f32 bf16 r n n n r 483 688 2050 2050 688 688 
bias,relu,clip +b bf16 bf16 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +b f32 r n n n r 484 688 2050 2050 688 688 none +b bf16 r n n n r 484 688 2050 2050 688 688 none +b f32 bf16 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +b bf16 bf16 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +b f32 r n n n r 485 688 2050 2050 688 688 none +b bf16 r n n n r 485 688 2050 2050 688 688 none +b f32 bf16 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +b bf16 bf16 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +b f32 r n n n r 1024 512 64 64 512 512 none +b bf16 r n n n r 1024 512 64 64 512 512 none +b f32 bf16 r n n n r 1024 512 64 64 512 512 bias,relu,clip +b bf16 bf16 r n n n r 1024 512 64 64 512 512 bias,relu,clip +b f32 r n n n r 16 256 512 512 256 256 none +b bf16 r n n n r 16 256 512 512 256 256 none +b f32 bf16 r n n n r 16 256 512 512 256 256 bias,relu,clip +b bf16 bf16 r n n n r 16 256 512 512 256 256 bias,relu,clip +b f32 r n n n r 480 640 512 512 640 640 none +b bf16 r n n n r 480 640 512 512 640 640 none +b f32 bf16 r n n n r 480 640 512 512 640 640 bias,relu,clip +b bf16 bf16 r n n n r 480 640 512 512 640 640 bias,relu,clip +b f32 r n n n r 64 768 512 512 768 768 none +b bf16 r n n n r 64 768 512 512 768 768 none +b f32 bf16 r n n n r 64 768 512 512 768 768 bias,relu,clip +b bf16 bf16 r n n n r 64 768 512 512 768 768 bias,relu,clip +b f32 r n n n r 128 128 128 128 128 128 none +b bf16 r n n n r 128 128 128 128 128 128 none +b f32 bf16 r n n n r 128 128 128 128 128 128 bias,relu,clip +b bf16 bf16 r n n n r 128 128 128 128 128 128 bias,relu,clip +b f32 r n n n r 1024 64 512 512 64 64 none +b bf16 r n n n r 1024 64 512 512 64 64 none +b f32 bf16 r n n n r 1024 64 512 512 64 64 bias,relu,clip +b bf16 bf16 r n n n r 1024 64 512 512 64 64 bias,relu,clip +b f32 r n n n r 1024 256 32 32 256 256 none +b bf16 r n n n r 1024 256 32 32 256 256 none +b f32 bf16 r n n n r 1024 256 32 32 256 256 bias,relu,clip +b bf16 bf16 r n n n r 1024 256 32 32 256 256 
bias,relu,clip +b f32 r n n n r 1024 512 64 64 512 512 none +b bf16 r n n n r 1024 512 64 64 512 512 none +b f32 bf16 r n n n r 1024 512 64 64 512 512 bias,relu,clip +b bf16 bf16 r n n n r 1024 512 64 64 512 512 bias,relu,clip +b f32 r n n n r 480 640 512 512 640 640 none +b bf16 r n n n r 480 640 512 512 640 640 none +b f32 bf16 r n n n r 480 640 512 512 640 640 bias,relu,clip +b bf16 bf16 r n n n r 480 640 512 512 640 640 bias,relu,clip +b f32 r n n n p 1024 32 256 256 32 32 none +b bf16 r n n n p 1024 32 256 256 32 32 none +b f32 bf16 r n n n p 1024 32 256 256 32 32 bias,relu,clip +b bf16 bf16 r n n n p 1024 32 256 256 32 32 bias,relu,clip +b f32 r n n n P 1024 64 512 512 64 64 none +b bf16 r n n n P 1024 64 512 512 64 64 none +b f32 bf16 r n n n P 1024 64 512 512 64 64 bias,relu,clip +b bf16 bf16 r n n n P 1024 64 512 512 64 64 bias,relu,clip +b f32 r n n n P 64 800 320 320 800 800 none +b bf16 r n n n P 64 800 320 320 800 800 none +b f32 bf16 r n n n P 64 800 320 320 800 800 bias,relu,clip +b bf16 bf16 r n n n P 64 800 320 320 800 800 bias,relu,clip +b f32 r n n n P 64 768 512 512 768 768 none +b bf16 r n n n P 64 768 512 512 768 768 none +b f32 bf16 r n n n P 64 768 512 512 768 768 bias,relu,clip +b bf16 bf16 r n n n P 64 768 512 512 768 768 bias,relu,clip +b f32 r n n n P 16 256 512 512 256 256 none +b bf16 r n n n P 16 256 512 512 256 256 none +b f32 bf16 r n n n P 16 256 512 512 256 256 bias,relu,clip +b bf16 bf16 r n n n P 16 256 512 512 256 256 bias,relu,clip +b f32 r n n n P 128 128 128 128 128 128 none +b bf16 r n n n P 128 128 128 128 128 128 none +b f32 bf16 r n n n P 128 128 128 128 128 128 bias,relu,clip +b bf16 bf16 r n n n P 128 128 128 128 128 128 bias,relu,clip +b f32 r n n n P 256 512 256 256 512 512 none +b bf16 r n n n P 256 512 256 256 512 512 none +b f32 bf16 r n n n P 256 512 256 256 512 512 bias,relu,clip +b bf16 bf16 r n n n P 256 512 256 256 512 512 bias,relu,clip +b f32 r n n n P 1024 1024 1024 1024 1024 1024 none +b bf16 r n n n P 
1024 1024 1024 1024 1024 1024 none +b f32 bf16 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +b bf16 bf16 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +b f32 r n n n P 480 640 1024 1024 640 640 none +b bf16 r n n n P 480 640 1024 1024 640 640 none +b f32 bf16 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +b bf16 bf16 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +b f32 r n n n P 480 640 256 256 640 640 none +b bf16 r n n n P 480 640 256 256 640 640 none +b f32 bf16 r n n n P 480 640 256 256 640 640 bias,relu,clip +b bf16 bf16 r n n n P 480 640 256 256 640 640 bias,relu,clip +b f32 r n n n P 8 64 32 32 64 64 none +b bf16 r n n n P 8 64 32 32 64 64 none +b f32 bf16 r n n n P 8 64 32 32 64 64 bias,relu,clip +b bf16 bf16 r n n n P 8 64 32 32 64 64 bias,relu,clip +b f32 r n n n P 9 64 32 32 64 64 none +b bf16 r n n n P 9 64 32 32 64 64 none +b f32 bf16 r n n n P 9 64 32 32 64 64 bias,relu,clip +b bf16 bf16 r n n n P 9 64 32 32 64 64 bias,relu,clip +b f32 r n n n P 10 128 64 64 128 128 none +b bf16 r n n n P 10 128 64 64 128 128 none +b f32 bf16 r n n n P 10 128 64 64 128 128 bias,relu,clip +b bf16 bf16 r n n n P 10 128 64 64 128 128 bias,relu,clip +b f32 r n n n P 8 8 8 8 8 8 none +b bf16 r n n n P 8 8 8 8 8 8 none +b f32 bf16 r n n n P 8 8 8 8 8 8 bias,relu,clip +b bf16 bf16 r n n n P 8 8 8 8 8 8 bias,relu,clip +b f32 r n n n P 12 12 12 12 12 12 none +b bf16 r n n n P 12 12 12 12 12 12 none +b f32 bf16 r n n n P 12 12 12 12 12 12 bias,relu,clip +b bf16 bf16 r n n n P 12 12 12 12 12 12 bias,relu,clip +b f32 r n n n P 25 25 25 25 25 25 none +b bf16 r n n n P 25 25 25 25 25 25 none +b f32 bf16 r n n n P 25 25 25 25 25 25 bias,relu,clip +b bf16 bf16 r n n n P 25 25 25 25 25 25 bias,relu,clip +b f32 r n n n P 25 25 20 20 25 25 none +b bf16 r n n n P 25 25 20 20 25 25 none +b f32 bf16 r n n n P 25 25 20 20 25 25 bias,relu,clip +b bf16 bf16 r n n n P 25 25 20 20 25 25 bias,relu,clip +b f32 c n n n p 485 39 2050 485 2050 485 none +b bf16 c n n n p 
485 39 2050 485 2050 485 none +b f32 bf16 c n n n p 485 39 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 39 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 480 50 2050 480 2050 480 none +b bf16 c n n n p 480 50 2050 480 2050 480 none +b f32 bf16 c n n n p 480 50 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c n n n p 480 50 2050 480 2050 480 bias,relu,clip +b f32 c n n n p 481 50 2050 481 2050 481 none +b bf16 c n n n p 481 50 2050 481 2050 481 none +b f32 bf16 c n n n p 481 50 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c n n n p 481 50 2050 481 2050 481 bias,relu,clip +b f32 c n n n p 482 50 2050 482 2050 482 none +b bf16 c n n n p 482 50 2050 482 2050 482 none +b f32 bf16 c n n n p 482 50 2050 482 2050 482 bias,relu,clip +b bf16 bf16 c n n n p 482 50 2050 482 2050 482 bias,relu,clip +b f32 c n n n p 483 50 2050 483 2050 483 none +b bf16 c n n n p 483 50 2050 483 2050 483 none +b f32 bf16 c n n n p 483 50 2050 483 2050 483 bias,relu,clip +b bf16 bf16 c n n n p 483 50 2050 483 2050 483 bias,relu,clip +b f32 c n n n p 484 50 2050 484 2050 484 none +b bf16 c n n n p 484 50 2050 484 2050 484 none +b f32 bf16 c n n n p 484 50 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c n n n p 484 50 2050 484 2050 484 bias,relu,clip +b f32 c n n n p 485 50 2050 485 2050 485 none +b bf16 c n n n p 485 50 2050 485 2050 485 none +b f32 bf16 c n n n p 485 50 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 50 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 484 1127 2050 484 2050 484 none +b bf16 c n n n p 484 1127 2050 484 2050 484 none +b f32 bf16 c n n n p 484 1127 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c n n n p 484 1127 2050 484 2050 484 bias,relu,clip +b f32 c n n n p 485 1127 2050 485 2050 485 none +b bf16 c n n n p 485 1127 2050 485 2050 485 none +b f32 bf16 c n n n p 485 1127 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 1127 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 480 1138 2050 480 2050 480 none +b bf16 c n n n p 
480 1138 2050 480 2050 480 none +b f32 bf16 c n n n p 480 1138 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c n n n p 480 1138 2050 480 2050 480 bias,relu,clip +b f32 c n n n p 481 1138 2050 481 2050 481 none +b bf16 c n n n p 481 1138 2050 481 2050 481 none +b f32 bf16 c n n n p 481 1138 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c n n n p 481 1138 2050 481 2050 481 bias,relu,clip +b f32 c n n n p 482 1138 2050 482 2050 482 none +b bf16 c n n n p 482 1138 2050 482 2050 482 none +b f32 bf16 c n n n p 482 1138 2050 482 2050 482 bias,relu,clip +b bf16 bf16 c n n n p 482 1138 2050 482 2050 482 bias,relu,clip +b f32 c n n n p 483 1138 2050 483 2050 483 none +b bf16 c n n n p 483 1138 2050 483 2050 483 none +b f32 bf16 c n n n p 483 1138 2050 483 2050 483 bias,relu,clip +b bf16 bf16 c n n n p 483 1138 2050 483 2050 483 bias,relu,clip +b f32 c n n n p 484 1138 2050 484 2050 484 none +b bf16 c n n n p 484 1138 2050 484 2050 484 none +b f32 bf16 c n n n p 484 1138 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c n n n p 484 1138 2050 484 2050 484 bias,relu,clip +b f32 c n n n p 485 1138 2050 485 2050 485 none +b bf16 c n n n p 485 1138 2050 485 2050 485 none +b f32 bf16 c n n n p 485 1138 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 1138 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 1 1 3 1 3 1 none +b bf16 c n n n p 1 1 3 1 3 1 none +b f32 bf16 c n n n p 1 1 3 1 3 1 bias,relu,clip +b bf16 bf16 c n n n p 1 1 3 1 3 1 bias,relu,clip +b f32 c n n n p 1 9 3 1 3 1 none +b bf16 c n n n p 1 9 3 1 3 1 none +b f32 bf16 c n n n p 1 9 3 1 3 1 bias,relu,clip +b bf16 bf16 c n n n p 1 9 3 1 3 1 bias,relu,clip +b f32 c n n n p 1 2048 3 1 3 1 none +b bf16 c n n n p 1 2048 3 1 3 1 none +b f32 bf16 c n n n p 1 2048 3 1 3 1 bias,relu,clip +b bf16 bf16 c n n n p 1 2048 3 1 3 1 bias,relu,clip +b f32 c n n n p 1 2048 5192 1 5192 1 none +b bf16 c n n n p 1 2048 5192 1 5192 1 none +b f32 bf16 c n n n p 1 2048 5192 1 5192 1 bias,relu,clip +b bf16 bf16 c n n n p 1 2048 5192 1 
5192 1 bias,relu,clip +b f32 c n n n p 9 1 3 9 3 9 none +b bf16 c n n n p 9 1 3 9 3 9 none +b f32 bf16 c n n n p 9 1 3 9 3 9 bias,relu,clip +b bf16 bf16 c n n n p 9 1 3 9 3 9 bias,relu,clip +b f32 c n n n p 576 1 3500 576 3500 576 none +b bf16 c n n n p 576 1 3500 576 3500 576 none +b f32 bf16 c n n n p 576 1 3500 576 3500 576 bias,relu,clip +b bf16 bf16 c n n n p 576 1 3500 576 3500 576 bias,relu,clip +b f32 c n n n p 1 1 1 1 1 1 none +b bf16 c n n n p 1 1 1 1 1 1 none +b f32 bf16 c n n n p 1 1 1 1 1 1 bias,relu,clip +b bf16 bf16 c n n n p 1 1 1 1 1 1 bias,relu,clip +b f32 c n n n p 102 1088 1024 102 1024 102 none +b bf16 c n n n p 102 1088 1024 102 1024 102 none +b f32 bf16 c n n n p 102 1088 1024 102 1024 102 bias,relu,clip +b bf16 bf16 c n n n p 102 1088 1024 102 1024 102 bias,relu,clip +b f32 c n n n p 102 2048 1024 102 1024 102 none +b bf16 c n n n p 102 2048 1024 102 1024 102 none +b f32 bf16 c n n n p 102 2048 1024 102 1024 102 bias,relu,clip +b bf16 bf16 c n n n p 102 2048 1024 102 1024 102 bias,relu,clip +b f32 c n n n p 485 656 1024 485 1024 485 none +b bf16 c n n n p 485 656 1024 485 1024 485 none +b f32 bf16 c n n n p 485 656 1024 485 1024 485 bias,relu,clip +b bf16 bf16 c n n n p 485 656 1024 485 1024 485 bias,relu,clip +b f32 c n n n p 483 656 1024 483 1024 483 none +b bf16 c n n n p 483 656 1024 483 1024 483 none +b f32 bf16 c n n n p 483 656 1024 483 1024 483 bias,relu,clip +b bf16 bf16 c n n n p 483 656 1024 483 1024 483 bias,relu,clip +b f32 c n n n p 81 128 3 81 3 81 none +b bf16 c n n n p 81 128 3 81 3 81 none +b f32 bf16 c n n n p 81 128 3 81 3 81 bias,relu,clip +b bf16 bf16 c n n n p 81 128 3 81 3 81 bias,relu,clip +b f32 c n n n p 1022 512 515 1022 515 1022 none +b bf16 c n n n p 1022 512 515 1022 515 1022 none +b f32 bf16 c n n n p 1022 512 515 1022 515 1022 bias,relu,clip +b bf16 bf16 c n n n p 1022 512 515 1022 515 1022 bias,relu,clip +b f32 c n n n p 74 512 515 74 515 74 none +b bf16 c n n n p 74 512 515 74 515 74 none +b f32 bf16 c n n 
n p 74 512 515 74 515 74 bias,relu,clip +b bf16 bf16 c n n n p 74 512 515 74 515 74 bias,relu,clip +b f32 c n n n p 253 2048 515 253 515 253 none +b bf16 c n n n p 253 2048 515 253 515 253 none +b f32 bf16 c n n n p 253 2048 515 253 515 253 bias,relu,clip +b bf16 bf16 c n n n p 253 2048 515 253 515 253 bias,relu,clip +b f32 c n n n p 8192 1040 515 8192 515 8192 none +b bf16 c n n n p 8192 1040 515 8192 515 8192 none +b f32 bf16 c n n n p 8192 1040 515 8192 515 8192 bias,relu,clip +b bf16 bf16 c n n n p 8192 1040 515 8192 515 8192 bias,relu,clip +b f32 c n n n p 10 1029 515 10 515 10 none +b bf16 c n n n p 10 1029 515 10 515 10 none +b f32 bf16 c n n n p 10 1029 515 10 515 10 bias,relu,clip +b bf16 bf16 c n n n p 10 1029 515 10 515 10 bias,relu,clip +b f32 c n n n p 24 1040 2050 24 2050 24 none +b bf16 c n n n p 24 1040 2050 24 2050 24 none +b f32 bf16 c n n n p 24 1040 2050 24 2050 24 bias,relu,clip +b bf16 bf16 c n n n p 24 1040 2050 24 2050 24 bias,relu,clip +b f32 c n n n p 1024 1029 2050 1024 2050 1024 none +b bf16 c n n n p 1024 1029 2050 1024 2050 1024 none +b f32 bf16 c n n n p 1024 1029 2050 1024 2050 1024 bias,relu,clip +b bf16 bf16 c n n n p 1024 1029 2050 1024 2050 1024 bias,relu,clip +b f32 c n n n p 480 660 2050 480 2050 480 none +b bf16 c n n n p 480 660 2050 480 2050 480 none +b f32 bf16 c n n n p 480 660 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c n n n p 480 660 2050 480 2050 480 bias,relu,clip +b f32 c n n n p 481 660 2050 481 2050 481 none +b bf16 c n n n p 481 660 2050 481 2050 481 none +b f32 bf16 c n n n p 481 660 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c n n n p 481 660 2050 481 2050 481 bias,relu,clip +b f32 c n n n p 482 660 2050 482 2050 482 none +b bf16 c n n n p 482 660 2050 482 2050 482 none +b f32 bf16 c n n n p 482 660 2050 482 2050 482 bias,relu,clip +b bf16 bf16 c n n n p 482 660 2050 482 2050 482 bias,relu,clip +b f32 c n n n p 483 660 2050 483 2050 483 none +b bf16 c n n n p 483 660 2050 483 2050 483 none +b f32 bf16 c n 
n n p 483 660 2050 483 2050 483 bias,relu,clip +b bf16 bf16 c n n n p 483 660 2050 483 2050 483 bias,relu,clip +b f32 c n n n p 484 660 2050 484 2050 484 none +b bf16 c n n n p 484 660 2050 484 2050 484 none +b f32 bf16 c n n n p 484 660 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c n n n p 484 660 2050 484 2050 484 bias,relu,clip +b f32 c n n n p 485 660 2050 485 2050 485 none +b bf16 c n n n p 485 660 2050 485 2050 485 none +b f32 bf16 c n n n p 485 660 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 660 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 480 679 2050 480 2050 480 none +b bf16 c n n n p 480 679 2050 480 2050 480 none +b f32 bf16 c n n n p 480 679 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c n n n p 480 679 2050 480 2050 480 bias,relu,clip +b f32 c n n n p 481 679 2050 481 2050 481 none +b bf16 c n n n p 481 679 2050 481 2050 481 none +b f32 bf16 c n n n p 481 679 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c n n n p 481 679 2050 481 2050 481 bias,relu,clip +b f32 c n n n p 482 679 2050 482 2050 482 none +b bf16 c n n n p 482 679 2050 482 2050 482 none +b f32 bf16 c n n n p 482 679 2050 482 2050 482 bias,relu,clip +b bf16 bf16 c n n n p 482 679 2050 482 2050 482 bias,relu,clip +b f32 c n n n p 483 679 2050 483 2050 483 none +b bf16 c n n n p 483 679 2050 483 2050 483 none +b f32 bf16 c n n n p 483 679 2050 483 2050 483 bias,relu,clip +b bf16 bf16 c n n n p 483 679 2050 483 2050 483 bias,relu,clip +b f32 c n n n p 484 679 2050 484 2050 484 none +b bf16 c n n n p 484 679 2050 484 2050 484 none +b f32 bf16 c n n n p 484 679 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c n n n p 484 679 2050 484 2050 484 bias,relu,clip +b f32 c n n n p 485 679 2050 485 2050 485 none +b bf16 c n n n p 485 679 2050 485 2050 485 none +b f32 bf16 c n n n p 485 679 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 679 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 480 690 2050 480 2050 480 none +b bf16 c n n n p 480 690 2050 480 2050 480 none 
+b f32 bf16 c n n n p 480 690 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c n n n p 480 690 2050 480 2050 480 bias,relu,clip +b f32 c n n n p 481 690 2050 481 2050 481 none +b bf16 c n n n p 481 690 2050 481 2050 481 none +b f32 bf16 c n n n p 481 690 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c n n n p 481 690 2050 481 2050 481 bias,relu,clip +b f32 c n n n p 482 690 2050 482 2050 482 none +b bf16 c n n n p 482 690 2050 482 2050 482 none +b f32 bf16 c n n n p 482 690 2050 482 2050 482 bias,relu,clip +b bf16 bf16 c n n n p 482 690 2050 482 2050 482 bias,relu,clip +b f32 c n n n p 483 690 2050 483 2050 483 none +b bf16 c n n n p 483 690 2050 483 2050 483 none +b f32 bf16 c n n n p 483 690 2050 483 2050 483 bias,relu,clip +b bf16 bf16 c n n n p 483 690 2050 483 2050 483 bias,relu,clip +b f32 c n n n p 484 690 2050 484 2050 484 none +b bf16 c n n n p 484 690 2050 484 2050 484 none +b f32 bf16 c n n n p 484 690 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c n n n p 484 690 2050 484 2050 484 bias,relu,clip +b f32 c n n n p 485 690 2050 485 2050 485 none +b bf16 c n n n p 485 690 2050 485 2050 485 none +b f32 bf16 c n n n p 485 690 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c n n n p 485 690 2050 485 2050 485 bias,relu,clip +b f32 c n n n p 480 660 2048 480 2048 480 none +b bf16 c n n n p 480 660 2048 480 2048 480 none +b f32 bf16 c n n n p 480 660 2048 480 2048 480 bias,relu,clip +b bf16 bf16 c n n n p 480 660 2048 480 2048 480 bias,relu,clip +b f32 c n n n p 481 660 2048 481 2048 481 none +b bf16 c n n n p 481 660 2048 481 2048 481 none +b f32 bf16 c n n n p 481 660 2048 481 2048 481 bias,relu,clip +b bf16 bf16 c n n n p 481 660 2048 481 2048 481 bias,relu,clip +b f32 c n n n p 482 660 2048 482 2048 482 none +b bf16 c n n n p 482 660 2048 482 2048 482 none +b f32 bf16 c n n n p 482 660 2048 482 2048 482 bias,relu,clip +b bf16 bf16 c n n n p 482 660 2048 482 2048 482 bias,relu,clip +b f32 c n n n p 483 660 2048 483 2048 483 none +b bf16 c n n n p 483 660 2048 
483 2048 483 none +b f32 bf16 c n n n p 483 660 2048 483 2048 483 bias,relu,clip +b bf16 bf16 c n n n p 483 660 2048 483 2048 483 bias,relu,clip +b f32 c n n n p 484 660 2048 484 2048 484 none +b bf16 c n n n p 484 660 2048 484 2048 484 none +b f32 bf16 c n n n p 484 660 2048 484 2048 484 bias,relu,clip +b bf16 bf16 c n n n p 484 660 2048 484 2048 484 bias,relu,clip +b f32 c n n n p 485 660 2048 485 2048 485 none +b bf16 c n n n p 485 660 2048 485 2048 485 none +b f32 bf16 c n n n p 485 660 2048 485 2048 485 bias,relu,clip +b bf16 bf16 c n n n p 485 660 2048 485 2048 485 bias,relu,clip +b f32 c n n n p 480 679 2048 480 2048 480 none +b bf16 c n n n p 480 679 2048 480 2048 480 none +b f32 bf16 c n n n p 480 679 2048 480 2048 480 bias,relu,clip +b bf16 bf16 c n n n p 480 679 2048 480 2048 480 bias,relu,clip +b f32 c n n n p 481 679 2048 481 2048 481 none +b bf16 c n n n p 481 679 2048 481 2048 481 none +b f32 bf16 c n n n p 481 679 2048 481 2048 481 bias,relu,clip +b bf16 bf16 c n n n p 481 679 2048 481 2048 481 bias,relu,clip +b f32 c n n n p 482 679 2048 482 2048 482 none +b bf16 c n n n p 482 679 2048 482 2048 482 none +b f32 bf16 c n n n p 482 679 2048 482 2048 482 bias,relu,clip +b bf16 bf16 c n n n p 482 679 2048 482 2048 482 bias,relu,clip +b f32 c n n n p 483 679 2048 483 2048 483 none +b bf16 c n n n p 483 679 2048 483 2048 483 none +b f32 bf16 c n n n p 483 679 2048 483 2048 483 bias,relu,clip +b bf16 bf16 c n n n p 483 679 2048 483 2048 483 bias,relu,clip +b f32 c n n n p 484 679 2048 484 2048 484 none +b bf16 c n n n p 484 679 2048 484 2048 484 none +b f32 bf16 c n n n p 484 679 2048 484 2048 484 bias,relu,clip +b bf16 bf16 c n n n p 484 679 2048 484 2048 484 bias,relu,clip +b f32 c n n n p 485 679 2048 485 2048 485 none +b bf16 c n n n p 485 679 2048 485 2048 485 none +b f32 bf16 c n n n p 485 679 2048 485 2048 485 bias,relu,clip +b bf16 bf16 c n n n p 485 679 2048 485 2048 485 bias,relu,clip +b f32 c n n n p 480 690 2048 480 2048 480 none +b bf16 c n n 
n p 480 690 2048 480 2048 480 none +b f32 bf16 c n n n p 480 690 2048 480 2048 480 bias,relu,clip +b bf16 bf16 c n n n p 480 690 2048 480 2048 480 bias,relu,clip +b f32 c n n n p 481 690 2048 481 2048 481 none +b bf16 c n n n p 481 690 2048 481 2048 481 none +b f32 bf16 c n n n p 481 690 2048 481 2048 481 bias,relu,clip +b bf16 bf16 c n n n p 481 690 2048 481 2048 481 bias,relu,clip +b f32 c n n n p 482 690 2048 482 2048 482 none +b bf16 c n n n p 482 690 2048 482 2048 482 none +b f32 bf16 c n n n p 482 690 2048 482 2048 482 bias,relu,clip +b bf16 bf16 c n n n p 482 690 2048 482 2048 482 bias,relu,clip +b f32 c n n n p 483 690 2048 483 2048 483 none +b bf16 c n n n p 483 690 2048 483 2048 483 none +b f32 bf16 c n n n p 483 690 2048 483 2048 483 bias,relu,clip +b bf16 bf16 c n n n p 483 690 2048 483 2048 483 bias,relu,clip +b f32 c n n n p 484 690 2048 484 2048 484 none +b bf16 c n n n p 484 690 2048 484 2048 484 none +b f32 bf16 c n n n p 484 690 2048 484 2048 484 bias,relu,clip +b bf16 bf16 c n n n p 484 690 2048 484 2048 484 bias,relu,clip +b f32 c n n n p 485 690 2048 485 2048 485 none +b bf16 c n n n p 485 690 2048 485 2048 485 none +b f32 bf16 c n n n p 485 690 2048 485 2048 485 bias,relu,clip +b bf16 bf16 c n n n p 485 690 2048 485 2048 485 bias,relu,clip +b f32 c n n n p 480 656 1024 480 1024 480 none +b bf16 c n n n p 480 656 1024 480 1024 480 none +b f32 bf16 c n n n p 480 656 1024 480 1024 480 bias,relu,clip +b bf16 bf16 c n n n p 480 656 1024 480 1024 480 bias,relu,clip +b f32 c n n n p 480 128 3 480 3 480 none +b bf16 c n n n p 480 128 3 480 3 480 none +b f32 bf16 c n n n p 480 128 3 480 3 480 bias,relu,clip +b bf16 bf16 c n n n p 480 128 3 480 3 480 bias,relu,clip +b f32 c n n n p 1024 512 515 1024 515 1024 none +b bf16 c n n n p 1024 512 515 1024 515 1024 none +b f32 bf16 c n n n p 1024 512 515 1024 515 1024 bias,relu,clip +b bf16 bf16 c n n n p 1024 512 515 1024 515 1024 bias,relu,clip +b f32 c n n n p 1024 2048 1024 1024 1024 1024 none +b bf16 c n n 
n p 1024 2048 1024 1024 1024 1024 none +b f32 bf16 c n n n p 1024 2048 1024 1024 1024 1024 bias,relu,clip +b bf16 bf16 c n n n p 1024 2048 1024 1024 1024 1024 bias,relu,clip +b f32 c n n n p 1024 2048 515 1024 515 1024 none +b bf16 c n n n p 1024 2048 515 1024 515 1024 none +b f32 bf16 c n n n p 1024 2048 515 1024 515 1024 bias,relu,clip +b bf16 bf16 c n n n p 1024 2048 515 1024 515 1024 bias,relu,clip +b f32 c p n n n 1024 1040 515 1024 515 1024 none +b bf16 c p n n n 1024 1040 515 1024 515 1024 none +b f32 bf16 c p n n n 1024 1040 515 1024 515 1024 bias,relu,clip +b bf16 bf16 c p n n n 1024 1040 515 1024 515 1024 bias,relu,clip +b f32 c p n n n 5 1029 515 5 515 5 none +b bf16 c p n n n 5 1029 515 5 515 5 none +b f32 bf16 c p n n n 5 1029 515 5 515 5 bias,relu,clip +b bf16 bf16 c p n n n 5 1029 515 5 515 5 bias,relu,clip +b f32 c p n n n 1024 1029 515 1024 515 1024 none +b bf16 c p n n n 1024 1029 515 1024 515 1024 none +b f32 bf16 c p n n n 1024 1029 515 1024 515 1024 bias,relu,clip +b bf16 bf16 c p n n n 1024 1029 515 1024 515 1024 bias,relu,clip +b f32 c p n n n 1024 1040 2050 1024 2050 1024 none +b bf16 c p n n n 1024 1040 2050 1024 2050 1024 none +b f32 bf16 c p n n n 1024 1040 2050 1024 2050 1024 bias,relu,clip +b bf16 bf16 c p n n n 1024 1040 2050 1024 2050 1024 bias,relu,clip +b f32 c p n n n 1029 1029 2050 1029 2050 1029 none +b bf16 c p n n n 1029 1029 2050 1029 2050 1029 none +b f32 bf16 c p n n n 1029 1029 2050 1029 2050 1029 bias,relu,clip +b bf16 bf16 c p n n n 1029 1029 2050 1029 2050 1029 bias,relu,clip +b f32 c p n n n 485 656 2050 485 2050 485 none +b bf16 c p n n n 485 656 2050 485 2050 485 none +b f32 bf16 c p n n n 485 656 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c p n n n 485 656 2050 485 2050 485 bias,relu,clip +b f32 c p n n n 480 672 2050 480 2050 480 none +b bf16 c p n n n 480 672 2050 480 2050 480 none +b f32 bf16 c p n n n 480 672 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c p n n n 480 672 2050 480 2050 480 bias,relu,clip +b 
f32 c p n n n 481 672 2050 481 2050 481 none +b bf16 c p n n n 481 672 2050 481 2050 481 none +b f32 bf16 c p n n n 481 672 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c p n n n 481 672 2050 481 2050 481 bias,relu,clip +b f32 c p n n n 482 672 2050 482 2050 482 none +b bf16 c p n n n 482 672 2050 482 2050 482 none +b f32 bf16 c p n n n 482 672 2050 482 2050 482 bias,relu,clip +b bf16 bf16 c p n n n 482 672 2050 482 2050 482 bias,relu,clip +b f32 c p n n n 483 672 2050 483 2050 483 none +b bf16 c p n n n 483 672 2050 483 2050 483 none +b f32 bf16 c p n n n 483 672 2050 483 2050 483 bias,relu,clip +b bf16 bf16 c p n n n 483 672 2050 483 2050 483 bias,relu,clip +b f32 c p n n n 484 672 2050 484 2050 484 none +b bf16 c p n n n 484 672 2050 484 2050 484 none +b f32 bf16 c p n n n 484 672 2050 484 2050 484 bias,relu,clip +b bf16 bf16 c p n n n 484 672 2050 484 2050 484 bias,relu,clip +b f32 c p n n n 485 672 2050 485 2050 485 none +b bf16 c p n n n 485 672 2050 485 2050 485 none +b f32 bf16 c p n n n 485 672 2050 485 2050 485 bias,relu,clip +b bf16 bf16 c p n n n 485 672 2050 485 2050 485 bias,relu,clip +b f32 c p n n n 480 688 2050 480 2050 480 none +b bf16 c p n n n 480 688 2050 480 2050 480 none +b f32 bf16 c p n n n 480 688 2050 480 2050 480 bias,relu,clip +b bf16 bf16 c p n n n 480 688 2050 480 2050 480 bias,relu,clip +b f32 c p n n n 481 688 2050 481 2050 481 none +b bf16 c p n n n 481 688 2050 481 2050 481 none +b f32 bf16 c p n n n 481 688 2050 481 2050 481 bias,relu,clip +b bf16 bf16 c p n n n 481 688 2050 481 2050 481 bias,relu,clip +b f32 c p n n n 1024 32 256 1024 256 1024 none +b bf16 c p n n n 1024 32 256 1024 256 1024 none +b f32 bf16 c p n n n 1024 32 256 1024 256 1024 bias,relu,clip +b bf16 bf16 c p n n n 1024 32 256 1024 256 1024 bias,relu,clip +b f32 c P n n n 1024 64 512 1024 512 1024 none +b bf16 c P n n n 1024 64 512 1024 512 1024 none +b f32 bf16 c P n n n 1024 64 512 1024 512 1024 bias,relu,clip +b bf16 bf16 c P n n n 1024 64 512 1024 512 1024 
bias,relu,clip +b f32 c P n n n 64 800 320 64 320 64 none +b bf16 c P n n n 64 800 320 64 320 64 none +b f32 bf16 c P n n n 64 800 320 64 320 64 bias,relu,clip +b bf16 bf16 c P n n n 64 800 320 64 320 64 bias,relu,clip +b f32 c P n n n 64 768 512 64 512 64 none +b bf16 c P n n n 64 768 512 64 512 64 none +b f32 bf16 c P n n n 64 768 512 64 512 64 bias,relu,clip +b bf16 bf16 c P n n n 64 768 512 64 512 64 bias,relu,clip +b f32 c P n n n 16 256 512 16 512 16 none +b bf16 c P n n n 16 256 512 16 512 16 none +b f32 bf16 c P n n n 16 256 512 16 512 16 bias,relu,clip +b bf16 bf16 c P n n n 16 256 512 16 512 16 bias,relu,clip +b f32 c P n n n 128 128 128 128 128 128 none +b bf16 c P n n n 128 128 128 128 128 128 none +b f32 bf16 c P n n n 128 128 128 128 128 128 bias,relu,clip +b bf16 bf16 c P n n n 128 128 128 128 128 128 bias,relu,clip +b f32 c P n n n 256 512 256 256 256 256 none +b bf16 c P n n n 256 512 256 256 256 256 none +b f32 bf16 c P n n n 256 512 256 256 256 256 bias,relu,clip +b bf16 bf16 c P n n n 256 512 256 256 256 256 bias,relu,clip +b f32 c P n n n 1024 1024 1024 1024 1024 1024 none +b bf16 c P n n n 1024 1024 1024 1024 1024 1024 none +b f32 bf16 c P n n n 1024 1024 1024 1024 1024 1024 bias,relu,clip +b bf16 bf16 c P n n n 1024 1024 1024 1024 1024 1024 bias,relu,clip +b f32 c P n n n 480 640 1024 480 1024 480 none +b bf16 c P n n n 480 640 1024 480 1024 480 none +b f32 bf16 c P n n n 480 640 1024 480 1024 480 bias,relu,clip +b bf16 bf16 c P n n n 480 640 1024 480 1024 480 bias,relu,clip +b f32 c P n n n 480 640 256 480 256 480 none +b bf16 c P n n n 480 640 256 480 256 480 none +b f32 bf16 c P n n n 480 640 256 480 256 480 bias,relu,clip +b bf16 bf16 c P n n n 480 640 256 480 256 480 bias,relu,clip +b f32 c P n n n 8 64 32 8 32 8 none +b bf16 c P n n n 8 64 32 8 32 8 none +b f32 bf16 c P n n n 8 64 32 8 32 8 bias,relu,clip +b bf16 bf16 c P n n n 8 64 32 8 32 8 bias,relu,clip +b f32 c P n n n 9 64 32 9 32 9 none +b bf16 c P n n n 9 64 32 9 32 9 none +b 
f32 bf16 c P n n n 9 64 32 9 32 9 bias,relu,clip +b bf16 bf16 c P n n n 9 64 32 9 32 9 bias,relu,clip +b f32 c P n n n 10 128 64 10 64 10 none +b bf16 c P n n n 10 128 64 10 64 10 none +b f32 bf16 c P n n n 10 128 64 10 64 10 bias,relu,clip +b bf16 bf16 c P n n n 10 128 64 10 64 10 bias,relu,clip +b f32 c P n n n 8 8 8 8 8 8 none +b bf16 c P n n n 8 8 8 8 8 8 none +b f32 bf16 c P n n n 8 8 8 8 8 8 bias,relu,clip +b bf16 bf16 c P n n n 8 8 8 8 8 8 bias,relu,clip +b f32 c P n n n 12 12 12 12 12 12 none +b bf16 c P n n n 12 12 12 12 12 12 none +b f32 bf16 c P n n n 12 12 12 12 12 12 bias,relu,clip +b bf16 bf16 c P n n n 12 12 12 12 12 12 bias,relu,clip +b f32 c P n n n 25 25 25 25 25 25 none +b bf16 c P n n n 25 25 25 25 25 25 none +b f32 bf16 c P n n n 25 25 25 25 25 25 bias,relu,clip +b bf16 bf16 c P n n n 25 25 25 25 25 25 bias,relu,clip +b f32 c P n n n 25 25 20 25 20 25 none +b bf16 c P n n n 25 25 20 25 20 25 none +b f32 bf16 c P n n n 25 25 20 25 20 25 bias,relu,clip +b bf16 bf16 c P n n n 25 25 20 25 20 25 bias,relu,clip +s s16 r n n n r 480 20 2050 2050 20 20 none +s s8 r n n n r 480 20 2050 2050 20 20 none +s u8 r n n n r 480 20 2050 2050 20 20 none +s s16 u8 r n n n r 480 20 2050 2050 20 20 bias,relu,clip +s s8 u8 r n n n r 480 20 2050 2050 20 20 bias,relu,clip +s u8 u8 r n n n r 480 20 2050 2050 20 20 bias,relu,clip +s s16 r n n n r 481 20 2050 2050 20 20 none +s s8 r n n n r 481 20 2050 2050 20 20 none +s u8 r n n n r 481 20 2050 2050 20 20 none +s s16 u8 r n n n r 481 20 2050 2050 20 20 bias,relu,clip +s s8 u8 r n n n r 481 20 2050 2050 20 20 bias,relu,clip +s u8 u8 r n n n r 481 20 2050 2050 20 20 bias,relu,clip +s s16 r n n n r 482 20 2050 2050 20 20 none +s s8 r n n n r 482 20 2050 2050 20 20 none +s u8 r n n n r 482 20 2050 2050 20 20 none +s s16 u8 r n n n r 482 20 2050 2050 20 20 bias,relu,clip +s s8 u8 r n n n r 482 20 2050 2050 20 20 bias,relu,clip +s u8 u8 r n n n r 482 20 2050 2050 20 20 bias,relu,clip +s s16 r n n n p 483 20 2050 2050 20 20 
none +s s8 r n n n p 483 20 2050 2050 20 20 none +s u8 r n n n p 483 20 2050 2050 20 20 none +s s16 u8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +s s8 u8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +s u8 u8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +s s16 r n n n R 484 20 2050 2050 20 20 none +s s8 r n n n R 484 20 2050 2050 20 20 none +s u8 r n n n R 484 20 2050 2050 20 20 none +s s16 u8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +s s8 u8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +s u8 u8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +s s16 r n n n R 485 20 2050 2050 20 20 none +s s8 r n n n R 485 20 2050 2050 20 20 none +s u8 r n n n R 485 20 2050 2050 20 20 none +s s16 u8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +s s8 u8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +s u8 u8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +s s16 r n n n R 480 39 2050 2050 39 39 none +s s8 r n n n R 480 39 2050 2050 39 39 none +s u8 r n n n R 480 39 2050 2050 39 39 none +s s16 u8 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +s s8 u8 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +s u8 u8 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +s s16 r n n n R 481 39 2050 2050 39 39 none +s s8 r n n n R 481 39 2050 2050 39 39 none +s u8 r n n n R 481 39 2050 2050 39 39 none +s s16 u8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +s s8 u8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +s u8 u8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +s s16 r n n n R 482 39 2050 2050 39 39 none +s s8 r n n n R 482 39 2050 2050 39 39 none +s u8 r n n n R 482 39 2050 2050 39 39 none +s s16 u8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +s s8 u8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +s u8 u8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +s s16 r n n n R 483 39 2050 2050 39 39 none +s s8 r n n n R 483 39 2050 2050 39 39 none +s u8 r n n n R 483 39 2050 2050 39 39 none +s s16 u8 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +s s8 u8 r n n n R 
483 39 2050 2050 39 39 bias,relu,clip +s u8 u8 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +s s16 r n n n R 484 39 2050 2050 39 39 none +s s8 r n n n R 484 39 2050 2050 39 39 none +s u8 r n n n R 484 39 2050 2050 39 39 none +s s16 u8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +s s8 u8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +s u8 u8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +s s16 r n n n p 485 39 2050 2050 39 39 none +s s8 r n n n p 485 39 2050 2050 39 39 none +s u8 r n n n p 485 39 2050 2050 39 39 none +s s16 u8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +s s8 u8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +s u8 u8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +s s16 r n n n p 480 50 2050 2050 50 50 none +s s8 r n n n p 480 50 2050 2050 50 50 none +s u8 r n n n p 480 50 2050 2050 50 50 none +s s16 u8 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +s s8 u8 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +s u8 u8 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +s s16 r n n n p 481 50 2050 2050 50 50 none +s s8 r n n n p 481 50 2050 2050 50 50 none +s u8 r n n n p 481 50 2050 2050 50 50 none +s s16 u8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +s s8 u8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +s u8 u8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +s s16 r n n n p 482 50 2050 2050 50 50 none +s s8 r n n n p 482 50 2050 2050 50 50 none +s u8 r n n n p 482 50 2050 2050 50 50 none +s s16 u8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +s s8 u8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +s u8 u8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +s s16 r n n n p 483 50 2050 2050 50 50 none +s s8 r n n n p 483 50 2050 2050 50 50 none +s u8 r n n n p 483 50 2050 2050 50 50 none +s s16 u8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +s s8 u8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +s u8 u8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +s s16 r n n n p 484 50 2050 2050 50 50 none +s s8 r n n n p 484 50 2050 
2050 50 50 none +s u8 r n n n p 484 50 2050 2050 50 50 none +s s16 u8 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +s s8 u8 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +s u8 u8 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +s s16 r n n n p 485 50 2050 2050 50 50 none +s s8 r n n n p 485 50 2050 2050 50 50 none +s u8 r n n n p 485 50 2050 2050 50 50 none +s s16 u8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +s s8 u8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +s u8 u8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +s s16 r n n n R 480 1108 2050 2050 1108 1108 none +s s8 r n n n R 480 1108 2050 2050 1108 1108 none +s u8 r n n n R 480 1108 2050 2050 1108 1108 none +s s16 u8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +s s8 u8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +s u8 u8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +s s16 r n n n R 481 1108 2050 2050 1108 1108 none +s s8 r n n n R 481 1108 2050 2050 1108 1108 none +s u8 r n n n R 481 1108 2050 2050 1108 1108 none +s s16 u8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +s s8 u8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +s u8 u8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +s s16 r n n n R 482 1108 2050 2050 1108 1108 none +s s8 r n n n R 482 1108 2050 2050 1108 1108 none +s u8 r n n n R 482 1108 2050 2050 1108 1108 none +s s16 u8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +s s8 u8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +s u8 u8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +s s16 r n n n R 483 1108 2050 2050 1108 1108 none +s s8 r n n n R 483 1108 2050 2050 1108 1108 none +s u8 r n n n R 483 1108 2050 2050 1108 1108 none +s s16 u8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +s s8 u8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +s u8 u8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +s s16 r n n n R 484 1108 2050 2050 1108 1108 none +s s8 r n n n R 484 1108 2050 2050 1108 1108 
none +s u8 r n n n R 484 1108 2050 2050 1108 1108 none +s s16 u8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +s s8 u8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +s u8 u8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +s s16 r n n n R 485 1108 2050 2050 1108 1108 none +s s8 r n n n R 485 1108 2050 2050 1108 1108 none +s u8 r n n n R 485 1108 2050 2050 1108 1108 none +s s16 u8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +s s8 u8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +s u8 u8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +s s16 r n n n R 480 1127 2050 2050 1127 1127 none +s s8 r n n n R 480 1127 2050 2050 1127 1127 none +s u8 r n n n R 480 1127 2050 2050 1127 1127 none +s s16 u8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +s s8 u8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +s u8 u8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +s s16 r n n n R 481 1127 2050 2050 1127 1127 none +s s8 r n n n R 481 1127 2050 2050 1127 1127 none +s u8 r n n n R 481 1127 2050 2050 1127 1127 none +s s16 u8 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +s s8 u8 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +s u8 u8 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +s s16 r n n n R 482 1127 2050 2050 1127 1127 none +s s8 r n n n R 482 1127 2050 2050 1127 1127 none +s u8 r n n n R 482 1127 2050 2050 1127 1127 none +s s16 u8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +s s8 u8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +s u8 u8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +s s16 r n n n R 483 1127 2050 2050 1127 1127 none +s s8 r n n n R 483 1127 2050 2050 1127 1127 none +s u8 r n n n R 483 1127 2050 2050 1127 1127 none +s s16 u8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +s s8 u8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +s u8 u8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +s s16 r n n n p 484 1127 2050 2050 1127 1127 
none +s s8 r n n n p 484 1127 2050 2050 1127 1127 none +s u8 r n n n p 484 1127 2050 2050 1127 1127 none +s s16 u8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +s s8 u8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +s u8 u8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +s s16 r n n n p 485 1127 2050 2050 1127 1127 none +s s8 r n n n p 485 1127 2050 2050 1127 1127 none +s u8 r n n n p 485 1127 2050 2050 1127 1127 none +s s16 u8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +s s8 u8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +s u8 u8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +s s16 r n n n p 480 1138 2050 2050 1138 1138 none +s s8 r n n n p 480 1138 2050 2050 1138 1138 none +s u8 r n n n p 480 1138 2050 2050 1138 1138 none +s s16 u8 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +s s8 u8 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +s u8 u8 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +s s16 r n n n p 481 1138 2050 2050 1138 1138 none +s s8 r n n n p 481 1138 2050 2050 1138 1138 none +s u8 r n n n p 481 1138 2050 2050 1138 1138 none +s s16 u8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +s s8 u8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +s u8 u8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +s s16 r n n n p 482 1138 2050 2050 1138 1138 none +s s8 r n n n p 482 1138 2050 2050 1138 1138 none +s u8 r n n n p 482 1138 2050 2050 1138 1138 none +s s16 u8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +s s8 u8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +s u8 u8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +s s16 r n n n p 483 1138 2050 2050 1138 1138 none +s s8 r n n n p 483 1138 2050 2050 1138 1138 none +s u8 r n n n p 483 1138 2050 2050 1138 1138 none +s s16 u8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +s s8 u8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +s u8 u8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip 
+s s16 r n n n p 484 1138 2050 2050 1138 1138 none +s s8 r n n n p 484 1138 2050 2050 1138 1138 none +s u8 r n n n p 484 1138 2050 2050 1138 1138 none +s s16 u8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +s s8 u8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +s u8 u8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +s s16 r n n n p 485 1138 2050 2050 1138 1138 none +s s8 r n n n p 485 1138 2050 2050 1138 1138 none +s u8 r n n n p 485 1138 2050 2050 1138 1138 none +s s16 u8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +s s8 u8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +s u8 u8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +s s16 r n n n p 1 1 3 3 1 1 none +s s8 r n n n p 1 1 3 3 1 1 none +s u8 r n n n p 1 1 3 3 1 1 none +s s16 u8 r n n n p 1 1 3 3 1 1 bias,relu,clip +s s8 u8 r n n n p 1 1 3 3 1 1 bias,relu,clip +s u8 u8 r n n n p 1 1 3 3 1 1 bias,relu,clip +s s16 r n n n p 1 9 3 3 9 9 none +s s8 r n n n p 1 9 3 3 9 9 none +s u8 r n n n p 1 9 3 3 9 9 none +s s16 u8 r n n n p 1 9 3 3 9 9 bias,relu,clip +s s8 u8 r n n n p 1 9 3 3 9 9 bias,relu,clip +s u8 u8 r n n n p 1 9 3 3 9 9 bias,relu,clip +s s16 r n n n p 1 2048 3 3 2048 2048 none +s s8 r n n n p 1 2048 3 3 2048 2048 none +s u8 r n n n p 1 2048 3 3 2048 2048 none +s s16 u8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +s s8 u8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +s u8 u8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +s s16 r n n n p 1 2048 5192 5192 2048 2048 none +s s8 r n n n p 1 2048 5192 5192 2048 2048 none +s u8 r n n n p 1 2048 5192 5192 2048 2048 none +s s16 u8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +s s8 u8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +s u8 u8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +s s16 r n n n p 9 1 3 3 1 1 none +s s8 r n n n p 9 1 3 3 1 1 none +s u8 r n n n p 9 1 3 3 1 1 none +s s16 u8 r n n n p 9 1 3 3 1 1 bias,relu,clip +s s8 u8 r n n n p 9 1 3 3 1 1 bias,relu,clip +s u8 u8 r n n n p 9 1 3 
3 1 1 bias,relu,clip +s s16 r n n n p 576 1 3500 3500 1 1 none +s s8 r n n n p 576 1 3500 3500 1 1 none +s u8 r n n n p 576 1 3500 3500 1 1 none +s s16 u8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +s s8 u8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +s u8 u8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +s s16 r n n n p 1 1 1 1 1 1 none +s s8 r n n n p 1 1 1 1 1 1 none +s u8 r n n n p 1 1 1 1 1 1 none +s s16 u8 r n n n p 1 1 1 1 1 1 bias,relu,clip +s s8 u8 r n n n p 1 1 1 1 1 1 bias,relu,clip +s u8 u8 r n n n p 1 1 1 1 1 1 bias,relu,clip +s s16 r n n n p 102 1088 1024 1024 1088 1088 none +s s8 r n n n p 102 1088 1024 1024 1088 1088 none +s u8 r n n n p 102 1088 1024 1024 1088 1088 none +s s16 u8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +s s8 u8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +s u8 u8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +s s16 r n n n p 102 2048 1024 1024 2048 2048 none +s s8 r n n n p 102 2048 1024 1024 2048 2048 none +s u8 r n n n p 102 2048 1024 1024 2048 2048 none +s s16 u8 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +s s8 u8 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +s u8 u8 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +s s16 r n n n p 485 656 1024 1024 656 656 none +s s8 r n n n p 485 656 1024 1024 656 656 none +s u8 r n n n p 485 656 1024 1024 656 656 none +s s16 u8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +s s8 u8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +s u8 u8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +s s16 r n n n p 483 656 1024 1024 656 656 none +s s8 r n n n p 483 656 1024 1024 656 656 none +s u8 r n n n p 483 656 1024 1024 656 656 none +s s16 u8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +s s8 u8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +s u8 u8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +s s16 r n n n p 81 128 3 3 128 128 none +s s8 r n n n p 81 128 3 3 128 128 none +s u8 r n n n p 81 128 3 3 128 128 none +s 
s16 u8 r n n n p 81 128 3 3 128 128 bias,relu,clip +s s8 u8 r n n n p 81 128 3 3 128 128 bias,relu,clip +s u8 u8 r n n n p 81 128 3 3 128 128 bias,relu,clip +s s16 r n n n p 1022 512 515 515 512 512 none +s s8 r n n n p 1022 512 515 515 512 512 none +s u8 r n n n p 1022 512 515 515 512 512 none +s s16 u8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +s s8 u8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +s u8 u8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +s s16 r n n n p 74 512 515 515 512 512 none +s s8 r n n n p 74 512 515 515 512 512 none +s u8 r n n n p 74 512 515 515 512 512 none +s s16 u8 r n n n p 74 512 515 515 512 512 bias,relu,clip +s s8 u8 r n n n p 74 512 515 515 512 512 bias,relu,clip +s u8 u8 r n n n p 74 512 515 515 512 512 bias,relu,clip +s s16 r n n n p 253 2048 515 515 2048 2048 none +s s8 r n n n p 253 2048 515 515 2048 2048 none +s u8 r n n n p 253 2048 515 515 2048 2048 none +s s16 u8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +s s8 u8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +s u8 u8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +s s16 r n n n p 8192 1040 515 515 1040 1040 none +s s8 r n n n p 8192 1040 515 515 1040 1040 none +s u8 r n n n p 8192 1040 515 515 1040 1040 none +s s16 u8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +s s8 u8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +s u8 u8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +s s16 r n n n p 10 1029 515 515 1029 1029 none +s s8 r n n n p 10 1029 515 515 1029 1029 none +s u8 r n n n p 10 1029 515 515 1029 1029 none +s s16 u8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +s s8 u8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +s u8 u8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +s s16 r n n n p 24 1040 2050 2050 1040 1040 none +s s8 r n n n p 24 1040 2050 2050 1040 1040 none +s u8 r n n n p 24 1040 2050 2050 1040 1040 none +s s16 u8 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +s s8 u8 r n n n p 24 
1040 2050 2050 1040 1040 bias,relu,clip +s u8 u8 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +s s16 r n n n p 1024 1029 2050 2050 1029 1029 none +s s8 r n n n p 1024 1029 2050 2050 1029 1029 none +s u8 r n n n p 1024 1029 2050 2050 1029 1029 none +s s16 u8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +s s8 u8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +s u8 u8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +s s16 r n n n p 480 660 2050 2050 660 660 none +s s8 r n n n p 480 660 2050 2050 660 660 none +s u8 r n n n p 480 660 2050 2050 660 660 none +s s16 u8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +s s8 u8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +s u8 u8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +s s16 r n n n p 481 660 2050 2050 660 660 none +s s8 r n n n p 481 660 2050 2050 660 660 none +s u8 r n n n p 481 660 2050 2050 660 660 none +s s16 u8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +s s8 u8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +s u8 u8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +s s16 r n n n p 482 660 2050 2050 660 660 none +s s8 r n n n p 482 660 2050 2050 660 660 none +s u8 r n n n p 482 660 2050 2050 660 660 none +s s16 u8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +s s8 u8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +s u8 u8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +s s16 r n n n p 483 660 2050 2050 660 660 none +s s8 r n n n p 483 660 2050 2050 660 660 none +s u8 r n n n p 483 660 2050 2050 660 660 none +s s16 u8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +s s8 u8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +s u8 u8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +s s16 r n n n p 484 660 2050 2050 660 660 none +s s8 r n n n p 484 660 2050 2050 660 660 none +s u8 r n n n p 484 660 2050 2050 660 660 none +s s16 u8 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +s s8 u8 r n n n p 484 660 2050 2050 660 660 
bias,relu,clip +s u8 u8 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +s s16 r n n n p 485 660 2050 2050 660 660 none +s s8 r n n n p 485 660 2050 2050 660 660 none +s u8 r n n n p 485 660 2050 2050 660 660 none +s s16 u8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +s s8 u8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +s u8 u8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +s s16 r n n n p 480 679 2050 2050 679 679 none +s s8 r n n n p 480 679 2050 2050 679 679 none +s u8 r n n n p 480 679 2050 2050 679 679 none +s s16 u8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +s s8 u8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +s u8 u8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +s s16 r n n n p 481 679 2050 2050 679 679 none +s s8 r n n n p 481 679 2050 2050 679 679 none +s u8 r n n n p 481 679 2050 2050 679 679 none +s s16 u8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +s s8 u8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +s u8 u8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +s s16 r n n n p 482 679 2050 2050 679 679 none +s s8 r n n n p 482 679 2050 2050 679 679 none +s u8 r n n n p 482 679 2050 2050 679 679 none +s s16 u8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +s s8 u8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +s u8 u8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +s s16 r n n n p 483 679 2050 2050 679 679 none +s s8 r n n n p 483 679 2050 2050 679 679 none +s u8 r n n n p 483 679 2050 2050 679 679 none +s s16 u8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +s s8 u8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +s u8 u8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +s s16 r n n n p 484 679 2050 2050 679 679 none +s s8 r n n n p 484 679 2050 2050 679 679 none +s u8 r n n n p 484 679 2050 2050 679 679 none +s s16 u8 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +s s8 u8 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +s u8 u8 r n n n p 484 679 2050 2050 679 679 
bias,relu,clip +s s16 r n n n p 485 679 2050 2050 679 679 none +s s8 r n n n p 485 679 2050 2050 679 679 none +s u8 r n n n p 485 679 2050 2050 679 679 none +s s16 u8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +s s8 u8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +s u8 u8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +s s16 r n n n p 480 690 2050 2050 690 690 none +s s8 r n n n p 480 690 2050 2050 690 690 none +s u8 r n n n p 480 690 2050 2050 690 690 none +s s16 u8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +s s8 u8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +s u8 u8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +s s16 r n n n p 481 690 2050 2050 690 690 none +s s8 r n n n p 481 690 2050 2050 690 690 none +s u8 r n n n p 481 690 2050 2050 690 690 none +s s16 u8 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +s s8 u8 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +s u8 u8 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +s s16 r n n n p 482 690 2050 2050 690 690 none +s s8 r n n n p 482 690 2050 2050 690 690 none +s u8 r n n n p 482 690 2050 2050 690 690 none +s s16 u8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +s s8 u8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +s u8 u8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +s s16 r n n n p 483 690 2050 2050 690 690 none +s s8 r n n n p 483 690 2050 2050 690 690 none +s u8 r n n n p 483 690 2050 2050 690 690 none +s s16 u8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +s s8 u8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +s u8 u8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +s s16 r n n n p 484 690 2050 2050 690 690 none +s s8 r n n n p 484 690 2050 2050 690 690 none +s u8 r n n n p 484 690 2050 2050 690 690 none +s s16 u8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +s s8 u8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +s u8 u8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +s s16 r n n n p 485 690 2050 2050 690 690 
none +s s8 r n n n p 485 690 2050 2050 690 690 none +s u8 r n n n p 485 690 2050 2050 690 690 none +s s16 u8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +s s8 u8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +s u8 u8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +s s16 r n n n p 480 660 2048 2048 660 660 none +s s8 r n n n p 480 660 2048 2048 660 660 none +s u8 r n n n p 480 660 2048 2048 660 660 none +s s16 u8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +s s8 u8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +s u8 u8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +s s16 r n n n p 481 660 2048 2048 660 660 none +s s8 r n n n p 481 660 2048 2048 660 660 none +s u8 r n n n p 481 660 2048 2048 660 660 none +s s16 u8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +s s8 u8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +s u8 u8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +s s16 r n n n p 482 660 2048 2048 660 660 none +s s8 r n n n p 482 660 2048 2048 660 660 none +s u8 r n n n p 482 660 2048 2048 660 660 none +s s16 u8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +s s8 u8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +s u8 u8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +s s16 r n n n p 483 660 2048 2048 660 660 none +s s8 r n n n p 483 660 2048 2048 660 660 none +s u8 r n n n p 483 660 2048 2048 660 660 none +s s16 u8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +s s8 u8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +s u8 u8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +s s16 r n n n p 484 660 2048 2048 660 660 none +s s8 r n n n p 484 660 2048 2048 660 660 none +s u8 r n n n p 484 660 2048 2048 660 660 none +s s16 u8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +s s8 u8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +s u8 u8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +s s16 r n n n p 485 660 2048 2048 660 660 none +s s8 r n n n p 485 660 2048 2048 660 660 none +s u8 r n 
n n p 485 660 2048 2048 660 660 none +s s16 u8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +s s8 u8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +s u8 u8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +s s16 r n n n p 480 679 2048 2048 679 679 none +s s8 r n n n p 480 679 2048 2048 679 679 none +s u8 r n n n p 480 679 2048 2048 679 679 none +s s16 u8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +s s8 u8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +s u8 u8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +s s16 r n n n p 481 679 2048 2048 679 679 none +s s8 r n n n p 481 679 2048 2048 679 679 none +s u8 r n n n p 481 679 2048 2048 679 679 none +s s16 u8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +s s8 u8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +s u8 u8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +s s16 r n n n p 482 679 2048 2048 679 679 none +s s8 r n n n p 482 679 2048 2048 679 679 none +s u8 r n n n p 482 679 2048 2048 679 679 none +s s16 u8 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +s s8 u8 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +s u8 u8 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +s s16 r n n n p 483 679 2048 2048 679 679 none +s s8 r n n n p 483 679 2048 2048 679 679 none +s u8 r n n n p 483 679 2048 2048 679 679 none +s s16 u8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +s s8 u8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +s u8 u8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +s s16 r n n n p 484 679 2048 2048 679 679 none +s s8 r n n n p 484 679 2048 2048 679 679 none +s u8 r n n n p 484 679 2048 2048 679 679 none +s s16 u8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +s s8 u8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +s u8 u8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +s s16 r n n n p 485 679 2048 2048 679 679 none +s s8 r n n n p 485 679 2048 2048 679 679 none +s u8 r n n n p 485 679 2048 2048 679 679 none +s s16 u8 r n n n p 485 
679 2048 2048 679 679 bias,relu,clip +s s8 u8 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +s u8 u8 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +s s16 r n n n p 480 690 2048 2048 690 690 none +s s8 r n n n p 480 690 2048 2048 690 690 none +s u8 r n n n p 480 690 2048 2048 690 690 none +s s16 u8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +s s8 u8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +s u8 u8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +s s16 r n n n p 481 690 2048 2048 690 690 none +s s8 r n n n p 481 690 2048 2048 690 690 none +s u8 r n n n p 481 690 2048 2048 690 690 none +s s16 u8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +s s8 u8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +s u8 u8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +s s16 r n n n p 482 690 2048 2048 690 690 none +s s8 r n n n p 482 690 2048 2048 690 690 none +s u8 r n n n p 482 690 2048 2048 690 690 none +s s16 u8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +s s8 u8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +s u8 u8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +s s16 r n n n p 483 690 2048 2048 690 690 none +s s8 r n n n p 483 690 2048 2048 690 690 none +s u8 r n n n p 483 690 2048 2048 690 690 none +s s16 u8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +s s8 u8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +s u8 u8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +s s16 r n n n p 484 690 2048 2048 690 690 none +s s8 r n n n p 484 690 2048 2048 690 690 none +s u8 r n n n p 484 690 2048 2048 690 690 none +s s16 u8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +s s8 u8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +s u8 u8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +s s16 r n n n p 485 690 2048 2048 690 690 none +s s8 r n n n p 485 690 2048 2048 690 690 none +s u8 r n n n p 485 690 2048 2048 690 690 none +s s16 u8 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +s s8 u8 r n n n p 485 
690 2048 2048 690 690 bias,relu,clip +s u8 u8 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +s s16 r n n n p 480 656 1024 1024 656 656 none +s s8 r n n n p 480 656 1024 1024 656 656 none +s u8 r n n n p 480 656 1024 1024 656 656 none +s s16 u8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +s s8 u8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +s u8 u8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +s s16 r n n n p 480 128 3 3 128 128 none +s s8 r n n n p 480 128 3 3 128 128 none +s u8 r n n n p 480 128 3 3 128 128 none +s s16 u8 r n n n p 480 128 3 3 128 128 bias,relu,clip +s s8 u8 r n n n p 480 128 3 3 128 128 bias,relu,clip +s u8 u8 r n n n p 480 128 3 3 128 128 bias,relu,clip +s s16 r n n n p 1024 512 515 515 512 512 none +s s8 r n n n p 1024 512 515 515 512 512 none +s u8 r n n n p 1024 512 515 515 512 512 none +s s16 u8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +s s8 u8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +s u8 u8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +s s16 r n n n p 1024 2048 1024 1024 2048 2048 none +s s8 r n n n p 1024 2048 1024 1024 2048 2048 none +s u8 r n n n p 1024 2048 1024 1024 2048 2048 none +s s16 u8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +s s8 u8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +s u8 u8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +s s16 r n n n p 1024 2048 515 515 2048 2048 none +s s8 r n n n p 1024 2048 515 515 2048 2048 none +s u8 r n n n p 1024 2048 515 515 2048 2048 none +s s16 u8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +s s8 u8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +s u8 u8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +s s16 r n n n p 1024 1040 515 515 1040 1040 none +s s8 r n n n p 1024 1040 515 515 1040 1040 none +s u8 r n n n p 1024 1040 515 515 1040 1040 none +s s16 u8 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +s s8 u8 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +s u8 u8 r n n n p 
1024 1040 515 515 1040 1040 bias,relu,clip +s s16 r n n n p 5 1029 515 515 1029 1029 none +s s8 r n n n p 5 1029 515 515 1029 1029 none +s u8 r n n n p 5 1029 515 515 1029 1029 none +s s16 u8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +s s8 u8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +s u8 u8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +s s16 r n n n p 1024 1029 515 515 1029 1029 none +s s8 r n n n p 1024 1029 515 515 1029 1029 none +s u8 r n n n p 1024 1029 515 515 1029 1029 none +s s16 u8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +s s8 u8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +s u8 u8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +s s16 r n n n p 1024 1040 2050 2050 1040 1040 none +s s8 r n n n p 1024 1040 2050 2050 1040 1040 none +s u8 r n n n p 1024 1040 2050 2050 1040 1040 none +s s16 u8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +s s8 u8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +s u8 u8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +s s16 r n n n p 1029 1029 2050 2050 1029 1029 none +s s8 r n n n p 1029 1029 2050 2050 1029 1029 none +s u8 r n n n p 1029 1029 2050 2050 1029 1029 none +s s16 u8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +s s8 u8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +s u8 u8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +s s16 r n n n R 480 646 2050 2050 646 646 none +s s8 r n n n R 480 646 2050 2050 646 646 none +s u8 r n n n R 480 646 2050 2050 646 646 none +s s16 u8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +s s8 u8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +s u8 u8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +s s16 r n n n R 481 646 2050 2050 646 646 none +s s8 r n n n R 481 646 2050 2050 646 646 none +s u8 r n n n R 481 646 2050 2050 646 646 none +s s16 u8 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +s s8 u8 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +s u8 u8 r n n n R 481 
646 2050 2050 646 646 bias,relu,clip +s s16 r n n n R 482 646 2050 2050 646 646 none +s s8 r n n n R 482 646 2050 2050 646 646 none +s u8 r n n n R 482 646 2050 2050 646 646 none +s s16 u8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +s s8 u8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +s u8 u8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +s s16 r n n n R 483 646 2050 2050 646 646 none +s s8 r n n n R 483 646 2050 2050 646 646 none +s u8 r n n n R 483 646 2050 2050 646 646 none +s s16 u8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +s s8 u8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +s u8 u8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +s s16 r n n n R 484 646 2050 2050 646 646 none +s s8 r n n n R 484 646 2050 2050 646 646 none +s u8 r n n n R 484 646 2050 2050 646 646 none +s s16 u8 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +s s8 u8 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +s u8 u8 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +s s16 r n n n R 485 646 2050 2050 646 646 none +s s8 r n n n R 485 646 2050 2050 646 646 none +s u8 r n n n R 485 646 2050 2050 646 646 none +s s16 u8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +s s8 u8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +s u8 u8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +s s16 r n n n R 481 656 2050 2050 656 656 none +s s8 r n n n R 481 656 2050 2050 656 656 none +s u8 r n n n R 481 656 2050 2050 656 656 none +s s16 u8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +s s8 u8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +s u8 u8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +s s16 r n n n R 482 656 2050 2050 656 656 none +s s8 r n n n R 482 656 2050 2050 656 656 none +s u8 r n n n R 482 656 2050 2050 656 656 none +s s16 u8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +s s8 u8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +s u8 u8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +s s16 r n n n R 483 656 
2050 2050 656 656 none +s s8 r n n n R 483 656 2050 2050 656 656 none +s u8 r n n n R 483 656 2050 2050 656 656 none +s s16 u8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +s s8 u8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +s u8 u8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +s s16 r n n n R 484 656 2050 2050 656 656 none +s s8 r n n n R 484 656 2050 2050 656 656 none +s u8 r n n n R 484 656 2050 2050 656 656 none +s s16 u8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +s s8 u8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +s u8 u8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +s s16 r n n n p 485 656 2050 2050 656 656 none +s s8 r n n n p 485 656 2050 2050 656 656 none +s u8 r n n n p 485 656 2050 2050 656 656 none +s s16 u8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +s s8 u8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +s u8 u8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +s s16 r n n n p 480 672 2050 2050 672 672 none +s s8 r n n n p 480 672 2050 2050 672 672 none +s u8 r n n n p 480 672 2050 2050 672 672 none +s s16 u8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +s s8 u8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +s u8 u8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +s s16 r n n n p 481 672 2050 2050 672 672 none +s s8 r n n n p 481 672 2050 2050 672 672 none +s u8 r n n n p 481 672 2050 2050 672 672 none +s s16 u8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +s s8 u8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +s u8 u8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +s s16 r n n n p 482 672 2050 2050 672 672 none +s s8 r n n n p 482 672 2050 2050 672 672 none +s u8 r n n n p 482 672 2050 2050 672 672 none +s s16 u8 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +s s8 u8 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +s u8 u8 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +s s16 r n n n p 483 672 2050 2050 672 672 none +s s8 r n n n p 483 672 2050 2050 672 
672 none +s u8 r n n n p 483 672 2050 2050 672 672 none +s s16 u8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +s s8 u8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +s u8 u8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +s s16 r n n n p 484 672 2050 2050 672 672 none +s s8 r n n n p 484 672 2050 2050 672 672 none +s u8 r n n n p 484 672 2050 2050 672 672 none +s s16 u8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +s s8 u8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +s u8 u8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +s s16 r n n n p 485 672 2050 2050 672 672 none +s s8 r n n n p 485 672 2050 2050 672 672 none +s u8 r n n n p 485 672 2050 2050 672 672 none +s s16 u8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +s s8 u8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +s u8 u8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +s s16 r n n n p 480 688 2050 2050 688 688 none +s s8 r n n n p 480 688 2050 2050 688 688 none +s u8 r n n n p 480 688 2050 2050 688 688 none +s s16 u8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +s s8 u8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +s u8 u8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +s s16 r n n n p 481 688 2050 2050 688 688 none +s s8 r n n n p 481 688 2050 2050 688 688 none +s u8 r n n n p 481 688 2050 2050 688 688 none +s s16 u8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +s s8 u8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +s u8 u8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +s s16 r n n n r 482 688 2050 2050 688 688 none +s s8 r n n n r 482 688 2050 2050 688 688 none +s u8 r n n n r 482 688 2050 2050 688 688 none +s s16 u8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +s s8 u8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +s u8 u8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +s s16 r n n n r 483 688 2050 2050 688 688 none +s s8 r n n n r 483 688 2050 2050 688 688 none +s u8 r n n n r 483 688 2050 2050 688 688 none +s 
s16 u8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +s s8 u8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +s u8 u8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +s s16 r n n n r 484 688 2050 2050 688 688 none +s s8 r n n n r 484 688 2050 2050 688 688 none +s u8 r n n n r 484 688 2050 2050 688 688 none +s s16 u8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +s s8 u8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +s u8 u8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +s s16 r n n n r 485 688 2050 2050 688 688 none +s s8 r n n n r 485 688 2050 2050 688 688 none +s u8 r n n n r 485 688 2050 2050 688 688 none +s s16 u8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +s s8 u8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +s u8 u8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +s s16 r n n n r 1024 512 64 64 512 512 none +s s8 r n n n r 1024 512 64 64 512 512 none +s u8 r n n n r 1024 512 64 64 512 512 none +s s16 u8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +s s8 u8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +s u8 u8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +s s16 r n n n r 16 256 512 512 256 256 none +s s8 r n n n r 16 256 512 512 256 256 none +s u8 r n n n r 16 256 512 512 256 256 none +s s16 u8 r n n n r 16 256 512 512 256 256 bias,relu,clip +s s8 u8 r n n n r 16 256 512 512 256 256 bias,relu,clip +s u8 u8 r n n n r 16 256 512 512 256 256 bias,relu,clip +s s16 r n n n r 480 640 512 512 640 640 none +s s8 r n n n r 480 640 512 512 640 640 none +s u8 r n n n r 480 640 512 512 640 640 none +s s16 u8 r n n n r 480 640 512 512 640 640 bias,relu,clip +s s8 u8 r n n n r 480 640 512 512 640 640 bias,relu,clip +s u8 u8 r n n n r 480 640 512 512 640 640 bias,relu,clip +s s16 r n n n r 64 768 512 512 768 768 none +s s8 r n n n r 64 768 512 512 768 768 none +s u8 r n n n r 64 768 512 512 768 768 none +s s16 u8 r n n n r 64 768 512 512 768 768 bias,relu,clip +s s8 u8 r n n n r 64 768 512 512 768 768 bias,relu,clip +s u8 
u8 r n n n r 64 768 512 512 768 768 bias,relu,clip +s s16 r n n n r 128 128 128 128 128 128 none +s s8 r n n n r 128 128 128 128 128 128 none +s u8 r n n n r 128 128 128 128 128 128 none +s s16 u8 r n n n r 128 128 128 128 128 128 bias,relu,clip +s s8 u8 r n n n r 128 128 128 128 128 128 bias,relu,clip +s u8 u8 r n n n r 128 128 128 128 128 128 bias,relu,clip +s s16 r n n n r 1024 64 512 512 64 64 none +s s8 r n n n r 1024 64 512 512 64 64 none +s u8 r n n n r 1024 64 512 512 64 64 none +s s16 u8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +s s8 u8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +s u8 u8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +s s16 r n n n r 1024 256 32 32 256 256 none +s s8 r n n n r 1024 256 32 32 256 256 none +s u8 r n n n r 1024 256 32 32 256 256 none +s s16 u8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +s s8 u8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +s u8 u8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +s s16 r n n n r 1024 512 64 64 512 512 none +s s8 r n n n r 1024 512 64 64 512 512 none +s u8 r n n n r 1024 512 64 64 512 512 none +s s16 u8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +s s8 u8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +s u8 u8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +s s16 r n n n r 480 640 512 512 640 640 none +s s8 r n n n r 480 640 512 512 640 640 none +s u8 r n n n r 480 640 512 512 640 640 none +s s16 u8 r n n n r 480 640 512 512 640 640 bias,relu,clip +s s8 u8 r n n n r 480 640 512 512 640 640 bias,relu,clip +s u8 u8 r n n n r 480 640 512 512 640 640 bias,relu,clip +s s16 r n n n p 1024 32 256 256 32 32 none +s s8 r n n n p 1024 32 256 256 32 32 none +s u8 r n n n p 1024 32 256 256 32 32 none +s s16 u8 r n n n p 1024 32 256 256 32 32 bias,relu,clip +s s8 u8 r n n n p 1024 32 256 256 32 32 bias,relu,clip +s u8 u8 r n n n p 1024 32 256 256 32 32 bias,relu,clip +s s16 r n n n P 1024 64 512 512 64 64 none +s s8 r n n n P 1024 64 512 512 64 64 none +s u8 r n n n P 1024 64 512 512 
64 64 none +s s16 u8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +s s8 u8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +s u8 u8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +s s16 r n n n P 64 800 320 320 800 800 none +s s8 r n n n P 64 800 320 320 800 800 none +s u8 r n n n P 64 800 320 320 800 800 none +s s16 u8 r n n n P 64 800 320 320 800 800 bias,relu,clip +s s8 u8 r n n n P 64 800 320 320 800 800 bias,relu,clip +s u8 u8 r n n n P 64 800 320 320 800 800 bias,relu,clip +s s16 r n n n P 64 768 512 512 768 768 none +s s8 r n n n P 64 768 512 512 768 768 none +s u8 r n n n P 64 768 512 512 768 768 none +s s16 u8 r n n n P 64 768 512 512 768 768 bias,relu,clip +s s8 u8 r n n n P 64 768 512 512 768 768 bias,relu,clip +s u8 u8 r n n n P 64 768 512 512 768 768 bias,relu,clip +s s16 r n n n P 16 256 512 512 256 256 none +s s8 r n n n P 16 256 512 512 256 256 none +s u8 r n n n P 16 256 512 512 256 256 none +s s16 u8 r n n n P 16 256 512 512 256 256 bias,relu,clip +s s8 u8 r n n n P 16 256 512 512 256 256 bias,relu,clip +s u8 u8 r n n n P 16 256 512 512 256 256 bias,relu,clip +s s16 r n n n P 128 128 128 128 128 128 none +s s8 r n n n P 128 128 128 128 128 128 none +s u8 r n n n P 128 128 128 128 128 128 none +s s16 u8 r n n n P 128 128 128 128 128 128 bias,relu,clip +s s8 u8 r n n n P 128 128 128 128 128 128 bias,relu,clip +s u8 u8 r n n n P 128 128 128 128 128 128 bias,relu,clip +s s16 r n n n P 256 512 256 256 512 512 none +s s8 r n n n P 256 512 256 256 512 512 none +s u8 r n n n P 256 512 256 256 512 512 none +s s16 u8 r n n n P 256 512 256 256 512 512 bias,relu,clip +s s8 u8 r n n n P 256 512 256 256 512 512 bias,relu,clip +s u8 u8 r n n n P 256 512 256 256 512 512 bias,relu,clip +s s16 r n n n P 1024 1024 1024 1024 1024 1024 none +s s8 r n n n P 1024 1024 1024 1024 1024 1024 none +s u8 r n n n P 1024 1024 1024 1024 1024 1024 none +s s16 u8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +s s8 u8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip 
+s u8 u8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +s s16 r n n n P 480 640 1024 1024 640 640 none +s s8 r n n n P 480 640 1024 1024 640 640 none +s u8 r n n n P 480 640 1024 1024 640 640 none +s s16 u8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +s s8 u8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +s u8 u8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +s s16 r n n n P 480 640 256 256 640 640 none +s s8 r n n n P 480 640 256 256 640 640 none +s u8 r n n n P 480 640 256 256 640 640 none +s s16 u8 r n n n P 480 640 256 256 640 640 bias,relu,clip +s s8 u8 r n n n P 480 640 256 256 640 640 bias,relu,clip +s u8 u8 r n n n P 480 640 256 256 640 640 bias,relu,clip +s s16 r n n n P 8 64 32 32 64 64 none +s s8 r n n n P 8 64 32 32 64 64 none +s u8 r n n n P 8 64 32 32 64 64 none +s s16 u8 r n n n P 8 64 32 32 64 64 bias,relu,clip +s s8 u8 r n n n P 8 64 32 32 64 64 bias,relu,clip +s u8 u8 r n n n P 8 64 32 32 64 64 bias,relu,clip +s s16 r n n n P 9 64 32 32 64 64 none +s s8 r n n n P 9 64 32 32 64 64 none +s u8 r n n n P 9 64 32 32 64 64 none +s s16 u8 r n n n P 9 64 32 32 64 64 bias,relu,clip +s s8 u8 r n n n P 9 64 32 32 64 64 bias,relu,clip +s u8 u8 r n n n P 9 64 32 32 64 64 bias,relu,clip +s s16 r n n n P 10 128 64 64 128 128 none +s s8 r n n n P 10 128 64 64 128 128 none +s u8 r n n n P 10 128 64 64 128 128 none +s s16 u8 r n n n P 10 128 64 64 128 128 bias,relu,clip +s s8 u8 r n n n P 10 128 64 64 128 128 bias,relu,clip +s u8 u8 r n n n P 10 128 64 64 128 128 bias,relu,clip +s s16 r n n n P 8 8 8 8 8 8 none +s s8 r n n n P 8 8 8 8 8 8 none +s u8 r n n n P 8 8 8 8 8 8 none +s s16 u8 r n n n P 8 8 8 8 8 8 bias,relu,clip +s s8 u8 r n n n P 8 8 8 8 8 8 bias,relu,clip +s u8 u8 r n n n P 8 8 8 8 8 8 bias,relu,clip +s s16 r n n n P 12 12 12 12 12 12 none +s s8 r n n n P 12 12 12 12 12 12 none +s u8 r n n n P 12 12 12 12 12 12 none +s s16 u8 r n n n P 12 12 12 12 12 12 bias,relu,clip +s s8 u8 r n n n P 12 12 12 12 12 12 bias,relu,clip +s u8 u8 
r n n n P 12 12 12 12 12 12 bias,relu,clip +s s16 r n n n P 25 25 25 25 25 25 none +s s8 r n n n P 25 25 25 25 25 25 none +s u8 r n n n P 25 25 25 25 25 25 none +s s16 u8 r n n n P 25 25 25 25 25 25 bias,relu,clip +s s8 u8 r n n n P 25 25 25 25 25 25 bias,relu,clip +s u8 u8 r n n n P 25 25 25 25 25 25 bias,relu,clip +s s16 r n n n P 25 25 20 20 25 25 none +s s8 r n n n P 25 25 20 20 25 25 none +s u8 r n n n P 25 25 20 20 25 25 none +s s16 u8 r n n n P 25 25 20 20 25 25 bias,relu,clip +s s8 u8 r n n n P 25 25 20 20 25 25 bias,relu,clip +s u8 u8 r n n n P 25 25 20 20 25 25 bias,relu,clip +i s32 r n n n p 480 20 2050 2050 20 20 none +i s8 r n n n p 480 20 2050 2050 20 20 none +i s32 s8 r n n n p 480 20 2050 2050 20 20 bias,relu,clip +i s8 s8 r n n n p 480 20 2050 2050 20 20 bias,relu,clip +i s32 r n n n p 481 20 2050 2050 20 20 none +i s8 r n n n p 481 20 2050 2050 20 20 none +i s32 s8 r n n n p 481 20 2050 2050 20 20 bias,relu,clip +i s8 s8 r n n n p 481 20 2050 2050 20 20 bias,relu,clip +i s32 r n n n p 482 20 2050 2050 20 20 none +i s8 r n n n p 482 20 2050 2050 20 20 none +i s32 s8 r n n n p 482 20 2050 2050 20 20 bias,relu,clip +i s8 s8 r n n n p 482 20 2050 2050 20 20 bias,relu,clip +i s32 r n n n p 483 20 2050 2050 20 20 none +i s8 r n n n p 483 20 2050 2050 20 20 none +i s32 s8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +i s8 s8 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +i s32 r n n n R 484 20 2050 2050 20 20 none +i s8 r n n n R 484 20 2050 2050 20 20 none +i s32 s8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +i s8 s8 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +i s32 r n n n R 485 20 2050 2050 20 20 none +i s8 r n n n R 485 20 2050 2050 20 20 none +i s32 s8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +i s8 s8 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +i s32 r n n n R 480 39 2050 2050 39 39 none +i s8 r n n n R 480 39 2050 2050 39 39 none +i s32 s8 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +i s8 s8 r n n n R 480 39 2050 2050 
39 39 bias,relu,clip +i s32 r n n n R 481 39 2050 2050 39 39 none +i s8 r n n n R 481 39 2050 2050 39 39 none +i s32 s8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +i s8 s8 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +i s32 r n n n R 482 39 2050 2050 39 39 none +i s8 r n n n R 482 39 2050 2050 39 39 none +i s32 s8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +i s8 s8 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +i s32 r n n n R 483 39 2050 2050 39 39 none +i s8 r n n n R 483 39 2050 2050 39 39 none +i s32 s8 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +i s8 s8 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +i s32 r n n n R 484 39 2050 2050 39 39 none +i s8 r n n n R 484 39 2050 2050 39 39 none +i s32 s8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +i s8 s8 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +i s32 r n n n p 485 39 2050 2050 39 39 none +i s8 r n n n p 485 39 2050 2050 39 39 none +i s32 s8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +i s8 s8 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +i s32 r n n n p 480 50 2050 2050 50 50 none +i s8 r n n n p 480 50 2050 2050 50 50 none +i s32 s8 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +i s8 s8 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +i s32 r n n n p 481 50 2050 2050 50 50 none +i s8 r n n n p 481 50 2050 2050 50 50 none +i s32 s8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +i s8 s8 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +i s32 r n n n p 482 50 2050 2050 50 50 none +i s8 r n n n p 482 50 2050 2050 50 50 none +i s32 s8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +i s8 s8 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +i s32 r n n n p 483 50 2050 2050 50 50 none +i s8 r n n n p 483 50 2050 2050 50 50 none +i s32 s8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +i s8 s8 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +i s32 r n n n p 484 50 2050 2050 50 50 none +i s8 r n n n p 484 50 2050 2050 50 50 none +i s32 s8 r n n n p 484 50 2050 2050 50 50 
bias,relu,clip +i s8 s8 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +i s32 r n n n p 485 50 2050 2050 50 50 none +i s8 r n n n p 485 50 2050 2050 50 50 none +i s32 s8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +i s8 s8 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +i s32 r n n n R 480 1108 2050 2050 1108 1108 none +i s8 r n n n R 480 1108 2050 2050 1108 1108 none +i s32 s8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +i s8 s8 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +i s32 r n n n R 481 1108 2050 2050 1108 1108 none +i s8 r n n n R 481 1108 2050 2050 1108 1108 none +i s32 s8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +i s8 s8 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +i s32 r n n n R 482 1108 2050 2050 1108 1108 none +i s8 r n n n R 482 1108 2050 2050 1108 1108 none +i s32 s8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +i s8 s8 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +i s32 r n n n R 483 1108 2050 2050 1108 1108 none +i s8 r n n n R 483 1108 2050 2050 1108 1108 none +i s32 s8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +i s8 s8 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +i s32 r n n n R 484 1108 2050 2050 1108 1108 none +i s8 r n n n R 484 1108 2050 2050 1108 1108 none +i s32 s8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +i s8 s8 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +i s32 r n n n R 485 1108 2050 2050 1108 1108 none +i s8 r n n n R 485 1108 2050 2050 1108 1108 none +i s32 s8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +i s8 s8 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +i s32 r n n n R 480 1127 2050 2050 1127 1127 none +i s8 r n n n R 480 1127 2050 2050 1127 1127 none +i s32 s8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +i s8 s8 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +i s32 r n n n R 481 1127 2050 2050 1127 1127 none +i s8 r n n n R 481 1127 2050 2050 1127 1127 none +i s32 s8 r n n n R 481 
1127 2050 2050 1127 1127 bias,relu,clip +i s8 s8 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +i s32 r n n n R 482 1127 2050 2050 1127 1127 none +i s8 r n n n R 482 1127 2050 2050 1127 1127 none +i s32 s8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +i s8 s8 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +i s32 r n n n R 483 1127 2050 2050 1127 1127 none +i s8 r n n n R 483 1127 2050 2050 1127 1127 none +i s32 s8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +i s8 s8 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +i s32 r n n n p 484 1127 2050 2050 1127 1127 none +i s8 r n n n p 484 1127 2050 2050 1127 1127 none +i s32 s8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +i s8 s8 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +i s32 r n n n p 485 1127 2050 2050 1127 1127 none +i s8 r n n n p 485 1127 2050 2050 1127 1127 none +i s32 s8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +i s8 s8 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +i s32 r n n n p 480 1138 2050 2050 1138 1138 none +i s8 r n n n p 480 1138 2050 2050 1138 1138 none +i s32 s8 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +i s8 s8 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +i s32 r n n n p 481 1138 2050 2050 1138 1138 none +i s8 r n n n p 481 1138 2050 2050 1138 1138 none +i s32 s8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +i s8 s8 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +i s32 r n n n p 482 1138 2050 2050 1138 1138 none +i s8 r n n n p 482 1138 2050 2050 1138 1138 none +i s32 s8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +i s8 s8 r n n n p 482 1138 2050 2050 1138 1138 bias,relu,clip +i s32 r n n n p 483 1138 2050 2050 1138 1138 none +i s8 r n n n p 483 1138 2050 2050 1138 1138 none +i s32 s8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +i s8 s8 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +i s32 r n n n p 484 1138 2050 2050 1138 1138 none +i s8 r n n n p 484 
1138 2050 2050 1138 1138 none +i s32 s8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +i s8 s8 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +i s32 r n n n p 485 1138 2050 2050 1138 1138 none +i s8 r n n n p 485 1138 2050 2050 1138 1138 none +i s32 s8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +i s8 s8 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +i s32 r n n n p 1 1 3 3 1 1 none +i s8 r n n n p 1 1 3 3 1 1 none +i s32 s8 r n n n p 1 1 3 3 1 1 bias,relu,clip +i s8 s8 r n n n p 1 1 3 3 1 1 bias,relu,clip +i s32 r n n n p 1 9 3 3 9 9 none +i s8 r n n n p 1 9 3 3 9 9 none +i s32 s8 r n n n p 1 9 3 3 9 9 bias,relu,clip +i s8 s8 r n n n p 1 9 3 3 9 9 bias,relu,clip +i s32 r n n n p 1 2048 3 3 2048 2048 none +i s8 r n n n p 1 2048 3 3 2048 2048 none +i s32 s8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +i s8 s8 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +i s32 r n n n p 1 2048 5192 5192 2048 2048 none +i s8 r n n n p 1 2048 5192 5192 2048 2048 none +i s32 s8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +i s8 s8 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +i s32 r n n n p 9 1 3 3 1 1 none +i s8 r n n n p 9 1 3 3 1 1 none +i s32 s8 r n n n p 9 1 3 3 1 1 bias,relu,clip +i s8 s8 r n n n p 9 1 3 3 1 1 bias,relu,clip +i s32 r n n n p 576 1 3500 3500 1 1 none +i s8 r n n n p 576 1 3500 3500 1 1 none +i s32 s8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +i s8 s8 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +i s32 r n n n p 1 1 1 1 1 1 none +i s8 r n n n p 1 1 1 1 1 1 none +i s32 s8 r n n n p 1 1 1 1 1 1 bias,relu,clip +i s8 s8 r n n n p 1 1 1 1 1 1 bias,relu,clip +i s32 r n n n p 102 1088 1024 1024 1088 1088 none +i s8 r n n n p 102 1088 1024 1024 1088 1088 none +i s32 s8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +i s8 s8 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +i s32 r n n n p 102 2048 1024 1024 2048 2048 none +i s8 r n n n p 102 2048 1024 1024 2048 2048 none +i s32 s8 r n n n p 102 2048 1024 
1024 2048 2048 bias,relu,clip +i s8 s8 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +i s32 r n n n p 485 656 1024 1024 656 656 none +i s8 r n n n p 485 656 1024 1024 656 656 none +i s32 s8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +i s8 s8 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +i s32 r n n n p 483 656 1024 1024 656 656 none +i s8 r n n n p 483 656 1024 1024 656 656 none +i s32 s8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +i s8 s8 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +i s32 r n n n p 81 128 3 3 128 128 none +i s8 r n n n p 81 128 3 3 128 128 none +i s32 s8 r n n n p 81 128 3 3 128 128 bias,relu,clip +i s8 s8 r n n n p 81 128 3 3 128 128 bias,relu,clip +i s32 r n n n p 1022 512 515 515 512 512 none +i s8 r n n n p 1022 512 515 515 512 512 none +i s32 s8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +i s8 s8 r n n n p 1022 512 515 515 512 512 bias,relu,clip +i s32 r n n n p 74 512 515 515 512 512 none +i s8 r n n n p 74 512 515 515 512 512 none +i s32 s8 r n n n p 74 512 515 515 512 512 bias,relu,clip +i s8 s8 r n n n p 74 512 515 515 512 512 bias,relu,clip +i s32 r n n n p 253 2048 515 515 2048 2048 none +i s8 r n n n p 253 2048 515 515 2048 2048 none +i s32 s8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +i s8 s8 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +i s32 r n n n p 8192 1040 515 515 1040 1040 none +i s8 r n n n p 8192 1040 515 515 1040 1040 none +i s32 s8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +i s8 s8 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +i s32 r n n n p 10 1029 515 515 1029 1029 none +i s8 r n n n p 10 1029 515 515 1029 1029 none +i s32 s8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +i s8 s8 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +i s32 r n n n p 24 1040 2050 2050 1040 1040 none +i s8 r n n n p 24 1040 2050 2050 1040 1040 none +i s32 s8 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +i s8 s8 r n n n p 24 1040 2050 2050 1040 1040 
bias,relu,clip +i s32 r n n n p 1024 1029 2050 2050 1029 1029 none +i s8 r n n n p 1024 1029 2050 2050 1029 1029 none +i s32 s8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +i s8 s8 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +i s32 r n n n p 480 660 2050 2050 660 660 none +i s8 r n n n p 480 660 2050 2050 660 660 none +i s32 s8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +i s8 s8 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +i s32 r n n n p 481 660 2050 2050 660 660 none +i s8 r n n n p 481 660 2050 2050 660 660 none +i s32 s8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +i s8 s8 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +i s32 r n n n p 482 660 2050 2050 660 660 none +i s8 r n n n p 482 660 2050 2050 660 660 none +i s32 s8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +i s8 s8 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +i s32 r n n n p 483 660 2050 2050 660 660 none +i s8 r n n n p 483 660 2050 2050 660 660 none +i s32 s8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +i s8 s8 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +i s32 r n n n p 484 660 2050 2050 660 660 none +i s8 r n n n p 484 660 2050 2050 660 660 none +i s32 s8 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +i s8 s8 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +i s32 r n n n p 485 660 2050 2050 660 660 none +i s8 r n n n p 485 660 2050 2050 660 660 none +i s32 s8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +i s8 s8 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +i s32 r n n n p 480 679 2050 2050 679 679 none +i s8 r n n n p 480 679 2050 2050 679 679 none +i s32 s8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +i s8 s8 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +i s32 r n n n p 481 679 2050 2050 679 679 none +i s8 r n n n p 481 679 2050 2050 679 679 none +i s32 s8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +i s8 s8 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +i s32 r n n n p 482 679 
2050 2050 679 679 none +i s8 r n n n p 482 679 2050 2050 679 679 none +i s32 s8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +i s8 s8 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +i s32 r n n n p 483 679 2050 2050 679 679 none +i s8 r n n n p 483 679 2050 2050 679 679 none +i s32 s8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +i s8 s8 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +i s32 r n n n p 484 679 2050 2050 679 679 none +i s8 r n n n p 484 679 2050 2050 679 679 none +i s32 s8 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +i s8 s8 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +i s32 r n n n p 485 679 2050 2050 679 679 none +i s8 r n n n p 485 679 2050 2050 679 679 none +i s32 s8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +i s8 s8 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +i s32 r n n n p 480 690 2050 2050 690 690 none +i s8 r n n n p 480 690 2050 2050 690 690 none +i s32 s8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +i s8 s8 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +i s32 r n n n p 481 690 2050 2050 690 690 none +i s8 r n n n p 481 690 2050 2050 690 690 none +i s32 s8 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +i s8 s8 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +i s32 r n n n p 482 690 2050 2050 690 690 none +i s8 r n n n p 482 690 2050 2050 690 690 none +i s32 s8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +i s8 s8 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +i s32 r n n n p 483 690 2050 2050 690 690 none +i s8 r n n n p 483 690 2050 2050 690 690 none +i s32 s8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +i s8 s8 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +i s32 r n n n p 484 690 2050 2050 690 690 none +i s8 r n n n p 484 690 2050 2050 690 690 none +i s32 s8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +i s8 s8 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +i s32 r n n n p 485 690 2050 2050 690 690 none +i s8 r n n n p 485 690 2050 
2050 690 690 none +i s32 s8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +i s8 s8 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +i s32 r n n n p 480 660 2048 2048 660 660 none +i s8 r n n n p 480 660 2048 2048 660 660 none +i s32 s8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +i s8 s8 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +i s32 r n n n p 481 660 2048 2048 660 660 none +i s8 r n n n p 481 660 2048 2048 660 660 none +i s32 s8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +i s8 s8 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +i s32 r n n n p 482 660 2048 2048 660 660 none +i s8 r n n n p 482 660 2048 2048 660 660 none +i s32 s8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +i s8 s8 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +i s32 r n n n p 483 660 2048 2048 660 660 none +i s8 r n n n p 483 660 2048 2048 660 660 none +i s32 s8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +i s8 s8 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +i s32 r n n n p 484 660 2048 2048 660 660 none +i s8 r n n n p 484 660 2048 2048 660 660 none +i s32 s8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +i s8 s8 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +i s32 r n n n p 485 660 2048 2048 660 660 none +i s8 r n n n p 485 660 2048 2048 660 660 none +i s32 s8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +i s8 s8 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +i s32 r n n n p 480 679 2048 2048 679 679 none +i s8 r n n n p 480 679 2048 2048 679 679 none +i s32 s8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +i s8 s8 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +i s32 r n n n p 481 679 2048 2048 679 679 none +i s8 r n n n p 481 679 2048 2048 679 679 none +i s32 s8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +i s8 s8 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +i s32 r n n n p 482 679 2048 2048 679 679 none +i s8 r n n n p 482 679 2048 2048 679 679 none +i s32 s8 r n n n p 482 679 2048 2048 
679 679 bias,relu,clip +i s8 s8 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +i s32 r n n n p 483 679 2048 2048 679 679 none +i s8 r n n n p 483 679 2048 2048 679 679 none +i s32 s8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +i s8 s8 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +i s32 r n n n p 484 679 2048 2048 679 679 none +i s8 r n n n p 484 679 2048 2048 679 679 none +i s32 s8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +i s8 s8 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +i s32 r n n n p 485 679 2048 2048 679 679 none +i s8 r n n n p 485 679 2048 2048 679 679 none +i s32 s8 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +i s8 s8 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +i s32 r n n n p 480 690 2048 2048 690 690 none +i s8 r n n n p 480 690 2048 2048 690 690 none +i s32 s8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +i s8 s8 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +i s32 r n n n p 481 690 2048 2048 690 690 none +i s8 r n n n p 481 690 2048 2048 690 690 none +i s32 s8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +i s8 s8 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +i s32 r n n n p 482 690 2048 2048 690 690 none +i s8 r n n n p 482 690 2048 2048 690 690 none +i s32 s8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +i s8 s8 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +i s32 r n n n p 483 690 2048 2048 690 690 none +i s8 r n n n p 483 690 2048 2048 690 690 none +i s32 s8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +i s8 s8 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +i s32 r n n n p 484 690 2048 2048 690 690 none +i s8 r n n n p 484 690 2048 2048 690 690 none +i s32 s8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +i s8 s8 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +i s32 r n n n p 485 690 2048 2048 690 690 none +i s8 r n n n p 485 690 2048 2048 690 690 none +i s32 s8 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +i s8 s8 r n n n p 485 690 2048 
2048 690 690 bias,relu,clip +i s32 r n n n p 480 656 1024 1024 656 656 none +i s8 r n n n p 480 656 1024 1024 656 656 none +i s32 s8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +i s8 s8 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +i s32 r n n n p 480 128 3 3 128 128 none +i s8 r n n n p 480 128 3 3 128 128 none +i s32 s8 r n n n p 480 128 3 3 128 128 bias,relu,clip +i s8 s8 r n n n p 480 128 3 3 128 128 bias,relu,clip +i s32 r n n n p 1024 512 515 515 512 512 none +i s8 r n n n p 1024 512 515 515 512 512 none +i s32 s8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +i s8 s8 r n n n p 1024 512 515 515 512 512 bias,relu,clip +i s32 r n n n p 1024 2048 1024 1024 2048 2048 none +i s8 r n n n p 1024 2048 1024 1024 2048 2048 none +i s32 s8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +i s8 s8 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +i s32 r n n n p 1024 2048 515 515 2048 2048 none +i s8 r n n n p 1024 2048 515 515 2048 2048 none +i s32 s8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +i s8 s8 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +i s32 r n n n p 1024 1040 515 515 1040 1040 none +i s8 r n n n p 1024 1040 515 515 1040 1040 none +i s32 s8 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +i s8 s8 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +i s32 r n n n p 5 1029 515 515 1029 1029 none +i s8 r n n n p 5 1029 515 515 1029 1029 none +i s32 s8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +i s8 s8 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +i s32 r n n n p 1024 1029 515 515 1029 1029 none +i s8 r n n n p 1024 1029 515 515 1029 1029 none +i s32 s8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +i s8 s8 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +i s32 r n n n p 1024 1040 2050 2050 1040 1040 none +i s8 r n n n p 1024 1040 2050 2050 1040 1040 none +i s32 s8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +i s8 s8 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +i 
s32 r n n n p 1029 1029 2050 2050 1029 1029 none +i s8 r n n n p 1029 1029 2050 2050 1029 1029 none +i s32 s8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +i s8 s8 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +i s32 r n n n R 480 646 2050 2050 646 646 none +i s8 r n n n R 480 646 2050 2050 646 646 none +i s32 s8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +i s8 s8 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +i s32 r n n n R 481 646 2050 2050 646 646 none +i s8 r n n n R 481 646 2050 2050 646 646 none +i s32 s8 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +i s8 s8 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +i s32 r n n n R 482 646 2050 2050 646 646 none +i s8 r n n n R 482 646 2050 2050 646 646 none +i s32 s8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +i s8 s8 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +i s32 r n n n R 483 646 2050 2050 646 646 none +i s8 r n n n R 483 646 2050 2050 646 646 none +i s32 s8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +i s8 s8 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +i s32 r n n n R 484 646 2050 2050 646 646 none +i s8 r n n n R 484 646 2050 2050 646 646 none +i s32 s8 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +i s8 s8 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +i s32 r n n n R 485 646 2050 2050 646 646 none +i s8 r n n n R 485 646 2050 2050 646 646 none +i s32 s8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +i s8 s8 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +i s32 r n n n R 481 656 2050 2050 656 656 none +i s8 r n n n R 481 656 2050 2050 656 656 none +i s32 s8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +i s8 s8 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +i s32 r n n n R 482 656 2050 2050 656 656 none +i s8 r n n n R 482 656 2050 2050 656 656 none +i s32 s8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +i s8 s8 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +i s32 r n n n R 483 656 2050 2050 656 656 
none +i s8 r n n n R 483 656 2050 2050 656 656 none +i s32 s8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +i s8 s8 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +i s32 r n n n R 484 656 2050 2050 656 656 none +i s8 r n n n R 484 656 2050 2050 656 656 none +i s32 s8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +i s8 s8 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +i s32 r n n n p 485 656 2050 2050 656 656 none +i s8 r n n n p 485 656 2050 2050 656 656 none +i s32 s8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +i s8 s8 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +i s32 r n n n p 480 672 2050 2050 672 672 none +i s8 r n n n p 480 672 2050 2050 672 672 none +i s32 s8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +i s8 s8 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +i s32 r n n n p 481 672 2050 2050 672 672 none +i s8 r n n n p 481 672 2050 2050 672 672 none +i s32 s8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +i s8 s8 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +i s32 r n n n p 482 672 2050 2050 672 672 none +i s8 r n n n p 482 672 2050 2050 672 672 none +i s32 s8 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +i s8 s8 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +i s32 r n n n p 483 672 2050 2050 672 672 none +i s8 r n n n p 483 672 2050 2050 672 672 none +i s32 s8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +i s8 s8 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +i s32 r n n n p 484 672 2050 2050 672 672 none +i s8 r n n n p 484 672 2050 2050 672 672 none +i s32 s8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +i s8 s8 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +i s32 r n n n p 485 672 2050 2050 672 672 none +i s8 r n n n p 485 672 2050 2050 672 672 none +i s32 s8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +i s8 s8 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +i s32 r n n n p 480 688 2050 2050 688 688 none +i s8 r n n n p 480 688 2050 2050 688 688 none +i 
s32 s8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +i s8 s8 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +i s32 r n n n p 481 688 2050 2050 688 688 none +i s8 r n n n p 481 688 2050 2050 688 688 none +i s32 s8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +i s8 s8 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +i s32 r n n n r 482 688 2050 2050 688 688 none +i s8 r n n n r 482 688 2050 2050 688 688 none +i s32 s8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +i s8 s8 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +i s32 r n n n r 483 688 2050 2050 688 688 none +i s8 r n n n r 483 688 2050 2050 688 688 none +i s32 s8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +i s8 s8 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +i s32 r n n n r 484 688 2050 2050 688 688 none +i s8 r n n n r 484 688 2050 2050 688 688 none +i s32 s8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +i s8 s8 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +i s32 r n n n r 485 688 2050 2050 688 688 none +i s8 r n n n r 485 688 2050 2050 688 688 none +i s32 s8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +i s8 s8 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +i s32 r n n n r 1024 512 64 64 512 512 none +i s8 r n n n r 1024 512 64 64 512 512 none +i s32 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +i s8 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +i s32 r n n n r 16 256 512 512 256 256 none +i s8 r n n n r 16 256 512 512 256 256 none +i s32 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +i s8 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +i s32 r n n n r 480 640 512 512 640 640 none +i s8 r n n n r 480 640 512 512 640 640 none +i s32 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +i s8 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +i s32 r n n n r 64 768 512 512 768 768 none +i s8 r n n n r 64 768 512 512 768 768 none +i s32 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +i s8 s8 r n n n r 64 768 512 512 768 
768 bias,relu,clip +i s32 r n n n r 128 128 128 128 128 128 none +i s8 r n n n r 128 128 128 128 128 128 none +i s32 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +i s8 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +i s32 r n n n r 1024 64 512 512 64 64 none +i s8 r n n n r 1024 64 512 512 64 64 none +i s32 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +i s8 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +i s32 r n n n r 1024 256 32 32 256 256 none +i s8 r n n n r 1024 256 32 32 256 256 none +i s32 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +i s8 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +i s32 r n n n r 1024 512 64 64 512 512 none +i s8 r n n n r 1024 512 64 64 512 512 none +i s32 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +i s8 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +i s32 r n n n r 480 640 512 512 640 640 none +i s8 r n n n r 480 640 512 512 640 640 none +i s32 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +i s8 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +i s32 r n n n p 1024 32 256 256 32 32 none +i s8 r n n n p 1024 32 256 256 32 32 none +i s32 s8 r n n n p 1024 32 256 256 32 32 bias,relu,clip +i s8 s8 r n n n p 1024 32 256 256 32 32 bias,relu,clip +i s32 r n n n P 1024 64 512 512 64 64 none +i s8 r n n n P 1024 64 512 512 64 64 none +i s32 s8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +i s8 s8 r n n n P 1024 64 512 512 64 64 bias,relu,clip +i s32 r n n n P 64 800 320 320 800 800 none +i s8 r n n n P 64 800 320 320 800 800 none +i s32 s8 r n n n P 64 800 320 320 800 800 bias,relu,clip +i s8 s8 r n n n P 64 800 320 320 800 800 bias,relu,clip +i s32 r n n n P 64 768 512 512 768 768 none +i s8 r n n n P 64 768 512 512 768 768 none +i s32 s8 r n n n P 64 768 512 512 768 768 bias,relu,clip +i s8 s8 r n n n P 64 768 512 512 768 768 bias,relu,clip +i s32 r n n n P 16 256 512 512 256 256 none +i s8 r n n n P 16 256 512 512 256 256 none +i s32 s8 r n n n P 16 256 512 512 256 256 bias,relu,clip 
+i s8 s8 r n n n P 16 256 512 512 256 256 bias,relu,clip +i s32 r n n n P 128 128 128 128 128 128 none +i s8 r n n n P 128 128 128 128 128 128 none +i s32 s8 r n n n P 128 128 128 128 128 128 bias,relu,clip +i s8 s8 r n n n P 128 128 128 128 128 128 bias,relu,clip +i s32 r n n n P 256 512 256 256 512 512 none +i s8 r n n n P 256 512 256 256 512 512 none +i s32 s8 r n n n P 256 512 256 256 512 512 bias,relu,clip +i s8 s8 r n n n P 256 512 256 256 512 512 bias,relu,clip +i s32 r n n n P 1024 1024 1024 1024 1024 1024 none +i s8 r n n n P 1024 1024 1024 1024 1024 1024 none +i s32 s8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +i s8 s8 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +i s32 r n n n P 480 640 1024 1024 640 640 none +i s8 r n n n P 480 640 1024 1024 640 640 none +i s32 s8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +i s8 s8 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +i s32 r n n n P 480 640 256 256 640 640 none +i s8 r n n n P 480 640 256 256 640 640 none +i s32 s8 r n n n P 480 640 256 256 640 640 bias,relu,clip +i s8 s8 r n n n P 480 640 256 256 640 640 bias,relu,clip +i s32 r n n n P 8 64 32 32 64 64 none +i s8 r n n n P 8 64 32 32 64 64 none +i s32 s8 r n n n P 8 64 32 32 64 64 bias,relu,clip +i s8 s8 r n n n P 8 64 32 32 64 64 bias,relu,clip +i s32 r n n n P 9 64 32 32 64 64 none +i s8 r n n n P 9 64 32 32 64 64 none +i s32 s8 r n n n P 9 64 32 32 64 64 bias,relu,clip +i s8 s8 r n n n P 9 64 32 32 64 64 bias,relu,clip +i s32 r n n n P 10 128 64 64 128 128 none +i s8 r n n n P 10 128 64 64 128 128 none +i s32 s8 r n n n P 10 128 64 64 128 128 bias,relu,clip +i s8 s8 r n n n P 10 128 64 64 128 128 bias,relu,clip +i s32 r n n n P 8 8 8 8 8 8 none +i s8 r n n n P 8 8 8 8 8 8 none +i s32 s8 r n n n P 8 8 8 8 8 8 bias,relu,clip +i s8 s8 r n n n P 8 8 8 8 8 8 bias,relu,clip +i s32 r n n n P 12 12 12 12 12 12 none +i s8 r n n n P 12 12 12 12 12 12 none +i s32 s8 r n n n P 12 12 12 12 12 12 bias,relu,clip +i s8 s8 r n n n P 12 
12 12 12 12 12 bias,relu,clip +i s32 r n n n P 25 25 25 25 25 25 none +i s8 r n n n P 25 25 25 25 25 25 none +i s32 s8 r n n n P 25 25 25 25 25 25 bias,relu,clip +i s8 s8 r n n n P 25 25 25 25 25 25 bias,relu,clip +i s32 r n n n P 25 25 20 20 25 25 none +i s8 r n n n P 25 25 20 20 25 25 none +i s32 s8 r n n n P 25 25 20 20 25 25 bias,relu,clip +i s8 s8 r n n n P 25 25 20 20 25 25 bias,relu,clip +f f32 r n n n p 480 20 2050 2050 20 20 none +f f32 f32 r n n n p 480 20 2050 2050 20 20 bias,relu,clip +f f32 r n n n p 481 20 2050 2050 20 20 none +f f32 f32 r n n n p 481 20 2050 2050 20 20 bias,relu,clip +f f32 r n n n p 482 20 2050 2050 20 20 none +f f32 f32 r n n n p 482 20 2050 2050 20 20 bias,relu,clip +f f32 r n n n p 483 20 2050 2050 20 20 none +f f32 f32 r n n n p 483 20 2050 2050 20 20 bias,relu,clip +f f32 r n n n R 484 20 2050 2050 20 20 none +f f32 f32 r n n n R 484 20 2050 2050 20 20 bias,relu,clip +f f32 r n n n R 485 20 2050 2050 20 20 none +f f32 f32 r n n n R 485 20 2050 2050 20 20 bias,relu,clip +f f32 r n n n R 480 39 2050 2050 39 39 none +f f32 f32 r n n n R 480 39 2050 2050 39 39 bias,relu,clip +f f32 r n n n R 481 39 2050 2050 39 39 none +f f32 f32 r n n n R 481 39 2050 2050 39 39 bias,relu,clip +f f32 r n n n R 482 39 2050 2050 39 39 none +f f32 f32 r n n n R 482 39 2050 2050 39 39 bias,relu,clip +f f32 r n n n R 483 39 2050 2050 39 39 none +f f32 f32 r n n n R 483 39 2050 2050 39 39 bias,relu,clip +f f32 r n n n R 484 39 2050 2050 39 39 none +f f32 f32 r n n n R 484 39 2050 2050 39 39 bias,relu,clip +f f32 r n n n p 485 39 2050 2050 39 39 none +f f32 f32 r n n n p 485 39 2050 2050 39 39 bias,relu,clip +f f32 r n n n p 480 50 2050 2050 50 50 none +f f32 f32 r n n n p 480 50 2050 2050 50 50 bias,relu,clip +f f32 r n n n p 481 50 2050 2050 50 50 none +f f32 f32 r n n n p 481 50 2050 2050 50 50 bias,relu,clip +f f32 r n n n p 482 50 2050 2050 50 50 none +f f32 f32 r n n n p 482 50 2050 2050 50 50 bias,relu,clip +f f32 r n n n p 483 50 2050 2050 50 50 
none +f f32 f32 r n n n p 483 50 2050 2050 50 50 bias,relu,clip +f f32 r n n n p 484 50 2050 2050 50 50 none +f f32 f32 r n n n p 484 50 2050 2050 50 50 bias,relu,clip +f f32 r n n n p 485 50 2050 2050 50 50 none +f f32 f32 r n n n p 485 50 2050 2050 50 50 bias,relu,clip +f f32 r n n n R 480 1108 2050 2050 1108 1108 none +f f32 f32 r n n n R 480 1108 2050 2050 1108 1108 bias,relu,clip +f f32 r n n n R 481 1108 2050 2050 1108 1108 none +f f32 f32 r n n n R 481 1108 2050 2050 1108 1108 bias,relu,clip +f f32 r n n n R 482 1108 2050 2050 1108 1108 none +f f32 f32 r n n n R 482 1108 2050 2050 1108 1108 bias,relu,clip +f f32 r n n n R 483 1108 2050 2050 1108 1108 none +f f32 f32 r n n n R 483 1108 2050 2050 1108 1108 bias,relu,clip +f f32 r n n n R 484 1108 2050 2050 1108 1108 none +f f32 f32 r n n n R 484 1108 2050 2050 1108 1108 bias,relu,clip +f f32 r n n n R 485 1108 2050 2050 1108 1108 none +f f32 f32 r n n n R 485 1108 2050 2050 1108 1108 bias,relu,clip +f f32 r n n n R 480 1127 2050 2050 1127 1127 none +f f32 f32 r n n n R 480 1127 2050 2050 1127 1127 bias,relu,clip +f f32 r n n n R 481 1127 2050 2050 1127 1127 none +f f32 f32 r n n n R 481 1127 2050 2050 1127 1127 bias,relu,clip +f f32 r n n n R 482 1127 2050 2050 1127 1127 none +f f32 f32 r n n n R 482 1127 2050 2050 1127 1127 bias,relu,clip +f f32 r n n n R 483 1127 2050 2050 1127 1127 none +f f32 f32 r n n n R 483 1127 2050 2050 1127 1127 bias,relu,clip +f f32 r n n n p 484 1127 2050 2050 1127 1127 none +f f32 f32 r n n n p 484 1127 2050 2050 1127 1127 bias,relu,clip +f f32 r n n n p 485 1127 2050 2050 1127 1127 none +f f32 f32 r n n n p 485 1127 2050 2050 1127 1127 bias,relu,clip +f f32 r n n n p 480 1138 2050 2050 1138 1138 none +f f32 f32 r n n n p 480 1138 2050 2050 1138 1138 bias,relu,clip +f f32 r n n n p 481 1138 2050 2050 1138 1138 none +f f32 f32 r n n n p 481 1138 2050 2050 1138 1138 bias,relu,clip +f f32 r n n n p 482 1138 2050 2050 1138 1138 none +f f32 f32 r n n n p 482 1138 2050 2050 1138 1138 
bias,relu,clip +f f32 r n n n p 483 1138 2050 2050 1138 1138 none +f f32 f32 r n n n p 483 1138 2050 2050 1138 1138 bias,relu,clip +f f32 r n n n p 484 1138 2050 2050 1138 1138 none +f f32 f32 r n n n p 484 1138 2050 2050 1138 1138 bias,relu,clip +f f32 r n n n p 485 1138 2050 2050 1138 1138 none +f f32 f32 r n n n p 485 1138 2050 2050 1138 1138 bias,relu,clip +f f32 r n n n p 1 1 3 3 1 1 none +f f32 f32 r n n n p 1 1 3 3 1 1 bias,relu,clip +f f32 r n n n p 1 9 3 3 9 9 none +f f32 f32 r n n n p 1 9 3 3 9 9 bias,relu,clip +f f32 r n n n p 1 2048 3 3 2048 2048 none +f f32 f32 r n n n p 1 2048 3 3 2048 2048 bias,relu,clip +f f32 r n n n p 1 2048 5192 5192 2048 2048 none +f f32 f32 r n n n p 1 2048 5192 5192 2048 2048 bias,relu,clip +f f32 r n n n p 9 1 3 3 1 1 none +f f32 f32 r n n n p 9 1 3 3 1 1 bias,relu,clip +f f32 r n n n p 576 1 3500 3500 1 1 none +f f32 f32 r n n n p 576 1 3500 3500 1 1 bias,relu,clip +f f32 r n n n p 1 1 1 1 1 1 none +f f32 f32 r n n n p 1 1 1 1 1 1 bias,relu,clip +f f32 r n n n p 102 1088 1024 1024 1088 1088 none +f f32 f32 r n n n p 102 1088 1024 1024 1088 1088 bias,relu,clip +f f32 r n n n p 102 2048 1024 1024 2048 2048 none +f f32 f32 r n n n p 102 2048 1024 1024 2048 2048 bias,relu,clip +f f32 r n n n p 485 656 1024 1024 656 656 none +f f32 f32 r n n n p 485 656 1024 1024 656 656 bias,relu,clip +f f32 r n n n p 483 656 1024 1024 656 656 none +f f32 f32 r n n n p 483 656 1024 1024 656 656 bias,relu,clip +f f32 r n n n p 81 128 3 3 128 128 none +f f32 f32 r n n n p 81 128 3 3 128 128 bias,relu,clip +f f32 r n n n p 1022 512 515 515 512 512 none +f f32 f32 r n n n p 1022 512 515 515 512 512 bias,relu,clip +f f32 r n n n p 74 512 515 515 512 512 none +f f32 f32 r n n n p 74 512 515 515 512 512 bias,relu,clip +f f32 r n n n p 253 2048 515 515 2048 2048 none +f f32 f32 r n n n p 253 2048 515 515 2048 2048 bias,relu,clip +f f32 r n n n p 8192 1040 515 515 1040 1040 none +f f32 f32 r n n n p 8192 1040 515 515 1040 1040 bias,relu,clip +f f32 r n n 
n p 10 1029 515 515 1029 1029 none +f f32 f32 r n n n p 10 1029 515 515 1029 1029 bias,relu,clip +f f32 r n n n p 24 1040 2050 2050 1040 1040 none +f f32 f32 r n n n p 24 1040 2050 2050 1040 1040 bias,relu,clip +f f32 r n n n p 1024 1029 2050 2050 1029 1029 none +f f32 f32 r n n n p 1024 1029 2050 2050 1029 1029 bias,relu,clip +f f32 r n n n p 480 660 2050 2050 660 660 none +f f32 f32 r n n n p 480 660 2050 2050 660 660 bias,relu,clip +f f32 r n n n p 481 660 2050 2050 660 660 none +f f32 f32 r n n n p 481 660 2050 2050 660 660 bias,relu,clip +f f32 r n n n p 482 660 2050 2050 660 660 none +f f32 f32 r n n n p 482 660 2050 2050 660 660 bias,relu,clip +f f32 r n n n p 483 660 2050 2050 660 660 none +f f32 f32 r n n n p 483 660 2050 2050 660 660 bias,relu,clip +f f32 r n n n p 484 660 2050 2050 660 660 none +f f32 f32 r n n n p 484 660 2050 2050 660 660 bias,relu,clip +f f32 r n n n p 485 660 2050 2050 660 660 none +f f32 f32 r n n n p 485 660 2050 2050 660 660 bias,relu,clip +f f32 r n n n p 480 679 2050 2050 679 679 none +f f32 f32 r n n n p 480 679 2050 2050 679 679 bias,relu,clip +f f32 r n n n p 481 679 2050 2050 679 679 none +f f32 f32 r n n n p 481 679 2050 2050 679 679 bias,relu,clip +f f32 r n n n p 482 679 2050 2050 679 679 none +f f32 f32 r n n n p 482 679 2050 2050 679 679 bias,relu,clip +f f32 r n n n p 483 679 2050 2050 679 679 none +f f32 f32 r n n n p 483 679 2050 2050 679 679 bias,relu,clip +f f32 r n n n p 484 679 2050 2050 679 679 none +f f32 f32 r n n n p 484 679 2050 2050 679 679 bias,relu,clip +f f32 r n n n p 485 679 2050 2050 679 679 none +f f32 f32 r n n n p 485 679 2050 2050 679 679 bias,relu,clip +f f32 r n n n p 480 690 2050 2050 690 690 none +f f32 f32 r n n n p 480 690 2050 2050 690 690 bias,relu,clip +f f32 r n n n p 481 690 2050 2050 690 690 none +f f32 f32 r n n n p 481 690 2050 2050 690 690 bias,relu,clip +f f32 r n n n p 482 690 2050 2050 690 690 none +f f32 f32 r n n n p 482 690 2050 2050 690 690 bias,relu,clip +f f32 r n n n p 483 
690 2050 2050 690 690 none +f f32 f32 r n n n p 483 690 2050 2050 690 690 bias,relu,clip +f f32 r n n n p 484 690 2050 2050 690 690 none +f f32 f32 r n n n p 484 690 2050 2050 690 690 bias,relu,clip +f f32 r n n n p 485 690 2050 2050 690 690 none +f f32 f32 r n n n p 485 690 2050 2050 690 690 bias,relu,clip +f f32 r n n n p 480 660 2048 2048 660 660 none +f f32 f32 r n n n p 480 660 2048 2048 660 660 bias,relu,clip +f f32 r n n n p 481 660 2048 2048 660 660 none +f f32 f32 r n n n p 481 660 2048 2048 660 660 bias,relu,clip +f f32 r n n n p 482 660 2048 2048 660 660 none +f f32 f32 r n n n p 482 660 2048 2048 660 660 bias,relu,clip +f f32 r n n n p 483 660 2048 2048 660 660 none +f f32 f32 r n n n p 483 660 2048 2048 660 660 bias,relu,clip +f f32 r n n n p 484 660 2048 2048 660 660 none +f f32 f32 r n n n p 484 660 2048 2048 660 660 bias,relu,clip +f f32 r n n n p 485 660 2048 2048 660 660 none +f f32 f32 r n n n p 485 660 2048 2048 660 660 bias,relu,clip +f f32 r n n n p 480 679 2048 2048 679 679 none +f f32 f32 r n n n p 480 679 2048 2048 679 679 bias,relu,clip +f f32 r n n n p 481 679 2048 2048 679 679 none +f f32 f32 r n n n p 481 679 2048 2048 679 679 bias,relu,clip +f f32 r n n n p 482 679 2048 2048 679 679 none +f f32 f32 r n n n p 482 679 2048 2048 679 679 bias,relu,clip +f f32 r n n n p 483 679 2048 2048 679 679 none +f f32 f32 r n n n p 483 679 2048 2048 679 679 bias,relu,clip +f f32 r n n n p 484 679 2048 2048 679 679 none +f f32 f32 r n n n p 484 679 2048 2048 679 679 bias,relu,clip +f f32 r n n n p 485 679 2048 2048 679 679 none +f f32 f32 r n n n p 485 679 2048 2048 679 679 bias,relu,clip +f f32 r n n n p 480 690 2048 2048 690 690 none +f f32 f32 r n n n p 480 690 2048 2048 690 690 bias,relu,clip +f f32 r n n n p 481 690 2048 2048 690 690 none +f f32 f32 r n n n p 481 690 2048 2048 690 690 bias,relu,clip +f f32 r n n n p 482 690 2048 2048 690 690 none +f f32 f32 r n n n p 482 690 2048 2048 690 690 bias,relu,clip +f f32 r n n n p 483 690 2048 2048 690 
690 none +f f32 f32 r n n n p 483 690 2048 2048 690 690 bias,relu,clip +f f32 r n n n p 484 690 2048 2048 690 690 none +f f32 f32 r n n n p 484 690 2048 2048 690 690 bias,relu,clip +f f32 r n n n p 485 690 2048 2048 690 690 none +f f32 f32 r n n n p 485 690 2048 2048 690 690 bias,relu,clip +f f32 r n n n p 480 656 1024 1024 656 656 none +f f32 f32 r n n n p 480 656 1024 1024 656 656 bias,relu,clip +f f32 r n n n p 480 128 3 3 128 128 none +f f32 f32 r n n n p 480 128 3 3 128 128 bias,relu,clip +f f32 r n n n p 1024 512 515 515 512 512 none +f f32 f32 r n n n p 1024 512 515 515 512 512 bias,relu,clip +f f32 r n n n p 1024 2048 1024 1024 2048 2048 none +f f32 f32 r n n n p 1024 2048 1024 1024 2048 2048 bias,relu,clip +f f32 r n n n p 1024 2048 515 515 2048 2048 none +f f32 f32 r n n n p 1024 2048 515 515 2048 2048 bias,relu,clip +f f32 r n n n p 1024 1040 515 515 1040 1040 none +f f32 f32 r n n n p 1024 1040 515 515 1040 1040 bias,relu,clip +f f32 r n n n p 5 1029 515 515 1029 1029 none +f f32 f32 r n n n p 5 1029 515 515 1029 1029 bias,relu,clip +f f32 r n n n p 1024 1029 515 515 1029 1029 none +f f32 f32 r n n n p 1024 1029 515 515 1029 1029 bias,relu,clip +f f32 r n n n p 1024 1040 2050 2050 1040 1040 none +f f32 f32 r n n n p 1024 1040 2050 2050 1040 1040 bias,relu,clip +f f32 r n n n p 1029 1029 2050 2050 1029 1029 none +f f32 f32 r n n n p 1029 1029 2050 2050 1029 1029 bias,relu,clip +f f32 r n n n R 480 646 2050 2050 646 646 none +f f32 f32 r n n n R 480 646 2050 2050 646 646 bias,relu,clip +f f32 r n n n R 481 646 2050 2050 646 646 none +f f32 f32 r n n n R 481 646 2050 2050 646 646 bias,relu,clip +f f32 r n n n R 482 646 2050 2050 646 646 none +f f32 f32 r n n n R 482 646 2050 2050 646 646 bias,relu,clip +f f32 r n n n R 483 646 2050 2050 646 646 none +f f32 f32 r n n n R 483 646 2050 2050 646 646 bias,relu,clip +f f32 r n n n R 484 646 2050 2050 646 646 none +f f32 f32 r n n n R 484 646 2050 2050 646 646 bias,relu,clip +f f32 r n n n R 485 646 2050 2050 646 
646 none +f f32 f32 r n n n R 485 646 2050 2050 646 646 bias,relu,clip +f f32 r n n n R 481 656 2050 2050 656 656 none +f f32 f32 r n n n R 481 656 2050 2050 656 656 bias,relu,clip +f f32 r n n n R 482 656 2050 2050 656 656 none +f f32 f32 r n n n R 482 656 2050 2050 656 656 bias,relu,clip +f f32 r n n n R 483 656 2050 2050 656 656 none +f f32 f32 r n n n R 483 656 2050 2050 656 656 bias,relu,clip +f f32 r n n n R 484 656 2050 2050 656 656 none +f f32 f32 r n n n R 484 656 2050 2050 656 656 bias,relu,clip +f f32 r n n n p 485 656 2050 2050 656 656 none +f f32 f32 r n n n p 485 656 2050 2050 656 656 bias,relu,clip +f f32 r n n n p 480 672 2050 2050 672 672 none +f f32 f32 r n n n p 480 672 2050 2050 672 672 bias,relu,clip +f f32 r n n n p 481 672 2050 2050 672 672 none +f f32 f32 r n n n p 481 672 2050 2050 672 672 bias,relu,clip +f f32 r n n n p 482 672 2050 2050 672 672 none +f f32 f32 r n n n p 482 672 2050 2050 672 672 bias,relu,clip +f f32 r n n n p 483 672 2050 2050 672 672 none +f f32 f32 r n n n p 483 672 2050 2050 672 672 bias,relu,clip +f f32 r n n n p 484 672 2050 2050 672 672 none +f f32 f32 r n n n p 484 672 2050 2050 672 672 bias,relu,clip +f f32 r n n n p 485 672 2050 2050 672 672 none +f f32 f32 r n n n p 485 672 2050 2050 672 672 bias,relu,clip +f f32 r n n n p 480 688 2050 2050 688 688 none +f f32 f32 r n n n p 480 688 2050 2050 688 688 bias,relu,clip +f f32 r n n n p 481 688 2050 2050 688 688 none +f f32 f32 r n n n p 481 688 2050 2050 688 688 bias,relu,clip +f f32 r n n n r 482 688 2050 2050 688 688 none +f f32 f32 r n n n r 482 688 2050 2050 688 688 bias,relu,clip +f f32 r n n n r 483 688 2050 2050 688 688 none +f f32 f32 r n n n r 483 688 2050 2050 688 688 bias,relu,clip +f f32 r n n n r 484 688 2050 2050 688 688 none +f f32 f32 r n n n r 484 688 2050 2050 688 688 bias,relu,clip +f f32 r n n n r 485 688 2050 2050 688 688 none +f f32 f32 r n n n r 485 688 2050 2050 688 688 bias,relu,clip +f f32 r n n n r 1024 512 64 64 512 512 none +f f32 f32 r 
n n n r 1024 512 64 64 512 512 bias,relu,clip +f f32 r n n n r 16 256 512 512 256 256 none +f f32 f32 r n n n r 16 256 512 512 256 256 bias,relu,clip +f f32 r n n n r 480 640 512 512 640 640 none +f f32 f32 r n n n r 480 640 512 512 640 640 bias,relu,clip +f f32 r n n n r 64 768 512 512 768 768 none +f f32 f32 r n n n r 64 768 512 512 768 768 bias,relu,clip +f f32 r n n n r 128 128 128 128 128 128 none +f f32 f32 r n n n r 128 128 128 128 128 128 bias,relu,clip +f f32 r n n n r 1024 64 512 512 64 64 none +f f32 f32 r n n n r 1024 64 512 512 64 64 bias,relu,clip +f f32 r n n n r 1024 256 32 32 256 256 none +f f32 f32 r n n n r 1024 256 32 32 256 256 bias,relu,clip +f f32 r n n n r 1024 512 64 64 512 512 none +f f32 f32 r n n n r 1024 512 64 64 512 512 bias,relu,clip +f f32 r n n n r 480 640 512 512 640 640 none +f f32 f32 r n n n r 480 640 512 512 640 640 bias,relu,clip +f f32 r n n n p 1024 32 256 256 32 32 none +f f32 f32 r n n n p 1024 32 256 256 32 32 bias,relu,clip +f f32 r n n n P 1024 64 512 512 64 64 none +f f32 f32 r n n n P 1024 64 512 512 64 64 bias,relu,clip +f f32 r n n n P 64 800 320 320 800 800 none +f f32 f32 r n n n P 64 800 320 320 800 800 bias,relu,clip +f f32 r n n n P 64 768 512 512 768 768 none +f f32 f32 r n n n P 64 768 512 512 768 768 bias,relu,clip +f f32 r n n n P 16 256 512 512 256 256 none +f f32 f32 r n n n P 16 256 512 512 256 256 bias,relu,clip +f f32 r n n n P 128 128 128 128 128 128 none +f f32 f32 r n n n P 128 128 128 128 128 128 bias,relu,clip +f f32 r n n n P 256 512 256 256 512 512 none +f f32 f32 r n n n P 256 512 256 256 512 512 bias,relu,clip +f f32 r n n n P 1024 1024 1024 1024 1024 1024 none +f f32 f32 r n n n P 1024 1024 1024 1024 1024 1024 bias,relu,clip +f f32 r n n n P 480 640 1024 1024 640 640 none +f f32 f32 r n n n P 480 640 1024 1024 640 640 bias,relu,clip +f f32 r n n n P 480 640 256 256 640 640 none +f f32 f32 r n n n P 480 640 256 256 640 640 bias,relu,clip +f f32 r n n n P 8 64 32 32 64 64 none +f f32 f32 r n n 
n P 8 64 32 32 64 64 bias,relu,clip +f f32 r n n n P 9 64 32 32 64 64 none +f f32 f32 r n n n P 9 64 32 32 64 64 bias,relu,clip +f f32 r n n n P 10 128 64 64 128 128 none +f f32 f32 r n n n P 10 128 64 64 128 128 bias,relu,clip +f f32 r n n n P 8 8 8 8 8 8 none +f f32 f32 r n n n P 8 8 8 8 8 8 bias,relu,clip +f f32 r n n n P 12 12 12 12 12 12 none +f f32 f32 r n n n P 12 12 12 12 12 12 bias,relu,clip +f f32 r n n n P 25 25 25 25 25 25 none +f f32 f32 r n n n P 25 25 25 25 25 25 bias,relu,clip +f f32 r n n n P 25 25 20 20 25 25 none +f f32 f32 r n n n P 25 25 20 20 25 25 bias,relu,clip +i s32 r n n n r 4096 256 5 5 256 256 none +i s8 r n n n r 4096 256 5 5 256 256 none +i s32 s8 r n n n r 4096 256 5 5 256 256 bias,relu,clip +i s8 s8 r n n n r 4096 256 5 5 256 256 bias,relu,clip +i s32 r n n n r 3000 256 128 128 256 256 none +i s8 r n n n r 3000 256 128 128 256 256 none +i s32 s8 r n n n r 3000 256 128 128 256 256 bias,relu,clip +i s8 s8 r n n n r 3000 256 128 128 256 256 bias,relu,clip +i s32 r n n n r 4096 1024 512 512 1024 1024 none +i s8 r n n n r 4096 1024 512 512 1024 1024 none +i s32 s8 r n n n r 4096 1024 512 512 1024 1024 bias,relu,clip +i s8 s8 r n n n r 4096 1024 512 512 1024 1024 bias,relu,clip +i s32 r n n n r 144 256 5 5 256 256 none +i s8 r n n n r 144 256 5 5 256 256 none +i s32 s8 r n n n r 144 256 5 5 256 256 bias,relu,clip +i s8 s8 r n n n r 144 256 5 5 256 256 bias,relu,clip +i s32 r n n n r 144 256 128 128 256 256 none +i s8 r n n n r 144 256 128 128 256 256 none +i s32 s8 r n n n r 144 256 128 128 256 256 bias,relu,clip +i s8 s8 r n n n r 144 256 128 128 256 256 bias,relu,clip +i s32 r n n n r 144 1024 512 512 1024 1024 none +i s8 r n n n r 144 1024 512 512 1024 1024 none +i s32 s8 r n n n r 144 1024 512 512 1024 1024 bias,relu,clip +i s8 s8 r n n n r 144 1024 512 512 1024 1024 bias,relu,clip +i s32 r n n n r 480 688 256 256 688 688 none +i s8 r n n n r 480 688 256 256 688 688 none +i s32 s8 r n n n r 480 688 256 256 688 688 bias,relu,clip +i s8 
s8 r n n n r 480 688 256 256 688 688 bias,relu,clip +i s32 r n n n r 480 640 512 512 640 640 none +i s8 r n n n r 480 640 512 512 640 640 none +i s32 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +i s8 s8 r n n n r 480 640 512 512 640 640 bias,relu,clip +i s32 r n n n r 480 640 1024 1024 640 640 none +i s8 r n n n r 480 640 1024 1024 640 640 none +i s32 s8 r n n n r 480 640 1024 1024 640 640 bias,relu,clip +i s8 s8 r n n n r 480 640 1024 1024 640 640 bias,relu,clip +i s32 r n n n r 64 800 320 320 800 800 none +i s8 r n n n r 64 800 320 320 800 800 none +i s32 s8 r n n n r 64 800 320 320 800 800 bias,relu,clip +i s8 s8 r n n n r 64 800 320 320 800 800 bias,relu,clip +i s32 r n n n r 64 768 512 512 768 768 none +i s8 r n n n r 64 768 512 512 768 768 none +i s32 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +i s8 s8 r n n n r 64 768 512 512 768 768 bias,relu,clip +i s32 r n n n r 16 256 512 512 256 256 none +i s8 r n n n r 16 256 512 512 256 256 none +i s32 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +i s8 s8 r n n n r 16 256 512 512 256 256 bias,relu,clip +i s32 r n n n r 128 128 128 128 128 128 none +i s8 r n n n r 128 128 128 128 128 128 none +i s32 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +i s8 s8 r n n n r 128 128 128 128 128 128 bias,relu,clip +i s32 r n n n r 256 512 256 256 512 512 none +i s8 r n n n r 256 512 256 256 512 512 none +i s32 s8 r n n n r 256 512 256 256 512 512 bias,relu,clip +i s8 s8 r n n n r 256 512 256 256 512 512 bias,relu,clip +i s32 r n n n r 1024 1024 1024 1024 1024 1024 none +i s8 r n n n r 1024 1024 1024 1024 1024 1024 none +i s32 s8 r n n n r 1024 1024 1024 1024 1024 1024 bias,relu,clip +i s8 s8 r n n n r 1024 1024 1024 1024 1024 1024 bias,relu,clip +i s32 r n n n r 1024 32 256 256 32 32 none +i s8 r n n n r 1024 32 256 256 32 32 none +i s32 s8 r n n n r 1024 32 256 256 32 32 bias,relu,clip +i s8 s8 r n n n r 1024 32 256 256 32 32 bias,relu,clip +i s32 r n n n r 1024 64 512 512 64 64 none +i s8 r n n n r 
1024 64 512 512 64 64 none +i s32 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +i s8 s8 r n n n r 1024 64 512 512 64 64 bias,relu,clip +i s32 r n n n r 1024 256 32 32 256 256 none +i s8 r n n n r 1024 256 32 32 256 256 none +i s32 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +i s8 s8 r n n n r 1024 256 32 32 256 256 bias,relu,clip +i s32 r n n n r 1024 512 64 64 512 512 none +i s8 r n n n r 1024 512 64 64 512 512 none +i s32 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +i s8 s8 r n n n r 1024 512 64 64 512 512 bias,relu,clip +i s32 r n n n r 512 32 256 256 32 32 none +i s8 r n n n r 512 32 256 256 32 32 none +i s32 s8 r n n n r 512 32 256 256 32 32 bias,relu,clip +i s8 s8 r n n n r 512 32 256 256 32 32 bias,relu,clip +i s32 r n n n r 512 768 512 512 768 768 none +i s8 r n n n r 512 768 512 512 768 768 none +i s32 s8 r n n n r 512 768 512 512 768 768 bias,relu,clip +i s8 s8 r n n n r 512 768 512 512 768 768 bias,relu,clip +i s32 r n n n r 512 256 32 32 256 256 none +i s8 r n n n r 512 256 32 32 256 256 none +i s32 s8 r n n n r 512 256 32 32 256 256 bias,relu,clip +i s8 s8 r n n n r 512 256 32 32 256 256 bias,relu,clip +i s32 r n n n r 512 512 64 64 512 512 none +i s8 r n n n r 512 512 64 64 512 512 none +i s32 s8 r n n n r 512 512 64 64 512 512 bias,relu,clip +i s8 s8 r n n n r 512 512 64 64 512 512 bias,relu,clip +i s32 r n n n r 512 256 768 768 256 256 none +i s8 r n n n r 512 256 768 768 256 256 none +i s32 s8 r n n n r 512 256 768 768 256 256 bias,relu,clip +i s8 s8 r n n n r 512 256 768 768 256 256 bias,relu,clip +i s32 r n n n r 768 768 1024 1024 768 768 none +i s8 r n n n r 768 768 1024 1024 768 768 none +i s32 s8 r n n n r 768 768 1024 1024 768 768 bias,relu,clip +i s8 s8 r n n n r 768 768 1024 1024 768 768 bias,relu,clip +i s32 r n n n r 768 768 768 768 768 768 none +i s8 r n n n r 768 768 768 768 768 768 none +i s32 s8 r n n n r 768 768 768 768 768 768 bias,relu,clip +i s8 s8 r n n n r 768 768 768 768 768 768 bias,relu,clip +i s32 r n n n 
r 2048 2048 2048 2048 2048 2048 none +i s8 r n n n r 2048 2048 2048 2048 2048 2048 none +i s32 s8 r n n n r 2048 2048 2048 2048 2048 2048 bias,relu,clip +i s8 s8 r n n n r 2048 2048 2048 2048 2048 2048 bias,relu,clip +i s32 r n n n r 4096 4096 4096 4096 4096 4096 none +i s8 r n n n r 4096 4096 4096 4096 4096 4096 none +i s32 s8 r n n n r 4096 4096 4096 4096 4096 4096 bias,relu,clip +i s8 s8 r n n n r 4096 4096 4096 4096 4096 4096 bias,relu,clip +f f32 r n n n r 4096 256 5 5 256 256 none +f f32 f32 r n n n r 4096 256 5 5 256 256 bias,relu,clip +f f32 r n n n r 3000 256 128 128 256 256 none +f f32 f32 r n n n r 3000 256 128 128 256 256 bias,relu,clip +f f32 r n n n r 4096 1024 512 512 1024 1024 none +f f32 f32 r n n n r 4096 1024 512 512 1024 1024 bias,relu,clip +f f32 r n n n r 144 256 5 5 256 256 none +f f32 f32 r n n n r 144 256 5 5 256 256 bias,relu,clip +f f32 r n n n r 144 256 128 128 256 256 none +f f32 f32 r n n n r 144 256 128 128 256 256 bias,relu,clip +f f32 r n n n r 144 1024 512 512 1024 1024 none +f f32 f32 r n n n r 144 1024 512 512 1024 1024 bias,relu,clip +f f32 r n n n r 480 688 256 256 688 688 none +f f32 f32 r n n n r 480 688 256 256 688 688 bias,relu,clip +f f32 r n n n r 480 640 512 512 640 640 none +f f32 f32 r n n n r 480 640 512 512 640 640 bias,relu,clip +f f32 r n n n r 480 640 1024 1024 640 640 none +f f32 f32 r n n n r 480 640 1024 1024 640 640 bias,relu,clip +f f32 r n n n r 64 800 320 320 800 800 none +f f32 f32 r n n n r 64 800 320 320 800 800 bias,relu,clip +f f32 r n n n r 64 768 512 512 768 768 none +f f32 f32 r n n n r 64 768 512 512 768 768 bias,relu,clip +f f32 r n n n r 16 256 512 512 256 256 none +f f32 f32 r n n n r 16 256 512 512 256 256 bias,relu,clip +f f32 r n n n r 128 128 128 128 128 128 none +f f32 f32 r n n n r 128 128 128 128 128 128 bias,relu,clip +f f32 r n n n r 256 512 256 256 512 512 none +f f32 f32 r n n n r 256 512 256 256 512 512 bias,relu,clip +f f32 r n n n r 1024 1024 1024 1024 1024 1024 none +f f32 f32 r n 
n n r 1024 1024 1024 1024 1024 1024 bias,relu,clip +f f32 r n n n r 1024 32 256 256 32 32 none +f f32 f32 r n n n r 1024 32 256 256 32 32 bias,relu,clip +f f32 r n n n r 1024 64 512 512 64 64 none +f f32 f32 r n n n r 1024 64 512 512 64 64 bias,relu,clip +f f32 r n n n r 1024 256 32 32 256 256 none +f f32 f32 r n n n r 1024 256 32 32 256 256 bias,relu,clip +f f32 r n n n r 1024 512 64 64 512 512 none +f f32 f32 r n n n r 1024 512 64 64 512 512 bias,relu,clip +f f32 r n n n r 512 32 256 256 32 32 none +f f32 f32 r n n n r 512 32 256 256 32 32 bias,relu,clip +f f32 r n n n r 512 768 512 512 768 768 none +f f32 f32 r n n n r 512 768 512 512 768 768 bias,relu,clip +f f32 r n n n r 512 256 32 32 256 256 none +f f32 f32 r n n n r 512 256 32 32 256 256 bias,relu,clip +f f32 r n n n r 512 512 64 64 512 512 none +f f32 f32 r n n n r 512 512 64 64 512 512 bias,relu,clip +f f32 r n n n r 512 256 768 768 256 256 none +f f32 f32 r n n n r 512 256 768 768 256 256 bias,relu,clip +f f32 r n n n r 768 768 1024 1024 768 768 none +f f32 f32 r n n n r 768 768 1024 1024 768 768 bias,relu,clip +f f32 r n n n r 768 768 768 768 768 768 none +f f32 f32 r n n n r 768 768 768 768 768 768 bias,relu,clip +f f32 r n n n r 2048 2048 2048 2048 2048 2048 none +f f32 f32 r n n n r 2048 2048 2048 2048 2048 2048 bias,relu,clip +f f32 r n n n r 4096 4096 4096 4096 4096 4096 none +f f32 f32 r n n n r 4096 4096 4096 4096 4096 4096 bias,relu,clip +f f32 r n n n r 2048 1024 1024 1024 1024 1024 none +f f32 f32 r n n n r 2048 1024 1024 1024 1024 1024 bias,relu,clip +f f32 r n n n r 2048 4096 1024 1024 4096 4096 none +f f32 f32 r n n n r 2048 4096 1024 1024 4096 4096 bias,relu,clip +f f32 r n n n r 2048 1024 4096 4096 1024 1024 none +f f32 f32 r n n n r 2048 1024 4096 4096 1024 1024 bias,relu,clip +f f32 r n n n r 2048 1024 2 2 1024 1024 none +f f32 f32 r n n n r 2048 1024 2 2 1024 1024 bias,relu,clip +f f32 r n n n r 128 1024 1024 1024 1024 1024 none +f f32 f32 r n n n r 128 1024 1024 1024 1024 1024 
bias,relu,clip +f f32 r n n n r 1536 768 768 768 768 768 none +f f32 f32 r n n n r 1536 768 768 768 768 768 bias,relu,clip +f f32 r n n n r 1536 3072 768 768 3072 3072 none +f f32 f32 r n n n r 1536 3072 768 768 3072 3072 bias,relu,clip +f f32 r n n n r 1536 768 3072 3072 768 768 none +f f32 f32 r n n n r 1536 768 3072 3072 768 768 bias,relu,clip +f f32 r n n n r 1536 768 2 2 768 768 none +f f32 f32 r n n n r 1536 768 2 2 768 768 bias,relu,clip +f f32 r n n n r 128 768 768 768 768 768 none +f f32 f32 r n n n r 128 768 768 768 768 768 bias,relu,clip +f f32 r n n n r 1024 8 13 13 8 8 none +f f32 f32 r n n n r 1024 8 13 13 8 8 bias,relu,clip +f f32 r n n n r 1024 4 8 8 4 4 none +f f32 f32 r n n n r 1024 4 8 8 4 4 bias,relu,clip +f f32 r n n n r 1024 128 355 355 128 128 none +f f32 f32 r n n n r 1024 128 355 355 128 128 bias,relu,clip +f f32 r n n n r 1024 64 128 128 64 64 none +f f32 f32 r n n n r 1024 64 128 128 64 64 bias,relu,clip +f f32 r n n n r 1024 1 64 64 1 1 none +f f32 f32 r n n n r 1024 1 64 64 1 1 bias,relu,clip +f f32 r n n n r 480 1 256 256 1 1 none +f f32 f32 r n n n r 480 1 256 256 1 1 bias,relu,clip +f f32 r n n n r 480 256 512 512 256 256 none +f f32 f32 r n n n r 480 256 512 512 256 256 bias,relu,clip +f f32 r n n n r 480 1024 845 845 1024 1024 none +f f32 f32 r n n n r 480 1024 845 845 1024 1024 bias,relu,clip +f f32 r n n n r 480 512 1024 1024 512 512 none +f f32 f32 r n n n r 480 512 1024 1024 512 512 bias,relu,clip +f f32 r n n n r 10 17191 128 128 17191 17191 none +f f32 f32 r n n n r 10 17191 128 128 17191 17191 bias,relu,clip +f f32 r n n n r 10 512 256 256 512 512 none +f f32 f32 r n n n r 10 512 256 256 512 512 bias,relu,clip diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 5b6fddb0f4..0bf7410193 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -43,6 +43,7 @@ #include "blis.h" + // Used to clip downscaled output, will be set in the main loop based 
// on the accumulation and C data type. int64_t DSCALE_CLIP_MIN = 0; @@ -67,39 +68,128 @@ dim_t num_eltwise = 0; // To keep track of eltwise operations. static inline void float_to_bf16( float* float_value, bfloat16* bf16_val ) { - /*Set offset 2 to copy most significant 2 bytes of float - to convert float values to bf16 values*/ - memcpy( ( bf16_val ), (char *)( float_value ) + 2, sizeof ( bfloat16 ) ); + /*Set offset 2 to copy most significant 2 bytes of float + to convert float values to bf16 values*/ + memcpy( ( bf16_val ), (char *)( float_value ) + 2, sizeof ( bfloat16 ) ); +} + +static inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size ) +{ + for (int i=0; i< size; i++) + { + float_to_bf16( ( array + i ), ( array_bf16 + i ) ); + } +} + + +static inline void bfloat16_to_float( bfloat16 bf16_val, float* float_val ) +{ + int32_t inter_temp = *( ( int16_t* ) &bf16_val ); + inter_temp = inter_temp << 16; + *float_val = *(( float* ) ( &inter_temp )); } -static inline float bf16_to_float +#define CONVERT_TO_FLOAT(ctype) \ +static inline void GEN_FUNC_NAME(ctype,_to_float) ( ctype val, float* float_val ) \ +{ \ + *float_val = (float) val; \ +} \ + +CONVERT_TO_FLOAT(uint8_t) +CONVERT_TO_FLOAT(int8_t) +CONVERT_TO_FLOAT(int16_t) +CONVERT_TO_FLOAT(float) +CONVERT_TO_FLOAT(int32_t) + + + +/* Helper functions to print matrices when debugging */ +void print_matrix_bfloat16 ( - bfloat16 bf16_val + bfloat16* a, + dim_t m, + dim_t n, + dim_t rs_a, + dim_t cs_a ) { - int32_t inter_temp = *( ( int16_t* ) &bf16_val ); - inter_temp = inter_temp << 16; - float float_value = 0.0; - memcpy( &float_value, &inter_temp, sizeof( int32_t ) ); - return float_value; + for(dim_t i = 0; i < m; i++) + { + for(dim_t j = 0; j < n; j++) + { + float temp; + bfloat16_to_float(*(a + i*(rs_a) + j *cs_a), &temp); + printf("%f ", temp); + } + printf("\n"); + } } -static inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size ) +#define 
PRINT_MATRIX(ctype) \ +void print_matrix_## ctype ( ctype* a, int32_t m, int32_t n, int32_t rs, int32_t cs) \ +{ \ + for(int32_t i = 0; i < m; i++) \ + { \ + for(int32_t j = 0; j < n; j++) \ + { \ + printf("%f ", (float) (*(a + i * ( rs ) + j * cs ) ) ); \ + } \ + printf("\n"); \ + } \ +} \ + +PRINT_MATRIX(uint8_t) +PRINT_MATRIX(int8_t) +PRINT_MATRIX(int16_t) +PRINT_MATRIX(float) +PRINT_MATRIX(int32_t) + +void* lpgemm_malloc( size_t size ) +{ + void* p; + if( bench_mode == 'a' ) + { + p = malloc(size); + } + else + { + err_t err = BLIS_SUCCESS; + p = bli_malloc_user(size, &err); + } + if ( p == NULL ) + { + printf("Unable to allocate memory.\n"); + exit(1); + } + return p; +} + +void lpgemm_free( void* p ) { - for (int i=0; i< size; i++) - { - float_to_bf16( ( array + i ), ( array_bf16 + i ) ); - } + if( p == NULL) + { + printf("Attempt to free null pointer\n"); + return; + } + + if( bench_mode == 'a' ) + { + free(p); + } + else + { + bli_free_user(p); + } } #define GEN_FILL_ARRAY_FUNC(ctype) \ void fill_array_ ## ctype ( void* arr, dim_t size ) \ { \ - ctype* temp_arr = ( ctype* ) arr; \ - for ( dim_t i = 0; i < size; ++i ) \ - { \ - temp_arr[i] = ( ctype )( i % 20 ); \ - } \ + ctype* temp_arr = ( ctype* ) arr; \ + for ( dim_t i = 0; i < size; ++i ) \ + { \ + temp_arr[i] = ( ctype )( i % 5 ); \ + } \ } \ GEN_FILL_ARRAY_FUNC(uint8_t) @@ -110,27 +200,27 @@ GEN_FILL_ARRAY_FUNC(int32_t) void fill_array_bfloat16( void* arr, dim_t size ) { - err_t bli_errors = BLIS_SUCCESS; - float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size, &bli_errors ); - for ( dim_t i = 0; i < size; ++i ) - { - c_float[i] = 2.0; - } - convert_float_arr_to_bf16( c_float, arr, size ); - if ( c_float != NULL ) - { - bli_free_user( c_float ); - } + err_t bli_errors = BLIS_SUCCESS; + float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size, &bli_errors ); + for ( dim_t i = 0; i < size; ++i ) + { + c_float[i] = i % 5; + } + convert_float_arr_to_bf16( c_float, arr, size ); + 
if ( c_float != NULL ) + { + bli_free_user( c_float ); + } } #define GEN_FILL_ARRAY_POST_OPS_FUNC(ctype) \ void fill_array_post_ops_ ## ctype ( void* arr, dim_t size ) \ { \ - ctype* temp_arr = ( ctype* ) arr; \ - for ( dim_t i = 0; i < size; ++i ) \ - { \ - temp_arr[i] = ( ctype )( i % 20 ); \ - } \ + ctype* temp_arr = ( ctype* ) arr; \ + for ( dim_t i = 0; i < size; ++i ) \ + { \ + temp_arr[i] = ( ctype )( i % 20 ); \ + } \ } \ GEN_FILL_ARRAY_POST_OPS_FUNC(int16_t) @@ -159,91 +249,91 @@ void mat_mul_ ## BLAS_SFX \ aocl_post_op* post_op\ ) \ { \ - char storage = stor_order; \ - char reordera = 'n'; \ - char reorderb = 'n'; \ + char storage = stor_order; \ + char reordera = 'n'; \ + char reorderb = 'n'; \ \ - if ( ( op_a == 'p' ) || ( op_a == 'P' ) ) \ - { \ - reordera = 'p'; \ - } \ - else if ( ( op_a == 'r' ) || ( op_a == 'R' ) ) \ - { \ - reordera = 'r'; \ - } \ + if ( ( op_a == 'p' ) || ( op_a == 'P' ) ) \ + { \ + reordera = 'p'; \ + } \ + else if ( ( op_a == 'r' ) || ( op_a == 'R' ) ) \ + { \ + reordera = 'r'; \ + } \ \ - if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ - { \ - /* No reordering of B.*/ \ - reorderb = 'n'; \ - } \ - else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ - { \ - /* Reordered B.*/ \ - reorderb = 'r'; \ - } \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ + { \ + /* No reordering of B.*/ \ + reorderb = 'n'; \ + } \ + else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ + { \ + /* Reordered B.*/ \ + reorderb = 'r'; \ + } \ \ - aocl_gemm_ ## BLAS_SFX( storage, transa, transb, m, n, k, \ - alpha, \ - a, lda, reordera, \ - b, ldb, reorderb, \ - beta, \ - c, ldc, post_op ); \ + aocl_gemm_ ## BLAS_SFX( storage, transa, transb, m, n, k, \ + alpha, \ + a, lda, reordera, \ + b, ldb, reorderb, \ + beta, \ + c, ldc, post_op ); \ \ - /*dim_t MR = 6; \ - dim_t NR = 16; \ + /*dim_t MR = 6; \ + dim_t NR = 16; \ \ - __m512i selector1; \ - __m512i all_zero = _mm512_setzero_epi32(); \ - __m512i c0; \ - __m512i c1; \ - __m512i c2; \ - __m512i c3; \ - __m512i c4; \ 
- __m512i c5; \ + __m512i selector1; \ + __m512i all_zero = _mm512_setzero_epi32(); \ + __m512i c0; \ + __m512i c1; \ + __m512i c2; \ + __m512i c3; \ + __m512i c4; \ + __m512i c5; \ \ - for ( dim_t i = 0; i < m; i += MR ) \ - { \ - if ( ( i + MR ) > m ) \ - { \ - break; \ - } \ - for ( dim_t j = 0; j < n; j += NR ) \ - { \ - if ( ( j + NR ) > n ) \ - { \ - break; \ - } \ - selector1 = _mm512_loadu_epi32( (int32_t*)post_op->bias.bias + j ); \ - c0 = _mm512_loadu_epi32( c + ( ( i + 0 ) * ldc ) + j ); \ - c1 = _mm512_loadu_epi32( c + ( ( i + 1 ) * ldc ) + j ); \ - c2 = _mm512_loadu_epi32( c + ( ( i + 2 ) * ldc ) + j ); \ - c3 = _mm512_loadu_epi32( c + ( ( i + 3 ) * ldc ) + j ); \ - c4 = _mm512_loadu_epi32( c + ( ( i + 4 ) * ldc ) + j ); \ - c5 = _mm512_loadu_epi32( c + ( ( i + 5 ) * ldc ) + j ); \ + for ( dim_t i = 0; i < m; i += MR ) \ + { \ + if ( ( i + MR ) > m ) \ + { \ + break; \ + } \ + for ( dim_t j = 0; j < n; j += NR ) \ + { \ + if ( ( j + NR ) > n ) \ + { \ + break; \ + } \ + selector1 = _mm512_loadu_epi32( (int32_t*)post_op->bias.bias + j ); \ + c0 = _mm512_loadu_epi32( c + ( ( i + 0 ) * ldc ) + j ); \ + c1 = _mm512_loadu_epi32( c + ( ( i + 1 ) * ldc ) + j ); \ + c2 = _mm512_loadu_epi32( c + ( ( i + 2 ) * ldc ) + j ); \ + c3 = _mm512_loadu_epi32( c + ( ( i + 3 ) * ldc ) + j ); \ + c4 = _mm512_loadu_epi32( c + ( ( i + 4 ) * ldc ) + j ); \ + c5 = _mm512_loadu_epi32( c + ( ( i + 5 ) * ldc ) + j ); \ \ - c0 = _mm512_add_epi32( selector1, c0 ); \ - c1 = _mm512_add_epi32( selector1, c1 ); \ - c2 = _mm512_add_epi32( selector1, c2 ); \ - c3 = _mm512_add_epi32( selector1, c3 ); \ - c4 = _mm512_add_epi32( selector1, c4 ); \ - c5 = _mm512_add_epi32( selector1, c5 ); \ + c0 = _mm512_add_epi32( selector1, c0 ); \ + c1 = _mm512_add_epi32( selector1, c1 ); \ + c2 = _mm512_add_epi32( selector1, c2 ); \ + c3 = _mm512_add_epi32( selector1, c3 ); \ + c4 = _mm512_add_epi32( selector1, c4 ); \ + c5 = _mm512_add_epi32( selector1, c5 ); \ \ - c0 = _mm512_max_epi32( all_zero, c0 
); \ - c1 = _mm512_max_epi32( all_zero, c1 ); \ - c2 = _mm512_max_epi32( all_zero, c2 ); \ - c3 = _mm512_max_epi32( all_zero, c3 ); \ - c4 = _mm512_max_epi32( all_zero, c4 ); \ - c5 = _mm512_max_epi32( all_zero, c5 ); \ + c0 = _mm512_max_epi32( all_zero, c0 ); \ + c1 = _mm512_max_epi32( all_zero, c1 ); \ + c2 = _mm512_max_epi32( all_zero, c2 ); \ + c3 = _mm512_max_epi32( all_zero, c3 ); \ + c4 = _mm512_max_epi32( all_zero, c4 ); \ + c5 = _mm512_max_epi32( all_zero, c5 ); \ \ - _mm512_storeu_epi32( c + ( ( i + 0 ) * ldc ) + j, c0 ); \ - _mm512_storeu_epi32( c + ( ( i + 1 ) * ldc ) + j, c1 ); \ - _mm512_storeu_epi32( c + ( ( i + 2 ) * ldc ) + j, c2 ); \ - _mm512_storeu_epi32( c + ( ( i + 3 ) * ldc ) + j, c3 ); \ - _mm512_storeu_epi32( c + ( ( i + 4 ) * ldc ) + j, c4 ); \ - _mm512_storeu_epi32( c + ( ( i + 5 ) * ldc ) + j, c5 ); \ - } \ - } */\ + _mm512_storeu_epi32( c + ( ( i + 0 ) * ldc ) + j, c0 ); \ + _mm512_storeu_epi32( c + ( ( i + 1 ) * ldc ) + j, c1 ); \ + _mm512_storeu_epi32( c + ( ( i + 2 ) * ldc ) + j, c2 ); \ + _mm512_storeu_epi32( c + ( ( i + 3 ) * ldc ) + j, c3 ); \ + _mm512_storeu_epi32( c + ( ( i + 4 ) * ldc ) + j, c4 ); \ + _mm512_storeu_epi32( c + ( ( i + 5 ) * ldc ) + j, c5 ); \ + } \ + } */\ } \ GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) @@ -267,13 +357,15 @@ double get_gflops double runtime ) { - return ( ( 2.0 * m * n * k ) / ( runtime * 1.0e9 ) ); + return ( ( 2.0 * m * n * k ) / ( runtime * 1.0e9 ) ); } void print_result ( const char* msg, int32_t n_repeats, + char transa, + char transb, dim_t m, dim_t n, dim_t k, @@ -283,10 +375,10 @@ void print_result double runtime ) { - double gflops = get_gflops( m, n, k, runtime ); - printf("%s m: %ld, n: %ld, k: %ld, lda: %ld, ldb: %ld, ldc: %ld," \ - " Gops: %f, n_repeats: %d\n", - msg, m, n, k, lda, ldb, ldc, gflops, n_repeats); + double gflops = get_gflops( m, n, k, runtime ); + printf("%s transa:%c, transb:%c, m: %ld, n: %ld, k: %ld, lda: %ld, ldb: %ld, ldc: %ld," \ + " Gops: 
%f, n_repeats: %d\n", + msg, transa, transb, m, n, k, lda, ldb, ldc, gflops, n_repeats); } #define GEN_MAT_MUL_BENCH_DRV_FUNC(A_type,B_type,C_type,ACCUM_type,BLAS_SFX) \ @@ -312,38 +404,38 @@ void mat_mul_bench_driver_ ## BLAS_SFX \ aocl_post_op* post_op\ ) \ { \ - double min_time_diff = DBL_MAX; \ - for ( int32_t nr = 0; nr < n_repeats; ++nr ) \ - { \ - if ( bench_mode == 'a' ) \ - { \ - int32_t size_C = ( ( stor_order == 'r') || ( stor_order == 'R' ) )? m * ldc : n * ldc; \ - GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ - } \ + double min_time_diff = DBL_MAX; \ + for ( int32_t nr = 0; nr < n_repeats; ++nr ) \ + { \ + if ( bench_mode == 'a' ) \ + { \ + int32_t size_C = ( ( stor_order == 'r') || ( stor_order == 'R' ) )? m * ldc : n * ldc; \ + GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ + } \ \ - struct timespec tstart={0,0}, tend={0,0}; \ - clock_gettime(CLOCK_MONOTONIC, &tstart); \ + struct timespec tstart={0,0}, tend={0,0}; \ + clock_gettime(CLOCK_MONOTONIC, &tstart); \ \ - GEN_FUNC_NAME(mat_mul_,BLAS_SFX) \ - ( \ - stor_order, transa, transb, op_a, op_b, m, n, k, \ - alpha, \ - a, lda, \ - b, ldb, \ - beta, \ - c, ldc, \ - post_op \ - ); \ + GEN_FUNC_NAME(mat_mul_,BLAS_SFX) \ + ( \ + stor_order, transa, transb, op_a, op_b, m, n, k, \ + alpha, \ + a, lda, \ + b, ldb, \ + beta, \ + c, ldc, \ + post_op \ + ); \ \ - clock_gettime(CLOCK_MONOTONIC, &tend); \ + clock_gettime(CLOCK_MONOTONIC, &tend); \ \ - double diff = \ - ( ( double ) tend.tv_sec + ( 1.0e-9 * tend.tv_nsec ) ) - \ - ( ( double ) tstart.tv_sec + ( 1.0e-9 * tstart.tv_nsec ) ); \ - min_time_diff = ( diff < min_time_diff ) ? diff : min_time_diff; \ - } \ + double diff = \ + ( ( double ) tend.tv_sec + ( 1.0e-9 * tend.tv_nsec ) ) - \ + ( ( double ) tstart.tv_sec + ( 1.0e-9 * tstart.tv_nsec ) ); \ + min_time_diff = ( diff < min_time_diff ) ? 
diff : min_time_diff; \ + } \ \ - print_result( XSTR(BLAS_SFX), n_repeats, m, n, k, lda, ldb, ldc, min_time_diff); \ + print_result( XSTR(BLAS_SFX), n_repeats, transa, transb, m, n, k, lda, ldb, ldc, min_time_diff); \ } \ GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) @@ -361,12 +453,12 @@ GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) int max (int a, int b) { - return ( a > b ? a : b ); + return ( a > b ? a : b ); } int min (int a, int b) { - return ( a < b ? a : b ); + return ( a < b ? a : b ); } #define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(C_type,ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \ @@ -377,14 +469,14 @@ static inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX dim_t j \ )\ { \ - ACCUM_type out_temp_accum = \ - ( ACCUM_type )min( \ - max( nearbyintf( ( SCALE_type )( temp_accum ) * \ - ( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ) + \ - *( ( C_type* )post_op->sum.zero_point + j ), \ - DSCALE_CLIP_MIN ), \ - DSCALE_CLIP_MAX ); \ - return out_temp_accum; \ + ACCUM_type out_temp_accum = \ + ( ACCUM_type )min( \ + max( nearbyintf( ( SCALE_type )( temp_accum ) * \ + ( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ) + \ + *( ( C_type* )post_op->sum.zero_point + j ), \ + DSCALE_CLIP_MIN ), \ + DSCALE_CLIP_MAX ); \ + return out_temp_accum; \ }\ GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,u8s8s16os8) @@ -400,7 +492,7 @@ static inline float mat_mul_accuracy_check_downscale_bf16bf16f32obf16 dim_t j ) { - return temp_accum; + return temp_accum; } #define GEN_MAT_MUL_ACC_CHK_ACCUM(A_type, B_type, C_type,ACCUM_type,BLAS_SFX) \ @@ -423,15 +515,15 @@ static inline ACCUM_type mat_mul_accuracy_check_accum_ ## BLAS_SFX \ dim_t k \ )\ {\ - for ( dim_t p = 0; p < k; ++p) \ - { \ - temp_accum += ( *( a + ( i * rs_a ) + ( cs_a * p ) ) * \ - *( b + ( rs_b * p ) + ( cs_b * j ) ) ); \ - } \ + for ( dim_t p = 0; p < k; ++p) \ + { \ + temp_accum += ( *( a + ( i * rs_a ) + ( cs_a * p ) ) * \ + *( b + ( 
rs_b * p ) + ( cs_b * j ) ) ); \ + } \ \ - temp_accum = ( beta * ( * (c_ref + ( rs_c_ref * i ) + ( cs_c_ref * j ) ) ) ) \ - + ( alpha * temp_accum ); \ - return temp_accum; \ + temp_accum = ( beta * ( * (c_ref + ( rs_c_ref * i ) + ( cs_c_ref * j ) ) ) ) \ + + ( alpha * temp_accum ); \ + return temp_accum; \ }\ GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) @@ -464,15 +556,16 @@ static inline float mat_mul_accuracy_check_accum_bf16bf16f32of32 dim_t k ) { - for ( dim_t p = 0; p < k; ++p) - { - float a_float = bf16_to_float( *( a + i * rs_a + p * cs_a ) ); - float b_float = bf16_to_float( *( b + p * rs_b + j * cs_b ) ); - temp_accum += ( ( a_float ) * ( b_float ) ); - } - temp_accum = ( beta * ( * (c_ref + ( rs_c_ref * i ) + ( cs_c_ref * j ) ) ) ) - + ( alpha * temp_accum ); - return temp_accum; + for ( dim_t p = 0; p < k; ++p) + { + float a_float, b_float; + bfloat16_to_float( *( a + i * rs_a + p * cs_a ) , &a_float); + bfloat16_to_float( *( b + p * rs_b + j * cs_b ) , &b_float); + temp_accum += ( ( a_float ) * ( b_float ) ); + } + temp_accum = ( beta * ( * (c_ref + ( rs_c_ref * i ) + ( cs_c_ref * j ) ) ) ) + + ( alpha * temp_accum ); + return temp_accum; } static inline float mat_mul_accuracy_check_accum_bf16bf16f32obf16 @@ -494,16 +587,18 @@ static inline float mat_mul_accuracy_check_accum_bf16bf16f32obf16 dim_t k ) { - for ( dim_t p = 0; p < k; ++p) - { - float a_float = bf16_to_float( *( a + i*rs_a + p*cs_a ) ); - float b_float = bf16_to_float( *( b + p*rs_b + j*cs_b ) ); - temp_accum += ( ( a_float ) * ( b_float ) ); - } - float c_ref_float = bf16_to_float( *( c_ref + i*rs_c_ref + j*cs_c_ref ) ); - temp_accum = ( beta * ( c_ref_float ) ) + ( alpha * temp_accum ); - - return temp_accum; + for ( dim_t p = 0; p < k; ++p) + { + float a_float, b_float; + bfloat16_to_float( *( a + i*rs_a + p*cs_a ), &a_float ); + bfloat16_to_float( *( b + p*rs_b + j*cs_b ), &b_float ); + temp_accum += ( ( a_float ) * ( b_float ) ); + } + float c_ref_float; + 
bfloat16_to_float( *( c_ref + i*rs_c_ref + j*cs_c_ref ), &c_ref_float ); + temp_accum = ( beta * ( c_ref_float ) ) + ( alpha * temp_accum ); + + return temp_accum; } #define GEN_GELU_TANH_POSTOP_INT(ACCUM_type,BLAS_SFX) \ @@ -512,11 +607,11 @@ static inline ACCUM_type GELU_TANH_post_op_ ## BLAS_SFX \ ACCUM_type temp_accum \ )\ {\ - float gelu_reference = 0.5 *(double)temp_accum * (1 + tanhf( 0.797884 * ( (double)temp_accum + \ - ( 0.044715 * ((double)temp_accum * (double)temp_accum * \ - (double)temp_accum ) ) ) ) ); \ - temp_accum = round (gelu_reference); \ - return temp_accum; \ + float gelu_reference = 0.5 *(double)temp_accum * (1 + tanhf( 0.797884 * ( (double)temp_accum + \ + ( 0.044715 * ((double)temp_accum * (double)temp_accum * \ + (double)temp_accum ) ) ) ) ); \ + temp_accum = round (gelu_reference); \ + return temp_accum; \ }\ GEN_GELU_TANH_POSTOP_INT(int16_t,u8s8s16os8) @@ -535,10 +630,10 @@ static inline float GELU_TANH_post_op_ ## BLAS_SFX \ float temp_accum \ )\ {\ - temp_accum = 0.5 *(double)temp_accum * (1 + tanhf( 0.797884 * ( (double)temp_accum + \ - ( 0.044715 * ((double)temp_accum * (double)temp_accum * \ - (double)temp_accum ) ) ) ) ); \ - return temp_accum; \ + temp_accum = 0.5 *(double)temp_accum * (1 + tanhf( 0.797884 * ( (double)temp_accum + \ + ( 0.044715 * ((double)temp_accum * (double)temp_accum * \ + (double)temp_accum ) ) ) ) ); \ + return temp_accum; \ }\ GEN_GELU_TANH_POSTOP_FLOAT(f32f32f32of32) @@ -551,9 +646,9 @@ static inline ACCUM_type GELU_ERF_post_op_ ## BLAS_SFX \ ACCUM_type temp_accum \ )\ {\ - float gelu_reference = 0.5 *(double)temp_accum * (1 + erff( (double)temp_accum * 0.707107 )); \ - temp_accum = round (gelu_reference); \ - return temp_accum; \ + float gelu_reference = 0.5 *(double)temp_accum * (1 + erff( (double)temp_accum * 0.707107 )); \ + temp_accum = round (gelu_reference); \ + return temp_accum; \ }\ GEN_GELU_ERF_POSTOP_INT(int16_t,u8s8s16os8) @@ -572,8 +667,8 @@ static inline float GELU_ERF_post_op_ ## BLAS_SFX 
\ float temp_accum \ )\ {\ - temp_accum = 0.5 *(double)temp_accum * (1 + erff( (double)temp_accum * 0.707107 )); \ - return temp_accum; \ + temp_accum = 0.5 *(double)temp_accum * (1 + erff( (double)temp_accum * 0.707107 )); \ + return temp_accum; \ }\ GEN_GELU_ERF_POSTOP_FLOAT(f32f32f32of32) @@ -587,7 +682,7 @@ void mat_mul_get_output_type_val ## ACCUM_type ## C_type \ ACCUM_type* temp_accum \ ) \ { \ - ( *out_temp_accum ) = ( C_type )( *temp_accum ); \ + ( *out_temp_accum ) = ( C_type )( *temp_accum ); \ } \ GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int32_t,int32_t) @@ -603,7 +698,7 @@ void mat_mul_get_output_type_valfloatbfloat16 float* temp_accum ) { - float_to_bf16( temp_accum, out_temp_accum ); + float_to_bf16( temp_accum, out_temp_accum ); } #define GEN_MAT_MUL_ACC_CHK_DRV_FUNC(A_type,B_type,C_type,ACCUM_type,SCALE_type,BLAS_SFX,BLAS_DOWNSCALE_SFX) \ @@ -629,160 +724,165 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \ aocl_post_op* post_op\ ) \ { \ - dim_t rs_a, cs_a; \ - if( ( transa == 'n' ) || ( transa == 'N' ) ) \ - { \ - rs_a = lda; \ - cs_a = 1; \ - } \ - else \ - { \ - rs_a = 1; \ - cs_a = lda; \ - } \ - dim_t rs_b, cs_b; \ - if( ( transb == 'n' ) || ( transb == 'N' ) ) \ - { \ - rs_b = ldb; \ - cs_b = 1; \ - } \ - else \ - { \ - rs_b = 1; \ - cs_b = ldb; \ - } \ - dim_t rs_c = ldc; \ - dim_t cs_c = 1; \ - dim_t rs_c_ref = ldc_ref; \ - dim_t cs_c_ref = 1; \ + dim_t rs_a, cs_a; \ + if( ( transa == 'n' ) || ( transa == 'N' ) ) \ + { \ + rs_a = lda; \ + cs_a = 1; \ + } \ + else \ + { \ + rs_a = 1; \ + cs_a = lda; \ + } \ + dim_t rs_b, cs_b; \ + if( ( transb == 'n' ) || ( transb == 'N' ) ) \ + { \ + rs_b = ldb; \ + cs_b = 1; \ + } \ + else \ + { \ + rs_b = 1; \ + cs_b = ldb; \ + } \ + dim_t rs_c = ldc; \ + dim_t cs_c = 1; \ + dim_t rs_c_ref = ldc_ref; \ + dim_t cs_c_ref = 1; \ \ - if ( ( stor_order == 'C' ) || ( stor_order == 'c' ) ) \ - { \ - if( transa == 'n' || transa == 'N') \ - { \ - rs_a = 1; \ - cs_a = lda; \ - } \ - else \ - { \ - rs_a = lda; \ - cs_a 
= 1; \ - } \ - if( ( transb == 'n' ) || ( transb == 'N' ) ) \ - { \ - rs_b = 1; \ - cs_b = ldb; \ - } \ - else \ - { \ - rs_b = ldb; \ - cs_b = 1; \ - } \ - rs_c = 1; \ - cs_c = ldc; \ - rs_c_ref = 1; \ - cs_c_ref = ldc_ref; \ - } \ + if ( ( stor_order == 'C' ) || ( stor_order == 'c' ) ) \ + { \ + if( transa == 'n' || transa == 'N') \ + { \ + rs_a = 1; \ + cs_a = lda; \ + } \ + else \ + { \ + rs_a = lda; \ + cs_a = 1; \ + } \ + if( ( transb == 'n' ) || ( transb == 'N' ) ) \ + { \ + rs_b = 1; \ + cs_b = ldb; \ + } \ + else \ + { \ + rs_b = ldb; \ + cs_b = 1; \ + } \ + rs_c = 1; \ + cs_c = ldc; \ + rs_c_ref = 1; \ + cs_c_ref = ldc_ref; \ + } \ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ACCUM_type temp_accum = 0; \ - C_type out_temp_accum = 0; \ + for ( dim_t i = 0; i < m; ++i ) \ + { \ + for ( dim_t j = 0; j < n; ++j ) \ + { \ + ACCUM_type temp_accum = 0; \ + C_type out_temp_accum = 0; \ \ - temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_accum_,BLAS_SFX) \ - (a,b,c_ref,temp_accum,alpha,beta,rs_a,rs_b,cs_a,cs_b,rs_c_ref,cs_c_ref,i,j,k); \ + temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_accum_,BLAS_SFX) \ + (a,b,c_ref,temp_accum,alpha,beta,rs_a,rs_b,cs_a,cs_b,rs_c_ref,cs_c_ref,i,j,k); \ \ - if ( post_op != NULL ) \ - { \ - dim_t ele_i = 0; \ - for ( dim_t op_id = 0; op_id < post_op->seq_length; ++op_id ) \ - { \ - if ( post_op->seq_vector[op_id] == BIAS ) \ - { \ - temp_accum += ( *( ( ACCUM_type* )post_op->bias.bias + j ) ); \ - } \ - else if ( post_op->seq_vector[op_id] == ELTWISE ) \ - { \ - if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ - PRELU ) /* PReLU*/ \ - { \ - temp_accum = ( temp_accum > 0 ) ? 
\ - temp_accum : \ - ( temp_accum * \ - *( ( ACCUM_type* ) ( post_op->eltwise + ele_i )->algo.alpha ) ); \ - ele_i += 1; \ - } \ - else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ - GELU_TANH ) /* TANH GeLU*/ \ - { \ - temp_accum = GEN_FUNC_NAME(GELU_TANH_post_op_,BLAS_SFX) (temp_accum);\ - ele_i += 1; \ - } \ - else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ - GELU_ERF ) /* ERF GeLU*/ \ - { \ - temp_accum = GEN_FUNC_NAME(GELU_ERF_post_op_,BLAS_SFX) (temp_accum);\ - ele_i += 1; \ - } \ - else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ - RELU ) /* ReLU*/ \ - { \ - temp_accum = ( temp_accum > 0 ) ? temp_accum : 0 ; \ - ele_i += 1; \ - } \ - else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ - CLIP ) /* CLIP*/ \ - { \ - temp_accum = \ - min \ - ( \ - max \ - ( \ - temp_accum, \ - *( ( ACCUM_type* ) \ - ( post_op->eltwise + ele_i )->algo.alpha ) \ - ), \ - *( ( ACCUM_type* ) \ - ( post_op->eltwise + ele_i )->algo.beta) \ - ); \ - ele_i += 1; \ - } \ - else \ - {} \ - } \ - else if ( post_op->seq_vector[op_id] == SCALE ) \ - { \ - temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_downscale_,BLAS_DOWNSCALE_SFX) \ - (temp_accum, post_op, j); \ - } \ - else \ - {} \ - } \ - } \ - /* Need to convert to downscaled type if required.*/ \ - mat_mul_get_output_type_val ## ACCUM_type ## C_type \ - ( \ - &out_temp_accum, &temp_accum \ - ); \ + if ( post_op != NULL ) \ + { \ + dim_t ele_i = 0; \ + for ( dim_t op_id = 0; op_id < post_op->seq_length; ++op_id ) \ + { \ + if ( post_op->seq_vector[op_id] == BIAS ) \ + { \ + temp_accum += ( *( ( ACCUM_type* )post_op->bias.bias + j ) ); \ + } \ + else if ( post_op->seq_vector[op_id] == ELTWISE ) \ + { \ + if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ + PRELU ) /* PReLU*/ \ + { \ + temp_accum = ( temp_accum > 0 ) ? 
\ + temp_accum : \ + ( temp_accum * \ + *( ( ACCUM_type* ) ( post_op->eltwise + ele_i )->algo.alpha ) ); \ + ele_i += 1; \ + } \ + else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ + GELU_TANH ) /* TANH GeLU*/ \ + { \ + temp_accum = GEN_FUNC_NAME(GELU_TANH_post_op_,BLAS_SFX) (temp_accum);\ + ele_i += 1; \ + } \ + else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ + GELU_ERF ) /* ERF GeLU*/ \ + { \ + temp_accum = GEN_FUNC_NAME(GELU_ERF_post_op_,BLAS_SFX) (temp_accum);\ + ele_i += 1; \ + } \ + else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ + RELU ) /* ReLU*/ \ + { \ + temp_accum = ( temp_accum > 0 ) ? temp_accum : 0 ; \ + ele_i += 1; \ + } \ + else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \ + CLIP ) /* CLIP*/ \ + { \ + temp_accum = \ + min \ + ( \ + max \ + ( \ + temp_accum, \ + *( ( ACCUM_type* ) \ + ( post_op->eltwise + ele_i )->algo.alpha ) \ + ), \ + *( ( ACCUM_type* ) \ + ( post_op->eltwise + ele_i )->algo.beta) \ + ); \ + ele_i += 1; \ + } \ + else \ + {} \ + } \ + else if ( post_op->seq_vector[op_id] == SCALE ) \ + { \ + temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_downscale_,BLAS_DOWNSCALE_SFX) \ + (temp_accum, post_op, j); \ + } \ + else \ + {} \ + } \ + } \ + /* Need to convert to downscaled type if required.*/ \ + mat_mul_get_output_type_val ## ACCUM_type ## C_type \ + ( \ + &out_temp_accum, &temp_accum \ + ); \ \ - if ( *( c + ( rs_c * i ) + ( cs_c * j ) ) != out_temp_accum ) \ - { \ - if ( fout ) \ - { \ - fprintf( fout, "%s Failure input m: %ld, n: %ld, k: %ld," \ - " lda: %ld, ldb: %ld, ldc: %ld\n", \ - XSTR(BLAS_SFX), m, n, k, lda, ldb, ldc ); \ - fflush( fout ); \ - } \ - printf("failure, m: %ld, n: %ld, k: %ld\n", i, j, k); \ - goto cleanup_acc; \ - } \ - } \ - } \ + if ( *( c + ( rs_c * i ) + ( cs_c * j ) ) != out_temp_accum ) \ + { \ + float comp_float, ref_float; \ + GEN_FUNC_NAME(C_type,_to_float)(*( c + ( rs_c * i ) + ( cs_c * j ) ), &comp_float); \ + 
GEN_FUNC_NAME(C_type,_to_float)(out_temp_accum, &ref_float); \ + if ( fout ) \ + { \ + fprintf( fout, "%s Failure input m: %ld, n: %ld, k: %ld," \ + " lda: %ld, ldb: %ld, ldc: %ld, computed:%f, ref:%f, diff:%f\n", \ + XSTR(BLAS_SFX), m, n, k, lda, ldb, ldc, comp_float, \ + ref_float, comp_float - ref_float); \ + fflush( fout ); \ + } \ + printf("failure, m: %ld, n: %ld, k: %ld, computed:%f, ref:%f, diff:%f\n", i, j, k, \ + comp_float, ref_float, comp_float-ref_float); \ + goto cleanup_acc; \ + } \ + } \ + } \ cleanup_acc: \ - return; \ + return; \ } \ GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int16_t,int16_t,float,u8s8s16os16,u8s8s16os8) @@ -806,237 +906,237 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \ char* post_ops_str \ ) \ { \ - aocl_post_op* post_ops = NULL; \ - post_ops = ( aocl_post_op* ) malloc( sizeof( aocl_post_op ) ); \ + aocl_post_op* post_ops = NULL; \ + post_ops = ( aocl_post_op* ) malloc( sizeof( aocl_post_op ) ); \ \ - if ( ( post_ops == NULL ) && ( global_dscale_out == 'n' ) ) \ - { \ - return NULL; \ - } \ + if ( ( post_ops == NULL ) && ( global_dscale_out == 'n' ) ) \ + { \ + return NULL; \ + } \ \ - /* Only supporting 5 post ops at max for now.*/ \ - dim_t max_post_ops_seq_length = 5; \ - post_ops->seq_vector = ( AOCL_POST_OP_TYPE* ) \ - malloc \ - ( \ - max_post_ops_seq_length * \ - sizeof( AOCL_POST_OP_TYPE ) \ - ); \ + /* Only supporting 5 post ops at max for now.*/ \ + dim_t max_post_ops_seq_length = 5; \ + post_ops->seq_vector = ( AOCL_POST_OP_TYPE* ) \ + malloc \ + ( \ + max_post_ops_seq_length * \ + sizeof( AOCL_POST_OP_TYPE ) \ + ); \ \ - if ( post_ops->seq_vector == NULL ) \ - { \ - free( post_ops ); \ - return NULL; \ - } \ + if ( post_ops->seq_vector == NULL ) \ + { \ + free( post_ops ); \ + return NULL; \ + } \ \ - /* Parse post ops list.*/ \ - dim_t cur_op_index = 0; \ - /* Ensure the buffers that use NULL check in deinit code is properly set to NULL.*/ \ - post_ops->eltwise = NULL; \ - post_ops->bias.bias = NULL; 
\ - post_ops->sum.scale_factor = NULL; \ - post_ops->sum.buff = NULL; \ - post_ops->sum.zero_point = NULL; \ - if ( post_ops_str != NULL ) \ - { \ - char* ops_tok = strtok(post_ops_str, ", " ); \ - bool is_relu = FALSE; \ - bool is_param_relu = FALSE; \ - bool is_gelu_tanh = FALSE; \ - bool is_gelu_erf = FALSE; \ - bool is_clip = FALSE; \ - dim_t activator_idx = 0; \ - dim_t clip_idx = 0; \ + /* Parse post ops list.*/ \ + dim_t cur_op_index = 0; \ + /* Ensure the buffers that use NULL check in deinit code is properly set to NULL.*/ \ + post_ops->eltwise = NULL; \ + post_ops->bias.bias = NULL; \ + post_ops->sum.scale_factor = NULL; \ + post_ops->sum.buff = NULL; \ + post_ops->sum.zero_point = NULL; \ + if ( post_ops_str != NULL ) \ + { \ + char* ops_tok = strtok(post_ops_str, ", " ); \ + bool is_relu = FALSE; \ + bool is_param_relu = FALSE; \ + bool is_gelu_tanh = FALSE; \ + bool is_gelu_erf = FALSE; \ + bool is_clip = FALSE; \ + dim_t activator_idx = 0; \ + dim_t clip_idx = 0; \ \ - /* Ensure only one activator is used as an eltwise post-op.*/ \ - bool is_activator_set = FALSE; \ - num_eltwise = 0; \ - while ( ops_tok ) \ - { \ - if ( strcmp( ops_tok, "bias" ) == 0 ) \ - { \ - post_ops->seq_vector[cur_op_index] = BIAS; \ - cur_op_index++; \ - } \ - else if ( ( strcmp( ops_tok, "relu" ) == 0 ) && \ - ( is_activator_set == FALSE ) ) \ - { \ - post_ops->seq_vector[cur_op_index] = ELTWISE; \ - is_relu = TRUE; \ - is_activator_set = TRUE; \ - num_eltwise += 1; \ - activator_idx = cur_op_index; \ - cur_op_index++; \ - } \ - else if ( ( strcmp( ops_tok, "prelu" ) == 0 ) && \ - ( is_activator_set == FALSE ) ) \ - { \ - post_ops->seq_vector[cur_op_index] = ELTWISE; \ - is_param_relu = TRUE; \ - is_activator_set = TRUE; \ - num_eltwise += 1; \ - activator_idx = cur_op_index; \ - cur_op_index++; \ - } \ - else if ( ( strcmp( ops_tok, "gelu_tanh" ) == 0 ) && \ - ( is_activator_set == FALSE ) ) \ - { \ - post_ops->seq_vector[cur_op_index] = ELTWISE; \ - is_gelu_tanh = TRUE; \ - 
is_activator_set = TRUE; \ - num_eltwise += 1; \ - activator_idx = cur_op_index; \ - cur_op_index++; \ - } \ - else if ( ( strcmp( ops_tok, "gelu_erf" ) == 0 ) && \ - ( is_activator_set == FALSE ) ) \ - { \ - post_ops->seq_vector[cur_op_index] = ELTWISE; \ - is_gelu_erf = TRUE; \ - is_activator_set = TRUE; \ - num_eltwise += 1; \ - activator_idx = cur_op_index; \ - cur_op_index++; \ - } \ - else if ( strcmp( ops_tok, "clip" ) == 0 ) \ - { \ - post_ops->seq_vector[cur_op_index] = ELTWISE; \ - is_clip = TRUE; \ - num_eltwise += 1; \ - clip_idx = cur_op_index; \ - cur_op_index++; \ - } \ - ops_tok = strtok( NULL, ", " ); \ - } \ + /* Ensure only one activator is used as an eltwise post-op.*/ \ + bool is_activator_set = FALSE; \ + num_eltwise = 0; \ + while ( ops_tok ) \ + { \ + if ( strcmp( ops_tok, "bias" ) == 0 ) \ + { \ + post_ops->seq_vector[cur_op_index] = BIAS; \ + cur_op_index++; \ + } \ + else if ( ( strcmp( ops_tok, "relu" ) == 0 ) && \ + ( is_activator_set == FALSE ) ) \ + { \ + post_ops->seq_vector[cur_op_index] = ELTWISE; \ + is_relu = TRUE; \ + is_activator_set = TRUE; \ + num_eltwise += 1; \ + activator_idx = cur_op_index; \ + cur_op_index++; \ + } \ + else if ( ( strcmp( ops_tok, "prelu" ) == 0 ) && \ + ( is_activator_set == FALSE ) ) \ + { \ + post_ops->seq_vector[cur_op_index] = ELTWISE; \ + is_param_relu = TRUE; \ + is_activator_set = TRUE; \ + num_eltwise += 1; \ + activator_idx = cur_op_index; \ + cur_op_index++; \ + } \ + else if ( ( strcmp( ops_tok, "gelu_tanh" ) == 0 ) && \ + ( is_activator_set == FALSE ) ) \ + { \ + post_ops->seq_vector[cur_op_index] = ELTWISE; \ + is_gelu_tanh = TRUE; \ + is_activator_set = TRUE; \ + num_eltwise += 1; \ + activator_idx = cur_op_index; \ + cur_op_index++; \ + } \ + else if ( ( strcmp( ops_tok, "gelu_erf" ) == 0 ) && \ + ( is_activator_set == FALSE ) ) \ + { \ + post_ops->seq_vector[cur_op_index] = ELTWISE; \ + is_gelu_erf = TRUE; \ + is_activator_set = TRUE; \ + num_eltwise += 1; \ + activator_idx = 
cur_op_index; \ + cur_op_index++; \ + } \ + else if ( strcmp( ops_tok, "clip" ) == 0 ) \ + { \ + post_ops->seq_vector[cur_op_index] = ELTWISE; \ + is_clip = TRUE; \ + num_eltwise += 1; \ + clip_idx = cur_op_index; \ + cur_op_index++; \ + } \ + ops_tok = strtok( NULL, ", " ); \ + } \ \ - /* Allocate bias buffer, return early if alloc fails.*/ \ - post_ops->bias.bias = malloc( n * sizeof( C_type ) ); \ - if ( post_ops->bias.bias == NULL ) \ - { \ - free( post_ops->seq_vector ); \ - free( post_ops ); \ - return NULL; \ - } \ - GEN_FUNC_NAME(fill_array_post_ops_,C_type)( post_ops->bias.bias, n ); \ + /* Allocate bias buffer, return early if alloc fails.*/ \ + post_ops->bias.bias = malloc( n * sizeof( C_type ) ); \ + if ( post_ops->bias.bias == NULL ) \ + { \ + free( post_ops->seq_vector ); \ + free( post_ops ); \ + return NULL; \ + } \ + GEN_FUNC_NAME(fill_array_post_ops_,C_type)( post_ops->bias.bias, n ); \ \ - post_ops->eltwise = malloc( num_eltwise * sizeof( aocl_post_op_eltwise ) ); \ - if ( post_ops->eltwise == NULL ) \ - { \ - free( post_ops->bias.bias ); \ - free( post_ops->seq_vector ); \ - free( post_ops ); \ - return NULL; \ - } \ + post_ops->eltwise = malloc( num_eltwise * sizeof( aocl_post_op_eltwise ) ); \ + if ( post_ops->eltwise == NULL ) \ + { \ + free( post_ops->bias.bias ); \ + free( post_ops->seq_vector ); \ + free( post_ops ); \ + return NULL; \ + } \ \ - if ( num_eltwise > 0 ) \ - { \ - if ( num_eltwise > 1 ) \ - { \ - if ( activator_idx < clip_idx ) \ - { \ - activator_idx = 0; \ - clip_idx = 1; \ - } \ - else \ - { \ - activator_idx = 1; \ - clip_idx = 0; \ - } \ - } \ - else \ - { \ - activator_idx = 0; \ - clip_idx = 0; \ - } \ - } \ - /* Only one of relu,prelu,gelu_tanh,gelu_erf allowed as an activator.*/ \ - if ( is_relu == TRUE ) \ - { \ - ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ - ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \ - ( 
post_ops->eltwise + activator_idx )->algo.beta = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.algo_type = RELU; \ - } \ - else if ( is_param_relu == TRUE ) \ - { \ - ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ - ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.alpha = malloc( sizeof( C_type ) ); \ - *( ( C_type* ) ( post_ops->eltwise + activator_idx )->algo.alpha ) = ( C_type )6; \ - ( post_ops->eltwise + activator_idx )->algo.algo_type = PRELU; \ - } \ - else if ( is_gelu_tanh == TRUE ) \ - { \ - ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ - ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.algo_type = GELU_TANH; \ - } \ - else if ( is_gelu_erf == TRUE ) \ - { \ - ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ - ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ - ( post_ops->eltwise + activator_idx )->algo.algo_type = GELU_ERF; \ - } \ - if ( is_clip == TRUE ) \ - { \ - ( post_ops->eltwise + clip_idx )->is_power_of_2 = FALSE; \ - ( post_ops->eltwise + clip_idx )->scale_factor = NULL; \ - ( post_ops->eltwise + clip_idx )->algo.alpha = malloc( sizeof( C_type ) ); \ - ( post_ops->eltwise + clip_idx )->algo.beta = malloc( sizeof( C_type ) ); \ - *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.alpha ) = ( C_type ) ( -64 ); \ - *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.beta ) = ( C_type ) ( 23 ); \ - ( post_ops->eltwise + clip_idx )->algo.algo_type = CLIP; \ - } \ - } \ + if ( num_eltwise > 0 ) \ + { \ + if ( num_eltwise > 1 ) \ + { \ + if ( activator_idx < 
clip_idx ) \ + { \ + activator_idx = 0; \ + clip_idx = 1; \ + } \ + else \ + { \ + activator_idx = 1; \ + clip_idx = 0; \ + } \ + } \ + else \ + { \ + activator_idx = 0; \ + clip_idx = 0; \ + } \ + } \ + /* Only one of relu,prelu,gelu_tanh,gelu_erf allowed as an activator.*/ \ + if ( is_relu == TRUE ) \ + { \ + ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ + ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.algo_type = RELU; \ + } \ + else if ( is_param_relu == TRUE ) \ + { \ + ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ + ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.alpha = malloc( sizeof( C_type ) ); \ + *( ( C_type* ) ( post_ops->eltwise + activator_idx )->algo.alpha ) = ( C_type )6; \ + ( post_ops->eltwise + activator_idx )->algo.algo_type = PRELU; \ + } \ + else if ( is_gelu_tanh == TRUE ) \ + { \ + ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ + ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.algo_type = GELU_TANH; \ + } \ + else if ( is_gelu_erf == TRUE ) \ + { \ + ( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \ + ( post_ops->eltwise + activator_idx )->scale_factor = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.beta = NULL; \ + ( post_ops->eltwise + activator_idx )->algo.algo_type = GELU_ERF; \ + } \ + if ( is_clip == TRUE ) \ + { \ + ( post_ops->eltwise + clip_idx )->is_power_of_2 = FALSE; \ + ( post_ops->eltwise + clip_idx 
)->scale_factor = NULL; \ + ( post_ops->eltwise + clip_idx )->algo.alpha = malloc( sizeof( C_type ) ); \ + ( post_ops->eltwise + clip_idx )->algo.beta = malloc( sizeof( C_type ) ); \ + *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.alpha ) = ( C_type ) ( -64 ); \ + *( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.beta ) = ( C_type ) ( 23 ); \ + ( post_ops->eltwise + clip_idx )->algo.algo_type = CLIP; \ + } \ + } \ \ - if ( global_dscale_out == 'y' ) \ - { \ - post_ops->seq_vector[cur_op_index] = SCALE; \ - cur_op_index++; \ + if ( global_dscale_out == 'y' ) \ + { \ + post_ops->seq_vector[cur_op_index] = SCALE; \ + cur_op_index++; \ \ - post_ops->sum.is_power_of_2 = FALSE; \ - if ( global_dscale_out == 'y' ) \ - { \ - /* Allocate scale buffer, return early if alloc fails.*/ \ - post_ops->sum.scale_factor = malloc( n * sizeof( DSCALE_type ) ); \ - post_ops->sum.zero_point = malloc( n * sizeof( C_DSCALE_type ) ); \ - if ( ( post_ops->sum.scale_factor == NULL ) || \ - ( post_ops->sum.zero_point == NULL ) ) \ - { \ - free ( post_ops->eltwise ); \ - free ( post_ops->bias.bias ); \ - free( post_ops->seq_vector ); \ - if ( post_ops->sum.zero_point != NULL ) \ - { \ - free( post_ops->sum.zero_point ); \ - } \ - if ( post_ops->sum.scale_factor != NULL ) \ - { \ - free( post_ops->sum.scale_factor ); \ - } \ - free( post_ops ); \ - return NULL; \ - } \ - /* Fill scale factor and zero points.*/ \ - DSCALE_type* temp_dscale_ptr = ( DSCALE_type* )post_ops->sum.scale_factor; \ - C_DSCALE_type* temp_dzero_point_ptr = ( C_DSCALE_type* )post_ops->sum.zero_point; \ - for ( dim_t i = 0; i < n; ++i ) \ - { \ - temp_dscale_ptr[i] = ( ( DSCALE_type )1 )/ ( ( DSCALE_type )1000 ); \ - temp_dzero_point_ptr[i] = (C_DSCALE_type)( i % 126 ); \ - } \ - } \ - } \ + post_ops->sum.is_power_of_2 = FALSE; \ + if ( global_dscale_out == 'y' ) \ + { \ + /* Allocate scale buffer, return early if alloc fails.*/ \ + post_ops->sum.scale_factor = malloc( n * sizeof( DSCALE_type ) ); \ + 
post_ops->sum.zero_point = malloc( n * sizeof( C_DSCALE_type ) ); \ + if ( ( post_ops->sum.scale_factor == NULL ) || \ + ( post_ops->sum.zero_point == NULL ) ) \ + { \ + free ( post_ops->eltwise ); \ + free ( post_ops->bias.bias ); \ + free( post_ops->seq_vector ); \ + if ( post_ops->sum.zero_point != NULL ) \ + { \ + free( post_ops->sum.zero_point ); \ + } \ + if ( post_ops->sum.scale_factor != NULL ) \ + { \ + free( post_ops->sum.scale_factor ); \ + } \ + free( post_ops ); \ + return NULL; \ + } \ + /* Fill scale factor and zero points.*/ \ + DSCALE_type* temp_dscale_ptr = ( DSCALE_type* )post_ops->sum.scale_factor; \ + C_DSCALE_type* temp_dzero_point_ptr = ( C_DSCALE_type* )post_ops->sum.zero_point; \ + for ( dim_t i = 0; i < n; ++i ) \ + { \ + temp_dscale_ptr[i] = ( ( DSCALE_type )1 )/ ( ( DSCALE_type )1000 ); \ + temp_dzero_point_ptr[i] = (C_DSCALE_type)( i % 126 ); \ + } \ + } \ + } \ \ - post_ops->seq_length = cur_op_index; \ + post_ops->seq_length = cur_op_index; \ \ - return post_ops; \ + return post_ops; \ } \ GEN_MAT_MUL_POST_OPS_CREATOR(int8_t,int16_t,float,u8s8s16os16) @@ -1048,47 +1148,47 @@ GEN_MAT_MUL_POST_OPS_CREATOR(int8_t,int16_t,float,s8s8s16os16) void lpgemm_destroy_post_ops_struct( aocl_post_op* post_ops ) { - if ( post_ops == NULL ) - { - return; - } - - if ( post_ops->eltwise != NULL ) - { - for ( dim_t i = 0; i < num_eltwise; ++i ) - { - if ( ( post_ops->eltwise + i )->algo.alpha != NULL ) - { - free( ( post_ops->eltwise + i )->algo.alpha ); - } - if ( ( post_ops->eltwise + i )->algo.beta != NULL ) - { - free( ( post_ops->eltwise + i )->algo.beta ); - } - } - free( post_ops->eltwise ); - } - if ( post_ops->sum.scale_factor != NULL ) - { - free( post_ops->sum.scale_factor ); - } - if ( post_ops->sum.zero_point != NULL ) - { - free( post_ops->sum.zero_point ); - } - if ( post_ops->bias.bias != NULL ) - { - free( post_ops->bias.bias ); - } - if( post_ops->seq_vector != NULL ) - { - free( post_ops->seq_vector ); - } - - free( post_ops ); + if ( 
post_ops == NULL ) + { + return; + } + + if ( post_ops->eltwise != NULL ) + { + for ( dim_t i = 0; i < num_eltwise; ++i ) + { + if ( ( post_ops->eltwise + i )->algo.alpha != NULL ) + { + free( ( post_ops->eltwise + i )->algo.alpha ); + } + if ( ( post_ops->eltwise + i )->algo.beta != NULL ) + { + free( ( post_ops->eltwise + i )->algo.beta ); + } + } + free( post_ops->eltwise ); + } + if ( post_ops->sum.scale_factor != NULL ) + { + free( post_ops->sum.scale_factor ); + } + if ( post_ops->sum.zero_point != NULL ) + { + free( post_ops->sum.zero_point ); + } + if ( post_ops->bias.bias != NULL ) + { + free( post_ops->bias.bias ); + } + if( post_ops->seq_vector != NULL ) + { + free( post_ops->seq_vector ); + } + + free( post_ops ); } -#define GEN_MAT_MUL_BENCH_MAIN_FUNC(A_type,B_type,C_type,BLAS_SFX,REORDER_SFX) \ +#define GEN_MAT_MUL_BENCH_MAIN_FUNC(A_type, B_type, C_type, Sum_type, BLAS_SFX, REORDER_SFX) \ void mat_mul_bench_main_ ## BLAS_SFX \ ( \ FILE* fin, \ @@ -1107,721 +1207,497 @@ void mat_mul_bench_main_ ## BLAS_SFX \ char* post_ops_str \ ) \ { \ - if( ( stor_order != 'r' ) && ( stor_order != 'R' ) ) \ - { \ - printf("The stor_order(1st arg in input.txt) is not valid\n"); \ - return; \ - } \ - if( ( transa != 'n' ) && ( transa != 'N' ) ) \ - { \ - printf("The transa(2nd arg in input.txt) is not valid\n"); \ - return; \ - } \ - if( ( transb != 'n' ) && ( transb != 'N' ) ) \ - { \ - printf("The transb (3rd arg in input.txt) is not valid\n"); \ - return; \ - } \ - /* Reorder and pack of A matrix is not supported */ \ - if( ( op_a != 'N' ) && ( op_a != 'n' ) ) \ - { \ - printf("The op_a ( 4th arg in input.txt) is not valid\n"); \ - return; \ - } \ - \ - if ( ( op_b != 'p' ) && ( op_b != 'P' ) && ( op_b != 'r' ) && ( op_b != 'R' ) && ( op_b != 'n' ) && ( op_b != 'N' ) ) \ - { \ - printf("The op_b ( 5th arg in input.txt) is not valid\n"); \ - return; \ - } \ - \ - int32_t n_repeats = bli_max( 30, bli_min(( 3e10 / ( ( int64_t )m * n * k )), 100 )); \ - if ( 
global_n_repeat > 0 ) \ - { \ - n_repeats = global_n_repeat; \ - } \ - \ - /* sizes are hardcoded since all datatypes other than bf16 only support - row major and no-transpose cases. In future, when we support transpose support - for all datatypes, these needs to be modified. */ \ - int32_t size_A = m * stride_a; \ - int32_t size_B = k * stride_b; \ - int32_t size_C = m * stride_c; \ - /* Get 64 byte aligned memory.*/ \ - err_t bli_errors = BLIS_SUCCESS; \ - A_type* a = ( A_type* ) bli_malloc_user( sizeof( A_type ) * size_A, &bli_errors ); \ + int32_t n_repeats = bli_max( 30, bli_min(( 3e10 / ( ( int64_t )m * n * k )), 1000 )); \ + if ( global_n_repeat > 0 ) \ + { \ + n_repeats = global_n_repeat; \ + } \ \ - B_type* b = ( B_type* ) bli_malloc_user( sizeof( B_type ) * size_B, &bli_errors ); \ + int32_t size_A = 0; \ + int32_t size_B = 0; \ + int32_t size_C = 0; \ + if( ( stor_order == 'r' ) || ( stor_order == 'R' ) ) \ + { \ + size_A = ( ( transa == 'n' ) || ( transa == 'N' ) ) ? m * stride_a : k * stride_a; \ + size_B = ( ( transb == 'n' ) || ( transb == 'N' ) ) ? k * stride_b : n * stride_b; \ + size_C = m * stride_c; \ + } \ + else \ + { \ + size_A = ( ( transa == 'n' ) || ( transa == 'N' ) ) ? k * stride_a : m * stride_a; \ + size_B = ( ( transb == 'n' ) || ( transb == 'N' ) ) ? 
n * stride_b : k * stride_b; \ + size_C = n * stride_c; \ + } \ + A_type* a = ( A_type* ) lpgemm_malloc( sizeof( A_type ) * size_A ); \ + GEN_FUNC_NAME(fill_array_,A_type)(a, size_A ); \ \ - C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ - memset( ( void* ) c, 0, sizeof( C_type ) * size_C ); \ + B_type* b = ( B_type* ) lpgemm_malloc( sizeof( B_type ) * size_B ); \ + GEN_FUNC_NAME(fill_array_,B_type)(b, size_B ); \ \ - C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ - memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ + C_type* c = ( C_type* ) lpgemm_malloc( sizeof( C_type ) * size_C ); \ + memset( ( void* ) c, 0, sizeof( C_type ) * size_C ); \ \ - GEN_FUNC_NAME(fill_array_,A_type)( a, ( size_A ) ); \ - GEN_FUNC_NAME(fill_array_,B_type)( b, ( size_B ) ); \ + C_type* c_ref = ( C_type* ) lpgemm_malloc( sizeof( C_type ) * size_C ); \ + memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ \ - if ( bench_mode == 'a' ) \ - { \ - GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ - GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( size_C ) ); \ - } \ + if ( bench_mode == 'a' ) \ + { \ + GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ + GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( size_C ) ); \ + } \ \ - C_type alpha = 0; \ - C_type beta = 0; \ - if ( bench_mode == 'p' ) \ - { \ - alpha = 1; \ - beta = 0; \ - } \ - else if ( bench_mode == 'a' ) \ - { \ - alpha = 2; \ - beta = 9; \ - n_repeats = 1; \ - } \ + Sum_type alpha = 0; \ + Sum_type beta = 0; \ + if ( bench_mode == 'p' ) \ + { \ + alpha = 2; \ + beta = 9; \ + } \ + else if ( bench_mode == 'a' ) \ + { \ + n_repeats = 1; \ + alpha = 2; \ + beta = 9; \ + } \ \ - aocl_post_op* post_op = NULL; \ - if ( ( post_ops_str != NULL ) || ( global_dscale_out == 'y' ) ) \ - { \ - post_op = GEN_FUNC_NAME(lpgemm_create_post_ops_struct_,REORDER_SFX)( m, n, post_ops_str ); \ - if ( post_op == NULL ) \ - { \ - printf(" post op struct 
allocation failure, returning.\n"); \ - return; \ - } \ - } \ + aocl_post_op* post_op = NULL; \ + if ( ( post_ops_str != NULL ) || ( global_dscale_out == 'y' ) ) \ + { \ + post_op = GEN_FUNC_NAME(lpgemm_create_post_ops_struct_,REORDER_SFX)( m, n, post_ops_str ); \ + if ( post_op == NULL ) \ + { \ + printf(" post op struct allocation failure, returning.\n"); \ + return; \ + } \ + } \ \ - if ( ( op_b == 'p' ) || ( op_b == 'P' ) || ( op_b == 'n' ) || ( op_b == 'N' ) ) \ - { \ - /* No reordering of B.*/ \ - GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ - ( \ - stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ - alpha, \ - a, stride_a, \ - b, stride_b, \ - beta, \ - c, stride_c, \ - post_op \ - ); \ - } \ - else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ - { \ - /* Reorder B.*/ \ - siz_t b_reorder_buf_siz_req = \ - GEN_FUNC_NAME(aocl_get_reorder_buf_size_,REORDER_SFX)( stor_order, transb, 'B', k, n ); \ + if ( ( op_b == 'p' ) || ( op_b == 'P' ) || ( op_b == 'n' ) || ( op_b == 'N' ) ) \ + { \ + /* No reordering of B.*/ \ + GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ + ( \ + stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ + alpha, \ + a, stride_a, \ + b, stride_b, \ + beta, \ + c, stride_c, \ + post_op \ + ); \ + } \ + else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ + { \ + /* Reorder B.*/ \ + siz_t b_reorder_buf_siz_req = \ + GEN_FUNC_NAME(aocl_get_reorder_buf_size_,REORDER_SFX)( stor_order, transb, 'B', k, n ); \ \ - B_type* b_reorder = ( B_type* ) bli_malloc_user( b_reorder_buf_siz_req, &bli_errors ); \ - GEN_FUNC_NAME(aocl_reorder_,REORDER_SFX)( stor_order, transb, 'B', b, b_reorder, k, n, stride_b ); \ + B_type* b_reorder = ( B_type* ) lpgemm_malloc( b_reorder_buf_siz_req ); \ + GEN_FUNC_NAME(aocl_reorder_,REORDER_SFX)( stor_order, transb, 'B', b, b_reorder, k, n, stride_b ); \ \ - GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ - ( \ - stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ - alpha, \ - a, stride_a, \ - 
b_reorder, stride_b, \ - beta, \ - c, stride_c, \ - post_op \ - ); \ + GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ + ( \ + stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ + alpha, \ + a, stride_a, \ + b_reorder, stride_b, \ + beta, \ + c, stride_c, \ + post_op \ + ); \ + } \ \ - bli_free_user( b_reorder ); \ - } \ + if ( bench_mode == 'a' ) \ + { \ + printf(" Running accuracy check.\n"); \ + GEN_FUNC_NAME(mat_mul_accuracy_check_driver_,BLAS_SFX) \ + ( \ + fout, stor_order, transa, transb, m, n, k, \ + alpha, \ + a, stride_a, \ + b, stride_b, \ + beta, \ + c, stride_c, \ + c_ref, stride_c, \ + post_op \ + ); \ + } \ \ - if ( bench_mode == 'a' ) \ - { \ - printf("Running accuracy check.\n"); \ - GEN_FUNC_NAME(mat_mul_accuracy_check_driver_,BLAS_SFX) \ - ( \ - fout, stor_order, transa, transb, m, n, k, \ - alpha, \ - a, stride_a, \ - b, stride_b, \ - beta, \ - c, stride_c, \ - c_ref, stride_c, \ - post_op \ - ); \ - } \ + lpgemm_destroy_post_ops_struct( post_op ); \ \ - lpgemm_destroy_post_ops_struct( post_op ); \ - \ - if ( a != NULL ) \ - { \ - bli_free_user( a ); \ - } \ - if ( b != NULL ) \ - { \ - bli_free_user( b ); \ - } \ - if ( c != NULL ) \ - { \ - bli_free_user( c ); \ - } \ - if ( c_ref != NULL ) \ - { \ - bli_free_user( c_ref ); \ - } \ + lpgemm_free( a ); \ + lpgemm_free( b ); \ + lpgemm_free( c ); \ + lpgemm_free( c_ref ); \ } \ -GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int16_t,u8s8s16os16,u8s8s16os16) -GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,u8s8s16os8,u8s8s16os16) -GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,uint8_t,u8s8s16ou8,u8s8s16os16) -GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int32_t,u8s8s32os32,u8s8s32os32) -GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,u8s8s32os8,u8s8s32os32) -GEN_MAT_MUL_BENCH_MAIN_FUNC(float,float,float,f32f32f32of32,f32f32f32of32) -GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int32_t,s8s8s32os32,s8s8s32os32) -GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int8_t,s8s8s32os8,s8s8s32os32) 
-GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int16_t,s8s8s16os16,s8s8s16os16) -GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int8_t,s8s8s16os8,s8s8s16os16) - -#define GEN_MAT_MUL_BENCH_MAIN_FUNC_BF16(C_type, BLAS_SFX) \ -void mat_mul_bench_main_ ## BLAS_SFX \ - ( \ - FILE* fin, \ - FILE* fout, \ - char stor_order, \ - char transa, \ - char transb, \ - char op_a, \ - char op_b, \ - int32_t m, \ - int32_t n, \ - int32_t k, \ - int32_t stride_a, \ - int32_t stride_b, \ - int32_t stride_c, \ - char* post_ops_str \ - ) \ -{ \ - if( ( stor_order != 'r' ) && ( stor_order != 'R' ) && ( stor_order != 'c' ) && ( stor_order != 'C' ) ) \ - { \ - printf("The stor_order(1st arg in input.txt) is not valid\n"); \ - return; \ - } \ - if( ( transa != 'n' ) && ( transa != 'N' ) && ( transa != 't' ) && (transa != 'T' ) ) \ - { \ - printf("The transa ( 2nd arg in input.txt) is not valid\n"); \ - return; \ - } \ - if( ( transb != 'n' ) && ( transb != 'N' ) && ( transb != 't' ) && (transb != 'T' ) ) \ - { \ - printf("The transb ( 3nd arg in input.txt) is not valid\n"); \ - return; \ - } \ - /* Reorder is not supported for A matrix*/ \ - if( ( op_a != 'p' ) && ( op_a != 'P' ) && ( op_a != 'n' ) && ( op_a != 'N' ) ) \ - { \ - printf("The op_a (4th arg in input.txt) is not valid\n"); \ - return; \ - } \ - if ( ( op_b != 'p' ) && ( op_b != 'P' ) && ( op_b != 'r' ) && ( op_b != 'R' ) && ( op_b != 'N' ) && ( op_b != 'n' ) ) \ - { \ - printf("The op_b ( 5th arg in input.txt) is not valid\n");\ - return; \ - } \ - \ - int32_t n_repeats = bli_max( 30, bli_min(( 3e10 / ( ( int64_t )m * n * k )), 1000 )); \ - if ( global_n_repeat > 0 ) \ - { \ - n_repeats = global_n_repeat; \ - } \ - \ - int32_t size_A = 0; \ - int32_t size_B = 0; \ - int32_t size_C = 0; \ - if( ( stor_order == 'r' ) || ( stor_order == 'R' ) ) \ - { \ - size_A = ( ( transa == 'n' ) || ( transa == 'N' ) ) ? m * stride_a : k * stride_a; \ - size_B = ( ( transb == 'n' ) || ( transb == 'N' ) ) ? 
k * stride_b : n * stride_b; \ - size_C = m * stride_c; \ - } \ - else \ - { \ - size_A = ( ( transa == 'n' ) || ( transa == 'N' ) ) ? k * stride_a : m * stride_a; \ - size_B = ( ( transb == 'n' ) || ( transb == 'N' ) ) ? n * stride_b : k * stride_b; \ - size_C = n * stride_c; \ - } \ - err_t bli_errors = BLIS_SUCCESS; \ - /* Get 64 byte aligned memory.*/ \ - bfloat16* a = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * size_A, &bli_errors ); \ - float *a_float = bli_malloc_user( size_A * sizeof( float ), &bli_errors); \ - for ( int32_t i = 0; i < size_A; ++i ) \ - { \ - a_float[i] = ( float ) ( i % 5 ); \ - } \ - \ - convert_float_arr_to_bf16( a_float, a, size_A ); \ - \ - bfloat16* b = ( bfloat16* ) bli_malloc_user( sizeof( bfloat16 ) * size_B, &bli_errors ); \ - float *b_float = bli_malloc_user( size_B * sizeof( float ), &bli_errors); \ - for ( int32_t i = 0; i < size_B; ++i ) \ - { \ - b_float[i] = ( float ) ( i % 5 );\ - } \ - convert_float_arr_to_bf16( b_float, b, size_B ); \ - \ - C_type* c = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ - memset( ( void* ) c, 0, sizeof( C_type ) * size_C ); \ - \ - C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * size_C, &bli_errors ); \ - memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ - \ - if ( bench_mode == 'a' ) \ - { \ - GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ - GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( size_C ) ); \ - } \ - \ - float alpha = 0.0f; \ - float beta = 0.0f; \ - if ( bench_mode == 'p' ) \ - { \ - alpha = 1; \ - beta = 0; \ - } \ - else if ( bench_mode == 'a' ) \ - { \ - alpha = 2; \ - beta = 9; \ - } \ - \ - aocl_post_op* post_op = NULL; \ - if ( ( post_ops_str != NULL ) || ( global_dscale_out == 'y' ) ) \ - { \ - post_op = lpgemm_create_post_ops_struct_bf16bf16f32of32( m, n, post_ops_str ); \ - if ( post_op == NULL ) \ - { \ - printf(" post op struct allocation failure, returning.\n"); \ - return; \ - } \ - } \ - \ - if ( ( op_b == 
'p' ) || ( op_b == 'P' ) || ( op_b == 'n' ) || ( op_b == 'N' ) ) \ - { \ - /* No reordering of B.*/ \ - GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ - ( \ - stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ - alpha, \ - a, stride_a, \ - b, stride_b, \ - beta, \ - c, stride_c, \ - post_op \ - ); \ - } \ - else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ - { \ - /* Reorder B.*/ \ - siz_t b_reorder_buf_siz_req = \ - aocl_get_reorder_buf_size_bf16bf16f32of32( stor_order, transb, 'B', k, n ); \ - \ - bfloat16* b_reorder = ( bfloat16* ) bli_malloc_user( b_reorder_buf_siz_req, &bli_errors ); \ - aocl_reorder_bf16bf16f32of32( stor_order, transb, 'B', b, b_reorder, k, n, stride_b ); \ - \ - GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \ - ( \ - stor_order, transa, transb, op_a, op_b, n_repeats, m, n, k, \ - alpha, \ - a, stride_a, \ - b_reorder, stride_b, \ - beta, \ - c, stride_c, \ - post_op \ - ); \ - } \ - \ - if ( bench_mode == 'a' ) \ - { \ - printf(" Running accuracy check.\n"); \ - GEN_FUNC_NAME(mat_mul_accuracy_check_driver_,BLAS_SFX) \ - ( \ - fout, stor_order, transa, transb, m, n, k, \ - alpha, \ - a, stride_a, \ - b, stride_b, \ - beta, \ - c, stride_c, \ - c_ref, stride_c, \ - post_op \ - ); \ - } \ - \ - lpgemm_destroy_post_ops_struct( post_op ); \ - \ - if ( a != NULL ) \ - { \ - bli_free_user( a ); \ - } \ - if ( b != NULL ) \ - { \ - bli_free_user( b ); \ - } \ - if ( a_float != NULL ) \ - { \ - bli_free_user( a_float ); \ - } \ - if ( b_float != NULL ) \ - { \ - bli_free_user( b_float ); \ - } \ - if ( c != NULL ) \ - { \ - bli_free_user( c ); \ - } \ - if ( c_ref != NULL ) \ - { \ - bli_free_user( c_ref ); \ - } \ -} \ - -GEN_MAT_MUL_BENCH_MAIN_FUNC_BF16(float,bf16bf16f32of32) -GEN_MAT_MUL_BENCH_MAIN_FUNC_BF16(bfloat16,bf16bf16f32obf16) - +GEN_MAT_MUL_BENCH_MAIN_FUNC(bfloat16,bfloat16,float,float,bf16bf16f32of32,bf16bf16f32of32) +GEN_MAT_MUL_BENCH_MAIN_FUNC(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16,bf16bf16f32of32) 
+GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16,u8s8s16os16) +GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8,u8s8s16os16) +GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8,u8s8s16os16) +GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32,u8s8s32os32) +GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8,u8s8s32os32) +GEN_MAT_MUL_BENCH_MAIN_FUNC(float,float,float,float,f32f32f32of32,f32f32f32of32) +GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int32_t,int32_t,s8s8s32os32,s8s8s32os32) +GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int8_t,int32_t,s8s8s32os8,s8s8s32os32) +GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int16_t,int16_t,s8s8s16os16,s8s8s16os16) +GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int8_t,int16_t,s8s8s16os8,s8s8s16os16) int main( int argc, char** argv ) { - FILE* fin = NULL; - if ( argc < 5 ) - { - printf - ( - "Usage: ./bench_lpgemm -i input.txt -m mode < -n 100 -o op1,op2 >\n" \ - "--Mode is either a or p.\n" \ - "\ta is used for accuracy testing.\n" \ - "\tp is used for performance benchmarking.\n" \ - "--n_repeats can be set optionally using -n arg.\n" \ - "--Post ops can be executed optionaly by providing a coma separated\n" \ - " list of post-ops after -o arg. Following post-ops are supported:\n" \ - " 1. bias\n" \ - " 2. 4 activators\n" \ - " a. relu\n" \ - " b. prelu\n" \ - " c. gelu_tanh\n" \ - " d. 
gelu_erf\n" \ - " 3.clip\n" \ - " Atleast one post-op needs to be specified if the -o arg is used.\n" \ - " eg: -o gelu_tanh; -o bias,relu ; -o clip,prelu,bias.\n" \ - " It is to be noted only one activator can be used at a time.\n" \ - " If more than one activator is used, only the first activator is\n" \ - " applied and the other activators are ignored.\n" \ - "--Downscaled version of an API is enabled by using -d arg followed\n" \ - " by the datatype that needs to be downscaled to" - " Downscaled api's are used to enable quantization workflows.\n" \ - " Following downscaled api's are supported:\n" \ - " 1. u8s8s32os32 -d s8 = u8s8s32os8.\n" \ - " 2. u8s8s16os16 -d s8 = u8s8s16os8.\n" \ - " 3. u8s8s16os16 -d u8 = u8s8s16ou8.\n" \ - " 4. bf16bf16f32obf32 -d bf16 = bf16bf16f32obf16.\n" \ - " 5. s8s8s32os32 -d s8 = s8s8s32os8.\n" \ - " 6. s8s8s16os16 -d s8 = s8s8s16os8.\n" \ - " Example: ./bench_lpgemm -m a -n 2 -o bias,relu -d bf16 -i input.txt\n" \ - ); - exit( 1 ); - } - - char* file_name = NULL; - char* post_ops_str = NULL; - char* post_ops_str_dest = NULL; //Strtok is used to parse, need to maintain a copy. - char* dscale_type_str = NULL; - - // Parse CLI arguments. - opterr = 0; - int opt_val; - while ( ( opt_val = getopt( argc, argv, "i:m:n:o:d:" ) ) != -1 ) - { - switch ( opt_val ) - { - case 'i': - file_name = optarg; - break; - case 'm': - bench_mode = ( ( ( *optarg ) == 'a' ) || ( ( *optarg ) == 'p' ) ) ? ( *optarg ) : 'p'; - break; - case 'n': - global_n_repeat = ( atoi( optarg ) > 0 ) ? 
atoi( optarg ) : 0; - break; - case 'o': - post_ops_str = optarg; - break; - case 'd': - global_dscale_out = 'y'; - dscale_type_str = optarg; - break; - default: - break; - } - } - - if ( post_ops_str != NULL ) - { - post_ops_str_dest = ( char* )malloc \ - ( ( strlen( post_ops_str) + 1 )* sizeof( char ) ); - strcpy( post_ops_str_dest, post_ops_str ); - } - - if ( bench_mode == 'p' ) - { - printf( "Running bench in performance benchmarking mode.\n" ); - } - else if ( bench_mode == 'a' ) - { - printf( "Running bench in accuracy/correctness testing mode.\n" ); - } - - if ( file_name == NULL ) - { - printf( " File name provided is invalid.\n" ); - exit( 1 ); - } - - fin = fopen( file_name, "r" ); - if (fin == NULL) - { - printf( "Error opening the file %s\n", argv[1] ); - exit( 1 ); - } - - FILE* fout = NULL; - - fout = fopen( "lpgemm_accuracy_test_failures.txt", "w" ); - - char op_type_char; - char op_a, op_b; - char stor_order; - char transa, transb; - int32_t m, n, k; - int32_t stride_a, stride_b, stride_c; - - const dim_t len_list_omp_cores_for_testing = 2; - const dim_t list_omp_cores_for_testing[2] = { 80, 1 }; - - dim_t core_index = 0; - bool can_run = TRUE; - while ( ( can_run == TRUE ) && ( fseek( fin, 0L, SEEK_SET ) == 0 ) ) - { - if ( bench_mode == 'p' ) - { - can_run = FALSE; - } - else if ( bench_mode == 'a' ) - { - // For accuracy testing, we test accuracy using multiple different - // number of cores. This helps uncover any bugs related to over - // subscription or varying thread factorizations. - // Set current number of cores. + FILE* fin = NULL; + if ( argc < 5 ) + { + printf + ( + "Usage: ./bench_lpgemm -i input.txt -m mode < -n 100 -o op1,op2 >\n" \ + "--Mode is either a or p.\n" \ + "\ta is used for accuracy testing.\n" \ + "\tp is used for performance benchmarking.\n" \ + "--n_repeats can be set optionally using -n arg.\n" \ + "--Post ops can be executed optionaly by providing a coma separated\n" \ + " list of post-ops after -o arg. 
Following post-ops are supported:\n" \ + " 1. bias\n" \ + " 2. 4 activators\n" \ + " a. relu\n" \ + " b. prelu\n" \ + " c. gelu_tanh\n" \ + " d. gelu_erf\n" \ + " 3.clip\n" \ + " Atleast one post-op needs to be specified if the -o arg is used.\n" \ + " eg: -o gelu_tanh; -o bias,relu ; -o clip,prelu,bias.\n" \ + " It is to be noted only one activator can be used at a time.\n" \ + " If more than one activator is used, only the first activator is\n" \ + " applied and the other activators are ignored.\n" \ + "--Downscaled version of an API is enabled by using -d arg followed\n" \ + " by the datatype that needs to be downscaled to" + " Downscaled api's are used to enable quantization workflows.\n" \ + " Following downscaled api's are supported:\n" \ + " 1. u8s8s32os32 -d s8 = u8s8s32os8.\n" \ + " 2. u8s8s16os16 -d s8 = u8s8s16os8.\n" \ + " 3. u8s8s16os16 -d u8 = u8s8s16ou8.\n" \ + " 4. bf16bf16f32obf32 -d bf16 = bf16bf16f32obf16.\n" \ + " 5. s8s8s32os32 -d s8 = s8s8s32os8.\n" \ + " 6. s8s8s16os16 -d s8 = s8s8s16os8.\n" \ + " Example: ./bench_lpgemm -m a -n 2 -o bias,relu -d bf16 -i input.txt\n" \ + ); + exit( 1 ); + } + + char* file_name = NULL; + char post_ops_str[50]; + char* post_ops_str_dest = NULL; //Strtok is used to parse, need to maintain a copy. + char dscale_type_str[10]; + + // Parse CLI arguments. + opterr = 0; + int opt_val; + while ( ( opt_val = getopt( argc, argv, "i:m:n:" ) ) != -1 ) + { + switch ( opt_val ) + { + case 'i': + file_name = optarg; + break; + case 'm': + bench_mode = ( ( ( *optarg ) == 'a' ) || ( ( *optarg ) == 'p' ) ) ? ( *optarg ) : 'p'; + break; + case 'n': + global_n_repeat = ( atoi( optarg ) > 0 ) ? 
atoi( optarg ) : 0; + break; + default: + break; + } + } + + if ( bench_mode == 'p' ) + { + printf( "Running bench in performance benchmarking mode.\n" ); + } + else if ( bench_mode == 'a' ) + { + printf( "Running bench in accuracy/correctness testing mode.\n" ); + } + + if ( file_name == NULL ) + { + printf( " File name provided is invalid.\n" ); + exit( 1 ); + } + + fin = fopen( file_name, "r" ); + if (fin == NULL) + { + printf( "Error opening the file %s\n", argv[1] ); + exit( 1 ); + } + + FILE* fout = NULL; + + fout = fopen( "lpgemm_accuracy_test_failures.txt", "w" ); + + char op_type_char; + char op_a, op_b; + char stor_order; + char transa, transb; + int32_t m, n, k; + int32_t stride_a, stride_b, stride_c; + + const dim_t len_list_omp_cores_for_testing = 2; + const dim_t list_omp_cores_for_testing[2] = { 80, 1 }; + + dim_t core_index = 0; + bool can_run = TRUE; + while ( ( can_run == TRUE ) && ( fseek( fin, 0L, SEEK_SET ) == 0 ) ) + { + if ( bench_mode == 'p' ) + { + can_run = FALSE; + } + else if ( bench_mode == 'a' ) + { + // For accuracy testing, we test accuracy using multiple different + // number of cores. This helps uncover any bugs related to over + // subscription or varying thread factorizations. + // Set current number of cores. 
#ifdef BLIS_ENABLE_OPENMP - omp_set_num_threads( list_omp_cores_for_testing[core_index] ); + omp_set_num_threads( list_omp_cores_for_testing[core_index] ); #endif - printf( "Accuracy test using %ld threads.\n", - list_omp_cores_for_testing[core_index] ); - - core_index++; - if ( core_index < len_list_omp_cores_for_testing ) - { - can_run = TRUE; - } - else - { - can_run = FALSE; - } - } - - // Input format: data_type stor_type pack/reorder m n k lda ldb ldc - while ( fscanf( fin, "%c %c %c %c %c %c %d %d %d %d %d %d\n", - &op_type_char, &stor_order, &transa, &transb, &op_a, &op_b, &m, &n, &k, - &stride_a, &stride_b, &stride_c ) == 12 ) - { - stor_order = ( ( stor_order == 'r' ) || ( stor_order == 'R' ) || - ( stor_order == 'c' ) || ( stor_order == 'C' ) ) ? - stor_order : 'r'; - - if ( ( op_type_char == 'i' ) || ( op_type_char == 'I' ) ) - { - if ( global_dscale_out == 'n' ) - { - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os32) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || - ( strcmp( dscale_type_str, "s8" ) == 0 ) ) - { - DSCALE_CLIP_MIN = -128; - DSCALE_CLIP_MAX = +127; - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - printf("Downscale type not supported.\n"); - } - } - } - else if ( ( op_type_char == 'f' ) || ( op_type_char == 'F' ) ) - { - GEN_FUNC_NAME(mat_mul_bench_main_,f32f32f32of32) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else if ((op_type_char == 's') || (op_type_char == 'S')) - { - if ( global_dscale_out == 'n' ) - { - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os16) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - 
else - { - if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || - ( strcmp( dscale_type_str, "s8" ) == 0 ) ) - { - DSCALE_CLIP_MIN = -128; - DSCALE_CLIP_MAX = +127; - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else if ( ( strcmp( dscale_type_str, "U8" ) == 0 ) || - ( strcmp( dscale_type_str, "u8" ) == 0 ) ) - { - DSCALE_CLIP_MIN = 0; - DSCALE_CLIP_MAX = +255; - GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16ou8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - printf("Downscale type not supported.\n"); - } - } - } - else if ((op_type_char == 'b') || (op_type_char == 'B')) - { - if ( global_dscale_out == 'n' ) - { - GEN_FUNC_NAME(mat_mul_bench_main_, bf16bf16f32of32) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - GEN_FUNC_NAME(mat_mul_bench_main_, bf16bf16f32obf16) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - } - else if ( ( op_type_char == 'u' ) || ( op_type_char == 'U' ) ) - { - if ( global_dscale_out == 'n' ) - { - GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os32) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || - ( strcmp( dscale_type_str, "s8" ) == 0 ) ) - { - DSCALE_CLIP_MIN = -128; - DSCALE_CLIP_MAX = +127; - GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - printf("Downscale type not supported.\n"); - } - } - } - else if ( ( op_type_char == 'v' ) || ( op_type_char == 'V' ) ) - { - if ( global_dscale_out == 
'n' ) - { - GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os16) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || - ( strcmp( dscale_type_str, "s8" ) == 0 ) ) - { - DSCALE_CLIP_MIN = -128; - DSCALE_CLIP_MAX = +127; - GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os8) - ( - fin, fout, stor_order, transa, transb, op_a, op_b, - m, n, k, stride_a, stride_b, stride_c, - post_ops_str_dest - ); - } - else - { - printf("Downscale type not supported.\n"); - } - } - } - if ( post_ops_str != NULL ) - { - strcpy( post_ops_str_dest, post_ops_str ); - } - } - } - - if ( post_ops_str_dest != NULL ) - { - free( post_ops_str_dest ); - } - if ( fin ) - { - fclose( fin ); - } - if ( fout ) - { - fclose( fout ); - } - return 0; + printf( "Accuracy test using %ld threads.\n", + list_omp_cores_for_testing[core_index] ); + + core_index++; + if ( core_index < len_list_omp_cores_for_testing ) + { + can_run = TRUE; + } + else + { + can_run = FALSE; + } + } + + // Input format: data_type stor_type pack/reorder m n k lda ldb ldc + while ( fscanf( fin, "%c %s %c %c %c %c %c %d %d %d %d %d %d %s\n", + &op_type_char, dscale_type_str, &stor_order, &transa, &transb, &op_a, &op_b, &m, &n, &k, + &stride_a, &stride_b, &stride_c, post_ops_str ) == 14 ) + { + stor_order = ( ( stor_order == 'r' ) || ( stor_order == 'R' ) || + ( stor_order == 'c' ) || ( stor_order == 'C' ) ) ? 
+ stor_order : 'r'; + + if ( strcmp( post_ops_str, "none" ) != 0 ) + { + post_ops_str_dest = ( char* )malloc \ + ( ( strlen( post_ops_str) + 1 )* sizeof( char ) ); + strcpy( post_ops_str_dest, post_ops_str ); + } + + if ( ( op_type_char == 'i' ) || ( op_type_char == 'I' ) ) + { + if ( ( strcmp( dscale_type_str, "S32" ) == 0 ) || + ( strcmp( dscale_type_str, "s32" ) == 0 ) ) + { + global_dscale_out = 'n'; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os32) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + global_dscale_out = 'y'; + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s32os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } + } + } + else if ( ( op_type_char == 'f' ) || ( op_type_char == 'F' ) ) + { + global_dscale_out = 'n'; + GEN_FUNC_NAME(mat_mul_bench_main_,f32f32f32of32) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else if ((op_type_char == 's') || (op_type_char == 'S')) + { + if ( ( strcmp( dscale_type_str, "S16" ) == 0 ) || + ( strcmp( dscale_type_str, "s16" ) == 0 ) ) + { + global_dscale_out = 'n'; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os16) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + global_dscale_out = 'y'; + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + 
post_ops_str_dest + ); + } + else if ( ( strcmp( dscale_type_str, "U8" ) == 0 ) || + ( strcmp( dscale_type_str, "u8" ) == 0 ) ) + { + global_dscale_out = 'y'; + DSCALE_CLIP_MIN = 0; + DSCALE_CLIP_MAX = +255; + GEN_FUNC_NAME(mat_mul_bench_main_,u8s8s16ou8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } + } + } + else if ((op_type_char == 'b') || (op_type_char == 'B')) + { + if ( ( strcmp( dscale_type_str, "F32" ) == 0 ) || + ( strcmp( dscale_type_str, "f32" ) == 0 ) ) + { + global_dscale_out = 'n'; + GEN_FUNC_NAME(mat_mul_bench_main_, bf16bf16f32of32) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else if ( ( strcmp( dscale_type_str, "BF16" ) == 0 ) || + ( strcmp( dscale_type_str, "bf16" ) == 0 ) ) + { + global_dscale_out = 'y'; + GEN_FUNC_NAME(mat_mul_bench_main_, bf16bf16f32obf16) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } + } + else if ( ( op_type_char == 'u' ) || ( op_type_char == 'U' ) ) + { + if ( ( strcmp( dscale_type_str, "S32" ) == 0 ) || + ( strcmp( dscale_type_str, "s32" ) == 0 ) ) + { + global_dscale_out = 'n'; + GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os32) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + global_dscale_out = 'y'; + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not 
supported.\n"); + } + } + } + else if ( ( op_type_char == 'v' ) || ( op_type_char == 'V' ) ) + { + if ( ( strcmp( dscale_type_str, "S16" ) == 0 ) || + ( strcmp( dscale_type_str, "s16" ) == 0 ) ) + { + global_dscale_out = 'n'; + GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os16) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + if ( ( strcmp( dscale_type_str, "S8" ) == 0 ) || + ( strcmp( dscale_type_str, "s8" ) == 0 ) ) + { + global_dscale_out = 'y'; + DSCALE_CLIP_MIN = -128; + DSCALE_CLIP_MAX = +127; + GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os8) + ( + fin, fout, stor_order, transa, transb, op_a, op_b, + m, n, k, stride_a, stride_b, stride_c, + post_ops_str_dest + ); + } + else + { + printf("Downscale type not supported.\n"); + } + } + } + if ( strcmp( post_ops_str, "none" ) != 0 ) + { + strcpy( post_ops_str_dest, post_ops_str ); + } + } + } + + if ( post_ops_str_dest != NULL ) + { + free( post_ops_str_dest ); + } + if ( fin ) + { + fclose( fin ); + } + if ( fout ) + { + fclose( fout ); + } + return 0; } From 75a4d2f72f50a39e86deab37d901213adbbf897e Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Thu, 9 Nov 2023 15:49:45 +0530 Subject: [PATCH 196/226] CMake: Adding new portable CMake system. - A completely new system, made to be closer to Make system. 
AMD-Internal: [CPUPL-2748] Change-Id: I83232786406cdc4f0a0950fb6ac8f551e5968529 --- CMakeLists.txt | 1693 ++++++++++++-------- addon/CMakeLists.txt | 206 +++ aocl_dtl/CMakeLists.txt | 67 +- blastest/CMakeLists.txt | 134 +- blastest/f2c/CMakeLists.txt | 59 - blastest/f2c/open.c | 1 + blastest/src/CMakeLists.txt | 37 - build/bli_win_config.h.in | 58 - build/cmake/bli_addon.h.in | 17 + build/cmake/bli_config.h.in | 183 +++ build/cmake/check-blastest.py | 31 + build/cmake/check-blistest.py | 22 + build/cmake/config_print.py | 305 ++++ build/cmake/read_registry.py | 409 +++++ build/cmake/subdir_helper_functions.cmake | 122 ++ config/CMakeLists.txt | 213 ++- config/amdzen/make_defs.cmake | 24 + config/generic/CMakeLists.txt | 5 - config/generic/make_defs.cmake | 40 + config/haswell/CMakeLists.txt | 21 - config/zen/CMakeLists.txt | 5 - config/zen/amd_config.cmake | 49 + config/zen/make_defs.cmake | 39 + config/zen2/CMakeLists.txt | 6 - config/zen2/make_defs.cmake | 76 + config/zen3/CMakeLists.txt | 7 - config/zen3/make_defs.cmake | 90 ++ config/zen4/CMakeLists.txt | 7 - config/zen4/make_defs.cmake | 112 ++ docs/CMakeBuildSystem.md | 217 +++ frame/0/CMakeLists.txt | 11 - frame/0/copysc/CMakeLists.txt | 6 - frame/1/CMakeLists.txt | 14 - frame/1d/CMakeLists.txt | 13 - frame/1f/CMakeLists.txt | 13 - frame/1m/CMakeLists.txt | 21 - frame/1m/packm/CMakeLists.txt | 27 - frame/1m/unpackm/CMakeLists.txt | 11 - frame/2/CMakeLists.txt | 20 - frame/2/gemv/CMakeLists.txt | 27 - frame/2/ger/CMakeLists.txt | 9 - frame/2/hemv/CMakeLists.txt | 31 - frame/2/her/CMakeLists.txt | 25 - frame/2/her2/CMakeLists.txt | 29 - frame/2/symv/CMakeLists.txt | 7 - frame/2/syr/CMakeLists.txt | 6 - frame/2/syr2/CMakeLists.txt | 7 - frame/2/trmv/CMakeLists.txt | 10 - frame/2/trsv/CMakeLists.txt | 27 - frame/3/CMakeLists.txt | 53 - frame/3/gemm/CMakeLists.txt | 35 - frame/3/gemm/ind/CMakeLists.txt | 6 - frame/3/gemmt/CMakeLists.txt | 25 - frame/3/hemm/CMakeLists.txt | 7 - frame/3/her2k/CMakeLists.txt | 7 - 
frame/3/herk/CMakeLists.txt | 10 - frame/3/symm/CMakeLists.txt | 7 - frame/3/syr2k/CMakeLists.txt | 7 - frame/3/syrk/CMakeLists.txt | 7 - frame/3/trmm/CMakeLists.txt | 27 - frame/3/trmm3/CMakeLists.txt | 7 - frame/3/trsm/CMakeLists.txt | 18 - frame/CMakeLists.txt | 108 +- frame/base/cast/CMakeLists.txt | 10 - frame/base/check/CMakeLists.txt | 8 - frame/base/noopt/CMakeLists.txt | 9 - frame/base/proj/CMakeLists.txt | 9 - frame/compat/CMakeLists.txt | 76 - frame/compat/attic/CMakeLists.txt | 41 - frame/compat/blis/CMakeLists.txt | 5 - frame/compat/blis/thread/CMakeLists.txt | 9 - frame/compat/cblas/CMakeLists.txt | 9 - frame/compat/cblas/f77_sub/CMakeLists.txt | 10 - frame/compat/cblas/src/CMakeLists.txt | 169 -- frame/compat/check/CMakeLists.txt | 27 - frame/compat/f2c/CMakeLists.txt | 31 - frame/compat/f2c/util/CMakeLists.txt | 21 - frame/include/bli_config_macro_defs.h | 2 +- frame/ind/CMakeLists.txt | 15 - frame/ind/cntx/CMakeLists.txt | 7 - frame/ind/oapi/CMakeLists.txt | 8 - frame/ind/tapi/CMakeLists.txt | 8 - frame/ind/ukernels/CMakeLists.txt | 6 - frame/thread/CMakeLists.txt | 23 - frame/util/CMakeLists.txt | 17 - kernels/CMakeLists.txt | 85 +- kernels/haswell/3/CMakeLists.txt | 16 - kernels/haswell/3/sup/CMakeLists.txt | 19 - kernels/haswell/3/sup/d6x8/CMakeLists.txt | 23 - kernels/haswell/3/sup/s6x16/CMakeLists.txt | 20 - kernels/haswell/CMakeLists.txt | 5 - kernels/skx/3/CMakeLists.txt | 11 - kernels/skx/CMakeLists.txt | 4 - kernels/zen/1/CMakeLists.txt | 24 - kernels/zen/1f/CMakeLists.txt | 16 - kernels/zen/2/CMakeLists.txt | 25 - kernels/zen/3/CMakeLists.txt | 18 - kernels/zen/3/sup/CMakeLists.txt | 24 - kernels/zen/CMakeLists.txt | 11 - kernels/zen/util/CMakeLists.txt | 6 - kernels/zen4/1/CMakeLists.txt | 14 - kernels/zen4/1m/CMakeLists.txt | 16 - kernels/zen4/3/sup/CMakeLists.txt | 21 - kernels/zen4/3/sup/d24x8/CMakeLists.txt | 18 - kernels/zen4/CMakeLists.txt | 7 - kernels/zen4/aocl_smart/CMakeLists.txt | 6 - ref_kernels/1/CMakeLists.txt | 20 - 
ref_kernels/1f/CMakeLists.txt | 11 - ref_kernels/1m/CMakeLists.txt | 12 - ref_kernels/3/CMakeLists.txt | 11 - ref_kernels/3/bb/CMakeLists.txt | 9 - ref_kernels/CMakeLists.txt | 21 - ref_kernels/ind/CMakeLists.txt | 17 - testsuite/CMakeLists.txt | 106 +- testsuite/src/CMakeLists.txt | 60 - vendor/testcpp/CMakeLists.txt | 194 +-- 116 files changed, 3671 insertions(+), 2592 deletions(-) create mode 100644 addon/CMakeLists.txt delete mode 100644 blastest/f2c/CMakeLists.txt delete mode 100644 blastest/src/CMakeLists.txt delete mode 100644 build/bli_win_config.h.in create mode 100644 build/cmake/bli_addon.h.in create mode 100644 build/cmake/bli_config.h.in create mode 100644 build/cmake/check-blastest.py create mode 100644 build/cmake/check-blistest.py create mode 100644 build/cmake/config_print.py create mode 100644 build/cmake/read_registry.py create mode 100644 build/cmake/subdir_helper_functions.cmake create mode 100644 config/amdzen/make_defs.cmake delete mode 100644 config/generic/CMakeLists.txt create mode 100644 config/generic/make_defs.cmake delete mode 100644 config/haswell/CMakeLists.txt delete mode 100644 config/zen/CMakeLists.txt create mode 100644 config/zen/amd_config.cmake create mode 100644 config/zen/make_defs.cmake delete mode 100644 config/zen2/CMakeLists.txt create mode 100644 config/zen2/make_defs.cmake delete mode 100644 config/zen3/CMakeLists.txt create mode 100644 config/zen3/make_defs.cmake delete mode 100644 config/zen4/CMakeLists.txt create mode 100644 config/zen4/make_defs.cmake create mode 100644 docs/CMakeBuildSystem.md delete mode 100644 frame/0/CMakeLists.txt delete mode 100644 frame/0/copysc/CMakeLists.txt delete mode 100644 frame/1/CMakeLists.txt delete mode 100644 frame/1d/CMakeLists.txt delete mode 100644 frame/1f/CMakeLists.txt delete mode 100644 frame/1m/CMakeLists.txt delete mode 100644 frame/1m/packm/CMakeLists.txt delete mode 100644 frame/1m/unpackm/CMakeLists.txt delete mode 100644 frame/2/CMakeLists.txt delete mode 100644 
frame/2/gemv/CMakeLists.txt delete mode 100644 frame/2/ger/CMakeLists.txt delete mode 100644 frame/2/hemv/CMakeLists.txt delete mode 100644 frame/2/her/CMakeLists.txt delete mode 100644 frame/2/her2/CMakeLists.txt delete mode 100644 frame/2/symv/CMakeLists.txt delete mode 100644 frame/2/syr/CMakeLists.txt delete mode 100644 frame/2/syr2/CMakeLists.txt delete mode 100644 frame/2/trmv/CMakeLists.txt delete mode 100644 frame/2/trsv/CMakeLists.txt delete mode 100644 frame/3/CMakeLists.txt delete mode 100644 frame/3/gemm/CMakeLists.txt delete mode 100644 frame/3/gemm/ind/CMakeLists.txt delete mode 100644 frame/3/gemmt/CMakeLists.txt delete mode 100644 frame/3/hemm/CMakeLists.txt delete mode 100644 frame/3/her2k/CMakeLists.txt delete mode 100644 frame/3/herk/CMakeLists.txt delete mode 100644 frame/3/symm/CMakeLists.txt delete mode 100644 frame/3/syr2k/CMakeLists.txt delete mode 100644 frame/3/syrk/CMakeLists.txt delete mode 100644 frame/3/trmm/CMakeLists.txt delete mode 100644 frame/3/trmm3/CMakeLists.txt delete mode 100644 frame/3/trsm/CMakeLists.txt delete mode 100644 frame/base/cast/CMakeLists.txt delete mode 100644 frame/base/check/CMakeLists.txt delete mode 100644 frame/base/noopt/CMakeLists.txt delete mode 100644 frame/base/proj/CMakeLists.txt delete mode 100644 frame/compat/CMakeLists.txt delete mode 100644 frame/compat/attic/CMakeLists.txt delete mode 100644 frame/compat/blis/CMakeLists.txt delete mode 100644 frame/compat/blis/thread/CMakeLists.txt delete mode 100644 frame/compat/cblas/CMakeLists.txt delete mode 100644 frame/compat/cblas/f77_sub/CMakeLists.txt delete mode 100644 frame/compat/cblas/src/CMakeLists.txt delete mode 100644 frame/compat/check/CMakeLists.txt delete mode 100644 frame/compat/f2c/CMakeLists.txt delete mode 100644 frame/compat/f2c/util/CMakeLists.txt delete mode 100644 frame/ind/CMakeLists.txt delete mode 100644 frame/ind/cntx/CMakeLists.txt delete mode 100644 frame/ind/oapi/CMakeLists.txt delete mode 100644 frame/ind/tapi/CMakeLists.txt 
delete mode 100644 frame/ind/ukernels/CMakeLists.txt delete mode 100644 frame/thread/CMakeLists.txt delete mode 100644 frame/util/CMakeLists.txt delete mode 100644 kernels/haswell/3/CMakeLists.txt delete mode 100644 kernels/haswell/3/sup/CMakeLists.txt delete mode 100644 kernels/haswell/3/sup/d6x8/CMakeLists.txt delete mode 100644 kernels/haswell/3/sup/s6x16/CMakeLists.txt delete mode 100644 kernels/haswell/CMakeLists.txt delete mode 100644 kernels/skx/3/CMakeLists.txt delete mode 100644 kernels/skx/CMakeLists.txt delete mode 100644 kernels/zen/1/CMakeLists.txt delete mode 100644 kernels/zen/1f/CMakeLists.txt delete mode 100644 kernels/zen/2/CMakeLists.txt delete mode 100644 kernels/zen/3/CMakeLists.txt delete mode 100644 kernels/zen/3/sup/CMakeLists.txt delete mode 100644 kernels/zen/CMakeLists.txt delete mode 100644 kernels/zen/util/CMakeLists.txt delete mode 100644 kernels/zen4/1/CMakeLists.txt delete mode 100644 kernels/zen4/1m/CMakeLists.txt delete mode 100644 kernels/zen4/3/sup/CMakeLists.txt delete mode 100644 kernels/zen4/3/sup/d24x8/CMakeLists.txt delete mode 100644 kernels/zen4/CMakeLists.txt delete mode 100644 kernels/zen4/aocl_smart/CMakeLists.txt delete mode 100644 ref_kernels/1/CMakeLists.txt delete mode 100644 ref_kernels/1f/CMakeLists.txt delete mode 100644 ref_kernels/1m/CMakeLists.txt delete mode 100644 ref_kernels/3/CMakeLists.txt delete mode 100644 ref_kernels/3/bb/CMakeLists.txt delete mode 100644 ref_kernels/CMakeLists.txt delete mode 100644 ref_kernels/ind/CMakeLists.txt delete mode 100644 testsuite/src/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index d16e82207a..55213846df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,742 +1,1085 @@ ##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. 
All rights reserved.## cmake_minimum_required(VERSION 3.15.0) +if(WIN32) + project(AOCL-LibBlis LANGUAGES C CXX) +else() + project(AOCL-LibBlis LANGUAGES C CXX Fortran) +endif() -project(AOCL-LibBlis-Win C CXX) - -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin") -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin") -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin") - - -SET(AOCL_BLIS_FAMILY "zen" CACHE STRING "AOCL BLIS family name") -SET(OpenMP_libomp_LIBRARY "C:/Program Files/LLVM/lib/libomp.lib" CACHE STRING "openmp library -path") -set(TARGET_ARCH ${AOCL_BLIS_FAMILY}) -set(AOCL_BLIS_ZEN TRUE) -set (PYTHON_EXE "python") +# Set the C standard to C99. +set(CMAKE_C_STANDARD 99) +set(CMAKE_C_STANDARD_REQUIRED TRUE) +# Set the C++ standard to C++11. +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) -if ("${AOCL_BLIS_FAMILY}" STREQUAL "") - message(FATAL_ERROR "Machine configuration missing! Select one of zen, zen2, zen3, zen4 or amdzen") -endif () +# Enable IDE folders for targets. +set_property(GLOBAL PROPERTY USE_FOLDERS ON) -if (${AOCL_BLIS_FAMILY} STREQUAL "auto") - set(AUTO_CONFIG_PY "${CMAKE_SOURCE_DIR}/build/auto_config.py") - # Run python script to find the architecture family name - execute_process( - COMMAND ${PYTHON_EXE} ${AUTO_CONFIG_PY} - RESULT_VARIABLE CMD_RESULT - OUTPUT_VARIABLE CMD_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE) - message( STATUS "Auto configuring the family :" ${CMD_OUTPUT}) - set(AOCL_BLIS_FAMILY ${CMD_OUTPUT}) +# Find a python interpreter. +find_package(Python COMPONENTS Interpreter REQUIRED) +if(NOT Python_FOUND) + message(SEND_ERROR "Could not find working python interperter! Cannot continue.") +endif() +# Functionality that prints configuration usage. 
+option(PRINT_CONFIGURE_HELP "Print CMake Configuration Usage" OFF) +if(PRINT_CONFIGURE_HELP) + execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/build/cmake/config_print.py) + return() endif () -if(${AOCL_BLIS_FAMILY} STREQUAL "zen") - add_definitions(-DBLIS_FAMILY_ZEN) - add_definitions(-DBLIS_CONFIG_ZEN) - add_definitions(-DBLIS_KERNELS_ZEN) - add_definitions(-DBLIS_KERNELS_HASWELL) -elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen2") - add_definitions(-DBLIS_FAMILY_ZEN2) - add_definitions(-DBLIS_CONFIG_ZEN2) - add_definitions(-DBLIS_KERNELS_ZEN2) - add_definitions(-DBLIS_KERNELS_ZEN) - add_definitions(-DBLIS_KERNELS_HASWELL) -elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen3") - add_definitions(-DBLIS_FAMILY_ZEN3) - add_definitions(-DBLIS_CONFIG_ZEN3) - add_definitions(-DBLIS_KERNELS_ZEN3) - add_definitions(-DBLIS_KERNELS_ZEN2) - add_definitions(-DBLIS_KERNELS_ZEN) - add_definitions(-DBLIS_KERNELS_HASWELL) -elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen4") - add_definitions(-DBLIS_FAMILY_ZEN4) - add_definitions(-DBLIS_CONFIG_ZEN4) - add_definitions(-DBLIS_KERNELS_SKX) - add_definitions(-DBLIS_KERNELS_ZEN4) - add_definitions(-DBLIS_KERNELS_ZEN3) - add_definitions(-DBLIS_KERNELS_ZEN2) - add_definitions(-DBLIS_KERNELS_ZEN) - add_definitions(-DBLIS_KERNELS_HASWELL) -elseif (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") - set(AOCL_BLIS_ZEN FALSE) - add_definitions(-DBLIS_FAMILY_AMDZEN) - add_definitions(-DBLIS_CONFIG_ZEN4) - add_definitions(-DBLIS_CONFIG_ZEN3) - add_definitions(-DBLIS_CONFIG_ZEN2) - add_definitions(-DBLIS_CONFIG_ZEN) - add_definitions(-DBLIS_CONFIG_GENERIC) - add_definitions(-DBLIS_KERNELS_SKX) - add_definitions(-DBLIS_KERNELS_ZEN4) - add_definitions(-DBLIS_KERNELS_ZEN3) - add_definitions(-DBLIS_KERNELS_ZEN2) - add_definitions(-DBLIS_KERNELS_HASWELL) - add_definitions(-DBLIS_KERNELS_ZEN) - add_definitions(-DBLIS_KERNELS_GENERIC) -else () - message(FATAL_ERROR "Wrong machine configuration. 
Select one of zen, zen2, zen3, zen4 or amdzen") -endif () +if(WIN32) + set(BLIS_CONFIG_FAMILY "auto" CACHE STRING "Set the configuration family for which the BLIS library will be built.") +else() + set(BLIS_CONFIG_FAMILY "" CACHE STRING "Set the configuration family for which the BLIS library will be built.") +endif() +set_property(CACHE BLIS_CONFIG_FAMILY PROPERTY STRINGS "auto" "generic" "zen" "zen2" "zen3" "zen4" "amdzen") +# Throw an error if CMake was configured with a configuration which is not enabled yet. +if(NOT ((BLIS_CONFIG_FAMILY STREQUAL auto) OR + (BLIS_CONFIG_FAMILY STREQUAL generic) OR + (BLIS_CONFIG_FAMILY STREQUAL zen) OR + (BLIS_CONFIG_FAMILY STREQUAL zen2) OR + (BLIS_CONFIG_FAMILY STREQUAL zen3) OR + (BLIS_CONFIG_FAMILY STREQUAL zen4) OR + (BLIS_CONFIG_FAMILY STREQUAL amdzen))) + message(FATAL_ERROR "Configuration for ${BLIS_CONFIG_FAMILY} is not supported. \ + Please re-run cmake and specify one of the following configurations for BLIS_CONFIG_FAMILY: \ + auto, zen, zen2, zen3, zen4, amdzen, generic.") +endif() -set(TARGET_ARCH ${AOCL_BLIS_FAMILY}) -message("AOCL_BLIS_FAMILY selected:${AOCL_BLIS_FAMILY}") +# automatic hardware detection +if(BLIS_CONFIG_FAMILY STREQUAL "auto") + message(STATUS "automatic configuration requested") + set(auto_detect_source_files + "${CMAKE_SOURCE_DIR}/build/detect/config/config_detect.c" + "${CMAKE_SOURCE_DIR}/frame/base/bli_arch.c" + "${CMAKE_SOURCE_DIR}/frame/base/bli_cpuid.c" + "${CMAKE_SOURCE_DIR}/frame/base/bli_env.c" + ) + set(frame_include " ${CMAKE_SOURCE_DIR}/frame/include") + set(base_include " ${CMAKE_SOURCE_DIR}/frame/base") + set(thread_include " ${CMAKE_SOURCE_DIR}/frame/thread") + # Try building an executable from one or more source files. + # Build success returns TRUE and build failure returns FALSE in COMPILERESULT. + # If the build succeeds, this runs the executable and stores the exit code in RUNRESULT. 
+ # If the executable was built, but failed to run, then RUNRESULT will be set to FAILED_TO_RUN + # RUN_OUTPUT_VARIABLE Report the output from running the executable in a given variable + try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${auto_detect_source_files} + COMPILE_DEFINITIONS -I${frame_include} -I${base_include} -I${thread_include} + -DBLIS_CONFIGURETIME_CPUID -DBLIS_CONFIG_SKX -DBLIS_CONFIG_KNL + -DBLIS_CONFIG_HASWELL -DBLIS_CONFIG_SANDYBRIDGE -DBLIS_CONFIG_PENRYN + -DBLIS_CONFIG_ZEN4 -DBLIS_CONFIG_ZEN3 -DBLIS_CONFIG_ZEN2 -DBLIS_CONFIG_ZEN + -DBLIS_CONFIG_EXCAVATOR -DBLIS_CONFIG_STEAMROLLER -DBLIS_CONFIG_PILEDRIVER + -DBLIS_CONFIG_BULLDOZER -DBLIS_CONFIG_THUNDERX2 -DBLIS_CONFIG_CORTEXA57 + -DBLIS_CONFIG_CORTEXA15 -DBLIS_CONFIG_CORTEXA9 + -D__blis_arch_type_name="BLIS_ARCH_TYPE" -D__blis_model_type_name="BLIS_MODEL_TYPE" + RUN_OUTPUT_VARIABLE HARDWARE_ARCH + ) + string(STRIP "${HARDWARE_ARCH}" HARDWARE_ARCH) + message(STATUS "automatic hardware detection: " ${HARDWARE_ARCH}) + if( NOT(${HARDWARE_ARCH} STREQUAL zen OR + ${HARDWARE_ARCH} STREQUAL zen2 OR + ${HARDWARE_ARCH} STREQUAL zen3 OR + ${HARDWARE_ARCH} STREQUAL zen4) ) + set(BLIS_CONFIG_FAMILY "generic") + message(WARNING "Only AMD zen architectures are supported. \ + Detected ${HARDWARE_ARCH} hardware. Defaulting to generic configuration.") + else() + set(BLIS_CONFIG_FAMILY ${HARDWARE_ARCH}) + endif() + message(STATUS "automatic configuration registered: " ${BLIS_CONFIG_FAMILY}) +endif() -option(BUILD_SHARED_LIBS "Build shared library" ON) -option(ENABLE_VERBOSE "Enable VERBOSE mode for build" OFF) -option(ENABLE_MULTITHREADING "Enable Multi threading" OFF) -option(ENABLE_OPENMP "Enable Openmp mode" OFF) -option(ENABLE_JRIR_SLAB "Request slab thread in jr and ir loops" ON) -option(ENABLE_JRIR_RR "Request round robin thread in jr and ir loops" OFF) +# Read the registered configuration names and lists into associative arrays. 
+execute_process( + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/read_registry.py "${BLIS_CONFIG_FAMILY}" "${CMAKE_SOURCE_DIR}" + RESULT_VARIABLE CMD_RESULT + OUTPUT_VARIABLE CONFIGURATION_STRING + OUTPUT_STRIP_TRAILING_WHITESPACE ) +# Returns the list of elements specified by indices from the list. +message(STATUS "configuration '${BLIS_CONFIG_FAMILY}' is registered.") +list(GET CONFIGURATION_STRING 0 CONFIG_LIST) +list(GET CONFIGURATION_STRING 1 KERNEL_LIST) +list(GET CONFIGURATION_STRING 2 KCONFIG_MAP) +# Removing leading and trailing spaces in the string. +string(STRIP "${CONFIG_LIST}" CONFIG_LIST) +string(STRIP "${KERNEL_LIST}" KERNEL_LIST) +string(STRIP "${KCONFIG_MAP}" KCONFIG_MAP) +# Convert from string to list(list is a ";"-separated string) +message(STATUS "${BLIS_CONFIG_FAMILY} is defined as having the following sub-configurations:") +message(" ${CONFIG_LIST} ") +string(REPLACE " " ";" CONFIG_LIST ${CONFIG_LIST}) +message(STATUS "which collectively require the following kernels:") +message(" ${KERNEL_LIST} ") +string(REPLACE " " ";" KERNEL_LIST ${KERNEL_LIST}) +message(STATUS "that has kernel:config pairs:") +message(" ${KCONFIG_MAP} ") +string(REPLACE " " ";" KCONFIG_MAP ${KCONFIG_MAP}) +# Create a #define for the configuration family (config_name). +string(TOUPPER ${BLIS_CONFIG_FAMILY} UCONF) +set(CONFIG_NAME_DEFINE "#define BLIS_FAMILY_${UCONF}\n") +#create a AOCL specific #define +#This macro is enabled only for zen family configurations. +#This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. +if(BLIS_CONFIG_FAMILY MATCHES "zen|amd64") + set(ENABLE_AOCL_ZEN ON) + set(ENABLE_AOCL_ZEN_01 1) +else() + set(ENABLE_AOCL_ZEN OFF) + set(ENABLE_AOCL_ZEN_01 0) +endif() +# Create a list of #defines, one for each configuration in config_list. 
+set(CONFIG_LIST_DEFINES "") +foreach(CONF ${CONFIG_LIST}) + string(TOUPPER ${CONF} UCONF) + set(CONFIG_LIST_DEFINES "${CONFIG_LIST_DEFINES}#define BLIS_CONFIG_${UCONF}\n") +endforeach() +# Create a list of #defines, one for each kernel set in kernel_list. +set(KERNEL_LIST_DEFINES "") +foreach(KERN ${KERNEL_LIST}) + string(TOUPPER ${KERN} UCONF) + set(KERNEL_LIST_DEFINES "${KERNEL_LIST_DEFINES}#define BLIS_KERNELS_${UCONF}\n") +endforeach() + +#------------------------------------ +# Option Setting +#------------------------------------ +# Options that are specific to Windows. +if(WIN32) + option(ENABLE_NO_UNDERSCORE_API "Export APIs without underscore." OFF) + option(ENABLE_UPPERCASE_API "Export APIs with uppercase." OFF) + # Setting path to OpenMP runtime. + set(OpenMP_libomp_LIBRARY "C:/Program Files/Microsoft Visual Studio/2022/Professional/VC/Tools/Llvm/x64/lib/libomp.lib" CACHE STRING "openmp library path") +endif() +set(ENABLE_DEBUG "off" CACHE STRING "Enable debugging symbols in the library.") +set_property(CACHE ENABLE_DEBUG PROPERTY STRINGS "off" "noopt" "opt") +if( NOT ((ENABLE_DEBUG STREQUAL "off") OR (ENABLE_DEBUG STREQUAL "noopt") OR (ENABLE_DEBUG STREQUAL "opt")) ) + message(FATAL_ERROR "ENABLE_DEBUG option '${ENABLE_DEBUG}' is not supported. Please use one of the following options \ + during CMake invokation: off, noopt, opt") +endif() +# Check if user provided CMAKE_BUILD_TYPE. If that's the case, map it to the internal ENABLE_DEBUG type +# and clean cache from CMAKE_BUILD_TYPE. We do this because CMake will add some flags depending on the +# the build type and on Linux we want to have more control over what flags are being used. 
+if(NOT WIN32) + if(CMAKE_BUILD_TYPE) + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(ENABLE_DEBUG "noopt") + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + set(ENABLE_DEBUG "off") + elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + set(ENABLE_DEBUG "opt") + else() + message(FATAL_ERROR "Configured CMake with incompatible CMAKE_BUILD_TYPE. Only Debug, RelWithDebInfo and Release are supported. \ + This is due to matching this flag to BLIS internal options corresponding to ENABLE_DEBUG: off, noopt, opt.") + endif() + message(WARNING "When CMAKE_BUILD_TYPE is used, BLIS-specific variable ENABLE_DEBUG gets overwritten accordingly.") + set(CMAKE_BUILD_TYPE "") + endif() +endif() +# Build shared libraries by default +option(BUILD_SHARED_LIBS "Build shared libraries (.dll/.so) instead of static ones (.lib/.a)" ON) +option(ENABLE_SYSTEM "Check if we are building with or without operating system support" ON) +set(ENABLE_THREADING "no" CACHE STRING "the threading flag") +if(WIN32) + set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "no") + if( NOT ((ENABLE_THREADING STREQUAL "openmp") OR (ENABLE_THREADING STREQUAL "no")) ) + message(FATAL_ERROR "ENABLE_THREADING option '${ENABLE_THREADING}' is not supported. Please use one of the following options \ + during CMake invokation: openmp, no") + endif() +else() + set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "pthreads" "no") + if( NOT ((ENABLE_THREADING STREQUAL "openmp") OR (ENABLE_THREADING STREQUAL "pthreads") OR (ENABLE_THREADING STREQUAL "no")) ) + message(FATAL_ERROR "ENABLE_THREADING option '${ENABLE_THREADING}' is not supported. 
Please use one of the following options \ + during CMake invokation: openmp, pthreads, no") + endif() +endif() +set(THREAD_PART_JRIR "slab" CACHE STRING "The method of assigning micropanels to threads in the JR and JR loops.") +set_property(CACHE THREAD_PART_JRIR PROPERTY STRINGS "slab" "rr") +if( NOT ((THREAD_PART_JRIR STREQUAL "slab") OR (THREAD_PART_JRIR STREQUAL "rr")) ) + message(FATAL_ERROR "THREAD_PART_JRIR option '${THREAD_PART_JRIR}' is not supported. Please use one of the following options \ + during CMake invokation: slab, rr") +endif() +set(EXPORT_SHARED "public" CACHE STRING "Specify the subset of library symbols that are exported within a shared library.") +set_property(CACHE EXPORT_SHARED PROPERTY STRINGS "public" "all") +if( NOT ((EXPORT_SHARED STREQUAL "public") OR (EXPORT_SHARED STREQUAL "all")) ) + message(FATAL_ERROR "EXPORT_SHARED option '${EXPORT_SHARED}' is not supported. Please use one of the following options \ + during CMake invokation: publis, all") +endif() option(ENABLE_PBA_POOLS "Internal memory pools for packing blocks" ON) option(ENABLE_SBA_POOLS "Internal memory pools for small blocks" ON) option(ENABLE_MEM_TRACING "Memory tracing output" OFF) +set(INT_SIZE "auto" CACHE STRING "BLIS API integer size") +set_property(CACHE INT_SIZE PROPERTY STRINGS "auto" "32" "64") +if( NOT ((INT_SIZE STREQUAL "auto") OR (INT_SIZE STREQUAL "32") OR (INT_SIZE STREQUAL "64")) ) + message(FATAL_ERROR "INT_SIZE option '${INT_SIZE}' is not supported. Please use one of the following options \ + during CMake invokation: auto, 32, 64") +endif() +set(BLAS_INT_SIZE "32" CACHE STRING "BLAS/CBLAS API integer size") +set_property(CACHE BLAS_INT_SIZE PROPERTY STRINGS "auto" "32" "64") +if( NOT ((BLAS_INT_SIZE STREQUAL "auto") OR (BLAS_INT_SIZE STREQUAL "32") OR (BLAS_INT_SIZE STREQUAL "64")) ) + message(FATAL_ERROR "BLAS_INT_SIZE option '${BLAS_INT_SIZE}' is not supported. 
Please use one of the following options \ + during CMake invokation: auto, 32, 64") +endif() option(ENABLE_BLAS "BLAS compatiblity layer" ON) -option(ENABLE_CBLAS "CBLAS compatiblity layer" ON) -option(ENABLE_MIXED_DT "Mixed datatype" ON) +option(ENABLE_CBLAS "CBLAS compatiblity layer" OFF) +option(ENABLE_MIXED_DT "Mixed datatype support" ON) option(ENABLE_MIXED_DT_EXTRA_MEM "Mixed datatype optimization requiring extra memory" ON) option(ENABLE_SUP_HANDLING "Small matrix handling" ON) -option(ENABLE_MEMKIND "libmemkind for manage memory pools" OFF) -option(ENABLE_PRAGMA_OMP_SIMD "pragma openmp simd" ON) -option(ENABLE_SANDBOX "Sandbox implementation for gemm" OFF) -option(BLIS_ENABLE_ILP64 "ENABLE BLIS ILP64" OFF) -option(ENABLE_INT_TYPE_SIZE " Internal BLIS integers ,used in native BLIS interfaces based on architecture dependent " ON) -option(ENABLE_BLASTEST "Enable the blastest" OFF) -option(ENABLE_TESTCPP_TESTING "Enabling testcpp" OFF) -option (ENABLE_NO_UNDERSCORE_API "export APIs without underscore" OFF) -option (ENABLE_UPPERCASE_API "export APIs with uppercase" OFF) -option (ENABLE_COMPLEX_RETURN_INTEL "Enable complex_return_intel" OFF) -option (ENABLE_TRSM_PREINVERSION "Enable TRSM preinversion" ON) -option (ENABLE_AOCL_DYNAMIC "Enable Dynamic Multi-threading" OFF) -option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" OFF) -option(RENAME_BLIS_ARCH_TYPE "Rename BLIS_ARCH_TYPE env var renamed to supplied value" BLIS_ARCH_TYPE) -option(RENAME_BLIS_MODEL_TYPE "Rename BLIS_MODEL_TYPE env var renamed to supplied value" BLIS_MODEL_TYPE) -option(ENABLE_ASAN_TESTS "Enable Address Sanitiser tests" OFF) -SET(ASAN_DEPENDENCY_LIB_DIR "" CACHE STRING "ASAN Dependent library folder name") - -if (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") - set(REF_KERNEL_MIRRORING_PY "${CMAKE_SOURCE_DIR}/build/blis_ref_kernel_mirror.py") - message("ref_kernel mirroring for fat binary") - # Run python script to find the architecture family name - 
execute_process( - COMMAND ${PYTHON_EXE} ${REF_KERNEL_MIRRORING_PY} ${CMAKE_BINARY_DIR} - RESULT_VARIABLE CMD_RESULT - OUTPUT_VARIABLE CMD_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE) - message( STATUS "Ref Kernel Mirroring :" ${CMD_OUTPUT}) -endif() -if(ENABLE_NO_UNDERSCORE_API) - add_definitions(-DBLIS_ENABLE_NO_UNDERSCORE_API) -endif() - -if(ENABLE_COMPLEX_RETURN_INTEL) - set(BLIS_ENABLE_COMPLEX_RETURN_INTEL TRUE) +if(WIN32) + set(ENABLE_MEMKIND "no" CACHE STRING "libmemkind for manage memory pools") + set_property(CACHE ENABLE_MEMKIND PROPERTY STRINGS "no") + if( NOT (ENABLE_MEMKIND STREQUAL "no")) + message(FATAL_ERROR "ENABLE_MEMKIND option is not supported on Windows platforms.") + endif() else() - set(BLIS_DISABLE_COMPLEX_RETURN_INTEL TRUE) + set(ENABLE_MEMKIND "auto" CACHE STRING "libmemkind for manage memory pools") + set_property(CACHE ENABLE_MEMKIND PROPERTY STRINGS "auto" "yes" "no") + if( NOT ((ENABLE_MEMKIND STREQUAL "auto") OR (ENABLE_MEMKIND STREQUAL "yes") OR (ENABLE_MEMKIND STREQUAL "no")) ) + message(FATAL_ERROR "ENABLE_MEMKIND option '${ENABLE_MEMKIND}' is not supported. Please use one of the following options \ + during CMake invokation: auto, yes, no") + endif() endif() - -if(ENABLE_UPPERCASE_API) - add_definitions(-DBLIS_ENABLE_UPPERCASE_API) +option(ENABLE_TRSM_PREINVERSION "Enable TRSM preinversion" ON) +option(ENABLE_AOCL_DYNAMIC "Dynamic selection of number of threads" ON) +set(FORCE_VERSION "no" CACHE STRING "Force configure to use an arbitrary version string") +if(WIN32) + set(COMPLEX_RETURN "gnu" CACHE STRING "The method used for returning complex numbers") + set_property(CACHE COMPLEX_RETURN PROPERTY STRINGS "gnu" "intel") + if( NOT ((COMPLEX_RETURN STREQUAL "gnu") OR (COMPLEX_RETURN STREQUAL "intel")) ) + message(FATAL_ERROR "COMPLEX_RETURN option '${COMPLEX_RETURN}' is not supported. 
Please use one of the following options \ + during CMake invokation: gnu, intel") + endif() +else() + set(COMPLEX_RETURN "default" CACHE STRING "The method used for returning complex numbers") + set_property(CACHE COMPLEX_RETURN PROPERTY STRINGS "default" "gnu" "intel") + if( NOT ((COMPLEX_RETURN STREQUAL "default") OR (COMPLEX_RETURN STREQUAL "gnu") OR (COMPLEX_RETURN STREQUAL "intel")) ) + message(FATAL_ERROR "COMPLEX_RETURN option '${COMPLEX_RETURN}' is not supported. Please use one of the following options \ + during CMake invokation: default, gnu, intel") + endif() endif() - -if(ENABLE_AOCL_DYNAMIC) - set(AOCL_DYNAMIC TRUE) +option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" OFF) +set(RENAME_BLIS_ARCH_TYPE "BLIS_ARCH_TYPE" CACHE STRING "BLIS_ARCH_TYPE env var renamed to supplied value") +set(RENAME_BLIS_MODEL_TYPE "BLIS_MODEL_TYPE" CACHE STRING "BLIS_MODEL_TYPE env var renamed to supplied value") +if(NOT WIN32) + set(ENABLE_ADDON "" CACHE STRING "Configure with specific addons using a ';'-separated list") +endif() +set(ENABLE_SANDBOX "" CACHE STRING "Enable a separate sandbox implementation of gemm.") +# Do not let ENABLE_SANDBOX appear on cmake-gui since the functionality is not yet implemented. +mark_as_advanced(ENABLE_SANDBOX) + +#------------------------------------ +# Check memkind +#------------------------------------ +# Using libmemkind is not a valid option on Windows. Check only on Linux platforms. +if(NOT WIN32) + # In order to determine the default behavior of the --with[out]-memkind + # option, we try to detect whether libmemkind is available. If it is, + # the default implied option will be --with-memkind; otherwise, will be + # --without-memkind. 
+ try_compile(HAS_MEMKIND "${CMAKE_BINARY_DIR}/temp" SOURCES "${CMAKE_SOURCE_DIR}/build/detect/memkind/libmemkind_detect.c" + LINK_OPTIONS + "-lmemkind" + ) endif() -if (BUILD_SHARED_LIBS) - set(BLIS_ENABLE_SHARED TRUE) - if(ENABLE_BLASTEST) - add_definitions(-DAOCL_SUPPORT_BLASTEST_FOR_SHARED) - endif() -endif () - -# Enable LP64/ILP64 -if (BLIS_ENABLE_ILP64) - set(BLIS_BLAS_INT_TYPE_SIZE TRUE) - set (BLAS_INT_TYPE_SIZE "64") - add_definitions(-DF2C_ENABLE_ILP64) -else () - set(BLIS_BLAS_INT_TYPE_SIZE TRUE) - set (BLAS_INT_TYPE_SIZE "32") -endif () - -if (ENABLE_TRSM_PREINVERSION) - set(BLIS_ENABLE_TRSM_PREINVERSION TRUE) +#------------------------------------ +# Check #pragma omp simd +#------------------------------------ +if(ENABLE_THREADING STREQUAL "openmp") + # Try to determine whether the chosen compiler supports #pragma omp simd. + try_compile(PRAGMA_OMP_SIMD "${CMAKE_BINARY_DIR}/temp" SOURCES "${CMAKE_SOURCE_DIR}/build/detect/omp_simd/omp_simd_detect.c" + CMAKE_FLAGS + "-O3 -march=native -fopenmp-simd" + C_STANDARD 99 + ) +endif() +#------------------------------------ +# Acquire the BLIS version +#------------------------------------ +# Set the VERSION variable to the default value in the 'version' file. +file(STRINGS ${CMAKE_SOURCE_DIR}/version VERSION) +# Get timestamp. +string(TIMESTAMP BUILD_DATE "%Y%m%d") +# Update using the timestamp. +set(VERSION_STRING "AOCL-BLIS ${VERSION} Build ${BUILD_DATE}") +# Initial message. +message(STATUS "Starting configuration of BLIS ${VERSION_STRING}.") +# Check if the user requested a custom version string. 
+if(FORCE_VERSION STREQUAL "no") + message(" Configuring with official version string.") else() - add_definitions(-DBLIS_DISABLE_TRSM_PREINVERSION) + set(VERSION_STRING "${FORCE_VERSION}") + message(" Configuring with custom version string: ${VERSION_STRING}") endif() - -if (ENABLE_INT_TYPE_SIZE) - set(BLIS_INT_TYPE_SIZE TRUE) - set (INT_TYPE_SIZE "64") -else () - set(BLIS_INT_TYPE_SIZE TRUE) - set (INT_TYPE_SIZE "32") -endif () - -if (BLIS_ENABLE_ILP64 AND NOT ENABLE_INT_TYPE_SIZE) - message(FATAL_ERROR "for ILP64 we must enable ENABLE_INT_TYPE_SIZE with BLIS_INT_TYPE_SIZE = 64 ") -endif () - -if (ENABLE_VERBOSE) - set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON" FORCE) -endif () - -if (ENABLE_JRIR_RR) - message("Round robin thread method enabled") - set(BLIS_ENABLE_JRIR_RR TRUE) - set(BLIS_ENABLE_JRIR_SLAB FALSE) -elseif (ENABLE_JRIR_SLAB) - message("SLAB thread method enabled") - set(BLIS_ENABLE_JRIR_SLAB TRUE) - set(BLIS_ENABLE_JRIR_RR FALSE) -else () - message("Unsupported method of thread partitioning in jr and ir loops") -endif () - -if (ENABLE_PBA_POOLS) - set(BLIS_ENABLE_PBA_POOLS TRUE) -endif () - -if (ENABLE_SBA_POOLS) - set(BLIS_ENABLE_SBA_POOLS TRUE) -endif () - -if (ENABLE_MEM_TRACING) - set(BLIS_ENABLE_MEM_TRACING FALSE) -endif () - -if (ENABLE_BLAS) - add_definitions(-DBLIS_ENABLE_BLAS) - set(BLIS_ENABLE_BLAS TRUE) -else () - add_definitions(-DBLIS_DISABLE_BLAS) - set(BLIS_ENABLE_BLAS FALSE) -endif () - -if (ENABLE_CBLAS) - add_definitions(-DBLIS_ENABLE_CBLAS) - set(BLIS_ENABLE_CBLAS TRUE) - if (NOT ENABLE_BLAS) - # Force BLAS layer when CBLAS is enabled - add_definitions(-DBLIS_ENABLE_BLAS) - set(BLIS_ENABLE_BLAS TRUE) - endif () -else () - add_definitions(-DBLIS_DISABLE_CBLAS) - set(BLIS_ENABLE_CBLAS FALSE) -endif () - -if (ENABLE_BLASTEST) - add_definitions(-DBLIS_ENABLE_BLAS) - add_definitions(-DBLIS_ENABLE_CBLAS) +# Set the shared library (.so) version file. 
+file(STRINGS ${CMAKE_SOURCE_DIR}/so_version SO_VERSION) +# The first line of the 'so_version' file contains the .so major version. +list(GET SO_VERSION 0 SO_VERSION_MAJOR) +# The second line contains the minor and build .so version numbers +# (separated by a '.'). +list(GET SO_VERSION 1 SO_VERSION_MINOR) + +#------------------------------------ +# Printing Options +#------------------------------------ +include(CMakePrintHelpers) +message(STATUS "Printing CMake Configuration Options...") +cmake_print_variables(ENABLE_DEBUG) +# Initialize debug type, using the corresponding cache variable. +set(DEBUG_TYPE ${ENABLE_DEBUG}) +if(ENABLE_DEBUG STREQUAL "off") + message(" Debug symbols disabled.") +elseif(ENABLE_DEBUG STREQUAL "opt") + message(" Enabling debug symbols with optimizations.") +else() #ENABLE_DEBUG=noopt + message(" Enabling debug symbols; optimizations disabled.") endif() - -if (ENABLE_TESTCPP_TESTING) - add_definitions(-DBLIS_ENABLE_BLAS) - add_definitions(-DBLIS_ENABLE_CBLAS) -endif () - -if (ENABLE_MIXED_DT) - set(BLIS_ENABLE_MIXED_DT TRUE) -endif () - -if (ENABLE_MIXED_DT_EXTRA_MEM) - set(BLIS_ENABLE_MIXED_DT_EXTRA_MEM TRUE) -endif () - -if (ENABLE_SUP_HANDLING) - set(BLIS_ENABLE_SUP_HANDLING TRUE) -endif () - -if (ENABLE_MEMKIND) - set(BLIS_ENABLE_MEMKIND FALSE) -endif () - -if (ENABLE_PRAGMA_OMP_SIMD) - set(BLIS_ENABLE_PRAGMA_OMP_SIMD TRUE) -endif () - -if (ENABLE_SANDBOX) - set(BLIS_ENABLE_SANDBOX FALSE) -endif () - -include_directories(${PROJECT_SOURCE_DIR}/external/msvc) -add_definitions(-D_CRT_SECURE_NO_WARNINGS) - -cmake_policy(SET CMP0091 NEW) +cmake_print_variables(BUILD_SHARED_LIBS) if(BUILD_SHARED_LIBS) - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") + message(" Building BLIS as a shared library.") + set(ENABLE_SHARED_01 1) else() - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - add_definitions(-DBLIS_IS_BUILDING_LIBRARY) + message(" Building BLIS as a static library.") + set(ENABLE_SHARED_01 0) endif() - 
-if(ENABLE_MULTITHREADING) - if(BUILD_SHARED_LIBS) - set(LIB_NAME "${PROJECT_NAME}-MT-dll") - elseif(NOT BUILD_SHARED_LIBS) - set(LIB_NAME "${PROJECT_NAME}-MT") - endif() - if(ENABLE_OPENMP) +cmake_print_variables(EXPORT_SHARED) +if(EXPORT_SHARED STREQUAL "all") + if(BUILD_SHARED_LIBS) + message(" Exporting all symbols within shared library.") + else() + message(" Ignoring request to export all symbols within shared library.") + endif() +else() + if(BUILD_SHARED_LIBS) + message(" Exporting only public symbols within shared library.") + endif() +endif() +cmake_print_variables(ENABLE_SYSTEM) +if(ENABLE_SYSTEM) + message(" Enabling operating system support.") + set(ENABLE_SYSTEM_01 1) + if(NOT WIN32) + set(LIBPTHREAD "-lpthread") + endif() +else() + message(" Disabling operating system support.") + message(" WARNING: all threading will be disabled!") + set(ENABLE_THREADING "off") + set(ENABLE_SYSTEM_01 0) +endif() +# Check the threading model flag and standardize its value, if needed. +cmake_print_variables(ENABLE_THREADING) +set(ENABLE_OPENMP "no") +set(ENABLE_OPENMP_01 0) +set(ENABLE_PTHREADS "no") +set(ENABLE_PTHREADS_01 0) +if(ENABLE_THREADING STREQUAL "openmp") + message(" Using OpenMP for threading.") + set(ENABLE_OPENMP "yes") + set(ENABLE_OPENMP_01 1) find_package(OpenMP) - if (OPENMP_FOUND) - set(BLIS_ENABLE_OPENMP TRUE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + if(NOT OPENMP_FOUND) + message(FATAL_ERROR "Openmp Not Found") + endif() +elseif(ENABLE_THREADING STREQUAL "pthreads") + message(" Using POSIX threads for threading.") + set(ENABLE_PTHREADS "yes") + set(ENABLE_PTHREADS_01 1) +else() + message(" Threading is disabled.") +endif() +# Check the method of assigning micropanels to threads in the JR and IR +# loops. 
+cmake_print_variables(THREAD_PART_JRIR) +if(THREAD_PART_JRIR STREQUAL "slab") + message(" Requesting slab threading in jr and ir loops.") + set(ENABLE_JRIR_SLAB_01 1) + set(ENABLE_JRIR_RR_01 0) +else() + message(" Requesting round-robin threading in jr and ir loops.") + set(ENABLE_JRIR_SLAB_01 0) + set(ENABLE_JRIR_RR_01 1) +endif() +# Convert 'yes' and 'no' flags to booleans. +cmake_print_variables(ENABLE_PBA_POOLS) +if(ENABLE_PBA_POOLS) + message(" Internal memory pools for packing blocks are enabled.") + set(ENABLE_PBA_POOLS_01 1) +else() + message(" Internal memory pools for packing blocks are disabled.") + set(ENABLE_PBA_POOLS_01 0) +endif() +cmake_print_variables(ENABLE_SBA_POOLS) +if(ENABLE_SBA_POOLS) + message(" Internal memory pools for small blocks are enabled.") + set(ENABLE_SBA_POOLS_01 1) +else() + message(" Internal memory pools for small blocks are disabled.") + set(ENABLE_SBA_POOLS_01 0) +endif() +cmake_print_variables(ENABLE_MEM_TRACING) +if(ENABLE_MEM_TRACING) + message(" Memory tracing output is enabled.") + set(ENABLE_MEM_TRACING_01 1) +else() + message(" Memory tracing output is disabled.") + set(ENABLE_MEM_TRACING_01 0) +endif() +cmake_print_variables(ENABLE_MEMKIND) +if(HAS_MEMKIND) + if(ENABLE_MEMKIND STREQUAL "auto") + # If no explicit option was given for libmemkind one way or the other, + # we use the value returned previously by has_libmemkind(), in this + # case "yes", to determine the default. 
+ message(" libmemkind found; default is to enable use.") + set(ENABLE_MEMKIND "yes") + set(ENABLE_MEMKIND_01 1) else() - message (FATAL_ERROR "Openmp Not Found") + if(ENABLE_MEMKIND STREQUAL "yes") + message(" Received explicit request to enable libmemkind.") + set(ENABLE_MEMKIND_01 1) + else() + message(" Received explicit request to disable libmemkind.") + set(ENABLE_MEMKIND "no") + set(ENABLE_MEMKIND_01 0) + endif() endif() - endif() else() - if(BUILD_SHARED_LIBS) - set(LIB_NAME "${PROJECT_NAME}-dll") - elseif(NOT BUILD_SHARED_LIBS) - set(LIB_NAME "${PROJECT_NAME}") - endif() + if(WIN32) + message(" libmemkind option is not supported on Windows.") + else() + message(" libmemkind not found; disabling.") + if(ENABLE_MEMKIND STREQUAL "yes") + message(WARNING " Cannot honor explicit request to enable libmemkind.") + endif() + endif() + set(ENABLE_MEMKIND "no") + set(ENABLE_MEMKIND_01 0) endif() - -if(DISABLE_BLIS_ARCH_TYPE) - set(BLIS_DISABLE_BLIS_ARCH_TYPE TRUE) - set(BLIS_DISABLE_BLIS_MODEL_TYPE TRUE) +cmake_print_variables(PRAGMA_OMP_SIMD) +if(PRAGMA_OMP_SIMD) + message(" Compiler appears to support #pragma omp simd.") + set(ENABLE_PRAGMA_OMP_SIMD_01 1) else() - set(BLIS_DISABLE_BLIS_ARCH_TYPE FALSE) - set(BLIS_DISABLE_BLIS_MODEL_TYPE FALSE) + message(" Compiler appears to not support #pragma omp simd.") + set(ENABLE_PRAGMA_OMP_SIMD_01 0) endif() - -if(RENAME_BLIS_ARCH_TYPE) - set(__blis_arch_type_name TRUE) - set(rename_blis_arch_type "${RENAME_BLIS_ARCH_TYPE}") +cmake_print_variables(ENABLE_CBLAS) +if(ENABLE_CBLAS) + message(" The CBLAS compatibility layer is enabled.") + set(ENABLE_CBLAS_01 1) + # Force BLAS layer when CBLAS is enabled + set(ENABLE_BLAS ON) else() - set(__blis_arch_type_name TRUE) - set(rename_blis_arch_type "BLIS_ARCH_TYPE") + message(" The CBLAS compatibility layer is disabled.") + set(ENABLE_CBLAS_01 0) endif() - -if(RENAME_BLIS_MODEL_TYPE) - set(__blis_model_type_name TRUE) - set(rename_blis_model_type "${RENAME_BLIS_MODEL_TYPE}") 
+cmake_print_variables(ENABLE_BLAS) +if(ENABLE_BLAS) + message(" The BLAS compatibility layer is enabled.") + set(ENABLE_BLAS_01 1) else() - set(__blis_model_type_name TRUE) - set(rename_blis_model_type "BLIS_MODEL_TYPE") + message(" The BLAS compatibility layer is disabled.") + set(ENABLE_BLAS_01 0) endif() - -find_package(Doxygen) -set(W_DIR "${CMAKE_CURRENT_SOURCE_DIR}/docs") -if(NOT (DOXYGEN_FOUND)) - message(STATUS "Doxygen not found please install and try again.") +cmake_print_variables(ENABLE_MIXED_DT) +if(ENABLE_MIXED_DT) + message(" Mixed datatype support is enabled.") + cmake_print_variables(ENABLE_MIXED_DT_EXTRA_MEM) + if(ENABLE_MIXED_DT_EXTRA_MEM) + message(" Mixed datatype optimizations requiring extra memory are enabled.") + set(ENABLE_MIXED_DT_EXTRA_MEM_01 1) + else() + message(" Mixed datatype optimizations requiring extra memory are disabled.") + set(ENABLE_MIXED_DT_EXTRA_MEM_01 0) + endif() + set(ENABLE_MIXED_DT_01 1) else() - execute_process(COMMAND doxygen Doxyfile - WORKING_DIRECTORY ${W_DIR} - COMMAND_ECHO STDOUT) + message(" Mixed datatype support is disabled.") + set(ENABLE_MIXED_DT_EXTRA_MEM_01 0) + set(ENABLE_MIXED_DT_01 0) endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/docs/html/index.html) - message(STATUS "Documentation generated successfully, to view documentation open docs/html/index.html .") +cmake_print_variables(ENABLE_SUP_HANDLING) +if(ENABLE_SUP_HANDLING) + message(" Small matrix handling is enabled.") + set(ENABLE_SUP_HANDLING_01 1) else() - message(STATUS "Document generation failed.") -endif() - -set(CMAKE_BUILD_TYPE ${CMAKE_CONFIGURATION_TYPES}) - -#print configurations -message("---cmake configurations---") -message(CMAKE_C_COMPILER_ID : ${CMAKE_C_COMPILER_ID}) -message(CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE}) -message(BLIS_ENABLE_OPENMP : ${BLIS_ENABLE_OPENMP}) -message(BLIS_ENABLE_JRIR_SLAB : ${BLIS_ENABLE_JRIR_SLAB}) -message(BLIS_ENABLE_JRIR_RR : ${BLIS_ENABLE_JRIR_RR}) -message(BLIS_ENABLE_PBA_POOLS : 
${BLIS_ENABLE_PBA_POOLS}) -message(BLIS_ENABLE_SBA_POOLS : ${BLIS_ENABLE_SBA_POOLS}) -message(BLIS_ENABLE_MEM_TRACING : ${BLIS_ENABLE_MEM_TRACING}) -message(BLIS_INT_TYPE_SIZE : ${BLIS_INT_TYPE_SIZE}) -message(BLIS_BLAS_INT_TYPE_SIZE : ${BLIS_BLAS_INT_TYPE_SIZE}) -message(BLIS_ENABLE_BLAS : ${BLIS_ENABLE_BLAS}) -message(BLIS_ENABLE_CBLAS : ${BLIS_ENABLE_CBLAS}) -message(BLIS_ENABLE_MIXED_DT : ${BLIS_ENABLE_MIXED_DT}) -message(BLIS_ENABLE_MIXED_DT_EXTRA_MEM : ${BLIS_ENABLE_MIXED_DT_EXTRA_MEM}) -message(BLIS_ENABLE_SUP_HANDLING : ${BLIS_ENABLE_SUP_HANDLING}) -message(BLIS_ENABLE_MEMKIND : ${BLIS_ENABLE_MEMKIND}) -message(BLIS_ENABLE_PRAGMA_OMP_SIMD : ${BLIS_ENABLE_PRAGMA_OMP_SIMD}) -message(BLIS_ENABLE_SANDBOX : ${BLIS_ENABLE_SANDBOX}) -message(BLIS_ENABLE_SHARED : ${BLIS_ENABLE_SHARED}) -message(DISABLE_BLIS_ARCH_TYPE : ${DISABLE_BLIS_ARCH_TYPE}) -message(RENAME_BLIS_ARCH_TYPE : ${RENAME_BLIS_ARCH_TYPE}) -message(RENAME_BLIS_MODEL_TYPE : ${RENAME_BLIS_MODEL_TYPE}) - -SET(ENABLE_SIMD_FLAGS "none" CACHE STRING "Set compiler SIMD flags") -SET_PROPERTY(CACHE ENABLE_SIMD_FLAGS PROPERTY STRINGS none SSE2 AVX AVX2) - -if(${ENABLE_SIMD_FLAGS} MATCHES "AVX2") - add_definitions(/arch:AVX2) -elseif(${ENABLE_SIMD_FLAGS} MATCHES "AVX") - add_definitions(/arch:AVX) -elseif(${ENABLE_SIMD_FLAGS} MATCHES "SSE2") - add_definitions(/arch:SSE2) -endif() - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Oi") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP") -set(INTR_GENERAL_LINK_FLAGS "${INTR_GENERAL_LINK_FLAGS} /RELEGE") - -add_definitions(-D_CRT_SECURE_NO_DEPRECATE) - -#add_definitions(-DBLIS_OS_WINDOWS) -add_definitions(-D_MSC_VER) -if (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") + message(" Small matrix handling is disabled.") + set(ENABLE_SUP_HANDLING_01 0) +endif() +cmake_print_variables(ENABLE_TRSM_PREINVERSION) +if(ENABLE_TRSM_PREINVERSION) + message(" trsm diagonal element pre-inversion is enabled.") + set(ENABLE_TRSM_PREINVERSION_01 1) else() 
-add_definitions(-DBLIS_CNAME=${TARGET_ARCH}) -endif() -# Generate the bli_config.h header file -configure_file (build/bli_win_config.h.in ${CMAKE_SOURCE_DIR}/bli_config.h @ONLY) - -include_directories(${CMAKE_SOURCE_DIR}/aocl_dtl) -include_directories(${CMAKE_SOURCE_DIR}/.) -include_directories(${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}) -include_directories(${CMAKE_SOURCE_DIR}/frame/include) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/1e) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/1m) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/1r) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/bb) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/io) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/ri) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/ri3) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/rih) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/ro) -include_directories(${CMAKE_SOURCE_DIR}/frame/include/level0/rpi) -include_directories(${CMAKE_SOURCE_DIR}/frame/thread) -include_directories(${CMAKE_SOURCE_DIR}/frame/base) -include_directories(${CMAKE_SOURCE_DIR}/frame/base/cast) -include_directories(${CMAKE_SOURCE_DIR}/frame/base/check) -include_directories(${CMAKE_SOURCE_DIR}/frame/base/noopt) -include_directories(${CMAKE_SOURCE_DIR}/frame/base/proj) -include_directories(${CMAKE_SOURCE_DIR}/frame/0) -include_directories(${CMAKE_SOURCE_DIR}/frame/0/copysc) -include_directories(${CMAKE_SOURCE_DIR}/frame/1) -include_directories(${CMAKE_SOURCE_DIR}/frame/1d) -include_directories(${CMAKE_SOURCE_DIR}/frame/1f) -include_directories(${CMAKE_SOURCE_DIR}/frame/1m) -include_directories(${CMAKE_SOURCE_DIR}/frame/1m/packm) -include_directories(${CMAKE_SOURCE_DIR}/frame/1m/unpackm) -include_directories(${CMAKE_SOURCE_DIR}/frame/2) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/gemv) 
-include_directories(${CMAKE_SOURCE_DIR}/frame/2/ger) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/hemv) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/her) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/her2) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/symv) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/syr) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/syr2) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/trmv) -include_directories(${CMAKE_SOURCE_DIR}/frame/2/trsv) -include_directories(${CMAKE_SOURCE_DIR}/frame/3) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/gemm) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/gemm/ind) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/gemmt) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/hemm) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/her2k) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/herk) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/symm) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/syr2k) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/syrk) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/trmm) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/trmm3) -include_directories(${CMAKE_SOURCE_DIR}/frame/3/trsm) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat/cblas) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat/cblas/f77_sub) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat/cblas/src) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat/check) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat/f2c) -include_directories(${CMAKE_SOURCE_DIR}/frame/compat/f2c/util) -include_directories(${CMAKE_SOURCE_DIR}/frame/ind) -include_directories(${CMAKE_SOURCE_DIR}/frame/ind/cntx) -include_directories(${CMAKE_SOURCE_DIR}/frame/ind/oapi) -include_directories(${CMAKE_SOURCE_DIR}/frame/ind/tapi) -include_directories(${CMAKE_SOURCE_DIR}/frame/ind/ukernels) -include_directories(${CMAKE_SOURCE_DIR}/frame/util) 
-include_directories(${CMAKE_SOURCE_DIR}/config/generic) -include_directories(${CMAKE_SOURCE_DIR}/config/zen) -include_directories(${CMAKE_SOURCE_DIR}/config/zen2) -include_directories(${CMAKE_SOURCE_DIR}/config/zen3) -include_directories(${CMAKE_SOURCE_DIR}/config/zen4) -if(${AOCL_BLIS_FAMILY} STREQUAL "amdzen") - include_directories(${CMAKE_BINARY_DIR}/ref_kernels/generic) - include_directories(${CMAKE_BINARY_DIR}/ref_kernels/zen) - include_directories(${CMAKE_BINARY_DIR}/ref_kernels/zen2) - include_directories(${CMAKE_BINARY_DIR}/ref_kernels/zen3) - include_directories(${CMAKE_BINARY_DIR}/ref_kernels/zen4) -endif() -include_directories(${CMAKE_SOURCE_DIR}/ref_kernels) -include_directories(${CMAKE_SOURCE_DIR}/kernels) -include_directories(${CMAKE_SOURCE_DIR}/kernels/haswell) -include_directories(${CMAKE_SOURCE_DIR}/kernels/haswell/3) -include_directories(${CMAKE_SOURCE_DIR}/kernels/haswell/3/sup) -include_directories(${CMAKE_SOURCE_DIR}/kernels/haswell/3/sup/d6x8) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen/1) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen/1f) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen/1m) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen/2) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen/3) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen/3/sup) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen2) -include_directories(${CMAKE_SOURCE_DIR}/kernels/zen4) -include_directories(${CMAKE_SOURCE_DIR}/kernels/skx) -include_directories(${CMAKE_SOURCE_DIR}/kernels/skx/3) -file(GLOB headers ${CMAKE_SOURCE_DIR}/*.h) - -# Monolithic Header generation -find_package(PythonLibs 3 REQUIRED) - -string(APPEND HEADER_PATH -if(${AOCL_BLIS_FAMILY} STREQUAL "zen") - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" -elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen2") - " 
${CMAKE_CURRENT_SOURCE_DIR}/config/zen2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" -elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen3") - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" -elseif (${AOCL_BLIS_FAMILY} STREQUAL "zen4") - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen4/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" -elseif (${AOCL_BLIS_FAMILY} STREQUAL "amdzen") - " ${CMAKE_CURRENT_SOURCE_DIR}/config/amdzen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/config/zen4/" - " ${CMAKE_CURRENT_SOURCE_DIR}/config/generic/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/haswell/" - " ${CMAKE_CURRENT_SOURCE_DIR}/kernels/generic/" -endif () - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/0/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/0/copysc/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/1/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/1d/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/1f/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/1m/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/1m/packm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/1m/unpackm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/gemv/" - " 
${CMAKE_CURRENT_SOURCE_DIR}/frame/2/ger/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/hemv/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/her/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/her2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/symv/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/syr/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/syr2/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/trmv/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/2/trsv/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/gemm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/gemm/ind/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/gemmt/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/hemm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/her2k/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/herk/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/symm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/syr2k/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/syrk/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/trmm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/trmm3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/3/trsm/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/base/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/base/cast/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/base/check/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/base/noopt/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/base/proj/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/cblas/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/cblas/f77_sub/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/cblas/src/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/check/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/f2c/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/compat/f2c/util/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/1e/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/1m/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/1r/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/bb/" - " 
${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/io/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/ri/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/ri3/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/rih/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/ro/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/include/level0/rpi/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/ind/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/ind/cntx/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/ind/oapi/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/ind/tapi/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/ind/ukernels/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/thread/" - " ${CMAKE_CURRENT_SOURCE_DIR}/frame/util/" - " ${CMAKE_CURRENT_SOURCE_DIR}/aocl_dtl/" - " ${CMAKE_CURRENT_SOURCE_DIR}/" -) + message(" trsm diagonal element pre-inversion is disabled.") + set(ENABLE_TRSM_PREINVERSION_01 0) +endif() +# Check aocl dynamic threading configuration and enable it only if +# multi-threading is enabled +cmake_print_variables(ENABLE_AOCL_DYNAMIC) +if(ENABLE_AOCL_DYNAMIC) + if( NOT(ENABLE_THREADING STREQUAL "no")) + message(" Dynamic selection of number of threads is enabled.") + set(ENABLE_AOCL_DYNAMIC_01 1) + else() + message(" Dynamic threading is disabled as multithreading is disabled.") + set(ENABLE_AOCL_DYNAMIC OFF) + set(ENABLE_AOCL_DYNAMIC_01 0) + endif() +else() + message(" Dynamic selection of number of threads is disabled.") + set(ENABLE_AOCL_DYNAMIC_01 0) +endif() +# Report integer sizes. 
+cmake_print_variables(INT_SIZE) +set(INT_TYPE_SIZE ${INT_SIZE}) +if(INT_TYPE_SIZE STREQUAL "32") + message(" The BLIS API integer size is 32-bit.") +elseif(INT_TYPE_SIZE STREQUAL "64") + message(" The BLIS API integer size is 64-bit.") +else() + set(INT_TYPE_SIZE "0") + message(" The BLIS API integer size is automatically determined.") +endif() +cmake_print_variables(BLAS_INT_SIZE) +set(BLAS_INT_TYPE_SIZE ${BLAS_INT_SIZE}) +if(BLAS_INT_TYPE_SIZE STREQUAL "32") + message(" The BLAS/CBLAS API integer size is 32-bit.") +elseif(BLAS_INT_TYPE_SIZE STREQUAL "64") + message(" The BLAS/CBLAS API integer size is 64-bit.") +else() + set(BLAS_INT_TYPE_SIZE "0") + message(" The BLAS/CBLAS API integer size is automatically determined.") +endif() +# Disallow the simultaneous use of 64-bit integers in the BLAS and +# 32-bit integers in BLIS. +if((INT_TYPE_SIZE STREQUAL "32") AND (BLAS_INT_TYPE_SIZE STREQUAL "64")) + message(FATAL_ERROR "INT_TYPE_SIZE=${INT_TYPE_SIZE} and BLAS_INT_TYPE_SIZE=${BLAS_INT_TYPE_SIZE}. \ + To avoid the possibility of truncation, we do not allow use of 64-bit integers in the BLAS API with 32-bit integers in BLIS. \ + Please use a different configuration of integers.") +endif() +if(NOT WIN32) + cmake_print_variables(ENABLE_ADDON) + if(ENABLE_ADDON STREQUAL "") + message(" Configuring with no addons.") + set(ENABLE_ADDONS_01 0) + else() + # Remove duplicates in the addon list, if they exist. + list(REMOVE_DUPLICATES ENABLE_ADDON) + message(" Configuring with addons:") + foreach(ADDON ${ENABLE_ADDON}) + message(" ${ADDON}") + if(NOT (EXISTS ${CMAKE_SOURCE_DIR}/addon/${ADDON})) + message(FATAL_ERROR "Requested addon sub-directory does not exist! Cannot continue. 
\ + *** Please verify addon existence and name.") + endif() + endforeach() + set(ENABLE_ADDONS_01 1) + endif() +endif() +cmake_print_variables(ENABLE_SANDBOX) +if(ENABLE_SANDBOX STREQUAL "") + message(" Configuring for conventional gemm implementation.") + set(ENABLE_SANDBOX_01 0) +else() + message(" Configuring with alternate gemm implementation: ${ENABLE_SANDBOX}.") + message(FATAL_ERROR "Sandbox functionality is not yet integrated in CMake build system.") + set(ENABLE_SANDBOX_01 1) +endif() +# Check the method used for returning complex numbers. Only for Linux. +if(NOT WIN32) + if(COMPLEX_RETURN STREQUAL "default") + if("${CMAKE_Fortran_COMPILER_ID}" MATCHES "Intel") + set(COMPLEX_RETURN "intel") + else() + set(COMPLEX_RETURN "gnu") + endif() + endif() +endif() +cmake_print_variables(COMPLEX_RETURN) +if(COMPLEX_RETURN STREQUAL "gnu") + message(" Configuring with gnu complex return type.") + set(COMPLEX_RETURN_INTEL_01 0) +else() + message(" Configuring with intel complex return type.") + set(COMPLEX_RETURN_INTEL_01 1) +endif() +cmake_print_variables(DISABLE_BLIS_ARCH_TYPE) +if(DISABLE_BLIS_ARCH_TYPE) + message(" User selection of code path using BLIS_ARCH_TYPE and BLIS_MODEL_TYPE env vars is disabled.") + set(DISABLE_BLIS_ARCH_TYPE_01 1) +else() + set(DISABLE_BLIS_ARCH_TYPE_01 0) +endif() +cmake_print_variables(RENAME_BLIS_ARCH_TYPE) +if(NOT(RENAME_BLIS_ARCH_TYPE STREQUAL "BLIS_ARCH_TYPE")) + message(" configuring with BLIS_ARCH_TYPE env var renamed to ${RENAME_BLIS_ARCH_TYPE}") +endif() +cmake_print_variables(RENAME_BLIS_MODEL_TYPE) +if(NOT(RENAME_BLIS_MODEL_TYPE STREQUAL "BLIS_MODEL_TYPE")) + message(" configuring with BLIS_MODEL_TYPE env var renamed to ${RENAME_BLIS_MODEL_TYPE}") +endif() +if(WIN32) + cmake_print_variables(ENABLE_NO_UNDERSCORE_API) + if(ENABLE_NO_UNDERSCORE_API) + message(" Export APIs without underscore.") + else() + message(" Export APIs with underscore.") + endif() + cmake_print_variables(ENABLE_UPPERCASE_API) + if(ENABLE_UPPERCASE_API) + 
message(" Export APIs with uppercase.") + else() + message(" Export APIs with lowercase.") + endif() +endif() -file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}) +# Initialize threading model, using the corresponding cache variable. +set(THREADING_MODEL ${ENABLE_THREADING}) + + +#-------------------------------------------- +# Instantiate bli_config.h file from template +#-------------------------------------------- +# Begin substituting information into the build/cmake/bli_config.h.in file, outputting +# to bli_config.h and store it in build directory of the current project. +configure_file(build/cmake/bli_config.h.in ${PROJECT_BINARY_DIR}/bli_config.h) + +#-------------------------------------------- +# Instantiate bli_addon.h file from template +#-------------------------------------------- +# Create a list of #includes, one for each addon in addon_list. +set(ADDON_LIST_INCLUDES "") +foreach(ADDON ${ENABLE_ADDON}) + if(ADDON STREQUAL "aocl_gemm") + if(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0.0)) + message(FATAL_ERROR "aocl_gemm addon requires a gcc version 11.0.0 or higher.") + endif() + endif() + set(ADDON_HEADER "\"${ADDON}.h\"") + set(ADDON_LIST_INCLUDES "${ADDON_LIST_INCLUDES}#include ${ADDON_HEADER}\n") +endforeach() +# Begin substituting information into the bli_addon.h.in file, outputting +# to bli_addon.h and store it in build directory of the current project. +configure_file(build/cmake/bli_addon.h.in ${PROJECT_BINARY_DIR}/bli_addon.h) + +#-------------------------------------------- +# Collect directory paths for blis.h +#-------------------------------------------- +# Variable ALL_HEADER_PATHS_LIST is equivalent to ALL_H99_DIRPATHS in Make system. +# Practically, we collect the required directory paths into a list, which we +# append as we add the corresponding subdirectories. This variable will be +# transformed into a string and will be used to generate the flatten blis.h header. 
+set(ALL_HEADER_PATHS_LIST "") + +# Include functionality that returns header paths. +include(${CMAKE_SOURCE_DIR}/build/cmake/subdir_helper_functions.cmake) + +# If the CONFIG_LIST does not already contain the CONFIG_NAME (i.e., +# if CONFIG_NAME is an umbrella family), add in the corresponding +# directory. (In the next step, we will loop over the actual sub- +# configurations and add them as well.) +list(FIND CONFIG_LIST ${BLIS_CONFIG_FAMILY} IS_UMBRELLA) +if(${IS_UMBRELLA} STREQUAL "-1") + # Collect all subdirectory paths that have at least one file with suffix in ALL_H99_SUFS list. + get_dirpaths_with_suffixes(${BLIS_CONFIG_FAMILY}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY} "${ALL_H99_SUFS}") +endif() +list(APPEND ALL_HEADER_PATHS_LIST "${${BLIS_CONFIG_FAMILY}_HEADER_PATHS}") + +# Get header directory paths for each of the sub-configurations present +# in the configuration list. +foreach(CONF ${CONFIG_LIST}) + get_dirpaths_with_suffixes(config_${CONF}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/config/${CONF} "${ALL_H99_SUFS}") + list(APPEND ALL_HEADER_PATHS_LIST "${config_${CONF}_HEADER_PATHS}") +endforeach() + +# Get header directory paths for each of the kernels present +# in the kernel list. +foreach(KERN ${KERNEL_LIST}) + # Collect all subdirectory paths that have at least one file with suffix in ALL_H99_SUFS list. + get_dirpaths_with_suffixes(kernels_${KERN}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/kernels/${KERN} "${ALL_H99_SUFS}") + list(APPEND ALL_HEADER_PATHS_LIST "${kernels_${KERN}_HEADER_PATHS}") +endforeach() + +# Get header directory paths for framework directory. +get_dirpaths_with_suffixes(frame_HEADER_PATHS ${CMAKE_SOURCE_DIR}/frame "${ALL_H99_SUFS}") +list(APPEND ALL_HEADER_PATHS_LIST "${frame_HEADER_PATHS}") + +# Get header directory paths for AOCL DTL logs directory. 
+get_dirpaths_with_suffixes(aocl_dtl_HEADER_PATHS ${CMAKE_SOURCE_DIR}/aocl_dtl "${ALL_H99_SUFS}") +list(APPEND ALL_HEADER_PATHS_LIST "${aocl_dtl_HEADER_PATHS}") + +# Get a copy of the header paths without including the addons and the sandbox. +set(FRAME_HEADER_DIRPATHS_LIST ${ALL_HEADER_PATHS_LIST}) + +# Get header directory paths for each of the addons. +foreach(ADDON ${ENABLE_ADDON}) + get_dirpaths_with_suffixes(addon_${ADDON}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/addon/${ADDON} "${ALL_H99_SUFS}") + list(APPEND ALL_HEADER_PATHS_LIST "${addon_${ADDON}_HEADER_PATHS}") +endforeach() + +# Pick up generated bli_config.h and bli_addon.h that get generated in +# current build directory. +list(PREPEND ALL_HEADER_PATHS_LIST ${PROJECT_BINARY_DIR}/) +# Create a string out of this list so that it can be processed by flatten-headers.py. +list(JOIN ALL_HEADER_PATHS_LIST " " ALL_HEADER_PATHS_STRING) + +#-------------------------------------------- +# Consolidated blis.h header creation +#-------------------------------------------- +# Creating a directory for the generated flatten headers. +file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}) +# Flatten header python script file which expand header contents in blis.h. 
+add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/flatten-headers.py -c -v1 + "${CMAKE_SOURCE_DIR}/frame/include/blis.h" + "${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" + "${PROJECT_BINARY_DIR}/include" + "${ALL_HEADER_PATHS_STRING}" + COMMENT "Generating monolithic blis header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" + ) +add_custom_target(flat-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h) +#-------------------------------------------- +# Consolidated cblas.h header creation +#-------------------------------------------- +# Flatten header python script file which expand header contents in cblas.h. +if(ENABLE_CBLAS) + add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/flatten-headers.py -c -v1 + "${CMAKE_SOURCE_DIR}/frame/compat/cblas/src/cblas.h" + "${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" + "${PROJECT_BINARY_DIR}/${include}" + "${ALL_HEADER_PATHS_STRING}" + COMMENT "Generating monolithic cblas header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" + ) + add_custom_target(flat-cblas-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) +endif() -# Flatten header python script file which expand header contents in blis.h -set(FLATTEN_PY "${CMAKE_SOURCE_DIR}/build/flatten-headers.py") -set(BLIS_H "blis.h") +#-------------------------------------------- +# Default linker definitions +#-------------------------------------------- +# NOTE: This section needs to reside before the inclusion of make_defs.mk +# files (just below), as most configurations' make_defs.mk don't tinker +# with things like LDFLAGS, but some do (or may), in which case they can +# manually override whatever they need. + +# Define the external libraries we may potentially need at link-time. 
+# Add libm only on Linux and only if Intel compiler is not used. +if((NOT WIN32) AND (NOT ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel"))) + set(LIBM -lm) +endif() +set(LIBMEMKIND -lmemkind) + +# Default linker flags. +# NOTE: -lpthread is needed unconditionally because BLIS uses pthread_once() +# to initialize itself in a thread-safe manner. The one exception to this +# rule: if --disable-system is given at configure-time, LIBPTHREAD is empty. +if(NOT WIN32) + set(LDFLAGS ${LIBM} ${LIBPTHREAD}) +endif() +# Add libmemkind to the link-time flags, if it was enabled at configure-time. +if(ENABLE_MEMKIND STREQUAL "yes") + list(APPEND LDFLAGS ${LIBMEMKIND}) +endif() -# Arguements for python script -set(C_COMMENT "-c") -set(VERBOSE "-v1") -set(INPUT "${CMAKE_SOURCE_DIR}/frame/include/${BLIS_H}") -set(OUTPUT "${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/${BLIS_H}") -set(TEMP_DIR "${INCLUDE}") -set(DIR_H_PATH "${HEADER_PATH}") +#-------------------------------------------- +# Configuration-agnostic flags +#-------------------------------------------- +# --- Warning flags --- -# Run python script to generate monolithic header at configuration time -execute_process( - COMMAND ${PYTHON_EXE} ${FLATTEN_PY} "${C_COMMENT}" "${VERBOSE}" "${INPUT}" "${OUTPUT}" "${TEMP_DIR}" "${DIR_H_PATH}" - RESULT_VARIABLE CMD_RESULT - OUTPUT_VARIABLE CMD_OUTPUT) -message( STATUS "Generating monolithic header file :" ${CMD_OUTPUT}) - -# Logic to generate the cblas.h in include folder. 
-set(CBLAS_H "cblas.h") -# Arguements for python script -set(C_COMMENT "-c") -set(VERBOSE "-v1") -set(INPUT "${CMAKE_SOURCE_DIR}/frame/compat/cblas/src/${CBLAS_H}") -set(OUTPUT "${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/${CBLAS_H}") -set(TEMP_DIR "${INCLUDE}") -set(DIR_H_PATH "${HEADER_PATH}") - -# Run python script to generate monolithic header at configuration time -execute_process( - COMMAND ${PYTHON_EXE} ${FLATTEN_PY} "${C_COMMENT}" "${VERBOSE}" "${INPUT}" "${OUTPUT}" "${TEMP_DIR}" "${DIR_H_PATH}" - RESULT_VARIABLE CMD_RESULT - OUTPUT_VARIABLE CMD_OUTPUT) -message( STATUS "Generating monolithic cblas header file :" ${CMD_OUTPUT}) - -# setting the blis version string -file (STRINGS "version" BLIS_VERSION) -set(BLIS_VERSION_STRING ${BLIS_VERSION}) -string(TIMESTAMP BUILD_DATE "%Y%m%d") -add_definitions(-DBLIS_VERSION_STRING="AOCL-BLIS ${BLIS_VERSION_STRING} Build ${BUILD_DATE}") -if (ENABLE_ASAN_TESTS) - set(STATIC_LIB_OPTIONS "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic-x86_64.lib" "${ASAN_DEPENDENCY_LIB_DIR}/clang_rt.asan_dynamic_runtime_thunk-x86_64.lib") -endif () - -# Set object libraries created in kernels directory to be added into BLIS library. -set(OBJECT_LIBRARIES - $ - $ - $ - $ - $ - $ - $ - $ - $ -) -# Ammend the list of object libraries to include zen4 paths as appropriate. -if(${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - set(OBJECT_LIBRARIES ${OBJECT_LIBRARIES} - $ - $ - $ - $ - $ - $ - ) +# Disable unused function warnings and stop compiling on first error for +# all compilers that accept such options: gcc, clang, and icc. 
+set(CWARNFLAGS -Wno-unused-function -Wfatal-errors) +if(NOT WIN32) + list(PREPEND CWARNFLAGS -Wall) endif() -if(BUILD_SHARED_LIBS) - add_library("${PROJECT_NAME}" SHARED ${CMAKE_SOURCE_DIR}/bli_config.h - ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h - ${headers} - ${OBJECT_LIBRARIES} - ) - if(ENABLE_OPENMP) - target_link_libraries("${PROJECT_NAME}" PRIVATE OpenMP::OpenMP_CXX) - endif() - target_compile_definitions("${PROJECT_NAME}" PUBLIC -DBLIS_IS_BUILDING_LIBRARY) - set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}") +# Disable tautological comparision warnings in clang. +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + list(APPEND CWARNFLAGS -Wno-tautological-compare) endif() -if(NOT BUILD_SHARED_LIBS) - add_library("${PROJECT_NAME}" STATIC ${CMAKE_SOURCE_DIR}/bli_config.h - ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h - ${headers} - ${OBJECT_LIBRARIES} - ) - set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${STATIC_LIB_OPTIONS}") + +# Add extra warning flags for Windows builds. +if(WIN32) + list(APPEND CWARNFLAGS -Wno-unused-variable -Wno-deprecated-declarations) endif() -# Enabling the address sanitizer tests. -if (ENABLE_ASAN_TESTS) - target_compile_options("${PROJECT_NAME}" PRIVATE -fsanitize=address /Od) +#Setting up the correct Windows Runtime Library. 
+if(WIN32) + cmake_policy(SET CMP0091 NEW) if(BUILD_SHARED_LIBS) - # /MD will be used implicitly - target_link_directories("${PROJECT_NAME}" PRIVATE ${ASAN_DEPENDENCY_LIB_DIR}) - target_link_libraries("${PROJECT_NAME}" PRIVATE clang_rt.asan_dynamic-x86_64 clang_rt.asan_dynamic_runtime_thunk-x86_64) - target_link_options("${PROJECT_NAME}" PRIVATE /wholearchive:clang_rt.asan_dynamic_runtime_thunk-x86_64.lib) + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") + else() + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") endif() endif() -link_directories(${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) -add_definitions(-DEXPMODULE) +# --- Symbol exporting flags (shared libraries only) -- + +# NOTE: These flags are only applied when building BLIS and not used by +# applications. -if(NOT CMAKE_INSTALL_PREFIX) - if(WIN32) - set(CMAKE_INSTALL_PREFIX - "${PROJECT_BINARY_DIR}/libblis" - CACHE PATH "Install path prefix, prepended onto install directories") - else() - set(CMAKE_INSTALL_PREFIX - "/usr/local/blis" - CACHE PATH "Install path prefix, prepended onto install directories") - endif() +# Determine default export behavior / visibility of symbols for gcc, icc and clang. +if(NOT WIN32) + if(EXPORT_SHARED STREQUAL "all") + # Export all symbols by default. + set(BUILD_SYMFLAGS -fvisibility=default) + else() # ifeq ($(EXPORT_SHARED),public) + # Hide all symbols by default and export only those that have been annotated + # as needing to be exported. + set(BUILD_SYMFLAGS -fvisibility=hidden) + endif() endif() -# Public blis headers -set(BLIS_PUBLIC_HEADERS - ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/${BLIS_H} - ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/${CBLAS_H} -) +# --- C Preprocessor flags --- +# Enable clock_gettime() in time.h. 
+set(CPPROCFLAGS -D_POSIX_C_SOURCE=200112L) + +# --- Threading flags --- +# NOTE: We don't have to explicitly omit -pthread when --disable-system is given +# since that option forces --enable-threading=none, and thus -pthread never gets +# added to begin with. +if(NOT WIN32) + if(THREADING_MODEL STREQUAL "pthreads") + set(CTHREADFLAGS "-pthread") + endif() +endif() -set_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${BLIS_PUBLIC_HEADERS}") +# --- #pragma omp simd flags (used for reference kernels only) --- +if(PRAGMA_OMP_SIMD) + if(WIN32) + set(COMPSIMDFLAGS /openmp:experimental) + else() + set(COMPSIMDFLAGS -fopenmp-simd) + endif() +endif() -install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +#-------------------------------------------- +# Compiler include path definitions +#-------------------------------------------- +# Obtain a list of header files #included inside of the bli_cntx_ref.c file. +# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these +# files will be needed when compiling bli_cntx_ref.c with the monolithic header. + +# Read content of bli_cntx_ref.c and put it in REF_KER_HEADERS_TEMP. +file(STRINGS ${CMAKE_SOURCE_DIR}/ref_kernels/bli_cntx_ref.c REF_KER_HEADERS_TEMP) +# Only keep the lines where there are includes. +list(FILTER REF_KER_HEADERS_TEMP INCLUDE REGEX "\#include") +# REF_KER_HEADERS has a list of all files that are included in bli_cntx_ref.c. +set(REF_KER_HEADERS "") +foreach(header ${REF_KER_HEADERS_TEMP}) + string(REGEX MATCH "\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*" helper ${header}) + list(APPEND REF_KER_HEADERS ${CMAKE_MATCH_1}) +endforeach() +# Remove blis.h from the list. 
+list(FILTER REF_KER_HEADERS EXCLUDE REGEX "blis.h") +set(REF_KER_H_PATHS "") +foreach(header_name ${REF_KER_HEADERS}) + foreach(header_dir ${FRAME_HEADER_DIRPATHS_LIST}) + if(EXISTS ${header_dir}/${header_name}) + list(APPEND REF_KER_H_PATHS ${header_dir}) + break() + endif() + endforeach() +endforeach() +# Remove duplicates, if they exist. +list(REMOVE_DUPLICATES REF_KER_H_PATHS) + +# Create list of include directories, to be used while creating the library. +# NOTE: We no longer need every header path in the source tree since we +# now #include the monolithic/flattened blis.h instead. +set(CINFLAGS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}) +list(APPEND CINFLAGS ${REF_KER_H_PATHS}) +# Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h. +list(APPEND CINFLAGS ${CMAKE_SOURCE_DIR}/frame/include) +# If CBLAS is enabled, we also include the path to the cblas.h directory so +# that the compiler will be able to find cblas.h as the CBLAS source code is +# being compiled. +if(ENABLE_CBLAS) + set(CBLAS_H_DIRPATH "") + foreach(header_dir ${FRAME_HEADER_DIRPATHS_LIST}) + if(EXISTS ${header_dir}/cblas.h) + list(APPEND CBLAS_H_DIRPATH ${header_dir}) + break() + endif() + endforeach() + list(APPEND CINFLAGS ${CBLAS_H_DIRPATH}) +endif() +#-------------------------------------------- +# Special preprocessor macro definitions +#-------------------------------------------- +# Define a C preprocessor macro to communicate the current version so that it +# can be embedded into the library and queried later. +set(VERS_DEF -DBLIS_VERSION_STRING="${VERSION_STRING}") + +# Define a C preprocessor flag that is *only* defined when BLIS is being +# compiled. (In other words, an application that #includes blis.h will not +# get this cpp macro.) +set(BUILD_CPPFLAGS -DBLIS_IS_BUILDING_LIBRARY) + +#-------------------------------------------- +# Add CMakeLists.txt from directories +#-------------------------------------------- +# Add config subdirectory. 
add_subdirectory(config) -add_subdirectory(ref_kernels) +# Add kernel subdirectory. add_subdirectory(kernels) +# Add framework directory. add_subdirectory(frame) +# Add AOCL DTL logs directory. add_subdirectory(aocl_dtl) -add_subdirectory(test) -add_subdirectory(testsuite) -add_subdirectory(bench) -if(ENABLE_TESTCPP_TESTING) - add_subdirectory(vendor/testcpp) +# Add subdirectory for each of the addons. +list(LENGTH ENABLE_ADDON addon_list_size) +if(addon_list_size GREATER 0) + add_subdirectory(addon) +endif() + +# Collect all object libraries that are required to build the blis library. +set(OBJECT_LIBRARIES "") +# Add objects from config. +foreach(conf ${CONFIG_LIST}) + list(APPEND OBJECT_LIBRARIES $) +endforeach() +# Add objects from kernels. +foreach(ker ${KERNEL_LIST}) + if(TARGET ${ker}_KERNELS) + list(APPEND OBJECT_LIBRARIES $) + endif() +endforeach() +# Add objects for reference kernels. +foreach(conf ${CONFIG_LIST}) + list(APPEND OBJECT_LIBRARIES $) + list(APPEND OBJECT_LIBRARIES $) +endforeach() +# Add objects for frame. +list(APPEND OBJECT_LIBRARIES $) +# Add objects for aocl-dtl. +list(APPEND OBJECT_LIBRARIES $) +# Add objects for addons. +foreach(addon ${ENABLE_ADDON}) + if(TARGET ${addon}_C99_ADDON) + list(APPEND OBJECT_LIBRARIES $) + endif() + if(TARGET ${addon}_C99_KERNEL_ADDON) + list(APPEND OBJECT_LIBRARIES $) + endif() + if(TARGET ${addon}_CXX_ADDON) + list(APPEND OBJECT_LIBRARIES $) + endif() +endforeach() + +#-------------------------------------------- +# Building BLIS Library +#-------------------------------------------- +# Public blis headers. +set(BLIS_PUBLIC_HEADERS + ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h + # Include AMD's C++ template header files in the list of headers + # to install. 
+ ${CMAKE_SOURCE_DIR}/vendor/cpp/blis.hh + ${CMAKE_SOURCE_DIR}/vendor/cpp/cblas.hh +) +if(ENABLE_CBLAS) + list(APPEND BLIS_PUBLIC_HEADERS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) endif() -if (ENABLE_BLASTEST) - add_subdirectory(blastest) + +# --- Library name and local paths --- +# From old CMake +if(WIN32) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_CRT_SECURE_NO_DEPRECATE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Oi") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP${CMake_MSVC_PARALLEL}") + set(INTR_GENERAL_LINK_FLAGS "${INTR_GENERAL_LINK_FLAGS} /RELEGE") + add_definitions(-DEXPMODULE) +endif() + +# Set up the library name. +if(WIN32) + set(LIBBLIS AOCL-LibBlis-Win) +else() + set(LIBBLIS blis) +endif() + +# Append if threading is required. +if(NOT (THREADING_MODEL STREQUAL "no")) + if(WIN32) + string(APPEND LIBBLIS -MT) + else() + string(APPEND LIBBLIS -mt) + endif() +endif() + +if(BUILD_SHARED_LIBS) + if(WIN32) + string(APPEND LIBBLIS -dll) + endif() + # Build shared library. + add_library(libblis SHARED ${OBJECT_LIBRARIES}) + target_link_libraries(libblis PRIVATE ${LDFLAGS}) + set_target_properties(libblis PROPERTIES LINKER_LANGUAGE C VERSION ${VERSION} SOVERSION ${SO_VERSION_MAJOR}) + set_target_properties(libblis PROPERTIES POSITION_INDEPENDENT_CODE ON) + if(THREADING_MODEL STREQUAL "openmp") + target_link_libraries(libblis PRIVATE OpenMP::OpenMP_C) + endif() +else() + # Build static library. + add_library(libblis STATIC ${OBJECT_LIBRARIES}) + set_target_properties(libblis PROPERTIES LINKER_LANGUAGE C) +endif() +add_dependencies(libblis flat-header) +if(ENABLE_CBLAS) + add_dependencies(libblis flat-cblas-header) +endif() +# Add headers as a property to the library. 
+set_target_properties(libblis PROPERTIES PUBLIC_HEADER "${BLIS_PUBLIC_HEADERS}") +set_target_properties(libblis PROPERTIES OUTPUT_NAME ${LIBBLIS}) +if(WIN32) + set_target_properties(libblis + PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" + ) +endif() + +# Install targets. +install(TARGETS libblis LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include/blis) + +# --- Primary targets --- +add_custom_target(libs DEPENDS libblis) + +# Multiple BLIS API testing targets. Result files are generated in ${CMAKE_BINARY_DIR}/testsuite. +add_subdirectory(testsuite EXCLUDE_FROM_ALL) + +# Check results of BLIS CPP Template tests +add_subdirectory(vendor/testcpp EXCLUDE_FROM_ALL) + +# Add BLAS tests if BLAS interface is enabled. +if(ENABLE_BLAS) + add_subdirectory(blastest EXCLUDE_FROM_ALL) +endif() + +# Add generic testing target. 
+set(available_testsuites checkblis) +if(ENABLE_BLAS) + list(APPEND available_testsuites checkblas) endif() +add_custom_target(check DEPENDS ${available_testsuites}) + +#-------------------------------------------- +# Clean-up +#-------------------------------------------- +# Add distclean target +add_custom_target(distclean + COMMAND ${CMAKE_BUILD_TOOL} clean + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/build/distclean.cmake + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Remove cmake_generated files and executables" +) \ No newline at end of file diff --git a/addon/CMakeLists.txt b/addon/CMakeLists.txt new file mode 100644 index 0000000000..6e950340ae --- /dev/null +++ b/addon/CMakeLists.txt @@ -0,0 +1,206 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +# Writing a function that will be used to generate the required object +# libraries for the required addons. +function(generate_addon_targets addon_target) + # Collect the paths of all files under the addon directory whose suffix is in the ADDON_C99_SUFS list. + get_filepaths_with_suffixes(LOCAL_SOURCE_C99_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_C99_SUFS}") + # We want to break the files above in 2 categories, files in kernel directory and the rest. + # Only list files in kernel directory (seeded from the C99 file list collected above). + set(LOCAL_KERNEL_FILES_C99 ${LOCAL_SOURCE_C99_FILES}) + list(FILTER LOCAL_KERNEL_FILES_C99 INCLUDE REGEX ${addon_target}/kernels/) + # All C99 files, except the ones in kernels directory. + list(REMOVE_ITEM LOCAL_SOURCE_C99_FILES ${LOCAL_KERNEL_FILES_C99}) + + # Collect all subdirectory paths that have at least one file with suffix in ADDON_H99_SUFS list. + get_dirpaths_with_suffixes(CADDONINCFLAGS "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_H99_SUFS}") + + # Only generate the object library if there is at least one source file. + list(LENGTH LOCAL_SOURCE_C99_FILES size) + if(size GREATER 0) + # Create an object library using the source file list above.
+ add_library(${addon_target}_C99_ADDON + OBJECT + ${LOCAL_SOURCE_C99_FILES} + ) + # Include the corresponding make_defs.cmake that holds the required compiler options. + include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) + # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. + # mimicing get-addon-c99flags-for + target_compile_options(${addon_target}_C99_ADDON + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-addon-c99flags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${addon_target}_C99_ADDON + PRIVATE + # in get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-addon-c99flags-for + ${BUILD_CPPFLAGS} + ) + target_include_directories(${addon_target}_C99_ADDON + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + # in get-addon-c99flags-for + ${CADDONINCFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${addon_target}_C99_ADDON PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${addon_target}_C99_ADDON PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${addon_target}_C99_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + add_dependencies(${addon_target}_C99_ADDON flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. + set_target_properties(${addon_target}_C99_ADDON PROPERTIES FOLDER object-libs-targets) + endif() + + # Only generate the object library if there is at least one source file. 
+ list(LENGTH LOCAL_KERNEL_FILES_C99 size) + if(size GREATER 0) + # Create an object library using the kernel source file list above. + add_library(${addon_target}_C99_KERNEL_ADDON + OBJECT + ${LOCAL_KERNEL_FILES_C99} + ) + # Include the corresponding make_defs.cmake that holds the required compiler options. + include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) + # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. + # mimicing get-addon-c99flags-for + target_compile_options(${addon_target}_C99_KERNEL_ADDON + PRIVATE + # load-var-for,CKOPTFLAGS + ${CKOPTFLAGS} + # load-var-for,CKVECFLAGS + ${CKVECFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-addon-kernel-c99flags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${addon_target}_C99_KERNEL_ADDON + PRIVATE + # in get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-addon-kernel-c99flags-for + ${BUILD_CPPFLAGS} + ) + target_include_directories(${addon_target}_C99_KERNEL_ADDON + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + # in get-addon-kernel-c99flags-for + ${CADDONINCFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${addon_target}_C99_KERNEL_ADDON PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${addon_target}_C99_KERNEL_ADDON PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${addon_target}_C99_KERNEL_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + add_dependencies(${addon_target}_C99_KERNEL_ADDON flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all 
together in IDE. + set_target_properties(${addon_target}_C99_KERNEL_ADDON PROPERTIES FOLDER object-libs-targets) + endif() + + # Collect all subdirectory paths that have at least one file with suffix in ADDON_CXX_SUFS list. + get_filepaths_with_suffixes(LOCAL_SOURCE_CXX_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_CXX_SUFS}") + + # Only generate the object library if there is at least one source file. + list(LENGTH LOCAL_SOURCE_CXX_FILES size) + if(size GREATER 0) + # Create an object library using the source file list above. + add_library(${addon_target}_CXX_ADDON + OBJECT + ${LOCAL_SOURCE_CXX_FILES} + ) + + # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. + # mimicing get-addon-cxxflags-for + target_compile_options(${addon_target}_CXX_ADDON + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cxxflags-for + ${CDBGFLAGS} + # get-noopt-cxxflags-for + ${CWARNFLAGS} + # get-noopt-cxxflags-for + ${CMISCFLAGS} + # get-noopt-cxxflags-for + ${CXXLANGFLAGS} + # in get-addon-cxxflags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${addon_target}_CXX_ADDON + PRIVATE + # in get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-addon-cxxflags-for + ${BUILD_CPPFLAGS} + ) + target_include_directories(${addon_target}_CXX_ADDON + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + # in get-addon-cxxflags-for + ${CADDONINCFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${addon_target}_CXX_ADDON PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${addon_target}_CXX_ADDON PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${addon_target}_CXX_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + 
add_dependencies(${addon_target}_CXX_ADDON flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. + set_target_properties(${addon_target}_CXX_ADDON PROPERTIES FOLDER object-libs-targets) + endif() +endfunction() + +# Generate targets for each of the addons. +foreach(ADDON ${ENABLE_ADDON}) + generate_addon_targets(${ADDON}) +endforeach() \ No newline at end of file diff --git a/aocl_dtl/CMakeLists.txt b/aocl_dtl/CMakeLists.txt index 3985350ab2..3757822f2d 100644 --- a/aocl_dtl/CMakeLists.txt +++ b/aocl_dtl/CMakeLists.txt @@ -1,10 +1,59 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ## +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. ## -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/aocldtl.c - ${CMAKE_CURRENT_SOURCE_DIR}/aocldtl_blis.c - ${CMAKE_CURRENT_SOURCE_DIR}/aoclfal.c - ${CMAKE_CURRENT_SOURCE_DIR}/aoclflist.c - ${CMAKE_CURRENT_SOURCE_DIR}/aoclos.c - ) +# Collect all subdirectory paths that have at least one file with suffix in AOCLDTL_SRC_SUFS list. +get_filepaths_with_suffixes(LOCAL_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR} "${AOCLDTL_SRC_SUFS}") + +# Create an object library using the source file list above. +add_library(AOCL_DTL + OBJECT + ${LOCAL_SOURCE_FILES} + ) + +# Include the corresponding make_defs.cmake that holds the required compiler options. +include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) +# Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. 
+# mimicing get-aocldtl-cflags-for +target_compile_options(AOCL_DTL + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-aocldtl-cflags-for + ${BUILD_SYMFLAGS} + ) +target_compile_definitions(AOCL_DTL + PRIVATE + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-aocldtl-cflags-for + ${BUILD_CPPFLAGS} + # in get-aocldtl-cflags-for + ${CPPROCFLAGS} + ) +target_include_directories(AOCL_DTL + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + ) +if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(AOCL_DTL PRIVATE OpenMP::OpenMP_C) +elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(AOCL_DTL PRIVATE ${CTHREADFLAGS}) +endif() +if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(AOCL_DTL PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() +add_dependencies(AOCL_DTL flat-header) +# Put all those targets under object-libs-targets folder name so that they appear all together in IDE. +set_target_properties(AOCL_DTL PROPERTIES FOLDER object-libs-targets) diff --git a/blastest/CMakeLists.txt b/blastest/CMakeLists.txt index d35629f53a..6b0f21e249 100644 --- a/blastest/CMakeLists.txt +++ b/blastest/CMakeLists.txt @@ -1,13 +1,131 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## -set(F2C_LIB "libf2c") +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. 
+if(NOT DEFINED BLIS_INSTALL_PATH) + set(DIST_PATH ${CMAKE_BINARY_DIR}) + set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) +else() + set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) + set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) +endif() -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/f2c) +# Include the corresponding make_defs.cmake that holds the required compiler options. +include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) -# Generate F2C library -add_library("${F2C_LIB}" STATIC ) -set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C) +# Create a static library using the sources in f2c directory. +file(GLOB f2c_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/f2c/*.c) +add_library(f2c STATIC ${f2c_sources}) +target_compile_options(f2c + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + ${CWARNFLAGS} + ${CPICFLAGS} + ${CMISCFLAGS} + ${CLANGFLAGS} + # Suppress warnings about uninitialized functions + -Wno-maybe-uninitialized -Wno-parentheses -Wfatal-errors + ) +target_compile_definitions(f2c + PRIVATE + # in get-noopt-cflags-for + ${VERS_DEF} + ${CPPROCFLAGS} + -DHAVE_BLIS_H + ) +target_include_directories(f2c + BEFORE + PRIVATE + # Add local header paths + ${CMAKE_CURRENT_SOURCE_DIR}/f2c + # and the path to blis.h + ${INC_PATH} + ) +target_link_libraries(f2c PRIVATE ${LDFLAGS}) +if(THREADING_MODEL STREQUAL "openmp") + target_link_libraries(f2c PRIVATE OpenMP::OpenMP_C) +endif() +# Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE. +set_target_properties(f2c PROPERTIES FOLDER blastest-targets) +add_dependencies(f2c flat-header) +# Gather all local source files. 
+file(GLOB blastest_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/src/*.c) +list(TRANSFORM blastest_sources REPLACE ${CMAKE_CURRENT_SOURCE_DIR}/src/ "") -add_subdirectory(f2c) -add_subdirectory(src) +# Create one executable for each of the sources. +foreach(source ${blastest_sources}) + string(REPLACE .c "" exec_name ${source}) + add_executable(${exec_name}.x src/${source}) + target_compile_options(${exec_name}.x + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + ${CWARNFLAGS} + ${CPICFLAGS} + ${CMISCFLAGS} + ${CLANGFLAGS} + # Suppress warnings about parentheses and maybe-uninitialized variables + -Wno-parentheses -Wno-maybe-uninitialized + ) + target_compile_definitions(${exec_name}.x + PRIVATE + # in get-noopt-cflags-for + ${VERS_DEF} + ${CPPROCFLAGS} + -DHAVE_BLIS_H + ) + target_include_directories(${exec_name}.x + BEFORE + PRIVATE + # Add local header paths + ${CMAKE_CURRENT_SOURCE_DIR}/f2c + # and the path to blis.h + ${INC_PATH} + ) + target_link_libraries(${exec_name}.x PRIVATE f2c libblis ${LDFLAGS}) + if(THREADING_MODEL STREQUAL "openmp") + target_link_libraries(${exec_name}.x PRIVATE OpenMP::OpenMP_C) + endif() + set_target_properties(${exec_name}.x PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + # Put all those targets under blastest-targets folder name so that they appear all together in IDE. + set_target_properties(${exec_name}.x PROPERTIES FOLDER blastest-targets) + # Add a target for running the tests. Rules are different for level-1 APIs, compared to levels 2 and 3.
+ if(${exec_name} MATCHES 1) + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/out.${exec_name} + COMMAND ${exec_name}.x > ${CMAKE_BINARY_DIR}/out.${exec_name} + COMMENT "Running ${exec_name}.x with output redirected to ${CMAKE_BINARY_DIR}/out.${exec_name}" + DEPENDS ${exec_name}.x + WORKING_DIRECTORY $ + VERBATIM + ) + else()# name has 2 or 3 + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/out.${exec_name} + COMMAND ${exec_name}.x < ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in + COMMENT "Running ${exec_name}.x with output saved to ${CMAKE_BINARY_DIR}/out.${exec_name}" + DEPENDS ${exec_name}.x + WORKING_DIRECTORY $ + VERBATIM + ) + endif() + add_custom_target(run-${exec_name} DEPENDS ${CMAKE_BINARY_DIR}/out.${exec_name}) + # Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE. + set_target_properties(run-${exec_name} PROPERTIES FOLDER blastest-targets) + list(APPEND test_executables "run-${exec_name}") +endforeach() + +add_custom_target(testblas DEPENDS ${test_executables}) +add_custom_target(checkblas + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blastest.py ${CMAKE_BINARY_DIR} + DEPENDS testblas + ) +# Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE. +set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets) \ No newline at end of file diff --git a/blastest/f2c/CMakeLists.txt b/blastest/f2c/CMakeLists.txt deleted file mode 100644 index 87ec3b6a5b..0000000000 --- a/blastest/f2c/CMakeLists.txt +++ /dev/null @@ -1,59 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${F2C_LIB}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/abs.c - ${CMAKE_CURRENT_SOURCE_DIR}/acos.c - ${CMAKE_CURRENT_SOURCE_DIR}/asin.c - ${CMAKE_CURRENT_SOURCE_DIR}/atan.c - ${CMAKE_CURRENT_SOURCE_DIR}/atn2.c - ${CMAKE_CURRENT_SOURCE_DIR}/close.c - ${CMAKE_CURRENT_SOURCE_DIR}/cnjg.c - ${CMAKE_CURRENT_SOURCE_DIR}/cos.c - ${CMAKE_CURRENT_SOURCE_DIR}/cosh.c - ${CMAKE_CURRENT_SOURCE_DIR}/dim.c - ${CMAKE_CURRENT_SOURCE_DIR}/div.c - ${CMAKE_CURRENT_SOURCE_DIR}/dolio.c - ${CMAKE_CURRENT_SOURCE_DIR}/endfile.c - ${CMAKE_CURRENT_SOURCE_DIR}/epsilon.c - ${CMAKE_CURRENT_SOURCE_DIR}/err.c - ${CMAKE_CURRENT_SOURCE_DIR}/exit_.c - ${CMAKE_CURRENT_SOURCE_DIR}/exp.c - ${CMAKE_CURRENT_SOURCE_DIR}/fmt.c - ${CMAKE_CURRENT_SOURCE_DIR}/fmtlib.c - ${CMAKE_CURRENT_SOURCE_DIR}/h_dnnt.c - ${CMAKE_CURRENT_SOURCE_DIR}/hl_cmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/i_dnnt.c - ${CMAKE_CURRENT_SOURCE_DIR}/i_len.c - ${CMAKE_CURRENT_SOURCE_DIR}/imag.c - ${CMAKE_CURRENT_SOURCE_DIR}/int.c - ${CMAKE_CURRENT_SOURCE_DIR}/l_cmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/lg10.c - ${CMAKE_CURRENT_SOURCE_DIR}/log.c - ${CMAKE_CURRENT_SOURCE_DIR}/lread.c - ${CMAKE_CURRENT_SOURCE_DIR}/lwrite.c - ${CMAKE_CURRENT_SOURCE_DIR}/mod.c - ${CMAKE_CURRENT_SOURCE_DIR}/nint.c - ${CMAKE_CURRENT_SOURCE_DIR}/open.c - ${CMAKE_CURRENT_SOURCE_DIR}/pow.c - ${CMAKE_CURRENT_SOURCE_DIR}/prod.c - ${CMAKE_CURRENT_SOURCE_DIR}/rdfmt.c - ${CMAKE_CURRENT_SOURCE_DIR}/rewind.c - ${CMAKE_CURRENT_SOURCE_DIR}/rsfe.c - ${CMAKE_CURRENT_SOURCE_DIR}/s_cmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/s_copy.c - ${CMAKE_CURRENT_SOURCE_DIR}/s_stop.c - ${CMAKE_CURRENT_SOURCE_DIR}/sfe.c - ${CMAKE_CURRENT_SOURCE_DIR}/sig_die.c - ${CMAKE_CURRENT_SOURCE_DIR}/sign.c - ${CMAKE_CURRENT_SOURCE_DIR}/sin.c - ${CMAKE_CURRENT_SOURCE_DIR}/sinh.c - ${CMAKE_CURRENT_SOURCE_DIR}/sqrt.c - ${CMAKE_CURRENT_SOURCE_DIR}/tan.c - ${CMAKE_CURRENT_SOURCE_DIR}/tanh.c - ${CMAKE_CURRENT_SOURCE_DIR}/util.c - ${CMAKE_CURRENT_SOURCE_DIR}/wref.c - 
${CMAKE_CURRENT_SOURCE_DIR}/wrtfmt.c - ${CMAKE_CURRENT_SOURCE_DIR}/wsfe.c - ${CMAKE_CURRENT_SOURCE_DIR}/wsle.c - ) diff --git a/blastest/f2c/open.c b/blastest/f2c/open.c index 2834fd9463..12e5f02b21 100644 --- a/blastest/f2c/open.c +++ b/blastest/f2c/open.c @@ -28,6 +28,7 @@ use or performance of this software. #include #endif #ifdef _MSC_VER +#include #define access _access #endif #include "f2c.h" diff --git a/blastest/src/CMakeLists.txt b/blastest/src/CMakeLists.txt deleted file mode 100644 index 69274a5547..0000000000 --- a/blastest/src/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -add_executable(cblat1 cblat1.c) -target_link_libraries(cblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(cblat2 cblat2.c) -target_link_libraries(cblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(cblat3 cblat3.c) -target_link_libraries(cblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(dblat1 dblat1.c) -target_link_libraries(dblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(dblat2 dblat2.c) -target_link_libraries(dblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(dblat3 dblat3.c) -target_link_libraries(dblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(sblat1 sblat1.c) -target_link_libraries(sblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(sblat2 sblat2.c) -target_link_libraries(sblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(sblat3 sblat3.c) -target_link_libraries(sblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(zblat1 zblat1.c) -target_link_libraries(zblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(zblat2 zblat2.c) -target_link_libraries(zblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) - -add_executable(zblat3 zblat3.c) -target_link_libraries(zblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" ) diff --git a/build/bli_win_config.h.in 
b/build/bli_win_config.h.in deleted file mode 100644 index 4645b5cf95..0000000000 --- a/build/bli_win_config.h.in +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. - */ - -#ifndef BLIS_CONFIG_H -#define BLIS_CONFIG_H - -#cmakedefine AOCL_DYNAMIC - -#cmakedefine AOCL_BLIS_ZEN - -#cmakedefine BLIS_ENABLE_OPENMP - -#cmakedefine BLIS_ENABLE_JRIR_SLAB - -#cmakedefine BLIS_ENABLE_JRIR_RR - -#cmakedefine BLIS_ENABLE_PBA_POOLS - -#cmakedefine BLIS_ENABLE_SBA_POOLS - -#cmakedefine BLIS_ENABLE_MEM_TRACING - -#cmakedefine BLIS_INT_TYPE_SIZE @INT_TYPE_SIZE@ - -#cmakedefine BLIS_BLAS_INT_TYPE_SIZE @BLAS_INT_TYPE_SIZE@ - -#cmakedefine BLIS_ENABLE_BLAS - -#cmakedefine BLIS_ENABLE_CBLAS - -#cmakedefine BLIS_ENABLE_MIXED_DT - -#cmakedefine BLIS_ENABLE_MIXED_DT_EXTRA_MEM - -#cmakedefine BLIS_ENABLE_SUP_HANDLING - -#cmakedefine BLIS_ENABLE_MEMKIND - -#cmakedefine BLIS_ENABLE_TRSM_PREINVERSION - -#cmakedefine BLIS_ENABLE_PRAGMA_OMP_SIMD - -#cmakedefine BLIS_ENABLE_SANDBOX - -#cmakedefine BLIS_ENABLE_SHARED - -#cmakedefine BLIS_ENABLE_COMPLEX_RETURN_INTEL - -#cmakedefine DISABLE_BLIS_ARCH_TYPE - -#cmakedefine DISABLE_BLIS_MODEL_TYPE - -#cmakedefine __blis_arch_type_name "@rename_blis_arch_type@" - -#cmakedefine __blis_model_type_name "@rename_blis_model_type@" - -#endif diff --git a/build/cmake/bli_addon.h.in b/build/cmake/bli_addon.h.in new file mode 100644 index 0000000000..8dc2e6727c --- /dev/null +++ b/build/cmake/bli_addon.h.in @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ */ + +#ifndef BLIS_ADDON_H +#define BLIS_ADDON_H + +#if ${ENABLE_ADDONS_01} +#define BLIS_ENABLE_ADDONS +#else +#define BLIS_DISABLE_ADDONS +#endif + +// Enabled addons +${ADDON_LIST_INCLUDES} + +#endif diff --git a/build/cmake/bli_config.h.in b/build/cmake/bli_config.h.in new file mode 100644 index 0000000000..9cfbcdcc5f --- /dev/null +++ b/build/cmake/bli_config.h.in @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + */ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + +// Enabled configuration "family" (config_name) +${CONFIG_NAME_DEFINE} + +// Enabled sub-configurations (config_list) +${CONFIG_LIST_DEFINES} + +// Enabled kernel sets (kernel_list) +${KERNEL_LIST_DEFINES} + +//This macro is enabled only for ZEN family configurations. +//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes. +#if ${ENABLE_AOCL_ZEN_01} +#define AOCL_BLIS_ZEN +#endif + +#if ${ENABLE_AOCL_DYNAMIC_01} +#define AOCL_DYNAMIC +#endif + +#if ${ENABLE_SYSTEM_01} +#define BLIS_ENABLE_SYSTEM +#else +#define BLIS_DISABLE_SYSTEM +#endif + +#if ${ENABLE_OPENMP_01} +#define BLIS_ENABLE_OPENMP +#endif + +#if ${ENABLE_PTHREADS_01} +#define BLIS_ENABLE_PTHREADS +#endif + +#if ${ENABLE_JRIR_SLAB_01} +#define BLIS_ENABLE_JRIR_SLAB +#endif + +#if ${ENABLE_JRIR_RR_01} +#define BLIS_ENABLE_JRIR_RR +#endif + +#if ${ENABLE_PBA_POOLS_01} +#define BLIS_ENABLE_PBA_POOLS +#else +#define BLIS_DISABLE_PBA_POOLS +#endif + +#if ${ENABLE_SBA_POOLS_01} +#define BLIS_ENABLE_SBA_POOLS +#else +#define BLIS_DISABLE_SBA_POOLS +#endif + +#if ${ENABLE_MEM_TRACING_01} +#define BLIS_ENABLE_MEM_TRACING +#else +#define BLIS_DISABLE_MEM_TRACING +#endif + +#if ${INT_TYPE_SIZE} == 64 +#define BLIS_INT_TYPE_SIZE 64 +#elif ${INT_TYPE_SIZE} == 32 +#define BLIS_INT_TYPE_SIZE 32 +#else +// determine automatically +#endif + +#if ${BLAS_INT_TYPE_SIZE} == 64 +#define BLIS_BLAS_INT_TYPE_SIZE 64 +#elif ${BLAS_INT_TYPE_SIZE} == 32 
+#define BLIS_BLAS_INT_TYPE_SIZE 32 +#else +// determine automatically +#endif + +#ifndef BLIS_ENABLE_BLAS +#ifndef BLIS_DISABLE_BLAS +#if ${ENABLE_BLAS_01} +#define BLIS_ENABLE_BLAS +#else +#define BLIS_DISABLE_BLAS +#endif +#endif +#endif + +#ifndef BLIS_ENABLE_CBLAS +#ifndef BLIS_DISABLE_CBLAS +#if ${ENABLE_CBLAS_01} +#define BLIS_ENABLE_CBLAS +#else +#define BLIS_DISABLE_CBLAS +#endif +#endif +#endif + +// If the CBLAS compatibility layer was enabled while the BLAS layer +// was not enabled, we must enable the BLAS layer here. Also undefine +// BLIS_DISABLE_BLAS to ensure consistency. +#ifdef BLIS_ENABLE_CBLAS +#ifndef BLIS_ENABLE_BLAS +#define BLIS_ENABLE_BLAS +#endif +#undef BLIS_DISABLE_BLAS +#endif // BLIS_ENABLE_CBLAS + +#ifndef BLIS_ENABLE_MIXED_DT +#ifndef BLIS_DISABLE_MIXED_DT +#if ${ENABLE_MIXED_DT_01} +#define BLIS_ENABLE_MIXED_DT +#else +#define BLIS_DISABLE_MIXED_DT +#endif +#endif +#endif + +#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM +#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM +#if ${ENABLE_MIXED_DT_EXTRA_MEM_01} +#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM +#else +#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM +#endif +#endif +#endif + +#if ${ENABLE_SUP_HANDLING_01} +#define BLIS_ENABLE_SUP_HANDLING +#else +#define BLIS_DISABLE_SUP_HANDLING +#endif + +#if ${ENABLE_MEMKIND_01} +#define BLIS_ENABLE_MEMKIND +#else +#define BLIS_DISABLE_MEMKIND +#endif + +#if ${ENABLE_TRSM_PREINVERSION_01} +#define BLIS_ENABLE_TRSM_PREINVERSION +#else +#define BLIS_DISABLE_TRSM_PREINVERSION +#endif + +#if ${ENABLE_PRAGMA_OMP_SIMD_01} +#define BLIS_ENABLE_PRAGMA_OMP_SIMD +#else +#define BLIS_DISABLE_PRAGMA_OMP_SIMD +#endif + +#if ${ENABLE_SANDBOX_01} +#define BLIS_ENABLE_SANDBOX +#else +#define BLIS_DISABLE_SANDBOX +#endif + +#if ${ENABLE_SHARED_01} +#define BLIS_ENABLE_SHARED +#else +#define BLIS_DISABLE_SHARED +#endif + +#if ${COMPLEX_RETURN_INTEL_01} +#define BLIS_ENABLE_COMPLEX_RETURN_INTEL +#else +#define BLIS_DISABLE_COMPLEX_RETURN_INTEL +#endif + +#if ${DISABLE_BLIS_ARCH_TYPE_01} 
+#define DISABLE_BLIS_ARCH_TYPE +#define DISABLE_BLIS_MODEL_TYPE +#endif + +#define __blis_arch_type_name "${RENAME_BLIS_ARCH_TYPE}" +#define __blis_model_type_name "${RENAME_BLIS_MODEL_TYPE}" + +#endif diff --git a/build/cmake/check-blastest.py b/build/cmake/check-blastest.py new file mode 100644 index 0000000000..f2b641c766 --- /dev/null +++ b/build/cmake/check-blastest.py @@ -0,0 +1,31 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## + +# Import modules +import os +import sys + +def check_blastest(): + results_file_path = sys.argv[1] + results_directory = os.listdir(results_file_path) + has_failure = False + is_empty = False + for fname in results_directory: + if os.path.isfile(results_file_path + os.sep + fname) and "out" in fname: + file = open(results_file_path + os.sep + fname, 'r') + # read all content of a file + content = file.read() + if content == "": + is_empty = True + # check if string present in a file + if "*****" in content: + has_failure = True + if has_failure: + print("\033[0;31m At least one BLAS test failed. :( \033[0m") + print("\033[0;31m Please see the corresponding out.* for details. \033[0m") + elif is_empty: + print("\033[0;31m At least one BLAS test resulted without a PASS. :( \033[0m") + print("\033[0;31m Please ensure that the corresponding out.* was generated correctly. \033[0m") + else: + print("\033[0;32m All BLAS tests passed! \033[0m") + +check_blastest() \ No newline at end of file diff --git a/build/cmake/check-blistest.py b/build/cmake/check-blistest.py new file mode 100644 index 0000000000..1d285ccf78 --- /dev/null +++ b/build/cmake/check-blistest.py @@ -0,0 +1,22 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc. 
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##

# Import modules
import os
import sys

def check_blistest():
    """Summarize the BLIS testsuite output file named by sys.argv[1].

    Prints a failure message if the file contains "FAILURE", a different
    error message if it contains no "PASS" at all, and a success message
    otherwise.
    """
    results_file = sys.argv[1]
    # Read all content of the file; the handle is closed on leaving the block.
    with open(results_file, 'r') as file:
        content = file.read()
    # Check which marker strings are present in the file.
    if "FAILURE" in content:
        print("\033[0;31m At least one BLIS test failed. :( \033[0m")
        print("\033[0;31m Please see the corresponding output.testsuite* for details. \033[0m")
    elif "PASS" not in content:
        print("\033[0;31m No BLIS test resulted in PASS. :( \033[0m")
        print("\033[0;31m Please ensure that the corresponding output.testsuite* was generated correctly. \033[0m")
    else:
        print("\033[0;32m All BLIS tests passed! \033[0m")


# Run only when executed as a script, so that importing this file does not
# try to read sys.argv[1].
if __name__ == "__main__":
    check_blistest()
To build a completely generic") + print(" implementation, use the 'generic' configuration.") + print(" ") + print( " Options:" ) + print( " " ) + print( " -DCMAKE_INSTALL_PREFIX=PREFIX" ) + print( " " ) + print( " The common installation prefix for all files." ) + print( " If this option is not given, PREFIX defaults to '/usr/local/'." ) + print( " on UNIX and c:/Program Files/${PROJECT_NAME} on Windows." ) + print( " " ) + print( " -DENABLE_DEBUG=DEBUG" ) + print( " " ) + print( " Enable debugging symbols in the library." ) + print( " DEBUG is 'off' by default. If argument" ) + print( " DEBUG is given as 'opt', then optimization flags are" ) + print( " kept in the framework, otherwise optimization is" ) + print( " turned off. Available options are 'opt', 'noopt' and 'off'." ) + print( " " ) + print( " --disable-static, --enable-static" ) + print( " " ) + print( " Disable (enabled by default) building BLIS as a static" ) + print( " library. If the static library build is disabled, the" ) + print( " shared library build must remain enabled." ) + print( " " ) + print( " --disable-shared, --enable-shared" ) + print( " " ) + print( " Disable (enabled by default) building BLIS as a shared" ) + print( " library. If the shared library build is disabled, the" ) + print( " static library build must remain enabled." ) + print( " " ) + print( " -DEXPORT_SHARED=[SYMBOLS]" ) + print( " " ) + print( " Specify the subset of library symbols that are exported" ) + print( " within a shared library. Valid values for SYMBOLS are:" ) + print( " 'public' (the default) and 'all'. By default, only" ) + print( " functions and variables that belong to public APIs are" ) + print( " exported in shared libraries. However, the user may" ) + print( " instead export all symbols in BLIS, even those that were" ) + print( " intended for internal use only. 
Note that the public APIs" ) + print( " encompass all functions that almost any user would ever" ) + print( " want to call, including the BLAS/CBLAS compatibility APIs" ) + print( " as well as the basic and expert interfaces to the typed" ) + print( " and object APIs that are unique to BLIS. Also note that" ) + print( " changing this option to 'all' will have no effect in some" ) + print( " environments, such as when compiling with clang on" ) + print( " Windows." ) + print( " " ) + print( " -DENABLE_THREADING=MODEL" ) + print( " " ) + print( " Enable threading in the library, using threading model" ) + print( " MODEL={openmp, pthreads, no}. If MODEL=no threading will be" ) + print( " disabled. The default is 'no'." ) + print( " " ) + print( " -DENABLE_SYSTEM=ON or -DENABLE_SYSTEM=OFF") + print( " " ) + print( " Enable conventional operating system support, such as" ) + print( " pthreads for thread-safety. The default state is enabled." ) + print( " However, in rare circumstances you may wish to configure" ) + print( " BLIS for use with a minimal or nonexistent operating" ) + print( " system (e.g. hardware simulators). In these situations," ) + print( " -DENABLE_SYSTEM=OFF may be used to jettison all compile-time" ) + print( " and link-time dependencies outside of the standard C" ) + print( " library. When disabled, this option also forces the use" ) + print( " of -DENABLE_THREADING=no." ) + print( " " ) + print( " -DENABLE_PBA_POOLS=ON or -DENABLE_PBA_POOLS=OFF" ) + print( " -DENABLE_SBA_POOLS=ON or -DENABLE_SBA_POOLS=OFF" ) + print( " " ) + print( " Disable (enabled by default) use of internal memory pools" ) + print( " within the packing block allocator (pba) and/or the small" ) + print( " block allocator (sba). The former is used to allocate" ) + print( " memory used to pack submatrices while the latter is used" ) + print( " to allocate control/thread tree nodes and thread" ) + print( " communicators. 
Both allocations take place in the context" ) + print( " of level-3 operations. When the pba is disabled, the" ) + print( " malloc()-like function specified by BLIS_MALLOC_POOL is" ) + print( " called on-demand whenever a packing block is needed, and" ) + print( " when the sba is disabled, the malloc()-like function" ) + print( " specified by BLIS_MALLOC_INTL is called whenever a small" ) + print( " block is needed, with the two allocators calling free()-" ) + print( " like functions BLIS_FREE_POOL and BLIS_FREE_INTL," ) + print( " respectively when blocks are released. When enabled," ) + print( " either or both pools are populated via the same functions" ) + print( " mentioned previously, and henceforth blocks are checked" ) + print( " out and in. The library quickly reaches a state in which" ) + print( " it no longer needs to call malloc() or free(), even" ) + print( " across many separate level-3 operation invocations." ) + print( " " ) + print( " -DENABLE_MEM_TRACING=ON or -DENABLE_MEM_TRACING=OFF" ) + print( " " ) + print( " Enable (disable by default) output to stdout that traces" ) + print( " the allocation and freeing of memory, including the names" ) + print( " of the functions that triggered the allocation/freeing." ) + print( " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." ) + print( " Please use only for informational/debugging purposes." ) + print( " " ) + print( " -DINT_SIZE=SIZE" ) + print( " " ) + print( " Set the size (in bits) of internal BLIS integers and" ) + print( " integer types used in native BLIS interfaces. The" ) + print( " default integer type size is architecture dependent." ) + print( " (Hint: You can always find this value printed at the" ) + print( " beginning of the testsuite output.)" ) + print( " " ) + print( " -DBLAS_TYPE_SIZE=SIZE" ) + print( " " ) + print( " Set the size (in bits) of integer types in external" ) + print( " BLAS and CBLAS interfaces, if enabled. 
The default" ) + print( " integer type size used in BLAS/CBLAS is 32 bits." ) + print( " " ) + print( " -DENABLE_BLAS=ON or -DENABLE_BLAS=OFF" ) + print( " " ) + print( " Disable (enabled by default) building the BLAS" ) + print( " compatibility layer." ) + print( " " ) + print( " -DENABLE_CBLAS=ON or -DENABLE_CBLAS=OFF" ) + print( " " ) + print( " Enable (disabled by default) building the CBLAS" ) + print( " compatibility layer. This automatically enables the" ) + print( " BLAS compatibility layer as well." ) + print( " " ) + print( " -DENABLE_MIXED_DT=ON or -DENABLE_MIXED_DT=OFF" ) + print( " " ) + print( " Disable (enabled by default) support for mixing the" ) + print( " storage domain and/or storage precision of matrix" ) + print( " operands for the gemm operation, as well as support" ) + print( " for computing in a precision different from one or" ) + print( " both of matrices A and B." ) + print( " " ) + print( " -DENABLE_MIXED_DT_EXTRA_MEM=ON or -DENABLE_MIXED_DT_EXTRA_MEM=OFF" ) + print( " " ) + print( " Disable (enabled by default) support for additional" ) + print( " mixed datatype optimizations that require temporarily" ) + print( " allocating extra memory--specifically, a single m x n" ) + print( " matrix (per application thread) whose storage datatype" ) + print( " is equal to the computation datatype. This option may" ) + print( " only be enabled when mixed domain/precision support is" ) + print( " enabled." ) + print( " " ) + print( " -DENABLE_SUP_HANDLING=ON or -DENABLE_SUP_HANDLING=OFF" ) + print( " " ) + print( " Disable (enabled by default) handling of small/skinny" ) + print( " matrix problems via separate code branches. When disabled," ) + print( " these small/skinny level-3 operations will be performed by" ) + print( " the conventional implementation, which is optimized for" ) + print( " medium and large problems. Note that what qualifies as" ) + print( " \"small\" depends on thresholds that may vary by sub-" ) + print( " configuration." 
) + print( " " ) + print( " -DENABLE_ADDON=\"NAME1[;NAME2;...]\" (Linux only)") + print( " " ) + print( " Enable the code provided by an addon. An addon consists" ) + print( " of a separate directory of code that provides additional" ) + print( " APIs, implementations, and/or operations that would" ) + print( " otherwise not be present within a build of BLIS." ) + print( " To enable a single addon named NAME1, set -DENABLE_ADDON=NAME1." ) + print( " To enable multiple addons, a ';'-separated list enclosed in \"\"") + print( " needs to be provided. For example, -DENABLE_ADDON=\"NAME1;NAME2\".") + print(" By default, no addons are enabled.") + print( " " ) + # Sandbox functionality is currently disabled in CMake. + #print( " -DENABLE_SANDBOX=NAME" ) + #print( " " ) + #print( " Enable a separate sandbox implementation of gemm. This" ) + #print( " option disables BLIS's conventional gemm implementation" ) + #print( " (which shares common infrastructure with other level-3" ) + #print( " operations) and instead compiles and uses the code in" ) + #print( " the NAME directory, which is expected to be a sub-" ) + #print( " directory of 'sandbox'. By default, no sandboxes are" ) + #print( " enabled." ) + #print( " " ) + print( " -DENABLE_MEMKIND=ON or -DENABLE_MEMKIND=OFF" ) + print( " " ) + print( " Forcibly enable or disable the use of libmemkind's" ) + print( " hbw_malloc() and hbw_free() as substitutes for malloc()" ) + print( " and free(), respectively, when allocating memory for" ) + print( " BLIS's memory pools, which are used to manage buffers" ) + print( " into which matrices are packed. The default behavior" ) + print( " for this option is environment-dependent; if configure" ) + print( " detects the presence of libmemkind, libmemkind is used" ) + print( " by default, and otherwise it is not used by default." 
) + print( " " ) + print( " -DTHREAD_PART_JRIR=METHOD" ) + print( " " ) + print( " Request a method of assigning micropanels to threads in" ) + print( " the JR and IR loops. Valid values for METHOD are 'slab'" ) + print( " and 'rr'. Using 'slab' assigns (as much as possible)" ) + print( " contiguous regions of micropanels to each thread while" ) + print( " using 'rr' assigns micropanels to threads in a round-" ) + print( " robin fashion. The chosen method also applies during" ) + print( " the packing of A and B. The default method is 'slab'." ) + print( " NOTE: Specifying this option constitutes a request," ) + print( " which may be ignored in select situations if the" ) + print( " implementation has a good reason to do so." ) + print( " " ) + print( " -DENABLE_TRSM_PREINVERSION=ON or -DENABLE_TRSM_PREINVERSION=OFF" ) + print( " " ) + print( " Disable (enabled by default) pre-inversion of triangular" ) + print( " matrix diagonals when performing trsm. When pre-inversion" ) + print( " is enabled, diagonal elements are inverted outside of the" ) + print( " microkernel (e.g. during packing) so that the microkernel" ) + print( " can use multiply instructions. When disabled, division" ) + print( " instructions are used within the microkernel. Executing" ) + print( " these division instructions within the microkernel will" ) + print( " incur a performance penalty, but numerical robustness will" ) + print( " improve for certain cases involving denormal numbers that" ) + print( " would otherwise result in overflow in the pre-inverted" ) + print( " values." ) + print( " " ) + print( " -DFORCE_VERSION_STRING=STRING" ) + print( " " ) + print( " Force configure to use an arbitrary version string" ) + print( " STRING. This option may be useful when repackaging" ) + print( " custom versions of BLIS by outside organizations." 
) + print( " " ) + print( " -DCOMPLEX_RETURN=gnu or -DCOMPLEX_RETURN=intel or -DCOMPLEX_RETURN=default" ) + print( " " ) + print( " Specify the way in which complex numbers are returned" ) + print( " from Fortran functions, either \"gnu\" (return in" ) + print( " registers) or \"intel\" (return via hidden argument)." ) + print( " By default COMPLEX_RETURNis set to 'default' and we" ) + print( " attempt to determine the return type from the compiler." ) + print( " Otherwise, the default is \"gnu\"." ) + print( " " ) + print( " -DENABLE_AOCL_DYNAMIC=ON or -DENABLE_AOCL_DYNAMIC=OFF" ) + print( " " ) + print( " Disable (Enabled by default) dynamic selection of number of" ) + print( " threads used to solve the given problem." ) + print( " Range of optimum number of threads will be [1, num_threads]," ) + print( " where \"num_threads\" is number of threads set by the application." ) + print( " Num_threads is derived from either environment variable" ) + print( " OMP_NUM_THREADS or BLIS_NUM_THREADS' or bli_set_num_threads() API." ) + print( " " ) + print( " -DDISABLE_BLIS_ARCH_TYPE=ON or -DDISABLE_BLIS_ARCH_TYPE=OFF" ) + print( " " ) + print( " Disable (Enabled by default) support for BLIS_ARCH_TYPE and BLIS_MODEL_TYPE" ) + print( " environment variables, which allows user to select" ) + print( " architecture specific code path and optimizations at runtime." ) + print( " If disabled, in builds with multiple code paths, BLIS" ) + print( " will still select path and optimizations automatically." 
) + print( " " ) + print( " -DRENAME_BLIS_ARCH_TYPE=STRING" ) + print( " " ) + print( " Change environment variable used to select architecture specific" ) + print( " code path from BLIS_ARCH_TYPE to STRING" ) + print( " " ) + print( " -DRENAME_BLIS_MODEL_TYPE=STRING" ) + print( " " ) + print( " Change environment variable used to select architecture model specific" ) + print( " optimizations from BLIS_MODEL_TYPE to STRING" ) + print( " " ) + print( " -DENABLE_NO_UNDERSCORE_API=OFF" ) + print( " " ) + print( " Export APIs without underscore" ) + print( " " ) + print( " -DENABLE_UPPERCASE_API=OFF" ) + print( " " ) + print( " Export APIs with uppercase" ) + print( " " ) + print( " " ) + print( " Additional CMake Variables:" ) + print( " " ) + print( " CMAKE_C_COMPILER Specifies the C compiler to use." ) + print( " CMAKE_CXX_COMPILER Specifies the C++ compiler to use (sandbox only)." ) + print( " CMAKE_Fortran_COMPILER Specifies the Fortran compiler to use (only to determine --complex-return)." ) + print( " COMPILE_OPTIONS Specifies additional compiler flags to use." ) + print( " COMPILE_DEFINITIONS Specifies additional preprocessor definitions to use." ) + print( " LINK_OPTIONS Specifies additional linker flags to use." ) + print( " " ) + print( " Note that not all compilers are compatible with a given" ) + print( " configuration." ) + + # Return from main(). + return 0 + + +if __name__ == "__main__": + main() diff --git a/build/cmake/read_registry.py b/build/cmake/read_registry.py new file mode 100644 index 0000000000..16bf3f9903 --- /dev/null +++ b/build/cmake/read_registry.py @@ -0,0 +1,409 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## + +# Import modules +import os +import sys +import re + +def canonicalize_ws(str): + # Remove leading and trailing whitespace. + str = str.strip() + # Remove duplicate spaces between words. + res = " ".join(str.split()) + # Update the input argument. 
def canonicalize_ws(str):
    """Return `str` with surrounding whitespace removed and every internal
    run of whitespace collapsed to a single space."""
    return " ".join(str.split())


def is_singleton(str):
    """Return True if `str` holds exactly one whitespace-separated word."""
    # The previous implementation seeded its counter string with " " and then
    # compared it against "x", so it could never report a singleton.
    return len(str.split()) == 1


def is_singleton_family(familyname, memberlist):
    """Return True if `memberlist` is a single word equal to `familyname`."""
    return is_singleton(memberlist) and memberlist == familyname


def is_in_list(word, str):
    """Return True if `word` is one of the whitespace-separated words of `str`."""
    return word in str.split()


def assign_key_value(array, key, value):
    """Store `value` under `key` in the dictionary `array`."""
    array[key] = value


def query_array(array, key):
    """Return the value stored under `key` in `array`, or None if absent."""
    return array.get(key)


def remove_from_list(strike_words, list):
    """Return the space-separated `list` without any word that also appears
    in the space-separated `strike_words`."""
    strike = strike_words.split()
    return " ".join(item for item in list.split() if item not in strike)


def replace_curconfig_configset(klisttmp, curconfig, configset):
    """Return `klisttmp` with the first occurrence of the word `curconfig`
    replaced by `configset`. Raises ValueError if `curconfig` is absent."""
    words = klisttmp.split(" ")
    words[words.index(curconfig)] = configset
    return " ".join(map(str, words))


def rm_duplicate_words(str):
    """Return `str` with duplicate words removed, keeping the LAST occurrence
    of each word and otherwise preserving order."""
    # Deduplicate on the reversed word list so the last occurrence wins.
    kept_reversed = list(dict.fromkeys(reversed(str.split())))
    return " ".join(reversed(kept_reversed))


def _expand_member(wordlist, name, expansion):
    """Replace each whole-word occurrence of `name` in `wordlist` with
    `expansion`, then canonicalize whitespace and drop duplicate words."""
    # Escape the name and use a function replacement so that neither side is
    # interpreted as regular-expression / backslash syntax.
    pattern = r"\b{}\b".format(re.escape(name))
    expanded = re.sub(pattern, lambda _match: expansion, wordlist)
    return rm_duplicate_words(canonicalize_ws(expanded))


def pass_config_kernel_registries(filename, passnum):
    """Parse the registry file `filename` and populate the registries.

    filename: the file containing the configuration registry.
    passnum:  the pass number. Pass 0 only resets the indirect blacklist and
              returns (indirect blacklisting is disabled; see
              https://github.com/flame/blis/issues/214). Pass 1 fills
              config_registry and kernel_registry.

    Each entry has the form "name: members". If the right-hand side contains
    '/', the first slash-separated word is the sub-configuration and the
    remaining words are the kernel sets it needs; otherwise the same list is
    used for both registries.

    Raises ValueError for an entry that does not contain exactly one ':'.
    """
    global indirect_blist
    if passnum == 0:
        indirect_blist = ""
    # Merge the original and indirect blacklists. Join with a space so the
    # last word of one list cannot fuse with the first word of the other.
    all_blist = " ".join([config_blist, indirect_blist])
    # NOTE: the pass-0 scan that used to follow this early return could never
    # execute, so it has been dropped.
    if passnum == 0:
        return

    # The registry is only read, so open it read-only.
    with open(filename, "r") as cfg:
        for rawline in cfg:
            # Strip trailing comments and surrounding whitespace; skip the
            # line altogether if nothing is left.
            line = rawline.split("#", 1)[0].strip()
            if not line:
                continue

            # Read the config name and config list for the current line.
            fields = line.split(":")
            if len(fields) != 2:
                raise ValueError(
                    "{}: expected exactly one ':' in registry entry: {!r}".format(filename, line))
            cname = canonicalize_ws(fields[0])
            members = fields[1].strip()

            if "/" in members:
                # A slash means the configuration name and the kernel sets it
                # needs differ: the first sub-word is the sub-configuration,
                # the remaining ones are kernel set names.
                config, kernels = members.replace("/", " ").split(" ", 1)
                clist = canonicalize_ws(config)
                klist = canonicalize_ws(kernels)
            else:
                clist = klist = canonicalize_ws(members)

            if passnum != 1:
                continue

            if is_singleton_family(cname, clist):
                # Singleton configurations: clist is exactly cname, while
                # klist may name several kernel sets. The config entry is
                # committed only if cname is not blacklisted; the kernel
                # entry is always committed.
                if not is_in_list(cname, all_blist):
                    assign_key_value(config_registry, cname, clist)
                assign_key_value(kernel_registry, cname, klist)
            elif not is_in_list(cname, all_blist):
                # Umbrella families: omit any blacklisted sub-config from
                # both lists, then commit them.
                for item in clist.split():
                    if is_in_list(item, all_blist):
                        clist = remove_from_list(item, clist)
                        klist = remove_from_list(item, klist)
                assign_key_value(config_registry, cname, clist)
                assign_key_value(kernel_registry, cname, klist)


def read_registry_file(filename):
    """Build config_registry and kernel_registry from `filename`, then expand
    any family names on the right-hand sides into their members."""
    # Pass 0 resets the indirect blacklist; pass 1 creates the registries.
    pass_config_kernel_registries(filename, 0)
    pass_config_kernel_registries(filename, 1)

    # Substitute configuration families in the config registry with their
    # constituents. Whenever a substitution pulls in a list that does not
    # contain the substituted name itself, make one more full pass.
    iterate_again = 1
    while iterate_again == 1:
        iterate_again = 0
        for config in config_registry:
            clist = query_array(config_registry, config)
            # Singleton families never need any substitution.
            if is_singleton_family(config, clist):
                continue
            for mem in clist.split():
                mems_mem = query_array(config_registry, mem)
                # mem is not a key of the config registry: nothing to expand.
                if not (mems_mem and mems_mem.strip()):
                    continue
                if mem != mems_mem:
                    current = query_array(config_registry, config)
                    assign_key_value(config_registry, config,
                                     _expand_member(current, mem, mems_mem))
                    if not is_in_list(mem, mems_mem):
                        iterate_again = 1

    # Do the same for the kernel registry: replace configuration names in
    # each kernel list with that configuration's kernel set.
    iterate_again = 1
    while iterate_again == 1:
        iterate_again = 0
        for config in kernel_registry:
            klist = query_array(kernel_registry, config)
            # In the kernel registry an entry is treated as a singleton when
            # its own name occurs somewhere in its kernel list.
            if is_in_list(config, klist):
                continue
            for ker in klist.split():
                kers_ker = query_array(kernel_registry, ker)
                # ker may name a kernel set that has no configuration of its
                # own; such names are left as they are.
                if not (kers_ker and kers_ker.strip()):
                    continue
                if ker != kers_ker:
                    current = query_array(kernel_registry, config)
                    assign_key_value(kernel_registry, config,
                                     _expand_member(current, ker, kers_ker))
                    if not is_in_list(ker, kers_ker):
                        iterate_again = 1


def build_kconfig_registry(familyname):
    """Fill kconfig_registry, mapping each kernel set used by a member of
    `familyname` to the space-separated sub-configurations that use it.

    Exits with an error message if the family, or one of its members, has no
    registry entry.
    """
    clist = query_array(config_registry, familyname)
    if clist is None:
        sys.exit("read_registry.py: configuration '{}' is not registered.".format(familyname))
    for config in clist.split():
        # Look up the kernels for the current sub-configuration.
        kernels = query_array(kernel_registry, config)
        if kernels is None:
            sys.exit("read_registry.py: sub-configuration '{}' of '{}' is not registered."
                     .format(config, familyname))
        for kernel in kernels.split():
            # Append the sub-configuration to the list recorded for kernel.
            cur_configs = query_array(kconfig_registry, kernel)
            if cur_configs and cur_configs.strip():
                cur_configs = " ".join([cur_configs, config])
            else:
                cur_configs = config
            assign_key_value(kconfig_registry, kernel, canonicalize_ws(cur_configs))


def lastWord(string):
    """Return the text following the last space in `string` (the whole string
    if it contains no space)."""
    return string[string.rfind(" ") + 1:]



config_blist = ""
indirect_blist = ""
config_registry = {}
kernel_registry = {}
kconfig_registry = {}

def process_config():
    """Return "config list ; kernel list ; kernel:subconfig map" for a family.

    The family name is the last path component of sys.argv[1]; the registry
    is read from the file 'config_registry' inside the directory sys.argv[2].
    Exits with an error message if the family is not in the registry.
    """
    arch = os.path.basename(sys.argv[1])
    target_file = os.path.join(sys.argv[2], 'config_registry')

    read_registry_file(target_file)

    config_list = query_array(config_registry, arch)
    kernel_list = query_array(kernel_registry, arch)
    if config_list is None or kernel_list is None:
        sys.exit("read_registry.py: configuration '{}' is not registered in {}."
                 .format(arch, target_file))

    build_kconfig_registry(arch)

    config_list = " ".join(config_list.split())
    kernel_list = " ".join(kernel_list.split())

    kconfig_map = ""
    for kernel in kernel_list.split():
        configs = query_array(kconfig_registry, kernel)

        if is_singleton(configs):
            # Only one sub-config uses this kernel set.
            reducedclist = configs
        elif is_in_list(kernel, configs):
            # A sub-config with the same name as the kernel set exists.
            reducedclist = kernel
        else:
            # Otherwise, use the last name.
            reducedclist = lastWord(configs)

        # Add a new "kernel:subconfig" pair to the kconfig_map list.
        new_pair = kernel + ':' + reducedclist
        kconfig_map = canonicalize_ws(" ".join([kconfig_map, new_pair]))

    return " ; ".join([config_list, kernel_list, kconfig_map])


# Function call for config family names. Guarded so that importing this file
# does not try to read sys.argv.
if __name__ == "__main__":
    CONFIG = process_config()
    print(CONFIG)
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##

# Create a list of keywords for files that need to be ignored by the system.
# The expansion is quoted so that an empty file, or content containing ';',
# is still passed to string(REPLACE) as a single input argument.
file(READ ${CMAKE_SOURCE_DIR}/build/gen-make-frags/ignore_list IGNORE_LIST)
string(REPLACE "\n" ";" IGNORE_LIST "${IGNORE_LIST}")

# Create a list of suffixes for files that need to be compiled to create the library.
file(READ ${CMAKE_SOURCE_DIR}/build/gen-make-frags/suffix_list SUFFIX_LIST)
string(REPLACE "\n" ";" SUFFIX_LIST "${SUFFIX_LIST}")

#--------------------------------------------
#               SUFFIX LISTS
#--------------------------------------------
# Source suffixes.
set(CONFIG_SRC_SUFS "c")
set(KERNELS_SRC_SUFS "c;s;S")
set(FRAME_SRC_SUFS "c")

set(AOCLDTL_SRC_SUFS "c")
set(ADDON_C99_SUFS "c")
set(ADDON_CXX_SUFS "cc;cpp;cxx")
set(ADDON_SRC_SUFS "${ADDON_C99_SUFS};${ADDON_CXX_SUFS}")

set(SANDBOX_C99_SUFS "c")
set(SANDBOX_CXX_SUFS "cc;cpp;cxx")
set(SANDBOX_SRC_SUFS "${SANDBOX_C99_SUFS};${SANDBOX_CXX_SUFS}")

# Header suffixes.
set(FRAME_HDR_SUFS "h")

set(AOCLDTL_HDR_SUFS "h")
set(ADDON_H99_SUFS "h")
set(ADDON_HXX_SUFS "hh;hpp;hxx")
set(ADDON_HDR_SUFS "${ADDON_H99_SUFS};${ADDON_HXX_SUFS}")

set(SANDBOX_H99_SUFS "h")
set(SANDBOX_HXX_SUFS "hh;hpp;hxx")
# CMake only expands ${VAR}; the Make-style $(VAR) used here before was kept
# as literal text, so the list contained "$(SANDBOX_H99_SUFS)" and
# "$(SANDBOX_HXX_SUFS)" instead of the actual suffixes.
set(SANDBOX_HDR_SUFS "${SANDBOX_H99_SUFS};${SANDBOX_HXX_SUFS}")

# Combine all header suffixes and remove duplicates.
+set(ALL_HDR_SUFS "${FRAME_HDR_SUFS};${ADDON_HDR_SUFS};${SANDBOX_HDR_SUFS};${AOCLDTL_HDR_SUFS}") +list(REMOVE_DUPLICATES ALL_HDR_SUFS) + +set(ALL_H99_SUFS "${FRAME_HDR_SUFS};${ADDON_H99_SUFS};${SANDBOX_H99_SUFS};${AOCLDTL_HDR_SUFS}") +list(REMOVE_DUPLICATES ALL_H99_SUFS) + +#-------------------------------------------- +# Important sets of header files and paths +#-------------------------------------------- +# Get a list of all sub-directories of a given directory +macro(get_dirpaths_with_suffixes result curdir sufflist) + set(dirlist "") + # children will have all files and directories which are below this directory. + file(GLOB_RECURSE children LIST_DIRECTORIES true ${curdir}/*) + # Adding current directory in the list. + list(PREPEND children ${curdir}) + # Filter out anything that is not a directory. + foreach(child ${children}) + if(IS_DIRECTORY ${child}) + set(HAS_SUFF_FILE "false") + foreach(suff ${sufflist}) + file(GLOB suff_files LIST_DIRECTORIES false ${child}/*\.${suff}) + list(LENGTH suff_files list_size) + if(NOT (${list_size} STREQUAL 0)) + set(HAS_SUFF_FILE "true") + # If there is at least one file with a specific suffix break from for-loop. + break() + endif() + endforeach() + # If there is at least one *.suff file, add directory path in the list. + if(HAS_SUFF_FILE STREQUAL "true") + list(APPEND dirlist "${child}/") + endif() + endif() + endforeach() + # Get the name of the current directory, after removing the source directory + # from the name, so that we can exclude the files that are part of the ignore + # list even if the blis directory is located in a directory with a name that + # would be ignored. + string(REPLACE "${CMAKE_SOURCE_DIR}/" "" curdirsimple ${curdir}) + # Filter out anything that is part of the IGNORE_LIST. + foreach(item ${IGNORE_LIST}) + list(FILTER dirlist EXCLUDE REGEX ${curdirsimple}.*/${item}/) + endforeach() + list(APPEND ${result} ${dirlist}) +endmacro() + +# Get a list of all source files of a given directory based on the suffix list. 
+# Returns a list which can be transformed to a string when needed +# from high level CMake. +macro(get_filepaths_with_suffixes result curdir sufflist) + set(sourcelist "") + # Get the name of the current directory, after removing the source directory + # from the name, so that we can exclude the files that are part of the ignore + # list even if the blis directory is located in a directory with a name that + # would be ignored. + string(REPLACE "${CMAKE_SOURCE_DIR}/" "" curdirsimple ${curdir}) + foreach(suff ${sufflist}) + # suff_files will have all files with this suffix which are below this directory. + file(GLOB_RECURSE suff_files LIST_DIRECTORIES false ${curdir}/*\.${suff}) + # Filter out anything that is part of the IGNORE_LIST. + foreach(item ${IGNORE_LIST}) + list(FILTER suff_files EXCLUDE REGEX ${curdirsimple}.*/${item}/) + endforeach() + list(APPEND sourcelist "${suff_files}") + endforeach() + list(APPEND ${result} ${sourcelist}) +endmacro() + +# Choose correct sub-configuration name for the given kernel set. +# Behaves similarly to get-config-for-kset. +macro(get_config_for_kernel_from_kconfig_map config kernel kconfig_map) + set(conf ${kconfig_map}) + # Since kconfig_map has as elements pairs of the form kernel:config, + # to find the element with the corresponding config we need to filter + # with respect to the kernel first. + list(FILTER conf INCLUDE REGEX ${kernel}:) + # Now that the list has only one element, we can remove the part + # of kernel: and then we will be left with config. + list(TRANSFORM conf REPLACE ${kernel}: "") + list(APPEND ${config} ${conf}) +endmacro() \ No newline at end of file diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt index 3a5925a306..537a67df2c 100644 --- a/config/CMakeLists.txt +++ b/config/CMakeLists.txt @@ -1,28 +1,187 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. 
## +##Copyright (C) 2022-2023, Advanced Micro Devices, Inc ## -if(${TARGET_ARCH} STREQUAL zen4) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(zen4) -elseif(${TARGET_ARCH} STREQUAL zen3) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(zen3) -elseif(${TARGET_ARCH} STREQUAL zen2) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(zen2) -elseif(${TARGET_ARCH} STREQUAL zen) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(zen) -elseif(${TARGET_ARCH} STREQUAL amdzen) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(generic) -add_subdirectory(zen) -add_subdirectory(zen2) -add_subdirectory(zen3) -add_subdirectory(zen4) -elseif(${TARGET_ARCH} STREQUAL haswell) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(haswell) -else(${TARGET_ARCH} STREQUAL generic) -message("The configuration is : ${TARGET_ARCH}") -add_subdirectory(generic) -endif() +# Writing a function that will be used to generate the required object +# libraries for the required configs. +function(generate_config_targets config_target) + # Collect all subdirectory paths that have at least one file with suffix in CONFIG_SRC_SUFS list. + get_filepaths_with_suffixes(LOCAL_SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${config_target}" "${CONFIG_SRC_SUFS}") + + # Create an object library using the source file list above. + add_library(${config_target}_CONFIG + OBJECT + ${LOCAL_SOURCE_FILES} + ) + # Include the corresponding make_defs.cmake that holds the required compiler options. + include(${CMAKE_SOURCE_DIR}/config/${config_target}/make_defs.cmake) + # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. 
+ # mimicking get-config-cflags-for + target_compile_options(${config_target}_CONFIG + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-config-cflags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${config_target}_CONFIG + PRIVATE + # in get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-config-cflags-for + ${BUILD_CPPFLAGS} + ) + target_include_directories(${config_target}_CONFIG + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${config_target}_CONFIG PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${config_target}_CONFIG PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${config_target}_CONFIG PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + add_dependencies(${config_target}_CONFIG flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. + set_target_properties(${config_target}_CONFIG PROPERTIES FOLDER object-libs-targets) + + # Create an object library using the corresponding reference kernel initialization file. + add_library(${config_target}_REFINIT + OBJECT + ${CMAKE_SOURCE_DIR}/ref_kernels/bli_cntx_ref.c + ) + # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. 
+ # mimicking get-refinit-cflags-for + target_compile_options(${config_target}_REFINIT + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-refinit-cflags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${config_target}_REFINIT + PRIVATE + # get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-refinit-cflags-for + ${BUILD_CPPFLAGS} + # get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-refinit-cflags-for + -DBLIS_CNAME=${config_target} + ) + target_include_directories(${config_target}_REFINIT + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${config_target}_REFINIT PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${config_target}_REFINIT PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${config_target}_REFINIT PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + add_dependencies(${config_target}_REFINIT flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. + set_target_properties(${config_target}_REFINIT PROPERTIES FOLDER object-libs-targets) + + # Collect all subdirectory paths that have at least one file with suffix in KERNELS_SRC_SUFS list. + set(REFKERN_PATH ${CMAKE_SOURCE_DIR}/ref_kernels) + get_filepaths_with_suffixes(LOCAL_REFKERN_FILES ${REFKERN_PATH} ${KERNELS_SRC_SUFS}) + # Remove bli_cntx_ref.c from source list. + list(FILTER LOCAL_REFKERN_FILES EXCLUDE REGEX bli_cntx_ref.c) + + # Create an object library using the corresponding reference implementations being targeted. 
+ add_library(${config_target}_REFKERN + OBJECT + ${LOCAL_REFKERN_FILES} + ) + # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets. + # mimicking get-refkern-cflags-for + target_compile_options(${config_target}_REFKERN + PRIVATE + # load-var-for,CROPTFLAGS + ${CROPTFLAGS} + # load-var-for,CRVECFLAGS + ${CRVECFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-refkernel-cflags-for + ${COMPSIMDFLAGS} + # in get-refkern-cflags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${config_target}_REFKERN + PRIVATE + # in get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-refkern-cflags-for + -DBLIS_CNAME=${config_target} + # in get-refkern-cflags-for + ${BUILD_CPPFLAGS} + ) + target_include_directories(${config_target}_REFKERN + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${config_target}_REFKERN PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${config_target}_REFKERN PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${config_target}_REFKERN PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + add_dependencies(${config_target}_REFKERN flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. + set_target_properties(${config_target}_REFKERN PROPERTIES FOLDER object-libs-targets) +endfunction() + +# Generate targets for each of the configs. 
+foreach(CONF ${CONFIG_LIST}) + generate_config_targets(${CONF}) +endforeach() \ No newline at end of file diff --git a/config/amdzen/make_defs.cmake b/config/amdzen/make_defs.cmake new file mode 100644 index 0000000000..f658bcb64b --- /dev/null +++ b/config/amdzen/make_defs.cmake @@ -0,0 +1,24 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +# For architecture independent files we still need to define +# the required flags. +if(MSVC) + if(NOT ("${CMAKE_BUILD_TYPE}" MATCHES "Release")) + set(CDBGFLAGS /Zo) + endif() + if("${CMAKE_BUILD_TYPE}" MATCHES "Debug") + set(COPTFLAGS /Od) + else() # Release or RelWithDebInfo + set(COPTFLAGS /O2) + endif() +else() + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O3) + endif() +endif() \ No newline at end of file diff --git a/config/generic/CMakeLists.txt b/config/generic/CMakeLists.txt deleted file mode 100644 index 2fd3855574..0000000000 --- a/config/generic/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -##Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. ## - -target_sources("${PROJECT_NAME}" PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_init_generic.c - ) diff --git a/config/generic/make_defs.cmake b/config/generic/make_defs.cmake new file mode 100644 index 0000000000..40c9d7934a --- /dev/null +++ b/config/generic/make_defs.cmake @@ -0,0 +1,40 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +if(NOT WIN32) + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O3) + endif() +endif() + +# Flags specific to optimized kernels. +if(MSVC) + set(CKOPTFLAGS ${COPTFLAGS}) +else() + set(CKOPTFLAGS ${COPTFLAGS} -O3) +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + # Placeholder in case we want to add gcc-specific flags. 
+elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") + # Placeholder in case we want to add icc-specific flags. +elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # Placeholder in case we want to add clang-specific flags. +else() + message(FATAL_ERROR "gcc, icc, or clang is required for this configuration.") +endif() + +# Flags specific to reference kernels. +set(CROPTFLAGS ${CKOPTFLAGS}) +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CRVECFLAGS ${CKVECFLAGS}) +elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CRVECFLAGS ${CKVECFLAGS}) +else() + set(CRVECFLAGS ${CKVECFLAGS}) +endif() diff --git a/config/haswell/CMakeLists.txt b/config/haswell/CMakeLists.txt deleted file mode 100644 index a43bfe2b23..0000000000 --- a/config/haswell/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ## - -set(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_init_haswell.c - ) - -set(SUBDIRECTORIES "") -set(RELATIVE_PATH "haswell") - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() - -if(FILES) - #Add source files to target - target_sources("${PROJECT_NAME}" PRIVATE ${FILES}) - - #Install our source files - install(FILES ${FILES} DESTINATION ${RELATIVE_PATH}) -endif() diff --git a/config/zen/CMakeLists.txt b/config/zen/CMakeLists.txt deleted file mode 100644 index 371f63b21c..0000000000 --- a/config/zen/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
## - -target_sources("${PROJECT_NAME}" PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_init_zen.c - ) diff --git a/config/zen/amd_config.cmake b/config/zen/amd_config.cmake new file mode 100644 index 0000000000..61d56a3392 --- /dev/null +++ b/config/zen/amd_config.cmake @@ -0,0 +1,49 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +if(NOT WIN32) + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O2 -fomit-frame-pointer) + endif() +endif() + +# Flags specific to optimized kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +if(MSVC) + set(COPTFLAGS /Oy) + set(CKOPTFLAGS ${COPTFLAGS}) +else() + set(CKOPTFLAGS ${COPTFLAGS} -O3) +endif() + +if(MSVC) + set(CKVECFLAGS -mavx2 -mfma -mno-fma4 -mno-tbm -mno-xop -mno-lwp) +elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CKVECFLAGS -mavx2 -mfpmath=sse -mfma) +elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set(CKVECFLAGS -mavx2 -mfpmath=sse -mfma -mno-fma4 -mno-tbm -mno-xop -mno-lwp) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version_string) + string(REGEX MATCH "^[^\n]*" CLANG_VERSION_STRING "${clang_full_version_string}") + string(REGEX MATCHALL "(AOCC.LLVM)" CLANG_STRING "${CLANG_VERSION_STRING}") + if("${CLANG_STRING}" MATCHES "(AOCC.LLVM)") + list(APPEND CKVECFLAGS -mllvm -disable-licm-vrp) + endif() +else() + message(FATAL_ERROR "gcc or clang are required for this configuration.") +endif() + +# Flags specific to reference kernels. 
+set(CROPTFLAGS ${CKOPTFLAGS}) +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CRVECFLAGS ${CKVECFLAGS} -funsafe-math-optimizations -ffp-contract=fast) +elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set(CRVECFLAGS ${CKVECFLAGS} -funsafe-math-optimizations -ffp-contract=fast) +else() + set(CRVECFLAGS ${CKVECFLAGS}) +endif() diff --git a/config/zen/make_defs.cmake b/config/zen/make_defs.cmake new file mode 100644 index 0000000000..0e9ac3ab9b --- /dev/null +++ b/config/zen/make_defs.cmake @@ -0,0 +1,39 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +# Include file containing common flags for all AMD architectures +include(${CMAKE_SOURCE_DIR}/config/zen/amd_config.cmake) +if(NOT WIN32) + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O3) + endif() +endif() + +# Flags specific to optimized kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +if(MSVC) + set(CKOPTFLAGS ${COPTFLAGS} /Oy) +else() + set(CKOPTFLAGS ${COPTFLAGS} -fomit-frame-pointer) +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + list(APPEND CKVECFLAGS -march=znver1) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse) + endif() +endif() + +# Flags specific to reference kernels. +set(CROPTFLAGS ${CKOPTFLAGS}) +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CRVECFLAGS ${CKVECFLAGS}) +else() + set(CRVECFLAGS ${CKVECFLAGS}) +endif() \ No newline at end of file diff --git a/config/zen2/CMakeLists.txt b/config/zen2/CMakeLists.txt deleted file mode 100644 index c3cdc45c08..0000000000 --- a/config/zen2/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_init_zen2.c - ) diff --git a/config/zen2/make_defs.cmake b/config/zen2/make_defs.cmake new file mode 100644 index 0000000000..2e2a7ad4c9 --- /dev/null +++ b/config/zen2/make_defs.cmake @@ -0,0 +1,76 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +# Include file containing common flags for all AMD architectures +include(${CMAKE_SOURCE_DIR}/config/zen/amd_config.cmake) +if(NOT WIN32) + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O3) + endif() +endif() + +# Flags specific to optimized kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +if(MSVC) + set(CKOPTFLAGS ${COPTFLAGS} /Oy) +else() + set(CKOPTFLAGS ${COPTFLAGS} -fomit-frame-pointer) +endif() + +# gcc or clang version must be at least 4.0 +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + # gcc 9.0 or later + list(APPEND CKVECFLAGS -march=znver2) + list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse) + else() + # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. 
+ list(APPEND CKVECFLAGS -march=znver1 -mno-avx256-split-unaligned-store) + list(APPEND CRVECFLAGS -march=znver1 -mno-avx256-split-unaligned-store) + endif() +endif() # gcc + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # AOCC clang has various formats for the version line + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) + + # For our purpose we just want to know if it version 2x or 3x or 4x + + # But also set these in case we are using upstream LLVM clang + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version_string) + string(REGEX MATCH "^[^\n]*" CLANG_VERSION_STRING "${clang_full_version_string}") + string(REGEX MATCHALL "(AOCC_2|AOCC_3|AOCC_4|AOCC|LLVM|clang)" CLANG_STRING "${CLANG_VERSION_STRING}") + string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION "${CLANG_VERSION_STRING}") + + if("${CLANG_STRING}" MATCHES "AOCC_4") + # AOCC version 4x we will enable znver2 + list(APPEND CKVECFLAGS -march=znver2) + elseif("${CLANG_STRING}" MATCHES "AOCC_3") + # AOCC version 3x we will enable znver2 + list(APPEND CKVECFLAGS -march=znver2) + elseif("${CLANG_STRING}" MATCHES "(AOCC_2|LLVM)") + # AOCC version 2x we will enable znver2 + list(APPEND CKVECFLAGS -march=znver2) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + # LLVM clang 9.0 or later + 
list(APPEND CKVECFLAGS -march=znver2) + else() + list(APPEND CKVECFLAGS -march=znver1) + endif() +endif() + +# Flags specific to reference kernels. +set(CROPTFLAGS ${CKOPTFLAGS}) +set(CRVECFLAGS ${CKVECFLAGS}) \ No newline at end of file diff --git a/config/zen3/CMakeLists.txt b/config/zen3/CMakeLists.txt deleted file mode 100644 index d600e43870..0000000000 --- a/config/zen3/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc ## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_init_zen3.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_family_zen3.h - ) diff --git a/config/zen3/make_defs.cmake b/config/zen3/make_defs.cmake new file mode 100644 index 0000000000..85a42106c4 --- /dev/null +++ b/config/zen3/make_defs.cmake @@ -0,0 +1,90 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +# FLAGS that are specific to the 'zen3' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# config/zen/amd_config.mk. + +# Include file containing common flags for all AMD architectures +include(${CMAKE_SOURCE_DIR}/config/zen/amd_config.cmake) + +# --- Determine the C compiler and related flags --- +if(NOT WIN32) + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O3) + endif() +endif() + +# Flags specific to optimized kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +if(MSVC) + set(CKOPTFLAGS ${COPTFLAGS} /Oy) +else() + set(CKOPTFLAGS ${COPTFLAGS} -fomit-frame-pointer) +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11.0.0) + # gcc 11.0 or later + list(APPEND CKVECFLAGS -march=znver3) + # Update CKOPTFLAGS for gcc to use O3 optimization without + # -ftree-pre and -ftree-partial-pre flag. 
These flag results + # in suboptimal code generation for instrinsic based kernels. + # The -ftree-loop-vectorize results in inefficient code gen + # for amd optimized l1 kernels based on instrinsics. + list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + # gcc 9.0 or later + list(APPEND CKVECFLAGS -march=znver2) + list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse) + else() + # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. + list(APPEND CKVECFLAGS -march=znver1 -mno-avx256-split-unaligned-store) + list(APPEND CRVECFLAGS -march=znver1 -mno-avx256-split-unaligned-store) + endif() +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # AOCC clang has various formats for the version line + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) + + # For our purpose we just want to know if it version 2x or 3x or 4x + + # But also set these in case we are using upstream LLVM clang + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version_string) + string(REGEX MATCH "^[^\n]*" CLANG_VERSION_STRING "${clang_full_version_string}") + string(REGEX MATCHALL "(AOCC_2|AOCC_3|AOCC_4|AOCC|LLVM|clang)" 
CLANG_STRING "${CLANG_VERSION_STRING}") + string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION "${CLANG_VERSION_STRING}") + + if("${CLANG_STRING}" MATCHES "AOCC_4") + # AOCC version 4x we will enable znver3 + list(APPEND CKVECFLAGS -march=znver3) + elseif("${CLANG_STRING}" MATCHES "AOCC_3") + # AOCC version 3x we will enable znver3 + list(APPEND CKVECFLAGS -march=znver3) + elseif("${CLANG_STRING}" MATCHES "(AOCC_2|LLVM)") + # AOCC version 2x we will enable znver2 + list(APPEND CKVECFLAGS -march=znver2) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + # LLVM clang 9.0 or later + list(APPEND CKVECFLAGS -march=znver2) + else() + list(APPEND CKVECFLAGS -march=znver1) + endif() +endif() + +# Flags specific to reference kernels. +set(CROPTFLAGS ${CKOPTFLAGS}) +set(CRVECFLAGS ${CKVECFLAGS}) \ No newline at end of file diff --git a/config/zen4/CMakeLists.txt b/config/zen4/CMakeLists.txt deleted file mode 100644 index ea166b00c7..0000000000 --- a/config/zen4/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc ## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_init_zen4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_family_zen4.h - ) diff --git a/config/zen4/make_defs.cmake b/config/zen4/make_defs.cmake new file mode 100644 index 0000000000..68dcc4b727 --- /dev/null +++ b/config/zen4/make_defs.cmake @@ -0,0 +1,112 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc ## + +# FLAGS that are specific to the 'zen4' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# config/zen/amd_config.mk. 
+ +# Include file containing common flags for all AMD architectures +include(${CMAKE_SOURCE_DIR}/config/zen/amd_config.cmake) +if(NOT WIN32) + if(NOT (DEBUG_TYPE STREQUAL "off")) + set(CDBGFLAGS -g) + endif() + + if(DEBUG_TYPE STREQUAL "noopt") + set(COPTFLAGS -O0) + else() # off or opt + set(COPTFLAGS -O3) + endif() +endif() + +# Flags specific to optimized kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +if(MSVC) + set(CKOPTFLAGS ${COPTFLAGS} /Oy) +else() + set(CKOPTFLAGS ${COPTFLAGS} -fomit-frame-pointer) +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0.0) + # gcc 13.0 or later + list(APPEND CKVECFLAGS -march=znver4) + list(APPEND CRVECFLAGS -march=znver4) + # Update CKOPTFLAGS for gcc to use O3 optimization without + # -ftree-pre and -ftree-partial-pre flag. These flag results + # in suboptimal code generation for instrinsic based kernels. + # The -ftree-loop-vectorize results in inefficient code gen + # for amd optimized l1 kernels based on instrinsics. 
+ list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11.0.0) + # gcc 11.0 or later + list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16) + list(APPEND CRVECFLAGS -march=znver3) + list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + # gcc 9.0 or later + list(APPEND CKVECFLAGS -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni) + list(APPEND CRVECFLAGS -march=znver2) + list(APPEND CKOPTFLAGS -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0.0) + # gcc 8.0 or later + list(APPEND CKVECFLAGS -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni) + list(APPEND CRVECFLAGS -march=znver1) + else() + # If gcc is older than 8.0.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. 
+ list(APPEND CKVECFLAGS -march=znver1 -mno-avx256-split-unaligned-store) + list(APPEND CRVECFLAGS -march=znver1 -mno-avx256-split-unaligned-store) + endif() +endif() # gcc + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # AOCC clang has various formats for the version line + + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) + # For our purpose we just want to know if it version 2x or 3x or 4x + + # But also set these in case we are using upstream LLVM clang + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version_string) + string(REGEX MATCH "^[^\n]*" CLANG_VERSION_STRING "${clang_full_version_string}") + string(REGEX MATCHALL "(AOCC_2|AOCC_3|AOCC_4|AOCC|LLVM|clang)" CLANG_STRING "${CLANG_VERSION_STRING}") + string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION "${CLANG_VERSION_STRING}") + + if("${CLANG_STRING}" MATCHES "AOCC_4") + # AOCC version 4x we will enable znver4 + list(APPEND CKVECFLAGS -march=znver4 -falign-loops=64) + list(APPEND CRVECFLAGS -march=znver4) + elseif("${CLANG_STRING}" MATCHES "AOCC_3") + # AOCC version 3x we will enable znver3 + list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64) + list(APPEND CRVECFLAGS -march=znver3) + elseif("${CLANG_STRING}" MATCHES "(AOCC_2|LLVM)") + 
# AOCC version 2x we will enable znver2 + list(APPEND CKVECFLAGS -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni) + list(APPEND CRVECFLAGS -march=znver2) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + # LLVM clang 16.0 or later + list(APPEND CKVECFLAGS -march=znver4 -falign-loops=64) + list(APPEND CRVECFLAGS -march=znver4) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0.0) + # LLVM clang 13.0 or later + list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64) + list(APPEND CRVECFLAGS -march=znver3) + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0) + # LLVM clang 9.0 or later + list(APPEND CKVECFLAGS -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64) + list(APPEND CRVECFLAGS -march=znver2) + else() + list(APPEND CKVECFLAGS -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -falign-loops=64) + list(APPEND CRVECFLAGS -march=znver1) + endif() +endif() + +# Flags specific to reference kernels. 
+set(CROPTFLAGS ${CKOPTFLAGS}) +set(CRVECFLAGS ${CKVECFLAGS}) \ No newline at end of file diff --git a/docs/CMakeBuildSystem.md b/docs/CMakeBuildSystem.md new file mode 100644 index 0000000000..cee9f5a86d --- /dev/null +++ b/docs/CMakeBuildSystem.md @@ -0,0 +1,217 @@ +## Contents + +* **[Contents](CMakeBuildSystem.md#contents)** +* **[Introduction](CMakeBuildSystem.md#introduction)** +* **[Step 1: Choose a framework configuration](CMakeBuildSystem.md#step-1-choose-a-framework-configuration)** +* **[Step 2: Configuring CMake](CMakeBuildSystem.md#step-2-configuring-cmake)** +* **[Step 3: Compilation](CMakeBuildSystem.md#step-3-compilation)** +* **[Step 4: Installation](CMakeBuildSystem.md#step-4-installation)** +* **[Compiling with BLIS](CMakeBuildSystem.md#compiling-with-blis)** +* **[Uninstalling](CMakeBuildSystem.md#uninstalling)** +* **[Available targets](CMakeBuildSystem.md#available-targets)** +* **[Adding configurations](CMakeBuildSystem.md#adding-configurations)** +* **[Some examples](CMakeBuildSystem.md#some-examples)** +* **[Conclusion](CMakeBuildSystem.md#conclusion)** + +## Introduction + +This document describes how to use CMake to build and install a BLIS library to your local system. + +The BLIS CMake system is based on the [Make build system](BuildSystem.md) and is designed for use with both Linux and Windows. Other requirements are: + + * CMake (3.15.0 or higher) + * Python (3.4 or later for python3) + * GNU `make` (3.81 or later) on Linux + * Visual Studio 17 2022 on Windows + * a working C99 compiler (gcc or clang on Linux and clang-cl **only** on Windows) + +Note that, on Windows, BLIS implements basic pthreads functionality automatically, so a POSIX threads library is not required. On Linux, the implementation is the same as the one used by the Make system. + +CMake is used to build out of source, so we need to start by creating a build directory from which we will do the configuration and build steps. 
Since there is a directory called blis/build, the build directory must have a different name. Here is an example of how to create the directory: +``` +$ mkdir build_blis +$ cd build_blis +``` + +## Step 1: Choose a framework configuration + +The first step is to choose the appropriate BLIS configuration. As with the Make build system, the user must decide which configuration to use or whether automatic hardware detection should be used to determine the configuration. Currently only the following configurations are supported: + + * amdzen + * zen + * zen2 + * zen3 + * zen4 + * generic + +Instructions on how to add a configuration to the CMake system are provided in a later section. + +### Multithreading + +As in the Make system, multithreading in BLIS is disabled by default. To configure cmake so that OpenMP is used, please use `-DENABLE_THREADING=openmp`. All available options can be found if cmake-gui is used, or by running +``` +cmake .. -DPRINT_CONFIGURE_HELP=ON +``` + +## Step 2: Configuring CMake + +### Choosing a generator + +This is a reminder on how to configure CMake to use a specific generator: +``` +cmake -G <generator-name> +``` + +On Linux "Unix Makefiles" is used by default and `-G <generator-name>` can be omitted. + +On Windows, specify the Visual Studio generator using +``` +cmake -G "Visual Studio 17 2022" +``` + +For the rest of this documentation, we will use the platform-agnostic commands to build the libraries, but the usual make commands can be used instead. In the following command snippets we omit specifying the generator, but one can use their preferred way of building using common CMake practices. + +### Choosing a configuration + +This step is equivalent to running `./configure <confname>` using the Make system. In this case, simply run: +``` +cmake .. -DBLIS_CONFIG_FAMILY=<confname> +``` +If the provided configuration is not supported, an error will be thrown and a message with the available configurations will be printed. 
+ +To configure based on your hardware, you can configure using +``` +cmake .. -DBLIS_CONFIG_FAMILY=auto +``` +Please note that when `auto` is used as a configuration option, the `generic` configuration will be chosen by default on non-AMD hardware. + +### Specifying a prefix path for installation + +We remind users that to specify the installation prefix in cmake, one needs to configure using the `CMAKE_INSTALL_PREFIX` variable: +``` +cmake .. -DBLIS_CONFIG_FAMILY=auto -DCMAKE_INSTALL_PREFIX=<prefix> +``` +This will cause libraries to eventually be installed to `<prefix>/lib` and headers will be installed to `<prefix>/include/blis`. + +Options to specify the library install and the header install locations separately, as in the Make system, are not currently supported by the CMake equivalent. + +## Step 3: Compilation + +Once configuration is finished and the corresponding platform-dependent build files have been generated, you can proceed to building the library. +To build the library in a platform-agnostic way use: +``` +cmake --build . --config Release +``` +For a verbose build, you can use: +``` +cmake --build . --verbose --config Release +``` +To build in parallel on a multicore system, you can use: +``` +cmake --build . --config Release -j<n> +``` +where `<n>` is the number of jobs allowed to run simultaneously by this command. + +Note that on Linux, if Makefiles are used, the above is equivalent to running +``` +make -j<n> +``` + +## Step 4: Installation + +The BLIS library resides in your chosen build directory, say `blis/build_blis` and the generated header files are in `blis/build_blis/include/`. To install the library and the header files associated with it, you can use: +``` +cmake --build . --target install +``` +This will install the libraries and header files and create the corresponding symbolic links of the shared libraries in the path specified in `CMAKE_INSTALL_PREFIX`. 
+ +Note that on Linux, if Makefiles are used, the above is equivalent to running +``` +make install +``` + +## Uninstalling + +Please note that CMake does not provide functionality to uninstall targets. + +## Available targets + +The BLIS CMake system aims to be compatible with the current `make` system. For that reason, it implements the same targets for the generation of libraries and the tests. The table of available targets can be found below. + +| target | Description | +|:----------------|:---------------------------------------------------| +| `all` | Execute `libs` target. | +| `libs` | Compile BLIS as a static and/or shared library (depending on CMake options). | +| `test` | Execute `checkblis` and `checkblas` targets. | +| `check` | Execute `checkblis-fast` and `checkblas` targets. | +| `checkblis` | Execute `testblis` and characterize the results to `stdout`. | +| `checkblis-fast`| Execute `testblis-fast` and characterize the results to `stdout`. | +| `checkblis-md` | Execute `testblis-md` and characterize the results to `stdout`. | +| `checkblis-salt`| Execute `testblis-salt` and characterize the results to `stdout`. | +| `checkblas` | Execute `testblas` and characterize the results to `stdout`. | +| `testblis` | Run the BLIS testsuite with default parameters (runs for 2-8 minutes). | +| `testblis-fast` | Run the BLIS testsuite with "fast" parameters (runs for a few seconds). | +| `testblis-md` | Run the BLIS testsuite for `gemm` with full mixing of datatypes (runs for 10-30 seconds). | +| `testblis-salt` | Run the BLIS testsuite while simulating application-level threading (runs for a few seconds). | +| `testsuite` | Same as `testblis`. | +| `testblas` | Run the BLAS test drivers with default parameters (runs for a few seconds). | + +### Running the testsuites. +* On Linux all targets can be built and run in the `build_blis` directory. 
+* On Windows, when Visual Studio has been used as a generator, one can build and run the BLIS API related tests from the testsuite directory and the BLAS API tests from the blastest directory. + +## Adding configurations + +ToDo + +## Some examples + +In this section we provide some examples for users who are familiar with the build system based on Makefiles and want to try the new CMake system. + +**_NOTE:_** +The CMake system generates the shared libraries by default. To build the static libraries, you need to specify the corresponding CMake variable below +``` +cmake .. -DBUILD_SHARED_LIBS=OFF -DBLIS_CONFIG_FAMILY=amdzen +``` +The same generated header `blis.h` can be used when using the library. + +For shared libraries on Windows, one can easily import the symbols by defining the macro `-DBLIS_EXPORT=__declspec(dllimport)` while building the application, +but this is not necessary if static data symbols and objects are not used. + +### Example 1: multi-threaded LP64 libraries for amdzen configuration using clang compiler + +* With configure script: +``` +CC=clang ./configure --enable-threading=openmp --int-size=32 --blas-int-size=32 amdzen +``` + +* With CMake on Linux: +``` +cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DENABLE_THREADING=openmp -DINT_SIZE=32 -DBLAS_INT_SIZE=32 -DBLIS_CONFIG_FAMILY=amdzen +``` + +* With CMake on Windows: +``` +cmake .. -G "Visual Studio 17 2022" -TClangCl -DENABLE_THREADING=openmp -DINT_SIZE=32 -DBLAS_INT_SIZE=32 -DBLIS_CONFIG_FAMILY=amdzen -DOpenMP_libomp_LIBRARY="path_to_openmp_library" +``` + +### Example 2: single-threaded ILP64 libraries for amdzen configuration with aocl_gemm addon enabled and default compiler + +* With configure script: +``` +./configure --enable-threading=no --int-size=64 --blas-int-size=64 --enable-addon=aocl_gemm amdzen +``` + +* With CMake on Linux: +``` +cmake .. 
-DENABLE_THREADING=no -DINT_SIZE=64 -DBLAS_INT_SIZE=64 -DENABLE_ADDON=aocl_gemm -DBLIS_CONFIG_FAMILY=amdzen +``` + +* With CMake on Windows: +``` +cmake .. -G "Visual Studio 17 2022" -TClangCl -DENABLE_THREADING=no -DINT_SIZE=64 -DBLAS_INT_SIZE=64 -DENABLE_ADDON=aocl_gemm -DBLIS_CONFIG_FAMILY=amdzen +``` + +## Conclusion + +The BLIS CMake system is developed and maintained by AMD. You can contact us by email at toolchainsupport@amd.com. You can also raise any issue/suggestion on the GitHub repository at https://github.com/amd/blis/issues. \ No newline at end of file diff --git a/frame/0/CMakeLists.txt b/frame/0/CMakeLists.txt deleted file mode 100644 index fb97eedd22..0000000000 --- a/frame/0/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l0_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l0_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l0_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l0_tapi.c - ) - -add_subdirectory(copysc) diff --git a/frame/0/copysc/CMakeLists.txt b/frame/0/copysc/CMakeLists.txt deleted file mode 100644 index 4088bc9954..0000000000 --- a/frame/0/copysc/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_copysc.c - ) diff --git a/frame/1/CMakeLists.txt b/frame/1/CMakeLists.txt deleted file mode 100644 index 017d839e46..0000000000 --- a/frame/1/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1v_tapi_ex.c -) - diff --git a/frame/1d/CMakeLists.txt b/frame/1d/CMakeLists.txt deleted file mode 100644 index f7d01eba3f..0000000000 --- a/frame/1d/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1d_tapi_ex.c - ) diff --git a/frame/1f/CMakeLists.txt b/frame/1f/CMakeLists.txt deleted file mode 100644 index 470151fbca..0000000000 --- a/frame/1f/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1f_tapi_ex.c - ) diff --git a/frame/1m/CMakeLists.txt b/frame/1m/CMakeLists.txt deleted file mode 100644 index 2c027f38e5..0000000000 --- a/frame/1m/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_tapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l1m_unb_var1.c - ) - -set(SUBDIRECTORIES "packm" "unpackm") - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() diff --git a/frame/1m/packm/CMakeLists.txt b/frame/1m/packm/CMakeLists.txt deleted file mode 100644 index 37963d46c0..0000000000 --- a/frame/1m/packm/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_blk_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_blk_var1_md.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cntl.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_1er.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_3mis.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_4mi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_rih.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_init.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_part.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk_1er.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk_3mis.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk_4mi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk_md.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_struc_cxk_rih.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_thrinfo.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_full.c - ) - diff --git a/frame/1m/unpackm/CMakeLists.txt b/frame/1m/unpackm/CMakeLists.txt deleted file mode 100644 index 8d1459ca6d..0000000000 --- a/frame/1m/unpackm/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_blk_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_cntl.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_cxk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_unb_var1.c - ) diff --git a/frame/2/CMakeLists.txt b/frame/2/CMakeLists.txt deleted file mode 100644 index fa40b45d36..0000000000 --- a/frame/2/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l2_tapi_ex.c - ) - -set(SUBDIRECTORIES "gemv" "ger" "hemv" "her" "her2" "trmv" "trsv") - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() diff --git a/frame/2/gemv/CMakeLists.txt b/frame/2/gemv/CMakeLists.txt deleted file mode 100644 index 9768c9f6ff..0000000000 --- a/frame/2/gemv/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_var_oapi.c - ) - -# Select AMD specific sources for AMD configurations. -if(${TARGET_ARCH} STREQUAL zen OR - ${TARGET_ARCH} STREQUAL zen2 OR - ${TARGET_ARCH} STREQUAL zen3 OR - ${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var1_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var2_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_unf_var2.c - ) -endif() diff --git a/frame/2/ger/CMakeLists.txt b/frame/2/ger/CMakeLists.txt deleted file mode 100644 index f9869b73c4..0000000000 --- a/frame/2/ger/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_ger_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_ger_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_ger_var_oapi.c - ) - diff --git a/frame/2/hemv/CMakeLists.txt b/frame/2/hemv/CMakeLists.txt deleted file mode 100644 index c1de90e047..0000000000 --- a/frame/2/hemv/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unb_var3.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unb_var4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var1a.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var3a.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_var_oapi.c - ) - -# Select AMD specific sources for AMD configurations. -if(${TARGET_ARCH} STREQUAL zen OR - ${TARGET_ARCH} STREQUAL zen2 OR - ${TARGET_ARCH} STREQUAL zen3 OR - ${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var1_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var3_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var3.c - ) -endif() diff --git a/frame/2/her/CMakeLists.txt b/frame/2/her/CMakeLists.txt deleted file mode 100644 index 0e0f636681..0000000000 --- a/frame/2/her/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_var_oapi.c - ) - -# Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR -${TARGET_ARCH} STREQUAL zen3 OR -${TARGET_ARCH} STREQUAL zen4 OR -${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_unb_var1_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_unb_var2_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_unb_var2.c - ) -endif() diff --git a/frame/2/her2/CMakeLists.txt b/frame/2/her2/CMakeLists.txt deleted file mode 100644 index 817e55cb10..0000000000 --- a/frame/2/her2/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unb_var3.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unb_var4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_var_oapi.c - ) - -# Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR - ${TARGET_ARCH} STREQUAL zen2 OR - ${TARGET_ARCH} STREQUAL zen3 OR - ${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unf_var1_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unf_var4_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unf_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unf_var4.c - ) -endif() diff --git a/frame/2/symv/CMakeLists.txt b/frame/2/symv/CMakeLists.txt deleted file mode 100644 index a374cd0170..0000000000 --- a/frame/2/symv/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_symv.h - ) - diff --git a/frame/2/syr/CMakeLists.txt b/frame/2/syr/CMakeLists.txt deleted file mode 100644 index 1228703d7a..0000000000 --- a/frame/2/syr/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_syr.h - ) diff --git a/frame/2/syr2/CMakeLists.txt b/frame/2/syr2/CMakeLists.txt deleted file mode 100644 index b9e0ba974c..0000000000 --- a/frame/2/syr2/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_syr2.h - ) - diff --git a/frame/2/trmv/CMakeLists.txt b/frame/2/trmv/CMakeLists.txt deleted file mode 100644 index 205df33a6c..0000000000 --- a/frame/2/trmv/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmv_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmv_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmv_unf_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmv_unf_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmv_var_oapi.c - ) diff --git a/frame/2/trsv/CMakeLists.txt b/frame/2/trsv/CMakeLists.txt deleted file mode 100644 index f1aacc745c..0000000000 --- a/frame/2/trsv/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_unb_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_var_oapi.c - ) - -# Select AMD specific sources for AMD configurations. -if(${TARGET_ARCH} STREQUAL zen OR - ${TARGET_ARCH} STREQUAL zen2 OR - ${TARGET_ARCH} STREQUAL zen3 OR - ${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_unf_var1_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_unf_var2_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_unf_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsv_unf_var2.c - ) -endif() diff --git a/frame/3/CMakeLists.txt b/frame/3/CMakeLists.txt deleted file mode 100644 index b3db987c3a..0000000000 --- a/frame/3/CMakeLists.txt +++ /dev/null @@ -1,53 +0,0 @@ -##Copyright (C) 2022-23, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_blocksize.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_cntl.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_direct.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_packm.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_prune.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_packm_a.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_packm_b.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_packm_var.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_var12.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_var1n2m.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_tapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_thrinfo.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_smart_threading.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute.c - ) -# Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR - ${TARGET_ARCH} STREQUAL zen2 OR - ${TARGET_ARCH} STREQUAL zen3 OR - ${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_int_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_int.c - ) -endif() - -set(SUBDIRECTORIES "gemm" "hemm" "her2k" "herk" "symm" "syr2k" "syrk" "trmm" "trmm3" "trsm" "gemmt") - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() diff --git a/frame/3/gemm/CMakeLists.txt b/frame/3/gemm/CMakeLists.txt deleted file mode 100644 index 8969680031..0000000000 --- a/frame/3/gemm/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_blk_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_blk_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_blk_var3.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_cntl.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_ker_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_ker_var2_md.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_md.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_md_c2r_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_packab.c - ) - -# Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR -${TARGET_ARCH} STREQUAL zen3 OR -${TARGET_ARCH} STREQUAL zen4 OR -${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_front_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_front.c - ) -endif() - - add_subdirectory(ind) diff --git a/frame/3/gemm/ind/CMakeLists.txt b/frame/3/gemm/ind/CMakeLists.txt deleted file mode 100644 index e918ba12de..0000000000 --- a/frame/3/gemm/ind/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm4mb_ker_var2.c - ) diff --git a/frame/3/gemmt/CMakeLists.txt b/frame/3/gemmt/CMakeLists.txt deleted file mode 100644 index 44437e66a3..0000000000 --- a/frame/3/gemmt/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_front.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_ker_var2.c - ) - -# Select AMD specific sources for AMD configurations. -if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR -${TARGET_ARCH} STREQUAL zen3 OR -${TARGET_ARCH} STREQUAL zen4 OR -${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_sup_var1n2m_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmt_sup_var1n2m.c - ) -endif() - diff --git a/frame/3/hemm/CMakeLists.txt b/frame/3/hemm/CMakeLists.txt deleted file mode 100644 index 8d85393c0c..0000000000 --- a/frame/3/hemm/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemm_front.c - ) - diff --git a/frame/3/her2k/CMakeLists.txt b/frame/3/her2k/CMakeLists.txt deleted file mode 100644 index 1e35580c86..0000000000 --- a/frame/3/her2k/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2k_front.c - ) - diff --git a/frame/3/herk/CMakeLists.txt b/frame/3/herk/CMakeLists.txt deleted file mode 100644 index 5b8f7e9bb5..0000000000 --- a/frame/3/herk/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_herk_front.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_herk_l_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_herk_u_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_herk_x_ker_var2.c - ) - diff --git a/frame/3/symm/CMakeLists.txt b/frame/3/symm/CMakeLists.txt deleted file mode 100644 index aa13ef04f8..0000000000 --- a/frame/3/symm/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_symm_front.c - ) - diff --git a/frame/3/syr2k/CMakeLists.txt b/frame/3/syr2k/CMakeLists.txt deleted file mode 100644 index df351748ec..0000000000 --- a/frame/3/syr2k/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_syr2k_front.c - ) - diff --git a/frame/3/syrk/CMakeLists.txt b/frame/3/syrk/CMakeLists.txt deleted file mode 100644 index 305cfc469b..0000000000 --- a/frame/3/syrk/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_syrk_front.c - ) - diff --git a/frame/3/trmm/CMakeLists.txt b/frame/3/trmm/CMakeLists.txt deleted file mode 100644 index 49106e4b10..0000000000 --- a/frame/3/trmm/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -##Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_ll_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_lu_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_rl_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_ru_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_xx_ker_var2.c - ) -# Select AMD specific sources for AMD configurations. -if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR -${TARGET_ARCH} STREQUAL zen3 OR -${TARGET_ARCH} STREQUAL zen4 OR -${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_front_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm_front.c - ) -endif() - diff --git a/frame/3/trmm3/CMakeLists.txt b/frame/3/trmm3/CMakeLists.txt deleted file mode 100644 index d01d3698cd..0000000000 --- a/frame/3/trmm3/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trmm3_front.c - ) - diff --git a/frame/3/trsm/CMakeLists.txt b/frame/3/trsm/CMakeLists.txt deleted file mode 100644 index 829819323d..0000000000 --- a/frame/3/trsm/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_blk_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_blk_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_blk_var3.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_cntl.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_front.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_ll_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_lu_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_packab.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_rl_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_ru_ker_var2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_xx_ker_var2.c - ) - diff --git a/frame/CMakeLists.txt b/frame/CMakeLists.txt index 61599be321..59e8142cc4 100644 --- a/frame/CMakeLists.txt +++ b/frame/CMakeLists.txt @@ -1,20 +1,100 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -add_subdirectory("thread") -add_subdirectory("base") -add_subdirectory("0") -add_subdirectory("1") -add_subdirectory("1d") -add_subdirectory("1f") -add_subdirectory("1m") -add_subdirectory("2") -add_subdirectory("3") -add_subdirectory("compat") -add_subdirectory("ind") -add_subdirectory("util") +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## +# Collect all subdirectory paths that have at least one file with suffix in FRAME_SRC_SUFS list. +get_filepaths_with_suffixes(LOCAL_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR} "${FRAME_SRC_SUFS}") +# AMD has optimized some of the framework files, these optimizations +# may not be compatible with other platforms. 
+# +# In order to keep main framework code independent of AMD changes, +# AMD has duplicated the files and updated them for example +# frame/compat/bla_gemm.c : generic framework file +# frame/compat/bla_gemm_amd.c : AMD optimized framework file +# Based on the architecture we choose the correct files +if(ENABLE_AOCL_ZEN) + # Build is being done for AMD platforms, remove the objects which + # don't have amd suffix (for which an AMD-specific implementation exists). + # Create a copy of the source files so that we can do regex matching. + set(TEMP_SOURCE_LIST ${LOCAL_SOURCE_FILES}) + # Create a list with all the files that have _amd. in the name. + list(FILTER TEMP_SOURCE_LIST INCLUDE REGEX "_amd\.c$") + # Remove _amd from all items in the list. This will leave us with all + # source files, for which an AMD-optimized file exists. + list(TRANSFORM TEMP_SOURCE_LIST REPLACE "_amd\.c$" "\.c") + list(REMOVE_ITEM LOCAL_SOURCE_FILES ${TEMP_SOURCE_LIST}) +else() + # Build is done for non AMD platforms, remove the amd specific files. + list(FILTER LOCAL_SOURCE_FILES EXCLUDE REGEX "_amd\.c$") +endif() +# Remove frame/compat/f2c/bla_xerbla_array.c from the list when building on Windows +# to avoid duplicate symbols issue when we use BLIS with LIBFLAME. +# This is a hack and the symbol should be removed from LIBFLAME instead. +list(FILTER LOCAL_SOURCE_FILES EXCLUDE REGEX "bla_xerbla_array\.c$") +# Add corresponding definitions for API that is being exported. +if(WIN32) + if(ENABLE_NO_UNDERSCORE_API) + set(EXPORT_API_DEFS "-DENABLE_NO_UNDERSCORE_API") + endif() + if(ENABLE_UPPERCASE_API) + list(APPEND EXPORT_API_DEFS "-DBLIS_ENABLE_UPPERCASE_API") + endif() +endif() +# Create an object library using the source file list above. +add_library(FRAME + OBJECT + ${LOCAL_SOURCE_FILES} + ) +# Include the corresponding make_defs.cmake that holds the required compiler options.
+include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) +# Use PRIVATE keyword for option setting since we do not want the properties to propagate to other targets. +# mimicking get-frame-cflags-for +target_compile_options(FRAME + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-frame-cflags-for + ${BUILD_SYMFLAGS} + ) +target_compile_definitions(FRAME + PRIVATE + # get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-frame-cflags-for + ${BUILD_CPPFLAGS} + # Windows-specific definitions + ${EXPORT_API_DEFS} + ) +target_include_directories(FRAME + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + ) +if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(FRAME PRIVATE OpenMP::OpenMP_C) +elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(FRAME PRIVATE ${CTHREADFLAGS}) +endif() +if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(FRAME PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() +add_dependencies(FRAME flat-header) +# Put all those targets under object-libs-targets folder name so that they appear all together in IDE. +set_target_properties(FRAME PROPERTIES FOLDER object-libs-targets) \ No newline at end of file diff --git a/frame/base/cast/CMakeLists.txt b/frame/base/cast/CMakeLists.txt deleted file mode 100644 index 5f399de0db..0000000000 --- a/frame/base/cast/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc.
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/bli_castm.c - ${CMAKE_CURRENT_LIST_DIR}/bli_castnzm.c - ${CMAKE_CURRENT_LIST_DIR}/bli_castv.c - ) - - diff --git a/frame/base/check/CMakeLists.txt b/frame/base/check/CMakeLists.txt deleted file mode 100644 index eeb13989ac..0000000000 --- a/frame/base/check/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/bli_obj_check.c - ${CMAKE_CURRENT_LIST_DIR}/bli_part_check.c - ) - diff --git a/frame/base/noopt/CMakeLists.txt b/frame/base/noopt/CMakeLists.txt deleted file mode 100644 index d97d6c95b2..0000000000 --- a/frame/base/noopt/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/bli_dlamch.c - ${CMAKE_CURRENT_LIST_DIR}/bli_lsame.c - ${CMAKE_CURRENT_LIST_DIR}/bli_slamch.c - ) - diff --git a/frame/base/proj/CMakeLists.txt b/frame/base/proj/CMakeLists.txt deleted file mode 100644 index 2b441a531c..0000000000 --- a/frame/base/proj/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/bli_projm.c - ${CMAKE_CURRENT_LIST_DIR}/bli_projv.c - ) - - diff --git a/frame/compat/CMakeLists.txt b/frame/compat/CMakeLists.txt deleted file mode 100644 index 0cd2059d8a..0000000000 --- a/frame/compat/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - -${CMAKE_CURRENT_SOURCE_DIR}/bla_amin.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_asum.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm3m.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemmt.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_ger.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hemm.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hemv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_her.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_her2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_her2k.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_herk.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_nrm2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_symm.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_symv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_syr.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_syr2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_syr2k.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_syrk.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_trmm.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_trmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_trsv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_batch.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_axpby.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_imatcopy.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_omatadd.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute.c - ) - -# Select AMD specific sources for AMD configurations. 
-if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR -${TARGET_ARCH} STREQUAL zen3 OR -${TARGET_ARCH} STREQUAL zen4 OR -${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bla_amax_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_axpy_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_copy_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_dot_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemv_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_scal_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_swap_amd.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_amd.c - ) -else() - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bla_amax.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_axpy.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_copy.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_dot.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_gemv.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_scal.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_swap.c - ${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm.c - ) -endif() - -#Add all subdirectories -# add_subdirectory(attic) -# add_subdirectory(blis) - add_subdirectory(cblas) - add_subdirectory(check) - add_subdirectory(f2c) diff --git a/frame/compat/attic/CMakeLists.txt b/frame/compat/attic/CMakeLists.txt deleted file mode 100644 index 7a03fa8ac6..0000000000 --- a/frame/compat/attic/CMakeLists.txt +++ /dev/null @@ -1,41 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE -${CMAKE_CURRENT_SOURCE_DIR}/bla_gbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gbmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_hbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hbmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpr.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpr.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpr2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpr2.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_rot.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rot.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotg.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotg.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotm.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotm.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotmg.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotmg.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_sbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_sbmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_spmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_spmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_spr.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_spr.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_spr2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_spr2.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_tbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tbmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_tbsv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tbsv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_tpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tpmv.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_tpsv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tpsv.h - ) - - diff --git a/frame/compat/blis/CMakeLists.txt b/frame/compat/blis/CMakeLists.txt deleted file mode 100644 index 349286da89..0000000000 --- a/frame/compat/blis/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -if(UNIX) - add_subdirectory(thread) -endif() diff --git a/frame/compat/blis/thread/CMakeLists.txt b/frame/compat/blis/thread/CMakeLists.txt deleted file mode 100644 index 2307d90ab2..0000000000 --- a/frame/compat/blis/thread/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/b77_thread.c - ) - - - diff --git a/frame/compat/cblas/CMakeLists.txt b/frame/compat/cblas/CMakeLists.txt deleted file mode 100644 index 7a1a13b738..0000000000 --- a/frame/compat/cblas/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -#Add all subdirectories - add_subdirectory(f77_sub) - add_subdirectory(src) - - - - diff --git a/frame/compat/cblas/f77_sub/CMakeLists.txt b/frame/compat/cblas/f77_sub/CMakeLists.txt deleted file mode 100644 index 26cdd8f387..0000000000 --- a/frame/compat/cblas/f77_sub/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE -${CMAKE_CURRENT_SOURCE_DIR}/f77_amax_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/f77_amin_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/f77_asum_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/f77_dot_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/f77_nrm2_sub.c - ) diff --git a/frame/compat/cblas/src/CMakeLists.txt b/frame/compat/cblas/src/CMakeLists.txt deleted file mode 100644 index 9316c1a0df..0000000000 --- a/frame/compat/cblas/src/CMakeLists.txt +++ /dev/null @@ -1,169 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE -${CMAKE_CURRENT_SOURCE_DIR}/cblas_caxpy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ccopy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cdotc_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cdotu_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgemm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgemv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgerc.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgeru.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_chbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_chemm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_chemv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cher.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cher2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cher2k.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cherk.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_chpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_chpr.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_chpr2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cscal.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_csscal.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cswap.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_csymm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_csyr2k.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_csyrk.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctbsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctpsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctrmm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctrmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctrsm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ctrsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dasum.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_daxpy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dcopy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ddot.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dgbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dgemm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dgemv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dger.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dnrm2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_drot.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_drotg.c 
-${CMAKE_CURRENT_SOURCE_DIR}/cblas_drotm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_drotmg.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dscal.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsdot.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dspmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dspr.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dspr2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dswap.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsymm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsymv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsyr.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsyr2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsyr2k.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dsyrk.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtbsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtpsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtrmm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtrmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtrsm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dtrsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dzasum.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dznrm2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_globals.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_icamax.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_idamax.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_isamax.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_izamax.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_icamin.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_idamin.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_isamin.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_izamin.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sasum.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_saxpy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_scasum.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_scnrm2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_scopy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sdot.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sdsdot.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sgbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sgemm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sgemv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sger.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_snrm2.c 
-${CMAKE_CURRENT_SOURCE_DIR}/cblas_srot.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_srotg.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_srotm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_srotmg.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sscal.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sspmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sspr.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sspr2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sswap.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssymm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssymv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssyr.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssyr2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssyr2k.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ssyrk.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_stbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_stbsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_stpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_stpsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_strmm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_strmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_strsm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_strsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_xerbla.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zaxpy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zcopy.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zdotc_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zdotu_sub.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zdscal.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgemm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgemv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgerc.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgeru.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zhbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zhemm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zhemv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zher.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zher2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zher2k.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zherk.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zhpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zhpr.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zhpr2.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zscal.c 
-${CMAKE_CURRENT_SOURCE_DIR}/cblas_zswap.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zsymm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zsyr2k.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zsyrk.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztbsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztpsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztrmm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztrmv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztrsm.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_ztrsv.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgemmt.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dgemmt.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sgemmt.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgemmt.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_sgemm_batch.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dgemm_batch.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgemm_batch.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgemm_batch.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_saxpby.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_daxpby.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_caxpby.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zaxpby.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_cgemm3m.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_zgemm3m.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_dcabs1.c -${CMAKE_CURRENT_SOURCE_DIR}/cblas_scabs1.c -) diff --git a/frame/compat/check/CMakeLists.txt b/frame/compat/check/CMakeLists.txt deleted file mode 100644 index e3519ecfb5..0000000000 --- a/frame/compat/check/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -##Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. 
## - -target_sources("${PROJECT_NAME}" - PRIVATE -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemmt_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemv_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_ger_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_hemm_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_hemv_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_her2_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_her2k_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_her_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_herk_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_symm_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_symv_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_syr2_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_syr2k_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_syr_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_syrk_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_trmm_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_trmv_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_trsv_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm3m_check.h -${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute_check.h - ) diff --git a/frame/compat/f2c/CMakeLists.txt b/frame/compat/f2c/CMakeLists.txt deleted file mode 100644 index 4a839b8350..0000000000 --- a/frame/compat/f2c/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE -${CMAKE_CURRENT_SOURCE_DIR}/bla_cabs1.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_gbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpr.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_hpr2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_lsame.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rot.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotg.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotm.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_rotmg.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_sbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_spmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_spr.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_spr2.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tbmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tbsv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tpmv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_tpsv.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_xerbla.c - ) - -#Add all subdirectories - add_subdirectory(util) - - - diff --git a/frame/compat/f2c/util/CMakeLists.txt b/frame/compat/f2c/util/CMakeLists.txt deleted file mode 100644 index fc66c79471..0000000000 --- a/frame/compat/f2c/util/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE -${CMAKE_CURRENT_SOURCE_DIR}/bla_c_abs.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_c_div.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_d_abs.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_d_cnjg.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_d_imag.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_d_sign.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_f__cabs.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_r_abs.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_r_cnjg.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_r_imag.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_r_sign.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_z_abs.c -${CMAKE_CURRENT_SOURCE_DIR}/bla_z_div.c - ) - - - diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index baa099a68a..67ef603ce6 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -231,7 +231,7 @@ #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else - #define BLIS_EXPORT __declspec(dllimport) + #define BLIS_EXPORT #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) diff --git a/frame/ind/CMakeLists.txt b/frame/ind/CMakeLists.txt deleted file mode 100644 index 1e48c505a4..0000000000 --- a/frame/ind/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_ind.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ind.c - ) - -set(SUBDIRECTORIES "cntx" "oapi" "tapi" ) - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() - diff --git a/frame/ind/cntx/CMakeLists.txt b/frame/ind/cntx/CMakeLists.txt deleted file mode 100644 index 0f67a80598..0000000000 --- a/frame/ind/cntx/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_ind_stage.c - ) - diff --git a/frame/ind/oapi/CMakeLists.txt b/frame/ind/oapi/CMakeLists.txt deleted file mode 100644 index e65f9d3a4a..0000000000 --- a/frame/ind/oapi/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_3m4m1m_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ind_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_nat_oapi.c - ) diff --git a/frame/ind/tapi/CMakeLists.txt b/frame/ind/tapi/CMakeLists.txt deleted file mode 100644 index c921bfdf2a..0000000000 --- a/frame/ind/tapi/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ind_tapi.c - ) - - diff --git a/frame/ind/ukernels/CMakeLists.txt b/frame/ind/ukernels/CMakeLists.txt deleted file mode 100644 index 6d049edd1d..0000000000 --- a/frame/ind/ukernels/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ind_ukr.h - ) diff --git a/frame/thread/CMakeLists.txt b/frame/thread/CMakeLists.txt deleted file mode 100644 index 71c9d6f9b0..0000000000 --- a/frame/thread/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_openmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_single.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_openmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_pthreads.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_single.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_decor_openmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_decor_pthreads.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_sup_decor_single.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_pthread.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrcomm.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrcomm_openmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrcomm_pthreads.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrcomm_single.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thread.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrinfo.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thrinfo_sup.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_full_decor_openmp.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_pack_full_decor_single.c - ) diff --git a/frame/util/CMakeLists.txt b/frame/util/CMakeLists.txt deleted file mode 100644 index 13fd53fc52..0000000000 --- a/frame/util/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_check.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_fpa.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_oapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_oapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_oapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_tapi.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_tapi_ba.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_tapi_ex.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_unb_var1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_update.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_api_wrap.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_util_progress.c - ) diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt index 47501d920c..b132d52cb2 100644 --- a/kernels/CMakeLists.txt +++ b/kernels/CMakeLists.txt @@ -1,10 +1,79 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## -add_subdirectory(haswell) -add_subdirectory(zen) +# Writing a function that will be used to generate the required object +# libraries for the required kernels. +function(generate_kernel_targets kernel_target) + # Collect all subdirectory paths that have at least one file with suffix in KERNELS_SRC_SUFS list. + get_filepaths_with_suffixes(LOCAL_SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${kernel_target}" "${KERNELS_SRC_SUFS}") -if(${TARGET_ARCH} STREQUAL zen4 OR - ${TARGET_ARCH} STREQUAL amdzen) - add_subdirectory(skx) - add_subdirectory(zen4) -endif() + # Choose the correct sub-configuration name for the given kernel set. + get_config_for_kernel_from_kconfig_map(LOCAL_CONFIG ${kernel_target} "${KCONFIG_MAP}") + + # Only generate the object library if there is at least one source file. + list(LENGTH LOCAL_SOURCE_FILES size) + if(size GREATER 0) + # Create an object library using the source file list above.
+ add_library(${kernel_target}_KERNELS + OBJECT + ${LOCAL_SOURCE_FILES} + ) + # Include the corresponding make_defs.cmake that holds the required compiler options. + include(${CMAKE_SOURCE_DIR}/config/${LOCAL_CONFIG}/make_defs.cmake) + # Use PRIVATE keyword for option setting since we do not want the properties to propagate to other targets. + # mimicking get-kernel-cflags-for + target_compile_options(${kernel_target}_KERNELS + PRIVATE + # load-var-for,CKOPTFLAGS + ${CKOPTFLAGS} + # load-var-for,CKVECFLAGS + ${CKVECFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + # get-noopt-cflags-for + ${CWARNFLAGS} + # get-noopt-cflags-for + ${CMISCFLAGS} + # get-noopt-cflags-for + ${CLANGFLAGS} + # in get-kernel-cflags-for + ${COMPSIMDFLAGS} + # in get-kernel-cflags-for + ${BUILD_SYMFLAGS} + ) + target_compile_definitions(${kernel_target}_KERNELS + PRIVATE + # in get-noopt-cflags-for + ${CPPROCFLAGS} + # in get-noopt-cflags-for + ${VERS_DEF} + # in get-kernel-cflags-for + ${BUILD_CPPFLAGS} + ) + target_include_directories(${kernel_target}_KERNELS + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + ) + if(THREADING_MODEL STREQUAL "openmp") + # Equivalent to CTHREADFLAGS in get-noopt-cflags-for + target_link_libraries(${kernel_target}_KERNELS PRIVATE OpenMP::OpenMP_C) + elseif(THREADING_MODEL STREQUAL "pthreads") + # in get-noopt-cflags-for + target_compile_options(${kernel_target}_KERNELS PRIVATE ${CTHREADFLAGS}) + endif() + if(BUILD_SHARED_LIBS) + # Equivalent to CPICFLAGS in get-noopt-cflags-for + set_target_properties(${kernel_target}_KERNELS PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + add_dependencies(${kernel_target}_KERNELS flat-header) + # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. + set_target_properties(${kernel_target}_KERNELS PROPERTIES FOLDER object-libs-targets) + endif() +endfunction() + +# Generate targets for each of the kernels present +# in the kernel list.
+foreach(KERN ${KERNEL_LIST}) + generate_kernel_targets(${KERN}) +endforeach() \ No newline at end of file diff --git a/kernels/haswell/3/CMakeLists.txt b/kernels/haswell/3/CMakeLists.txt deleted file mode 100644 index a42bdadf83..0000000000 --- a/kernels/haswell/3/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(haswell_3 - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_haswell_asm_d6x8.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_haswell_asm_d8x6.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_haswell_asm_d6x8.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_haswell_asm_d6x8.c - ) - -target_compile_options(haswell_3 PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(haswell_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() - -add_subdirectory(sup) diff --git a/kernels/haswell/3/sup/CMakeLists.txt b/kernels/haswell/3/sup/CMakeLists.txt deleted file mode 100644 index e5ed6183c2..0000000000 --- a/kernels/haswell/3/sup/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -add_library(haswell_3sup - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_d6x8m.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_d6x8n.c - #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_s6x16m.c - #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_s6x16n.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_d6x8m.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_d6x8n.c - #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16m.c - #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16n.c - ) -target_compile_options(haswell_3sup PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(haswell_3sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() -add_subdirectory(d6x8) -#add_subdirectory(s6x16) diff --git a/kernels/haswell/3/sup/d6x8/CMakeLists.txt b/kernels/haswell/3/sup/d6x8/CMakeLists.txt deleted file mode 100644 index 5d41661142..0000000000 --- a/kernels/haswell/3/sup/d6x8/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -add_library(haswell_3supd6x8 - OBJECT -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_r_haswell_ref_dMx1.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx1.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx2.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx4.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx8.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx1.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx2.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx3.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx4.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx5.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx6.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx7.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx8.c - ) - -target_compile_options(haswell_3supd6x8 PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(haswell_3supd6x8 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/haswell/3/sup/s6x16/CMakeLists.txt b/kernels/haswell/3/sup/s6x16/CMakeLists.txt deleted file mode 100644 index 0be5cd76e8..0000000000 --- a/kernels/haswell/3/sup/s6x16/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_r_haswell_ref_sMx1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_sMx1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_sMx12.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_sMx16.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_sMx2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_sMx4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_sMx8.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_sMx12.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_sMx16.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_sMx2.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_sMx4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_sMx6.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_sMx8.c - ) - - diff --git a/kernels/haswell/CMakeLists.txt b/kernels/haswell/CMakeLists.txt deleted file mode 100644 index 2a161a1685..0000000000 --- a/kernels/haswell/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.## - -add_subdirectory(3) -add_subdirectory(1m) - diff --git a/kernels/skx/3/CMakeLists.txt b/kernels/skx/3/CMakeLists.txt deleted file mode 100644 index e4125f1b60..0000000000 --- a/kernels/skx/3/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -add_library(skx_3 - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_skx_asm_16x14.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_sgemm_skx_asm_32x12_l2.c - ) -target_compile_options(skx_3 PRIVATE /arch:AVX2 /arch:AVX512) -if(BUILD_SHARED_LIBS) - target_compile_definitions(skx_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/skx/CMakeLists.txt b/kernels/skx/CMakeLists.txt deleted file mode 100644 index a9ba638da8..0000000000 --- a/kernels/skx/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## -remove_definitions(/arch:AVX2) - -add_subdirectory(3) \ No newline at end of file diff --git a/kernels/zen/1/CMakeLists.txt b/kernels/zen/1/CMakeLists.txt deleted file mode 100644 index 87db4ac1c7..0000000000 --- a/kernels/zen/1/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(zen_1 - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_zen_int10.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_zen_int10.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_copyv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotv_zen_int10.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_scalv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_scalv_zen_int10.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_setv_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_swapv_zen_int8.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_norm2_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_scal2v_zen_int.c - ) -target_compile_options(zen_1 PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen_1 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/zen/1f/CMakeLists.txt b/kernels/zen/1f/CMakeLists.txt deleted file mode 100644 
index 5da0c9e7b0..0000000000 --- a/kernels/zen/1f/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -##Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(zen_1f - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_8.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxf_zen_int_8.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_5.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_6.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_zen_int.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c - ) -target_compile_options(zen_1f PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen_1f PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/zen/2/CMakeLists.txt b/kernels/zen/2/CMakeLists.txt deleted file mode 100644 index c9c9220609..0000000000 --- a/kernels/zen/2/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(zen_2 - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_zen_int_4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_int_4.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c - ) -target_compile_options(zen_2 PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen_2 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() -# For any other TARGET_ARCH, it would fail to configure. -# Select AMD specific sources for AMD configurations. 
-#[=[if(${TARGET_ARCH} STREQUAL zen OR -${TARGET_ARCH} STREQUAL zen2 OR -${TARGET_ARCH} STREQUAL zen3 OR -${TARGET_ARCH} STREQUAL zen4 OR -${TARGET_ARCH} STREQUAL amdzen) - target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c - ) -endif()]=] diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt deleted file mode 100644 index 3c99cbafb8..0000000000 --- a/kernels/zen/3/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(zen_3 - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_small.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_tiny.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_avx2_k1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_avx2_k1.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen_2x6.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_l_2x6.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemmtrsm_u_2x6.c - ) -target_compile_options(zen_3 PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() -add_subdirectory(sup) diff --git a/kernels/zen/3/sup/CMakeLists.txt b/kernels/zen/3/sup/CMakeLists.txt deleted file mode 100644 index 57f3ee01ff..0000000000 --- a/kernels/zen/3/sup/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -add_library(zen_3_sup - OBJECT -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16m.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16n.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_z3x4.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_z3x4m.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_z3x4n.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_c3x8.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_c3x8m.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_c3x8n.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_s6x16.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_s6x16m.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_s6x16n.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4m.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4n.c - ) -target_compile_options(zen_3_sup PRIVATE /arch:AVX2) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen_3_sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/zen/CMakeLists.txt b/kernels/zen/CMakeLists.txt deleted file mode 100644 index 0ac346fb3e..0000000000 --- a/kernels/zen/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - - -set(SUBDIRECTORIES "1" "1f" "2" "3" "util") - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() - - diff --git a/kernels/zen/util/CMakeLists.txt b/kernels/zen/util/CMakeLists.txt deleted file mode 100644 index 502ebd1ac2..0000000000 --- a/kernels/zen/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2021, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_thresh_funcs_zen.c - ) diff --git a/kernels/zen4/1/CMakeLists.txt b/kernels/zen4/1/CMakeLists.txt deleted file mode 100644 index 9bfb5d650e..0000000000 --- a/kernels/zen4/1/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(zen4_1 - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_zen_int_avx512.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_scalv_zen_int_avx512.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotv_zen_int_avx512.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_zen_int_avx512.c - ) - -target_compile_options(zen4_1 PRIVATE /arch:AVX2 /arch:AVX512) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen4_1 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/zen4/1m/CMakeLists.txt b/kernels/zen4/1m/CMakeLists.txt deleted file mode 100644 index 9dfbefc458..0000000000 --- a/kernels/zen4/1m/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## - -add_library(zen4_1m - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d8xk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d16xk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d24xk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d32xk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z12xk.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z4xk.c - ) - -target_compile_options(zen4_1m PRIVATE /U__PRFCHW__ /arch:AVX2 /arch:AVX512) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen4_1m PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/zen4/3/sup/CMakeLists.txt b/kernels/zen4/3/sup/CMakeLists.txt deleted file mode 100644 index 81e194ef64..0000000000 --- a/kernels/zen4/3/sup/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -add_library(zen4_3sup - OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.h - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64m.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64n.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.h - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64m.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64n.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_cv_zen4_z12x4m.c -) -target_compile_options(zen4_3sup PRIVATE /arch:AVX2 /arch:AVX512) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen4_3sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() - -add_subdirectory(d24x8) diff --git a/kernels/zen4/3/sup/d24x8/CMakeLists.txt b/kernels/zen4/3/sup/d24x8/CMakeLists.txt deleted file mode 100644 index 004a07c085..0000000000 --- a/kernels/zen4/3/sup/d24x8/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
All rights reserved.## - -add_library(zen4_3supd24x8 - OBJECT -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx1.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx2.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx3.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx4.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx5.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx6.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx7.c -${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx8.c - ) - -target_compile_options(zen4_3supd24x8 PRIVATE /arch:AVX2 /arch:AVX512) -if(BUILD_SHARED_LIBS) - target_compile_definitions(zen4_3supd24x8 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) -endif() diff --git a/kernels/zen4/CMakeLists.txt b/kernels/zen4/CMakeLists.txt deleted file mode 100644 index 7878918053..0000000000 --- a/kernels/zen4/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## -remove_definitions(/arch:AVX2) - -add_subdirectory(1) -add_subdirectory(1m) -add_subdirectory(3) -add_subdirectory(aocl_smart) \ No newline at end of file diff --git a/kernels/zen4/aocl_smart/CMakeLists.txt b/kernels/zen4/aocl_smart/CMakeLists.txt deleted file mode 100644 index ef10975d24..0000000000 --- a/kernels/zen4/aocl_smart/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_aocl_smart.c - ) diff --git a/ref_kernels/1/CMakeLists.txt b/ref_kernels/1/CMakeLists.txt deleted file mode 100644 index c279113758..0000000000 --- a/ref_kernels/1/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_addv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_aminv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_copyv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_invertv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_scal2v_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_scalv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_setv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_subv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_swapv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_xpbyv_ref.c - ) diff --git a/ref_kernels/1f/CMakeLists.txt b/ref_kernels/1f/CMakeLists.txt deleted file mode 100644 index 1b54e5eb80..0000000000 --- a/ref_kernels/1f/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotaxpyv_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxf_ref.c - ) - diff --git a/ref_kernels/1m/CMakeLists.txt b/ref_kernels/1m/CMakeLists.txt deleted file mode 100644 index 34f15ae69f..0000000000 --- a/ref_kernels/1m/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_1er_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_3mis_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_4mi_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_bb_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_cxk_rih_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_unpackm_cxk_ref.c - ) diff --git a/ref_kernels/3/CMakeLists.txt b/ref_kernels/3/CMakeLists.txt deleted file mode 100644 index 3919189eb7..0000000000 --- a/ref_kernels/3/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_ref.c - ) - -add_subdirectory(bb) diff --git a/ref_kernels/3/bb/CMakeLists.txt b/ref_kernels/3/bb/CMakeLists.txt deleted file mode 100644 index a3ce393621..0000000000 --- a/ref_kernels/3/bb/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmbb_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsmbb_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsmbb_ref.c - ) - diff --git a/ref_kernels/CMakeLists.txt b/ref_kernels/CMakeLists.txt deleted file mode 100644 index d26bce06a5..0000000000 --- a/ref_kernels/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -##Copyright (C) 2021, Advanced Micro Devices, Inc. 
All rights reserved.## - -if(${TARGET_ARCH} STREQUAL amdzen) -add_subdirectory(${CMAKE_BINARY_DIR}/ref_kernels/generic ${CMAKE_BINARY_DIR}/ref_kernels/generic) -add_subdirectory(${CMAKE_BINARY_DIR}/ref_kernels/zen ${CMAKE_BINARY_DIR}/ref_kernels/zen) -add_subdirectory(${CMAKE_BINARY_DIR}/ref_kernels/zen2 ${CMAKE_BINARY_DIR}/ref_kernels/zen2) -add_subdirectory(${CMAKE_BINARY_DIR}/ref_kernels/zen3 ${CMAKE_BINARY_DIR}/ref_kernels/zen3) -add_subdirectory(${CMAKE_BINARY_DIR}/ref_kernels/zen4 ${CMAKE_BINARY_DIR}/ref_kernels/zen4) -else() -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_cntx_ref.c - ) - -set(SUBDIRECTORIES "1" "1f" "1m" "3" "ind") - -#Add all subdirectories -foreach(VAR ${SUBDIRECTORIES}) - add_subdirectory(${VAR}) -endforeach() -endif() diff --git a/ref_kernels/ind/CMakeLists.txt b/ref_kernels/ind/CMakeLists.txt deleted file mode 100644 index 0a02584b1a..0000000000 --- a/ref_kernels/ind/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -target_sources("${PROJECT_NAME}" - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm1m_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm3m1_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm3mh_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm4m1_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm4mb_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm4mh_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm1m_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm3m1_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm4m1_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm1m_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm3m1_ref.c - ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm4m1_ref.c - ) diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt index 85866926dd..4e23e0e382 100644 --- a/testsuite/CMakeLists.txt +++ b/testsuite/CMakeLists.txt @@ -1,16 +1,106 @@ -##Copyright (C) 2022, Advanced Micro Devices, Inc. 
All rights reserved.## +##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. +if(NOT DEFINED BLIS_INSTALL_PATH) + set(DIST_PATH ${CMAKE_BINARY_DIR}) + set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) +else() + set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) + set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) +endif() + +# Include the corresponding make_defs.cmake that holds the required compiler options. +include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) -add_executable(test_libblis "") +# Gather all local source files. +file(GLOB testsuite_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/src/*.c) -add_subdirectory(src) +# Override the value of CINFLAGS so that the value of CFLAGS returned by +# get-user-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +set(CINFLAGS ${INC_PATH}) -target_link_libraries(test_libblis debug "${LIB_NAME}.lib") -if(ENABLE_OPENMP) - target_link_libraries(test_libblis OpenMP::OpenMP_CXX) +# Create an executable using the sources above. +add_executable(test_libblis.x ${testsuite_sources}) +target_compile_options(test_libblis.x + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + ${CWARNFLAGS} + ${CPICFLAGS} + ${CMISCFLAGS} + ${CLANGFLAGS} + ) +if(WIN32 AND BUILD_SHARED_LIBS) + target_compile_definitions(test_libblis.x + PRIVATE + # in get-noopt-cflags-for + ${VERS_DEF} + # Need to import symbols because the testsuite is using + # static variables which always need to be imported.
+ "-DBLIS_EXPORT=__declspec(dllimport)" + ) +else() + target_compile_definitions(test_libblis.x + PRIVATE + # in get-noopt-cflags-for + ${VERS_DEF} + ) endif() -target_link_libraries(test_libblis optimized "${LIB_NAME}.lib") +target_include_directories(test_libblis.x + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + # Add local header paths + ${CMAKE_CURRENT_SOURCE_DIR}/src + ) +target_link_libraries(test_libblis.x PRIVATE ${LDFLAGS} libblis) +if(THREADING_MODEL STREQUAL "openmp") + target_link_libraries(test_libblis.x PRIVATE OpenMP::OpenMP_C) +endif() + +# -- Test run/check rules -- +# Wrap the creation of testing helpers in this function. +function(add_testblis flavour) + if (NOT(flavour STREQUAL "")) + set(dotflavour .${flavour}) + set(dashflavour -${flavour}) + set(printflavour "(${flavour})") + endif() + # A rule to run the testsuite using the input.*${dotflavour} files, which + # run a set of tests designed to finish much more quickly. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} + COMMAND test_libblis.x -g ${CMAKE_CURRENT_SOURCE_DIR}/input.general${dotflavour} -o ${CMAKE_CURRENT_SOURCE_DIR}/input.operations${dotflavour} > ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} + COMMENT "Running test_libblis.x ${printflavour} with output redirected to ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour}" + DEPENDS test_libblis.x ${CMAKE_CURRENT_SOURCE_DIR}/input.general${dotflavour} ${CMAKE_CURRENT_SOURCE_DIR}/input.operations${dotflavour} + WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis> + VERBATIM + ) + add_custom_target(testblis${dashflavour} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour}) + # Check the results of the BLIS testsuite.
+ add_custom_target(checkblis${dashflavour} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blistest.py ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} + DEPENDS testblis${dashflavour} + ) +endfunction() +# Add testing targets using functions above for all input file options. +add_testblis("") +add_testblis("fast") +add_testblis("mixed") +add_testblis("salt") +add_custom_target(checkblis-md DEPENDS checkblis-mixed) +add_custom_target(testblis-md DEPENDS testblis-mixed) +add_custom_target(testsuite DEPENDS testblis) +# Put all those targets under testsuite-targets folder name so that they appear all together in IDE. +set_target_properties(test_libblis.x testblis checkblis testblis-fast checkblis-fast testblis-md checkblis-md testblis-mixed checkblis-mixed testblis-salt checkblis-salt + PROPERTIES FOLDER testsuite-targets) \ No newline at end of file diff --git a/testsuite/src/CMakeLists.txt b/testsuite/src/CMakeLists.txt deleted file mode 100644 index 7180ac1ca6..0000000000 --- a/testsuite/src/CMakeLists.txt +++ /dev/null @@ -1,60 +0,0 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved## - -target_sources(test_libblis - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/test_addm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_addv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_amaxv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_axpbyv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_axpy2v.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_axpyf.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_axpym.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_axpyv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_copym.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_copyv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_dotaxpyv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_dotv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_dotxaxpyf.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_dotxf.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_dotxv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_gemm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_gemmt.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_gemm_ukr.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_gemmtrsm_ukr.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_gemv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_ger.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_hemm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_hemv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_her.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_her2.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_her2k.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_herk.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_libblis.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_normfm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_normfv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_randm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_randv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_scal2m.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_scal2v.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_scalm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_scalv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_setm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_setv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_subm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_subv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_symm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_symv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_syr.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_syr2.c - 
${CMAKE_CURRENT_SOURCE_DIR}/test_syr2k.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_syrk.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_trmm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_trmm3.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_trmv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_trsm.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_trsm_ukr.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_trsv.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_xpbym.c - ${CMAKE_CURRENT_SOURCE_DIR}/test_xpbyv.c - ) - diff --git a/vendor/testcpp/CMakeLists.txt b/vendor/testcpp/CMakeLists.txt index 54bb8d2cb7..3e0d1209e9 100644 --- a/vendor/testcpp/CMakeLists.txt +++ b/vendor/testcpp/CMakeLists.txt @@ -1,124 +1,70 @@ -##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.## - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -include_directories(${CMAKE_SOURCE_DIR}/cpp) - -add_executable(test_asum_blis test_asum.cc) -target_link_libraries(test_asum_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_axpy_blis test_axpy.cc) -target_link_libraries(test_axpy_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_copy_blis test_copy.cc) -target_link_libraries(test_copy_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_dot_blis test_dot.cc) -target_link_libraries(test_dot_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_dotc_blis test_dotc.cc) -target_link_libraries(test_dotc_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_gbmv_blis test_gbmv.cc) -target_link_libraries(test_gbmv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_gemm_blis test_gemm.cc) -target_link_libraries(test_gemm_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_gemv_blis test_gemv.cc) -target_link_libraries(test_gemv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_ger_blis test_ger.cc) -target_link_libraries(test_ger_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_gerc_blis test_gerc.cc) -target_link_libraries(test_gerc_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_geru_blis test_geru.cc) 
-target_link_libraries(test_geru_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_hemm_blis test_hemm.cc) -target_link_libraries(test_hemm_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_hemv_blis test_hemv.cc) -target_link_libraries(test_hemv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_her2_blis test_her2.cc) -target_link_libraries(test_her2_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_her_blis test_her.cc) -target_link_libraries(test_her_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_herk_blis test_herk.cc) -target_link_libraries(test_herk_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_hpr2_blis test_hpr2.cc) -target_link_libraries(test_hpr2_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_hpr_blis test_hpr.cc) -target_link_libraries(test_hpr_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_nrm2_blis test_nrm2.cc) -target_link_libraries(test_nrm2_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_rot_blis test_rot.cc) -target_link_libraries(test_rot_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_rotg_blis test_rotg.cc) -target_link_libraries(test_rotg_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_rotm_blis test_rotm.cc) -target_link_libraries(test_rotm_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_rotmg_blis test_rotmg.cc) -target_link_libraries(test_rotmg_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_scal_blis test_scal.cc) -target_link_libraries(test_scal_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_sdsdot_blis test_sdsdot.cc) -target_link_libraries(test_sdsdot_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_spr2_blis test_spr2.cc) -target_link_libraries(test_spr2_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_spr_blis test_spr.cc) -target_link_libraries(test_spr_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_swap_blis test_swap.cc) -target_link_libraries(test_swap_blis PRIVATE "${LIB_NAME}.lib" ) - 
-add_executable(test_symm_blis test_symm.cc) -target_link_libraries(test_symm_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_syr2_blis test_syr2.cc) -target_link_libraries(test_syr2_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_syr2k_blis test_syr2k.cc) -target_link_libraries(test_syr2k_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_syr_blis test_syr.cc) -target_link_libraries(test_syr_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_syrk_blis test_syrk.cc) -target_link_libraries(test_syrk_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_tbmv_blis test_tbmv.cc) -target_link_libraries(test_tbmv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_tbsv_blis test_tbsv.cc) -target_link_libraries(test_tbsv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_tpmv_blis test_tpmv.cc) -target_link_libraries(test_tpmv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_tpsv_blis test_tpsv.cc) -target_link_libraries(test_tpsv_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_trmm_blis test_trmm.cc) -target_link_libraries(test_trmm_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_trsm_blis test_trsm.cc) -target_link_libraries(test_trsm_blis PRIVATE "${LIB_NAME}.lib" ) - -add_executable(test_trsv_blis test_trsv.cc) -target_link_libraries(test_trsv_blis PRIVATE "${LIB_NAME}.lib" ) +##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## + +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. 
+if(NOT DEFINED BLIS_INSTALL_PATH) + set(DIST_PATH ${CMAKE_BINARY_DIR}) + set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) +else() + set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) + set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) +endif() + +# Include the corresponding make_defs.cmake that holds the required compiler options. +include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake) + +# Gather all local source files. +file(GLOB testcpp_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) +list(TRANSFORM testcpp_sources REPLACE ${CMAKE_CURRENT_SOURCE_DIR}/ "") + +# Override the value of CINCFLAGS so that the value of CFLAGS returned by +# get-user-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +set(CINFLAGS ${INC_PATH}) + +# Create one executable for each of the sources. +foreach(source ${testcpp_sources}) + string(REPLACE .cc "" exec_name ${source}) + string(APPEND exec_name "_blis") + add_executable(${exec_name} ${source}) + target_compile_options(${exec_name} + PRIVATE + # load-var-for,COPTFLAGS + ${COPTFLAGS} + # get-noopt-cflags-for + ${CDBGFLAGS} + ${CWARNFLAGS} + ${CPICFLAGS} + ${CMISCFLAGS} + ${CXXLANGFLAGS} + + ) + target_include_directories(${exec_name} + BEFORE + PRIVATE + # in get-noopt-cflags-for + ${CINFLAGS} + # Add local header paths + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/vendor/cpp + ) + target_link_libraries(${exec_name} PRIVATE ${LDFLAGS} libblis) + if(THREADING_MODEL STREQUAL "openmp") + target_link_libraries(${exec_name} PRIVATE OpenMP::OpenMP_C) + endif() + set_target_properties(${exec_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + # Put all those targets under vendor-testcpp-targets folder name so that they appear all together in IDE. 
+ set_target_properties(${exec_name} PROPERTIES FOLDER vendor-testcpp-targets) + add_custom_target(${exec_name}.x + COMMAND ${exec_name}) + # Put all those targets under vendor-testcpp-targets folder name so that they appear all together in IDE. + set_target_properties(${exec_name}.x PROPERTIES FOLDER vendor-testcpp-targets) + list(APPEND test_executables "${exec_name}.x") +endforeach() + +add_custom_target(checkbliscpp DEPENDS ${test_executables}) +# Put all those targets under vendor-testcpp-targets folder name so that they appear all together in IDE. +set_target_properties(checkbliscpp PROPERTIES FOLDER vendor-testcpp-targets) From e4e4fe55fb7ee548259528258aaceadecf5d16bb Mon Sep 17 00:00:00 2001 From: Eashan Dash Date: Tue, 7 Nov 2023 15:19:29 +0530 Subject: [PATCH 197/226] Added Parameter Checks and DTL Trace for Extension APIs 1. Added input parameter checking for the extension APIs 1. gemm_pack_get_size API 2. gemm_pack API 2. Additionally added early returns for these APIs when m or n dimensions are 0. 3. Routines for input parameter check for all the 3 BLAS extension APIs - gemm_pack_get_size, gemm_pack and gemm_compute are defined in: frame/compat/check/bla_gemm_pack_compute_check.h 4. Added AOCL DTL TRACE for all the functions of 1. gemm_pack_get_size 2. gemm_pack 3. 
gemm_compute AMD-Internal: [CPUPL-3560] Change-Id: I4351b8494d888eae7e7431a7e1e23e442ffc8631 --- frame/1m/packm/bli_pack_full.c | 16 ++++ frame/3/bli_l3_compute.c | 16 +++- frame/3/bli_l3_compute.h | 2 +- frame/compat/bla_gemm_compute.c | 17 +++- frame/compat/bla_gemm_pack.c | 76 ++++++++++++++--- frame/compat/bla_gemm_pack_get_size.c | 62 +++++++++++--- frame/compat/bli_blas.h | 2 +- ..._check.h => bla_gemm_pack_compute_check.h} | 85 ++++++++++++++++++- frame/thread/bli_l3_compute_decor.h | 6 +- frame/thread/bli_l3_compute_decor_openmp.c | 7 +- frame/thread/bli_l3_compute_decor_single.c | 7 +- frame/thread/bli_pack_full_decor_openmp.c | 4 + frame/thread/bli_pack_full_decor_single.c | 4 + 13 files changed, 269 insertions(+), 35 deletions(-) rename frame/compat/check/{bla_gemm_compute_check.h => bla_gemm_pack_compute_check.h} (63%) diff --git a/frame/1m/packm/bli_pack_full.c b/frame/1m/packm/bli_pack_full.c index 682fd243df..831c946f7c 100644 --- a/frame/1m/packm/bli_pack_full.c +++ b/frame/1m/packm/bli_pack_full.c @@ -45,6 +45,8 @@ void bli_pack_full_init rntm_t* rntm ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2); + // Initializing the cntx if one isn't already passed. if ( cntx == NULL ) { cntx = bli_gks_query_cntx(); @@ -77,6 +79,8 @@ void bli_pack_full_init rntm ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2); + } // Full pack function for A matrix @@ -98,6 +102,8 @@ void PASTEMAC(ch,tfuncname) \ thrinfo_t* thread \ ) \ {\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5); \ +\ const num_t dt = PASTEMAC(ch,type); \ \ /* Query the context for various blocksizes. */ \ @@ -191,6 +197,8 @@ void PASTEMAC(ch,tfuncname) \ \ } \ } \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); \ \ } \ @@ -217,6 +225,8 @@ void PASTEMAC(ch,tfuncname) \ thrinfo_t* thread \ ) \ { \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5); \ +\ const num_t dt = PASTEMAC(ch,type); \ \ /* Query the context for various blocksizes. 
*/ \ @@ -354,6 +364,8 @@ void PASTEMAC(ch,tfuncname) \ adjust_B_panel_reordered_jc( &jj, jc_cur_loop ); \ \ } \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); \ \ } \ @@ -374,6 +386,8 @@ void PASTEMAC(ch,tfuncname) \ thrinfo_t* thread \ ) \ { \ +\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4); \ \ const num_t dt = bli_obj_dt( src_obj ); \ \ @@ -429,6 +443,8 @@ void PASTEMAC(ch,tfuncname) \ thread \ ); \ } \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4); \ \ } \ diff --git a/frame/3/bli_l3_compute.c b/frame/3/bli_l3_compute.c index e9925e48b7..29b4cb4526 100644 --- a/frame/3/bli_l3_compute.c +++ b/frame/3/bli_l3_compute.c @@ -45,6 +45,8 @@ void bli_gemm_compute_init rntm_t* rntm ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2); + if ( bli_error_checking_is_enabled() ) { // @todo: Add call to error checking function here @@ -97,9 +99,11 @@ void bli_gemm_compute_init cntx, rntm ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2); } -err_t bli_gemm_compute +void bli_gemm_compute ( obj_t* a, obj_t* b, @@ -110,6 +114,8 @@ err_t bli_gemm_compute thrinfo_t* thread ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4); + const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); @@ -242,7 +248,8 @@ err_t bli_gemm_compute ); } - return BLIS_SUCCESS; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4); + } #undef GENTFUNC @@ -267,6 +274,8 @@ void PASTEMAC( ch, varname ) \ thrinfo_t* restrict thread \ ) \ { \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5); \ +\ const num_t dt = PASTEMAC( ch, type ); \ \ /* If m or n is zero, return immediately. 
*/ \ @@ -644,6 +653,9 @@ void PASTEMAC( ch, varname ) \ &mem_b, \ thread_pb \ ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); \ +\ } INSERT_GENTFUNC_BASIC0_SD( gemm_compute ) diff --git a/frame/3/bli_l3_compute.h b/frame/3/bli_l3_compute.h index ed036d8d2d..9fb0b71c36 100644 --- a/frame/3/bli_l3_compute.h +++ b/frame/3/bli_l3_compute.h @@ -42,7 +42,7 @@ void bli_gemm_compute_init rntm_t* rntm ); -err_t bli_gemm_compute +void bli_gemm_compute ( obj_t* a, obj_t* b, diff --git a/frame/compat/bla_gemm_compute.c b/frame/compat/bla_gemm_compute.c index 0778172bbf..7d2475641b 100644 --- a/frame/compat/bla_gemm_compute.c +++ b/frame/compat/bla_gemm_compute.c @@ -54,6 +54,8 @@ void sgemm_compute_blis_impl float* c, const f77_int* rs_c, const f77_int* cs_c ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + trans_t blis_transa; trans_t blis_transb; dim_t m0, n0, k0; @@ -83,11 +85,12 @@ void sgemm_compute_blis_impl rs_c, cs_c ); - /* Quick return if possible. */ + /* Quick return. */ if ( *m == 0 || *n == 0 ) { /* Finalize BLIS. */ bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); return; } @@ -131,6 +134,9 @@ void sgemm_compute_blis_impl /* Finalize BLIS. */ bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; } @@ -176,6 +182,8 @@ void dgemm_compute_blis_impl double* c, const f77_int* rs_c, const f77_int* cs_c ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + trans_t blis_transa; trans_t blis_transb; dim_t m0, n0, k0; @@ -205,11 +213,12 @@ void dgemm_compute_blis_impl rs_c, cs_c ); - /* Quick return if possible. */ + /* Quick return. */ if ( *m == 0 || *n == 0 ) { /* Finalize BLIS. */ bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); return; } @@ -253,6 +262,10 @@ void dgemm_compute_blis_impl /* Finalize BLIS. 
*/ bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return; } #ifdef BLIS_ENABLE_BLAS diff --git a/frame/compat/bla_gemm_pack.c b/frame/compat/bla_gemm_pack.c index 8feabc8af2..97cdafedf4 100644 --- a/frame/compat/bla_gemm_pack.c +++ b/frame/compat/bla_gemm_pack.c @@ -53,10 +53,36 @@ void sgemm_pack_blis_impl float* dest ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + dim_t m; dim_t n; dim_t k; + bli_init_auto(); // initialize blis + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm_pack) + ( + MKSTR(s), + MKSTR(gemm), + identifier, + trans, + mm, + nn, + kk, + pld + ); + + /* Quick return. */ + if ( *mm == 0 || *nn == 0 ) + { + /* Finalize BLIS. */ + bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + dim_t m0 = 0; dim_t n0 = 0; @@ -88,11 +114,6 @@ void sgemm_pack_blis_impl { bli_set_dims_with_trans( blis_trans, k, n, &m0, &n0 ); } - else - { - bli_print_msg( " Invalid IDENTIFIER setting sgemm_pack_() .", __FILE__, __LINE__ ); - return; - } bli_obj_init_finish_1x1( dt, (float*)alpha, &alpha_obj ); @@ -102,6 +123,13 @@ void sgemm_pack_blis_impl bli_obj_set_conjtrans( blis_trans, &src_obj ); bli_pack_full_init(identifier, &alpha_obj, &src_obj, &dest_obj, NULL, NULL); + + /* Finalize BLIS. */ + bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return; } void sgemm_pack_ @@ -131,10 +159,36 @@ void dgemm_pack_blis_impl double* dest ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + dim_t m; dim_t n; dim_t k; + bli_init_auto(); // initialize blis + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm_pack) + ( + MKSTR(d), + MKSTR(gemm), + identifier, + trans, + mm, + nn, + kk, + pld + ); + + /* Quick return. */ + if ( *mm == 0 || *nn == 0 ) + { + /* Finalize BLIS. 
*/ + bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + dim_t m0 = 0; dim_t n0 = 0; @@ -165,11 +219,6 @@ void dgemm_pack_blis_impl { bli_set_dims_with_trans( blis_trans, k, n, &m0, &n0 ); } - else - { - bli_print_msg( " Invalid IDENTIFIER setting dgemm_pack_() .", __FILE__, __LINE__ ); - return; - } bli_obj_init_finish_1x1( dt, (double*)alpha, &alpha_obj ); @@ -179,6 +228,13 @@ void dgemm_pack_blis_impl bli_obj_set_conjtrans( blis_trans, &src_obj ); bli_pack_full_init(identifier, &alpha_obj, &src_obj, &dest_obj, NULL, NULL); + + /* Finalize BLIS. */ + bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return; } void dgemm_pack_ diff --git a/frame/compat/bla_gemm_pack_get_size.c b/frame/compat/bla_gemm_pack_get_size.c index 53e64b5f2b..32f2acfccb 100644 --- a/frame/compat/bla_gemm_pack_get_size.c +++ b/frame/compat/bla_gemm_pack_get_size.c @@ -55,6 +55,26 @@ f77_int dgemm_pack_get_size_blis_impl bli_init_auto(); // initialize blis cntx_t* cntx = bli_gks_query_cntx(); // Get processor specific context. + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm_get_size) + ( + MKSTR(d), + MKSTR(gemm), + identifier, + pm, + pn, + pk + ); + + /* Quick return. */ + if ( *pm == 0 || *pn == 0 ) + { + /* Finalize BLIS. */ + bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return 0; + } + num_t dt = BLIS_DOUBLE; // Double precision f77_int tbytes = 0; // total number of bytes needed for packing. f77_int m = *pm; @@ -126,14 +146,12 @@ f77_int dgemm_pack_get_size_blis_impl tbytes = ps_max * sizeof( double ); } - else - { - bli_print_msg( " Invalid IDENTIFIER setting dgemm_pack_get_size_() .", __FILE__, __LINE__ ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return tbytes; - } + + /* Finalize BLIS. 
*/ + bli_finalize_auto(); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return tbytes; } @@ -158,9 +176,31 @@ f77_int sgemm_pack_get_size_blis_impl const f77_int* pk ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + bli_init_auto(); // initialize blis cntx_t* cntx = bli_gks_query_cntx(); // Get processor specific context. + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm_get_size) + ( + MKSTR(s), + MKSTR(gemm), + identifier, + pm, + pn, + pk + ); + + /* Quick return. */ + if ( *pm == 0 || *pn == 0 ) + { + /* Finalize BLIS. */ + bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return 0; + } + num_t dt = BLIS_FLOAT; // Single precision f77_int tbytes = 0; // total number of bytes needed for packing. f77_int m = *pm; @@ -232,11 +272,11 @@ f77_int sgemm_pack_get_size_blis_impl tbytes = ps_max * sizeof( float ); } - else - { - bli_print_msg( " Invalid IDENTIFIER setting sgemm_pack_get_size_() .", __FILE__, __LINE__ ); - return tbytes; - } + + /* Finalize BLIS. 
*/ + bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); return tbytes; } diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index c3028c1e1f..3e46a16222 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -195,7 +195,7 @@ #include "bla_trmm_check.h" #include "bla_trsm_check.h" #include "bla_gemmt_check.h" -#include "bla_gemm_compute_check.h" +#include "bla_gemm_pack_compute_check.h" // -- Batch Extension prototypes -- #include "bla_gemm_batch.h" diff --git a/frame/compat/check/bla_gemm_compute_check.h b/frame/compat/check/bla_gemm_pack_compute_check.h similarity index 63% rename from frame/compat/check/bla_gemm_compute_check.h rename to frame/compat/check/bla_gemm_pack_compute_check.h index 4264462af6..18c08f1bcb 100644 --- a/frame/compat/check/bla_gemm_compute_check.h +++ b/frame/compat/check/bla_gemm_pack_compute_check.h @@ -32,6 +32,89 @@ */ +#define bla_gemm_get_size_check( dt_str, op_str, identifier, m, n, k ) \ +{ \ + f77_int info = 0; \ + f77_int A_identifier, B_identifier; \ +\ + A_identifier = PASTE_LSAME( identifier, "A", (ftnlen)1, (ftnlen)1 ); \ + B_identifier = PASTE_LSAME( identifier, "B", (ftnlen)1, (ftnlen)1 ); \ +\ + if ( !A_identifier && !B_identifier ) \ + info = 1; \ + else if ( *m < 0 ) \ + info = 2; \ + else if ( *n < 0 ) \ + info = 3; \ + else if ( *k < 0 ) \ + info = 4; \ +\ + if ( info != 0 ) \ + { \ + char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ +\ + sprintf( func_str, "%s%-5s", dt_str, op_str ); \ +\ + bli_string_mkupper( func_str ); \ +\ + PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + return 0; \ + } \ +} + +#define bla_gemm_pack_check( dt_str, op_str, identifier, trans, m, n, k, pld ) \ +{ \ + f77_int info = 0; \ + f77_int A_identifier, B_identifier; \ + f77_int no_trans_param, conj_param, trans_param; \ + f77_int nrow; \ +\ + A_identifier = PASTE_LSAME( identifier, "A", (ftnlen)1, (ftnlen)1 ); \ + B_identifier = PASTE_LSAME( identifier, "B", (ftnlen)1, (ftnlen)1 ); \ +\ + 
no_trans_param = PASTE_LSAME( trans, "N", (ftnlen)1, (ftnlen)1 ); \ + conj_param = PASTE_LSAME( trans, "C", (ftnlen)1, (ftnlen)1 ); \ + trans_param = PASTE_LSAME( trans, "T", (ftnlen)1, (ftnlen)1 ); \ +\ + if ( A_identifier ) \ + { \ + if ( no_trans_param ) { nrow = *m; } \ + else { nrow = *k; } \ + } \ + else if ( B_identifier ) \ + { \ + if ( no_trans_param ) { nrow = *k; } \ + else { nrow = *n; } \ + } \ +\ + if ( !A_identifier && !B_identifier ) \ + info = 1; \ + else if ( !no_trans_param && !conj_param && !trans_param ) \ + info = 2; \ + else if ( *m < 0 ) \ + info = 3; \ + else if ( *n < 0 ) \ + info = 4; \ + else if ( *k < 0 ) \ + info = 5; \ + else if ( *pld < bli_max( 1, nrow ) ) \ + info = 6; \ +\ + if ( info != 0 ) \ + { \ + char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ +\ + sprintf( func_str, "%s%-5s", dt_str, op_str ); \ +\ + bli_string_mkupper( func_str ); \ +\ + PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \ +\ + return; \ + } \ +} + #define bla_gemm_compute_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, rs_c, cs_c ) \ { \ f77_int info = 0; \ @@ -89,4 +172,4 @@ \ return; \ } \ -} \ No newline at end of file +} diff --git a/frame/thread/bli_l3_compute_decor.h b/frame/thread/bli_l3_compute_decor.h index 83ce718ecc..4ed611b333 100644 --- a/frame/thread/bli_l3_compute_decor.h +++ b/frame/thread/bli_l3_compute_decor.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, Advanced Micro Devices, Inc. + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,7 +36,7 @@ #define BLIS_L3_COMPUTE_DECOR_H // Level-3 compute internal function type. -typedef err_t (*l3computeint_t) +typedef void (*l3computeint_t) ( obj_t* a, obj_t* b, @@ -48,7 +48,7 @@ typedef err_t (*l3computeint_t) ); // Level-3 compute thread decorator prototype. 
-err_t bli_l3_compute_thread_decorator +void bli_l3_compute_thread_decorator ( l3computeint_t func, opid_t family, diff --git a/frame/thread/bli_l3_compute_decor_openmp.c b/frame/thread/bli_l3_compute_decor_openmp.c index 6841e0e1c4..c376cb90b8 100644 --- a/frame/thread/bli_l3_compute_decor_openmp.c +++ b/frame/thread/bli_l3_compute_decor_openmp.c @@ -38,7 +38,7 @@ void* bli_l3_compute_thread_entry( void* data_void ) { return NULL; } -err_t bli_l3_compute_thread_decorator +void bli_l3_compute_thread_decorator ( l3computeint_t func, opid_t family, @@ -50,6 +50,8 @@ err_t bli_l3_compute_thread_decorator rntm_t* rntm ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + // Query the total number of threads from the rntm_t object. const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -123,7 +125,8 @@ err_t bli_l3_compute_thread_decorator // mutual exclusion. bli_sba_checkin_array( array ); - return BLIS_SUCCESS; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + } #endif diff --git a/frame/thread/bli_l3_compute_decor_single.c b/frame/thread/bli_l3_compute_decor_single.c index 8995691428..8bd6e5ffc2 100644 --- a/frame/thread/bli_l3_compute_decor_single.c +++ b/frame/thread/bli_l3_compute_decor_single.c @@ -36,7 +36,7 @@ #if !defined (BLIS_ENABLE_MULTITHREADING) || defined (BLIS_ENABLE_PTHREADS) -err_t bli_l3_compute_thread_decorator +void bli_l3_compute_thread_decorator ( l3computeint_t func, opid_t family, @@ -48,6 +48,8 @@ err_t bli_l3_compute_thread_decorator rntm_t* rntm ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + const dim_t n_threads = 1; array_t* restrict array = bli_sba_checkout_array( n_threads ); bli_sba_rntm_set_pool( 0, array, rntm ); @@ -81,7 +83,8 @@ err_t bli_l3_compute_thread_decorator bli_sba_checkin_array( array ); - return BLIS_SUCCESS; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + } #endif \ No newline at end of file diff --git a/frame/thread/bli_pack_full_decor_openmp.c b/frame/thread/bli_pack_full_decor_openmp.c index 
a6f94afbb6..430990242b 100644 --- a/frame/thread/bli_pack_full_decor_openmp.c +++ b/frame/thread/bli_pack_full_decor_openmp.c @@ -49,6 +49,8 @@ void bli_pack_full_thread_decorator rntm_t* rntm ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + dim_t n_threads = bli_rntm_num_threads( rntm ); /* Ensure n_threads is always greater than or equal to 1 */ @@ -76,6 +78,8 @@ void bli_pack_full_thread_decorator &thread ); } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); } #endif diff --git a/frame/thread/bli_pack_full_decor_single.c b/frame/thread/bli_pack_full_decor_single.c index b946a0326d..7e7afeddd8 100644 --- a/frame/thread/bli_pack_full_decor_single.c +++ b/frame/thread/bli_pack_full_decor_single.c @@ -49,6 +49,8 @@ void bli_pack_full_thread_decorator rntm_t* rntm ) { + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + thrinfo_t thread = BLIS_GEMM_SINGLE_THREADED; { @@ -66,6 +68,8 @@ void bli_pack_full_thread_decorator ); } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + } #endif From bd0b50a0774320481ca8b78e978323bdfcdbdf34 Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Thu, 9 Nov 2023 12:24:22 +0530 Subject: [PATCH 198/226] Introduced fast-path to kernels in DNRM2_ and DZNRM2_ APIs - Added a conditional check to see if the vectorized kernels for DNRM2_ and DZNRM2_ can be called directly, without incurring any framework overhead. - The condition to satisfy this fast-path is for the size to be such that the ideal threads required is 1, with the vector having unit stride( so that packing at the framework-level can be avoided ). 
AMD-Internal: [CPUPL-4045] Change-Id: Ie37e86f802ada0e226dff88e74f0341e97ebfe28 --- frame/util/bli_util_unb_var1.c | 84 +++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index dbabc5a345..22de9b152d 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -460,21 +460,6 @@ void bli_znormfv_unb_var1 norm_fp = bli_dznorm2fv_unb_var1_avx2; reduce_fp = bli_dnorm2fv_unb_var1_avx2; - // Setting the ideal number of threads if support is enabled - #if defined( BLIS_ENABLE_OPENMP ) && defined( AOCL_DYNAMIC ) - if ( n < 2000 ) - nt_ideal = 1; - else if ( n < 6500 ) - nt_ideal = 4; - else if ( n < 71000 ) - nt_ideal = 8; - else if ( n < 200000 ) - nt_ideal = 16; - else if ( n < 1530000 ) - nt_ideal = 32; - - #endif - break; #endif default:; @@ -517,6 +502,32 @@ void bli_znormfv_unb_var1 */ if ( norm_fp == NULL && reduce_fp == NULL ) return; + + /* + When the size is such that nt_ideal is 1, and packing is not + required( incx == 1 ), we can directly call the kernel to + avoid framework overheads( fast-path ). + */ + else if ( ( incx == 1 ) && ( n < 2000 ) ) + { + norm_fp( n, x, incx, norm, cntx ); + return; + } + + // Setting the ideal number of threads if support is enabled + #if defined( BLIS_ENABLE_OPENMP ) && defined( AOCL_DYNAMIC ) + if ( n < 2000 ) + nt_ideal = 1; + else if ( n < 6500 ) + nt_ideal = 4; + else if ( n < 71000 ) + nt_ideal = 8; + else if ( n < 200000 ) + nt_ideal = 16; + else if ( n < 1530000 ) + nt_ideal = 32; + + #endif // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
@@ -1023,22 +1034,6 @@ void bli_dnormfv_unb_var1 norm_fp = bli_dnorm2fv_unb_var1_avx2; - // Setting the ideal number of threads if support is enabled - #if defined( BLIS_ENABLE_OPENMP ) && defined( AOCL_DYNAMIC ) - - if ( n < 4000 ) - nt_ideal = 1; - else if ( n < 17000 ) - nt_ideal = 4; - else if ( n < 136000 ) - nt_ideal = 8; - else if ( n < 365000 ) - nt_ideal = 16; - else if ( n < 2950000 ) - nt_ideal = 32; - - #endif - break; #endif default:; @@ -1080,6 +1075,33 @@ void bli_dnormfv_unb_var1 */ if ( norm_fp == NULL ) return; + + /* + When the size is such that nt_ideal is 1, and packing is not + required( incx == 1 ), we can directly call the kernel to + avoid framework overheads( fast-path ). + */ + else if ( ( incx == 1 ) && ( n < 4000 ) ) + { + norm_fp( n, x, incx, norm, cntx ); + return; + } + + // Setting the ideal number of threads if support is enabled + #if defined( BLIS_ENABLE_OPENMP ) && defined( AOCL_DYNAMIC ) + + if ( n < 4000 ) + nt_ideal = 1; + else if ( n < 17000 ) + nt_ideal = 4; + else if ( n < 136000 ) + nt_ideal = 8; + else if ( n < 365000 ) + nt_ideal = 16; + else if ( n < 2950000 ) + nt_ideal = 32; + + #endif // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
From 77bd9a7f1794b9fe3d3d25d615fe08727b77f2f5 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Wed, 8 Nov 2023 09:28:12 +0530 Subject: [PATCH 199/226] Added parameter checking for LPGEMM APIs Change-Id: I6ea89fd0d2516539e5a4e9cd8537570b23194d89 --- addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c | 62 ++++------- addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c | 63 ++++------- addon/aocl_gemm/aocl_gemm_check.h | 104 +++++++++++++++++++ addon/aocl_gemm/aocl_gemm_f32f32f32of32.c | 56 +++------- addon/aocl_gemm/aocl_gemm_s8s8s16os16.c | 54 ++++------ addon/aocl_gemm/aocl_gemm_s8s8s16os8.c | 54 ++++------ addon/aocl_gemm/aocl_gemm_s8s8s32os32.c | 55 ++++------ addon/aocl_gemm/aocl_gemm_s8s8s32os8.c | 55 ++++------ addon/aocl_gemm/aocl_gemm_u8s8s16os16.c | 54 ++++------ addon/aocl_gemm/aocl_gemm_u8s8s16os8.c | 54 ++++------ addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c | 54 ++++------ addon/aocl_gemm/aocl_gemm_u8s8s32os32.c | 55 ++++------ addon/aocl_gemm/aocl_gemm_u8s8s32os8.c | 55 ++++------ addon/aocl_gemm/frame/lpgemm_post_ops.c | 29 +++++- addon/aocl_gemm/frame/lpgemm_post_ops.h | 2 +- bench/bench_aocl_gemm/bench_lpgemm.c | 51 ++++----- 16 files changed, 377 insertions(+), 480 deletions(-) create mode 100644 addon/aocl_gemm/aocl_gemm_check.h diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c index f258755e1e..b6462b1645 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -73,52 +74,23 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - return; // Error. - } + // check for validity of params. 
+ AOCL_GEMM_CHECK + ( + "bf16bf16f32obf16", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); bli_param_map_netlib_to_blis_trans( transb, &blis_transb ); - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Check if strides are valid for Row major inputs. - if ( ( is_row_major == TRUE ) && - ( ( bli_is_notrans( blis_transa ) && ( lda < k ) ) || - ( bli_is_trans( blis_transa ) && ( lda < m ) ) || - ( bli_is_notrans( blis_transb ) && ( ldb < n ) ) || - ( bli_is_trans( blis_transb ) && ( ldb < k ) ) || - ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( bli_is_notrans( blis_transa ) && ( lda < m ) ) || - ( bli_is_trans( blis_transa ) && ( lda < k ) ) || - ( bli_is_notrans( blis_transb ) && ( ldb < k ) ) || - ( bli_is_trans( blis_transb ) && ( ldb < n ) ) || - ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - return; // Error. 
- } + bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) ); + bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) ); inc_t rs_a = lda; inc_t cs_a = 1; @@ -150,12 +122,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) // Reorder is not supported for A matrix if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) ) { + bli_print_msg(" Reordering of A matrix is not supported in row major case.", __FILE__, __LINE__ ); return; } // Inputs swapped in column major, A becomes B from kernel point of view. // Reorder is not supported for column major matrices. else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) ) { + bli_print_msg(" Reordering of column major matrices is not supported.", __FILE__, __LINE__ ); return; } @@ -189,12 +163,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16) // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c index 475a39c4a0..0cb20f0060 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -73,53 +74,23 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. 
- if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - return; // Error. - } +// check for validity of params. + AOCL_GEMM_CHECK + ( + "bf16bf16f32of32", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); bli_param_map_netlib_to_blis_trans( transb, &blis_transb ); - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - - // Check if strides are valid for Row major inputs. - if ( ( is_row_major == TRUE ) && - ( ( bli_is_notrans( blis_transa ) && ( lda < k ) ) || - ( bli_is_trans( blis_transa ) && ( lda < m ) ) || - ( bli_is_notrans( blis_transb ) && ( ldb < n ) ) || - ( bli_is_trans( blis_transb ) && ( ldb < k ) ) || - ( ldc < n ) ) ) - { - return; // Error. - } - // Chcek if strides are valid for Column major inputs. - else if ( ( is_column_major == TRUE ) && - ( ( bli_is_notrans( blis_transa ) && ( lda < m ) ) || - ( bli_is_trans( blis_transa ) && ( lda < k ) ) || - ( bli_is_notrans( blis_transb ) && ( ldb < k ) ) || - ( bli_is_trans( blis_transb ) && ( ldb < n ) ) || - ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - return; // Error. - } + bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) ); + bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) ); // The strides are set assuming a row major kernel. 
inc_t rs_a = lda; @@ -151,12 +122,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) // Reorder is not supported for A matrix if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) ) { + bli_print_msg(" Reordering of A matrix is not supported in row major case.", __FILE__, __LINE__ ); return; } // Inputs swapped in column major, A becomes B from kernel point of view. // Reorder is not supported for column major matrices. else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) ) { + bli_print_msg(" Reordering of column major matrices is not supported.", __FILE__, __LINE__ ); return; } @@ -190,12 +163,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32) // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_check.h b/addon/aocl_gemm/aocl_gemm_check.h new file mode 100644 index 0000000000..a49fb78007 --- /dev/null +++ b/addon/aocl_gemm/aocl_gemm_check.h @@ -0,0 +1,104 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// yet to add validity check for postops +#define AOCL_GEMM_CHECK( op_str, \ + order, transa, transb, \ + m, n, k, \ + a, lda, mtag_a, \ + b, ldb, mtag_b, \ + c, ldc \ + ) \ +{ \ + int32_t info = 0; \ + bool col_stored, row_stored; \ + bool nota, notb, ta, tb; \ + \ + col_stored = ( order == 'c' ) || ( order == 'C' ); \ + row_stored = ( order == 'r' ) || ( order == 'R' ); \ + \ + nota = ( transa == 'n' ) || ( transa == 'N' ); \ + notb = ( transb == 'n' ) || ( transb == 'N' ); \ + \ + ta = ( transa == 't' ) || ( transa == 'T' ); \ + tb = ( transb == 't' ) || ( transb == 'T' ); \ + \ + if( ( order != 'r') && ( order != 'R' ) && ( order != 'c' ) && ( order != 'C' ) ) \ + info = 1; \ + else if( ( transa != 'n' ) && ( transa != 'N' ) && ( transa != 't' ) && ( transa != 'T' ) ) \ + info = 2; \ + else if( ( transb != 'n' ) && ( transb != 'N' ) && ( transb != 't' ) && ( transb != 'T' ) ) \ + info = 3; \ + else if ( m <= 0 ) \ + info = 4; \ + else if ( n <= 0 ) \ + info = 5; \ + else if ( k <= 0 ) \ + info = 6; \ + else if ( a == NULL ) \ + info = 8; \ + else if ( row_stored && ( ( nota && ( lda < k ) ) || ( ta && ( lda < m ) ) ) ) \ + info = 9; \ + else if ( col_stored && ( ( nota && ( lda < m ) ) || ( ta && ( lda < k ) ) ) ) \ + info = 9; \ + else if ( ( mtag_a != 'n' ) && ( mtag_a != 'N' ) && \ + ( mtag_a != 'p' ) && ( mtag_a != 'P' ) && \ + ( mtag_a != 'r' ) && ( mtag_a != 'R' ) ) \ + info = 10; \ + else if ( b == NULL ) \ + info = 11; \ + else if ( row_stored && ( ( notb && ( ldb < n ) ) || ( tb && ( ldb < k ) ) ) ) \ + info = 12; \ + else if ( col_stored && ( ( notb && ( ldb < k ) ) || ( tb && ( ldb < n ) ) ) ) \ + info = 12; \ + else if ( ( mtag_b != 'n' ) && ( mtag_b != 'N' ) && \ + ( mtag_b != 'p' ) && ( mtag_b != 'P' ) && \ + ( mtag_b != 'r' ) && ( mtag_b != 'R' ) ) \ + info = 13; \ + else if ( c == NULL ) \ + info = 15; \ + else if ( row_stored && ( ldc < n ) ) \ + info = 16; \ + else if ( col_stored && ( ldc < m ) ) \ + info = 16; \ + \ + if( info != 0 
) \ + { \ + char print_msg[ 100 ]; \ + \ + sprintf( print_msg, "** On entry to %6s, parameter number %2i had an illegal value", op_str, info); \ + bli_print_msg(print_msg, __FILE__, __LINE__); \ + return; \ + } \ +} diff --git a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c index 311b6a05e2..7de6b16369 100644 --- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -64,13 +65,16 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), transa, transb, m, n, k,\ (void*)&alpha, lda, ldb, (void*)&beta, ldc); - // Null check for pointers. - if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \ - "Invalid pointers provided for input parameters."); - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "f32f32f32of32", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); @@ -86,36 +90,8 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. 
- } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \ - "Invalid matrix dimensions."); - return; // Error. - } + bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) ); + bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) ); // The strides are set assuming a row major kernel. const inc_t rs_a = lda; @@ -168,12 +144,14 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32) // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c index 2d4186305a..e9533536ab 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_5loop_interface_apis.h" #include "lpgemm_config.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ((a == NULL) || (b == NULL) || (c == NULL)) - { - return; // Error. - } + // check for validity of params. 
+ AOCL_GEMM_CHECK + ( + "s8s8s16os16", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans(transa, &blis_transa); @@ -75,41 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0)) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -135,17 +116,20 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16) // Only unpacked A supported now. if (mtag_a != UNPACKED) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. 
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c index 6afd6bdd91..8b30c51801 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_5loop_interface_apis.h" #include "lpgemm_config.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ((a == NULL) || (b == NULL) || (c == NULL)) - { - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "s8s8s16os8", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans(transa, &blis_transa); @@ -75,41 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? 
- order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0)) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -135,17 +116,20 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) // Only unpacked A supported now. if (mtag_a != UNPACKED) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c index fa9a58ab2a..413de3f543 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "s8s8s32os32", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); @@ -75,42 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. 
- } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -136,17 +116,20 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) // Only unpacked A supported now. if ( mtag_a != UNPACKED ) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c index 90c54e2d3e..5e7f3ec71c 100644 --- a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - return; // Error. - } + // check for validity of params. 
+ AOCL_GEMM_CHECK + ( + "s8s8s32os8", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); @@ -75,42 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -136,17 +116,20 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) // Only unpacked A supported now. if ( mtag_a != UNPACKED ) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. 
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c index c31ba7a855..1c21ff8103 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_5loop_interface_apis.h" #include "lpgemm_config.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ((a == NULL) || (b == NULL) || (c == NULL)) - { - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "u8s8s16os16", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans(transa, &blis_transa); @@ -75,41 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? 
- order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0)) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -135,17 +116,20 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16) // Only unpacked A supported now. if (mtag_a != UNPACKED) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c index 5869eb79c2..d159fe5b6d 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_5loop_interface_apis.h" #include "lpgemm_config.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ((a == NULL) || (b == NULL) || (c == NULL)) - { - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "u8s8s16os8", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans(transa, &blis_transa); @@ -75,41 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. 
- } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0)) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -135,17 +116,20 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8) // Only unpacked A supported now. if (mtag_a != UNPACKED) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c index 325160ffac..fef861be1e 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_5loop_interface_apis.h" #include "lpgemm_config.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ((a == NULL) || (b == NULL) || (c == NULL)) - { - return; // Error. - } + // check for validity of params. 
+ AOCL_GEMM_CHECK + ( + "u8s8s16ou8", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans(transa, &blis_transa); @@ -75,41 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0)) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -135,17 +116,20 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8) // Only unpacked A supported now. if (mtag_a != UNPACKED) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. 
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c index 95291c1aef..194a608e16 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "u8s8s32os32", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); @@ -75,42 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? 
- order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. - } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -136,17 +116,20 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32) // Only unpacked A supported now. if ( mtag_a != UNPACKED ) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_g; diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c index 10f8208808..d7de73363b 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c @@ -34,6 +34,7 @@ #include "blis.h" #include "aocl_gemm_interface_apis.h" +#include "aocl_gemm_check.h" #include "lpgemm_types.h" #include "lpgemm_post_ops.h" #include "lpgemm_thread_decor_openmp.h" @@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) // Set MC, NC, KC, NR, MR. aocl_lpgemm_init_global_cntx(); - // Null check for pointers. - if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) ) - { - return; // Error. - } + // check for validity of params. + AOCL_GEMM_CHECK + ( + "u8s8s32os8", + order, transa, transb, + m, n, k, + a, lda, mem_format_a, + b, ldb, mem_format_b, + c, ldc + ); /* Map BLAS chars to their corresponding BLIS enumerated type value. */ bli_param_map_netlib_to_blis_trans( transa, &blis_transa ); @@ -75,42 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) if ( ( blis_transa != BLIS_NO_TRANSPOSE ) || ( blis_transb != BLIS_NO_TRANSPOSE ) ) { + bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ ); return; // Error. } - // Sanitize order input. - char order_use = - ( ( order == 'r' ) || ( order == 'R' ) || - ( order == 'c' ) || ( order == 'C' ) ) ? - order : 'r'; - if ( ( order_use != 'r' ) && ( order_use != 'R' ) ) + if ( ( order != 'r' ) && ( order != 'R' ) ) { + bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ ); return; // Only row major supported. } - bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) ); - bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) ); - - // Row major input expected with leading dimensions >= row stride. - if ( ( is_row_major == TRUE ) && - ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) ) - { - return; // Error. 
- } - // Column major input expected with leading dimensions >= column stride. - else if ( ( is_column_major == TRUE ) && - ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) ) - { - return; // Error. - } - - // Check if dimensions are valid. - if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) || - ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) ) - { - return; // Error. - } - const inc_t rs_a = lda; const inc_t cs_a = 1; const inc_t rs_b = ldb; @@ -136,17 +116,20 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8) // Only unpacked A supported now. if ( mtag_a != UNPACKED ) { + bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ ); return; // Error. } // Convert post op struct to post op linked list format. lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS]; - lpgemm_translate_to_post_ops_list + err_t err = lpgemm_translate_to_post_ops_list ( post_op_unparsed, post_op_list, - ( void* )c, ( void* )( &order_use ) + ( void* )c, ( void* )( &order ) ); + if( err != BLIS_SUCCESS ) return; + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_g; diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.c b/addon/aocl_gemm/frame/lpgemm_post_ops.c index fffe14c0f8..855a880025 100644 --- a/addon/aocl_gemm/frame/lpgemm_post_ops.c +++ b/addon/aocl_gemm/frame/lpgemm_post_ops.c @@ -55,7 +55,7 @@ BLIS_INLINE void lpgemm_set_node_params post_op_node->next = NULL; } -void lpgemm_translate_to_post_ops_list +err_t lpgemm_translate_to_post_ops_list ( aocl_post_op* post_op_unparsed, lpgemm_post_op* post_op_list, @@ -70,7 +70,7 @@ void lpgemm_translate_to_post_ops_list post_op_list, POST_OPS_DISABLE, NULL, NULL, NULL, NULL, FALSE ); - return; + return BLIS_SUCCESS; } if ( ( post_op_unparsed->seq_length > AOCL_MAX_POST_OPS ) ) @@ -80,7 +80,7 @@ void lpgemm_translate_to_post_ops_list post_op_list, POST_OPS_DISABLE, NULL, NULL, NULL, NULL, FALSE ); - return; //Error, seq length exceeds max post ops permitted. + return BLIS_SUCCESS; //Error, seq length exceeds max post ops permitted. } dim_t e_i = 0; //Multiple eltwise supported. @@ -110,6 +110,11 @@ void lpgemm_translate_to_post_ops_list tmp_code = POST_OPS_RELU; break; case PRELU: + if( ( post_op_unparsed->eltwise + e_i )->algo.alpha == NULL ) + { + bli_print_msg(" Post_op.alpha is NULL. Exiting..", __FILE__, __LINE__ ); + return BLIS_NULL_POINTER; + } tmp_code = POST_OPS_RELU_SCALE; break; case GELU_TANH: @@ -119,6 +124,12 @@ void lpgemm_translate_to_post_ops_list tmp_code = POST_OPS_GELU_ERF; break; case CLIP: + if( ( ( post_op_unparsed->eltwise + e_i )->algo.alpha == NULL ) || + ( ( post_op_unparsed->eltwise + e_i )->algo.beta == NULL ) ) + { + bli_print_msg(" Post_op.clip min or max value is NULL. Exiting..", __FILE__, __LINE__ ); + return BLIS_NULL_POINTER; + } tmp_code = POST_OPS_CLIP; break; default: @@ -137,6 +148,11 @@ void lpgemm_translate_to_post_ops_list } break; case BIAS: + if( post_op_unparsed->bias.bias == NULL ) + { + bli_print_msg(" Post_op.bias is NULL. 
Exiting..", __FILE__, __LINE__ ); + return BLIS_NULL_POINTER; + } lpgemm_set_node_params ( ( post_op_list + i ), POST_OPS_BIAS, @@ -145,6 +161,12 @@ void lpgemm_translate_to_post_ops_list ); break; case SCALE: + if( ( post_op_unparsed->sum.scale_factor == NULL ) || + ( post_op_unparsed->sum.zero_point == NULL ) ) + { + bli_print_msg(" Post_op.scale scale_factor or zero_point is NULL. Exiting..", __FILE__, __LINE__ ); + return BLIS_NULL_POINTER; + } lpgemm_set_node_params ( ( post_op_list + i ), POST_OPS_DOWNSCALE, @@ -163,4 +185,5 @@ void lpgemm_translate_to_post_ops_list ( post_op_list + i )->next = ( post_op_list + i + 1); } } + return BLIS_SUCCESS; } diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.h b/addon/aocl_gemm/frame/lpgemm_post_ops.h index f0a0cea8b5..8b17ee4660 100644 --- a/addon/aocl_gemm/frame/lpgemm_post_ops.h +++ b/addon/aocl_gemm/frame/lpgemm_post_ops.h @@ -76,7 +76,7 @@ typedef struct lpgemm_post_op_attr_t int16_t* b_col_sum_vec_s16; } lpgemm_post_op_attr; -void lpgemm_translate_to_post_ops_list +err_t lpgemm_translate_to_post_ops_list ( aocl_post_op* post_op_unparsed, lpgemm_post_op* post_op_list, diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 0bf7410193..09d2de818b 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -86,7 +86,7 @@ static inline void bfloat16_to_float( bfloat16 bf16_val, float* float_val ) { int32_t inter_temp = *( ( int16_t* ) &bf16_val ); inter_temp = inter_temp << 16; - *float_val = *(( float* ) ( &inter_temp )); + memcpy( float_val, &inter_temp, sizeof( int32_t ) ); } #define CONVERT_TO_FLOAT(ctype) \ @@ -144,9 +144,17 @@ PRINT_MATRIX(int16_t) PRINT_MATRIX(float) PRINT_MATRIX(int32_t) -void* lpgemm_malloc( size_t size ) +void* lpgemm_malloc( int32_t size ) { void* p; + // creating a dummy buffer of size 4 bytes in case + // size of the matrix is negative. 
+ if( size <= 0 ) + { + p = malloc( 4 ); + return p; + } + if( bench_mode == 'a' ) { p = malloc(size); @@ -185,6 +193,7 @@ void lpgemm_free( void* p ) #define GEN_FILL_ARRAY_FUNC(ctype) \ void fill_array_ ## ctype ( void* arr, dim_t size ) \ { \ + if( size < 0 ) return; \ ctype* temp_arr = ( ctype* ) arr; \ for ( dim_t i = 0; i < size; ++i ) \ { \ @@ -201,6 +210,7 @@ GEN_FILL_ARRAY_FUNC(int32_t) void fill_array_bfloat16( void* arr, dim_t size ) { err_t bli_errors = BLIS_SUCCESS; + if( size < 0 ) return; float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size, &bli_errors ); for ( dim_t i = 0; i < size; ++i ) { @@ -249,34 +259,10 @@ void mat_mul_ ## BLAS_SFX \ aocl_post_op* post_op\ ) \ { \ - char storage = stor_order; \ - char reordera = 'n'; \ - char reorderb = 'n'; \ - \ - if ( ( op_a == 'p' ) || ( op_a == 'P' ) ) \ - { \ - reordera = 'p'; \ - } \ - else if ( ( op_a == 'r' ) || ( op_a == 'R' ) ) \ - { \ - reordera = 'r'; \ - } \ - \ - if ( ( op_b == 'p' ) || ( op_b == 'P' ) ) \ - { \ - /* No reordering of B.*/ \ - reorderb = 'n'; \ - } \ - else if ( ( op_b == 'r' ) || ( op_b == 'R' ) ) \ - { \ - /* Reordered B.*/ \ - reorderb = 'r'; \ - } \ - \ - aocl_gemm_ ## BLAS_SFX( storage, transa, transb, m, n, k, \ + aocl_gemm_ ## BLAS_SFX( stor_order, transa, transb, m, n, k, \ alpha, \ - a, lda, reordera, \ - b, ldb, reorderb, \ + a, lda, op_a, \ + b, ldb, op_b, \ beta, \ c, ldc, post_op ); \ \ @@ -1235,16 +1221,19 @@ void mat_mul_bench_main_ ## BLAS_SFX \ GEN_FUNC_NAME(fill_array_,B_type)(b, size_B ); \ \ C_type* c = ( C_type* ) lpgemm_malloc( sizeof( C_type ) * size_C ); \ - memset( ( void* ) c, 0, sizeof( C_type ) * size_C ); \ \ C_type* c_ref = ( C_type* ) lpgemm_malloc( sizeof( C_type ) * size_C ); \ - memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ \ if ( bench_mode == 'a' ) \ { \ GEN_FUNC_NAME(fill_array_,C_type)( c, ( size_C ) ); \ GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( size_C ) ); \ } \ + else \ + { \ + memset( ( void* ) c, 0, sizeof( 
C_type ) * size_C ); \ + memset( ( void* ) c_ref, 0, sizeof( C_type ) * size_C ); \ + } \ \ Sum_type alpha = 0; \ Sum_type beta = 0; \ From 77161c1e5d218822d5f378d59f17618e2b185cc1 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Thu, 19 Oct 2023 15:34:30 +0530 Subject: [PATCH 200/226] Design change of DGEMM 6x8 native kernel. - The following optimizations are included for the dgemm 6x8 native kernel. 1) Reorganized the C update and store to reduce register dependencies. 2) Moved the C prefetch to part-way through the kernel so that the C matrix is prefetched efficiently at an appropriate distance. 3) Offset the A matrix pointer so that the kernel can use a smaller instruction encoding, saving i-cache space. 4) Aligned the K iteration loop. - Thanks to Moore, Branden for these design changes of DGEMM 6x8 native kernels. - Additional change: reorganized the C update and store for the beta-zero case to facilitate out-of-order execution of the stores to the C matrix. Change-Id: I9d1ec8d39f1154b0f38b136bd6a04b05d7d1e6ba --- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 322 ++++++++---------- 1 file changed, 145 insertions(+), 177 deletions(-) diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 2de89b2fb6..4e274a80cc 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -924,25 +924,15 @@ void bli_sgemm_haswell_asm_6x16 vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) \ - vmovhpd(mem(rcx, r13, 1), xmm1, xmm1) \ - vperm2f128(imm(0x20), ymm1, ymm0, ymm0) /*\ - vmovlpd(mem(rcx, rsi, 4), xmm2, xmm2) \ - vmovhpd(mem(rcx, r15, 1), xmm2, xmm2) \ - vmovlpd(mem(rcx, r13, 2), xmm1, xmm1) \ - vmovhpd(mem(rcx, r10, 1), xmm1, xmm1) \ - vperm2f128(imm(0x20), ymm1, ymm2, ymm2)*/ + vmovhpd(mem(rcx, r8, 1), xmm1, xmm1) \ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0) #define DGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm1) \
vmovlpd(xmm0, mem(rcx)) \ vmovhpd(xmm0, mem(rcx, rsi, 1)) \ vmovlpd(xmm1, mem(rcx, rsi, 2)) \ - vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ - vextractf128(imm(1), ymm2, xmm1) \ - vmovlpd(xmm2, mem(rcx, rsi, 4)) \ - vmovhpd(xmm2, mem(rcx, r15, 1)) \ - vmovlpd(xmm1, mem(rcx, r13, 2)) \ - vmovhpd(xmm1, mem(rcx, r10, 1))*/ + vmovhpd(xmm1, mem(rcx, r8, 1)) void bli_dgemm_haswell_asm_6x8 ( @@ -964,6 +954,13 @@ void bli_dgemm_haswell_asm_6x8 // different size than is expected by load instructions. uint64_t k_iter = (uint64_t)k0/4; uint64_t k_left = (uint64_t)k0%4; + uint64_t prefetch_iters = 30; + if ( k_iter > prefetch_iters ) { + k_iter -= prefetch_iters; + } else { + prefetch_iters = k_iter; + k_iter = 0; + } uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; @@ -988,9 +985,9 @@ void bli_dgemm_haswell_asm_6x8 mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. - //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) + add(imm(32*4), rax) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) @@ -999,46 +996,43 @@ void bli_dgemm_haswell_asm_6x8 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; - lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + lea(mem(rdi, rdi, 2), r10) // r10 = 3*rs_c; + lea(mem(rcx, r10, 1), rdx) // rdx = c + 3*rs_c; - mov(var(k_iter), rsi) // i = k_iter; + mov(var(k_pref), r8) // i = k_iter after prefetch + mov(var(k_iter), rsi) // i = k_iter before prefetch test(rsi, rsi) // check i via logical AND. 
- je(.DCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. + je(.DPOSTMAINLOOP) // if i == 0, jump to code that + // prefetches, followed by any post-prefetch iters + // and the k-left loop + align32 + label(.DLOOPKITER) // MAIN LOOP // iteration 0 - prefetch(0, mem(rax, 64*8)) - vbroadcastsd(mem(rax, 0*8), ymm2) - vbroadcastsd(mem(rax, 1*8), ymm3) + vbroadcastsd(mem(rax, 0*8-128), ymm2) + vbroadcastsd(mem(rax, 1*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - vbroadcastsd(mem(rax, 2*8), ymm2) - vbroadcastsd(mem(rax, 3*8), ymm3) + vbroadcastsd(mem(rax, 2*8-128), ymm2) + vbroadcastsd(mem(rax, 3*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - vbroadcastsd(mem(rax, 4*8), ymm2) - vbroadcastsd(mem(rax, 5*8), ymm3) + vbroadcastsd(mem(rax, 4*8-128), ymm2) + vbroadcastsd(mem(rax, 5*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) @@ -1048,24 +1042,22 @@ void bli_dgemm_haswell_asm_6x8 vmovapd(mem(rbx, -1*32), ymm1) // iteration 1 - prefetch(0, mem(rax, 72*8)) - - vbroadcastsd(mem(rax, 6*8), ymm2) - vbroadcastsd(mem(rax, 7*8), ymm3) + vbroadcastsd(mem(rax, 6*8-128), ymm2) + vbroadcastsd(mem(rax, 7*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - vbroadcastsd(mem(rax, 8*8), ymm2) - vbroadcastsd(mem(rax, 9*8), ymm3) + vbroadcastsd(mem(rax, 8*8-128), ymm2) + vbroadcastsd(mem(rax, 9*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - vbroadcastsd(mem(rax, 10*8), ymm2) - vbroadcastsd(mem(rax, 11*8), ymm3) + vbroadcastsd(mem(rax, 10*8-128), ymm2) + vbroadcastsd(mem(rax, 11*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) 
vfmadd231pd(ymm0, ymm3, ymm14) @@ -1075,24 +1067,22 @@ void bli_dgemm_haswell_asm_6x8 vmovapd(mem(rbx, 1*32), ymm1) // iteration 2 - prefetch(0, mem(rax, 80*8)) - - vbroadcastsd(mem(rax, 12*8), ymm2) - vbroadcastsd(mem(rax, 13*8), ymm3) + vbroadcastsd(mem(rax, 12*8-128), ymm2) + vbroadcastsd(mem(rax, 13*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - vbroadcastsd(mem(rax, 14*8), ymm2) - vbroadcastsd(mem(rax, 15*8), ymm3) + vbroadcastsd(mem(rax, 14*8-128), ymm2) + vbroadcastsd(mem(rax, 15*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - vbroadcastsd(mem(rax, 16*8), ymm2) - vbroadcastsd(mem(rax, 17*8), ymm3) + vbroadcastsd(mem(rax, 16*8-128), ymm2) + vbroadcastsd(mem(rax, 17*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) @@ -1102,22 +1092,22 @@ void bli_dgemm_haswell_asm_6x8 vmovapd(mem(rbx, 3*32), ymm1) // iteration 3 - vbroadcastsd(mem(rax, 18*8), ymm2) - vbroadcastsd(mem(rax, 19*8), ymm3) + vbroadcastsd(mem(rax, 18*8-128), ymm2) + vbroadcastsd(mem(rax, 19*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - vbroadcastsd(mem(rax, 20*8), ymm2) - vbroadcastsd(mem(rax, 21*8), ymm3) + vbroadcastsd(mem(rax, 20*8-128), ymm2) + vbroadcastsd(mem(rax, 21*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - vbroadcastsd(mem(rax, 22*8), ymm2) - vbroadcastsd(mem(rax, 23*8), ymm3) + vbroadcastsd(mem(rax, 22*8-128), ymm2) + vbroadcastsd(mem(rax, 23*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) @@ -1134,10 +1124,28 @@ void bli_dgemm_haswell_asm_6x8 jne(.DLOOPKITER) // iterate again if i != 0. 
+ test(r8, r8) // If no post-prefetch iters to do, skip to kleft + je(.DCONSIDKLEFT) + + label(.DPOSTMAINLOOP) + + + /* Prefetch C */ + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + mov(r8, rsi) // i = k_iter after prefetch + xor(r8, r8) // Zero out r8, so we don't prefetch again + test(rsi, rsi) // check i via logical AND. + jne(.DLOOPKITER) + // All unrolled iters (and prefetches) done label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; @@ -1148,24 +1156,22 @@ void bli_dgemm_haswell_asm_6x8 label(.DLOOPKLEFT) // EDGE LOOP - prefetch(0, mem(rax, 64*8)) - - vbroadcastsd(mem(rax, 0*8), ymm2) - vbroadcastsd(mem(rax, 1*8), ymm3) + vbroadcastsd(mem(rax, 0*8-128), ymm2) + vbroadcastsd(mem(rax, 1*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - vbroadcastsd(mem(rax, 2*8), ymm2) - vbroadcastsd(mem(rax, 3*8), ymm3) + vbroadcastsd(mem(rax, 2*8-128), ymm2) + vbroadcastsd(mem(rax, 3*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - vbroadcastsd(mem(rax, 4*8), ymm2) - vbroadcastsd(mem(rax, 5*8), ymm3) + vbroadcastsd(mem(rax, 4*8-128), ymm2) + vbroadcastsd(mem(rax, 5*8-128), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) @@ -1215,11 +1221,9 @@ void bli_dgemm_haswell_asm_6x8 lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; - lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; + lea(mem(rcx, rdi, 4), r9) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; - 
//lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; - //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; + lea(mem(rsi, rsi, 2), r8) // r8 = 3*cs_c; // now avoid loading C if beta == 0 @@ -1323,51 +1327,33 @@ void bli_dgemm_haswell_asm_6x8 vfmadd231pd(mem(rcx), ymm3, ymm4) - vmovupd(ymm4, mem(rcx)) - add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm5) + vfmadd231pd(mem(rcx, rdi, 1), ymm3, ymm6) + vfmadd231pd(mem(rdx, rdi, 1), ymm3, ymm7) + vfmadd231pd(mem(rcx, rdi, 2), ymm3, ymm8) + vfmadd231pd(mem(rdx, rdi, 2), ymm3, ymm9) + vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm6) - vmovupd(ymm6, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm7) - vmovupd(ymm7, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm8) - vmovupd(ymm8, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm9) - vmovupd(ymm9, mem(rdx)) - add(rdi, rdx) + vmovupd(ymm6, mem(rcx, rdi, 1)) + vmovupd(ymm7, mem(rdx, rdi, 1)) + vmovupd(ymm8, mem(rcx, rdi, 2)) + vmovupd(ymm9, mem(rdx, rdi, 2)) + add(r10, rcx) // r10 = 3 * rdi + add(r10, rdx) vfmadd231pd(mem(rcx), ymm3, ymm10) - vmovupd(ymm10, mem(rcx)) - add(rdi, rcx) vfmadd231pd(mem(rdx), ymm3, ymm11) + vfmadd231pd(mem(rcx, rdi, 1), ymm3, ymm12) + vfmadd231pd(mem(rdx, rdi, 1), ymm3, ymm13) + vfmadd231pd(mem(rcx, rdi, 2), ymm3, ymm14) + vfmadd231pd(mem(rdx, rdi, 2), ymm3, ymm15) + vmovupd(ymm10, mem(rcx)) vmovupd(ymm11, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm12) - vmovupd(ymm12, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm13) - vmovupd(ymm13, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm14) - vmovupd(ymm14, mem(rcx)) - //add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm15) - vmovupd(ymm15, mem(rdx)) - //add(rdi, rdx) + vmovupd(ymm12, mem(rcx, rdi, 1)) + vmovupd(ymm13, mem(rdx, rdi, 1)) + vmovupd(ymm14, mem(rcx, rdi, 2)) + vmovupd(ymm15, mem(rdx, rdi, 2)) @@ -1392,11 +1378,11 @@ void bli_dgemm_haswell_asm_6x8 vfmadd231pd(mem(rcx), 
ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) - vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10) + vfmadd231pd(mem(rcx, r8, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) - vmovupd(ymm10, mem(rcx, r13, 1)) + vmovupd(ymm10, mem(rcx, r8, 1)) lea(mem(rcx, rsi, 4), rcx) @@ -1405,16 +1391,16 @@ void bli_dgemm_haswell_asm_6x8 vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - vfmadd231pd(mem(r14), xmm3, xmm0) - vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) - vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) - vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) + vfmadd231pd(mem(r9), xmm3, xmm0) + vfmadd231pd(mem(r9, rsi, 1), xmm3, xmm1) + vfmadd231pd(mem(r9, rsi, 2), xmm3, xmm2) + vfmadd231pd(mem(r9, r8, 1), xmm3, xmm4) + vmovupd(xmm0, mem(r9)) + vmovupd(xmm1, mem(r9, rsi, 1)) + vmovupd(xmm2, mem(r9, rsi, 2)) + vmovupd(xmm4, mem(r9, r8, 1)) - lea(mem(r14, rsi, 4), r14) + lea(mem(r9, rsi, 4), r9) vunpcklpd(ymm7, ymm5, ymm0) @@ -1431,11 +1417,11 @@ void bli_dgemm_haswell_asm_6x8 vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) - vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11) + vfmadd231pd(mem(rcx, r8, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) - vmovupd(ymm11, mem(rcx, r13, 1)) + vmovupd(ymm11, mem(rcx, r8, 1)) //lea(mem(rcx, rsi, 4), rcx) @@ -1444,16 +1430,16 @@ void bli_dgemm_haswell_asm_6x8 vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - vfmadd231pd(mem(r14), xmm3, xmm0) - vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) - vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) - vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - 
vmovupd(xmm4, mem(r14, r13, 1)) + vfmadd231pd(mem(r9), xmm3, xmm0) + vfmadd231pd(mem(r9, rsi, 1), xmm3, xmm1) + vfmadd231pd(mem(r9, rsi, 2), xmm3, xmm2) + vfmadd231pd(mem(r9, r8, 1), xmm3, xmm4) + vmovupd(xmm0, mem(r9)) + vmovupd(xmm1, mem(r9, rsi, 1)) + vmovupd(xmm2, mem(r9, rsi, 2)) + vmovupd(xmm4, mem(r9, r8, 1)) - //lea(mem(r14, rsi, 4), r14) + //lea(mem(r9, rsi, 4), r9) @@ -1544,38 +1530,21 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(ymm4, mem(rcx)) - add(rdi, rcx) vmovupd(ymm5, mem(rdx)) - add(rdi, rdx) - - vmovupd(ymm6, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm7, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm8, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm9, mem(rdx)) - add(rdi, rdx) + vmovupd(ymm6, mem(rcx, rdi, 1)) + vmovupd(ymm7, mem(rdx, rdi, 1)) + vmovupd(ymm8, mem(rcx, rdi, 2)) + vmovupd(ymm9, mem(rdx, rdi, 2)) + add(r10, rcx) + add(r10, rdx) vmovupd(ymm10, mem(rcx)) - add(rdi, rcx) vmovupd(ymm11, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm12, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm13, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm14, mem(rcx)) - //add(rdi, rcx) - vmovupd(ymm15, mem(rdx)) - //add(rdi, rdx) + vmovupd(ymm12, mem(rcx, rdi, 1)) + vmovupd(ymm13, mem(rdx, rdi, 1)) + vmovupd(ymm14, mem(rcx, rdi, 2)) + vmovupd(ymm15, mem(rdx, rdi, 2)) jmp(.DDONE) // jump to end. 
@@ -1597,7 +1566,7 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) - vmovupd(ymm10, mem(rcx, r13, 1)) + vmovupd(ymm10, mem(rcx, r8, 1)) lea(mem(rcx, rsi, 4), rcx) @@ -1606,12 +1575,12 @@ void bli_dgemm_haswell_asm_6x8 vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) + vmovupd(xmm0, mem(r9)) + vmovupd(xmm1, mem(r9, rsi, 1)) + vmovupd(xmm2, mem(r9, rsi, 2)) + vmovupd(xmm4, mem(r9, r8, 1)) - lea(mem(r14, rsi, 4), r14) + lea(mem(r9, rsi, 4), r9) vunpcklpd(ymm7, ymm5, ymm0) @@ -1626,7 +1595,7 @@ void bli_dgemm_haswell_asm_6x8 vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) - vmovupd(ymm11, mem(rcx, r13, 1)) + vmovupd(ymm11, mem(rcx, r8, 1)) //lea(mem(rcx, rsi, 4), rcx) @@ -1635,12 +1604,12 @@ void bli_dgemm_haswell_asm_6x8 vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) + vmovupd(xmm0, mem(r9)) + vmovupd(xmm1, mem(r9, rsi, 1)) + vmovupd(xmm2, mem(r9, rsi, 2)) + vmovupd(xmm4, mem(r9, r8, 1)) - //lea(mem(r14, rsi, 4), r14) + //lea(mem(r9, rsi, 4), r9) @@ -1655,20 +1624,19 @@ void bli_dgemm_haswell_asm_6x8 end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 
+ [k_pref] "m" (prefetch_iters) // 9 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", From f6046784ce45997289f5c84e4df202f746350bd8 Mon Sep 17 00:00:00 2001 From: Mangala V Date: Tue, 26 Sep 2023 18:05:24 +0530 Subject: [PATCH 201/226] Re-Designed SGEMM SUP kernel to use mask load/store instruction Added all fringe kernels with mask load store support Fringe kernels cover m direction from 5 to 1 and n direction from 15 to 1 for row storage format - New edge kernels that use masked load-store instructions for handling corner cases. - Mask load-store instruction macros are added. vmaskmovps, VMASKMOVPS for masked load-store. - It improves performance by reducing branching overhead and by being more cache friendly. - Mask load-store is added only for row storage format. AMD-Internal: [CPUPL-4041] Change-Id: I563c036c79bf8e476a8ebde37f8f6db751fb3456 --- frame/include/bli_x86_asm_macros.h | 2 + .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c | 6079 +++++++++++------ .../s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c | 1810 +++++ .../s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c | 1613 +++++ .../s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c | 1587 +++++ kernels/zen/bli_kernels_zen.h | 27 + 6 files changed, 8908 insertions(+), 2210 deletions(-) create mode 100644 kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c create mode 100644 kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c create mode 100644 kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index 8c066007a0..3fdc5d9460 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -916,6 +916,7 @@ #define VCOMISD(_0, _1) INSTR_(vcomisd, _0, _1) #define VMASKMOVPD(_0, _1, _2) INSTR_(vmaskmovpd, _0, _1,
_2) +#define VMASKMOVPS(_0, _1, _2) INSTR_(vmaskmovps, _0, _1, _2) #define VFMADD132SS(_0, _1, _2) INSTR_(vfmadd132ss, _0, _1, _2) #define VFMADD213SS(_0, _1, _2) INSTR_(vfmadd213ss, _0, _1, _2) #define VFMADD231SS(_0, _1, _2) INSTR_(vfmadd231ss, _0, _1, _2) @@ -1244,6 +1245,7 @@ #define vblendmps(_0, _1, _2) VBLENDMSD(_0, _1, _2) #define vblendmpd(_0, _1, _2) VBLENDMPD(_0, _1, _2) #define vmaskmovpd(_0, _1, _2) VMASKMOVPD(_0, _1, _2) +#define vmaskmovps(_0, _1, _2) VMASKMOVPS(_0, _1, _2) // Prefetches #define PREFETCH(_0, _1) INSTR_(prefetcht##_0, _1) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c index 19acd5a1b6..471758041a 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... 
-------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ void bli_sgemmsup_rv_zen_asm_6x16m ( @@ -92,792 +92,957 @@ void bli_sgemmsup_rv_zen_asm_6x16m cntx_t* restrict cntx ) { - uint64_t n_left = n0 % 16; - - // First check whether this is a edge case in the n dimension. If so, - // dispatch other 6x?m kernels, as needed. - if (n_left ) - { - float* cij = c; - float* bj = b; - float* ai = a; - - if ( 8 <= n_left ) - { - const dim_t nr_cur = 8; - - bli_sgemmsup_rv_zen_asm_6x8m - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; - } - - if ( 4 <= n_left ) - { - const dim_t nr_cur = 4; - - bli_sgemmsup_rv_zen_asm_6x4m - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; - } - - if ( 2 <= n_left ) - { - const dim_t nr_cur = 2; - - bli_sgemmsup_rv_zen_asm_6x2m - ( - conja, conjb, m0, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; - } - - if ( 1 == n_left ) - { - dim_t ps_a0 = bli_auxinfo_ps_a( data ); - if ( ps_a0 == 6 * rs_a0 ) - { - bli_sgemv_ex - ( - BLIS_NO_TRANSPOSE, conjb, m0, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, - beta, cij, rs_c0, cntx, NULL - ); - } - else - { - const dim_t mr = 6; - - // Since A is packed into row panels, we must use a loop over - // gemv. - dim_t m_iter = ( m0 + mr - 1 ) / mr; - dim_t m_left = m0 % mr; - - float* restrict ai_ii = ai; - float* restrict cij_ii = cij; - - for ( dim_t ii = 0; ii < m_iter; ii += 1 ) - { - dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) - ? 
mr : m_left ); - - bli_sgemv_ex - ( - BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, - alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, - beta, cij_ii, rs_c0, cntx, NULL - ); - cij_ii += mr*rs_c0; ai_ii += ps_a0; - } - } - } - - return; - } - - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); - - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - - - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; - - uint64_t m_iter = m0 / 6; - uint64_t m_left = m0 % 6; - - uint64_t rs_a = rs_a0; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - uint64_t cs_b = cs_b0; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - - // Query the panel stride of A and convert it to units of bytes. - uint64_t ps_a = bli_auxinfo_ps_a( data ); - uint64_t ps_a4 = ps_a * sizeof( float ); - - if ( m_iter == 0 ) goto consider_edge_cases; - - // ------------------------------------------------------------------------- - begin_asm() - - mov(var(a), r14) // load address of a. - mov(var(rs_a), r8) // load rs_a - mov(var(cs_a), r9) // load cs_a - lea(mem(, r8, 4), r8) // rs_a *= sizeof(dt) - lea(mem(, r9, 4), r9) // cs_a *= sizeof(dt) - - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - - mov(var(rs_b), r10) // load rs_b - lea(mem(, r10, 4), r10) // rs_b *= sizeof(dt) - // NOTE: We cannot pre-load elements of a or b - // because it could eventually, in the last - // unrolled iter or the cleanup loop, result - // in reading beyond the bounds allocated mem - // (the likely result: a segmentation fault). - - mov(var(c), r12) // load address of c - mov(var(rs_c), rdi) // load rs_c - lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(dt) - - // During preamble and loops: - // r12 = rcx = c - // r14 = rax = a - // read rbx from var(b) near beginning of loop - // r11 = m dim index ii - - mov(var(m_iter), r11) // ii = m_iter; - - label(.SLOOP6X16I) // LOOP OVER ii = [ m_iter ... 
1 0 ] - - vxorps(ymm4, ymm4, ymm4) - vxorps(ymm5, ymm5, ymm5) - vxorps(ymm6, ymm6, ymm6) - vxorps(ymm7, ymm7, ymm7) - vxorps(ymm8, ymm8, ymm8) - vxorps(ymm9, ymm9, ymm9) - vxorps(ymm10, ymm10, ymm10) - vxorps(ymm11, ymm11, ymm11) - vxorps(ymm12, ymm12, ymm12) - vxorps(ymm13, ymm13, ymm13) - vxorps(ymm14, ymm14, ymm14) - vxorps(ymm15, ymm15, ymm15) - - mov(var(b), rbx) // load address of b. - //mov(r12, rcx) // reset rcx to current utile of c. - mov(r14, rax) // reset rax to current upanel of a. - - cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. - jz(.SCOLPFETCH) // jump to column storage case - label(.SROWPFETCH) // row-stored pre-fetching on c // not used - - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c - - jmp(.SPOSTPFETCH) // jump to end of pre-fetching c - label(.SCOLPFETCH) // column-stored pre-fetching c - - mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) - lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) - lea(mem(r12, rsi, 2), rdx) // - lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c - prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c - prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c - prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c - prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c - lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; - prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 6*cs_c - prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 7*cs_c - - label(.SPOSTPFETCH) // done prefetching c - - lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; - 
lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines - lea(mem(rdx, r8, 2), rdx) // from next upanel of a. - - mov(var(k_iter), rsi) // i = k_iter; - test(rsi, rsi) // check i via logical AND. - je(.SCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. - - label(.SLOOPKITER) // MAIN LOOP - - // ---------------------------------- iteration 0 - prefetch(0, mem(rdx, 5*8)) - - vmovups(mem(rbx, 0*32), ymm0) - vmovups(mem(rbx, 1*32), ymm1) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm1, ymm2, ymm5) - vfmadd231ps(ymm0, ymm3, ymm6) - vfmadd231ps(ymm1, ymm3, ymm7) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm1, ymm2, ymm9) - vfmadd231ps(ymm0, ymm3, ymm10) - vfmadd231ps(ymm1, ymm3, ymm11) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm1, ymm2, ymm13) - vfmadd231ps(ymm0, ymm3, ymm14) - vfmadd231ps(ymm1, ymm3, ymm15) - - // ---------------------------------- iteration 1 - prefetch(0, mem(rdx, r9, 1, 5*8)) - - vmovups(mem(rbx, 0*32), ymm0) - vmovups(mem(rbx, 1*32), ymm1) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm1, ymm2, ymm5) - vfmadd231ps(ymm0, ymm3, ymm6) - vfmadd231ps(ymm1, ymm3, ymm7) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm1, ymm2, ymm9) - vfmadd231ps(ymm0, ymm3, ymm10) - vfmadd231ps(ymm1, ymm3, ymm11) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm1, ymm2, ymm13) - vfmadd231ps(ymm0, ymm3, ymm14) - vfmadd231ps(ymm1, ymm3, ymm15) - - // 
---------------------------------- iteration 2 - prefetch(0, mem(rdx, r9, 2, 5*8)) - - vmovups(mem(rbx, 0*32), ymm0) - vmovups(mem(rbx, 1*32), ymm1) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm1, ymm2, ymm5) - vfmadd231ps(ymm0, ymm3, ymm6) - vfmadd231ps(ymm1, ymm3, ymm7) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm1, ymm2, ymm9) - vfmadd231ps(ymm0, ymm3, ymm10) - vfmadd231ps(ymm1, ymm3, ymm11) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm1, ymm2, ymm13) - vfmadd231ps(ymm0, ymm3, ymm14) - vfmadd231ps(ymm1, ymm3, ymm15) - - // ---------------------------------- iteration 3 - prefetch(0, mem(rdx, rcx, 1, 5*8)) - lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; - - vmovups(mem(rbx, 0*32), ymm0) - vmovups(mem(rbx, 1*32), ymm1) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm1, ymm2, ymm5) - vfmadd231ps(ymm0, ymm3, ymm6) - vfmadd231ps(ymm1, ymm3, ymm7) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm1, ymm2, ymm9) - vfmadd231ps(ymm0, ymm3, ymm10) - vfmadd231ps(ymm1, ymm3, ymm11) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm1, ymm2, ymm13) - vfmadd231ps(ymm0, ymm3, ymm14) - vfmadd231ps(ymm1, ymm3, ymm15) - - dec(rsi) // i -= 1; - jne(.SLOOPKITER) // iterate again if i != 0. - - label(.SCONSIDKLEFT) - - mov(var(k_left), rsi) // i = k_left; - test(rsi, rsi) // check i via logical AND. - je(.SPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. 
- - label(.SLOOPKLEFT) // EDGE LOOP - - vmovups(mem(rbx, 0*32), ymm0) - vmovups(mem(rbx, 1*32), ymm1) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm1, ymm2, ymm5) - vfmadd231ps(ymm0, ymm3, ymm6) - vfmadd231ps(ymm1, ymm3, ymm7) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm1, ymm2, ymm9) - vfmadd231ps(ymm0, ymm3, ymm10) - vfmadd231ps(ymm1, ymm3, ymm11) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm1, ymm2, ymm13) - vfmadd231ps(ymm0, ymm3, ymm14) - vfmadd231ps(ymm1, ymm3, ymm15) - - dec(rsi) // i -= 1; - jne(.SLOOPKLEFT) // iterate again if i != 0. - - label(.SPOSTACCUM) - - mov(r12, rcx) // reset rcx to current utile of c. - mov(var(alpha), rax) // load address of alpha - mov(var(beta), rbx) // load address of beta - vbroadcastss(mem(rax), ymm0) // load alpha and duplicate - vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - - vmulps(ymm0, ymm4, ymm4) // scale by alpha - vmulps(ymm0, ymm5, ymm5) - vmulps(ymm0, ymm6, ymm6) - vmulps(ymm0, ymm7, ymm7) - vmulps(ymm0, ymm8, ymm8) - vmulps(ymm0, ymm9, ymm9) - vmulps(ymm0, ymm10, ymm10) - vmulps(ymm0, ymm11, ymm11) - vmulps(ymm0, ymm12, ymm12) - vmulps(ymm0, ymm13, ymm13) - vmulps(ymm0, ymm14, ymm14) - vmulps(ymm0, ymm15, ymm15) - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) - - lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - // now avoid loading C if beta == 0 - vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. - vucomiss(xmm0, xmm3) // set ZF if beta == 0. - je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
- jz(.SCOLSTORED) // jump to column storage case - - label(.SROWSTORED) - - vfmadd231ps(mem(rcx), ymm3, ymm4) - vmovups(ymm4, mem(rcx)) - - vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5) - vmovups(ymm5, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vfmadd231ps(mem(rcx), ymm3, ymm6) - vmovups(ymm6, mem(rcx)) - - vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7) - vmovups(ymm7, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vfmadd231ps(mem(rcx), ymm3, ymm8) - vmovups(ymm8, mem(rcx)) - - vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm9) - vmovups(ymm9, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vfmadd231ps(mem(rcx), ymm3, ymm10) - vmovups(ymm10, mem(rcx)) - - vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm11) - vmovups(ymm11, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vfmadd231ps(mem(rcx), ymm3, ymm12) - vmovups(ymm12, mem(rcx)) - - vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm13) - vmovups(ymm13, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vfmadd231ps(mem(rcx), ymm3, ymm14) - vmovups(ymm14, mem(rcx)) - - vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm15) - vmovups(ymm15, mem(rcx, rsi, 8)) - //add(rdi, rcx) - - - jmp(.SDONE) // jump to end. 
- - - label(.SCOLSTORED) + uint64_t n_left = n0 % 16; + + /* For row storage format, kernel is re-written to */ + /* use mask load/store instruction */ + if ( n_left && (rs_c0 != 1)) + { + float* restrict cij = c; + float* restrict bj = b; + float* restrict ai = a; + /**************************************************************************/ + /* Mask load and store support is added for fringe cases */ + /* Fringe cases are the numbers which not multiple of xmm or ymm register */ + /* n_left : 15,14,13,11,10,9,7,6,5,3 */ + /* When mask register values are set, load/store is performed */ + /* When mask register values are not set, load/store is not performed */ + /*Elements: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16*/ + /*n0=16 : -----------ymm--------------- -----------ymm---------------- */ + /*n0=15 : -----------ymm--------------- -1 -1 -1 -1 -1 -1 -1 0 */ + /*n0=14 : -----------ymm--------------- -1 -1 -1 -1 -1 -1 0 0 */ + /*n0=9 : -----------ymm--------------- -1 0 0 0 0 0 0 0 */ + /*n0=8 : -----------ymm--------------- -----------Not used--------- */ + /*n0=7 : -1 -1 -1 -1 -1 -1 -1 0 -----------Not used--------- */ + /*n0=3 : -1 -1 -1 0 0 0 0 0 -----------Not used--------- */ + /*Same code can be resued for multiple n_left by just varing mask register*/ + /*We will be able to perform complete operation of tile with this approach*/ + /**************************************************************************/ + switch(n_left) + { + /*Fringe cases*/ + case 15: case 14: case 13: + case 11: case 10: case 9: + { + const dim_t nr_cur = n_left; + /**********************************************/ + /* These case is executed when nleft - 9 to 15*/ + /* 16 Elements in col order */ + /* ---YMM REG----- ---YMM Mask Reg--- */ + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + /*15:0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 x */ + /*14:0 1 2 3 4 5 6 7 8 9 10 11 12 13 x x */ + /*11:0 1 2 3 4 5 6 7 8 9 10 x x x x x */ + /* and so on */ + /**********************************************/ + 
bli_sgemmsup_rv_zen_asm_6x16m_mask + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 7: case 6: case 5: + { + /***********************************************/ + /* These case is executed when nleft - 5 to 7 */ + /* 8 Elements in col order */ + /* YMM Mask REG */ + /* 0 1 2 3 4 5 6 7 */ + /*7: 0 1 2 3 4 5 6 x */ + /*6: 0 1 2 3 4 5 x x */ + /*5: 0 1 2 3 4 x x x */ + /**********************************************/ + const dim_t nr_cur = n_left; + + bli_sgemmsup_rv_zen_asm_6x8m_mask + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 3: case 1: + { + /***********************************************/ + /* These case is executed when nleft - 3/1 */ + /* 4 Elements in col order */ + /* XMM Mask REG */ + /* 0 1 2 3 */ + /*3: 0 1 2 x */ + /*1: 0 x x x */ + /**********************************************/ + const dim_t nr_cur = n_left; + bli_sgemmsup_rv_zen_asm_6x4m_mask + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + /*Non-Fringe cases*/ + case 12: + { + #if 0 + const dim_t nr_cur = 12; + bli_sgemmsup_rv_haswell_asm_6x12m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + #endif + + dim_t nr_cur = 8; + + bli_sgemmsup_rv_zen_asm_6x8m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; + + nr_cur = 4; + bli_sgemmsup_rv_zen_asm_6x4m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + + } + case 8: + { + const dim_t nr_cur = 8; + + bli_sgemmsup_rv_zen_asm_6x8m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, 
rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 4: + { + const dim_t nr_cur = 4; + + bli_sgemmsup_rv_zen_asm_6x4m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + case 2: + { + const dim_t nr_cur = 2; + + bli_sgemmsup_rv_zen_asm_6x2m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + break; + } + default: + break; + } + return; + } + + // First check whether this is a edge case in the n dimension. If so, + // dispatch other 6x?m kernels, as needed. + if (n_left ) + { + float* cij = c; + float* bj = b; + float* ai = a; + + if ( 8 <= n_left ) + { + const dim_t nr_cur = 8; + + bli_sgemmsup_rv_zen_asm_6x8m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; + } + + if ( 4 <= n_left ) + { + const dim_t nr_cur = 4; + + bli_sgemmsup_rv_zen_asm_6x4m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; + } + + if ( 2 <= n_left ) + { + const dim_t nr_cur = 2; + + bli_sgemmsup_rv_zen_asm_6x2m + ( + conja, conjb, m0, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; + } + + if ( 1 == n_left ) + { + dim_t ps_a0 = bli_auxinfo_ps_a( data ); + if ( ps_a0 == 6 * rs_a0 ) + { + bli_sgemv_ex + ( + BLIS_NO_TRANSPOSE, conjb, m0, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, + beta, cij, rs_c0, cntx, NULL + ); + } + else + { + const dim_t mr = 6; + + // Since A is packed into row panels, we must use a loop over + // gemv. 
+ dim_t m_iter = ( m0 + mr - 1 ) / mr; + dim_t m_left = m0 % mr; + + float* restrict ai_ii = ai; + float* restrict cij_ii = cij; + + for ( dim_t ii = 0; ii < m_iter; ii += 1 ) + { + dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) + ? mr : m_left ); + + bli_sgemv_ex + ( + BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, + alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, + beta, cij_ii, rs_c0, cntx, NULL + ); + cij_ii += mr*rs_c0; ai_ii += ps_a0; + } + } + } + + return; + } + + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + + + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(dt) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(dt) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(dt) + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). 
+ + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(dt) + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X16I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + vxorps(ymm4, ymm4, ymm4) + vxorps(ymm5, ymm5, ymm5) + vxorps(ymm6, ymm6, ymm6) + vxorps(ymm7, ymm7, ymm7) + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm9, ymm9, ymm9) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm11, ymm11, ymm11) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm13, ymm13, ymm13) + vxorps(ymm14, ymm14, ymm14) + vxorps(ymm15, ymm15, ymm15) + + mov(var(b), rbx) // load address of b. + //mov(r12, rcx) // reset rcx to current utile of c. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored pre-fetching on c // not used + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of pre-fetching c + label(.SCOLPFETCH) // column-stored pre-fetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, 
mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 7*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmovups(mem(rbx, 0*32), ymm0) + vmovups(mem(rbx, 1*32), ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmovups(mem(rbx, 0*32), ymm0) + vmovups(mem(rbx, 1*32), ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + 
vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmovups(mem(rbx, 0*32), ymm0) + vmovups(mem(rbx, 1*32), ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; + + vmovups(mem(rbx, 0*32), ymm0) + vmovups(mem(rbx, 1*32), ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + 
vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmovups(mem(rbx, 0*32), ymm0) + vmovups(mem(rbx, 1*32), ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm3) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm11, ymm11) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm13, ymm13) + vmulps(ymm0, ymm14, ymm14) + vmulps(ymm0, ymm15, ymm15) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOLSTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(ymm4, mem(rcx)) + + vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5) + vmovups(ymm5, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm6) + vmovups(ymm6, mem(rcx)) + + vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7) + vmovups(ymm7, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm8) + vmovups(ymm8, mem(rcx)) + + vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm9) + vmovups(ymm9, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm10) + vmovups(ymm10, mem(rcx)) + + vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm11) + vmovups(ymm11, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm12) + vmovups(ymm12, mem(rcx)) + + vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm13) + vmovups(ymm13, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm14) + vmovups(ymm14, mem(rcx)) + + vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm15) + vmovups(ymm15, mem(rcx, rsi, 8)) + //add(rdi, rcx) + + + jmp(.SDONE) // jump to end. 
+ + + label(.SCOLSTORED) /*|-----------------| |-----|----| - | | | | 8x4 | 8x2| - | 4x8 | 4x8 | | | | - | | | |-----|----| - |-----------------| | 8x4 | 8x2| - | 2x8 | 2x8 | | | | - |------------------ |----------|*/ - - /****6x16 tile is transposed and saved in col major as 6x16*****/ - /****top left tile 4x8 transposed to top left tile 8x4**********/ - vunpcklps(ymm6, ymm4, ymm0)//a0b0a1b1 a4b4a5b5 - vunpcklps(ymm10, ymm8, ymm1)//c0d0c1d1 c4d4c5d5 - vshufps(imm(0x4e), ymm1, ymm0, ymm2)//a1b1c0d0 a5b5c4d4 - vblendps(imm(0xcc), ymm2, ymm0, ymm0)//a0b0c0d0 a4b4c4d4 - vblendps(imm(0x33), ymm2, ymm1, ymm1)//a1b1c1d1 a5b5c5d5 - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm1) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - - vunpckhps(ymm6, ymm4, ymm0) - vunpckhps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm1) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) - - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c - - 
/***bottom left tile - 2x8 is transposed to top right tile 8x2**********/ - vunpcklps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(rdx), xmm1, xmm1) - vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) - lea(mem(rdx, rsi, 4), rax) // rax += 4*cs_c - - vmovlpd(mem(rax), xmm1, xmm1) - vmovhpd(mem(rax, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(rax)) // store ( gamma44..gamma54 ) - vmovhpd(xmm2, mem(rax, rsi, 1)) // store ( gamma45..gamma55 ) - lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c - - vunpckhps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(rdx), xmm1, xmm1) - vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma43..gamma53 ) - lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c - vmovlpd(mem(rdx), xmm1, xmm1) - vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(rdx)) // store ( gamma46..gamma56 ) - vmovhpd(xmm2, mem(rdx, rsi, 1)) // store ( gamma47..gamma57 ) - - lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c - - /***top right tile 4x8 is transposed to bottom left tile 8x4**********/ - vunpcklps(ymm7, ymm5, ymm0) - vunpcklps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm1) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm1, mem(rcx)) // store ( 
gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - - vunpckhps(ymm7, ymm5, ymm0) - vunpckhps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm1) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) - - //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - /*** bottom right 2x8 is transposed to bottom right tile 8x2*******/ - vunpcklps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(rdx), xmm1, xmm1) - vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) - lea(mem(rdx, rsi, 4), rax) // rax += 4*cs_c - - vmovlpd(mem(rax), xmm1, xmm1) - vmovhpd(mem(rax, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(rax)) // store ( gamma44..gamma54 ) - vmovhpd(xmm2, mem(rax, rsi, 1)) // store ( gamma45..gamma55 ) - lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c - - vunpckhps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(rdx), xmm1, xmm1) - vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma43..gamma53 ) - lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c - vmovlpd(mem(rdx), xmm1, xmm1) - vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, 
xmm3, xmm2) - vmovlpd(xmm2, mem(rdx)) // store ( gamma46..gamma56 ) - vmovhpd(xmm2, mem(rdx, rsi, 1)) // store ( gamma47..gamma57 ) - - jmp(.SDONE) // jump to end. - - label(.SBETAZERO) - - cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. - jz(.SCOLSTORBZ) // jump to column storage case - - - label(.SROWSTORBZ) - - vmovups(ymm4, mem(rcx)) - vmovups(ymm5, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vmovups(ymm6, mem(rcx)) - vmovups(ymm7, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vmovups(ymm8, mem(rcx)) - vmovups(ymm9, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vmovups(ymm10, mem(rcx)) - vmovups(ymm11, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vmovups(ymm12, mem(rcx)) - vmovups(ymm13, mem(rcx, rsi, 8)) - add(rdi, rcx) - - - vmovups(ymm14, mem(rcx)) - vmovups(ymm15, mem(rcx, rsi, 8)) - //add(rdi, rcx) - - jmp(.SDONE) // jump to end. - - - label(.SCOLSTORBZ) - /****6x16 tile going to save into 16x6 tile in C*****/ - /******************top left tile 8x4***************************/ - vunpcklps(ymm6, ymm4, ymm0) - vunpcklps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) - - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vunpckhps(ymm6, ymm4, ymm0) - vunpckhps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - 
vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) - - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c - /******************top right tile 8x2***************************/ - vunpcklps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) - vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) - lea(mem(rdx, rsi, 1), rdx) - - vunpckhps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) - vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) - lea(mem(rdx, rsi, 1), rdx) - lea(mem(rdx, rsi, 4), rdx) // rdx += 8*cs_c - - /******************bottom left tile 8x4***************************/ - vunpcklps(ymm7, ymm5, ymm0) - vunpcklps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) - - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vunpckhps(ymm7, ymm5, ymm0) - vunpckhps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) 
- vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) - - /******************bottom right tile 8x2***************************/ - vunpcklps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) - vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) - lea(mem(rdx, rsi, 1), rdx) - - vunpckhps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) - vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) - - label(.SDONE) - - lea(mem(r12, rdi, 4), r12) // - lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c - - //lea(mem(r14, r8, 4), r14) // - //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a - mov(var(ps_a4), rax) // load ps_a4 - lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 - - dec(r11) // ii -= 1; - jne(.SLOOP6X16I) // iterate again if ii != 0. 
- - label(.SRETURN) - + | | | | 8x4 | 8x2| + | 4x8 | 4x8 | | | | + | | | |-----|----| + |-----------------| | 8x4 | 8x2| + | 2x8 | 2x8 | | | | + |------------------ |----------|*/ + + /****6x16 tile is transposed and saved in col major as 6x16*****/ + /****top left tile 4x8 transposed to top left tile 8x4**********/ + vunpcklps(ymm6, ymm4, ymm0)//a0b0a1b1 a4b4a5b5 + vunpcklps(ymm10, ymm8, ymm1)//c0d0c1d1 c4d4c5d5 + vshufps(imm(0x4e), ymm1, ymm0, ymm2)//a1b1c0d0 a5b5c4d4 + vblendps(imm(0xcc), ymm2, ymm0, ymm0)//a0b0c0d0 a4b4c4d4 + vblendps(imm(0x33), ymm2, ymm1, ymm1)//a1b1c1d1 a5b5c5d5 + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm1) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm1) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c + + /***bottom left tile - 2x8 is transposed 
to top right tile 8x2**********/ + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) + lea(mem(rdx, rsi, 4), rax) // rax += 4*cs_c + + vmovlpd(mem(rax), xmm1, xmm1) + vmovhpd(mem(rax, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(rax)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(rax, rsi, 1)) // store ( gamma45..gamma55 ) + lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma43..gamma53 ) + lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(rdx)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(rdx, rsi, 1)) // store ( gamma47..gamma57 ) + + lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c + + /***top right tile 4x8 is transposed to bottom left tile 8x4**********/ + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm1) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, 
rsi, 4)) // store ( gamma05..gamma35 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm1) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) + + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + /*** bottom right 2x8 is transposed to bottom right tile 8x2*******/ + vunpcklps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) + lea(mem(rdx, rsi, 4), rax) // rax += 4*cs_c + + vmovlpd(mem(rax), xmm1, xmm1) + vmovhpd(mem(rax, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(rax)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(rax, rsi, 1)) // store ( gamma45..gamma55 ) + lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c + + vunpckhps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma43..gamma53 ) + lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(rdx)) // store 
( gamma46..gamma56 ) + vmovhpd(xmm2, mem(rdx, rsi, 1)) // store ( gamma47..gamma57 ) + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLSTORBZ) // jump to column storage case + + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vmovups(ymm6, mem(rcx)) + vmovups(ymm7, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vmovups(ymm8, mem(rcx)) + vmovups(ymm9, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vmovups(ymm10, mem(rcx)) + vmovups(ymm11, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vmovups(ymm12, mem(rcx)) + vmovups(ymm13, mem(rcx, rsi, 8)) + add(rdi, rcx) + + + vmovups(ymm14, mem(rcx)) + vmovups(ymm15, mem(rcx, rsi, 8)) + //add(rdi, rcx) + + jmp(.SDONE) // jump to end. + + + label(.SCOLSTORBZ) + /****6x16 tile going to save into 16x6 tile in C*****/ + /******************top left tile 8x4***************************/ + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) + + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + 
vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c + /******************top right tile 8x2***************************/ + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) + vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) + lea(mem(rdx, rsi, 1), rdx) + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) + vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) + lea(mem(rdx, rsi, 1), rdx) + lea(mem(rdx, rsi, 4), rdx) // rdx += 8*cs_c + + /******************bottom left tile 8x4***************************/ + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) + + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( 
gamma06..gamma36 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) + + /******************bottom right tile 8x2***************************/ + vunpcklps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) + vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) + lea(mem(rdx, rsi, 1), rdx) + + vunpckhps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) + vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + //lea(mem(r14, r8, 4), r14) // + //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X16I) // iterate again if ii != 0. 
+ + label(.SRETURN) + end_asm( - : // output operands (none) - : // input operands + : // output operands (none) + : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), - [ps_a4] "m" (ps_a4), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -888,53 +1053,53 @@ void bli_sgemmsup_rv_zen_asm_6x16m [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", - "ymm4", "ymm5", "ymm6", "ymm7", - "ymm8", "ymm9", "ymm10", "ymm11", - "ymm12", "ymm13", "ymm14", "ymm15", - "memory" - ) - - consider_edge_cases: - - // Handle edge cases in the m dimension, if they exist. - if ( m_left ) - { - const dim_t nr_cur = 16; - const dim_t i_edge = m0 - ( dim_t )m_left; - - float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + m_iter*ps_a; - float* restrict bj = b; - - sgemmsup_ker_ft ker_fps[6] = - { - NULL, - bli_sgemmsup_rv_zen_asm_1x16, - bli_sgemmsup_rv_zen_asm_2x16, - bli_sgemmsup_rv_zen_asm_3x16, - bli_sgemmsup_rv_zen_asm_4x16, - bli_sgemmsup_rv_zen_asm_5x16 - }; - - sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; - - ker_fp - ( - conja, conjb, m_left, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - return; - - } + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", 
"ymm14", "ymm15", + "memory" + ) + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. + if ( m_left ) + { + const dim_t nr_cur = 16; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter*ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x16, + bli_sgemmsup_rv_zen_asm_2x16, + bli_sgemmsup_rv_zen_asm_3x16, + bli_sgemmsup_rv_zen_asm_4x16, + bli_sgemmsup_rv_zen_asm_5x16 + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + return; + + } } void bli_sgemmsup_rv_zen_asm_6x8m @@ -953,479 +1118,479 @@ void bli_sgemmsup_rv_zen_asm_6x8m cntx_t* restrict cntx ) { - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); - - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; - - uint64_t m_iter = m0 / 6; - uint64_t m_left = m0 % 6; - - uint64_t rs_a = rs_a0; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - uint64_t cs_b = cs_b0; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - - // Query the panel stride of A and convert it to units of bytes. - uint64_t ps_a = bli_auxinfo_ps_a( data ); - uint64_t ps_a4 = ps_a * sizeof( float ); - - if ( m_iter == 0 ) goto consider_edge_cases; - - // ------------------------------------------------------------------------- - begin_asm() - - mov(var(a), r14) // load address of a. 
- mov(var(rs_a), r8) // load rs_a - mov(var(cs_a), r9) // load cs_a - lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) - lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - - mov(var(rs_b), r10) // load rs_b - lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) - - // NOTE: We cannot pre-load elements of a or b - // because it could eventually, in the last - // unrolled iter or the cleanup loop, result - // in reading beyond the bounds allocated mem - // (the likely result: a segmentation fault). - - mov(var(c), r12) // load address of c - mov(var(rs_c), rdi) // load rs_c - lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - - - // During preamble and loops: - // r12 = rcx = c - // r14 = rax = a - // read rbx from var(b) near beginning of loop - // r11 = m dim index ii - - mov(var(m_iter), r11) // ii = m_iter; - - label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] - - // skylake can execute 3 vxorpd ipc with - // a latency of 1 cycle, while vzeroall - // has a latency of 12 cycles. - vxorps(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower - vxorps(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us down. - vxorps(ymm6, ymm6, ymm6) - vxorps(ymm8, ymm8, ymm8) - vxorps(ymm10, ymm10, ymm10) - vxorps(ymm12, ymm12, ymm12) - vxorps(ymm14, ymm14, ymm14) - - mov(var(b), rbx) // load address of b. - mov(r14, rax) // reset rax to current upanel of a. - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
- jz(.SCOLPFETCH) // jump to column storage case - label(.SROWPFETCH) // row-stored prefetching on c - - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c - - jmp(.SPOSTPFETCH) // jump to end of prefetching c - label(.SCOLPFETCH) // column-stored prefetching c - - mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) - lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) - lea(mem(r12, rsi, 2), rdx) // - lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c - prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c - prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c - prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c - prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c - - label(.SPOSTPFETCH) // done prefetching c - - - lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; - lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines - lea(mem(rdx, r8, 2), rdx) // from next upanel of a. - - - mov(var(k_iter), rsi) // i = k_iter; - test(rsi, rsi) // check i via logical AND. - je(.SCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. 
- - - label(.SLOOPKITER) // MAIN LOOP - - - // ---------------------------------- iteration 0 - prefetch(0, mem(rdx, 5*8)) - - vmovups(mem(rbx), ymm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm0, ymm3, ymm6) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm0, ymm3, ymm10) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm0, ymm3, ymm14) - - // ---------------------------------- iteration 1 - prefetch(0, mem(rdx, r9, 1, 5*8)) - - vmovups(mem(rbx), ymm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm0, ymm3, ymm6) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm0, ymm3, ymm10) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm0, ymm3, ymm14) - - // ---------------------------------- iteration 2 - prefetch(0, mem(rdx, r9, 2, 5*8)) - - vmovups(mem(rbx), ymm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm0, ymm3, ymm6) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm0, ymm3, ymm10) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm0, ymm3, ymm14) - - // ---------------------------------- iteration 3 - prefetch(0, mem(rdx, rcx, 1, 5*8)) - - vmovups(mem(rbx), ymm0) - add(r10, rbx) // b += rs_b; - - 
vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm0, ymm3, ymm6) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm0, ymm3, ymm10) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm0, ymm3, ymm14) - - - dec(rsi) // i -= 1; - jne(.SLOOPKITER) // iterate again if i != 0. - - - label(.SCONSIDKLEFT) - - mov(var(k_left), rsi) // i = k_left; - test(rsi, rsi) // check i via logical AND. - je(.SPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. - - label(.SLOOPKLEFT) // EDGE LOOP - - vmovups(mem(rbx), ymm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), ymm2) - vbroadcastss(mem(rax, r8, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm4) - vfmadd231ps(ymm0, ymm3, ymm6) - - vbroadcastss(mem(rax, r8, 2), ymm2) - vbroadcastss(mem(rax, r13, 1), ymm3) - vfmadd231ps(ymm0, ymm2, ymm8) - vfmadd231ps(ymm0, ymm3, ymm10) - - vbroadcastss(mem(rax, r8, 4), ymm2) - vbroadcastss(mem(rax, r15, 1), ymm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(ymm0, ymm2, ymm12) - vfmadd231ps(ymm0, ymm3, ymm14) - - - dec(rsi) // i -= 1; - jne(.SLOOPKLEFT) // iterate again if i != 0. - - - label(.SPOSTACCUM) - - mov(r12, rcx) // reset rcx to current utile of c. 
- mov(var(alpha), rax) // load address of alpha - mov(var(beta), rbx) // load address of beta - vbroadcastss(mem(rax), ymm0) // load alpha and duplicate - vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - - vmulps(ymm0, ymm4, ymm4) // scale by alpha - vmulps(ymm0, ymm6, ymm6) - vmulps(ymm0, ymm8, ymm8) - vmulps(ymm0, ymm10, ymm10) - vmulps(ymm0, ymm12, ymm12) - vmulps(ymm0, ymm14, ymm14) - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - // now avoid loading C if beta == 0 - - vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. - vucomiss(xmm0, xmm3) // set ZF if beta == 0. - je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. - jz(.SCOLSTORED) // jump to column storage case - - - label(.SROWSTORED) - - vfmadd231ps(mem(rcx), ymm3, ymm4) - vmovups(ymm4, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), ymm3, ymm6) - vmovups(ymm6, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), ymm3, ymm8) - vmovups(ymm8, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), ymm3, ymm10) - vmovups(ymm10, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), ymm3, ymm12) - vmovups(ymm12, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), ymm3, ymm14) - vmovups(ymm14, mem(rcx)) - - jmp(.SDONE) // jump to end. 
- - label(.SCOLSTORED) - - /****6x8 tile is transposed and saved in col major as 8x6*****/ - vunpcklps(ymm6, ymm4, ymm0) - vunpcklps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm1) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) - - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vunpckhps(ymm6, ymm4, ymm0) - vunpckhps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm1) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) - - vunpcklps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vpermilps(imm(0xe),xmm0,xmm5) - vpermilps(imm(0xe),xmm2,xmm6) - vmovq(mem(rdx),xmm4) - vmovq(mem(rdx, rsi, 4),xmm1) - vfmadd231ps(xmm4, xmm3, xmm0) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) - lea(mem(rdx, rsi, 1), rdx) - vmovq(mem(rdx),xmm4) - vmovq(mem(rdx, rsi, 4),xmm1) 
- vfmadd231ps(xmm4, xmm3, xmm5) - vfmadd231ps(xmm1, xmm3, xmm6) - vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) - vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) - lea(mem(rdx, rsi, 1), rdx) - - vunpckhps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vpermilps(imm(0xe),xmm0,xmm5) - vpermilps(imm(0xe),xmm2,xmm6) - vmovq(mem(rdx),xmm4) - vmovq(mem(rdx, rsi, 4),xmm1) - vfmadd231ps(xmm4, xmm3, xmm0) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) - lea(mem(rdx, rsi, 1), rdx) - vmovq(mem(rdx),xmm4) - vmovq(mem(rdx, rsi, 4),xmm1) - vfmadd231ps(xmm4, xmm3, xmm5) - vfmadd231ps(xmm1, xmm3, xmm6) - vmovlpd(xmm5, mem(rdx)) // store ( gamma43..gamma53 ) - vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) - - jmp(.SDONE) // jump to end. - - label(.SBETAZERO) - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. - jz(.SCOLSTORBZ) // jump to column storage case - - label(.SROWSTORBZ) - - vmovups(ymm4, mem(rcx)) - add(rdi, rcx) - vmovups(ymm6, mem(rcx)) - add(rdi, rcx) - vmovups(ymm8, mem(rcx)) - add(rdi, rcx) - vmovups(ymm10, mem(rcx)) - add(rdi, rcx) - vmovups(ymm12, mem(rcx)) - add(rdi, rcx) - vmovups(ymm14, mem(rcx)) - - jmp(.SDONE) // jump to end. 
- - label(.SCOLSTORBZ) - - vunpcklps(ymm6, ymm4, ymm0) - vunpcklps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) - - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vunpckhps(ymm6, ymm4, ymm0) - vunpckhps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) - /******************top right tile 8x2***************************/ - vunpcklps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) - vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) - lea(mem(rdx, rsi, 1), rdx) - - vunpckhps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) - vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) - - - label(.SDONE) - - lea(mem(r12, rdi, 4), r12) // - 
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c - - //lea(mem(r14, r8, 4), r14) // - //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a - mov(var(ps_a4), rax) // load ps_a4 - lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 - - dec(r11) // ii -= 1; - jne(.SLOOP6X8I) // iterate again if ii != 0. - - label(.SRETURN) + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). 
+ + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + // skylake can execute 3 vxorpd ipc with + // a latency of 1 cycle, while vzeroall + // has a latency of 12 cycles. + vxorps(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower + vxorps(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us down. + vxorps(ymm6, ymm6, ymm6) + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm14, ymm14, ymm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, 
mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmovups(mem(rbx), ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm0, ymm3, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm0, ymm3, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm0, ymm3, ymm14) + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmovups(mem(rbx), ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm0, ymm3, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm0, ymm3, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm0, ymm3, ymm14) + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmovups(mem(rbx), ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm0, 
ymm3, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm0, ymm3, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm0, ymm3, ymm14) + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmovups(mem(rbx), ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm0, ymm3, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm0, ymm3, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm0, ymm3, ymm14) + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmovups(mem(rbx), ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vbroadcastss(mem(rax, r8, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm0, ymm3, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vbroadcastss(mem(rax, r13, 1), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm0, ymm3, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vbroadcastss(mem(rax, r15, 1), ymm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm0, ymm3, ymm14) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm3) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm14, ymm14) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), ymm3, ymm6) + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), ymm3, ymm8) + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), ymm3, ymm10) + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), ymm3, ymm12) + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), ymm3, ymm14) + vmovups(ymm14, mem(rcx)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOLSTORED) + + /****6x8 tile is transposed and saved in col major as 8x6*****/ + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm1) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) + + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm1) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) + + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vpermilps(imm(0xe),xmm0,xmm5) + vpermilps(imm(0xe),xmm2,xmm6) + vmovq(mem(rdx),xmm4) + vmovq(mem(rdx, rsi, 4),xmm1) + vfmadd231ps(xmm4, xmm3, xmm0) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) + lea(mem(rdx, rsi, 1), rdx) + vmovq(mem(rdx),xmm4) + vmovq(mem(rdx, rsi, 4),xmm1) 
+ vfmadd231ps(xmm4, xmm3, xmm5) + vfmadd231ps(xmm1, xmm3, xmm6) + vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) + vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) + lea(mem(rdx, rsi, 1), rdx) + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vpermilps(imm(0xe),xmm0,xmm5) + vpermilps(imm(0xe),xmm2,xmm6) + vmovq(mem(rdx),xmm4) + vmovq(mem(rdx, rsi, 4),xmm1) + vfmadd231ps(xmm4, xmm3, xmm0) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) + lea(mem(rdx, rsi, 1), rdx) + vmovq(mem(rdx),xmm4) + vmovq(mem(rdx, rsi, 4),xmm1) + vfmadd231ps(xmm4, xmm3, xmm5) + vfmadd231ps(xmm1, xmm3, xmm6) + vmovlpd(xmm5, mem(rdx)) // store ( gamma43..gamma53 ) + vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vmovups(ymm14, mem(rcx)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOLSTORBZ) + + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) + + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) + /******************top right tile 8x2***************************/ + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) + vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) + lea(mem(rdx, rsi, 1), rdx) + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) + vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) + + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + 
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + //lea(mem(r14, r8, 4), r14) // + //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X8I) // iterate again if ii != 0. + + label(.SRETURN) end_asm( - : // output operands (none) - : // input operands + : // output operands (none) + : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), - [ps_a4] "m" (ps_a4), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -1436,51 +1601,51 @@ void bli_sgemmsup_rv_zen_asm_6x8m [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", - "ymm4", "ymm6", "ymm8", "ymm10", - "ymm12", "ymm14", - "memory" - ) - - consider_edge_cases: - - // Handle edge cases in the m dimension, if they exist. 
- if ( m_left ) - { - const dim_t nr_cur = 8; - const dim_t i_edge = m0 - ( dim_t )m_left; - - float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + m_iter*ps_a; - float* restrict bj = b; - - sgemmsup_ker_ft ker_fps[6] = - { - NULL, - bli_sgemmsup_rv_zen_asm_1x8, - bli_sgemmsup_rv_zen_asm_2x8, - bli_sgemmsup_rv_zen_asm_3x8, - bli_sgemmsup_rv_zen_asm_4x8, - bli_sgemmsup_rv_zen_asm_5x8 - }; - - sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; - - ker_fp - ( - conja, conjb, m_left, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - return; - } + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", + "memory" + ) + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
+ if ( m_left ) + { + const dim_t nr_cur = 8; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter*ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x8, + bli_sgemmsup_rv_zen_asm_2x8, + bli_sgemmsup_rv_zen_asm_3x8, + bli_sgemmsup_rv_zen_asm_4x8, + bli_sgemmsup_rv_zen_asm_5x8 + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + return; + } } void bli_sgemmsup_rv_zen_asm_6x4m @@ -1499,420 +1664,420 @@ void bli_sgemmsup_rv_zen_asm_6x4m cntx_t* restrict cntx ) { - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); - - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; - - uint64_t m_iter = m0 / 6; - uint64_t m_left = m0 % 6; - - uint64_t rs_a = rs_a0; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - uint64_t cs_b = cs_b0; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - - // Query the panel stride of A and convert it to units of bytes. - uint64_t ps_a = bli_auxinfo_ps_a( data ); - uint64_t ps_a4 = ps_a * sizeof( float ); - - if ( m_iter == 0 ) goto consider_edge_cases; - - // ------------------------------------------------------------------------- - begin_asm() - - mov(var(a), r14) // load address of a. 
- mov(var(rs_a), r8) // load rs_a - mov(var(cs_a), r9) // load cs_a - lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) - lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - - mov(var(rs_b), r10) // load rs_b - lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) - - // NOTE: We cannot pre-load elements of a or b - // because it could eventually, in the last - // unrolled iter or the cleanup loop, result - // in reading beyond the bounds allocated mem - // (the likely result: a segmentation fault). - - mov(var(c), r12) // load address of c - mov(var(rs_c), rdi) // load rs_c - lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - - // During preamble and loops: - // r12 = rcx = c // r14 = rax = a - // read rbx from var(b) near beginning of loop - // r11 = m dim index ii - - mov(var(m_iter), r11) // ii = m_iter; - - label(.SLOOP6X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] - - vxorps(xmm1, xmm1, xmm1) - vxorps(xmm4, xmm4, xmm4) - vxorps(xmm6, xmm6, xmm6) - vxorps(xmm8, xmm8, xmm8) - vxorps(xmm10, xmm10, xmm10) - vxorps(xmm12, xmm12, xmm12) - vxorps(xmm14, xmm14, xmm14) - - mov(var(b), rbx) // load address of b. - mov(r14, rax) // reset rax to current upanel of a. - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
- jz(.SCOLPFETCH) // jump to column storage case - label(.SROWPFETCH) // row-stored prefetching on c - - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c - - jmp(.SPOSTPFETCH) // jump to end of prefetching c - label(.SCOLPFETCH) // column-stored prefetching c - - mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) - lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) - lea(mem(r12, rsi, 2), rdx) // - lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c - prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c - prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c - prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c - prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c - - label(.SPOSTPFETCH) // done prefetching c - - lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; - lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines - lea(mem(rdx, r8, 2), rdx) // from next upanel of a. - - mov(var(k_iter), rsi) // i = k_iter; - test(rsi, rsi) // check i via logical AND. - je(.SCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. 
- label(.SLOOPKITER) // MAIN LOOP - - // ---------------------------------- iteration 0 - prefetch(0, mem(rdx, 5*8)) - - vmovups(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - // ---------------------------------- iteration 1 - prefetch(0, mem(rdx, r9, 1, 5*8)) - - vmovups(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - // ---------------------------------- iteration 2 - prefetch(0, mem(rdx, r9, 2, 5*8)) - - vmovups(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - // ---------------------------------- iteration 3 - prefetch(0, mem(rdx, rcx, 1, 5*8)) - - vmovups(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - 
vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - dec(rsi) // i -= 1; - jne(.SLOOPKITER) // iterate again if i != 0. - - label(.SCONSIDKLEFT) - - mov(var(k_left), rsi) // i = k_left; - test(rsi, rsi) // check i via logical AND. - je(.SPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. - - label(.SLOOPKLEFT) // EDGE LOOP - - vmovups(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - - dec(rsi) // i -= 1; - jne(.SLOOPKLEFT) // iterate again if i != 0. - - - label(.SPOSTACCUM) - - mov(r12, rcx) // reset rcx to current utile of c. 
- mov(var(alpha), rax) // load address of alpha - mov(var(beta), rbx) // load address of beta - vbroadcastss(mem(rax), xmm0) // load alpha and duplicate - vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - - vmulps(xmm0, xmm4, xmm4) // scale by alpha - vmulps(xmm0, xmm6, xmm6) - vmulps(xmm0, xmm8, xmm8) - vmulps(xmm0, xmm10, xmm10) - vmulps(xmm0, xmm12, xmm12) - vmulps(xmm0, xmm14, xmm14) - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - // now avoid loading C if beta == 0 - - vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. - vucomiss(xmm0, xmm3) // set ZF if beta == 0. - je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. - jz(.SCOLSTORED) // jump to column storage case - - - label(.SROWSTORED) - - vfmadd231ps(mem(rcx), xmm3, xmm4) - vmovups(xmm4, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm6) - vmovups(xmm6, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm8) - vmovups(xmm8, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm10) - vmovups(xmm10, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm12) - vmovups(xmm12, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm14) - vmovups(xmm14, mem(rcx)) - - jmp(.SDONE) // jump to end. 
- - label(.SCOLSTORED) - - /****6x4 tile is transposed and saved in col major as 4x6*****/ - vunpcklps(xmm6, xmm4, xmm0) - vunpcklps(xmm10, xmm8, xmm1) - vshufps(imm(0x4e), xmm1, xmm0, xmm2) - vblendps(imm(0xcc), xmm2, xmm0, xmm0) - vblendps(imm(0x33), xmm2, xmm1, xmm1) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vfmadd231ps(mem(rcx), xmm3, xmm1) - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - - vunpckhps(xmm6, xmm4, xmm0) - vunpckhps(xmm10, xmm8, xmm1) - vshufps(imm(0x4e), xmm1, xmm0, xmm2) - vblendps(imm(0xcc), xmm2, xmm0, xmm0) - vblendps(imm(0x33), xmm2, xmm1, xmm1) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vfmadd231ps(mem(rcx), xmm3, xmm1) - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - - vunpcklps(xmm14, xmm12, xmm0) - vpermilps(imm(0x4e), xmm0, xmm5) + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), r14) // load address of a. 
+ mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + // During preamble and loops: + // r12 = rcx = c // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm10, xmm10, xmm10) + vxorps(xmm12, xmm12, xmm12) + vxorps(xmm14, xmm14, xmm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmovups(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmovups(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmovups(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmovups(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + 
vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmovups(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + vmulps(xmm0, xmm8, xmm8) + vmulps(xmm0, xmm10, xmm10) + vmulps(xmm0, xmm12, xmm12) + vmulps(xmm0, xmm14, xmm14) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx), xmm3, xmm4) + vmovups(xmm4, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), xmm3, xmm6) + vmovups(xmm6, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), xmm3, xmm8) + vmovups(xmm8, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), xmm3, xmm10) + vmovups(xmm10, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), xmm3, xmm12) + vmovups(xmm12, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rcx), xmm3, xmm14) + vmovups(xmm14, mem(rcx)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOLSTORED) + + /****6x4 tile is transposed and saved in col major as 4x6*****/ + vunpcklps(xmm6, xmm4, xmm0) + vunpcklps(xmm10, xmm8, xmm1) + vshufps(imm(0x4e), xmm1, xmm0, xmm2) + vblendps(imm(0xcc), xmm2, xmm0, xmm0) + vblendps(imm(0x33), xmm2, xmm1, xmm1) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vfmadd231ps(mem(rcx), xmm3, xmm1) + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + + vunpckhps(xmm6, xmm4, xmm0) + vunpckhps(xmm10, xmm8, xmm1) + vshufps(imm(0x4e), xmm1, xmm0, xmm2) + vblendps(imm(0xcc), xmm2, xmm0, xmm0) + vblendps(imm(0x33), xmm2, xmm1, xmm1) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vfmadd231ps(mem(rcx), xmm3, xmm1) + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + + vunpcklps(xmm14, xmm12, xmm0) + vpermilps(imm(0x4e), xmm0, xmm5) vmovq(mem(rdx),xmm4) - vfmadd231ps(xmm4, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - - lea(mem(rdx, rsi, 1), rdx) - vmovq(mem(rdx),xmm4) - vfmadd231ps(xmm4, xmm3, xmm5) - vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) - - lea(mem(rdx, rsi, 1), rdx) - vunpckhps(xmm14, xmm12, xmm0) - vpermilps(imm(0x4e), xmm0, xmm5) - vmovq(mem(rdx),xmm4) - vfmadd231ps(xmm4, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - - lea(mem(rdx, rsi, 1), rdx) - vmovq(mem(rdx),xmm4) - vfmadd231ps(xmm4, xmm3, xmm5) - vmovlpd(xmm5, mem(rdx)) // store ( gamma43..gamma53 ) - - jmp(.SDONE) // jump to end. - - label(.SBETAZERO) - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
- jz(.SCOLSTORBZ) // jump to column storage case - - label(.SROWSTORBZ) - - vmovups(xmm4, mem(rcx)) - add(rdi, rcx) - vmovups(xmm6, mem(rcx)) - add(rdi, rcx) - vmovups(xmm8, mem(rcx)) - add(rdi, rcx) - vmovups(xmm10, mem(rcx)) - add(rdi, rcx) - vmovups(xmm12, mem(rcx)) - add(rdi, rcx) - vmovups(xmm14, mem(rcx)) - - jmp(.SDONE) // jump to end. - - label(.SCOLSTORBZ) - - vunpcklps(xmm6, xmm4, xmm0) - vunpcklps(xmm10, xmm8, xmm1) - vshufps(imm(0x4e), xmm1, xmm0, xmm2) - vblendps(imm(0xcc), xmm2, xmm0, xmm0) - vblendps(imm(0x33), xmm2, xmm1, xmm1) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vunpckhps(xmm6, xmm4, xmm0) - vunpckhps(xmm10, xmm8, xmm1) - vshufps(imm(0x4e), xmm1, xmm0, xmm2) - vblendps(imm(0xcc), xmm2, xmm0, xmm0) - vblendps(imm(0x33), xmm2, xmm1, xmm1) - vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) - - vunpcklps(xmm14, xmm12, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) - lea(mem(rdx, rsi, 1), rdx) - vunpckhps(xmm14, xmm12, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) - - label(.SDONE) - - lea(mem(r12, rdi, 4), r12) // - lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c - - //lea(mem(r14, r8, 4), r14) // - //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a - mov(var(ps_a4), rax) // load ps_a4 - lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 - - dec(r11) // ii -= 1; - jne(.SLOOP6X4I) // iterate again if ii != 0. 
- - label(.SRETURN) + vfmadd231ps(xmm4, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + + lea(mem(rdx, rsi, 1), rdx) + vmovq(mem(rdx),xmm4) + vfmadd231ps(xmm4, xmm3, xmm5) + vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) + + lea(mem(rdx, rsi, 1), rdx) + vunpckhps(xmm14, xmm12, xmm0) + vpermilps(imm(0x4e), xmm0, xmm5) + vmovq(mem(rdx),xmm4) + vfmadd231ps(xmm4, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + + lea(mem(rdx, rsi, 1), rdx) + vmovq(mem(rdx),xmm4) + vfmadd231ps(xmm4, xmm3, xmm5) + vmovlpd(xmm5, mem(rdx)) // store ( gamma43..gamma53 ) + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(xmm4, mem(rcx)) + add(rdi, rcx) + vmovups(xmm6, mem(rcx)) + add(rdi, rcx) + vmovups(xmm8, mem(rcx)) + add(rdi, rcx) + vmovups(xmm10, mem(rcx)) + add(rdi, rcx) + vmovups(xmm12, mem(rcx)) + add(rdi, rcx) + vmovups(xmm14, mem(rcx)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOLSTORBZ) + + vunpcklps(xmm6, xmm4, xmm0) + vunpcklps(xmm10, xmm8, xmm1) + vshufps(imm(0x4e), xmm1, xmm0, xmm2) + vblendps(imm(0xcc), xmm2, xmm0, xmm0) + vblendps(imm(0x33), xmm2, xmm1, xmm1) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vunpckhps(xmm6, xmm4, xmm0) + vunpckhps(xmm10, xmm8, xmm1) + vshufps(imm(0x4e), xmm1, xmm0, xmm2) + vblendps(imm(0xcc), xmm2, xmm0, xmm0) + vblendps(imm(0x33), xmm2, xmm1, xmm1) + vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) + + vunpcklps(xmm14, xmm12, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) + lea(mem(rdx, rsi, 1), rdx) + vunpckhps(xmm14, xmm12, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + //lea(mem(r14, r8, 4), r14) // + //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X4I) // iterate again if ii != 0. 
+ + label(.SRETURN) end_asm( - : // output operands (none) - : // input operands + : // output operands (none) + : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), - [ps_a4] "m" (ps_a4), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -1923,48 +2088,48 @@ void bli_sgemmsup_rv_zen_asm_6x4m [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ) - - consider_edge_cases: - - // Handle edge cases in the m dimension, if they exist. - if ( m_left ) - { - const dim_t nr_cur = 4; - const dim_t i_edge = m0 - ( dim_t )m_left; - - float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + m_iter*ps_a; - float* restrict bj = b; - - sgemmsup_ker_ft ker_fps[6] = - { - NULL, - bli_sgemmsup_rv_zen_asm_1x4, - bli_sgemmsup_rv_zen_asm_2x4, - bli_sgemmsup_rv_zen_asm_3x4, - bli_sgemmsup_rv_zen_asm_4x4, - bli_sgemmsup_rv_zen_asm_5x4 - }; - - sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; - - ker_fp - ( - conja, conjb, m_left, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - return; - } + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ) + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
+ if ( m_left ) + { + const dim_t nr_cur = 4; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter*ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x4, + bli_sgemmsup_rv_zen_asm_2x4, + bli_sgemmsup_rv_zen_asm_3x4, + bli_sgemmsup_rv_zen_asm_4x4, + bli_sgemmsup_rv_zen_asm_5x4 + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + return; + } } void bli_sgemmsup_rv_zen_asm_6x2m @@ -1983,386 +2148,386 @@ void bli_sgemmsup_rv_zen_asm_6x2m cntx_t* restrict cntx ) { - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); - - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; - - uint64_t m_iter = m0 / 6; - uint64_t m_left = m0 % 6; - - uint64_t rs_a = rs_a0; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - uint64_t cs_b = cs_b0; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - - // Query the panel stride of A and convert it to units of bytes. - uint64_t ps_a = bli_auxinfo_ps_a( data ); - uint64_t ps_a4 = ps_a * sizeof( float ); - - if ( m_iter == 0 ) goto consider_edge_cases; - - // ------------------------------------------------------------------------- - begin_asm() - - mov(var(a), r14) // load address of a. 
- mov(var(rs_a), r8) // load rs_a - mov(var(cs_a), r9) // load cs_a - lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) - lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - - mov(var(rs_b), r10) // load rs_b - lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) - - // NOTE: We cannot pre-load elements of a or b - // because it could eventually, in the last - // unrolled iter or the cleanup loop, result - // in reading beyond the bounds allocated mem - // (the likely result: a segmentation fault). - - mov(var(c), r12) // load address of c - mov(var(rs_c), rdi) // load rs_c - lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - - // During preamble and loops: - // r12 = rcx = c // r14 = rax = a - // read rbx from var(b) near beginning of loop - // r11 = m dim index ii - - mov(var(m_iter), r11) // ii = m_iter; - - label(.SLOOP6X2I) // LOOP OVER ii = [ m_iter ... 1 0 ] - - vxorps(xmm1, xmm1, xmm1) - vxorps(xmm4, xmm4, xmm4) - vxorps(xmm6, xmm6, xmm6) - vxorps(xmm8, xmm8, xmm8) - vxorps(xmm10, xmm10, xmm10) - vxorps(xmm12, xmm12, xmm12) - vxorps(xmm14, xmm14, xmm14) - - mov(var(b), rbx) // load address of b. - mov(r14, rax) // reset rax to current upanel of a. - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
- jz(.SCOLPFETCH) // jump to column storage case - label(.SROWPFETCH) // row-stored prefetching on c - - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c - - jmp(.SPOSTPFETCH) // jump to end of prefetching c - label(.SCOLPFETCH) // column-stored prefetching c - - mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) - lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) - lea(mem(r12, rsi, 2), rdx) // - lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c - prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c - prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c - prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c - prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c - - label(.SPOSTPFETCH) // done prefetching c - - lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; - lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines - lea(mem(rdx, r8, 2), rdx) // from next upanel of a. - - mov(var(k_iter), rsi) // i = k_iter; - test(rsi, rsi) // check i via logical AND. - je(.SCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. 
- label(.SLOOPKITER) // MAIN LOOP - - // ---------------------------------- iteration 0 - prefetch(0, mem(rdx, 5*8)) - vmovq(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - // ---------------------------------- iteration 1 - prefetch(0, mem(rdx, r9, 1, 5*8)) - - vmovq(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - // ---------------------------------- iteration 2 - prefetch(0, mem(rdx, r9, 2, 5*8)) - - vmovq(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - // ---------------------------------- iteration 3 - prefetch(0, mem(rdx, rcx, 1, 5*8)) - - vmovq(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), 
xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - dec(rsi) // i -= 1; - jne(.SLOOPKITER) // iterate again if i != 0. - - label(.SCONSIDKLEFT) - - mov(var(k_left), rsi) // i = k_left; - test(rsi, rsi) // check i via logical AND. - je(.SPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. - - label(.SLOOPKLEFT) // EDGE LOOP - - vmovq(mem(rbx), xmm0) - add(r10, rbx) // b += rs_b; - - vbroadcastss(mem(rax ), xmm2) - vbroadcastss(mem(rax, r8, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm4) - vfmadd231ps(xmm0, xmm3, xmm6) - - vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) - vfmadd231ps(xmm0, xmm2, xmm8) - vfmadd231ps(xmm0, xmm3, xmm10) - - vbroadcastss(mem(rax, r8, 4), xmm2) - vbroadcastss(mem(rax, r15, 1), xmm3) - add(r9, rax) // a += cs_a; - vfmadd231ps(xmm0, xmm2, xmm12) - vfmadd231ps(xmm0, xmm3, xmm14) - - - dec(rsi) // i -= 1; - jne(.SLOOPKLEFT) // iterate again if i != 0. - - - label(.SPOSTACCUM) - - mov(r12, rcx) // reset rcx to current utile of c. 
- mov(var(alpha), rax) // load address of alpha - mov(var(beta), rbx) // load address of beta - vbroadcastss(mem(rax), xmm0) // load alpha and duplicate - vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - - vmulps(xmm0, xmm4, xmm4) // scale by alpha - vmulps(xmm0, xmm6, xmm6) - vmulps(xmm0, xmm8, xmm8) - vmulps(xmm0, xmm10, xmm10) - vmulps(xmm0, xmm12, xmm12) - vmulps(xmm0, xmm14, xmm14) - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; - lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - // now avoid loading C if beta == 0 - - vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. - vucomiss(xmm0, xmm3) // set ZF if beta == 0. - je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. - jz(.SCOLSTORED) // jump to column storage case - - - label(.SROWSTORED) - - vmovsd(mem(rcx), xmm0) - vfmadd231ps(xmm0, xmm3, xmm4) - vmovlpd(xmm4, mem(rcx)) - add(rdi, rcx) - vmovsd(mem(rcx), xmm0) - vfmadd231ps(xmm0, xmm3, xmm6) - vmovlpd(xmm6, mem(rcx)) - add(rdi, rcx) - vmovsd(mem(rcx), xmm0) - vfmadd231ps(xmm0, xmm3, xmm8) - vmovlpd(xmm8, mem(rcx)) - add(rdi, rcx) - vmovsd(mem(rcx), xmm0) - vfmadd231ps(xmm0, xmm3, xmm10) - vmovlpd(xmm10, mem(rcx)) - add(rdi, rcx) - vmovsd(mem(rcx), xmm0) - vfmadd231ps(xmm0, xmm3, xmm12) - vmovlpd(xmm12, mem(rcx)) - add(rdi, rcx) - vmovsd(mem(rcx), xmm0) - vfmadd231ps(xmm0, xmm3, xmm14) - vmovlpd(xmm14, mem(rcx)) - - jmp(.SDONE) // jump to end. 
- - label(.SCOLSTORED) - - /****6x2 tile is transposed and saved in col major as 2x6*****/ - vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 - vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 - vshufps(imm(0x44), xmm1, xmm0, xmm2) //01-00-01-00 - vshufps(imm(0xee), xmm1, xmm0, xmm4) //11-10-11-10 - - vfmadd231ps(mem(rcx), xmm3, xmm2) - vmovupd(xmm2, mem(rcx)) // store ( gamma00..gamma30 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vfmadd231ps(mem(rcx), xmm3, xmm4) - vmovupd(xmm4, mem(rcx)) // store ( gamma01..gamma31 ) - - vunpcklps(xmm14, xmm12, xmm0)//eof0e1f1 - vpermilps(imm(0x4e),xmm0,xmm5) - vmovq(mem(rdx), xmm4) - vfmadd231ps(xmm4, xmm3, xmm0) - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - lea(mem(rdx, rsi, 1), rdx) - vmovq(mem(rdx), xmm4) - vfmadd231ps(xmm4, xmm3, xmm5) - vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) - - jmp(.SDONE) // jump to end. - - label(.SBETAZERO) - - cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. - jz(.SCOLSTORBZ) // jump to column storage case - - label(.SROWSTORBZ) - - vmovlpd(xmm4, mem(rcx)) - add(rdi, rcx) - vmovlpd(xmm6, mem(rcx)) - add(rdi, rcx) - vmovlpd(xmm8, mem(rcx)) - add(rdi, rcx) - vmovlpd(xmm10, mem(rcx)) - add(rdi, rcx) - vmovlpd(xmm12, mem(rcx)) - add(rdi, rcx) - vmovlpd(xmm14, mem(rcx)) - - jmp(.SDONE) // jump to end. 
- - label(.SCOLSTORBZ) - - vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 - vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 - vshufps(imm(0x44), xmm1, xmm0, xmm2) //01-00-01-00 - vshufps(imm(0xee), xmm1, xmm0, xmm4) //11-10-11-10 - - vmovupd(xmm2, mem(rcx)) // store ( gamma00..gamma30 ) - lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c - vmovupd(xmm4, mem(rcx)) // store ( gamma01..gamma31 ) - - vunpcklps(xmm14, xmm12, xmm0)//eof0e1f1 - vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) - lea(mem(rdx, rsi, 1), rdx) - vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) - - label(.SDONE) - - lea(mem(r12, rdi, 4), r12) // - lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c - - //lea(mem(r14, r8, 4), r14) // - //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a - mov(var(ps_a4), rax) // load ps_a4 - lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 - - dec(r11) // ii -= 1; - jne(.SLOOP6X2I) // iterate again if ii != 0. - - label(.SRETURN) + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(a), r14) // load address of a. 
+ mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + // During preamble and loops: + // r12 = rcx = c // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X2I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm10, xmm10, xmm10) + vxorps(xmm12, xmm12, xmm12) + vxorps(xmm14, xmm14, xmm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + vmovq(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmovq(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmovq(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmovq(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), 
xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmovq(mem(rbx), xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + vmulps(xmm0, xmm8, xmm8) + vmulps(xmm0, xmm10, xmm10) + vmulps(xmm0, xmm12, xmm12) + vmulps(xmm0, xmm14, xmm14) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) + vmovlpd(xmm4, mem(rcx)) + add(rdi, rcx) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) + vmovlpd(xmm6, mem(rcx)) + add(rdi, rcx) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) + vmovlpd(xmm8, mem(rcx)) + add(rdi, rcx) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm10) + vmovlpd(xmm10, mem(rcx)) + add(rdi, rcx) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) + vmovlpd(xmm12, mem(rcx)) + add(rdi, rcx) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm14) + vmovlpd(xmm14, mem(rcx)) + + jmp(.SDONE) // jump to end.
+ + label(.SCOLSTORED) + + /****6x2 tile is transposed and saved in col major as 2x6*****/ + vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 + vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 + vshufps(imm(0x44), xmm1, xmm0, xmm2) //01-00-01-00 + vshufps(imm(0xee), xmm1, xmm0, xmm4) //11-10-11-10 + + vfmadd231ps(mem(rcx), xmm3, xmm2) + vmovupd(xmm2, mem(rcx)) // store ( gamma00..gamma30 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vfmadd231ps(mem(rcx), xmm3, xmm4) + vmovupd(xmm4, mem(rcx)) // store ( gamma01..gamma31 ) + + vunpcklps(xmm14, xmm12, xmm0)//e0f0e1f1 + vpermilps(imm(0x4e),xmm0,xmm5) + vmovq(mem(rdx), xmm4) + vfmadd231ps(xmm4, xmm3, xmm0) + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + lea(mem(rdx, rsi, 1), rdx) + vmovq(mem(rdx), xmm4) + vfmadd231ps(xmm4, xmm3, xmm5) + vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovlpd(xmm4, mem(rcx)) + add(rdi, rcx) + vmovlpd(xmm6, mem(rcx)) + add(rdi, rcx) + vmovlpd(xmm8, mem(rcx)) + add(rdi, rcx) + vmovlpd(xmm10, mem(rcx)) + add(rdi, rcx) + vmovlpd(xmm12, mem(rcx)) + add(rdi, rcx) + vmovlpd(xmm14, mem(rcx)) + + jmp(.SDONE) // jump to end.
+ + label(.SCOLSTORBZ) + + vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 + vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 + vshufps(imm(0x44), xmm1, xmm0, xmm2) //01-00-01-00 + vshufps(imm(0xee), xmm1, xmm0, xmm4) //11-10-11-10 + + vmovupd(xmm2, mem(rcx)) // store ( gamma00..gamma30 ) + lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c + vmovupd(xmm4, mem(rcx)) // store ( gamma01..gamma31 ) + + vunpcklps(xmm14, xmm12, xmm0)//e0f0e1f1 + vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) + lea(mem(rdx, rsi, 1), rdx) + vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + //lea(mem(r14, r8, 4), r14) // + //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X2I) // iterate again if ii != 0. + + label(.SRETURN) end_asm( - : // output operands (none) - : // input operands + : // output operands (none) + : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), - [ps_a4] "m" (ps_a4), + [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), @@ -2373,46 +2538,1540 @@ void bli_sgemmsup_rv_zen_asm_6x2m [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ) - - consider_edge_cases: - - // Handle edge cases in the m dimension, if they exist.
- if ( m_left ) - { - const dim_t nr_cur = 2; - const dim_t i_edge = m0 - ( dim_t )m_left; - - float* restrict cij = c + i_edge*rs_c; - float* restrict ai = a + m_iter*ps_a; - float* restrict bj = b; - - sgemmsup_ker_ft ker_fps[6] = - { - NULL, - bli_sgemmsup_rv_zen_asm_1x2, - bli_sgemmsup_rv_zen_asm_2x2, - bli_sgemmsup_rv_zen_asm_3x2, - bli_sgemmsup_rv_zen_asm_4x2, - bli_sgemmsup_rv_zen_asm_5x2 - }; - - sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; - - ker_fp - ( - conja, conjb, m_left, nr_cur, k0, - alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, - beta, cij, rs_c0, cs_c0, data, cntx - ); - return; - } + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ) + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. + if ( m_left ) + { + const dim_t nr_cur = 2; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter*ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x2, + bli_sgemmsup_rv_zen_asm_2x2, + bli_sgemmsup_rv_zen_asm_3x2, + bli_sgemmsup_rv_zen_asm_4x2, + bli_sgemmsup_rv_zen_asm_5x2 + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + return; + } +} + +/* Mask elements to specify how many elements to be loaded from C buffer */ +static const int32_t mask[8][8] = { {0, 0, 0, 0, 0, 0, 0, 0}, //load no values, not used currently + {-1, 0, 0, 0, 0, 0, 0, 0}, // load 1 value from memory + {-1, -1, 0, 0, 0, 0, 0, 0}, // load 2 values from memory + {-1, -1, -1, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, 0, 0, 0}, + {-1, -1, -1, -1, -1, 
-1, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, 0}, + }; + +void bli_sgemmsup_rv_zen_asm_6x16m_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // This kernel is called when n_left is greater than 8. This kernel operates 16 columns at time. 
+ // First 8 elements can be loaded directly and next elements will be loaded based the mask reg + // + // Sets up the mask for loading relevant remainder elements in load direction + // + // ______ymm0______ __________ymm1_________ + // | | | | | | | | | | | | | | | | | | + // |0|1|2|3|4|5|6|7| |8|9|10|11|12|13|14|15| ----> Source vector + // |_|_|_|_|_|_|_|_| |_|_|__|__|__|__|__|__| + // + // ________________ ______ymm3_______ + // | | | | | | | | | | | | | | | | | | + // |NoMASK Required| |x|x|x|x|x|x|x|x| ----> Mask vector[x can be -1/0] + // |_|_|_|_|_|_|_|_| |_|_|_|_|_|_|_|_| + // + // For example when n_left = 13 + // ________________ ________ymm3__________ + // | | | | | | | | | | | | | | | | | | + // |NoMASK Required| |-1|-1|-1|-1|-1|0|0|0| ----> Mask vector + // |_|_|_|_|_|_|_|_| |__|__|__|__|__|_|_|_| + // + // ______ymm0_______ ________ymm1__________ + // | | | | | | | | | | | | | | | | | | + // |0|1|2|3|4|5|6|7| |8|9|10|11|12|0 |0 |0 | ----> Destination vector + // |_|_|_|_|_|_|_|_| |_|_|__|__|__|__|__|__| + // + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load mask values + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). 
+ + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X15I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + // skylake can execute 3 vxorps ipc with + // a latency of 1 cycle, while vzeroall + // has a latency of 12 cycles. + vxorps(ymm4, ymm4, ymm4) + vxorps(ymm5, ymm5, ymm5) + vxorps(ymm6, ymm6, ymm6) + vxorps(ymm7, ymm7, ymm7) + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm9, ymm9, ymm9) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm11, ymm11, ymm11) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm13, ymm13, ymm13) + vxorps(ymm14, ymm14, ymm14) + vxorps(ymm15, ymm15, ymm15) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2,15*4)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch 
c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c + lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c + lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + + prefetch(0, mem(rdx, 5*8)) + + vmovups(mem(rbx, 0*32), ymm0) //load first 8 elements + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) //load next required elements + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + vfmadd231ps(ymm1, ymm2, ymm15) + + // ---------------------------------- iteration 1 + + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + vfmadd231ps(ymm1, ymm2, ymm15) + + // ---------------------------------- iteration 2 + + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), 
ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + vfmadd231ps(ymm1, ymm2, ymm15) + + // ---------------------------------- iteration 3 + + prefetch(0, mem(rdx, rcx, 1, 5*8)) + lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + vfmadd231ps(ymm1, ymm2, ymm15) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // ee, we prepare to enter k_left loop. 
+ + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + vfmadd231ps(ymm1, ymm2, ymm15) + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm1) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm11, ymm11) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm13, ymm13) + vmulps(ymm0, ymm14, ymm14) + vmulps(ymm0, ymm15, ymm15) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm1) // set ZF if beta == 0. 
+ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm4) + vmovups(ymm4, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm5) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) //store only required elements + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm6) + vmovups(ymm6, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm7) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm8) + vmovups(ymm8, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm9) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm10) + vmovups(ymm10, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm11) + vmaskmovps(ymm11, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm12) + vmovups(ymm12, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm13) + vmaskmovps(ymm13, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm14) + vmovups(ymm14, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm15) + vmaskmovps(ymm15, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support */ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx, 0*32)) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) //Store only required elements + add(rdi, rcx) + + vmovups(ymm6, mem(rcx, 0*32)) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm8, mem(rcx, 0*32)) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm10, mem(rcx, 0*32)) + vmaskmovps(ymm11, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm12, mem(rcx, 0*32)) + vmaskmovps(ymm13, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm14, mem(rcx, 0*32)) + vmaskmovps(ymm15, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X15I) // iterate again if ii != 0. + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "ymm13", "ymm14", "ymm15", + "memory" + ) + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist. 
+ if ( m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter * ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x16_mask, + bli_sgemmsup_rv_zen_asm_2x16_mask, + bli_sgemmsup_rv_zen_asm_3x16_mask, + bli_sgemmsup_rv_zen_asm_4x16_mask, + bli_sgemmsup_rv_zen_asm_5x16_mask + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + + return; + } +} + +void bli_sgemmsup_rv_zen_asm_6x8m_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + // This kernel is called when n_left 7, 6, 5. This kernel operates 8 columns at time. 
+ // + // Sets up the mask for loading relevant remainder elements in load direction + // + // ______ymm0_______ + // | | | | | | | | | + // |0|1|2|3|4|5|6|7| ----> Source vector + // |_|_|_|_|_|_|_|_| + // + //______ymm3_______ + //| | | | | | | | | + //|x|x|x|x|x|x|x|x| ----> Mask vector[x can be -1/0] + //|_|_|_|_|_|_|_|_| + // + // For example when n_left = 6 + // ________ymm3__________ + // | | | | | | | | | + // |-1|-1|-1|-1|-1|-1|0|0| ----> Mask vector + // |__|__|__|__|__|__|_|_| + // + // _______ymm0______ + // | | | | | | | | | + // |0|1|2|3|4|5|0|0| ----> Destination vector + // |_|_|_|_|_|_|_|_| + // + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load mask values + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X7I) // LOOP OVER ii = [ m_iter ... 1 0 ] + + // skylake can execute 3 vxorps ipc with + // a latency of 1 cycle, while vzeroall + // has a latency of 12 cycles. 
+ vxorps(ymm4, ymm4, ymm4) + vxorps(ymm6, ymm6, ymm6) + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm14, ymm14, ymm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2,15*4)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + + prefetch(0, mem(rdx, 5*8)) + + vmaskmovps(mem(rbx, 0), ymm3, ymm0) //load required elements + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + + // ---------------------------------- iteration 1 + + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmaskmovps(mem(rbx, 0), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + + // ---------------------------------- iteration 2 + + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmaskmovps(mem(rbx, 0), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + + // ---------------------------------- iteration 3 + + prefetch(0, mem(rdx, rcx, 1, 5*8)) + 
lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; + + vmaskmovps(mem(rbx, 0), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // ee, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmaskmovps(mem(rbx, 0), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vbroadcastss(mem(rax, r15, 1), ymm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(ymm0, ymm2, ymm14) + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. 
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm1) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm14, ymm14) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm1) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vmaskmovps(mem(rcx, 0), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm4) + vmaskmovps(ymm4, ymm3, mem(rcx, 0)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm6) + vmaskmovps(ymm6, ymm3, mem(rcx, 0)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm8) + vmaskmovps(ymm8, ymm3, mem(rcx, 0)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm10) + vmaskmovps(ymm10, ymm3, mem(rcx, 0)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm12) + vmaskmovps(ymm12, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm14) + vmaskmovps(ymm14, ymm3, mem(rcx, 0)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(ymm4, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm6, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm8, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm10, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm12, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm14, ymm3, mem(rcx, 0)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X7I) // iterate again if ii != 0. + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm8", "ymm10", "ymm12", "ymm14", + "memory" + ) + + consider_edge_cases: + // Handle edge cases in the m dimension, if they exist. 
+ if (m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter * ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x8_mask, + bli_sgemmsup_rv_zen_asm_2x8_mask, + bli_sgemmsup_rv_zen_asm_3x8_mask, + bli_sgemmsup_rv_zen_asm_4x8_mask, + bli_sgemmsup_rv_zen_asm_5x8_mask + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + + return; + } +} + +void bli_sgemmsup_rv_zen_asm_6x4m_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + uint64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + // This kernel is called when n_left is 3 or 1. It operates on 4 columns at a time: the asm loop below computes m_iter strips of 6 rows, and the m_left leftover rows are handed to the 1x4..5x4 mask kernels at the end.
+ // + // Sets up the mask for loading relevant remainder elements in load direction + // + // __xmm0___ + // | | | | | + // |0|1|2|3| ----> Source vector + // |_|_|_|_| + // + // __xmm7___ + // | | | | | + // |x|x|x|x| ----> Mask vector[x can be -1/0] + // |_|_|_|_| + // + // For example when n_left = 3 + // ___xmm7_____ + // | | | | | + // |-1|-1|-1|0| ----> Mask vector + // |__|__|__|_| + // + // For example when n_left = 1 + // ___xmm7___ + // | | | | | + // |-1|0|0|0| ----> Mask vector + // |__|_|_|_| + // + // __xmm0___ + // | | | | | + // |0|1|2|3| ----> Destination vector + // |_|_|_|_| + // + const int32_t *mask_vec = mask[n0]; + + if ( m_iter == 0 ) goto consider_edge_cases; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), xmm7) // load mask values into xmm7 (used by every vmaskmovps below) + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + // During preamble and loops: + // r12 = rcx = c // r14 = rax = a + // read rbx from var(b) near beginning of loop + // r11 = m dim index ii + + mov(var(m_iter), r11) // ii = m_iter; + + label(.SLOOP6X4I) // LOOP OVER ii = [ m_iter ...
1 0 ] + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm10, xmm10, xmm10) + vxorps(xmm12, xmm12, xmm12) + vxorps(xmm14, xmm14, xmm14) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop.
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10,
rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + vbroadcastss(mem(rax, r15, 1), xmm3) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + vfmadd231ps(xmm0, xmm3, xmm14) + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c.
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + vmulps(xmm0, xmm8, xmm8) + vmulps(xmm0, xmm10, xmm10) + vmulps(xmm0, xmm12, xmm12) + vmulps(xmm0, xmm14, xmm14) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm6) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) + vmaskmovps(xmm8, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm10) + vmaskmovps(xmm10, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) + vmaskmovps(xmm12, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm14) + vmaskmovps(xmm14, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORED) + + /* TODO: Add column storage support; for now this path performs no store to C. */ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm8, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm10, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm12, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm14, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + lea(mem(r12, rdi, 4), r12) // + lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c + + mov(var(ps_a4), rax) // load ps_a4 + lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 + + dec(r11) // ii -= 1; + jne(.SLOOP6X4I) // iterate again if ii != 0. + + label(.SRETURN) + + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ) + + consider_edge_cases: + + // Handle edge cases in the m dimension, if they exist.
+ if ( m_left ) + { + const dim_t nr_cur = n0; + const dim_t i_edge = m0 - ( dim_t )m_left; + + float* restrict cij = c + i_edge*rs_c; + float* restrict ai = a + m_iter*ps_a; + float* restrict bj = b; + + sgemmsup_ker_ft ker_fps[6] = + { + NULL, + bli_sgemmsup_rv_zen_asm_1x4_mask, + bli_sgemmsup_rv_zen_asm_2x4_mask, + bli_sgemmsup_rv_zen_asm_3x4_mask, + bli_sgemmsup_rv_zen_asm_4x4_mask, + bli_sgemmsup_rv_zen_asm_5x4_mask + }; + + sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; + + ker_fp + ( + conja, conjb, m_left, nr_cur, k0, + alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, + beta, cij, rs_c0, cs_c0, data, cntx + ); + return; + } } diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c new file mode 100644 index 0000000000..2fa245ea3f --- /dev/null +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c @@ -0,0 +1,1810 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#include "blis.h" + +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + +static const int32_t mask[8][8] = { {0, 0, 0, 0, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, 0}, + }; + +void bli_sgemmsup_rv_zen_asm_5x16_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes.
+ uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) // load the (n0 % 8) column mask into ymm3; it gates every masked access at offset 1*32 + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + + + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c + lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c + lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, rdx for prefetching lines + // from next upanel of a.
+ + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2),
ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop.
+ + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + vbroadcastss(mem(rax, r8, 4), ymm2) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm1) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm11, ymm11) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm13, ymm13) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm1) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm4) + vmovups(ymm4, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm5) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm6) + vmovups(ymm6, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm7) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm8) + vmovups(ymm8, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm9) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm10) + vmovups(ymm10, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm11) + vmaskmovps(ymm11, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm1, ymm12) + vmovups(ymm12, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm1, ymm13) + vmaskmovps(ymm13, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support. Until then this path falls through to .SBETAZERO, whose same rs_c test branches to .SCOTORBZ, so no store to C is performed. */ + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx, 0*32)) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm6, mem(rcx, 0*32)) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm8, mem(rcx, 0*32)) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm10, mem(rcx, 0*32)) + vmaskmovps(ymm11, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm12, mem(rcx, 0*32)) + vmaskmovps(ymm13, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end.
+ + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec), + [n0] "m" (n0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r13", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_4x16_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers.
+ mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) // load the (n0 % 8) column mask into ymm3; it gates every masked access at offset 1*32 + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c + lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c + lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, rdx for prefetching lines + // from next upanel of a.
+ + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + add(r9, rax) // a += cs_a; + + + // ---------------------------------- iteration 1 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + add(r9, rax) // a +=
cs_a; + + + // ---------------------------------- iteration 3 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vbroadcastss(mem(rax, r13, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm10) + vfmadd231ps(ymm1, ymm2, ymm11) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c.
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm12) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm11, ymm11) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm12) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm4) + vmovups(ymm4, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm5) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm6) + vmovups(ymm6, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm7) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm8) + vmovups(ymm8, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm9) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm10) + vmovups(ymm10, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm11) + vmaskmovps(ymm11, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end.
+ + label(.SCOTORED) + + /* TODO: Add column storage support. Until then this path falls through to .SBETAZERO, whose same rs_c test branches to .SCOTORBZ, so no store to C is performed. */ + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx, 0*32)) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm6, mem(rcx, 0*32)) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm8, mem(rcx, 0*32)) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm10, mem(rcx, 0*32)) + vmaskmovps(ymm11, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec), + [n0] "m" (n0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r13", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_3x16_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions.
+ uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load mask values + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c + lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c + lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. 
+ je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + 
vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) // prefetch line from next upanel of a (rdx starts at a + ps_a4) + add(r9, rdx) // rdx += cs_a; + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm12) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm9, ymm9) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm12) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm4) + vmovups(ymm4, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm5) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm6) + vmovups(ymm6, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm7) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm8) + vmovups(ymm8, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm9) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx, 0*32)) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm6, mem(rcx, 0*32)) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + add(rdi, rcx) + + vmovups(ymm8, mem(rcx, 0*32)) + vmaskmovps(ymm9, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm12", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_2x16_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. 
+ mov(var(mask_vec), rdx) // load address of the mask row selected by n0 % 8 + vmovdqu(mem(rdx), ymm3) //load mask values + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c; + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) //
prefetch c + 6*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c + lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c + lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, 
ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) // prefetch line from next upanel of a (rdx starts at a + ps_a4) + add(r9, rdx) // rdx += cs_a; + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + vfmadd231ps(ymm1, ymm2, ymm7) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c.
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm14) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm14) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx, 0*32), ymm14, ymm4) + vmovups(ymm4, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm14, ymm5) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vfmadd231ps(mem(rcx, 0*32), ymm14, ymm6) + vmovups(ymm6, mem(rcx, 0*32)) + + vmaskmovps(mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm14, ymm7) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx, 0*32)) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + add(rdi, rcx) + + vmovups(ymm6, mem(rcx, 0*32)) + vmaskmovps(ymm7, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec), + [n0] "m" (n0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm14", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_1x16_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. 
+ mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load mask values + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c + lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c + lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) // prefetch line from next upanel of a (rdx starts at a + ps_a4) + add(r9, rdx) // rdx += cs_a; + + vmovups(mem(rbx, 0*32), ymm0) + vmaskmovps(mem(rbx, 1*32), ymm3, ymm1) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c.
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm12) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm12) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vfmadd231ps(mem(rcx, 0*32), ymm12, ymm4) + vmovups(ymm4, mem(rcx, 0*32)) + + vmaskmovps( mem(rcx, 1*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm12, ymm5) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmovups(ymm4, mem(rcx, 0*32)) + vmaskmovps(ymm5, ymm3, mem(rcx, 1*32)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec), + [n0] "m" (n0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm12", + "memory" + ) +} diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c new file mode 100644 index 0000000000..6430c840e5 --- /dev/null +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c @@ -0,0 +1,1613 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#include "blis.h" + +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + +static const int32_t mask[8][8] = { {0, 0, 0, 0, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, 0}, + }; + +void bli_sgemmsup_rv_zen_asm_5x4_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. 
+ uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; /* passed as an asm operand below, but the asm body never reads it */ + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + const int32_t *mask_vec = mask[n0]; /* row n0 of mask[][]: first n0 entries are -1, the rest 0; n0 is not range-checked here */ + + // ------------------------------------------------------------------------- + begin_asm() /* 5-row tile: xmm4/6/8/10/12 accumulate rows 0-4 of a*b; then c := beta*c + alpha*(a*b) via masked row stores */ + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), xmm7) //load the first 4 mask elements (one per float lane) + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a (not used below) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm10, xmm10, xmm10) + vxorps(xmm12, xmm12, xmm12) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c; + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*cs_c; + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop.
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) +
vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + vbroadcastss(mem(rax, r8, 4), xmm2) + add(r9, rax) // a += cs_a; + vfmadd231ps(xmm0, xmm2, xmm12) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + vmulps(xmm0, xmm8, xmm8) + vmulps(xmm0, xmm10, xmm10) + vmulps(xmm0, xmm12, xmm12) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+ jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vmaskmovps(mem(rcx), xmm7, xmm0) // masked load of one row of c + vfmadd231ps(xmm0, xmm3, xmm4) // accumulator += beta * c row + vmaskmovps(xmm4, xmm7, mem(rcx)) // masked store back to c + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm6) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) + vmaskmovps(xmm8, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm10) + vmaskmovps(xmm10, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) + vmaskmovps(xmm12, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORED) + + /* TODO: Add column storage support; until then this path writes nothing to c */ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm8, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm10, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm12, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end.
+ + label(.SCOLSTORBZ) + + /* TODO: Add column storage support; until then this path writes nothing to c */ + + label(.SDONE) + + label(.SRETURN) + + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_4x4_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes.
+ uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + const int32_t *mask_vec = mask[n0]; /* row n0 of mask[][]: first n0 entries are -1, the rest 0; n0 is not range-checked here */ + + // ------------------------------------------------------------------------- + begin_asm() /* 4-row tile: xmm4/6/8/10 accumulate rows 0-3 of a*b; then c := beta*c + alpha*(a*b) via masked row stores */ + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), xmm7) //load the first 4 mask elements (one per float lane) + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a (not used below) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm10, xmm10, xmm10) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c; + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*cs_c; + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop.
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0.
+ + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vbroadcastss(mem(rax, r13, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm8) + vfmadd231ps(xmm0, xmm3, xmm10) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + vmulps(xmm0, xmm8, xmm8) + vmulps(xmm0, xmm10, xmm10) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+ jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vmaskmovps(mem(rcx), xmm7, xmm0) // masked load of one row of c + vfmadd231ps(xmm0, xmm3, xmm4) // accumulator += beta * c row + vmaskmovps(xmm4, xmm7, mem(rcx)) // masked store back to c + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm6) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) + vmaskmovps(xmm8, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm10) + vmaskmovps(xmm10, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORED) + + /* TODO: Add column storage support; until then this path writes nothing to c */ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm8, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm10, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end.
+ + label(.SCOLSTORBZ) + + /* TODO: Add column storage support; until then this path writes nothing to c */ + + label(.SDONE) + + label(.SRETURN) + + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_3x4_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes.
+ uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + const int32_t *mask_vec = mask[n0]; /* row n0 of mask[][]: first n0 entries are -1, the rest 0; n0 is not range-checked here */ + + // ------------------------------------------------------------------------- + begin_asm() /* 3-row tile: xmm4/6/8 accumulate rows 0-2 of a*b; then c := beta*c + alpha*(a*b) via masked row stores */ + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), xmm7) //load the first 4 mask elements (one per float lane) + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a (not used below) + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a (not used below) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds of allocated mem + // (the likely result: a segmentation fault). + // For the same reason this 3-row kernel loads only + // rows 0-2 of a; a + 3*rs_a lies outside the tile + // and must not be read. + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm8, xmm8, xmm8) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c; + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*cs_c; + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop.
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vfmadd231ps(xmm0, xmm2, xmm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vfmadd231ps(xmm0, xmm2, xmm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vfmadd231ps(xmm0, xmm2, xmm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vfmadd231ps(xmm0, xmm2, xmm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND.
+ je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + vbroadcastss(mem(rax, r8, 2), xmm2) + vfmadd231ps(xmm0, xmm2, xmm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + vmulps(xmm0, xmm8, xmm8) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1. + jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vmaskmovps(mem(rcx), xmm7, xmm0) // masked load of one row of c + vfmadd231ps(xmm0, xmm3, xmm4) // accumulator += beta * c row + vmaskmovps(xmm4, xmm7, mem(rcx)) // masked store back to c + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm6) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) + vmaskmovps(xmm8, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORED) + + /* TODO: Add column storage support; until then this path writes nothing to c */ + + jmp(.SDONE) // jump to end.
+ + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4, i.e. rs_c == 1. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm6, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm8, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORBZ) + + /* TODO: Add column storage support; until then this path writes nothing to c */ + + label(.SDONE) + + label(.SRETURN) + + + end_asm( + : // output operands (none) + : // input operands + [m_iter] "m" (m_iter), + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_2x4_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions.
+ uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t m_iter = m0 / 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + const int32_t *mask_vec = mask[n0]; + + // ------------------------------------------------------------------------- + begin_asm() + + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), xmm7) //load mask elements + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + vxorps(xmm1, xmm1, xmm1) + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm6, xmm6, xmm6) + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(r12, rsi, 2), rdx) // + lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; + lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines + lea(mem(rdx, r8, 2), rdx) // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + prefetch(0, mem(rdx, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + prefetch(0, mem(rdx, r9, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + prefetch(0, mem(rdx, r9, 2, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + prefetch(0, mem(rdx, rcx, 1, 5*8)) + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + vmaskmovps(mem(rbx), xmm7, xmm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), xmm2) + vbroadcastss(mem(rax, r8, 1), xmm3) + vfmadd231ps(xmm0, xmm2, xmm4) + vfmadd231ps(xmm0, xmm3, xmm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. 
+ + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), xmm0) // load alpha and duplicate + vbroadcastss(mem(rbx), xmm3) // load beta and duplicate + + vmulps(xmm0, xmm4, xmm4) // scale by alpha + vmulps(xmm0, xmm6, xmm6) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLSTORED) // jump to column storage case + + + label(.SROWSTORED) + + vmaskmovps(mem(rcx), xmm7, xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + + vmaskmovps(mem(rcx), xmm7, xmm1) + vfmadd231ps(xmm1, xmm3, xmm6) + vmaskmovps(xmm6, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORED) + + /* TODO: Add column storage support*/ + + jmp(.SDONE) // jump to end. + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. + jz(.SCOLSTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(xmm4, xmm7, mem(rcx)) + add(rdi, rcx) + vmaskmovps(xmm6, xmm7, mem(rcx)) + + jmp(.SDONE) // jump to end. 
+
+    label(.SCOLSTORBZ)
+
+    /* TODO: Add column storage support*/
+
+    label(.SDONE)
+
+    label(.SRETURN)
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+    [m_iter] "m" (m_iter),
+    [k_iter] "m" (k_iter),
+    [k_left] "m" (k_left),
+    [a] "m" (a),
+    [rs_a] "m" (rs_a),
+    [cs_a] "m" (cs_a),
+    [ps_a4] "m" (ps_a4),
+    [b] "m" (b),
+    [rs_b] "m" (rs_b),
+    [cs_b] "m" (cs_b),
+    [alpha] "m" (alpha),
+    [beta] "m" (beta),
+    [c] "m" (c),
+    [rs_c] "m" (rs_c),
+    [cs_c] "m" (cs_c),
+    [mask_vec] "m" (mask_vec)
+    : // register clobber list
+    "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+    "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+    "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11",
+    "xmm12", "xmm13", "xmm14", "xmm15",
+    "memory"
+    )
+}
+// Masked single-precision kernel for one row of C: accumulates a*b over k0 steps using 128-bit masked loads of b, scales by alpha, then writes beta*c + result (or only the result when beta == 0) through the same mask. Only the rs_c != 1 store path is implemented; the column-stored branches are TODOs that store nothing.
+void bli_sgemmsup_rv_zen_asm_1x4_mask
+     (
+       conj_t conja,
+       conj_t conjb,
+       dim_t m0,
+       dim_t n0,
+       dim_t k0,
+       float* restrict alpha,
+       float* restrict a, inc_t rs_a0, inc_t cs_a0,
+       float* restrict b, inc_t rs_b0, inc_t cs_b0,
+       float* restrict beta,
+       float* restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t* restrict cntx
+     )
+{
+    //void* a_next = bli_auxinfo_next_a( data );
+    //void* b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0 / 4; // number of 4x-unrolled main-loop trips
+    uint64_t k_left = k0 % 4; // remaining k steps, handled by the edge loop
+
+    uint64_t m_iter = m0 / 6; // listed as an asm operand, but no var(m_iter) appears in the asm below
+
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    // Query the panel stride of A and convert it to units of bytes.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a4 = ps_a * sizeof( float );
+
+    const int32_t *mask_vec = mask[n0]; // NOTE(review): `mask` is defined outside this part of the file and n0 is used unreduced as the row index -- confirm callers keep it in range
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(mask_vec), rdx)
+    vmovdqu(mem(rdx), xmm7) // load mask elements; xmm7 is the lane mask for every vmaskmovps below
+
+    mov(var(a), r14) // load address of a.
+    mov(var(rs_a), r8) // load rs_a
+    mov(var(cs_a), r9) // load cs_a
+    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
+    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)
+
+    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a (not read again in this kernel)
+    lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a (not read again in this kernel)
+
+    mov(var(rs_b), r10) // load rs_b
+    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)
+
+    // NOTE: We cannot pre-load elements of a or b
+    // because it could eventually, in the last
+    // unrolled iter or the cleanup loop, result
+    // in reading beyond the bounds allocated mem
+    // (the likely result: a segmentation fault).
+
+    mov(var(c), r12) // load address of c
+    mov(var(rs_c), rdi) // load rs_c
+    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
+
+    vxorps(xmm1, xmm1, xmm1) // zero xmm1
+    vxorps(xmm4, xmm4, xmm4) // zero xmm4, the accumulator for the single row of c
+
+    mov(var(b), rbx) // load address of b.
+    mov(r14, rax) // reset rax to current upanel of a.
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOLPFETCH) // jump to column storage case
+    label(.SROWPFETCH) // row-stored prefetching on c
+
+    lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
+    lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
+    prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c
+    prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c
+    prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c
+    prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c
+    prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c
+    prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c
+
+    jmp(.SPOSTPFETCH) // jump to end of prefetching c
+    label(.SCOLPFETCH) // column-stored prefetching c
+
+    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
+    lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
+    lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*cs_c;
+    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
+    prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c
+    prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c
+    prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c
+    prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c
+    prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c
+    prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c
+
+    label(.SPOSTPFETCH) // done prefetching c
+
+    lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a;
+    lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines
+    lea(mem(rdx, r8, 2), rdx) // from next upanel of a (rdx = a + 6*rs_a).
+
+    mov(var(k_iter), rsi) // i = k_iter;
+    test(rsi, rsi) // check i via logical AND.
+    je(.SCONSIDKLEFT) // if i == 0, jump to code that
+    // contains the k_left loop.
+    label(.SLOOPKITER) // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rdx, 5*8))
+
+    vmaskmovps(mem(rbx), xmm7, xmm0) // masked load of the current row of b
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), xmm2) // broadcast the current element of a
+    vfmadd231ps(xmm0, xmm2, xmm4) // xmm4 += a * b
+
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(rdx, r9, 1, 5*8))
+
+    vmaskmovps(mem(rbx), xmm7, xmm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), xmm2)
+    vfmadd231ps(xmm0, xmm2, xmm4)
+
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rdx, r9, 2, 5*8))
+
+    vmaskmovps(mem(rbx), xmm7, xmm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), xmm2)
+    vfmadd231ps(xmm0, xmm2, xmm4)
+
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(rdx, rcx, 1, 5*8))
+
+    vmaskmovps(mem(rbx), xmm7, xmm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), xmm2)
+    vfmadd231ps(xmm0, xmm2, xmm4)
+
+    add(r9, rax) // a += cs_a;
+
+    dec(rsi) // i -= 1;
+    jne(.SLOOPKITER) // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT)
+
+    mov(var(k_left), rsi) // i = k_left;
+    test(rsi, rsi) // check i via logical AND.
+    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
+    // else, we prepare to enter k_left loop.
+
+    label(.SLOOPKLEFT) // EDGE LOOP
+
+    vmaskmovps(mem(rbx), xmm7, xmm0) // masked load of the current row of b
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), xmm2) // broadcast the current element of a
+    vfmadd231ps(xmm0, xmm2, xmm4) // xmm4 += a * b
+
+    add(r9, rax) // a += cs_a;
+
+    dec(rsi) // i -= 1;
+    jne(.SLOOPKLEFT) // iterate again if i != 0.
+
+
+    label(.SPOSTACCUM)
+
+    mov(r12, rcx) // reset rcx to current utile of c.
+    mov(var(alpha), rax) // load address of alpha
+    mov(var(beta), rbx) // load address of beta
+    vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3) // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4) // scale by alpha
+
+    mov(var(cs_c), rsi) // load cs_c
+    lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
+
+    lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;
+    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;
+
+    // now avoid loading C if beta == 0
+
+    vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero.
+    vucomiss(xmm0, xmm3) // set ZF if beta == 0.
+    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
+
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOLSTORED) // jump to column storage case
+
+
+    label(.SROWSTORED)
+
+    vmaskmovps(mem(rcx), xmm7, xmm0) // masked load of the row of c
+    vfmadd231ps(xmm0, xmm3, xmm4) // xmm4 += beta * c
+    vmaskmovps(xmm4, xmm7, mem(rcx)) // masked store of the result back to c
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SCOLSTORED)
+
+    /* TODO: Add column storage support; for now this path stores nothing to C. */
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SBETAZERO)
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOLSTORBZ) // jump to column storage case
+
+    label(.SROWSTORBZ)
+
+    vmaskmovps(xmm4, xmm7, mem(rcx)) // masked store of alpha*a*b; c is never read on this path
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SCOLSTORBZ)
+
+    /* TODO: Add column storage support; for now this path stores nothing to C. */
+
+    label(.SDONE)
+
+    label(.SRETURN)
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+    [m_iter] "m" (m_iter),
+    [k_iter] "m" (k_iter),
+    [k_left] "m" (k_left),
+    [a] "m" (a),
+    [rs_a] "m" (rs_a),
+    [cs_a] "m" (cs_a),
+    [ps_a4] "m" (ps_a4),
+    [b] "m" (b),
+    [rs_b] "m" (rs_b),
+    [cs_b] "m" (cs_b),
+    [alpha] "m" (alpha),
+    [beta] "m" (beta),
+    [c] "m" (c),
+    [rs_c] "m" (rs_c),
+    [cs_c] "m" (cs_c),
+    [mask_vec] "m" (mask_vec)
+    : // register clobber list
+    "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+    "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+    "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11",
+    "xmm12", "xmm13", "xmm14", "xmm15",
+    "memory"
+    )
+}
\ No newline at end of file
diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c
new file mode 100644
index 0000000000..20d6f45075
--- /dev/null
+++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c
@@ -0,0 +1,1587 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   - Neither the name(s) of the copyright holder(s) nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+// Lane masks for vmaskmovps: row i enables the first i of 8 float lanes (-1 = lane on, 0 = lane off); row 0 disables every lane.
+static const int32_t mask[8][8] = { {0, 0, 0, 0, 0, 0, 0, 0},
+    {-1, 0, 0, 0, 0, 0, 0, 0},
+    {-1, -1, 0, 0, 0, 0, 0, 0},
+    {-1, -1, -1, 0, 0, 0, 0, 0},
+    {-1, -1, -1, -1, 0, 0, 0, 0},
+    {-1, -1, -1, -1, -1, 0, 0, 0},
+    {-1, -1, -1, -1, -1, -1, 0, 0},
+    {-1, -1, -1, -1, -1, -1, -1, 0},
+    };
+// Masked single-precision kernel for a 5-row tile of C: accumulates a*b over k0 steps into ymm4/6/8/10/12 using 256-bit masked loads of b, scales by alpha, then writes beta*c + result (or only the result when beta == 0) row by row through the same mask. Only the rs_c != 1 store path is implemented; when rs_c == 1 nothing is stored.
+void bli_sgemmsup_rv_zen_asm_5x8_mask
+     (
+       conj_t conja,
+       conj_t conjb,
+       dim_t m0,
+       dim_t n0,
+       dim_t k0,
+       float* restrict alpha,
+       float* restrict a, inc_t rs_a0, inc_t cs_a0,
+       float* restrict b, inc_t rs_b0, inc_t cs_b0,
+       float* restrict beta,
+       float* restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t* restrict cntx
+     )
+{
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0 / 4; // number of 4x-unrolled main-loop trips
+    uint64_t k_left = k0 % 4; // remaining k steps, handled by the edge loop
+
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    // Query the panel stride of A and convert it to units of bytes.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a4 = ps_a * sizeof( float );
+
+    uint64_t n_mod8 = n0 % 8 ; // NOTE(review): n0 % 8 == 0 selects the all-zero mask row, so nothing would be loaded or stored -- presumably callers never pass a multiple of 8; confirm
+    const int32_t *mask_vec = mask[n_mod8];
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    vzeroall() // zero all xmm/ymm registers.
+    mov(var(mask_vec), rdx)
+    vmovdqu(mem(rdx), ymm3) // load mask elements; ymm3 is the lane mask for every vmaskmovps below
+
+    mov(var(a), r14) // load address of a.
+    mov(var(rs_a), r8) // load rs_a
+    mov(var(cs_a), r9) // load cs_a
+    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
+
+
+    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)
+
+    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
+
+    mov(var(rs_b), r10) // load rs_b
+    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)
+
+    // NOTE: We cannot pre-load elements of a or b
+    // because it could eventually, in the last
+    // unrolled iter or the cleanup loop, result
+    // in reading beyond the bounds allocated mem
+    // (the likely result: a segmentation fault).
+
+    mov(var(c), r12) // load address of c
+    mov(var(rs_c), rdi) // load rs_c
+    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
+
+
+    // During preamble and loops:
+    // r12 = rcx = c
+    // r14 = rax = a
+    // read rbx from var(b) near beginning of loop
+
+    mov(var(b), rbx) // load address of b.
+    mov(r14, rax) // reset rax to current upanel of a.
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOLPFETCH) // jump to column storage case
+    label(.SROWPFETCH) // row-stored prefetching on c
+
+    lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
+    lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
+    prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c
+    prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c
+    prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c
+    prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c
+    prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c
+
+    jmp(.SPOSTPFETCH) // jump to end of prefetching c
+    label(.SCOLPFETCH) // column-stored prefetching c
+
+    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
+    lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
+    lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c;
+    prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c
+    prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c
+    prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c
+    prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c
+    prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c
+    lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c;
+    prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c
+    prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c
+    label(.SPOSTPFETCH) // done prefetching c
+
+    mov(var(ps_a4), rdx) // load ps_a4
+    lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4
+    lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a;
+    // use rcx, rdx for prefetching lines
+    // from next upanel of a.
+
+    mov(var(k_iter), rsi) // i = k_iter;
+    test(rsi, rsi) // check i via logical AND.
+    je(.SCONSIDKLEFT) // if i == 0, jump to code that
+    // contains the k_left loop.
+
+    label(.SLOOPKITER) // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    vmaskmovps(mem(rbx, 0), ymm3, ymm0) // masked load of the current row of b
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2) // broadcast a at row 0
+    vfmadd231ps(ymm0, ymm2, ymm4) // ymm4 += a(row 0) * b
+    vbroadcastss(mem(rax, r8, 1), ymm2) // broadcast a at row 1
+    vfmadd231ps(ymm0, ymm2, ymm6) // ymm6 += a(row 1) * b
+
+    vbroadcastss(mem(rax, r8, 2), ymm2) // broadcast a at row 2
+    vfmadd231ps(ymm0, ymm2, ymm8) // ymm8 += a(row 2) * b
+    vbroadcastss(mem(rax, r13, 1), ymm2) // broadcast a at row 3
+    vfmadd231ps(ymm0, ymm2, ymm10) // ymm10 += a(row 3) * b
+
+    vbroadcastss(mem(rax, r8, 4), ymm2) // broadcast a at row 4
+    vfmadd231ps(ymm0, ymm2, ymm12) // ymm12 += a(row 4) * b
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 1
+    vmaskmovps(mem(rbx, 0), ymm3, ymm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    vbroadcastss(mem(rax, r8, 4), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm12)
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 2
+    vmaskmovps(mem(rbx, 0), ymm3, ymm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    vbroadcastss(mem(rax, r8, 4), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm12)
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 3
+    vmaskmovps(mem(rbx, 0), ymm3, ymm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    vbroadcastss(mem(rax, r8, 4), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm12)
+    add(r9, rax) // a += cs_a;
+
+    dec(rsi) // i -= 1;
+    jne(.SLOOPKITER) // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT)
+
+    mov(var(k_left), rsi) // i = k_left;
+    test(rsi, rsi) // check i via logical AND.
+    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
+    // else, we prepare to enter k_left loop.
+
+    label(.SLOOPKLEFT) // EDGE LOOP
+
+    prefetch(0, mem(rdx, 5*8)) // prefetch from rdx, which started at a + ps_a4
+    add(r9, rdx) // advance the prefetch pointer by cs_a
+
+    vmaskmovps(mem(rbx, 0), ymm3, ymm0) // masked load of the current row of b
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    vbroadcastss(mem(rax, r8, 4), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm12)
+    add(r9, rax) // a += cs_a;
+
+    dec(rsi) // i -= 1;
+    jne(.SLOOPKLEFT) // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+
+    mov(r12, rcx) // reset rcx to current utile of c.
+    mov(var(alpha), rax) // load address of alpha
+    mov(var(beta), rbx) // load address of beta
+    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm7) // load beta and duplicate
+
+    vmulps(ymm0, ymm4, ymm4) // scale by alpha
+    vmulps(ymm0, ymm6, ymm6)
+    vmulps(ymm0, ymm8, ymm8)
+    vmulps(ymm0, ymm10, ymm10)
+    vmulps(ymm0, ymm12, ymm12)
+
+    mov(var(cs_c), rsi) // load cs_c
+    lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
+
+    lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;
+
+    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;
+    lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c;
+
+    // now avoid loading C if beta == 0
+
+    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
+    vucomiss(xmm0, xmm7) // set ZF if beta == 0.
+    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOTORED) // jump to column storage case
+
+    label(.SROWSTORED)
+
+    vmaskmovps(mem(rcx, 0), ymm3, ymm2) // masked load of row 0 of c
+    vfmadd231ps(ymm2, ymm7, ymm4) // ymm4 += beta * c
+    vmaskmovps(ymm4, ymm3, mem(rcx, 0)) // masked store of row 0
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm6)
+    vmaskmovps(ymm6, ymm3, mem(rcx, 0))
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm8)
+    vmaskmovps(ymm8, ymm3, mem(rcx, 0))
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm10)
+    vmaskmovps(ymm10, ymm3, mem(rcx, 0))
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm12)
+    vmaskmovps(ymm12, ymm3, mem(rcx, 0))
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SCOTORED)
+
+    /* TODO: Add column storage support. No jmp follows this label, so control falls through into .SBETAZERO, where rdi == 4 sends it to .SCOTORBZ; nothing is stored to C. */
+
+    label(.SBETAZERO)
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOTORBZ) // jump to column storage case
+
+    label(.SROWSTORBZ)
+
+    vmaskmovps(ymm4, ymm3, mem(rcx, 0)) // masked store of row 0; c is never read on this path
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm6, ymm3, mem(rcx, 0))
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm8, ymm3, mem(rcx, 0))
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm10, ymm3, mem(rcx, 0))
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm12, ymm3, mem(rcx, 0))
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SCOTORBZ)
+
+    /* TODO: Add column storage support; for now this path stores nothing to C. */
+
+    label(.SDONE)
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+    [k_iter] "m" (k_iter),
+    [k_left] "m" (k_left),
+    [a] "m" (a),
+    [rs_a] "m" (rs_a),
+    [cs_a] "m" (cs_a),
+    [ps_a4] "m" (ps_a4),
+    [b] "m" (b),
+    [rs_b] "m" (rs_b),
+    [cs_b] "m" (cs_b),
+    [alpha] "m" (alpha),
+    [beta] "m" (beta),
+    [c] "m" (c),
+    [rs_c] "m" (rs_c),
+    [cs_c] "m" (cs_c),
+    [mask_vec] "m" (mask_vec),
+    [n0] "m" (n0) // not referenced by the asm above
+    : // register clobber list
+    "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+    "r8", "r9", "r10", "r12", "r13", "r14",
+    "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11",
+    "xmm12", "xmm13", "xmm14", "xmm15",
+    "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+    "ymm7", "ymm8", "ymm10", "ymm12",
+    "memory"
+    )
+}
+// Masked single-precision kernel for a 4-row tile of C: same structure as the 5x8 kernel above, accumulating into ymm4/6/8/10. Only the rs_c != 1 store path is implemented; when rs_c == 1 nothing is stored.
+void bli_sgemmsup_rv_zen_asm_4x8_mask
+     (
+       conj_t conja,
+       conj_t conjb,
+       dim_t m0,
+       dim_t n0,
+       dim_t k0,
+       float* restrict alpha,
+       float* restrict a, inc_t rs_a0, inc_t cs_a0,
+       float* restrict b, inc_t rs_b0, inc_t cs_b0,
+       float* restrict beta,
+       float* restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t* restrict cntx
+     )
+{
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0 / 4; // number of 4x-unrolled main-loop trips
+    uint64_t k_left = k0 % 4; // remaining k steps, handled by the edge loop
+
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    // Query the panel stride of A and convert it to units of bytes.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a4 = ps_a * sizeof( float );
+
+    uint64_t n_mod8 = n0 % 8 ; // NOTE(review): n0 % 8 == 0 selects the all-zero mask row, so nothing would be loaded or stored -- presumably callers never pass a multiple of 8; confirm
+    const int32_t *mask_vec = mask[n_mod8];
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    vzeroall() // zero all xmm/ymm registers.
+    mov(var(mask_vec), rdx)
+    vmovdqu(mem(rdx), ymm3) // load mask elements; ymm3 is the lane mask for every vmaskmovps below
+
+    mov(var(a), r14) // load address of a.
+    mov(var(rs_a), r8) // load rs_a
+    mov(var(cs_a), r9) // load cs_a
+    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
+
+
+    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)
+
+    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
+
+    mov(var(rs_b), r10) // load rs_b
+    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)
+
+    // NOTE: We cannot pre-load elements of a or b
+    // because it could eventually, in the last
+    // unrolled iter or the cleanup loop, result
+    // in reading beyond the bounds allocated mem
+    // (the likely result: a segmentation fault).
+
+    mov(var(c), r12) // load address of c
+    mov(var(rs_c), rdi) // load rs_c
+    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
+
+
+    // During preamble and loops:
+    // r12 = rcx = c
+    // r14 = rax = a
+    // read rbx from var(b) near beginning of loop
+
+    mov(var(b), rbx) // load address of b.
+    mov(r14, rax) // reset rax to current upanel of a.
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOLPFETCH) // jump to column storage case
+    label(.SROWPFETCH) // row-stored prefetching on c
+
+    lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
+    lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
+    prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c
+    prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c
+    prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c
+    prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c
+
+    jmp(.SPOSTPFETCH) // jump to end of prefetching c
+    label(.SCOLPFETCH) // column-stored prefetching c
+
+    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
+    lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
+    lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c;
+    prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c
+    prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c
+    prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c
+    prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c
+    prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c
+    lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c;
+    prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c
+    prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c
+
+    label(.SPOSTPFETCH) // done prefetching c
+
+    mov(var(ps_a4), rdx) // load ps_a4
+    lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4
+    lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a;
+    // use rcx, rdx for prefetching lines
+    // from next upanel of a.
+
+    mov(var(k_iter), rsi) // i = k_iter;
+    test(rsi, rsi) // check i via logical AND.
+    je(.SCONSIDKLEFT) // if i == 0, jump to code that
+    // contains the k_left loop.
+
+    label(.SLOOPKITER) // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) // masked load of the current row of b
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2) // broadcast a at row 0
+    vfmadd231ps(ymm0, ymm2, ymm4) // ymm4 += a(row 0) * b
+    vbroadcastss(mem(rax, r8, 1), ymm2) // broadcast a at row 1
+    vfmadd231ps(ymm0, ymm2, ymm6) // ymm6 += a(row 1) * b
+
+    vbroadcastss(mem(rax, r8, 2), ymm2) // broadcast a at row 2
+    vfmadd231ps(ymm0, ymm2, ymm8) // ymm8 += a(row 2) * b
+    vbroadcastss(mem(rax, r13, 1), ymm2) // broadcast a at row 3
+    vfmadd231ps(ymm0, ymm2, ymm10) // ymm10 += a(row 3) * b
+
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 1
+    vmaskmovps(mem(rbx, 0*32), ymm3, ymm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 2
+    vmaskmovps(mem(rbx, 0*32), ymm3, ymm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    add(r9, rax) // a += cs_a;
+
+    // ---------------------------------- iteration 3
+    vmaskmovps(mem(rbx, 0*32), ymm3, ymm0)
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    add(r9, rax) // a += cs_a;
+
+    dec(rsi) // i -= 1;
+    jne(.SLOOPKITER) // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT)
+
+    mov(var(k_left), rsi) // i = k_left;
+    test(rsi, rsi) // check i via logical AND.
+    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
+    // else, we prepare to enter k_left loop.
+
+    label(.SLOOPKLEFT) // EDGE LOOP
+
+    prefetch(0, mem(rdx, 5*8)) // prefetch from rdx, which started at a + ps_a4
+    add(r9, rdx) // advance the prefetch pointer by cs_a
+
+    vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) // masked load of the current row of b
+    add(r10, rbx) // b += rs_b;
+
+    vbroadcastss(mem(rax ), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm4)
+    vbroadcastss(mem(rax, r8, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm6)
+
+    vbroadcastss(mem(rax, r8, 2), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm8)
+    vbroadcastss(mem(rax, r13, 1), ymm2)
+    vfmadd231ps(ymm0, ymm2, ymm10)
+
+    add(r9, rax) // a += cs_a;
+
+    dec(rsi) // i -= 1;
+    jne(.SLOOPKLEFT) // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+
+    mov(r12, rcx) // reset rcx to current utile of c.
+    mov(var(alpha), rax) // load address of alpha
+    mov(var(beta), rbx) // load address of beta
+    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm7) // load beta and duplicate
+
+    vmulps(ymm0, ymm4, ymm4) // scale by alpha
+    vmulps(ymm0, ymm6, ymm6)
+    vmulps(ymm0, ymm8, ymm8)
+    vmulps(ymm0, ymm10, ymm10)
+
+    mov(var(cs_c), rsi) // load cs_c
+    lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
+
+    lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;
+
+    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;
+    lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c;
+
+    // now avoid loading C if beta == 0
+
+    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
+    vucomiss(xmm0, xmm7) // set ZF if beta == 0.
+    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOTORED) // jump to column storage case
+
+    label(.SROWSTORED)
+
+    vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) // masked load of row 0 of c
+    vfmadd231ps(ymm2, ymm7, ymm4) // ymm4 += beta * c
+    vmaskmovps(ymm4, ymm3, mem(rcx, 0*32)) // masked store of row 0
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0*32), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm6)
+    vmaskmovps(ymm6, ymm3, mem(rcx, 0*32))
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0*32), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm8)
+    vmaskmovps(ymm8, ymm3, mem(rcx, 0*32))
+
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(mem(rcx, 0*32), ymm3, ymm2)
+    vfmadd231ps(ymm2, ymm7, ymm10)
+    vmaskmovps(ymm10, ymm3, mem(rcx, 0*32))
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SCOTORED)
+
+    /* TODO: Add column storage support. No jmp follows this label, so control falls through into .SBETAZERO, where rdi == 4 sends it to .SCOTORBZ; nothing is stored to C. */
+
+    label(.SBETAZERO)
+
+    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+    jz(.SCOTORBZ) // jump to column storage case
+
+    label(.SROWSTORBZ)
+
+    vmaskmovps(ymm4, ymm3, mem(rcx, 0)) // masked store of row 0; c is never read on this path
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm6, ymm3, mem(rcx, 0))
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm8, ymm3, mem(rcx, 0))
+    add(rdi, rcx) // c += rs_c;
+
+    vmaskmovps(ymm10, ymm3, mem(rcx, 0))
+
+    jmp(.SDONE) // jump to end.
+
+    label(.SCOTORBZ)
+
+    /* TODO: Add column storage support; for now this path stores nothing to C. */
+
+    label(.SDONE)
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+    [k_iter] "m" (k_iter),
+    [k_left] "m" (k_left),
+    [a] "m" (a),
+    [rs_a] "m" (rs_a),
+    [cs_a] "m" (cs_a),
+    [ps_a4] "m" (ps_a4),
+    [b] "m" (b),
+    [rs_b] "m" (rs_b),
+    [cs_b] "m" (cs_b),
+    [alpha] "m" (alpha),
+    [beta] "m" (beta),
+    [c] "m" (c),
+    [rs_c] "m" (rs_c),
+    [cs_c] "m" (cs_c),
+    [mask_vec] "m" (mask_vec),
+    [n0] "m" (n0) // not referenced by the asm above
+    : // register clobber list
+    "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+    "r8", "r9", "r10", "r12", "r13", "r14",
+    "xmm0", "xmm1", "xmm2", "xmm3",
+    "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11",
+    "xmm12", "xmm13", "xmm14", "xmm15",
+    "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+    "ymm7", "ymm8", "ymm10",
+    "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_3x8_mask
+     (
+       conj_t conja,
+       conj_t conjb,
+       dim_t m0,
+       dim_t n0,
+       dim_t k0,
+       float* restrict alpha,
+       float* restrict a, inc_t rs_a0, inc_t cs_a0,
+       float* restrict b, inc_t rs_b0, inc_t cs_b0,
+       float* restrict beta,
+       float* restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t* restrict cntx
+     )
+{
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t rs_a = rs_a0;
+    uint64_t cs_a = cs_a0;
+    uint64_t rs_b = rs_b0;
+    uint64_t cs_b = cs_b0;
+    uint64_t rs_c = rs_c0;
+    uint64_t cs_c = cs_c0;
+
+    // Query the panel stride of A and convert it to units of bytes.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a4 = ps_a * sizeof( float );
+
+    uint64_t n_mod8 = n0 % 8 ;
+    const int32_t *mask_vec = mask[n_mod8];
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    vzeroall() // zero all xmm/ymm registers.
+ mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load mask elements + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.SLOOPKITER) // MAIN LOOP + + + // ---------------------------------- iteration 0 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop.
+ + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + vbroadcastss(mem(rax, r8, 2), ymm2) + vfmadd231ps(ymm0, ymm2, ymm8) + // no load from row 3 of a: this kernel has only 3 rows, so that read could run past a. + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c. + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm7) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm8, ymm8) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm7) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm7, ymm4) + vmaskmovps(ymm4, ymm3, mem(rcx, 0*32)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm7, ymm6) + vmaskmovps(ymm6, ymm3, mem(rcx, 0*32)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm7, ymm8) + vmaskmovps(ymm8, ymm3, mem(rcx, 0*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + jmp(.SDONE) // jump to end.
+ + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(ymm4, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm6, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm8, ymm3, mem(rcx, 0)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r13", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "ymm7", "ymm8", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_2x8_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. 
+ uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load mask elements + + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + + + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 2 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 3 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + vbroadcastss(mem(rax, r8, 1), ymm2) + vfmadd231ps(ymm0, ymm2, ymm6) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c.
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm7) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm6, ymm6) + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm7) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm7, ymm4) + vmaskmovps(ymm4, ymm3, mem(rcx, 0*32)) + + add(rdi, rcx) + + vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm7, ymm6) + vmaskmovps(ymm6, ymm3, mem(rcx, 0*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(ymm4, ymm3, mem(rcx, 0)) + add(rdi, rcx) + + vmaskmovps(ymm6, ymm3, mem(rcx, 0)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec), + [n0] "m" (n0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm7", + "memory" + ) +} + +void bli_sgemmsup_rv_zen_asm_1x8_mask + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + float* restrict alpha, + float* restrict a, inc_t rs_a0, inc_t cs_a0, + float* restrict b, inc_t rs_b0, inc_t cs_b0, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t cs_b = cs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + // Query the panel stride of A and convert it to units of bytes. + uint64_t ps_a = bli_auxinfo_ps_a( data ); + uint64_t ps_a4 = ps_a * sizeof( float ); + + uint64_t n_mod8 = n0 % 8 ; + const int32_t *mask_vec = mask[n_mod8]; + // ------------------------------------------------------------------------- + + begin_asm() + + vzeroall() // zero all xmm/ymm registers. + mov(var(mask_vec), rdx) + vmovdqu(mem(rdx), ymm3) //load + + mov(var(a), r14) // load address of a. 
+ mov(var(rs_a), r8) // load rs_a + mov(var(cs_a), r9) // load cs_a + lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) + + + lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) + + mov(var(rs_b), r10) // load rs_b + lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) + + // NOTE: We cannot pre-load elements of a or b + // because it could eventually, in the last + // unrolled iter or the cleanup loop, result + // in reading beyond the bounds allocated mem + // (the likely result: a segmentation fault). + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + + // During preamble and loops: + // r12 = rcx = c + // r14 = rax = a + // read rbx from var(b) near beginning of loop + + mov(var(b), rbx) // load address of b. + mov(r14, rax) // reset rax to current upanel of a. + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLPFETCH) // jump to column storage case + label(.SROWPFETCH) // row-stored prefetching on c + + lea(mem(r12, rdi, 2), rdx) // + lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs + + jmp(.SPOSTPFETCH) // jump to end of prefetching c + label(.SCOLPFETCH) // column-stored prefetching c + + mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) + lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; + prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c + prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c + prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c + prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c + lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; + prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c + + label(.SPOSTPFETCH) // done prefetching c + + mov(var(ps_a4), rdx) // load ps_a4 + lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 + // use rcx, 
rdx for prefetching lines + // from next upanel of a. + + mov(var(k_iter), rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + label(.SLOOPKITER) // MAIN LOOP + + // ---------------------------------- iteration 0 + + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + // ---------------------------------- iteration 1 + + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + + // ---------------------------------- iteration 2 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + + // ---------------------------------- iteration 3 + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + label(.SCONSIDKLEFT) + + mov(var(k_left), rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rdx, 5*8)) + add(r9, rdx) + + vmaskmovps(mem(rbx, 0*32), ymm3, ymm0) + add(r10, rbx) // b += rs_b; + + vbroadcastss(mem(rax ), ymm2) + vfmadd231ps(ymm0, ymm2, ymm4) + + add(r9, rax) // a += cs_a; + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + label(.SPOSTACCUM) + + mov(r12, rcx) // reset rcx to current utile of c.
+ mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm7) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) + + lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; + lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm7) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORED) // jump to column storage case + + label(.SROWSTORED) + + vmaskmovps(mem(rcx, 0*32), ymm3, ymm2) + vfmadd231ps(ymm2, ymm7, ymm4) + vmaskmovps(ymm4, ymm3, mem(rcx, 0*32)) + + jmp(.SDONE) // jump to end. + + label(.SCOTORED) + + /* TODO: Add column storage support*/ + + label(.SBETAZERO) + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOTORBZ) // jump to column storage case + + label(.SROWSTORBZ) + + vmaskmovps(ymm4, ymm3, mem(rcx, 0)) + + jmp(.SDONE) // jump to end. 
+ + label(.SCOTORBZ) + + /* TODO: Add column storage support*/ + + label(.SDONE) + + label(.SRETURN) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "m" (k_iter), + [k_left] "m" (k_left), + [a] "m" (a), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a4] "m" (ps_a4), + [b] "m" (b), + [rs_b] "m" (rs_b), + [cs_b] "m" (cs_b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [mask_vec] "m" (mask_vec), + [n0] "m" (n0) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r12", "r14", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm7", + "memory" + ) +} + diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 9e2cf7e24d..45817f08be 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -189,6 +189,33 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) +//gemmsup_rv (mkernel in m dim) for mask load/store +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m_mask ) +GEMMSUP_KER_PROT( float, s, bli_sgemmsup_rv_zen_asm_6x8m ) +GEMMSUP_KER_PROT( float, s, bli_sgemmsup_rv_zen_asm_6x4m ) +GEMMSUP_KER_PROT( float, s, bli_sgemmsup_rv_zen_asm_6x2m ) + +//gemmsup_rv (mkernel in m dim) for fringe case +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16_mask ) +GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_5x16_mask ) + +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8_mask ) + +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4_mask ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4_mask ) + // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) From 59f1333883f601818bbe63159eaa01d575271d5c Mon Sep 17 00:00:00 2001 From: jagar Date: Fri, 10 Nov 2023 13:11:44 +0530 Subject: [PATCH 202/226] CMake : placing include folder under blis directory. Updating cmake files to place include folder under blis directory in new cmake system on windows. AMD-Internal: [CPUPL-2748] Change-Id: I650cca95193f7c89b39648ac1bda1fa1093b1560 --- CMakeLists.txt | 30 +++++++++++++++--------------- blastest/CMakeLists.txt | 2 +- testsuite/CMakeLists.txt | 2 +- vendor/testcpp/CMakeLists.txt | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 55213846df..7f32b91752 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -493,7 +493,7 @@ if(ENABLE_MIXED_DT) else() message(" Mixed datatype optimizations requiring extra memory are disabled.") set(ENABLE_MIXED_DT_EXTRA_MEM_01 0) - endif() + endif() set(ENABLE_MIXED_DT_01 1) else() message(" Mixed datatype support is disabled.") @@ -730,31 +730,31 @@ list(JOIN ALL_HEADER_PATHS_LIST " " ALL_HEADER_PATHS_STRING) # Consolidated blis.h header creation #-------------------------------------------- # Creating a directory for the generated flatten headers. 
-file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}) +file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) # Flatten header python script file which expand header contents in blis.h. -add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h +add_custom_command(OUTPUT ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/flatten-headers.py -c -v1 "${CMAKE_SOURCE_DIR}/frame/include/blis.h" - "${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" - "${PROJECT_BINARY_DIR}/include" + "${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" + "${CMAKE_SOURCE_DIR}/include" "${ALL_HEADER_PATHS_STRING}" COMMENT "Generating monolithic blis header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" ) -add_custom_target(flat-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h) +add_custom_target(flat-header DEPENDS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h) #-------------------------------------------- # Consolidated cblas.h header creation #-------------------------------------------- # Flatten header python script file which expand header contents in cblas.h. 
if(ENABLE_CBLAS) - add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h + add_custom_command(OUTPUT ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/flatten-headers.py -c -v1 "${CMAKE_SOURCE_DIR}/frame/compat/cblas/src/cblas.h" - "${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" - "${PROJECT_BINARY_DIR}/${include}" + "${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" + "${CMAKE_SOURCE_DIR}/${include}" "${ALL_HEADER_PATHS_STRING}" COMMENT "Generating monolithic cblas header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" ) - add_custom_target(flat-cblas-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) + add_custom_target(flat-cblas-header DEPENDS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) endif() #-------------------------------------------- @@ -890,7 +890,7 @@ list(REMOVE_DUPLICATES REF_KER_H_PATHS) # Create list of include directories, to be used while creating the library. # NOTE: We no longer need every header path in the source tree since we # now #include the monolithic/flattened blis.h instead. -set(CINFLAGS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}) +set(CINFLAGS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) list(APPEND CINFLAGS ${REF_KER_H_PATHS}) # Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h. list(APPEND CINFLAGS ${CMAKE_SOURCE_DIR}/frame/include) @@ -976,20 +976,20 @@ endforeach() #-------------------------------------------- # Public blis headers. set(BLIS_PUBLIC_HEADERS - ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h + ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h # Include AMD's C++ template header files in the list of headers # to install. 
${CMAKE_SOURCE_DIR}/vendor/cpp/blis.hh ${CMAKE_SOURCE_DIR}/vendor/cpp/cblas.hh ) if(ENABLE_CBLAS) - list(APPEND BLIS_PUBLIC_HEADERS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) + list(APPEND BLIS_PUBLIC_HEADERS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) endif() # --- Library name and local paths --- # From old CMake if(WIN32) - add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) add_definitions(-D_CRT_SECURE_NO_DEPRECATE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Oi") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP${CMake_MSVC_PARALLEL}") @@ -1038,7 +1038,7 @@ endif() set_target_properties(libblis PROPERTIES PUBLIC_HEADER "${BLIS_PUBLIC_HEADERS}") set_target_properties(libblis PROPERTIES OUTPUT_NAME ${LIBBLIS}) if(WIN32) - set_target_properties(libblis + set_target_properties(libblis PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" diff --git a/blastest/CMakeLists.txt b/blastest/CMakeLists.txt index 6b0f21e249..144cd9075a 100644 --- a/blastest/CMakeLists.txt +++ b/blastest/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT DEFINED BLIS_INSTALL_PATH) set(DIST_PATH ${CMAKE_BINARY_DIR}) set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) - set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) else() set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt index 4e23e0e382..4df3c617ee 100644 --- a/testsuite/CMakeLists.txt +++ b/testsuite/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT DEFINED BLIS_INSTALL_PATH) set(DIST_PATH ${CMAKE_BINARY_DIR}) set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) - set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) else() set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) 
diff --git a/vendor/testcpp/CMakeLists.txt b/vendor/testcpp/CMakeLists.txt index 3e0d1209e9..5c528df03a 100644 --- a/vendor/testcpp/CMakeLists.txt +++ b/vendor/testcpp/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT DEFINED BLIS_INSTALL_PATH) set(DIST_PATH ${CMAKE_BINARY_DIR}) set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) - set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) else() set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) From acdaa91786a5b32a5ddaa8dba6b4368693f85347 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Fri, 10 Nov 2023 10:37:56 -0500 Subject: [PATCH 203/226] Revert "CMake : placing include folder under blis directory." This reverts commit 59f1333883f601818bbe63159eaa01d575271d5c. Reason for revert: Potentially breaking CI Change-Id: I65a92a96896091cb92cc534d5b458070524ab75a --- CMakeLists.txt | 30 +++++++++++++++--------------- blastest/CMakeLists.txt | 2 +- testsuite/CMakeLists.txt | 2 +- vendor/testcpp/CMakeLists.txt | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f32b91752..55213846df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -493,7 +493,7 @@ if(ENABLE_MIXED_DT) else() message(" Mixed datatype optimizations requiring extra memory are disabled.") set(ENABLE_MIXED_DT_EXTRA_MEM_01 0) - endif() + endif() set(ENABLE_MIXED_DT_01 1) else() message(" Mixed datatype support is disabled.") @@ -730,31 +730,31 @@ list(JOIN ALL_HEADER_PATHS_LIST " " ALL_HEADER_PATHS_STRING) # Consolidated blis.h header creation #-------------------------------------------- # Creating a directory for the generated flatten headers. -file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) +file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}) # Flatten header python script file which expand header contents in blis.h. 
-add_custom_command(OUTPUT ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h +add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/flatten-headers.py -c -v1 "${CMAKE_SOURCE_DIR}/frame/include/blis.h" - "${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" - "${CMAKE_SOURCE_DIR}/include" + "${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" + "${PROJECT_BINARY_DIR}/include" "${ALL_HEADER_PATHS_STRING}" COMMENT "Generating monolithic blis header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" ) -add_custom_target(flat-header DEPENDS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h) +add_custom_target(flat-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h) #-------------------------------------------- # Consolidated cblas.h header creation #-------------------------------------------- # Flatten header python script file which expand header contents in cblas.h. 
if(ENABLE_CBLAS) - add_custom_command(OUTPUT ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h + add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/flatten-headers.py -c -v1 "${CMAKE_SOURCE_DIR}/frame/compat/cblas/src/cblas.h" - "${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" - "${CMAKE_SOURCE_DIR}/${include}" + "${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" + "${PROJECT_BINARY_DIR}/${include}" "${ALL_HEADER_PATHS_STRING}" COMMENT "Generating monolithic cblas header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" ) - add_custom_target(flat-cblas-header DEPENDS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) + add_custom_target(flat-cblas-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) endif() #-------------------------------------------- @@ -890,7 +890,7 @@ list(REMOVE_DUPLICATES REF_KER_H_PATHS) # Create list of include directories, to be used while creating the library. # NOTE: We no longer need every header path in the source tree since we # now #include the monolithic/flattened blis.h instead. -set(CINFLAGS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) +set(CINFLAGS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}) list(APPEND CINFLAGS ${REF_KER_H_PATHS}) # Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h. list(APPEND CINFLAGS ${CMAKE_SOURCE_DIR}/frame/include) @@ -976,20 +976,20 @@ endforeach() #-------------------------------------------- # Public blis headers. set(BLIS_PUBLIC_HEADERS - ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h + ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h # Include AMD's C++ template header files in the list of headers # to install. 
${CMAKE_SOURCE_DIR}/vendor/cpp/blis.hh ${CMAKE_SOURCE_DIR}/vendor/cpp/cblas.hh ) if(ENABLE_CBLAS) - list(APPEND BLIS_PUBLIC_HEADERS ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) + list(APPEND BLIS_PUBLIC_HEADERS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) endif() # --- Library name and local paths --- # From old CMake if(WIN32) - add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) add_definitions(-D_CRT_SECURE_NO_DEPRECATE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Oi") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP${CMake_MSVC_PARALLEL}") @@ -1038,7 +1038,7 @@ endif() set_target_properties(libblis PROPERTIES PUBLIC_HEADER "${BLIS_PUBLIC_HEADERS}") set_target_properties(libblis PROPERTIES OUTPUT_NAME ${LIBBLIS}) if(WIN32) - set_target_properties(libblis + set_target_properties(libblis PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" diff --git a/blastest/CMakeLists.txt b/blastest/CMakeLists.txt index 144cd9075a..6b0f21e249 100644 --- a/blastest/CMakeLists.txt +++ b/blastest/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT DEFINED BLIS_INSTALL_PATH) set(DIST_PATH ${CMAKE_BINARY_DIR}) set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) - set(INC_PATH ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) else() set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt index 4df3c617ee..4e23e0e382 100644 --- a/testsuite/CMakeLists.txt +++ b/testsuite/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT DEFINED BLIS_INSTALL_PATH) set(DIST_PATH ${CMAKE_BINARY_DIR}) set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) - set(INC_PATH ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) else() set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) 
diff --git a/vendor/testcpp/CMakeLists.txt b/vendor/testcpp/CMakeLists.txt index 5c528df03a..3e0d1209e9 100644 --- a/vendor/testcpp/CMakeLists.txt +++ b/vendor/testcpp/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT DEFINED BLIS_INSTALL_PATH) set(DIST_PATH ${CMAKE_BINARY_DIR}) set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY}) - set(INC_PATH ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}) + set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY}) else() set(LIB_PATH ${BLIS_INSTALL_PATH}/lib) set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis) From 3136e57a39528a449f4a4af271a1a57c92b7aabe Mon Sep 17 00:00:00 2001 From: mangala v Date: Thu, 9 Nov 2023 19:07:44 +0530 Subject: [PATCH 204/226] Fixed memory leak issue reported by ASAN in testsuite. Memory allocated for pointer chars_for_dt was not freed at the end of function in testsuite. Freeing up of the buffer fixed the issue. AMD-Internal: [CPUPL-3932] Change-Id: I432c3ff95d289159f02a871b6d4fff5ab252ea9e --- testsuite/src/test_libblis.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 8161e652ad..48c890e523 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2039,6 +2039,7 @@ void libblis_test_op_driver bli_abort(); #endif + free(chars_for_dt); } else // ( ( !mixed_domain && !mixed_precision ) || op->opid != BLIS_GEMM ) { From 9c9bc20c9ef625e3d78730229dad384d19473d93 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Fri, 10 Nov 2023 22:44:22 +0530 Subject: [PATCH 205/226] CMake: Adding more dependencies in the generation of blis.h and cblas.h - In case the build directory doesn't get cleaned between different configurations this should re-generate the headers correctly. AMD-Internal: [CPUPL-2748] Change-Id: I57cd03a9ae87d8ddfee64fe8b1a1ee9ea1b7ad3c --- CMakeLists.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c6718f116..29cc6ded01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,6 +673,8 @@ configure_file(build/cmake/bli_addon.h.in ${PROJECT_BINARY_DIR}/bli_addon.h) # append as we add the corresponding subdirectories. This variable will be # transformed into a string and will be used to generate the flatten blis.h header. set(ALL_HEADER_PATHS_LIST "") +# Track files to set dependencies for blis.h. +set(ALL_HEADER_FILES_LIST "") # Include functionality that returns header paths. include(${CMAKE_SOURCE_DIR}/build/cmake/subdir_helper_functions.cmake) @@ -685,14 +687,19 @@ list(FIND CONFIG_LIST ${BLIS_CONFIG_FAMILY} IS_UMBRELLA) if(${IS_UMBRELLA} STREQUAL "-1") # Collect all subdirectory paths that have at least one file with suffix in ALL_H99_SUFS list. get_dirpaths_with_suffixes(${BLIS_CONFIG_FAMILY}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY} "${ALL_H99_SUFS}") + # Collect all files in the subdirectories. 
+ get_filepaths_with_suffixes(${BLIS_CONFIG_FAMILY}_HEADER_FILES ${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY} "${ALL_H99_SUFS}") endif() list(APPEND ALL_HEADER_PATHS_LIST "${${BLIS_CONFIG_FAMILY}_HEADER_PATHS}") +list(APPEND ALL_HEADER_FILES_LIST "${${BLIS_CONFIG_FAMILY}_HEADER_FILES}") # Get header directory paths for each of the sub-configurations present # in the configuration list. foreach(CONF ${CONFIG_LIST}) get_dirpaths_with_suffixes(config_${CONF}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/config/${CONF} "${ALL_H99_SUFS}") list(APPEND ALL_HEADER_PATHS_LIST "${config_${CONF}_HEADER_PATHS}") + get_filepaths_with_suffixes(config_${CONF}_FILES_PATHS ${CMAKE_SOURCE_DIR}/config/${CONF} "${ALL_H99_SUFS}") + list(APPEND ALL_HEADER_FILES_LIST "${config_${CONF}_HEADER_FILES}") endforeach() # Get header directory paths for each of the kernels present @@ -701,15 +708,21 @@ foreach(KERN ${KERNEL_LIST}) # Collect all subdirectory paths that have at least one file with suffix in ALL_H99_SUFS list. get_dirpaths_with_suffixes(kernels_${KERN}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/kernels/${KERN} "${ALL_H99_SUFS}") list(APPEND ALL_HEADER_PATHS_LIST "${kernels_${KERN}_HEADER_PATHS}") + get_filepaths_with_suffixes(kernels_${KERN}_HEADER_FILES ${CMAKE_SOURCE_DIR}/kernels/${KERN} "${ALL_H99_SUFS}") + list(APPEND ALL_HEADER_PATHS_FILES "${kernels_${KERN}_HEADER_FILES}") endforeach() # Get header directory paths for framework directory. get_dirpaths_with_suffixes(frame_HEADER_PATHS ${CMAKE_SOURCE_DIR}/frame "${ALL_H99_SUFS}") list(APPEND ALL_HEADER_PATHS_LIST "${frame_HEADER_PATHS}") +get_filepaths_with_suffixes(frame_HEADER_FILES ${CMAKE_SOURCE_DIR}/frame "${ALL_H99_SUFS}") +list(APPEND ALL_HEADER_FILES_LIST "${frame_HEADER_FILES}") # Get header directory paths for AOCL DTL logs directory. 
get_dirpaths_with_suffixes(aocl_dtl_HEADER_PATHS ${CMAKE_SOURCE_DIR}/aocl_dtl "${ALL_H99_SUFS}") list(APPEND ALL_HEADER_PATHS_LIST "${aocl_dtl_HEADER_PATHS}") +get_filepaths_with_suffixes(aocl_dtl_HEADER_FILES ${CMAKE_SOURCE_DIR}/aocl_dtl "${ALL_H99_SUFS}") +list(APPEND ALL_HEADER_FILES_LIST "${aocl_dtl_FILES_PATHS}") # Get a copy of the header paths without including the addons and the sandbox. set(FRAME_HEADER_DIRPATHS_LIST ${ALL_HEADER_PATHS_LIST}) @@ -718,11 +731,18 @@ set(FRAME_HEADER_DIRPATHS_LIST ${ALL_HEADER_PATHS_LIST}) foreach(ADDON ${ENABLE_ADDON}) get_dirpaths_with_suffixes(addon_${ADDON}_HEADER_PATHS ${CMAKE_SOURCE_DIR}/addon/${ADDON} "${ALL_H99_SUFS}") list(APPEND ALL_HEADER_PATHS_LIST "${addon_${ADDON}_HEADER_PATHS}") + get_filepaths_with_suffixes(addon_${ADDON}_HEADER_FILES ${CMAKE_SOURCE_DIR}/addon/${ADDON} "${ALL_H99_SUFS}") + list(APPEND ALL_HEADER_FILES_LIST "${addon_${ADDON}_HEADER_FILES}") endforeach() # Pick up generated bli_config.h and bli_addon.h that get generated in # current build directory. list(PREPEND ALL_HEADER_PATHS_LIST ${PROJECT_BINARY_DIR}/) +list(PREPEND ALL_HEADER_FILES_LIST ${PROJECT_BINARY_DIR}/bli_config.h) +if(NOT (ENABLE_ADDON STREQUAL "")) + list(PREPEND ALL_HEADER_FILES_LIST ${PROJECT_BINARY_DIR}/bli_addon.h) +endif() + # Create a string out of this list so that it can be processed by flatten-headers.py. 
list(JOIN ALL_HEADER_PATHS_LIST " " ALL_HEADER_PATHS_STRING) @@ -739,6 +759,7 @@ add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/bl "${PROJECT_BINARY_DIR}/include" "${ALL_HEADER_PATHS_STRING}" COMMENT "Generating monolithic blis header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h" + DEPENDS ${ALL_HEADER_FILES_LIST} ) add_custom_target(flat-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/blis.h) #-------------------------------------------- @@ -753,6 +774,7 @@ if(ENABLE_CBLAS) "${PROJECT_BINARY_DIR}/${include}" "${ALL_HEADER_PATHS_STRING}" COMMENT "Generating monolithic cblas header file: ${CMAKE_SOURCE_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h" + DEPENDS ${ALL_HEADER_FILES_LIST} ) add_custom_target(flat-cblas-header DEPENDS ${PROJECT_BINARY_DIR}/include/${BLIS_CONFIG_FAMILY}/cblas.h) endif() From 6db2c43822197793861b49caf7b72958db75644c Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Tue, 14 Nov 2023 10:52:35 -0500 Subject: [PATCH 206/226] Include bli_lang_defs.h in cblas.h Changes in commit 64a1f786d58 (via merge c6f33401253) included in ./frame/include/bli_type_defs.h a prototype that uses the C restrict keyword. When using C++ we need to provide a definition for this C language keyword. This is done in bli_lang_defs.h which was included in blis.h but not in cblas.h. AMD-Internal: [CPUPL-4188] AMD-Internal: [CPUPL-4233] Change-Id: I75d5f32599d18794331ff452e562eb42afb5ae93 (cherry picked from commit 6e020ecc015fae699e7bf280ddbd2da8d8109d01) --- frame/compat/cblas/src/cblas.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h index fa957b9f84..1c3b490b44 100644 --- a/frame/compat/cblas/src/cblas.h +++ b/frame/compat/cblas/src/cblas.h @@ -36,6 +36,8 @@ // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. 
#include "bli_system.h" +#include "bli_lang_defs.h" + #include "bli_config.h" #include "bli_config_macro_defs.h" #include "bli_type_defs.h" From f02769e0caeeb1da30e46c50d500f6c3881b9033 Mon Sep 17 00:00:00 2001 From: mangala v Date: Tue, 14 Nov 2023 16:53:18 +0530 Subject: [PATCH 207/226] BugFix: Re-Designed SGEMM SUP kernel to use mask load/store instruction Segfault was reported through nightly jenkins job. Issue was observed when running in MT mode. Issue was due to extra broadcast being used. Extra broadcast would access out-of-bounds memory on input buffer. Cleaned up clobber list by removing unused registers. AMD_Internal: [CPUPL-4180] Change-Id: I1c8715b2850ef855328f2ef12f215987299bdb2b --- .../testsuite/level3/gemm/sgemm_generic.cpp | 74 +++++++++++++++++++ .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c | 15 +--- .../s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c | 37 +++------- .../s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c | 54 ++++---------- .../s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c | 64 +++++++--------- 5 files changed, 129 insertions(+), 115 deletions(-) diff --git a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp index 4d1eb7f4c9..6abfbe871f 100644 --- a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp @@ -147,4 +147,78 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(7)) // increment to the leading dim of c ), ::SGemmTestPrint() + ); + +// Black box testing. 
+INSTANTIATE_TEST_SUITE_P( + sgemm_sup_m, + SGemmTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n','t'), // transa + ::testing::Values('n','t'), // transb + ::testing::Range(gtint_t(1), gtint_t(600), 1), // m + ::testing::Values(50), // n + ::testing::Values(30), // k + ::testing::Values( 1.0, 0.0, -2.0), // alpha + ::testing::Values(-1.0, 1.0, 0.0), // beta + ::testing::Values(gtint_t(2)), // increment to the leading dim of a + ::testing::Values(gtint_t(3)), // increment to the leading dim of b + ::testing::Values(gtint_t(7)) // increment to the leading dim of c + ), + ::SGemmTestPrint() + ); + + +// Black box testing. +INSTANTIATE_TEST_SUITE_P( + sgemm_sup_n, + SGemmTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n','t'), // transa + ::testing::Values('n','t'), // transb + ::testing::Values(30), // m + ::testing::Range(gtint_t(1), gtint_t(600), 1), // n + ::testing::Values(30), // k + ::testing::Values( 1.0, 0.0, -2.0), // alpha + ::testing::Values(-1.0, 1.0, 0.0), // beta + ::testing::Values(gtint_t(2)), // increment to the leading dim of a + ::testing::Values(gtint_t(3)), // increment to the leading dim of b + ::testing::Values(gtint_t(7)) // increment to the leading dim of c + ), + ::SGemmTestPrint() + ); + + +// Black box testing. 
+INSTANTIATE_TEST_SUITE_P( + sgemm_sup_m_n_k_100, + SGemmTest, + ::testing::Combine( + ::testing::Values('c' +#ifndef TEST_BLAS + ,'r' +#endif + ), // storage format + ::testing::Values('n','t'), // transa + ::testing::Values('n','t'), // transb + ::testing::Range(gtint_t(1), gtint_t(100), 1), // m + ::testing::Range(gtint_t(1), gtint_t(100), 1), // n + ::testing::Range(gtint_t(1), gtint_t(100), 1), // k + ::testing::Values( 1.0, 0.0, -2.0), // alpha + ::testing::Values(-1.0, 1.0, 0.0), // beta + ::testing::Values(gtint_t(2)), // increment to the leading dim of a + ::testing::Values(gtint_t(3)), // increment to the leading dim of b + ::testing::Values(gtint_t(7)) // increment to the leading dim of c + ), + ::SGemmTestPrint() ); \ No newline at end of file diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c index 471758041a..3c1d2c8bf7 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c @@ -3107,10 +3107,7 @@ void bli_sgemmsup_rv_zen_asm_6x16m_mask : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0", "xmm1", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", @@ -3582,10 +3579,7 @@ void bli_sgemmsup_rv_zen_asm_6x8m_mask : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0", "xmm1", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", "memory" @@ -4036,9 +4030,8 @@ void 
bli_sgemmsup_rv_zen_asm_6x4m_mask "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm4", "xmm6", "xmm7", + "xmm8", "xmm10", "xmm12", "xmm14", "memory" ) diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c index 2fa245ea3f..d0605a9f44 100644 --- a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c @@ -450,15 +450,11 @@ void bli_sgemmsup_rv_zen_asm_5x16_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r13", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0", "xmm1", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" @@ -841,15 +837,11 @@ void bli_sgemmsup_rv_zen_asm_4x16_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r13", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0", "xmm12", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "memory" @@ -1200,10 +1192,7 @@ void bli_sgemmsup_rv_zen_asm_3x16_mask : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - 
"xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0", "xmm12", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm12", "memory" @@ -1511,15 +1500,11 @@ void bli_sgemmsup_rv_zen_asm_2x16_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0", "xmm14", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm14", "memory" @@ -1795,15 +1780,11 @@ void bli_sgemmsup_rv_zen_asm_1x16_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm0","xmm12", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm12", "memory" ) diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c index 6430c840e5..0c78a13c8f 100644 --- a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c @@ -98,7 +98,6 @@ void bli_sgemmsup_rv_zen_asm_5x4_mask lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) @@ -390,11 +389,10 @@ void bli_sgemmsup_rv_zen_asm_5x4_mask [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - 
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm4", "xmm6", "xmm7", + "xmm8", "xmm10", "xmm12", "memory" ) } @@ -451,7 +449,6 @@ void bli_sgemmsup_rv_zen_asm_4x4_mask lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) @@ -723,11 +720,10 @@ void bli_sgemmsup_rv_zen_asm_4x4_mask [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm4", "xmm6", "xmm7", + "xmm8", "xmm10", "memory" ) } @@ -783,9 +779,6 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) @@ -858,7 +851,6 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) add(r9, rax) // a += cs_a; @@ -875,7 +867,6 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) add(r9, rax) // a += cs_a; @@ -892,7 +883,6 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) 
add(r9, rax) // a += cs_a; @@ -909,7 +899,6 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) add(r9, rax) // a += cs_a; @@ -935,7 +924,6 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) - vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) add(r9, rax) // a += cs_a; @@ -1042,11 +1030,10 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r8", "r9", "r10", "r11", "r12", "r14", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm4", "xmm6", "xmm7", + "xmm8", "xmm10", "memory" ) } @@ -1102,9 +1089,6 @@ void bli_sgemmsup_rv_zen_asm_2x4_mask lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) @@ -1332,11 +1316,9 @@ void bli_sgemmsup_rv_zen_asm_2x4_mask [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r8", "r9", "r10", "r11", "r12", "r14", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "xmm4", "xmm6", "xmm7", "memory" ) } @@ -1392,9 +1374,6 @@ void bli_sgemmsup_rv_zen_asm_1x4_mask lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a - mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= 
sizeof(float) @@ -1408,7 +1387,6 @@ void bli_sgemmsup_rv_zen_asm_1x4_mask mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) - vxorps(xmm1, xmm1, xmm1) vxorps(xmm4, xmm4, xmm4) mov(var(b), rbx) // load address of b. @@ -1603,11 +1581,9 @@ void bli_sgemmsup_rv_zen_asm_1x4_mask [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", + "r8", "r9", "r10", "r11", "r12", "r14", + "xmm0", "xmm2", "xmm3", + "xmm4", "xmm7", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c index 20d6f45075..ce2b36d677 100644 --- a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c @@ -334,6 +334,8 @@ void bli_sgemmsup_rv_zen_asm_5x8_mask /* TODO: Add column storage support*/ + jmp(.SDONE) // jump to end. + label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. @@ -382,16 +384,12 @@ void bli_sgemmsup_rv_zen_asm_5x8_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r13", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "xmm0", "xmm7", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "ymm7", "ymm8", "ymm10", "ymm12", "memory" ) @@ -669,6 +667,8 @@ void bli_sgemmsup_rv_zen_asm_4x8_mask /* TODO: Add column storage support*/ + jmp(.SDONE) // jump to end. 
+ label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. @@ -693,6 +693,8 @@ void bli_sgemmsup_rv_zen_asm_4x8_mask /* TODO: Add column storage support*/ + jmp(.SDONE) // jump to end. + label(.SDONE) label(.SRETURN) @@ -714,16 +716,12 @@ void bli_sgemmsup_rv_zen_asm_4x8_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r13", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "xmm0", "xmm7", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "ymm7", "ymm8", "ymm10", "memory" ) @@ -778,8 +776,6 @@ void bli_sgemmsup_rv_zen_asm_3x8_mask lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a - mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) @@ -924,7 +920,6 @@ void bli_sgemmsup_rv_zen_asm_3x8_mask vbroadcastss(mem(rax, r8, 2), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) - vbroadcastss(mem(rax, r13, 1), ymm2) add(r9, rax) // a += cs_a; @@ -1007,6 +1002,8 @@ void bli_sgemmsup_rv_zen_asm_3x8_mask /* TODO: Add column storage support*/ + jmp(.SDONE) // jump to end. 
+ label(.SDONE) label(.SRETURN) @@ -1031,12 +1028,9 @@ void bli_sgemmsup_rv_zen_asm_3x8_mask [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r12", "r13", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", + "r8", "r9", "r10", "r12", "r14", + "xmm0", "xmm7", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "ymm7", "ymm8", "memory" ) @@ -1268,6 +1262,8 @@ void bli_sgemmsup_rv_zen_asm_2x8_mask /* TODO: Add column storage support*/ + jmp(.SDONE) // jump to end. + label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. @@ -1286,6 +1282,8 @@ void bli_sgemmsup_rv_zen_asm_2x8_mask /* TODO: Add column storage support*/ + jmp(.SDONE) // jump to end. + label(.SDONE) label(.SRETURN) @@ -1307,16 +1305,12 @@ void bli_sgemmsup_rv_zen_asm_2x8_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm7", + "xmm0", "xmm7", + "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "ymm7", "memory" ) } @@ -1571,16 +1565,12 @@ void bli_sgemmsup_rv_zen_asm_1x8_mask [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), - [mask_vec] "m" (mask_vec), - [n0] "m" (n0) + [mask_vec] "m" (mask_vec) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r12", "r14", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm7", + "xmm0", "xmm7", + "ymm0", "ymm2", "ymm3", "ymm4", 
"ymm7", "memory" ) } From aa2a10354e3668337d6b1a40ac662ffeb31be95a Mon Sep 17 00:00:00 2001 From: mangala v Date: Tue, 21 Nov 2023 12:40:01 +0530 Subject: [PATCH 208/226] Updated prefetching in SGEMM SUP (mask load/store) kernels 1. Prefetch only MR rows or rows required for fringe cases 2. Specify prefetching offset - the least column address supported by masked functions 3. Removed unnecessary prefetches in fringe case for mx4 kernels AMD_Internal: [CPUPL-4221] Change-Id: I1e2e7d3ebce37dc54a2f0a5c1c70ce0a6d4c8d6c --- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c | 36 ++++++------- .../s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c | 36 ++++++------- .../s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c | 51 ++++++------------- .../s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c | 37 ++++++-------- 4 files changed, 64 insertions(+), 96 deletions(-) diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c index 3c1d2c8bf7..a6f79dcd12 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c @@ -2724,12 +2724,12 @@ void bli_sgemmsup_rv_zen_asm_6x16m_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2,15*4)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 8*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 8*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 8*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 8*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 8*4)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 8*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c 
label(.SCOLPFETCH) // column-stored prefetching c @@ -3272,12 +3272,12 @@ void bli_sgemmsup_rv_zen_asm_6x8m_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2,15*4)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 4*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 4*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 4*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 4*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 4*4)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 4*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -3742,12 +3742,12 @@ void bli_sgemmsup_rv_zen_asm_6x4m_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 0)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 0)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 0)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 0)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 0)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 0)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c 
b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c index d0605a9f44..3b93fc6802 100644 --- a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x16_mask.c @@ -126,11 +126,11 @@ void bli_sgemmsup_rv_zen_asm_5x16_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c + prefetch(0, mem(r12, 8*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,8*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,8*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 8*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1,8*4)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -541,10 +541,10 @@ void bli_sgemmsup_rv_zen_asm_4x16_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c + prefetch(0, mem(r12, 8*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,8*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,8*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 8*4)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -922,11 +922,9 @@ void bli_sgemmsup_rv_zen_asm_3x16_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // 
prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + prefetch(0, mem(r12, 8*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,8*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,8*4)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -1273,10 +1271,8 @@ void bli_sgemmsup_rv_zen_asm_2x16_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, 8*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,8*4)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -1585,9 +1581,7 @@ void bli_sgemmsup_rv_zen_asm_1x16_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, 8*4)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c index 0c78a13c8f..55de26c884 100644 --- a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c +++ b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x4_mask.c @@ -128,12 +128,11 @@ void bli_sgemmsup_rv_zen_asm_5x4_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 
5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 0)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 0)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 0)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 0)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 0)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -478,12 +477,10 @@ void bli_sgemmsup_rv_zen_asm_4x4_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 0)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 0)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 0)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 0)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -804,14 +801,9 @@ void bli_sgemmsup_rv_zen_asm_3x4_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 0)) // prefetch c + 
0*rs_c + prefetch(0, mem(r12, rdi, 1, 0)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2, 0)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -1113,14 +1105,8 @@ void bli_sgemmsup_rv_zen_asm_2x4_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 0)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1, 0)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -1396,14 +1382,7 @@ void bli_sgemmsup_rv_zen_asm_1x4_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c + prefetch(0, mem(r12, 0)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c diff --git a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c index ce2b36d677..74c1c51989 100644 --- a/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c +++ 
b/kernels/zen/3/sup/s6x16/bli_gemmsup_rv_zen_asm_s5x8_mask.c @@ -125,11 +125,11 @@ void bli_sgemmsup_rv_zen_asm_5x8_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c + prefetch(0, mem(r12, 4*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,4*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,4*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 4*4)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1,4*4)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -475,10 +475,10 @@ void bli_sgemmsup_rv_zen_asm_4x8_mask lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c + prefetch(0, mem(r12, 4*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,4*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,4*4)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 4*4)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -802,11 +802,9 @@ void bli_sgemmsup_rv_zen_asm_3x8_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c - prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c + prefetch(0, mem(r12, 4*4)) // 
prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,4*4)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, rdi, 2,4*4)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -1112,10 +1110,9 @@ void bli_sgemmsup_rv_zen_asm_2x8_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c - prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c + + prefetch(0, mem(r12, 4*4)) // prefetch c + 0*rs_c + prefetch(0, mem(r12, rdi, 1,4*4)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c @@ -1391,9 +1388,7 @@ void bli_sgemmsup_rv_zen_asm_1x8_mask jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c - lea(mem(r12, rdi, 2), rdx) // - lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs + prefetch(0, mem(r12, 4*4)) // prefetch c + 0*rs jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c From 5a88182c1eb8d68114f28f441eac35d3cebb9bc9 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Mon, 13 Nov 2023 08:38:22 -0500 Subject: [PATCH 209/226] Code cleanup: No newline at end of file Some text files were missing a newline at the end of the file. One has been added. 
AMD-Internal: [CPUPL-3519] Change-Id: I4b00876b1230b036723d6b56755c6ca844a7ffce (cherry picked from commit f471615c66f3a1d98a4ccac63a645cdf8a04bd19) --- addon/CMakeLists.txt | 2 +- bench/bench_gemm_pack_compute.c | 2 +- bench/inputgemmpackcompute.txt | 2 +- blastest/CMakeLists.txt | 2 +- build/cmake/check-blastest.py | 2 +- build/cmake/check-blistest.py | 2 +- build/cmake/read_registry.py | 2 +- build/cmake/subdir_helper_functions.cmake | 2 +- build/gen-make-frags/ignore_list | 2 +- config/CMakeLists.txt | 2 +- config/amdzen/make_defs.cmake | 2 +- config/zen/make_defs.cmake | 2 +- config/zen2/make_defs.cmake | 2 +- config/zen3/make_defs.cmake | 2 +- config/zen4/make_defs.cmake | 2 +- docs/CMakeBuildSystem.md | 2 +- frame/3/bli_l3.h | 2 +- frame/3/bli_l3_compute.h | 2 +- frame/CMakeLists.txt | 2 +- frame/compat/bla_gemm_compute.c | 2 +- frame/compat/bla_gemm_compute.h | 2 +- frame/compat/bla_gemm_pack.h | 2 +- frame/compat/bla_gemm_pack_get_size.h | 2 +- frame/compat/cblas/src/cblas_dgemm_compute.c | 2 +- frame/compat/cblas/src/cblas_dgemm_pack.c | 2 +- frame/compat/cblas/src/cblas_dgemm_pack_get_size.c | 2 +- frame/compat/cblas/src/cblas_sgemm_pack.c | 2 +- frame/compat/cblas/src/cblas_sgemm_pack_get_size.c | 2 +- frame/thread/bli_l3_compute_decor.h | 2 +- frame/thread/bli_l3_compute_decor_single.c | 2 +- frame/thread/bli_l3_compute_decor_single.h | 2 +- gtestsuite/README.md | 2 +- gtestsuite/testinghelpers/CMakeLists.txt | 2 +- gtestsuite/testinghelpers/inc/common/refCBLAS.h | 2 +- gtestsuite/testinghelpers/src/common/data_generators.cpp | 2 +- gtestsuite/testinghelpers/src/common/testing_basics.cpp | 2 +- gtestsuite/testsuite/level1/addv/addv.h | 2 +- gtestsuite/testsuite/level1/addv/caddv_generic.cpp | 2 +- gtestsuite/testsuite/level1/addv/daddv_generic.cpp | 2 +- gtestsuite/testsuite/level1/addv/saddv_generic.cpp | 2 +- gtestsuite/testsuite/level1/addv/test_addv.h | 2 +- gtestsuite/testsuite/level1/addv/zaddv_generic.cpp | 2 +- 
gtestsuite/testsuite/level1/amaxv/amaxv.h | 2 +- gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpbyv/axpbyv.h | 2 +- gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h | 2 +- gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp | 2 +- gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpyv/axpyv.h | 2 +- gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/axpyv/test_axpyv.h | 2 +- gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/copyv/copyv.h | 2 +- gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/copyv/test_copyv.h | 2 +- gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotv/dotv.h | 2 +- gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotv/test_dotv.h | 2 +- gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotxv/dotxv.h | 2 +- gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/dotxv/test_dotxv.h | 2 +- 
gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp | 2 +- gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp | 2 +- gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp | 2 +- gtestsuite/testsuite/level1/scal2v/scal2v.h | 2 +- gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp | 2 +- gtestsuite/testsuite/level1/scal2v/test_scal2v.h | 2 +- gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp | 2 +- gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp | 2 +- gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp | 2 +- gtestsuite/testsuite/level1/scalv/scalv.h | 2 +- gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp | 2 +- gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp | 2 +- gtestsuite/testsuite/level1/scalv/test_scalv.h | 2 +- gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp | 2 +- gtestsuite/testsuite/level1/setv/csetv_generic.cpp | 2 +- gtestsuite/testsuite/level1/setv/dsetv_generic.cpp | 2 +- gtestsuite/testsuite/level1/setv/setv.h | 2 +- gtestsuite/testsuite/level1/setv/ssetv_generic.cpp | 2 +- gtestsuite/testsuite/level1/setv/test_setv.h | 2 +- gtestsuite/testsuite/level1/setv/zsetv_generic.cpp | 2 +- gtestsuite/testsuite/level1/subv/csubv_generic.cpp | 2 +- gtestsuite/testsuite/level1/subv/dsubv_generic.cpp | 2 +- gtestsuite/testsuite/level1/subv/ssubv_generic.cpp | 2 +- gtestsuite/testsuite/level1/subv/subv.h | 2 +- gtestsuite/testsuite/level1/subv/test_subv.h | 2 +- gtestsuite/testsuite/level1/subv/zsubv_generic.cpp | 2 +- gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h | 2 +- gtestsuite/testsuite/level1/xpbyv/xpbyv.h | 2 +- gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp | 2 +- gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp | 2 +- gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp | 2 +- gtestsuite/testsuite/level2/gemv/gemv.h | 2 +- gtestsuite/testsuite/level2/gemv/test_gemv.h | 2 +- 
gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp | 2 +- gtestsuite/testsuite/level2/ger/cger_generic.cpp | 2 +- gtestsuite/testsuite/level2/ger/dger_generic.cpp | 2 +- gtestsuite/testsuite/level2/ger/ger.h | 2 +- gtestsuite/testsuite/level2/ger/sger_generic.cpp | 2 +- gtestsuite/testsuite/level2/ger/test_ger.h | 2 +- gtestsuite/testsuite/level2/ger/zger_generic.cpp | 2 +- gtestsuite/testsuite/level2/hemv/chemv_generic.cpp | 2 +- gtestsuite/testsuite/level2/hemv/hemv.h | 2 +- gtestsuite/testsuite/level2/hemv/test_hemv.h | 2 +- gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp | 2 +- gtestsuite/testsuite/level2/her/cher_generic.cpp | 2 +- gtestsuite/testsuite/level2/her/her.h | 2 +- gtestsuite/testsuite/level2/her/test_her.h | 2 +- gtestsuite/testsuite/level2/her/zher_generic.cpp | 2 +- gtestsuite/testsuite/level2/her2/cher2_generic.cpp | 2 +- gtestsuite/testsuite/level2/her2/her2.h | 2 +- gtestsuite/testsuite/level2/her2/test_her2.h | 2 +- gtestsuite/testsuite/level2/her2/zher2_generic.cpp | 2 +- gtestsuite/testsuite/level2/symv/dsymv_generic.cpp | 2 +- gtestsuite/testsuite/level2/symv/ssymv_generic.cpp | 2 +- gtestsuite/testsuite/level2/symv/symv.h | 2 +- gtestsuite/testsuite/level2/symv/test_symv.h | 2 +- gtestsuite/testsuite/level2/syr/dsyr_generic.cpp | 2 +- gtestsuite/testsuite/level2/syr/ssyr_generic.cpp | 2 +- gtestsuite/testsuite/level2/syr/syr.h | 2 +- gtestsuite/testsuite/level2/syr/test_syr.h | 2 +- gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp | 2 +- gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp | 2 +- gtestsuite/testsuite/level2/syr2/syr2.h | 2 +- gtestsuite/testsuite/level2/syr2/test_syr2.h | 2 +- gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp | 2 +- gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp | 2 +- gtestsuite/testsuite/level2/trmv/strmv_generic.cpp | 2 +- gtestsuite/testsuite/level2/trmv/test_trmv.h | 2 +- gtestsuite/testsuite/level2/trmv/trmv.h | 2 +- gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp | 2 +- 
gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp | 2 +- gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp | 2 +- gtestsuite/testsuite/level2/trsv/strsv_generic.cpp | 2 +- gtestsuite/testsuite/level2/trsv/test_trsv.h | 2 +- gtestsuite/testsuite/level2/trsv/trsv.h | 2 +- gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp | 2 +- gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemm/gemm.h | 2 +- gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp | 2 +- gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp | 2 +- .../testsuite/level3/gemm_compute/dgemm_compute_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h | 2 +- .../testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp | 2 +- .../testsuite/level3/gemm_compute/sgemm_compute_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h | 2 +- gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemmt/gemmt.h | 2 +- gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp | 2 +- gtestsuite/testsuite/level3/gemmt/test_gemmt.h | 2 +- gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp | 2 +- gtestsuite/testsuite/level3/hemm/chemm_generic.cpp | 2 +- gtestsuite/testsuite/level3/hemm/hemm.h | 2 +- gtestsuite/testsuite/level3/hemm/test_hemm.h | 2 +- gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp | 2 +- gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp | 2 +- gtestsuite/testsuite/level3/her2k/her2k.h | 2 +- gtestsuite/testsuite/level3/her2k/test_her2k.h | 2 +- gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp | 2 +- gtestsuite/testsuite/level3/herk/cherk_generic.cpp | 2 +- gtestsuite/testsuite/level3/herk/herk.h | 2 +- gtestsuite/testsuite/level3/herk/test_herk.h | 2 +- gtestsuite/testsuite/level3/herk/zherk_generic.cpp | 2 +- 
gtestsuite/testsuite/level3/symm/csymm_generic.cpp | 2 +- gtestsuite/testsuite/level3/symm/dsymm_generic.cpp | 2 +- gtestsuite/testsuite/level3/symm/ssymm_generic.cpp | 2 +- gtestsuite/testsuite/level3/symm/symm.h | 2 +- gtestsuite/testsuite/level3/symm/test_symm.h | 2 +- gtestsuite/testsuite/level3/symm/zsymm_generic.cpp | 2 +- gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp | 2 +- gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp | 2 +- gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp | 2 +- gtestsuite/testsuite/level3/syr2k/test_syr2k.h | 2 +- gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp | 2 +- gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp | 2 +- gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp | 2 +- gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp | 2 +- gtestsuite/testsuite/level3/syrk/syrk.h | 2 +- gtestsuite/testsuite/level3/syrk/test_syrk.h | 2 +- gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm/strmm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm/test_trmm.h | 2 +- gtestsuite/testsuite/level3/trmm/trmm.h | 2 +- gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp | 2 +- gtestsuite/testsuite/level3/trmm3/test_trmm3.h | 2 +- gtestsuite/testsuite/level3/trmm3/trmm3.h | 2 +- gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp | 2 +- gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trsm/strsm_generic.cpp | 2 +- gtestsuite/testsuite/level3/trsm/test_trsm.h | 2 +- gtestsuite/testsuite/level3/trsm/trsm.h | 2 +- gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp | 2 +- 
gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp | 2 +- gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp | 2 +- gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp | 2 +- gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp | 2 +- gtestsuite/testsuite/util/nrm2/nrm2.h | 2 +- gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp | 2 +- gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp | 2 +- gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp | 2 +- gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp | 2 +- gtestsuite/testsuite/util/nrm2/test_nrm2.h | 2 +- kernels/CMakeLists.txt | 2 +- kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c | 2 +- kernels/zen/3/bli_zgemm_avx2_k1.c | 2 +- kernels/zen/3/bli_zgemm_zen_2x6.c | 2 +- kernels/zen/3/bli_zgemmtrsm_l_2x6.c | 2 +- kernels/zen/3/bli_zgemmtrsm_u_2x6.c | 2 +- kernels/zen4/3/bli_zero_zmm.c | 2 +- kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c | 2 +- kernels/zen4/3/bli_zgemmtrsm_l_4x12.c | 2 +- kernels/zen4/3/bli_zgemmtrsm_u_4x12.c | 2 +- sandbox/power10/p10_testsuite/Makefile | 2 +- sandbox/power10/p10_testsuite/common.h | 2 +- testsuite/CMakeLists.txt | 2 +- 241 files changed, 241 insertions(+), 241 deletions(-) diff --git a/addon/CMakeLists.txt b/addon/CMakeLists.txt index 6e950340ae..667a0daf5a 100644 --- a/addon/CMakeLists.txt +++ b/addon/CMakeLists.txt @@ -203,4 +203,4 @@ endfunction() # Generate targets for each of the addons. 
foreach(ADDON ${ENABLE_ADDON}) generate_addon_targets(${ADDON}) -endforeach() \ No newline at end of file +endforeach() diff --git a/bench/bench_gemm_pack_compute.c b/bench/bench_gemm_pack_compute.c index e2f218846e..30236ee859 100755 --- a/bench/bench_gemm_pack_compute.c +++ b/bench/bench_gemm_pack_compute.c @@ -993,4 +993,4 @@ int main( int argc, char** argv ) fclose(fout); return 0; -} \ No newline at end of file +} diff --git a/bench/inputgemmpackcompute.txt b/bench/inputgemmpackcompute.txt index 8b01d33d6b..3afff8baf0 100644 --- a/bench/inputgemmpackcompute.txt +++ b/bench/inputgemmpackcompute.txt @@ -89,4 +89,4 @@ dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100 dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200 dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300 dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400 -dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500 \ No newline at end of file +dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500 diff --git a/blastest/CMakeLists.txt b/blastest/CMakeLists.txt index 6b0f21e249..062ca21162 100644 --- a/blastest/CMakeLists.txt +++ b/blastest/CMakeLists.txt @@ -128,4 +128,4 @@ add_custom_target(checkblas DEPENDS testblas ) # Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE. -set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets) \ No newline at end of file +set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets) diff --git a/build/cmake/check-blastest.py b/build/cmake/check-blastest.py index f2b641c766..8e1123cf80 100644 --- a/build/cmake/check-blastest.py +++ b/build/cmake/check-blastest.py @@ -28,4 +28,4 @@ def check_blastest(): else: print("\033[0;32m All BLAS tests passed! 
\033[0m") -check_blastest() \ No newline at end of file +check_blastest() diff --git a/build/cmake/check-blistest.py b/build/cmake/check-blistest.py index 1d285ccf78..983f8e8241 100644 --- a/build/cmake/check-blistest.py +++ b/build/cmake/check-blistest.py @@ -19,4 +19,4 @@ def check_blistest(): else: print("\033[0;32m All BLIS tests passed! \033[0m") -check_blistest() \ No newline at end of file +check_blistest() diff --git a/build/cmake/read_registry.py b/build/cmake/read_registry.py index 16bf3f9903..f8baf66378 100644 --- a/build/cmake/read_registry.py +++ b/build/cmake/read_registry.py @@ -406,4 +406,4 @@ def process_config(): # Function call for config family names CONFIG = process_config() -print(CONFIG) \ No newline at end of file +print(CONFIG) diff --git a/build/cmake/subdir_helper_functions.cmake b/build/cmake/subdir_helper_functions.cmake index 06a30bbe98..ad41a3001c 100644 --- a/build/cmake/subdir_helper_functions.cmake +++ b/build/cmake/subdir_helper_functions.cmake @@ -119,4 +119,4 @@ macro(get_config_for_kernel_from_kconfig_map config kernel kconfig_map) # of kernel: and then we will be left with config. list(TRANSFORM conf REPLACE ${kernel}: "") list(APPEND ${config} ${conf}) -endmacro() \ No newline at end of file +endmacro() diff --git a/build/gen-make-frags/ignore_list b/build/gen-make-frags/ignore_list index 3561710b4f..3a7afbd8bc 100644 --- a/build/gen-make-frags/ignore_list +++ b/build/gen-make-frags/ignore_list @@ -5,4 +5,4 @@ other temp tmp test -p10_testsuite \ No newline at end of file +p10_testsuite diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt index 537a67df2c..cae2ed48ae 100644 --- a/config/CMakeLists.txt +++ b/config/CMakeLists.txt @@ -184,4 +184,4 @@ endfunction() # Generate targets for each of the configs. 
foreach(CONF ${CONFIG_LIST}) generate_config_targets(${CONF}) -endforeach() \ No newline at end of file +endforeach() diff --git a/config/amdzen/make_defs.cmake b/config/amdzen/make_defs.cmake index f658bcb64b..231c3eecfb 100644 --- a/config/amdzen/make_defs.cmake +++ b/config/amdzen/make_defs.cmake @@ -21,4 +21,4 @@ else() else() # off or opt set(COPTFLAGS -O3) endif() -endif() \ No newline at end of file +endif() diff --git a/config/zen/make_defs.cmake b/config/zen/make_defs.cmake index 0e9ac3ab9b..33755d5791 100644 --- a/config/zen/make_defs.cmake +++ b/config/zen/make_defs.cmake @@ -36,4 +36,4 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(CRVECFLAGS ${CKVECFLAGS}) else() set(CRVECFLAGS ${CKVECFLAGS}) -endif() \ No newline at end of file +endif() diff --git a/config/zen2/make_defs.cmake b/config/zen2/make_defs.cmake index 2e2a7ad4c9..781c82b6a8 100644 --- a/config/zen2/make_defs.cmake +++ b/config/zen2/make_defs.cmake @@ -73,4 +73,4 @@ endif() # Flags specific to reference kernels. set(CROPTFLAGS ${CKOPTFLAGS}) -set(CRVECFLAGS ${CKVECFLAGS}) \ No newline at end of file +set(CRVECFLAGS ${CKVECFLAGS}) diff --git a/config/zen3/make_defs.cmake b/config/zen3/make_defs.cmake index 85a42106c4..706c5bb4b7 100644 --- a/config/zen3/make_defs.cmake +++ b/config/zen3/make_defs.cmake @@ -87,4 +87,4 @@ endif() # Flags specific to reference kernels. set(CROPTFLAGS ${CKOPTFLAGS}) -set(CRVECFLAGS ${CKVECFLAGS}) \ No newline at end of file +set(CRVECFLAGS ${CKVECFLAGS}) diff --git a/config/zen4/make_defs.cmake b/config/zen4/make_defs.cmake index 68dcc4b727..422e5548a9 100644 --- a/config/zen4/make_defs.cmake +++ b/config/zen4/make_defs.cmake @@ -109,4 +109,4 @@ endif() # Flags specific to reference kernels. 
set(CROPTFLAGS ${CKOPTFLAGS}) -set(CRVECFLAGS ${CKVECFLAGS}) \ No newline at end of file +set(CRVECFLAGS ${CKVECFLAGS}) diff --git a/docs/CMakeBuildSystem.md b/docs/CMakeBuildSystem.md index cee9f5a86d..7e669c6b3d 100644 --- a/docs/CMakeBuildSystem.md +++ b/docs/CMakeBuildSystem.md @@ -214,4 +214,4 @@ cmake .. -G "Visual Studio 17 2022" -TClangCl -DENABLE_THREADING=no -DINT_SIZE=6 ## Conclusion -The BLIS CMake system is developed and maintained by AMD. You can contact us on the email-id toolchainsupport@amd.com. You can also raise any issue/suggestion on the git-hub repository at https://github.com/amd/blis/issues. \ No newline at end of file +The BLIS CMake system is developed and maintained by AMD. You can contact us on the email-id toolchainsupport@amd.com. You can also raise any issue/suggestion on the git-hub repository at https://github.com/amd/blis/issues. diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 6250405995..6620000b7a 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -107,4 +107,4 @@ #include "bli_l3_smart_threading.h" // BLAS Extension API - Compute -#include "bli_l3_compute.h" \ No newline at end of file +#include "bli_l3_compute.h" diff --git a/frame/3/bli_l3_compute.h b/frame/3/bli_l3_compute.h index 9fb0b71c36..5d7d2efa20 100644 --- a/frame/3/bli_l3_compute.h +++ b/frame/3/bli_l3_compute.h @@ -77,4 +77,4 @@ void PASTEMAC( ch, varname ) \ thrinfo_t* restrict thread \ ); -INSERT_GENTPROT_BASIC0( gemm_compute ) \ No newline at end of file +INSERT_GENTPROT_BASIC0( gemm_compute ) diff --git a/frame/CMakeLists.txt b/frame/CMakeLists.txt index 59e8142cc4..d7ad73943e 100644 --- a/frame/CMakeLists.txt +++ b/frame/CMakeLists.txt @@ -97,4 +97,4 @@ if(BUILD_SHARED_LIBS) endif() add_dependencies(FRAME flat-header) # Put all those targets under object-libs-targets folder name so that they appear all together in IDE. 
-set_target_properties(FRAME PROPERTIES FOLDER object-libs-targets) \ No newline at end of file +set_target_properties(FRAME PROPERTIES FOLDER object-libs-targets) diff --git a/frame/compat/bla_gemm_compute.c b/frame/compat/bla_gemm_compute.c index 7d2475641b..8d9f3697b9 100644 --- a/frame/compat/bla_gemm_compute.c +++ b/frame/compat/bla_gemm_compute.c @@ -295,4 +295,4 @@ void dgemm_compute_ beta, c, &rs_c, ldc ); } -#endif \ No newline at end of file +#endif diff --git a/frame/compat/bla_gemm_compute.h b/frame/compat/bla_gemm_compute.h index c50e5b884d..820df10d5c 100644 --- a/frame/compat/bla_gemm_compute.h +++ b/frame/compat/bla_gemm_compute.h @@ -69,4 +69,4 @@ BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \ ftype* c, const f77_int* rs_c, const f77_int* cs_c \ ); -INSERT_GENTPROTRO_BLAS( gemm_compute ) \ No newline at end of file +INSERT_GENTPROTRO_BLAS( gemm_compute ) diff --git a/frame/compat/bla_gemm_pack.h b/frame/compat/bla_gemm_pack.h index 1621bfc70a..af5a8b948d 100644 --- a/frame/compat/bla_gemm_pack.h +++ b/frame/compat/bla_gemm_pack.h @@ -70,4 +70,4 @@ BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \ ftype* dest \ ); -INSERT_GENTPROTRO_BLAS( gemm_pack ) \ No newline at end of file +INSERT_GENTPROTRO_BLAS( gemm_pack ) diff --git a/frame/compat/bla_gemm_pack_get_size.h b/frame/compat/bla_gemm_pack_get_size.h index 42c4a22072..60360984fb 100644 --- a/frame/compat/bla_gemm_pack_get_size.h +++ b/frame/compat/bla_gemm_pack_get_size.h @@ -60,4 +60,4 @@ BLIS_EXPORT_BLAS f77_int PASTEF77S(ch,blasname) \ const f77_int* pk \ ); -INSERT_GENTPROTRO_BLAS( gemm_pack_get_size ) \ No newline at end of file +INSERT_GENTPROTRO_BLAS( gemm_pack_get_size ) diff --git a/frame/compat/cblas/src/cblas_dgemm_compute.c b/frame/compat/cblas/src/cblas_dgemm_compute.c index ed55f8a805..0afc56dead 100644 --- a/frame/compat/cblas/src/cblas_dgemm_compute.c +++ b/frame/compat/cblas/src/cblas_dgemm_compute.c @@ -169,4 +169,4 @@ BLIS_EXPORT_BLAS void cblas_dgemm_compute( enum CBLAS_ORDER 
Order, } return; } -#endif \ No newline at end of file +#endif diff --git a/frame/compat/cblas/src/cblas_dgemm_pack.c b/frame/compat/cblas/src/cblas_dgemm_pack.c index 9ddba3bcaa..8356959682 100644 --- a/frame/compat/cblas/src/cblas_dgemm_pack.c +++ b/frame/compat/cblas/src/cblas_dgemm_pack.c @@ -154,4 +154,4 @@ BLIS_EXPORT_BLAS void cblas_dgemm_pack( enum CBLAS_ORDER Order, RowMajorStrg = 0; return; } -#endif \ No newline at end of file +#endif diff --git a/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c b/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c index 5001ed15a8..cfad64fa9b 100644 --- a/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c +++ b/frame/compat/cblas/src/cblas_dgemm_pack_get_size.c @@ -80,4 +80,4 @@ f77_int cblas_dgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier, AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 ); return tbytes; } -#endif \ No newline at end of file +#endif diff --git a/frame/compat/cblas/src/cblas_sgemm_pack.c b/frame/compat/cblas/src/cblas_sgemm_pack.c index 39a6e055fe..e3694dbd69 100644 --- a/frame/compat/cblas/src/cblas_sgemm_pack.c +++ b/frame/compat/cblas/src/cblas_sgemm_pack.c @@ -154,4 +154,4 @@ BLIS_EXPORT_BLAS void cblas_sgemm_pack( enum CBLAS_ORDER Order, RowMajorStrg = 0; return; } -#endif \ No newline at end of file +#endif diff --git a/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c b/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c index bf82bb104b..99c145a6be 100644 --- a/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c +++ b/frame/compat/cblas/src/cblas_sgemm_pack_get_size.c @@ -80,4 +80,4 @@ f77_int cblas_sgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier, AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 ); return tbytes; } -#endif \ No newline at end of file +#endif diff --git a/frame/thread/bli_l3_compute_decor.h b/frame/thread/bli_l3_compute_decor.h index 4ed611b333..81add795b0 100644 --- a/frame/thread/bli_l3_compute_decor.h +++ b/frame/thread/bli_l3_compute_decor.h @@ -64,4 +64,4 @@ void 
bli_l3_compute_thread_decorator #include "bli_l3_compute_decor_openmp.h" // #include "bli_l3_compute_decor_pthreads.h" -#endif \ No newline at end of file +#endif diff --git a/frame/thread/bli_l3_compute_decor_single.c b/frame/thread/bli_l3_compute_decor_single.c index 8bd6e5ffc2..6eae2220e2 100644 --- a/frame/thread/bli_l3_compute_decor_single.c +++ b/frame/thread/bli_l3_compute_decor_single.c @@ -87,4 +87,4 @@ void bli_l3_compute_thread_decorator } -#endif \ No newline at end of file +#endif diff --git a/frame/thread/bli_l3_compute_decor_single.h b/frame/thread/bli_l3_compute_decor_single.h index 7b5d6fee3c..307b3e593b 100644 --- a/frame/thread/bli_l3_compute_decor_single.h +++ b/frame/thread/bli_l3_compute_decor_single.h @@ -40,4 +40,4 @@ #endif -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/README.md b/gtestsuite/README.md index b9d3bc44b3..b5d801e56f 100644 --- a/gtestsuite/README.md +++ b/gtestsuite/README.md @@ -393,4 +393,4 @@ Visual Studio is a multiconfig generator. That means that it can build for `Rele $ cd Release $ testsuite.level1.addv.exe ``` -Then, you can use filters in the same way if you need to. \ No newline at end of file +Then, you can use filters in the same way if you need to. 
diff --git a/gtestsuite/testinghelpers/CMakeLists.txt b/gtestsuite/testinghelpers/CMakeLists.txt index 264631e679..c6cca616ed 100644 --- a/gtestsuite/testinghelpers/CMakeLists.txt +++ b/gtestsuite/testinghelpers/CMakeLists.txt @@ -65,4 +65,4 @@ else() endif() target_link_libraries(testinghelpers PUBLIC ${threads_spec}) set_target_properties(testinghelpers PROPERTIES POSITION_INDEPENDENT_CODE ON) -endif() \ No newline at end of file +endif() diff --git a/gtestsuite/testinghelpers/inc/common/refCBLAS.h b/gtestsuite/testinghelpers/inc/common/refCBLAS.h index f483a76e60..0d64594117 100644 --- a/gtestsuite/testinghelpers/inc/common/refCBLAS.h +++ b/gtestsuite/testinghelpers/inc/common/refCBLAS.h @@ -74,4 +74,4 @@ class refCBLAS }; } //end of testinghelpers namespace -extern thread_local testinghelpers::refCBLAS refCBLASModule; \ No newline at end of file +extern thread_local testinghelpers::refCBLAS refCBLASModule; diff --git a/gtestsuite/testinghelpers/src/common/data_generators.cpp b/gtestsuite/testinghelpers/src/common/data_generators.cpp index 8ed6416836..9edf5b5cc8 100644 --- a/gtestsuite/testinghelpers/src/common/data_generators.cpp +++ b/gtestsuite/testinghelpers/src/common/data_generators.cpp @@ -527,4 +527,4 @@ template void testinghelpers::set_matrix( char, gtint_t, gtint_t, dcom template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, float, float* ); template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, double, double* ); template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, scomplex, scomplex* ); -template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, dcomplex, dcomplex* ); \ No newline at end of file +template void testinghelpers::set_ev_mat( char, char, gtint_t, gtint_t, gtint_t, dcomplex, dcomplex* ); diff --git a/gtestsuite/testinghelpers/src/common/testing_basics.cpp b/gtestsuite/testinghelpers/src/common/testing_basics.cpp index 
6f3c2b8f9c..5deec8e5a4 100644 --- a/gtestsuite/testinghelpers/src/common/testing_basics.cpp +++ b/gtestsuite/testinghelpers/src/common/testing_basics.cpp @@ -689,4 +689,4 @@ template std::string testinghelpers::get_value_string( double ); template std::string testinghelpers::get_value_string( scomplex ); template std::string testinghelpers::get_value_string( dcomplex ); -} //end of namespace testinghelpers \ No newline at end of file +} //end of namespace testinghelpers diff --git a/gtestsuite/testsuite/level1/addv/addv.h b/gtestsuite/testsuite/level1/addv/addv.h index e28a91a99d..ed392dedc5 100644 --- a/gtestsuite/testsuite/level1/addv/addv.h +++ b/gtestsuite/testsuite/level1/addv/addv.h @@ -79,4 +79,4 @@ static void addv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) #else throw std::runtime_error("Error in testsuite/level1/addv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/addv/caddv_generic.cpp b/gtestsuite/testsuite/level1/addv/caddv_generic.cpp index 0cbf65b466..fe72eee37c 100644 --- a/gtestsuite/testsuite/level1/addv/caddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/caddv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::caddvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/addv/daddv_generic.cpp b/gtestsuite/testsuite/level1/addv/daddv_generic.cpp index c700131423..40ac621290 100644 --- a/gtestsuite/testsuite/level1/addv/daddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/daddv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::daddvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/addv/saddv_generic.cpp b/gtestsuite/testsuite/level1/addv/saddv_generic.cpp index 4b4820e8c6..8dbdd7e3ea 100644 --- a/gtestsuite/testsuite/level1/addv/saddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/saddv_generic.cpp @@ -98,4 +98,4 
@@ INSTANTIATE_TEST_SUITE_P( ), ::saddvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/addv/test_addv.h b/gtestsuite/testsuite/level1/addv/test_addv.h index cf9cfd86b4..25c93ac99e 100644 --- a/gtestsuite/testsuite/level1/addv/test_addv.h +++ b/gtestsuite/testsuite/level1/addv/test_addv.h @@ -67,4 +67,4 @@ void test_addv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh // Compute component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp b/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp index df4d60beb3..7fde610664 100644 --- a/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp +++ b/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ZAddvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/amaxv/amaxv.h b/gtestsuite/testsuite/level1/amaxv/amaxv.h index 04f76e42f3..4479263e2b 100644 --- a/gtestsuite/testsuite/level1/amaxv/amaxv.h +++ b/gtestsuite/testsuite/level1/amaxv/amaxv.h @@ -114,4 +114,4 @@ static gtint_t amaxv(gtint_t n, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/amaxv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp index 27799b0965..1f553cefef 100644 --- a/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp @@ -107,4 +107,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::camaxvGenericTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp 
b/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp index 1410daefa0..7646911796 100644 --- a/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp @@ -107,4 +107,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::damaxvGenericTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp index acd0f38bb7..111d51423f 100644 --- a/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp @@ -107,4 +107,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::samaxvGenericTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp index b6b1155273..9c35ed502b 100644 --- a/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp +++ b/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp @@ -107,4 +107,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(2), gtint_t(11)) // stride size for x ), ::zamaxvGenericTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level1/axpbyv/axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/axpbyv.h index 7d955cd7e7..0c415e1b0c 100644 --- a/gtestsuite/testsuite/level1/axpbyv/axpbyv.h +++ b/gtestsuite/testsuite/level1/axpbyv/axpbyv.h @@ -111,4 +111,4 @@ static void axpbyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T beta, #else throw std::runtime_error("Error in testsuite/level1/axpbyv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp index bb277c300a..93f71b3412 100644 --- 
a/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp @@ -166,4 +166,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::caxpbyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp index 181466bf6e..96d94cf887 100644 --- a/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp @@ -179,4 +179,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::daxpbyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp index 80f1fc478d..a9aeb9f5a8 100644 --- a/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp @@ -175,4 +175,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::saxpbyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h index 973f8ebab4..7c6bf72eb0 100644 --- a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h +++ b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h @@ -100,4 +100,4 @@ static void test_axpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh, true ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp index 5b3f251851..104b5d59c1 100644 --- a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_evt_testing.cpp @@ -369,4 +369,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{NaN, 2.3}, dcomplex{Inf, 0.0}, dcomplex{-Inf, NaN}), // alpha ::testing::Values(dcomplex{-0.9, NaN}, dcomplex{0.0, -Inf}, dcomplex{NaN, Inf}) // beta ), - ::zaxpbyvEVTVecPrint()); \ No newline at end of file + ::zaxpbyvEVTVecPrint()); diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp index 83cd127b77..b69a132796 100644 --- a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp @@ -198,4 +198,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{2.2, -3.3}), // alpha ::testing::Values(dcomplex{0.0, 0.0}, dcomplex{1.0, 0.0}, dcomplex{1.0, 2.0}) // beta ), - ::zaxpbyvAccTestPrint()); \ No newline at end of file + ::zaxpbyvAccTestPrint()); diff --git a/gtestsuite/testsuite/level1/axpyv/axpyv.h b/gtestsuite/testsuite/level1/axpyv/axpyv.h index 9081da1051..10e56cae15 100644 --- a/gtestsuite/testsuite/level1/axpyv/axpyv.h +++ b/gtestsuite/testsuite/level1/axpyv/axpyv.h @@ -110,4 +110,4 @@ static void axpyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gti #else throw std::runtime_error("Error in testsuite/level1/axpyv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp index 4cd74f4dc8..ad4db3c95b 100644 --- 
a/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp @@ -156,4 +156,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::caxpyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp index 69e69f8c6e..19d65ed5a3 100644 --- a/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp @@ -165,4 +165,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::daxpyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp index 437518c498..10c1daefa2 100644 --- a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp @@ -165,4 +165,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::saxpyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/axpyv/test_axpyv.h b/gtestsuite/testsuite/level1/axpyv/test_axpyv.h index 90f757ef7b..1cc375da00 100644 --- a/gtestsuite/testsuite/level1/axpyv/test_axpyv.h +++ b/gtestsuite/testsuite/level1/axpyv/test_axpyv.h @@ -68,4 +68,4 @@ static void test_axpyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp index d88596c881..64b98f1b04 100644 --- a/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp +++ b/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp @@ -155,4 +155,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zaxpyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp index beb0aced0c..29f988005b 100644 --- a/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp @@ -147,4 +147,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ccopyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/copyv/copyv.h b/gtestsuite/testsuite/level1/copyv/copyv.h index bd0298bc89..cc8bf85af0 100644 --- a/gtestsuite/testsuite/level1/copyv/copyv.h +++ b/gtestsuite/testsuite/level1/copyv/copyv.h @@ -109,4 +109,4 @@ static void copyv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) #else throw std::runtime_error("Error in testsuite/level1/copyv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp index 7957b02d01..1c7824b8f4 100644 --- a/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp @@ -156,4 +156,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dcopyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp index ca2c591b2f..e86d2f320f 100644 --- 
a/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp @@ -156,4 +156,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::scopyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/copyv/test_copyv.h b/gtestsuite/testsuite/level1/copyv/test_copyv.h index 00f1995dd0..6ab5a12bca 100644 --- a/gtestsuite/testsuite/level1/copyv/test_copyv.h +++ b/gtestsuite/testsuite/level1/copyv/test_copyv.h @@ -68,4 +68,4 @@ static void test_copyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, doubl // Compute error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp index 3bd3aa64c7..eeb9b13e37 100644 --- a/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp +++ b/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp @@ -147,4 +147,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zcopyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp index 1f21f8433a..0a662d96b4 100644 --- a/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp @@ -163,4 +163,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::cdotvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotv/dotv.h b/gtestsuite/testsuite/level1/dotv/dotv.h index 2120b40ea8..7917868e56 100644 --- a/gtestsuite/testsuite/level1/dotv/dotv.h +++ b/gtestsuite/testsuite/level1/dotv/dotv.h @@ -122,4 +122,4 @@ static void dotv(char conjx, char conjy, gtint_t n, #else throw std::runtime_error("Error in testsuite/level1/dotv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git 
a/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp index 9f59e2ea00..9d69ac6e7a 100644 --- a/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp @@ -165,4 +165,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::sdotvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotv/test_dotv.h b/gtestsuite/testsuite/level1/dotv/test_dotv.h index fa5abb5270..3f9610f7da 100644 --- a/gtestsuite/testsuite/level1/dotv/test_dotv.h +++ b/gtestsuite/testsuite/level1/dotv/test_dotv.h @@ -73,4 +73,4 @@ static void test_dotv( char conjx, char conjy, gtint_t n, gtint_t incx, // Compute error. //---------------------------------------------------------- computediff( rho, rho_ref, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp index e37b3faa32..7d7d3aabd0 100644 --- a/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp @@ -163,4 +163,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zdotvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp index e4ed5e636b..5ed6f67d96 100644 --- a/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp @@ -155,4 +155,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::cdotxvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp index 9ee8be98b8..75376ed4b9 100644 --- a/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp @@ -155,4 +155,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ddotxvGenericTestPrint() 
); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotxv/dotxv.h b/gtestsuite/testsuite/level1/dotxv/dotxv.h index 91a13400fc..3bb01ad0a0 100644 --- a/gtestsuite/testsuite/level1/dotxv/dotxv.h +++ b/gtestsuite/testsuite/level1/dotxv/dotxv.h @@ -85,4 +85,4 @@ static void dotxv( char conjx, char conjy, gtint_t n, T* alpha, #else throw std::runtime_error("Error in testsuite/level1/dotxv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp index 4dd80401e3..9ee47c18a7 100644 --- a/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp @@ -155,4 +155,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::sdotxvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h index 6562e3dc46..729e172b8f 100644 --- a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h +++ b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h @@ -72,4 +72,4 @@ static void test_dotxv( gtint_t n, char conjx, char conjy, T alpha, // Compute error. 
//---------------------------------------------------------- computediff( rho, rho_ref, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp index 652c5d030c..10bfcac45f 100644 --- a/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp +++ b/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp @@ -139,4 +139,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zdotxvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp index 5d582ce7ce..e9c1d53189 100644 --- a/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp @@ -128,4 +128,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::cscal2vGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp index 790e8dc0ee..66b624c382 100644 --- a/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp @@ -142,4 +142,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dscal2vGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scal2v/scal2v.h b/gtestsuite/testsuite/level1/scal2v/scal2v.h index b90b2d9eef..ad1383b712 100644 --- a/gtestsuite/testsuite/level1/scal2v/scal2v.h +++ b/gtestsuite/testsuite/level1/scal2v/scal2v.h @@ -80,4 +80,4 @@ static void scal2v(char conjx, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gti #else throw std::runtime_error("Error in testsuite/level1/scal2v.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp index f28670b0ef..366d649ead 100644 --- 
a/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp @@ -143,4 +143,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::sscal2vGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scal2v/test_scal2v.h b/gtestsuite/testsuite/level1/scal2v/test_scal2v.h index c582688340..9cb621acb6 100644 --- a/gtestsuite/testsuite/level1/scal2v/test_scal2v.h +++ b/gtestsuite/testsuite/level1/scal2v/test_scal2v.h @@ -67,4 +67,4 @@ static void test_scal2v(char conjx, gtint_t n, gtint_t incx, gtint_t incy, T alp // Compute component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp index 0619265732..5c413192d6 100644 --- a/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp +++ b/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp @@ -129,4 +129,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zscal2vGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp index eb4a03580f..bf367f73d8 100644 --- a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::cscalvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp index f2a08f340d..b73db053c6 100644 --- a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp @@ -156,4 +156,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dscalvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git 
a/gtestsuite/testsuite/level1/scalv/scalv.h b/gtestsuite/testsuite/level1/scalv/scalv.h index a23fb24e5f..0ae0125f52 100644 --- a/gtestsuite/testsuite/level1/scalv/scalv.h +++ b/gtestsuite/testsuite/level1/scalv/scalv.h @@ -109,4 +109,4 @@ static void scalv(char conj_alpha, gtint_t n, T alpha, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/scalv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp index 3e5cf70b1e..9ac6c0d4ed 100644 --- a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp +++ b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp @@ -87,4 +87,4 @@ TYPED_TEST(xscalv, zero_alpha_x_inf) // Set the threshold for the errors: double thresh = testinghelpers::getEpsilon(); computediff( n, x.data(), x_ref.data(), incx, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp index 1bcdd90903..e00f5effa2 100644 --- a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp @@ -157,4 +157,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::sscalvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/scalv/test_scalv.h b/gtestsuite/testsuite/level1/scalv/test_scalv.h index a90405d7c6..4c5437d722 100644 --- a/gtestsuite/testsuite/level1/scalv/test_scalv.h +++ b/gtestsuite/testsuite/level1/scalv/test_scalv.h @@ -66,4 +66,4 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, T alpha, doub // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, x.data(), x_ref.data(), incx, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp index 6336a121cc..66419cbd4c 100644 --- a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zscalvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/setv/csetv_generic.cpp b/gtestsuite/testsuite/level1/setv/csetv_generic.cpp index 2d6a9d8320..2a2daf72fd 100644 --- a/gtestsuite/testsuite/level1/setv/csetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/csetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::csetvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp b/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp index 8a9bef8184..6051169bbc 100644 --- a/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dsetvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/setv/setv.h b/gtestsuite/testsuite/level1/setv/setv.h index 08a277dedb..651ec36b90 100644 --- a/gtestsuite/testsuite/level1/setv/setv.h +++ b/gtestsuite/testsuite/level1/setv/setv.h @@ -77,4 +77,4 @@ static void setv(char conjalpha, gtint_t n, T* alpha, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/setv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp b/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp index 2c94385e1e..2590619ea2 100644 --- a/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp +++ 
b/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ssetvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/setv/test_setv.h b/gtestsuite/testsuite/level1/setv/test_setv.h index e5521aafe8..da98788ecc 100644 --- a/gtestsuite/testsuite/level1/setv/test_setv.h +++ b/gtestsuite/testsuite/level1/setv/test_setv.h @@ -72,4 +72,4 @@ void test_setv( char conjalpha, gtint_t n, T alpha, gtint_t incx ) i = (incx > 0) ? (idx * incx) : ( - ( n - idx - 1 ) * incx ); EXPECT_EQ(x[i], alpha_ref) << "blis_sol[" << i << "]="<< x[i] <<" ref = " << alpha_ref; } -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp b/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp index e54bdfa887..d12271612f 100644 --- a/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp +++ b/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp @@ -90,4 +90,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zsetvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/subv/csubv_generic.cpp b/gtestsuite/testsuite/level1/subv/csubv_generic.cpp index c61b27e4ae..70797d5e5a 100644 --- a/gtestsuite/testsuite/level1/subv/csubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/csubv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::csubvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp b/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp index f34f4f28a3..63a63a9274 100644 --- a/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dsubvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp b/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp index 5447b08699..50e004cb07 
100644 --- a/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ssubvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/subv/subv.h b/gtestsuite/testsuite/level1/subv/subv.h index f0a9da4c65..ff5059d6ff 100644 --- a/gtestsuite/testsuite/level1/subv/subv.h +++ b/gtestsuite/testsuite/level1/subv/subv.h @@ -78,4 +78,4 @@ static void subv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) #else throw std::runtime_error("Error in testsuite/level1/subv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/subv/test_subv.h b/gtestsuite/testsuite/level1/subv/test_subv.h index b61b1c50eb..ffdf86a3db 100644 --- a/gtestsuite/testsuite/level1/subv/test_subv.h +++ b/gtestsuite/testsuite/level1/subv/test_subv.h @@ -67,4 +67,4 @@ void test_subv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, double thresh // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp b/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp index 270c2a1c83..f4e634f4c5 100644 --- a/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp +++ b/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp @@ -98,4 +98,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zsubvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp index fef51802f4..079867f1f4 100644 --- a/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp @@ -146,4 +146,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dxpbyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp index 7c9120e276..fe33a81cb8 100644 --- a/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp @@ -145,4 +145,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::sxpbyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h b/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h index 5b1534582e..1694c2149d 100644 --- a/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h +++ b/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h @@ -68,4 +68,4 @@ static void test_xpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy, // Compute component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/xpbyv/xpbyv.h b/gtestsuite/testsuite/level1/xpbyv/xpbyv.h index 21212f6834..2b3a15fbd5 100644 --- a/gtestsuite/testsuite/level1/xpbyv/xpbyv.h +++ b/gtestsuite/testsuite/level1/xpbyv/xpbyv.h @@ -79,4 +79,4 @@ static void xpbyv(char conj_x, gtint_t n, T* x, gtint_t incx, T beta, T* y, gtin #else throw std::runtime_error("Error in testsuite/level1/xpbyv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp index e648e83f0d..04b781da8c 100644 --- a/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp +++ b/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp @@ -128,4 +128,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zxpbyvGenericTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp index 5403ca19fc..8ba1f7a429 100644 --- a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp @@ -147,4 +147,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::cgemvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp index 79249202d1..33cc9fa57b 100644 --- a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp @@ -145,4 +145,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::dgemvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/gemv/gemv.h 
b/gtestsuite/testsuite/level2/gemv/gemv.h index d7d66d6264..d6cc12f2db 100644 --- a/gtestsuite/testsuite/level2/gemv/gemv.h +++ b/gtestsuite/testsuite/level2/gemv/gemv.h @@ -147,4 +147,4 @@ static void gemv( char storage, char trans, char conj_x, gtint_t m, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/gemv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/gemv/test_gemv.h b/gtestsuite/testsuite/level2/gemv/test_gemv.h index 7175b07fc2..76f8970294 100644 --- a/gtestsuite/testsuite/level2/gemv/test_gemv.h +++ b/gtestsuite/testsuite/level2/gemv/test_gemv.h @@ -77,4 +77,4 @@ void test_gemv( char storage, char trnsa, char conjx, gtint_t m, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( leny, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp index 44903e9347..8c27717111 100644 --- a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp +++ b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp @@ -147,4 +147,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0)) // increment to the leading dim of a ), ::zgemvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/ger/cger_generic.cpp b/gtestsuite/testsuite/level2/ger/cger_generic.cpp index b3bad3620e..024ac6d4da 100644 --- a/gtestsuite/testsuite/level2/ger/cger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/cger_generic.cpp @@ -139,4 +139,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::cgerTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/ger/dger_generic.cpp b/gtestsuite/testsuite/level2/ger/dger_generic.cpp index d25e5bd16f..1fd5efa4f2 100644 --- 
a/gtestsuite/testsuite/level2/ger/dger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/dger_generic.cpp @@ -138,4 +138,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::dgerTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/ger/ger.h b/gtestsuite/testsuite/level2/ger/ger.h index f211c4cbba..c6747f6c7a 100644 --- a/gtestsuite/testsuite/level2/ger/ger.h +++ b/gtestsuite/testsuite/level2/ger/ger.h @@ -155,4 +155,4 @@ static void ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/ger.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/ger/sger_generic.cpp b/gtestsuite/testsuite/level2/ger/sger_generic.cpp index 7298224040..37c832759d 100644 --- a/gtestsuite/testsuite/level2/ger/sger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/sger_generic.cpp @@ -138,4 +138,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::sgerTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/ger/test_ger.h b/gtestsuite/testsuite/level2/ger/test_ger.h index 13ef4f7596..3e8e7646d8 100644 --- a/gtestsuite/testsuite/level2/ger/test_ger.h +++ b/gtestsuite/testsuite/level2/ger/test_ger.h @@ -73,4 +73,4 @@ void test_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, a.data(), a_ref.data(), lda, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/ger/zger_generic.cpp b/gtestsuite/testsuite/level2/ger/zger_generic.cpp index b5fd790703..5847842c30 100644 --- a/gtestsuite/testsuite/level2/ger/zger_generic.cpp +++ b/gtestsuite/testsuite/level2/ger/zger_generic.cpp @@ -139,4 +139,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::zgerTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp index 33aebd8125..ed4b726817 100644 --- a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp +++ b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp @@ -146,4 +146,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::chemvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/hemv/hemv.h b/gtestsuite/testsuite/level2/hemv/hemv.h index 7dbf7a961f..90086336a7 100644 --- a/gtestsuite/testsuite/level2/hemv/hemv.h +++ b/gtestsuite/testsuite/level2/hemv/hemv.h @@ -135,4 +135,4 @@ static void hemv( char storage, char uploa, char conja, char conjx, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/hemv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/hemv/test_hemv.h b/gtestsuite/testsuite/level2/hemv/test_hemv.h index a5018701af..a7243cbd2e 100644 --- a/gtestsuite/testsuite/level2/hemv/test_hemv.h +++ b/gtestsuite/testsuite/level2/hemv/test_hemv.h @@ -75,4 +75,4 @@ void test_hemv( char storage, char uploa, char conja, char conjx, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp index 8e116b186e..81ee763b24 100644 --- a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp +++ b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp @@ -146,4 +146,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::zhemvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/her/cher_generic.cpp b/gtestsuite/testsuite/level2/her/cher_generic.cpp index 9ad83a597f..8be6c2ed49 100644 --- a/gtestsuite/testsuite/level2/her/cher_generic.cpp +++ b/gtestsuite/testsuite/level2/her/cher_generic.cpp @@ -125,4 +125,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::cherTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/her/her.h b/gtestsuite/testsuite/level2/her/her.h index a21d907008..ea7d3008c7 100644 --- a/gtestsuite/testsuite/level2/her/her.h +++ b/gtestsuite/testsuite/level2/her/her.h @@ -123,4 +123,4 @@ static void her( char storage, char uploa, char conj_x, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/her.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/her/test_her.h b/gtestsuite/testsuite/level2/her/test_her.h index b0975b2ad1..db41652975 100644 --- a/gtestsuite/testsuite/level2/her/test_her.h +++ b/gtestsuite/testsuite/level2/her/test_her.h @@ -72,4 +72,4 @@ void test_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/her/zher_generic.cpp b/gtestsuite/testsuite/level2/her/zher_generic.cpp index 198e0a3bdb..8db149caa5 100644 --- a/gtestsuite/testsuite/level2/her/zher_generic.cpp +++ b/gtestsuite/testsuite/level2/her/zher_generic.cpp @@ -125,4 +125,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::zherTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp index 4df3e6dda3..f6bbd15a06 100644 --- a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp +++ b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp @@ -138,4 +138,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::cher2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/her2/her2.h b/gtestsuite/testsuite/level2/her2/her2.h index d68d7e4f7d..759b2d90d2 100644 --- a/gtestsuite/testsuite/level2/her2/her2.h +++ b/gtestsuite/testsuite/level2/her2/her2.h @@ -128,4 +128,4 @@ static void her2( char storage, char uploa, char conj_x, char conj_y, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/her2.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/her2/test_her2.h b/gtestsuite/testsuite/level2/her2/test_her2.h index 487454ae9d..b0802d64b4 100644 --- a/gtestsuite/testsuite/level2/her2/test_her2.h +++ b/gtestsuite/testsuite/level2/her2/test_her2.h @@ -75,4 +75,4 @@ void test_her2( char storage, char uploa, char conjx, char conjy, gtint_t n, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp index 19723abd6f..acd8b4465a 100644 --- a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp +++ b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp @@ -138,4 +138,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::zher2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp index 0e959e759b..a62f20996d 100644 --- a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp +++ b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp @@ -144,4 +144,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dsymvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp index 11ac8d71e8..d83d75b7dc 100644 --- a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp +++ b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp @@ -144,4 +144,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::ssymvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/symv/symv.h b/gtestsuite/testsuite/level2/symv/symv.h index 78a7aaf0a1..2d77b25de4 100644 --- a/gtestsuite/testsuite/level2/symv/symv.h +++ b/gtestsuite/testsuite/level2/symv/symv.h @@ -130,4 +130,4 @@ static void symv( char storage, char uploa, char conja, char conjx, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/symv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git 
a/gtestsuite/testsuite/level2/symv/test_symv.h b/gtestsuite/testsuite/level2/symv/test_symv.h index 789caecbae..f0df77c18b 100644 --- a/gtestsuite/testsuite/level2/symv/test_symv.h +++ b/gtestsuite/testsuite/level2/symv/test_symv.h @@ -75,4 +75,4 @@ void test_symv( char storage, char uploa, char conja, char conjx, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( n, y.data(), y_ref.data(), incy, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp index 784fa63ca6..3d755586a8 100644 --- a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp +++ b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp @@ -125,4 +125,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dsyrTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp index 3fb8a17570..446c2f4743 100644 --- a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp +++ b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp @@ -125,4 +125,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::ssyrTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/syr/syr.h b/gtestsuite/testsuite/level2/syr/syr.h index dad1b9f278..e16d5c5322 100644 --- a/gtestsuite/testsuite/level2/syr/syr.h +++ b/gtestsuite/testsuite/level2/syr/syr.h @@ -125,4 +125,4 @@ static void syr( char storage, char uploa, char conj_x, gtint_t n, T* alpha, #else throw std::runtime_error("Error in testsuite/level2/syr.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/syr/test_syr.h b/gtestsuite/testsuite/level2/syr/test_syr.h index 3a62dd371a..125445fa19 100644 --- 
a/gtestsuite/testsuite/level2/syr/test_syr.h +++ b/gtestsuite/testsuite/level2/syr/test_syr.h @@ -72,4 +72,4 @@ void test_syr( char storage, char uploa, char conjx, gtint_t n, T alpha, // check component-wise error. //---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp index cbbf06ea84..2a021ea6d8 100644 --- a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp +++ b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp @@ -137,4 +137,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dsyr2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp index 261921746e..75df2d0367 100644 --- a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp +++ b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp @@ -137,4 +137,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::ssyr2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/syr2/syr2.h b/gtestsuite/testsuite/level2/syr2/syr2.h index 622bd0edd8..dd51b5497b 100644 --- a/gtestsuite/testsuite/level2/syr2/syr2.h +++ b/gtestsuite/testsuite/level2/syr2/syr2.h @@ -128,4 +128,4 @@ static void syr2( char storage, char uploa, char conj_x, char conj_y, gtint_t n, #else throw std::runtime_error("Error in testsuite/level2/syr2.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/syr2/test_syr2.h b/gtestsuite/testsuite/level2/syr2/test_syr2.h index 5f4e81f7b6..a4a623b6ea 100644 --- a/gtestsuite/testsuite/level2/syr2/test_syr2.h +++ b/gtestsuite/testsuite/level2/syr2/test_syr2.h @@ 
-75,4 +75,4 @@ void test_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( storage, n, n, a.data(), a_ref.data(), lda, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp index 0c24ba588a..a82fafcc2b 100644 --- a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp @@ -136,4 +136,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of a ), ::ctrmvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp index c825d93be5..e7e9e325b9 100644 --- a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp @@ -135,4 +135,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::dtrmvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp index bd4caad329..470e556814 100644 --- a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp @@ -135,4 +135,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of a ), ::strmvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/trmv/test_trmv.h b/gtestsuite/testsuite/level2/trmv/test_trmv.h index 2ac5c70145..d59f4412f7 100644 --- a/gtestsuite/testsuite/level2/trmv/test_trmv.h +++ b/gtestsuite/testsuite/level2/trmv/test_trmv.h @@ -71,4 +71,4 @@ void test_trmv( char storage, char uploa, char transa, char diaga, gtint_t n, // check 
component-wise error. //---------------------------------------------------------- computediff( n, x.data(), x_ref.data(), incx, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/trmv/trmv.h b/gtestsuite/testsuite/level2/trmv/trmv.h index 38f10dbea8..8ee3750a62 100644 --- a/gtestsuite/testsuite/level2/trmv/trmv.h +++ b/gtestsuite/testsuite/level2/trmv/trmv.h @@ -157,4 +157,4 @@ static void trmv( char storage, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level2/trmv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp index 4e76623824..1fb53d2b7d 100644 --- a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp +++ b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp @@ -136,4 +136,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of a ), ::ztrmvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp index 1652a74e49..1639e7202c 100644 --- a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp @@ -136,4 +136,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of a ), ::ctrsvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp index fb4a8af541..3ebf2f6076 100644 --- a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp @@ -135,4 +135,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of a ), ::dtrsvTestPrint() - ); \ No newline at end of file + ); diff --git 
a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp index 7dcf457134..201223b134 100644 --- a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp @@ -135,4 +135,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(7)) // increment to the leading dim of a ), ::strsvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level2/trsv/test_trsv.h b/gtestsuite/testsuite/level2/trsv/test_trsv.h index c5f8cd61cd..2266397200 100644 --- a/gtestsuite/testsuite/level2/trsv/test_trsv.h +++ b/gtestsuite/testsuite/level2/trsv/test_trsv.h @@ -71,4 +71,4 @@ void test_trsv( char storage, char uploa, char transa, char diaga, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( n, x.data(), x_ref.data(), incx, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/trsv/trsv.h b/gtestsuite/testsuite/level2/trsv/trsv.h index 522ae319fb..65ca33112a 100644 --- a/gtestsuite/testsuite/level2/trsv/trsv.h +++ b/gtestsuite/testsuite/level2/trsv/trsv.h @@ -157,4 +157,4 @@ static void trsv( char storage, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level2/trsv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp index 1cc4fbf34b..dc8b004575 100644 --- a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp +++ b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp @@ -136,4 +136,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of a ), ::ztrsvTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp b/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp index 
debe86a5dc..9e8ea79d4e 100644 --- a/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp +++ b/gtestsuite/testsuite/level3/gemm/IIT_ERS_test.cpp @@ -261,4 +261,4 @@ TYPED_TEST(Gemm_IIT_ERS_Test, k_zero_beta_one) } -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp index 9efea8b5dc..5043dc44a7 100644 --- a/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::CGemmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm/gemm.h b/gtestsuite/testsuite/level3/gemm/gemm.h index 13f8bf6198..907f078848 100644 --- a/gtestsuite/testsuite/level3/gemm/gemm.h +++ b/gtestsuite/testsuite/level3/gemm/gemm.h @@ -164,4 +164,4 @@ static void gemm( char storage, char transa, char transb, gtint_t m, gtint_t n, #else throw std::runtime_error("Error in testsuite/level3/gemm.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp index 6abfbe871f..6e11c8956a 100644 --- a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp @@ -221,4 +221,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(7)) // increment to the leading dim of c ), ::SGemmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp index 4f328a60be..3b0f05ab9b 100644 --- a/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp +++ b/gtestsuite/testsuite/level3/gemm/zgemm_evt_testing.cpp @@ -353,4 +353,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0)) // increment to the 
leading dim of c ), ::ZGemmEVAlphaBetaPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp index 94bb6fb914..6bdb2d63e8 100644 --- a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp @@ -176,4 +176,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::ZGemmAccPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp b/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp index e26b8e9624..a648f53bc1 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm_compute/dgemm_compute_generic.cpp @@ -209,4 +209,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0)) // increment to the leading dim of c ), ::DGemmComputeTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h index b57691dfe3..1d168df634 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h +++ b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute.h @@ -453,4 +453,4 @@ static void gemm_compute( char storage, char transa, char transb, char packa, ch #else throw std::runtime_error("Error in testsuite/level3/gemm_compute.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp index 89c439c6ef..db293c0433 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp +++ b/gtestsuite/testsuite/level3/gemm_compute/gemm_compute_IIT_ERS.cpp @@ -234,4 +234,4 @@ TYPED_TEST(GEMM_Compute_IIT_ERS_Test, n_eq_zero) // Use bitwise 
comparison (no threshold). computediff( STORAGE, N, N, c.data(), c_ref.data(), LDC); } -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp b/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp index a75ac16916..ea574eb723 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp +++ b/gtestsuite/testsuite/level3/gemm_compute/sgemm_compute_generic.cpp @@ -211,4 +211,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0)) // increment to the leading dim of c ), ::SGemmComputeTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h b/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h index 7d1016941b..a9109d5abc 100644 --- a/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h +++ b/gtestsuite/testsuite/level3/gemm_compute/test_gemm_compute.h @@ -76,4 +76,4 @@ void test_gemm_compute( char storage, char trnsa, char trnsb, char pcka, char pc // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp index 39bc5a5472..07aed996bb 100644 --- a/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp @@ -154,4 +154,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::cgemmtTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp index 71d23f2e2b..c31260def4 100644 --- a/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp @@ -151,4 +151,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dgemmtTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/gemmt/gemmt.h b/gtestsuite/testsuite/level3/gemmt/gemmt.h index 062657bd81..a9a92821e0 100644 --- a/gtestsuite/testsuite/level3/gemmt/gemmt.h +++ b/gtestsuite/testsuite/level3/gemmt/gemmt.h @@ -173,4 +173,4 @@ static void gemmt( char storage, char uplo, char transa, char transb, gtint_t n, #else throw std::runtime_error("Error in testsuite/level3/gemmt.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp index 4ac56998e3..e067a684e7 100644 --- a/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp @@ -152,4 +152,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::sgemmtTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h index af67f55565..2afaba222d 100644 --- 
a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h +++ b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h @@ -76,4 +76,4 @@ void test_gemmt( char storage, char uplo, char trnsa, char trnsb, gtint_t n, // check component-wise error. //---------------------------------------------------------- computediff( storage, n, n, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp index de5ec8ba70..7c8a4c8ecf 100644 --- a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp +++ b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp @@ -154,4 +154,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::zgemmtTestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp index 314a320032..173aa8777b 100644 --- a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp +++ b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp @@ -154,4 +154,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::chemmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/hemm/hemm.h b/gtestsuite/testsuite/level3/hemm/hemm.h index 2fae4c3c36..1cc0ca1473 100644 --- a/gtestsuite/testsuite/level3/hemm/hemm.h +++ b/gtestsuite/testsuite/level3/hemm/hemm.h @@ -164,4 +164,4 @@ static void hemm( char storage, char side, char uplo, char conja, char transb, g #else throw std::runtime_error("Error in testsuite/level3/hemm.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/hemm/test_hemm.h b/gtestsuite/testsuite/level3/hemm/test_hemm.h index 7b1cbf4d15..a55510bf04 100644 --- a/gtestsuite/testsuite/level3/hemm/test_hemm.h +++ b/gtestsuite/testsuite/level3/hemm/test_hemm.h @@ -80,4 +80,4 @@ void test_hemm( char storage, char 
side, char uplo, char conja, char transb, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp index 4ab063bb91..f509cb8881 100644 --- a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp +++ b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp @@ -154,4 +154,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(6)) // increment to the leading dim of c ), ::zhemmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp index c256096221..b87a833950 100644 --- a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp +++ b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::cher2kTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/her2k/her2k.h b/gtestsuite/testsuite/level3/her2k/her2k.h index 90d548aa0c..76ea95f3b4 100644 --- a/gtestsuite/testsuite/level3/her2k/her2k.h +++ b/gtestsuite/testsuite/level3/her2k/her2k.h @@ -155,4 +155,4 @@ static void her2k( char storage, char uplo, char transa, char transb, gtint_t m, #else throw std::runtime_error("Error in testsuite/level3/her2k.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/her2k/test_her2k.h b/gtestsuite/testsuite/level3/her2k/test_her2k.h index 345fe5d890..18ab391cd7 100644 --- a/gtestsuite/testsuite/level3/her2k/test_her2k.h +++ b/gtestsuite/testsuite/level3/her2k/test_her2k.h @@ -79,4 +79,4 @@ void test_her2k( char storage, char uplo, char transa, char transb, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp index 9f24bc78fe..2ae305c086 100644 --- a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp +++ b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::zher2kTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp index 2480b1d6de..868b637d3a 100644 --- a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp +++ b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp @@ -138,4 +138,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::cherkTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/herk/herk.h b/gtestsuite/testsuite/level3/herk/herk.h index fd6990ff07..6aab4355dc 100644 --- a/gtestsuite/testsuite/level3/herk/herk.h +++ b/gtestsuite/testsuite/level3/herk/herk.h @@ -144,4 +144,4 @@ static void herk( char storage, char uplo, char transa, gtint_t m, gtint_t k, #else throw std::runtime_error("Error in testsuite/level3/herk.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/herk/test_herk.h b/gtestsuite/testsuite/level3/herk/test_herk.h index 42704dff7c..a283366566 100644 --- a/gtestsuite/testsuite/level3/herk/test_herk.h +++ b/gtestsuite/testsuite/level3/herk/test_herk.h @@ -76,4 +76,4 @@ void test_herk( char storage, char uplo, char transa, gtint_t m, gtint_t k, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp index 2947549b15..b3d89854c6 100644 --- a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp +++ b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp @@ -138,4 +138,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::zherkTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp index f1e7ff6e28..72e84c9069 100644 --- a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp @@ -155,4 +155,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of c ), ::csymmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp index 5c83a66237..34d4fdb474 100644 --- a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp @@ -153,4 +153,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::dsymmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp index 64a1532922..749b7a7fce 100644 --- a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp @@ -153,4 +153,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of c ), ::ssymmTestPrint() - ); \ No newline at end of file + ); diff --git 
a/gtestsuite/testsuite/level3/symm/symm.h b/gtestsuite/testsuite/level3/symm/symm.h index 6f6037472b..cc97c9304f 100644 --- a/gtestsuite/testsuite/level3/symm/symm.h +++ b/gtestsuite/testsuite/level3/symm/symm.h @@ -172,4 +172,4 @@ static void symm( char storage, char side, char uplo, char conja, char transb, g #else throw std::runtime_error("Error in testsuite/level3/symm.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/symm/test_symm.h b/gtestsuite/testsuite/level3/symm/test_symm.h index 0bfcd3fd1b..cc90d7f52a 100644 --- a/gtestsuite/testsuite/level3/symm/test_symm.h +++ b/gtestsuite/testsuite/level3/symm/test_symm.h @@ -81,4 +81,4 @@ void test_symm( char storage, char side, char uplo, char conja, char transb, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp index 3840ab4aca..a6c163816a 100644 --- a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp +++ b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp @@ -155,4 +155,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::zsymmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp index 28e562764f..2ee7903302 100644 --- a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::csyr2kTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp 
b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp index 8ab791c5b6..f990ef6ac3 100644 --- a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp @@ -147,4 +147,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(7)) // increment to the leading dim of c ), ::dsyr2kTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp index fe4941e84d..4b4cc8ccdd 100644 --- a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp @@ -147,4 +147,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of c ), ::ssyr2kTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h index 218a893698..da2dabb0a9 100644 --- a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h +++ b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h @@ -79,4 +79,4 @@ void test_syr2k( char storage, char uplo, char transa, char transb, gtint_t m, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp index e929c13601..3600872367 100644 --- a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp +++ b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp @@ -149,4 +149,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(6)) // increment to the leading dim of c ), ::zsyr2kTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp index 2aa7b2063f..c876843931 100644 --- a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp @@ -139,4 +139,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(2)) // increment to the leading dim of c ), ::csyrkTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp index b4c8b61be3..05f1dc0229 100644 --- a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp @@ -137,4 +137,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(9)) // increment to the leading dim of c ), ::dsyrkTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp index 1b99dc65fe..6ce9ab89bf 100644 --- a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp @@ -137,4 +137,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of c ), ::ssyrkTestPrint() - ); \ No newline at end of file + ); diff --git 
a/gtestsuite/testsuite/level3/syrk/syrk.h b/gtestsuite/testsuite/level3/syrk/syrk.h index 27628ac7e3..ecbea4725e 100644 --- a/gtestsuite/testsuite/level3/syrk/syrk.h +++ b/gtestsuite/testsuite/level3/syrk/syrk.h @@ -153,4 +153,4 @@ static void syrk( char storage, char uplo, char transa, gtint_t m, gtint_t k, #else throw std::runtime_error("Error in testsuite/level3/syrk.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/syrk/test_syrk.h b/gtestsuite/testsuite/level3/syrk/test_syrk.h index fc75b61df7..464f608827 100644 --- a/gtestsuite/testsuite/level3/syrk/test_syrk.h +++ b/gtestsuite/testsuite/level3/syrk/test_syrk.h @@ -73,4 +73,4 @@ void test_syrk( char storage, char uplo, char transa, gtint_t m, gtint_t k, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, m, c.data(), c_ref.data(), ldc, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp index a76a24533c..406d137d43 100644 --- a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp +++ b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp @@ -139,4 +139,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(5)) // increment to the leading dim of c ), ::zsyrkTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp index 11014e542a..5887027a58 100644 --- a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp @@ -142,4 +142,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::ctrmmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp 
b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp index ec3608bf45..1c9c251bdf 100644 --- a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp @@ -141,4 +141,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::dtrmmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp index 2090b39611..6851e1f52c 100644 --- a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp @@ -141,4 +141,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::strmmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trmm/test_trmm.h b/gtestsuite/testsuite/level3/trmm/test_trmm.h index 91b169d99c..4ba801d937 100644 --- a/gtestsuite/testsuite/level3/trmm/test_trmm.h +++ b/gtestsuite/testsuite/level3/trmm/test_trmm.h @@ -73,4 +73,4 @@ void test_trmm( char storage, char side, char uploa, char transa, char diaga, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, b.data(), b_ref.data(), ldb, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/trmm/trmm.h b/gtestsuite/testsuite/level3/trmm/trmm.h index 51daceccdf..267aa41e7e 100644 --- a/gtestsuite/testsuite/level3/trmm/trmm.h +++ b/gtestsuite/testsuite/level3/trmm/trmm.h @@ -167,4 +167,4 @@ static void trmm( char storage, char side, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level3/trmm.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp index bbeb07d100..d6ad3e02ca 100644 --- a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp @@ -142,4 +142,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(1)) // increment to the leading dim of b ), ::ztrmmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp index 9dcafcb32b..839c472988 100644 --- a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp @@ -154,4 +154,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ctrmm3TestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp index 6cb677e988..343a573666 100644 --- a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp @@ -152,4 +152,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::dtrmm3TestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp index 
4752556df8..2d52b620e8 100644 --- a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp @@ -152,4 +152,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::strmm3TestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h index e82f25dd0c..8203a0cb6b 100644 --- a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h +++ b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h @@ -77,4 +77,4 @@ void test_trmm3( char storage, char side, char uploa, char transa, char diaga, // check component-wise error. //---------------------------------------------------------- computediff( storage, m, n, c.data(), c_ref.data(), ldb, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/trmm3/trmm3.h b/gtestsuite/testsuite/level3/trmm3/trmm3.h index 77be6ce392..2bd52db11a 100644 --- a/gtestsuite/testsuite/level3/trmm3/trmm3.h +++ b/gtestsuite/testsuite/level3/trmm3/trmm3.h @@ -136,4 +136,4 @@ static void trmm3( char storage, char side, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level3/trmm3.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp index 9ab008b974..6ef3931d72 100644 --- a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp +++ b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp @@ -154,4 +154,4 @@ INSTANTIATE_TEST_SUITE_P( ), ::ztrmm3TestPrint() ); -#endif \ No newline at end of file +#endif diff --git a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp index d001651df4..85c3917a39 100644 --- a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp @@ -142,4 +142,4 @@ INSTANTIATE_TEST_SUITE_P( 
::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::ctrsmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp index a0c64ddb6c..87b841defd 100644 --- a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp @@ -141,4 +141,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::dtrsmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp index a1e43aa20f..2e197c104f 100644 --- a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp @@ -141,4 +141,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(4)) // increment to the leading dim of b ), ::strsmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/level3/trsm/test_trsm.h b/gtestsuite/testsuite/level3/trsm/test_trsm.h index e36e29374d..df0502b060 100644 --- a/gtestsuite/testsuite/level3/trsm/test_trsm.h +++ b/gtestsuite/testsuite/level3/trsm/test_trsm.h @@ -82,4 +82,4 @@ void test_trsm( char storage, char side, char uploa, char transa, char diaga, // check component-wise error. 
//---------------------------------------------------------- computediff( storage, m, n, b.data(), b_ref.data(), ldb, thresh ); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/trsm/trsm.h b/gtestsuite/testsuite/level3/trsm/trsm.h index 8d26f1303b..bb7f0469e2 100644 --- a/gtestsuite/testsuite/level3/trsm/trsm.h +++ b/gtestsuite/testsuite/level3/trsm/trsm.h @@ -167,4 +167,4 @@ static void trsm( char storage, char side, char uploa, char transa, char diaga, #else throw std::runtime_error("Error in testsuite/level3/trsm.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp index 8b7d0cab4d..830b9081b5 100644 --- a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp +++ b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp @@ -142,4 +142,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(gtint_t(0), gtint_t(3)) // increment to the leading dim of b ), ::ztrsmTestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp index b1642c6dfb..32386593d0 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_extreme_values.cpp @@ -263,4 +263,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(-Inf, NaN) ), ::dnrm2_TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp index eb18436788..422f5bfe76 100644 --- a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp @@ -189,4 +189,4 @@ INSTANTIATE_TEST_SUITE_P( ) ), ::dnrm2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp 
b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp index 6eab297ac6..993859265c 100644 --- a/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp +++ b/gtestsuite/testsuite/util/nrm2/dznrm2_extreme_values.cpp @@ -261,4 +261,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(dcomplex{NaN, Inf}, dcomplex{-Inf, NaN}, dcomplex{Inf, 0.0}) ), ::dznrm2_TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp index dfabea06ae..a0fb186ccc 100644 --- a/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/dznrm2_generic.cpp @@ -180,4 +180,4 @@ INSTANTIATE_TEST_SUITE_P( ) ), ::dznrm2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/nrm2.h b/gtestsuite/testsuite/util/nrm2/nrm2.h index 537cf27f43..9693a70aa0 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/nrm2.h @@ -110,4 +110,4 @@ static RT nrm2(gtint_t n, T* x, gtint_t incx) #else throw std::runtime_error("Error in testsuite/level1/axpyv.h: No interfaces are set to be tested."); #endif -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp index 899fb01025..3f67a0b355 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_corner_cases.cpp @@ -93,4 +93,4 @@ TYPED_TEST( nrm2_EIC, zero_incx_MT ) { RT blis_norm = nrm2(n, x.data(), incx); RT ref_norm = testinghelpers::ref_nrm2(n, x.data(), incx); computediff(blis_norm, ref_norm); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp index a4a8abf6af..157b875a1d 100644 --- a/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp +++ b/gtestsuite/testsuite/util/nrm2/nrm2_invalid_inputs.cpp @@ -24,4 
+24,4 @@ TYPED_TEST(nrm2_IIT, negative_n) { blis_norm = nrm2(-2, &x, INC); computediff(blis_norm, 0.0); -} \ No newline at end of file +} diff --git a/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp index 0204a8335a..9d88aa336e 100644 --- a/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/scnrm2_generic.cpp @@ -79,4 +79,4 @@ INSTANTIATE_TEST_SUITE_P( ) ), ::scnrm2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp index 289e387c16..eac411d12d 100644 --- a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp +++ b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp @@ -116,4 +116,4 @@ INSTANTIATE_TEST_SUITE_P( ) // stride size for x ), ::snrm2TestPrint() - ); \ No newline at end of file + ); diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h index b2fdf213e1..def4551929 100644 --- a/gtestsuite/testsuite/util/nrm2/test_nrm2.h +++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h @@ -98,4 +98,4 @@ void test_nrm2( gtint_t n, gtint_t incx, gtint_t i, T iexval, gtint_t j = 0, T j //---------------------------------------------------------- // Compare using NaN/Inf checks. computediff( norm, norm_ref, true ); -} \ No newline at end of file +} diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt index b132d52cb2..fa15654125 100644 --- a/kernels/CMakeLists.txt +++ b/kernels/CMakeLists.txt @@ -76,4 +76,4 @@ endfunction() # in the kernel list. 
foreach(KERN ${KERNEL_LIST}) generate_kernel_targets(${KERN}) -endforeach() \ No newline at end of file +endforeach() diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c index 2c0f50c637..390f3edb9f 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c @@ -1981,4 +1981,4 @@ void bli_dgemmsup_rv_haswell_asm_1x1 "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen/3/bli_zgemm_avx2_k1.c b/kernels/zen/3/bli_zgemm_avx2_k1.c index f264741a2e..c074e113ca 100644 --- a/kernels/zen/3/bli_zgemm_avx2_k1.c +++ b/kernels/zen/3/bli_zgemm_avx2_k1.c @@ -1126,4 +1126,4 @@ void bli_zgemm_4x4_avx2_k1_nn } -} \ No newline at end of file +} diff --git a/kernels/zen/3/bli_zgemm_zen_2x6.c b/kernels/zen/3/bli_zgemm_zen_2x6.c index 1aaec9c948..0a34f5da3d 100644 --- a/kernels/zen/3/bli_zgemm_zen_2x6.c +++ b/kernels/zen/3/bli_zgemm_zen_2x6.c @@ -649,4 +649,4 @@ void bli_zgemm_zen_asm_2x6( "xmm13", "xmm14", "xmm15", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen/3/bli_zgemmtrsm_l_2x6.c b/kernels/zen/3/bli_zgemmtrsm_l_2x6.c index 4d11a6648b..0d2ccc3d71 100644 --- a/kernels/zen/3/bli_zgemmtrsm_l_2x6.c +++ b/kernels/zen/3/bli_zgemmtrsm_l_2x6.c @@ -556,4 +556,4 @@ void bli_zgemmtrsm_l_zen_asm_2x6 "xmm13", "xmm14", "xmm15", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen/3/bli_zgemmtrsm_u_2x6.c b/kernels/zen/3/bli_zgemmtrsm_u_2x6.c index 07bc47f016..e10a787ac9 100644 --- a/kernels/zen/3/bli_zgemmtrsm_u_2x6.c +++ b/kernels/zen/3/bli_zgemmtrsm_u_2x6.c @@ -558,4 +558,4 @@ void bli_zgemmtrsm_u_zen_asm_2x6 "xmm13", "xmm14", "xmm15", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen4/3/bli_zero_zmm.c b/kernels/zen4/3/bli_zero_zmm.c index 47cae67c49..67ff9a62de 100644 --- a/kernels/zen4/3/bli_zero_zmm.c +++ 
b/kernels/zen4/3/bli_zero_zmm.c @@ -59,4 +59,4 @@ void bli_zero_zmm() "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c b/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c index e8bdf4f503..6a158a4242 100644 --- a/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c +++ b/kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c @@ -562,4 +562,4 @@ void bli_zgemm_zen4_asm_4x12( "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c b/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c index 5341bf4851..b726c02960 100644 --- a/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c +++ b/kernels/zen4/3/bli_zgemmtrsm_l_4x12.c @@ -702,4 +702,4 @@ void bli_zgemmtrsm_l_zen4_asm_4x12( "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", "memory" ) -} \ No newline at end of file +} diff --git a/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c b/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c index bb2017f5bb..9ab80f5238 100644 --- a/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c +++ b/kernels/zen4/3/bli_zgemmtrsm_u_4x12.c @@ -712,4 +712,4 @@ void bli_zgemmtrsm_u_zen4_asm_4x12( "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", "memory" ) -} \ No newline at end of file +} diff --git a/sandbox/power10/p10_testsuite/Makefile b/sandbox/power10/p10_testsuite/Makefile index a817496db2..b8a72c90cf 100644 --- a/sandbox/power10/p10_testsuite/Makefile +++ b/sandbox/power10/p10_testsuite/Makefile @@ -28,4 +28,4 @@ csv_clean: rm -rf *.csv clean: - rm -rf *.x *.o \ No newline at end of file + rm -rf *.x *.o diff --git a/sandbox/power10/p10_testsuite/common.h b/sandbox/power10/p10_testsuite/common.h index a5c1aeee25..f750d1cf2b 100644 --- a/sandbox/power10/p10_testsuite/common.h +++ b/sandbox/power10/p10_testsuite/common.h @@ -13,4 +13,4 @@ enum DATATYPES { INT4 }; -#endif \ No newline at end of file +#endif diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt 
index 4e23e0e382..d85b39833c 100644 --- a/testsuite/CMakeLists.txt +++ b/testsuite/CMakeLists.txt @@ -103,4 +103,4 @@ add_custom_target(testsuite DEPENDS testblis) # Put all those targets under testsuite-targets folder name so that they appear all together in IDE. set_target_properties(test_libblis.x testblis checkblis testblis-fast checkblis-fast testblis-md checkblis-md testblis-mixed checkblis-mixed testblis-salt checkblis-salt - PROPERTIES FOLDER testsuite-targets) \ No newline at end of file + PROPERTIES FOLDER testsuite-targets) From 664a1757fa737c4a26b77e5742907725702ad542 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Thu, 16 Nov 2023 19:36:12 +0530 Subject: [PATCH 210/226] CMake: Improving how CMake system handles targets. - Instead of putting the built libraries in blis/bin directory, build them in the chosen build-cmake directory. - Install headers in /include instead of /include/blis. - Fix on some targets to match configure/make system. - Update documentation. AMD-Internal: [CPUPL-2748] Change-Id: I15553948209345dbee350e89965b6a3c72a4e340 --- CMakeLists.txt | 82 +++++++++++++++++++--------------------- blastest/CMakeLists.txt | 18 +++++---- docs/CMakeBuildSystem.md | 38 +++++++++++-------- testsuite/CMakeLists.txt | 15 ++++---- 4 files changed, 78 insertions(+), 75 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29cc6ded01..c4cf5d1018 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,18 +150,19 @@ if(WIN32) option(ENABLE_NO_UNDERSCORE_API "Export APIs without underscore." OFF) option(ENABLE_UPPERCASE_API "Export APIs with uppercase." OFF) # Setting path to OpenMP runtime. 
- set(OpenMP_libomp_LIBRARY "C:/Program Files/Microsoft Visual Studio/2022/Professional/VC/Tools/Llvm/x64/lib/libomp.lib" CACHE STRING "openmp library path") + set(OpenMP_libomp_LIBRARY "C:/Program Files/LLVM/lib/libomp.lib" CACHE STRING "openmp library path") endif() -set(ENABLE_DEBUG "off" CACHE STRING "Enable debugging symbols in the library.") -set_property(CACHE ENABLE_DEBUG PROPERTY STRINGS "off" "noopt" "opt") -if( NOT ((ENABLE_DEBUG STREQUAL "off") OR (ENABLE_DEBUG STREQUAL "noopt") OR (ENABLE_DEBUG STREQUAL "opt")) ) - message(FATAL_ERROR "ENABLE_DEBUG option '${ENABLE_DEBUG}' is not supported. Please use one of the following options \ - during CMake invokation: off, noopt, opt") -endif() -# Check if user provided CMAKE_BUILD_TYPE. If that's the case, map it to the internal ENABLE_DEBUG type -# and clean cache from CMAKE_BUILD_TYPE. We do this because CMake will add some flags depending on the -# the build type and on Linux we want to have more control over what flags are being used. +# Debug & Release flags option setting is only available for Linux. On Windows the default flags are used. if(NOT WIN32) + set(ENABLE_DEBUG "off" CACHE STRING "Enable debugging symbols in the library.") + set_property(CACHE ENABLE_DEBUG PROPERTY STRINGS "off" "noopt" "opt") + if( NOT ((ENABLE_DEBUG STREQUAL "off") OR (ENABLE_DEBUG STREQUAL "noopt") OR (ENABLE_DEBUG STREQUAL "opt")) ) + message(FATAL_ERROR "ENABLE_DEBUG option '${ENABLE_DEBUG}' is not supported. Please use one of the following options \ + during CMake invokation: off, noopt, opt") + endif() + # Check if user provided CMAKE_BUILD_TYPE. If that's the case, map it to the internal ENABLE_DEBUG type + # and clean cache from CMAKE_BUILD_TYPE. We do this because CMake will add some flags depending on the + # the build type and on Linux we want to have more control over what flags are being used. 
if(CMAKE_BUILD_TYPE) if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(ENABLE_DEBUG "noopt") @@ -200,11 +201,14 @@ if( NOT ((THREAD_PART_JRIR STREQUAL "slab") OR (THREAD_PART_JRIR STREQUAL "rr")) message(FATAL_ERROR "THREAD_PART_JRIR option '${THREAD_PART_JRIR}' is not supported. Please use one of the following options \ during CMake invokation: slab, rr") endif() -set(EXPORT_SHARED "public" CACHE STRING "Specify the subset of library symbols that are exported within a shared library.") -set_property(CACHE EXPORT_SHARED PROPERTY STRINGS "public" "all") -if( NOT ((EXPORT_SHARED STREQUAL "public") OR (EXPORT_SHARED STREQUAL "all")) ) - message(FATAL_ERROR "EXPORT_SHARED option '${EXPORT_SHARED}' is not supported. Please use one of the following options \ - during CMake invokation: publis, all") +# Export symbols only for Linux. +if(NOT WIN32) + set(EXPORT_SHARED "public" CACHE STRING "Specify the subset of library symbols that are exported within a shared library.") + set_property(CACHE EXPORT_SHARED PROPERTY STRINGS "public" "all") + if( NOT ((EXPORT_SHARED STREQUAL "public") OR (EXPORT_SHARED STREQUAL "all")) ) + message(FATAL_ERROR "EXPORT_SHARED option '${EXPORT_SHARED}' is not supported. 
Please use one of the following options \ + during CMake invokation: public, all") + endif() endif() option(ENABLE_PBA_POOLS "Internal memory pools for packing blocks" ON) option(ENABLE_SBA_POOLS "Internal memory pools for small blocks" ON) @@ -343,16 +347,18 @@ else() message(" Building BLIS as a static library.") set(ENABLE_SHARED_01 0) endif() +if(NOT WIN32) cmake_print_variables(EXPORT_SHARED) -if(EXPORT_SHARED STREQUAL "all") - if(BUILD_SHARED_LIBS) - message(" Exporting all symbols within shared library.") + if(EXPORT_SHARED STREQUAL "all") + if(BUILD_SHARED_LIBS) + message(" Exporting all symbols within shared library.") + else() + message(" Ignoring request to export all symbols within shared library.") + endif() else() - message(" Ignoring request to export all symbols within shared library.") - endif() -else() - if(BUILD_SHARED_LIBS) - message(" Exporting only public symbols within shared library.") + if(BUILD_SHARED_LIBS) + message(" Exporting only public symbols within shared library.") + endif() endif() endif() cmake_print_variables(ENABLE_SYSTEM) @@ -1059,20 +1065,12 @@ endif() # Add headers as a property to the library. set_target_properties(libblis PROPERTIES PUBLIC_HEADER "${BLIS_PUBLIC_HEADERS}") set_target_properties(libblis PROPERTIES OUTPUT_NAME ${LIBBLIS}) -if(WIN32) - set_target_properties(libblis - PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" - ) -endif() # Install targets. 
install(TARGETS libblis LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include/blis) + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include) # --- Primary targets --- add_custom_target(libs DEPENDS libblis) @@ -1088,20 +1086,16 @@ if(ENABLE_BLAS) add_subdirectory(blastest EXCLUDE_FROM_ALL) endif() -# Add generic testing target. +# Add generic testing target `test`. set(available_testsuites checkblis) if(ENABLE_BLAS) list(APPEND available_testsuites checkblas) endif() -add_custom_target(check DEPENDS ${available_testsuites}) +add_custom_target(test DEPENDS ${available_testsuites}) -#-------------------------------------------- -# Clean-up -#-------------------------------------------- -# Add distclean target -add_custom_target(distclean - COMMAND ${CMAKE_BUILD_TOOL} clean - COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/build/distclean.cmake - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - COMMENT "Remove cmake_generated files and executables" -) +# Add generic testing target `check`. +set(available_testsuites checkblis-fast) +if(ENABLE_BLAS) + list(APPEND available_testsuites checkblas) +endif() +add_custom_target(check DEPENDS ${available_testsuites}) \ No newline at end of file diff --git a/blastest/CMakeLists.txt b/blastest/CMakeLists.txt index 062ca21162..e0960152d2 100644 --- a/blastest/CMakeLists.txt +++ b/blastest/CMakeLists.txt @@ -100,23 +100,24 @@ foreach(source ${blastest_sources}) set_target_properties(${exec_name}.x PROPERTIES FOLDER blastest-targets) # Add a target for running the tests. Rules are different for level-1 APIs, compared to levels 2 and 3. 
if(${exec_name} MATCHES 1) - add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/out.${exec_name} - COMMAND ${exec_name}.x > ${CMAKE_BINARY_DIR}/out.${exec_name} - COMMENT "Running ${exec_name}.x with output redirected to ${CMAKE_BINARY_DIR}/out.${exec_name}" + add_custom_target(run-${exec_name} + COMMAND ${exec_name}.x > out.${exec_name} + COMMENT "Running ${exec_name}.x with output redirected to out.${exec_name}" DEPENDS ${exec_name}.x + BYPRODUCTS ${CMAKE_BINARY_DIR}/out.${exec_name} WORKING_DIRECTORY $ VERBATIM ) else()# name has 2 or 3 - add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/out.${exec_name} + add_custom_target(run-${exec_name} COMMAND ${exec_name}.x < ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in - COMMENT "Running ${exec_name}.x with output saved to ${CMAKE_BINARY_DIR}/out.${exec_name}" + COMMENT "Running ${exec_name}.x with input ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in and output saved to out.${exec_name}" DEPENDS ${exec_name}.x + BYPRODUCTS ${CMAKE_BINARY_DIR}/out.${exec_name} WORKING_DIRECTORY $ VERBATIM ) endif() - add_custom_target(run-${exec_name} DEPENDS ${CMAKE_BINARY_DIR}/out.${exec_name}) # Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE. set_target_properties(run-${exec_name} PROPERTIES FOLDER blastest-targets) list(APPEND test_executables "run-${exec_name}") @@ -124,8 +125,9 @@ endforeach() add_custom_target(testblas DEPENDS ${test_executables}) add_custom_target(checkblas - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blastest.py ${CMAKE_BINARY_DIR} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blastest.py "." DEPENDS testblas + WORKING_DIRECTORY $ ) # Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE. 
-set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets) +set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets) \ No newline at end of file diff --git a/docs/CMakeBuildSystem.md b/docs/CMakeBuildSystem.md index 7e669c6b3d..92b85cf432 100644 --- a/docs/CMakeBuildSystem.md +++ b/docs/CMakeBuildSystem.md @@ -23,11 +23,14 @@ The BLIS CMake system is based on the [Make build system](BuildSystem.md) and is * Python (3.4 or later for python3) * GNU `make` (3.81 or later) on Linux * Visual Studio 17 2022 on Windows - * a working C99 compiler (gcc or clang on Linux and clang-cl **only** on Windows) + * a working C99 compiler (gcc or clang on Linux and **only** clang-cl on Windows) -Note that, on Windows, BLIS implements basic pthreads functionality automatically, so a POSIX threads is not required. On Linux, the implementation is the same to the one of the Make system. +**_NOTE:_** +To get clang-cl on Visual Studio, one needs to choose "C++ Clang tools for Windows" when installing "Desktop development with C++" with Visual Studio. -CMake is used to build out of source so we need to start by creating a build directory from which we will do the configuration and build steps. Since there is a directory called blis/build, the build directory must have a different name. Here is an example on how to create the directory: +Note that, on Windows, BLIS implements basic pthreads functionality automatically, so a POSIX threads library is not required. On Linux, the implementation is the same to the one of the Make system. + +CMake is used to build out of source, so we need to start by creating a build directory from which we will do the configuration and build. Since there is a directory called blis/build, the build directory must have a different name. Here is an example of creating the directory: ``` $ mkdir build_blis $ cd build_blis @@ -44,7 +47,7 @@ The first step is to choose the appropriate BLIS configuration. 
As on the Make b * zen4 * generic -Instructions on how to add a configuration on the CMake system, are provided in a later section. +Instructions on how to add a configuration on the CMake system, are provided in [Adding configurations](CMakeBuildSystem.md#adding-configurations). ### Multithreading @@ -91,9 +94,9 @@ We remind users that to specify the installation prefix in cmake, one needs to c ``` cmake .. -DBLIS_CONFIG_FAMILY=auto -DCMAKE_INSTALL_PREFIX= ``` -This will cause libraries to eventually be installed to `/lib` and headers will be installed to `/include/blis`. +This will cause libraries to eventually be installed to `/lib` and headers will be installed to `/include`. -Options to specify the library install and the header install separately, like in Make system, is not currently supported by the CMake equivalent. +Option to specify the library install and the header install separately, like in Make system, is not currently supported by the CMake equivalent. ## Step 3: Compilation @@ -155,14 +158,21 @@ The BLIS CMake system aims to be combatible with the current `make` system. For | `testblis-salt` | Run the BLIS testsuite while simulating application-level threading (runs for a few seconds). | | `testsuite` | Same as `testblis`. | | `testblas` | Run the BLAS test drivers with default parameters (runs for a few seconds). | +| `checkbliscpp` | Run the BLIS C++ tests (runs for a few seconds). | + +**_NOTE:_** +Using those targets sets the environment appropriately, so copying the input files and/or the DLL in case of Windows builds is not required. -### Running the testsuites. +### Running the testsuites * On Linux all targets can be build and run in `build_blis` directory. -* On Windows, when Visual Studio has been used as a generator, one can build and run the blis API related tests from testsuite directory and blas API tests from blastest directory. 
+* On Windows, when Visual Studio has been used as a generator, one can build and run the blis API related tests from `build_blis/testsuite` directory and blas API tests from `build_blis/blastest` directory. To build and run the BLIS C++ interface tests, execute the target `checkbliscpp` in `build_blis/vendor/testcpp` directory. The targets `check` and `test` can be used in `build_blis` directory. +* On Windows, if Visual Studio is used to build the library and tests, note that only the high level targets will appear. All targets are available to build from the command prompt. ## Adding configurations -ToDo +The CMake system is designed to closely relate to the BLIS Make system. Assuming that a user has followed the steps in [Configuration How To](ConfigurationHowTo.md), adding the new configuration on the CMake system requires the following steps: +* Add a `make_defs.cmake` file which is equivalent to `make_defs.mk`. One can see `blis/config/zen/make_defs.cmake` and `blis/config/zen/make_defs.mk` for an example. +* Update `blis/CMakeLists.txt` to remove the error for the particular new configuration and to add the option in `set_property()` so that it appears in cmake-gui. ## Some examples @@ -197,6 +207,9 @@ cmake .. -G "Visual Studio 17 2022" -TClangCl -DENABLE_THREADING=openmp -DINT_SI ### Example 2: single-threaded ILP64 libraries for amdzen configuration with aocl_gemm addon enabled and default compiler +**_NOTE:_** +Addon functionality is currently available only on Linux. + * With configure script: ``` ./configure --enable-threading=no --int-size=64 --blas-int-size=64 --enable-addon=aocl_gemm amdzen @@ -207,11 +220,6 @@ cmake .. -G "Visual Studio 17 2022" -TClangCl -DENABLE_THREADING=openmp -DINT_SI cmake .. -DENABLE_THREADING=no -DINT_SIZE=64 -DBLAS_INT_SIZE=64 -DENABLE_ADDON=aocl_gemm -DBLIS_CONFIG_FAMILY=amdzen ``` -* With CMake on Windows: -``` -cmake .. 
-G "Visual Studio 17 2022" -TClangCl -DENABLE_THREADING=no -DINT_SIZE=64 -DBLAS_INT_SIZE=64 -DENABLE_ADDON=aocl_gemm -DBLIS_CONFIG_FAMILY=amdzen -``` - ## Conclusion -The BLIS CMake system is developed and maintained by AMD. You can contact us on the email-id toolchainsupport@amd.com. You can also raise any issue/suggestion on the git-hub repository at https://github.com/amd/blis/issues. +The BLIS CMake system is developed and maintained by AMD. You can contact us on the email-id toolchainsupport@amd.com. You can also raise any issue/suggestion on the git-hub repository at https://github.com/amd/blis/issues. \ No newline at end of file diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt index d85b39833c..2c7ac1e28a 100644 --- a/testsuite/CMakeLists.txt +++ b/testsuite/CMakeLists.txt @@ -74,21 +74,20 @@ function(add_testblis flavour) set(dashflavour -${flavour}) set(printflavour "(${flavour})") endif() - # A rule to run the testsuite using the input.*${dotflavour} files, which - # run a set of tests designed to finish much more quickly. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} + # A rule to run the testsuite using the input.*${dotflavour} files. + add_custom_target(testblis${dashflavour} COMMAND test_libblis.x -g ${CMAKE_CURRENT_SOURCE_DIR}/input.general${dotflavour} -o ${CMAKE_CURRENT_SOURCE_DIR}/input.operations${dotflavour} > ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} COMMENT "Running test_libblis.x ${printflavour} with output redirected to ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour}" DEPENDS test_libblis.x ${CMAKE_CURRENT_SOURCE_DIR}/input.general${dotflavour} ${CMAKE_CURRENT_SOURCE_DIR}/input.operations${dotflavour} + BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} WORKING_DIRECTORY $ VERBATIM - ) - add_custom_target(testblis${dashflavour} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour}) + ) # Check the results of the BLIS testsuite. 
add_custom_target(checkblis${dashflavour} - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blistest.py ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} - DEPENDS testblis${dashflavour} - ) + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blistest.py ${CMAKE_CURRENT_BINARY_DIR}/output.testsuite${dotflavour} + DEPENDS testblis${dashflavour} + ) endfunction() # Add testing targets using functions above for all input file options. From e55f6b9a6543e96e8690fa63fad65545aa5480d8 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 17 Nov 2023 11:52:14 -0500 Subject: [PATCH 211/226] User selection of code path in single architecture builds User control over code path using AOCL_ENABLE_INSTRUCTIONS or BLIS_ARCH_TYPE only makes sense for fat binary builds. Thus this functionality is now disabled by default for single architecture builds. User can still override the default selections by using configure options --enable-blis-arch-type or --disable-blis-arch-type. Other changes: - include x86_64 family as using zen codepaths in cmake build system. - Update help and error messages to include AOCL_ENABLE_INSTRUCTIONS. AMD-Internal: [CPUPL-4202] Change-Id: I7aa5fcf89df8675bcc12d81f81781de647e0fcf8 (cherry picked from commit dc41fa3829acd7e885ffa11891d120ae2f7025b0) --- CMakeLists.txt | 17 +++++++++++++---- build/cmake/config_print.py | 5 +++-- configure | 25 ++++++++++++++++++++----- frame/base/bli_arch.c | 14 ++++++++++++++ frame/base/bli_error.c | 6 +++--- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c4cf5d1018..787f831452 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. 
All rights reserved.## cmake_minimum_required(VERSION 3.15.0) if(WIN32) @@ -122,7 +122,7 @@ set(CONFIG_NAME_DEFINE "#define BLIS_FAMILY_${UCONF}\n") #create a AOCL specific #define #This macro is enabled only for zen family configurations. #This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. -if(BLIS_CONFIG_FAMILY MATCHES "zen|amd64") +if(BLIS_CONFIG_FAMILY MATCHES "zen|amd64|x86_64") set(ENABLE_AOCL_ZEN ON) set(ENABLE_AOCL_ZEN_01 1) else() @@ -262,7 +262,15 @@ else() during CMake invokation: default, gnu, intel") endif() endif() -option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" OFF) +# If the CONFIG_LIST does not already contain the CONFIG_NAME (i.e., +# if CONFIG_NAME is an umbrella family), default is to enable BLIS_ARCH_TYPE functionality, +# otherwise default is to disable BLIS_ARCH_TYPE functionality. +list(FIND CONFIG_LIST ${BLIS_CONFIG_FAMILY} IS_UMBRELLA) +if(${IS_UMBRELLA} STREQUAL "-1") + option(DISABLE_BLIS_ARCH_TYPE "Disable AOCL_ENABLE_INSTRUCTIONS, BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" OFF) +else() + option(DISABLE_BLIS_ARCH_TYPE "Disable AOCL_ENABLE_INSTRUCTIONS, BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" ON) +endif() set(RENAME_BLIS_ARCH_TYPE "BLIS_ARCH_TYPE" CACHE STRING "BLIS_ARCH_TYPE env var renamed to supplied value") set(RENAME_BLIS_MODEL_TYPE "BLIS_MODEL_TYPE" CACHE STRING "BLIS_MODEL_TYPE env var renamed to supplied value") if(NOT WIN32) @@ -614,7 +622,8 @@ else() endif() cmake_print_variables(DISABLE_BLIS_ARCH_TYPE) if(DISABLE_BLIS_ARCH_TYPE) - message(" User selection of code path using BLIS_ARCH_TYPE and BLIS_MODEL_TYPE env vars is disabled.") + message(" User selection of code path using AOCL_ENABLE_INSTRUCTIONS, BLIS_ARCH_TYPE and") + message(" BLIS_MODEL_TYPE env vars is disabled.") set(DISABLE_BLIS_ARCH_TYPE_01 1) else() set(DISABLE_BLIS_ARCH_TYPE_01 0) diff --git a/build/cmake/config_print.py b/build/cmake/config_print.py index 
cbae038954..f5fc767711 100644 --- a/build/cmake/config_print.py +++ b/build/cmake/config_print.py @@ -260,11 +260,12 @@ def main(): print( " " ) print( " -DDISABLE_BLIS_ARCH_TYPE=ON or -DDISABLE_BLIS_ARCH_TYPE=OFF" ) print( " " ) - print( " Disable (Enabled by default) support for BLIS_ARCH_TYPE and BLIS_MODEL_TYPE" ) - print( " environment variables, which allows user to select" ) + print( " Disable support for AOCL_ENABLE_INSTRUCTIONS, BLIS_ARCH_TYPE and" ) + print( " BLIS_MODEL_TYPE environment variables, which allows user to select" ) print( " architecture specific code path and optimizations at runtime." ) print( " If disabled, in builds with multiple code paths, BLIS" ) print( " will still select path and optimizations automatically." ) + print( " Default: Enabled in builds with multiple code paths, else disabled." ) print( " " ) print( " -DRENAME_BLIS_ARCH_TYPE=STRING" ) print( " " ) diff --git a/configure b/configure index be589ef0d5..92a34632bb 100755 --- a/configure +++ b/configure @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -355,11 +355,12 @@ print_usage() echo " " echo " --enable-blis-arch-type, --disable-blis-arch-type" echo " " - echo " Disable (Enabled by default) support for BLIS_ARCH_TYPE and BLIS_MODEL_TYPE" - echo " environment variables, which allows user to select" + echo " Disable support for AOCL_ENABLE_INSTRUCTIONS, BLIS_ARCH_TYPE and" + echo " BLIS_MODEL_TYPE environment variables, which allows user to select" echo " architecture specific code path and optimizations at runtime." 
echo " If disabled, in builds with multiple code paths, BLIS" echo " will still select path and optimizations automatically." + echo " Default: Enabled in builds with multiple code paths, else disabled." echo " " echo " --rename-blis-arch-type=STRING" echo " " @@ -2076,7 +2077,7 @@ main() enable_aocl_dynamic='yes' force_version='no' complex_return='default' - disable_blis_arch_type='no' + disable_blis_arch_type='unset' rename_blis_arch_type='BLIS_ARCH_TYPE' rename_blis_model_type='BLIS_MODEL_TYPE' @@ -2810,6 +2811,19 @@ main() fi + # Based on the number of sub-configurations, set default value for disable_blis_arch_type + # (if user hasn't set option). BLIS_ARCH_TYPE functionality only makes sense for use with + # processor families containing multiple sub-configurations, but user can force the + # functionality to be enabled/disabled with --enable-blis-arch-type/--disable-blis-arch-type + # configure options. + if [ "x${disable_blis_arch_type}" = "xunset" ]; then + config_list_count=$(echo ${config_list} |wc -w) + if [ "x${config_list_count}" = "x1" ]; then + disable_blis_arch_type='yes' + else + disable_blis_arch_type='no' + fi + fi echo "${script_name}: checking sub-configurations:" @@ -3301,7 +3315,8 @@ main() fi if [ "x${disable_blis_arch_type}" = "xyes" ]; then - echo "${script_name}: user selection of code path using BLIS_ARCH_TYPE and BLIS_MODEL_TYPE env vars is disabled." + echo "${script_name}: user selection of code path using AOCL_ENABLE_INSTRUCTIONS," + echo "${script_name}: BLIS_ARCH_TYPE and BLIS_MODEL_TYPE env vars is disabled." 
disable_blis_arch_type_01='1' else disable_blis_arch_type_01='0' diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 1c7bb93f80..4fdf5a9c9b 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -505,13 +505,27 @@ void bli_arch_check_id( void ) { if ( model_id == BLIS_MODEL_DEFAULT ) { +#ifdef DISABLE_BLIS_ARCH_TYPE + fprintf( stderr, "libblis: Selecting sub-configuration '%s'.\n" + "libblis: User control of sub-configuration using AOCL_ENABLE_INSTRUCTIONS\n" + "libblis: or using "__blis_arch_type_name" and "__blis_model_type_name" is disabled.\n", + bli_arch_string( arch_id ) ); +#else fprintf( stderr, "libblis: Selecting sub-configuration '%s'.\n", bli_arch_string( arch_id ) ); +#endif } else { +#ifdef DISABLE_BLIS_ARCH_TYPE + fprintf( stderr, "libblis: Selecting sub-configuration '%s', model '%s'.\n" + "libblis: User control of sub-configuration using AOCL_ENABLE_INSTRUCTIONS\n" + "libblis: or using "__blis_arch_type_name" and "__blis_model_type_name" is disabled.\n", + bli_arch_string( arch_id ), bli_model_string( model_id ) ); +#else fprintf( stderr, "libblis: Selecting sub-configuration '%s', model '%s'.\n", bli_arch_string( arch_id ), bli_model_string( model_id ) ); +#endif } } #if 0 diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index 06b1467a83..8e60f57039 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -103,9 +103,9 @@ static char bli_error_string[BLIS_MAX_NUM_ERR_MSGS][BLIS_MAX_ERR_MSG_LENGTH] = [-BLIS_EXPECTED_OBJECT_ALIAS] = "Expected object to be alias.", - [-BLIS_INVALID_ARCH_ID] = "Invalid architecture id value (env var "__blis_arch_type_name").", + [-BLIS_INVALID_ARCH_ID] = "Invalid architecture id value (env var AOCL_ENABLE_INSTRUCTIONS or "__blis_arch_type_name").", [-BLIS_INVALID_MODEL_ID] = "Invalid architecture model id value (env var "__blis_model_type_name").", - [-BLIS_UNINITIALIZED_GKS_CNTX] = "Accessed uninitialized context in gks; "__blis_arch_type_name" or "__blis_model_type_name" is probably set to an invalid architecture id.", + [-BLIS_UNINITIALIZED_GKS_CNTX] = "Accessed uninitialized context in gks; AOCL_ENABLE_INSTRUCTIONS or "__blis_arch_type_name" is probably set to an invalid architecture id.", [-BLIS_MC_DEF_NONMULTIPLE_OF_MR] = "Default MC is non-multiple of MR for one or more datatypes.", [-BLIS_MC_MAX_NONMULTIPLE_OF_MR] = "Maximum MC is non-multiple of MR for one or more datatypes.", From 126a07093151b14d7924eab471c4fef7381aebf6 Mon Sep 17 00:00:00 2001 From: Bhaskar Nallani Date: Fri, 24 Nov 2023 03:32:13 +0530 Subject: [PATCH 212/226] Improved thread balancing for aocl_gemm f32 API Description: 1. Updated the thread partition logic for aocl_gemm_f32f32f32of32 for m 1 ) { - // If BLIS_NUM_THREADS are set, generate jc,ic from the same. - bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways ); - - lpgemm_adjust_ic_jc_ways - ( - m, n, k, - MC, NC, KC, MR, NR, - n_threads, ic_ways, jc_ways, 5 - ); + dim_t mr_blks = ( m + MR - 1 ) / MR; + dim_t nr_blks = ( n + NR - 1 ) / NR; + + if ( n <= NR ) + { + ( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? 
mr_blks : ( *n_threads ); + ( *jc_ways ) = 1; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( m <= MR ) + { + ( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads ); + ( *ic_ways ) = 1; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else + { + // If BLIS_NUM_THREADS are set, generate jc,ic from the same. + bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways ); + if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) ) + { + ( *ic_ways ) = mr_blks; + ( *jc_ways ) = nr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( mr_blks < ( *ic_ways ) ) + { + ( *ic_ways ) = mr_blks; + dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) ); + ( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else if ( nr_blks < ( *jc_ways ) ) + { + ( *jc_ways ) = nr_blks; + dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) ); + ( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks; + ( *n_threads ) = ( *ic_ways ) * ( *jc_ways ); + } + else + { + lpgemm_adjust_ic_jc_ways + ( + m, n, k, + MC, NC, KC, MR, NR, + n_threads, ic_ways, jc_ways, 5 + ); + } + } } else { @@ -652,9 +721,8 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading if ( ( m >= MT ) && ( n >= NT ) && ( k >= KT ) ) { - if ( ( k > page_size_b_floatx2 ) || - ( ( k <= page_size_b_floatx2 ) && - ( m_ic > MT_2 ) && ( n_jc >= NT ) ) ) + if (((k <= page_size_b_floatx2 ) && ( m_ic > MT_2 ) && ( n_jc >= NT ) ) || + ((bli_cpuid_is_avx2fma3_supported() == FALSE ) && (k > page_size_b_floatx2))) { bli_rntm_set_pack_b( 1, rntm_g ); bli_rntm_set_pack_a( 1, rntm_g ); From 4e493d3793916f24ff4368a27733cd4ac1703fc5 Mon Sep 17 00:00:00 2001 From: mkadavil Date: Wed, 22 Nov 2023 04:31:55 +0530 Subject: [PATCH 213/226] LPGEMM s32 micro-kernel updates to fix gcc10.2 compilation issue. 
Some AVX512 intrinsics(eg: _mm_loadu_epi8) were introduced in later versions of gcc (11+) in addition to already existing masked intrinsic (eg: _mm_mask_loadu_epi8). In order to support compilation using gcc 10.2, either the masked intrinsic or other gcc 10.2 compatible intrinsic needs to be used (eg: _mm_loadu_si128) in LPGEMM 8s8os32 kernels. AMD-Internal: [SWLCSG-2542] Change-Id: I6cfedfdcb28711b19df63d162ab267f5eea8d2ef --- .../bf16bf16f32/lpgemm_f32_kern_macros.h | 3 +- .../lpgemm_6x64rowmajor_s8_amd512vnni.c | 20 ++- .../s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c | 100 +++++++----- .../s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c | 150 +++++++++++------- .../s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c | 30 ++-- .../u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c | 20 ++- .../u8s8s32/lpgemm_m_fringe_amd512vnni.c | 100 +++++++----- .../u8s8s32/lpgemm_mn_fringe_amd512vnni.c | 150 +++++++++++------- .../lpgemm_n_extMR_fringe_amd512vnni.c | 15 +- .../u8s8s32/lpgemm_n_fringe_amd512vnni.c | 30 ++-- 10 files changed, 371 insertions(+), 247 deletions(-) diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h index f3875647eb..5b4c353670 100644 --- a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h +++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h @@ -40,7 +40,8 @@ // Disable BF16 kernel in cases where compilers support other avx 512 // features except BF16 ISA. 
-#if defined( BLIS_GCC ) && ( __GNUC__ < 10 ) +#if ( defined( BLIS_GCC ) && ( ( __GNUC__ < 11 ) || \ + ( ( __GNUC__ == 11 ) && ( __GNUC_MINOR__ < 2 ) ) ) ) #define LPGEMM_BF16_NOT_SUPPORTED #endif diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c index 302a723685..df5d29472c 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c @@ -1062,17 +1062,21 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64) // int8_t zero point value. __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c index aae48e260a..53a0f51d17 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c +++ 
b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c @@ -826,17 +826,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -1697,17 +1701,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 
+ + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2406,17 +2414,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2955,17 +2967,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) 
_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3341,17 +3357,21 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - 
post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c index 6cb4e5f615..ced733e131 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c @@ -2103,8 +2103,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2485,8 +2486,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2808,8 +2810,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + 
post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3072,8 +3075,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3277,8 +3281,9 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3806,11 +3811,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i 
const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -4346,11 +4353,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -4795,11 +4804,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -5153,11 +5164,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - 
post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -5420,11 +5433,13 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -6088,14 +6103,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - 
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -6780,14 +6798,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -7349,14 +7370,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + 
post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -7796,14 +7820,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -8120,14 +8147,17 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i 
const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c index 91af051c71..9669b638b5 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c +++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c @@ -1136,8 +1136,9 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -1894,11 +1895,13 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] 
CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2859,14 +2862,17 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c index 698e0817a4..96e2e35562 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c @@ -908,17 +908,21 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64) // int8_t zero point value. 
__m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c index 0276e9b7d3..306fb525a5 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c @@ -753,17 +753,21 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( 
int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -1560,17 +1564,21 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2216,17 +2224,21 @@ 
LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2722,17 +2734,21 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - 
_mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3076,17 +3092,21 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 3 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); __m128i zero_point3 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 3 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 3 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c index 119d973a06..1c7a7e8d75 
100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c @@ -1959,8 +1959,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2316,8 +2317,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2617,8 +2619,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2862,8 +2865,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + 
_mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3051,8 +3055,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -3544,11 +3549,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -4052,11 +4059,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* 
)post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -4473,11 +4482,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -4807,11 +4818,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -5054,11 +5067,13 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 
* 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -5678,14 +5693,17 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -6331,14 +6349,17 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* 
)post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -6866,14 +6887,17 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -7283,14 +7307,17 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 
) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -7582,14 +7609,17 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c index 4b163350c8..bfe3fb6ce1 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c +++ 
b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c @@ -1671,8 +1671,9 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2579,11 +2580,13 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c index f537057b3d..f4feda174f 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c @@ -1033,8 +1033,9 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 0 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( 
( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -1734,11 +1735,13 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 1 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); @@ -2625,14 +2628,17 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48) _mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor + post_ops_attr.post_op_c_j + ( 2 * 16 ) ); __m128i zero_point0 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 0 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 0 * 16 ) ) ); __m128i zero_point1 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 1 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 1 * 16 ) ) ); __m128i zero_point2 = - _mm_loadu_epi8( ( int8_t* )post_ops_list_temp->op_args1 + - post_ops_attr.post_op_c_j + ( 2 * 16 ) ); + _mm_loadu_si128( ( __m128i const* ) + ( ( int8_t* )post_ops_list_temp->op_args1 + + post_ops_attr.post_op_c_j + ( 2 * 16 ) ) ); // c[0, 0-15] CVT_MULRND_CVT32(c_int32_0p0,selector1,zero_point0); From 65c62c86a373d04485e47171f154f04c6c673260 
Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Fri, 17 Nov 2023 11:17:31 +0530 Subject: [PATCH 214/226] Fixed functionality failure in c/z trsm framework code. - For the inputs where either m or n is 1, based on right or left side, it invokes c/z scalv kernel and post that it scales the matrix post checking whether the input is blis conjugate transpose or not. - Previously the check condition was case sensitive *diaga = 'n', and as a result, it is always executing the "else" code-part. - Fixed the condition check. AMD-Internal: [CPUPL-4204] Change-Id: Iae2514c742ab17ac6c6e43036da095a74ad131c5 (cherry picked from commit c6ed4909078ddb703a668017450d074e972b1c67) --- frame/compat/bla_trsm_amd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c index 2294518b6a..37f5fba4a0 100644 --- a/frame/compat/bla_trsm_amd.c +++ b/frame/compat/bla_trsm_amd.c @@ -1362,7 +1362,7 @@ void ztrsm_blis_impl * As the dimension of A is 1x1, there's going to * be only one 1 element of A. */ - if(*transa == 'C' && *diaga == 'N') + if(blis_transa == BLIS_CONJ_TRANSPOSE) { a_dup.real = a->real; a_dup.imag = a->imag * -1.0; @@ -1466,7 +1466,7 @@ void ztrsm_blis_impl * As the dimension of A is 1x1, there's going to * be only one 1 element of A. */ - if(*transa == 'C' && *diaga == 'N') + if(blis_transa == BLIS_CONJ_TRANSPOSE) { a_dup.real = a->real; a_dup.imag = a->imag * -1.0; @@ -1748,7 +1748,7 @@ void ctrsm_blis_impl * As the dimension of A is 1x1, there's going to * be only one 1 element of A. */ - if(*transa == 'C' && *diaga == 'N') + if(blis_transa == BLIS_CONJ_TRANSPOSE) { a_dup.real = a->real; a_dup.imag = a->imag * -1.0; @@ -1852,7 +1852,7 @@ void ctrsm_blis_impl * As the dimension of A is 1x1, there's going to * be only one 1 element of A. 
*/ - if(*transa == 'C' && *diaga == 'N') + if(blis_transa == BLIS_CONJ_TRANSPOSE) { a_dup.real = a->real; a_dup.imag = a->imag * -1.0; From 384ba48e251d070d9263451245a318678cc10052 Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Thu, 9 Nov 2023 12:29:58 +0530 Subject: [PATCH 215/226] Re-implements ddotv edge kernel using masked instructions - This commit uses avx2 and avx512 masked load instructions for handling edge case where vector size is not exact multiple of avx2/avx512 vector register size. - Thanks to Shubham, Sharma for avx512 ddotv kernel changes Change-Id: I998651eeb1083caf3308f1b45bd7d55b7974bcb4 (cherry picked from commit e91d23ff05da9c4da4ce8fe1a0eba7212d31ca13) --- kernels/zen/1/bli_dotv_zen_int10.c | 22 +++++-- kernels/zen4/1/bli_dotv_zen_int_avx512.c | 75 +++++++++--------------- 2 files changed, 46 insertions(+), 51 deletions(-) diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c index c239612006..77c34b53c7 100644 --- a/kernels/zen/1/bli_dotv_zen_int10.c +++ b/kernels/zen/1/bli_dotv_zen_int10.c @@ -53,6 +53,17 @@ typedef union double d[4] __attribute__((aligned(64))); } v4df_t; + +//Loads lower 3 64-bit double precision elements into ymm register +static int64_t mask_3[4] = {-1, -1, -1, 0}; +//Loads lower 2 64-bit double precision elements into ymm register +static int64_t mask_2[4] = {-1, -1, 0, 0}; +//Loads lower 1 64-bit double precision elements into ymm register +static int64_t mask_1[4] = {-1, 0, 0, 0}; +//Loads no elements into ymm register (all mask lanes are zero); keeps mask_ptr indexable by remainder count +static int64_t mask_0[4] = {0, 0, 0, 0}; + +static int64_t *mask_ptr[] = {mask_0, mask_1, mask_2, mask_3}; // ----------------------------------------------------------------------------- void bli_sdotv_zen_int10 @@ -421,12 +432,15 @@ void bli_ddotv_zen_int10 y0 += 1*n_elem_per_reg; } - for ( ; (i + 0) < n; i += 1 ) + if(i < n) { - rho0 += (*x0) * (*y0); + __m256i maskVec = _mm256_loadu_si256( (__m256i *)mask_ptr[(n - i)]); - x0 += 1; - y0 += 1; +
xv[0] = _mm256_maskload_pd( x0, maskVec ); + yv[0] = _mm256_maskload_pd( y0, maskVec ); + + rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v ); + i = n; } // Manually add the results from above to finish the sum. diff --git a/kernels/zen4/1/bli_dotv_zen_int_avx512.c b/kernels/zen4/1/bli_dotv_zen_int_avx512.c index 681e4bda5b..4d9708e751 100644 --- a/kernels/zen4/1/bli_dotv_zen_int_avx512.c +++ b/kernels/zen4/1/bli_dotv_zen_int_avx512.c @@ -334,8 +334,13 @@ void bli_ddotv_zen_int_avx512 x0 += 2 * n_elem_per_reg; y0 += 2 * n_elem_per_reg; } + rhov[0] = _mm512_add_pd(rhov[0], rhov[2]); + rhov[1] = _mm512_add_pd(rhov[1], rhov[3]); - for (; (i + 7) < n; i += 8) + rhov[0] = _mm512_add_pd(rhov[0], rhov[4]); + rhov[0] = _mm512_add_pd(rhov[0], rhov[1]); + + if((i + 7) < n) { xv[0] = _mm512_loadu_pd(x0); @@ -345,57 +350,33 @@ void bli_ddotv_zen_int_avx512 x0 += n_elem_per_reg; y0 += n_elem_per_reg; + i += 8; } - - __m256d temp[2]; - temp[0] = _mm256_setzero_pd(); - - for (; (i + 3) < n; i += 4) + if(i < n) { - __m256d x_vec = _mm256_loadu_pd(x0); - - __m256d y_vec = _mm256_loadu_pd(y0); - - temp[0] = _mm256_fmadd_pd(x_vec, y_vec, temp[0]); - - x0 += 4; - y0 += 4; - } - - __m128d temp_128[2]; - temp_128[0] = _mm_setzero_pd(); + // calculate mask based on remainder elements of vector + // which are not in multiple of 8. + // Here bitmask is prepared based on remainder elements + // to load only required elements from memory into + // vector register. + //for example if n-i=3 case bitmask is prepared as following. + //1 is shifted by n-i(3), mask becomes 0b1000. + //subtracting 1 from it makes mask 0b111 which states that + //3 elements from memory are to be loaded into vector register.
+ __mmask8 mask = (1 << (n-i)) - 1; + rhov[1] = _mm512_setzero_pd(); + + xv[0] = _mm512_mask_loadu_pd(rhov[1], mask, x0); + + yv[0] = _mm512_mask_loadu_pd(rhov[1], mask, y0); - for (; (i + 1) < n; i += 2) - { - __m128d x_vec = _mm_loadu_pd(x0 + 0 * n_elem_per_reg); - - __m128d y_vec = _mm_loadu_pd(y0 + 0 * n_elem_per_reg); - - temp_128[0] = _mm_fmadd_pd(x_vec, y_vec, temp_128[0]); + rhov[0] = _mm512_fmadd_pd(xv[0], yv[0], rhov[0]); - x0 += 2; - y0 += 2; + x0 += (n-i); + y0 += (n-i); + i += (n-i); } - - // Add the results from above to finish the sum. - rhov[0] = _mm512_add_pd(rhov[0], rhov[2]); - rhov[1] = _mm512_add_pd(rhov[1], rhov[3]); - - rhov[0] = _mm512_add_pd(rhov[0], rhov[1]); - rhov[0] = _mm512_add_pd(rhov[0], rhov[4]); - - temp[1] = _mm512_extractf64x4_pd(rhov[0], 0); - temp[0] = _mm256_add_pd(temp[0], temp[1]); - - temp[1] = _mm512_extractf64x4_pd(rhov[0], 1); - temp[0] = _mm256_add_pd(temp[0], temp[1]); - - temp_128[1] = _mm256_extractf64x2_pd(temp[0], 0); - temp_128[0] = _mm_add_pd(temp_128[0], temp_128[1]); - temp_128[1] = _mm256_extractf64x2_pd(temp[0], 1); - temp_128[0] = _mm_add_pd(temp_128[0], temp_128[1]); - - rho0 = temp_128[0][0] + temp_128[0][1]; + rho0 = _mm512_reduce_add_pd(rhov[0]); } for (; i < n; ++i) From 6ab76f52df1587d994168277c8ecca58d7d4e156 Mon Sep 17 00:00:00 2001 From: mangala v Date: Wed, 22 Nov 2023 22:17:42 +0530 Subject: [PATCH 216/226] Gtestsuite: Updated sgemm testcase for sup Updated sgemm testcase to handle multiple values of alpha, beta for different input sizes Added sgemm testcase to cover m,n,k dimensions up to at least size 20 in steps of 1 Change-Id: Id10ba3d7a05154b171511ef11ea76297494672cd --- .../testsuite/level3/gemm/sgemm_generic.cpp | 53 +++++-------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp index 6e11c8956a..2adbe2968a 100644 --- a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp
+++ b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp @@ -127,7 +127,7 @@ class SGemmTestPrint { // Black box testing. INSTANTIATE_TEST_SUITE_P( - Blackbox, + sgemm_sup_10_30, SGemmTest, ::testing::Combine( ::testing::Values('c' @@ -151,7 +151,7 @@ INSTANTIATE_TEST_SUITE_P( // Black box testing. INSTANTIATE_TEST_SUITE_P( - sgemm_sup_m, + sgemm_sup_alpha_beta, SGemmTest, ::testing::Combine( ::testing::Values('c' @@ -161,36 +161,11 @@ INSTANTIATE_TEST_SUITE_P( ), // storage format ::testing::Values('n','t'), // transa ::testing::Values('n','t'), // transb - ::testing::Range(gtint_t(1), gtint_t(600), 1), // m - ::testing::Values(50), // n - ::testing::Values(30), // k - ::testing::Values( 1.0, 0.0, -2.0), // alpha - ::testing::Values(-1.0, 1.0, 0.0), // beta - ::testing::Values(gtint_t(2)), // increment to the leading dim of a - ::testing::Values(gtint_t(3)), // increment to the leading dim of b - ::testing::Values(gtint_t(7)) // increment to the leading dim of c - ), - ::SGemmTestPrint() - ); - - -// Black box testing. 
-INSTANTIATE_TEST_SUITE_P( - sgemm_sup_n, - SGemmTest, - ::testing::Combine( - ::testing::Values('c' -#ifndef TEST_BLAS - ,'r' -#endif - ), // storage format - ::testing::Values('n','t'), // transa - ::testing::Values('n','t'), // transb - ::testing::Values(30), // m - ::testing::Range(gtint_t(1), gtint_t(600), 1), // n - ::testing::Values(30), // k - ::testing::Values( 1.0, 0.0, -2.0), // alpha - ::testing::Values(-1.0, 1.0, 0.0), // beta + ::testing::Range(gtint_t(1), gtint_t(20), 1), // m + ::testing::Range(gtint_t(1), gtint_t(50), 1), // n + ::testing::Range(gtint_t(1), gtint_t(10), 1), // k + ::testing::Values(0.0, 1.0, -1.0, 5.3, -10.0), // alpha + ::testing::Values(0.0, 1.0, -1.0, 6.4, -19.0), // beta ::testing::Values(gtint_t(2)), // increment to the leading dim of a ::testing::Values(gtint_t(3)), // increment to the leading dim of b ::testing::Values(gtint_t(7)) // increment to the leading dim of c @@ -209,13 +184,13 @@ INSTANTIATE_TEST_SUITE_P( ,'r' #endif ), // storage format - ::testing::Values('n','t'), // transa - ::testing::Values('n','t'), // transb - ::testing::Range(gtint_t(1), gtint_t(100), 1), // m - ::testing::Range(gtint_t(1), gtint_t(100), 1), // n - ::testing::Range(gtint_t(1), gtint_t(100), 1), // k - ::testing::Values( 1.0, 0.0, -2.0), // alpha - ::testing::Values(-1.0, 1.0, 0.0), // beta + ::testing::Values('n'), // transa + ::testing::Values('n'), // transb + ::testing::Range(gtint_t(1), gtint_t(20), 1), // m + ::testing::Range(gtint_t(1), gtint_t(50), 1), // n + ::testing::Range(gtint_t(1), gtint_t(20), 1), // k + ::testing::Values( -2.0), // alpha + ::testing::Values( 5.0), // beta ::testing::Values(gtint_t(2)), // increment to the leading dim of a ::testing::Values(gtint_t(3)), // increment to the leading dim of b ::testing::Values(gtint_t(7)) // increment to the leading dim of c From 841c1067ed7672c37c533b39b4000d57e9fdcfc8 Mon Sep 17 00:00:00 2001 From: Eleni Vlachopoulou Date: Fri, 24 Nov 2023 02:16:07 +0530 Subject: [PATCH 
217/226] GTestSuite: Clean-up on build system. - and a small bugfix so that it works again on Windows. Change-Id: I986b81d74d0f00c55eee497712aed5b268211d5f --- gtestsuite/CMakeLists.txt | 168 +++++++++--------- .../src/level3/ref_gemm_compute.cpp | 2 +- gtestsuite/testsuite/CMakeLists.txt | 2 +- 3 files changed, 88 insertions(+), 84 deletions(-) diff --git a/gtestsuite/CMakeLists.txt b/gtestsuite/CMakeLists.txt index bc55cdc834..37a117172f 100644 --- a/gtestsuite/CMakeLists.txt +++ b/gtestsuite/CMakeLists.txt @@ -49,76 +49,78 @@ if(APPLE) endif() # Set the path to the BLIS installation. -if(LINUX) - if(NOT(BLIS_PATH)) - message(FATAL_ERROR "Need to provide a BLIS installation path during CMake invocation. Please use \ - $ cmake .. -DBLIS_PATH=/home/username/blis_installation") - endif() - # Set the path to BLIS include directory. - set(BLIS_INCLUDE ${BLIS_PATH}/include/blis) -else() - if(NOT(BLIS_LIB_PATH)) - message(FATAL_ERROR "Need to provide a path to BLIS library during CMake invocation. Please use \ - $ cmake .. -DBLIS_LIB_PATH=/home/username/blis_installation/path_to_library") - endif() - # Set the path to BLIS include directory. - if(NOT(BLIS_INCLUDE)) - message(FATAL_ERROR "Need to provide a path to BLIS headers during CMake invocation. Please use \ - $ cmake .. -DBLIS_INCLUDE=/home/username/blis_installation/path_to_headers") - endif() +set(BLIS_PATH "undefined" CACHE STRING "Setting the path to a BLIS installation that needs testing.") +if(BLIS_PATH STREQUAL "undefined") + message(FATAL_ERROR "Need to provide a BLIS installation path during CMake invocation. Please use \ + $ cmake .. -DBLIS_PATH=/home/username/blis_installation") endif() +# Set the path to BLIS include directory. +# Adding both paths so that testing works with installation using configure/Make or CMake. 
+set(BLIS_INCLUDE ${BLIS_PATH}/include/ ${BLIS_PATH}/include/blis CACHE STRING "Setting the path to the BLIS headers.") +set(BLIS_LIB_PATH ${BLIS_PATH}/lib CACHE STRING "Setting the path to the BLIS library.") + # Use REF_BLAS to set the library that will be used for reference results. set(REF_CBLAS CACHE STRING "Library used to compute reference results.") -# Set the possible values of reference CBLAS for cmake-gui +# Set the possible values of reference CBLAS for cmake-gui and throw errors for disabled options. if(LINUX) set_property(CACHE REF_CBLAS PROPERTY STRINGS "OpenBLAS" "Netlib" "MKL") + if(NOT ((REF_CBLAS STREQUAL "OpenBLAS") OR (REF_CBLAS STREQUAL "Netlib") OR(REF_CBLAS STREQUAL "MKL"))) + message(FATAL_ERROR "REF_CBLAS option '${REF_CBLAS}' is not supported. Please, use one of the following options \ + during CMake invokation: OpenBLAS, Netlib, MKL or modify CMakeLists.txt to include this option.") + endif() else() set_property(CACHE REF_CBLAS PROPERTY STRINGS "OpenBLAS" "MKL") + if(NOT ((REF_CBLAS STREQUAL "OpenBLAS") OR (REF_CBLAS STREQUAL "MKL"))) + message(FATAL_ERROR "REF_CBLAS option '${REF_CBLAS}' is not supported. Please, use one of the following options \ + during CMake invokation: OpenBLAS, MKL or modify CMakeLists.txt to include this option.") + endif() endif() # Set OpenMP as the default option -set(ENABLE_THREADING "openmp" CACHE STRING "Setting OpenMP as the threading library") +set(ENABLE_THREADING "openmp" CACHE STRING "the threading flag") # Set the possible values of theading libraries for cmake-gui -if(LINUX) - set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "pthreads" "no") -else() +if(WIN32) set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "no") + if( NOT ((ENABLE_THREADING STREQUAL "openmp") OR (ENABLE_THREADING STREQUAL "no")) ) + message(FATAL_ERROR "ENABLE_THREADING option '${ENABLE_THREADING}' is not supported. 
Please use one of the following options \ + during CMake invokation: openmp, no") + endif() +else() + set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "pthreads" "no") + if( NOT ((ENABLE_THREADING STREQUAL "openmp") OR (ENABLE_THREADING STREQUAL "pthreads") OR (ENABLE_THREADING STREQUAL "no")) ) + message(FATAL_ERROR "ENABLE_THREADING option '${ENABLE_THREADING}' is not supported. Please use one of the following options \ + during CMake invokation: openmp, pthreads, no") + endif() endif() - -# Set the possibe values of OpenMP runtimes +# Setting path to OpenMP runtime. if(WIN32) - # Set LLVM OpenMP library as the default option - set(OpenMP_LIBRARY "LLVM" CACHE STRING "Using LLVM OpenMP library") - set_property(CACHE OpenMP_LIBRARY PROPERTY STRINGS "LLVM" "Intel") + set(OpenMP_libomp_LIBRARY "C:/Program Files/LLVM/lib/libomp.lib" CACHE STRING "openmp library path") endif() # If MKL is used as a reference set up the threading library options. if(REF_CBLAS STREQUAL "MKL") # MKL threading option is set up as BLIS threading option by default. - set(MKL_ENABLE_THREADING ${ENABLE_THREADING} CACHE STRING "Setting MKL threading the same as BLIS threading") + set(MKL_ENABLE_THREADING ${ENABLE_THREADING} CACHE STRING "Setting MKL threading option.") endif() # Set up OpenMP flags correctly if it's required. 
if( (ENABLE_THREADING STREQUAL "openmp") OR (MKL_ENABLE_THREADING STREQUAL "openmp") ) - if(WIN32) - set(OpenMP_libomp_LIBRARY "C:/Program Files/LLVM/lib/libomp.lib" CACHE STRING "openmp library path") - endif() find_package(OpenMP) - if(OPENMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - else() + if(NOT OPENMP_FOUND) message (FATAL_ERROR "Openmp Not Found, please provide an OpenMP library using -DOpenMP_libomp_LIBRARY=path_to_omp_lib.") endif() endif() # Set static BLIS as the default library we build against. -set(BLIS_LINKING_TYPE "static" CACHE STRING "Linking to a static BLIS library") +set(BLIS_LINKING_TYPE "static" CACHE STRING "Type of BLIS library (shared or static) that is being tested.") # Set the possible values of BLIS linking type for cmake-gui set_property(CACHE BLIS_LINKING_TYPE PROPERTY STRINGS "static" "shared") +if( NOT ((BLIS_LINKING_TYPE STREQUAL "static") OR (BLIS_LINKING_TYPE STREQUAL "shared")) ) + message(FATAL_ERROR "BLIS_LINKING_TYPE option '${BLIS_LINKING_TYPE}' is not supported. Please use one of the following options \ + during CMake invokation: static, shared") +endif() # Set common libraries. if(LINUX) @@ -128,20 +130,21 @@ if(LINUX) endif() # Use INT_SIZE to set the int type used for testing. -set(INT_SIZE "32" CACHE STRING "Integer size used in testing suite.") +set(INT_SIZE "32" CACHE STRING "Integer size used in testing suite. Must match the integer size of BLIS.") # Set the possible values of reference CBLAS for cmake-gui set_property(CACHE INT_SIZE PROPERTY STRINGS "32" "64") if( NOT ((INT_SIZE STREQUAL "32") OR (INT_SIZE STREQUAL "64")) ) - message(FATAL_ERROR "INT_SIZE option ${INT_SIZE} is not supported. Must be 32 or 64.") + message(FATAL_ERROR "INT_SIZE option '${INT_SIZE}' is not supported. 
Please use one of the following options \ + during CMake invokation: 32, 64") endif() # Use TEST_INTERFACE to set which interface, supported by BLIS is meant to be tested. -set(TEST_INTERFACE "BLAS" CACHE STRING "Interface that is being tested.") +set(TEST_INTERFACE "BLAS" CACHE STRING "Interface of BLIS that is being tested.") # Set the possible values of interfaces for cmake-gui set_property(CACHE TEST_INTERFACE PROPERTY STRINGS "BLAS" "CBLAS" "BLIS_TYPED") if( NOT ((TEST_INTERFACE STREQUAL "BLAS") OR (TEST_INTERFACE STREQUAL "CBLAS") OR (TEST_INTERFACE STREQUAL "BLIS_TYPED")) ) message(FATAL_ERROR "TEST_INTERFACE option ${TEST_INTERFACE} is not supported. Please use on of the following options \ - during CMake invokation: -DTEST_INTERFACE=BLAS or -DTEST_INTERFACE=CBLAS or -DTEST_INTERFACE=BLIS_TYPED") + during CMake invokation: BLAS, CBLAS, BLIS_TYPED") endif() # Use BLIS_ELEMENT_TYPE to set whether the elements of any matrix/vector tested are integers or floating point values. @@ -150,7 +153,7 @@ set(BLIS_ELEMENT_TYPE "f" CACHE STRING "Type of elements of matrices/vectors") set_property(CACHE BLIS_ELEMENT_TYPE PROPERTY STRINGS "f" "i") if( NOT ((BLIS_ELEMENT_TYPE STREQUAL "f") OR (BLIS_ELEMENT_TYPE STREQUAL "i")) ) message(FATAL_ERROR "BLIS_ELEMENT_TYPE option ${BLIS_ELEMENT_TYPE} is not supported. Please use on of the following options \ - during CMake invokation: -DBLIS_ELEMENT_TYPE=f or -DBLIS_ELEMENT_TYPE=i") + during CMake invokation: f, i") endif() if(LINUX) @@ -232,6 +235,42 @@ else() #WIN32 endif() endif() +# Set up the library name. +if(WIN32) + set(LIBBLIS AOCL-LibBlis-Win) +else() + set(LIBBLIS blis) +endif() +# Append if threading is required. +if(NOT (ENABLE_THREADING STREQUAL "no")) + if(WIN32) + string(APPEND LIBBLIS -MT) + else() + string(APPEND LIBBLIS -mt) + endif() +endif() +# Append for dll if necessary. +if(WIN32 AND (BLIS_LINKING_TYPE STREQUAL "shared")) + string(APPEND LIBBLIS -dll) +endif() +# Setting the suffix for find_library(). 
+if(WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib) +else() + if(BLIS_LINKING_TYPE STREQUAL "shared") + set(CMAKE_FIND_LIBRARY_SUFFIXES .so) + else() + set(CMAKE_FIND_LIBRARY_SUFFIXES .a) + endif() +endif() + +find_library(BLIS_LIBRARY NAMES ${LIBBLIS} PATHS ${BLIS_LIB_PATH}) +if(${BLIS_LIBRARY} STREQUAL BLIS_LIBRARY-NOTFOUND) + message(FATAL_ERROR "Blis Library ${LIBBLIS} not found in BLIS_LIB_PATH=${BLIS_LIB_PATH}") +else() + message(STATUS "Found ${LIBBLIS} BLIS Library : " ${BLIS_LIBRARY}) +endif() + # Set compiler options and BLIS library for Linux. if(LINUX) # Add compiler definition. @@ -245,54 +284,19 @@ if(LINUX) if(ENABLE_COVERAGE) set(CMAKE_CXX_FLAGS "-O0 --coverage") endif() +endif() - if(ENABLE_THREADING STREQUAL "no") - if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.a") - else() - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.so") - endif() - find_library(libblis NAMES blis PATHS ${BLIS_PATH}/lib) - else() - if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.a") - else() - set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.so") - endif() - find_library(libblis NAMES blis-mt PATHS ${BLIS_PATH}/lib) - endif() - if(${libblis} STREQUAL libblis-NOTFOUND) - message(FATAL_ERROR "Blis Library not found : " ${BLIS_PATH}) - else() - message(STATUS "Found BLIS Library : " ${Blis_LIBRARY}) - endif() -else() - add_definitions(-DBOOST_THREAD_USE_LIB) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) +#Setting up the correct Windows Runtime Library. 
+if(WIN32) cmake_policy(SET CMP0091 NEW) - if(BLIS_LINKING_TYPE STREQUAL "shared") + if(BUILD_SHARED_LIBS) set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL") else() set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>") endif() - - if(ENABLE_THREADING STREQUAL "no") - if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win.lib") - else() - set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-dll.lib") - set(BLIS_DLL "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-dll.dll") - endif() - else() - if(BLIS_LINKING_TYPE STREQUAL "static") - set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-MT.lib") - else() - set(Blis_LIBRARY "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-MT-dll.lib") - set(BLIS_DLL "${BLIS_LIB_PATH}/AOCL-LibBlis-Win-MT-dll.dll") - endif() - endif() endif() + add_subdirectory(testinghelpers) add_subdirectory(testsuite) diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp index dd069fcd8a..21c055f9dd 100644 --- a/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp +++ b/gtestsuite/testinghelpers/src/level3/ref_gemm_compute.cpp @@ -33,7 +33,7 @@ */ #include "blis.h" -#include + #include "level3/ref_gemm_compute.h" /* diff --git a/gtestsuite/testsuite/CMakeLists.txt b/gtestsuite/testsuite/CMakeLists.txt index f03b7463b0..ece8c8434a 100644 --- a/gtestsuite/testsuite/CMakeLists.txt +++ b/gtestsuite/testsuite/CMakeLists.txt @@ -88,7 +88,7 @@ foreach(dir ${DIRS}) set_target_properties(${target_name}.${dir}.${subdir} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) set_target_properties(${target_name}.${dir}.${subdir} PROPERTIES OUTPUT_NAME ${target_name}.${dir}.${subdir}) target_include_directories(${target_name}.${dir}.${subdir} PUBLIC ${BLIS_INCLUDE} ${CMAKE_SOURCE_DIR}/testinghelpers/inc ${CMAKE_SOURCE_DIR}/testsuite/) - target_link_libraries(${target_name}.${dir}.${subdir} gtest gtest_main testinghelpers ${Blis_LIBRARY} ${COMMON_LIBS}) + 
target_link_libraries(${target_name}.${dir}.${subdir} gtest gtest_main testinghelpers ${BLIS_LIBRARY} ${COMMON_LIBS}) # if we test serial BLIS, but MKL is used as a reference we still need to set up OpenMP. if( (ENABLE_THREADING STREQUAL "openmp") OR (MKL_ENABLE_THREADING STREQUAL "openmp")) target_link_libraries(${target_name}.${dir}.${subdir} OpenMP::OpenMP_CXX) From e3e04f8135c247e05604d382dfc9c0d8b4cb3c2a Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Mon, 30 Oct 2023 09:32:15 -0400 Subject: [PATCH 218/226] BLIS: Missing clobbers (batch 7) Add missing clobbers in: - bli_gemmsup_rv_haswell kernels - spare copies of kernels in old, other and broken subdirectories - misc kernels for legacy platforms AMD-Internal: [CPUPL-3521] Change-Id: I7cdb7fd1cb29630d8b7fa914b1002a270dfe9ef5 (cherry picked from commit 50608f28df1262ca6d26e3994e3bfb20e28b04fb) --- .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c | 10 ++- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 14 ++-- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c | 20 ++--- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 4 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c | 20 ++--- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c | 16 ++-- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c | 20 ++--- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 6 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c | 24 +++--- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 6 +- .../old/bli_gemmsup_rd_haswell_asm_d6x8.c | 30 ++++--- .../old/bli_gemmsup_rv_haswell_asm_d6x8.c | 63 +++++++++------ .../sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c | 38 ++++++--- .../old/bli_gemmsup_rd_haswell_asm_d6x8m.c | 38 ++++++--- .../old/bli_gemmsup_rd_haswell_asm_d6x8n.c | 38 ++++++--- kernels/knl/1m/bli_dpackm_knl_asm_24x8.c | 7 +- kernels/knl/1m/bli_spackm_knl_asm_24x16.c | 13 ++- kernels/knl/3/bli_dgemm_knl_asm_24x8.c | 4 +- kernels/knl/3/bli_sgemm_knl_asm_24x16.c | 3 +- .../3/bli_gemm_sandybridge_asm_d8x4.c | 17 +++- .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c | 11 +-- 
.../sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c | 9 ++- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c | 11 +-- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c | 9 ++- .../sup/other/bli_gemmsup_rd_zen_asm_s6x16.c | 29 ++++--- .../sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c | 18 +++-- .../sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c | 15 ++-- .../sup/other/bli_gemmsup_rv_zen_asm_s6x16.c | 79 +++++++++++++------ .../sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c | 9 ++- .../sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c | 21 +++-- 30 files changed, 377 insertions(+), 225 deletions(-) diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c index 403aaaaeef..8d0060b2f5 100644 --- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c +++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -760,7 +761,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1857,7 +1859,8 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", - "memory" + "xmm0", "xmm2", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } @@ -2530,7 +2533,8 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", - "memory" + "xmm0", "xmm2", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index 14093d4f42..892f0b5609 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -10144,7 +10144,7 @@ static void bli_dgemmsup_rv_haswell_asm_6x3m "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", - "memory" + "ymm15", "memory" ) consider_edge_cases_nleft_3: @@ -10637,8 +10637,8 @@ static void bli_dgemmsup_rv_haswell_asm_6x1m "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases_nleft_1: @@ -12036,8 +12036,8 @@ void 
bli_dgemmsup_rv_haswell_asm_6x4m "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -12677,8 +12677,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2m "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm14", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c index 390f3edb9f..42fa8c50a1 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx1.c @@ -540,8 +540,8 @@ void bli_dgemmsup_rv_haswell_asm_5x1 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm15", "memory" ) } @@ -937,8 +937,8 @@ void bli_dgemmsup_rv_haswell_asm_4x1 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1313,8 +1313,8 @@ void bli_dgemmsup_rv_haswell_asm_3x1 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm6", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm15", "memory" ) } @@ -1663,8 +1663,8 @@ void 
bli_dgemmsup_rv_haswell_asm_2x1 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm15", "memory" ) } @@ -1978,7 +1978,7 @@ void bli_dgemmsup_rv_haswell_asm_1x1 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm10", + "ymm12", "ymm15", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c index 71178b2907..15401cdb07 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c @@ -2142,7 +2142,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm2", "ymm3", - "memory" + "ymm10", "ymm11", "memory" ) } @@ -2467,7 +2467,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm2", "ymm3", - "memory" + "ymm7", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c index 31ee7ee1ab..3661ddf591 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx3.c @@ -635,8 +635,8 @@ void bli_dgemmsup_rv_haswell_asm_5x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1033,8 +1033,8 @@ void bli_dgemmsup_rv_haswell_asm_4x3 "xmm8", "xmm9", "xmm10", 
"xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm12", + "ymm11", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -1408,8 +1408,8 @@ void bli_dgemmsup_rv_haswell_asm_3x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm6", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm15", "memory" ) } @@ -1757,8 +1757,8 @@ void bli_dgemmsup_rv_haswell_asm_2x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm15", "memory" ) } @@ -2072,7 +2072,7 @@ void bli_dgemmsup_rv_haswell_asm_1x3 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm10", + "ymm12", "ymm15", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c index 99a128a238..a4f6ec48cd 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c @@ -1040,8 +1040,8 @@ void bli_dgemmsup_rv_haswell_asm_5x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "memory" ) } @@ -1457,8 +1457,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", - "memory" + "ymm5", 
"ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "memory" ) } @@ -1880,8 +1880,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", - "memory" + "ymm6", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "memory" ) } @@ -2240,7 +2240,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", - "memory" + "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -2579,7 +2579,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c index 8c6a45c513..b9473fff27 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx5.c @@ -865,8 +865,8 @@ void bli_dgemmsup_rv_haswell_asm_5x5 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm5", "ymm7", "ymm9", "ymm11", "ymm15", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm12", "ymm11", "ymm15", "memory" ) } @@ -1327,8 +1327,8 @@ void bli_dgemmsup_rv_haswell_asm_4x5 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm5", "ymm7", "ymm9", "ymm15", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm12", "ymm15", "memory" ) } @@ -1750,9 +1750,9 @@ void bli_dgemmsup_rv_haswell_asm_3x5 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", 
"ymm10", "ymm12", - "ymm5", "ymm7", "ymm11", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -2140,8 +2140,8 @@ void bli_dgemmsup_rv_haswell_asm_2x5 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm5", "ymm15", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm15", "memory" ) } @@ -2492,7 +2492,7 @@ void bli_dgemmsup_rv_haswell_asm_1x5 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm12", "ymm15", "memory" ) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index caa20a06cd..858415e86d 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -2270,7 +2270,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", - "memory" + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2700,7 +2700,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", - "memory" + "ymm12", "ymm13", "memory" ) } @@ -3077,7 +3077,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", - "memory" + "ymm6", "ymm7", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c index e25c67230c..be22b32b41 100644 --- 
a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx7.c @@ -943,8 +943,8 @@ void bli_dgemmsup_rv_haswell_asm_5x7 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm5", "ymm7", "ymm9", "ymm15", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm15", "memory" ) } @@ -1405,8 +1405,8 @@ void bli_dgemmsup_rv_haswell_asm_4x7 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm5", "ymm7", "ymm11", "ymm15", + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm15", "memory" ) } @@ -1833,9 +1833,9 @@ void bli_dgemmsup_rv_haswell_asm_3x7 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm5", "ymm15", - "memory" + "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", + "ymm14", "ymm15", "memory" ) } @@ -2225,9 +2225,8 @@ void bli_dgemmsup_rv_haswell_asm_2x7 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm15", - "memory" + "ymm5", "ymm6", "ymm8", "ymm10","ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2579,8 +2578,7 @@ void bli_dgemmsup_rv_haswell_asm_1x7 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", - "ymm6", "ymm8", "ymm10", "ymm12", - "ymm15", - "memory" + "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm12", "ymm15", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c index cb581bf72a..7c08eb2a50 100644 --- 
a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c @@ -2420,7 +2420,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", - "memory" + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -2842,7 +2842,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", - "memory" + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) } @@ -3235,7 +3235,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", - "memory" + "ymm6", "ymm7", "ymm8", "ymm9", "memory" ) } diff --git a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c b/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c index 8aa5f94f76..2a518c794a 100644 --- a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c +++ b/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -695,7 +695,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1188,7 +1190,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -1586,7 +1589,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2117,7 +2120,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -2564,7 +2569,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -2927,7 +2933,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", 
"ymm10", "ymm13", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -3480,7 +3486,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -3914,7 +3921,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -4270,7 +4278,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -4585,7 +4593,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); diff --git a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c b/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c index 4e37f6d1b6..d8e8fb148a 100644 --- a/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c +++ b/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -809,7 +809,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -1437,7 +1439,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -1927,7 +1930,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -2444,7 +2448,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -2848,7 +2853,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3216,7 +3221,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", 
"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -3823,7 +3828,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -4449,7 +4456,8 @@ void bli_dgemmsup_rv_haswell_asm_5x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); @@ -4933,7 +4941,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -5447,7 +5456,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -5863,7 +5873,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -6233,7 +6244,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", 
"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -6710,7 +6721,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -7205,7 +7217,8 @@ void bli_dgemmsup_rv_haswell_asm_5x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -7606,7 +7619,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -8014,7 +8027,7 @@ void bli_dgemmsup_rv_haswell_asm_3x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -8357,7 +8370,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -8677,7 +8690,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -9132,7 +9145,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 
"xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -9582,7 +9595,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -9971,7 +9984,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -10377,7 +10390,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -10707,7 +10720,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } @@ -11014,7 +11027,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "memory" ) AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); } diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c index c5addd9cf2..b48bf3cab6 100644 --- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c +++ b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. 
+ Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -646,7 +646,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -1130,7 +1132,9 @@ void bli_dgemmsup_rd_haswell_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -1571,7 +1575,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1960,7 +1965,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -2454,7 +2459,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -2910,7 +2917,9 @@ void bli_dgemmsup_rd_haswell_asm_3x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", 
"ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -3314,7 +3323,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -3675,7 +3685,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -4184,7 +4194,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -4576,7 +4587,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "memory" ) } @@ -4929,7 +4941,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -5243,7 +5255,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c index 55ae6d0f91..def75c5e47 100644 --- a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -695,7 +695,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1242,7 +1244,9 @@ void bli_dgemmsup_rd_haswell_asm_3x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -1695,7 +1699,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -2090,7 +2095,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -2620,7 +2625,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -3120,7 +3127,9 @@ void bli_dgemmsup_rd_haswell_asm_3x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 
"xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -3527,7 +3536,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -3887,7 +3897,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -4437,7 +4447,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: @@ -4870,7 +4881,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "memory" ) } @@ -5224,7 +5236,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -5537,7 +5549,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "memory" ) } diff --git a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c index a23764f8d4..d738d46dfb 100644 --- 
a/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -713,7 +713,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1312,7 +1314,9 @@ void bli_dgemmsup_rd_haswell_asm_3x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1857,7 +1861,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) consider_edge_cases: @@ -2347,7 +2352,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) consider_edge_cases: @@ -2934,7 +2939,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", 
"ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -3444,7 +3451,9 @@ void bli_dgemmsup_rd_haswell_asm_3x4n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -3860,7 +3869,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -4229,7 +4239,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -4751,7 +4761,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5145,7 +5156,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "memory" ) } @@ -5508,7 +5520,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -5830,7 +5842,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", 
"memory" ) } diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c index 91fe1989f0..cd4c3aef61 100644 --- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c +++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -304,7 +305,8 @@ void bli_dpackm_knl_asm_8xk "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "k0", "k1", + "ymm0", "ymm3", "memory" ) } @@ -608,7 +610,8 @@ void bli_dpackm_knl_asm_24xk "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory" + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "k0", "k1", "k2", "k3", "ymm0", "ymm1", "ymm2", "ymm3", "memory" ) } diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c index 8c4bdfe6be..571e166cd4 100644 --- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c +++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -322,7 +323,11 @@ void bli_spackm_knl_asm_16xk "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "k0", "k1", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm12", "xmm13", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", + "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm15", "memory" ) } @@ -625,7 +630,11 @@ void bli_spackm_knl_asm_24xk "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "k0", "k1", + "k2", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm12", "xmm13", "xmm15", "ymm0", "ymm1", "ymm2", + "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm15", "memory" ) } diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c index b794e7c059..82e5a25435 100644 --- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c +++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -698,7 +699,8 @@ void bli_dgemm_knl_asm_24x8 "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", - "zmm30", "zmm31", "memory" + "zmm30", "zmm31", "k0", "k1", "k2", "xmm1", "ymm2", "ymm3", + "ymm5", "memory" ) #ifdef LOOPMON diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c index 6d485b5308..b1ed2abf74 100644 --- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c +++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -695,7 +696,7 @@ void bli_sgemm_knl_asm_24x16 "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", - "zmm30", "zmm31", "memory" + "zmm30", "zmm31", "k0", "k1", "k2", "xmm1", "ymm3", "ymm5", "memory" ) #ifdef LOOPMON diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c index a56ef16e5e..63ac331a60 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1022,7 +1023,9 @@ void bli_sgemm_sandybridge_asm_8x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -1697,7 +1700,9 @@ void bli_dgemm_sandybridge_asm_8x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -2658,7 +2663,9 @@ void bli_cgemm_sandybridge_asm_8x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -3508,7 +3515,9 @@ void bli_zgemm_sandybridge_asm_4x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c index 03c1627f15..b39b091753 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -548,7 +548,8 @@ void bli_cgemmsup_rv_zen_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -910,7 +911,7 @@ void bli_cgemmsup_rv_zen_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1286,7 +1287,7 @@ void bli_cgemmsup_rv_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } @@ -1604,7 +1605,7 @@ void bli_cgemmsup_rv_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c index 07fbd26296..d0f86f4ce6 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -739,7 +739,9 @@ void bli_cgemmsup_rv_zen_asm_3x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1230,7 +1232,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) consider_edge_cases: diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c index 1638eaba0b..3b2aedc7e2 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -540,7 +540,8 @@ void bli_zgemmsup_rv_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) } @@ -926,7 +927,7 @@ void bli_zgemmsup_rv_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ -1314,7 +1315,7 @@ void bli_zgemmsup_rv_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", "memory" ) } @@ -1650,7 +1651,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "memory" ) } diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c index 898e4006e9..cadba52ce4 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -702,7 +702,9 @@ void bli_zgemmsup_rv_zen_asm_3x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1194,7 +1196,8 @@ void bli_zgemmsup_rv_zen_asm_3x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) consider_edge_cases: diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c index 96bc927499..c0c4d5f198 100644 --- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c +++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c @@ -2,8 +2,10 @@ BLIS An object-based framework for developing high-performance BLAS-like libraries. + Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -328,7 +330,8 @@ void bli_sgemmsup_rd_zen_asm_2x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -559,7 +562,7 @@ void bli_sgemmsup_rd_zen_asm_1x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -857,7 +860,8 @@ void bli_sgemmsup_rd_zen_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } @@ -1087,7 +1091,7 @@ void bli_sgemmsup_rd_zen_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -1353,7 +1357,8 @@ void bli_sgemmsup_rd_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) } void bli_sgemmsup_rd_zen_asm_1x4 @@ -1567,7 +1572,7 @@ void bli_sgemmsup_rd_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) } @@ -1791,7 +1796,7 @@ void bli_sgemmsup_rd_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory" ) } @@ 
-1978,7 +1983,7 @@ void bli_sgemmsup_rd_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "memory" ) } @@ -2369,7 +2374,8 @@ void bli_sgemmsup_rd_zen_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. @@ -2663,6 +2669,7 @@ void bli_sgemmsup_rd_zen_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "memory" ) } diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c index 00773b3b58..7599b26d4e 100644 --- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -556,7 +556,9 @@ void bli_sgemmsup_rd_zen_asm_6x16m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1035,7 +1037,9 @@ void bli_sgemmsup_rd_zen_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1517,7 +1521,9 @@ void bli_sgemmsup_rd_zen_asm_6x4m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1923,7 +1929,9 @@ void bli_sgemmsup_rd_zen_asm_6x2m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c index dfe5ca28af..824189992b 100644 --- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c +++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -594,7 +594,9 @@ void bli_sgemmsup_rd_zen_asm_6x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1061,7 +1063,9 @@ void bli_sgemmsup_rd_zen_asm_3x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1471,7 +1475,8 @@ void bli_sgemmsup_rd_zen_asm_2x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7", "ymm8", + "ymm10", "ymm11", "ymm13", "ymm14", "memory" ) consider_edge_cases: @@ -1828,7 +1833,7 @@ void bli_sgemmsup_rd_zen_asm_1x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13", "memory" ) consider_edge_cases: diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c index 6c9f8cabe1..8915ec8e5d 100644 --- a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c +++ b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -720,7 +720,9 @@ void bli_sgemmsup_rv_zen_asm_5x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -1214,7 +1216,9 @@ void bli_sgemmsup_rv_zen_asm_4x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -1772,7 +1776,9 @@ void bli_sgemmsup_rv_zen_asm_3x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -2165,7 +2171,9 @@ void bli_sgemmsup_rv_zen_asm_2x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -2525,7 +2533,9 @@ void bli_sgemmsup_rv_zen_asm_1x16 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -2973,7 +2983,9 @@ void bli_sgemmsup_rv_zen_asm_6x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", 
"ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -3426,7 +3438,9 @@ void bli_sgemmsup_rv_zen_asm_5x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -3792,7 +3806,9 @@ void bli_sgemmsup_rv_zen_asm_4x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) } @@ -4204,7 +4220,8 @@ void bli_sgemmsup_rv_zen_asm_3x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -4530,7 +4547,8 @@ void bli_sgemmsup_rv_zen_asm_2x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -4793,7 +4811,8 @@ void bli_sgemmsup_rv_zen_asm_1x8 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5194,7 +5213,8 @@ void bli_sgemmsup_rv_zen_asm_6x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5582,7 +5602,8 @@ void 
bli_sgemmsup_rv_zen_asm_5x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -5920,7 +5941,8 @@ void bli_sgemmsup_rv_zen_asm_4x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6245,7 +6267,8 @@ void bli_sgemmsup_rv_zen_asm_3x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6518,7 +6541,8 @@ void bli_sgemmsup_rv_zen_asm_2x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -6772,7 +6796,8 @@ void bli_sgemmsup_rv_zen_asm_1x4 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -7159,7 +7184,8 @@ void bli_sgemmsup_rv_zen_asm_6x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -7532,7 +7558,8 @@ void bli_sgemmsup_rv_zen_asm_5x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", + "ymm12", "ymm13", "ymm14", 
"ymm15", "memory" ) } @@ -7868,7 +7895,8 @@ void bli_sgemmsup_rv_zen_asm_4x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8167,7 +8195,8 @@ void bli_sgemmsup_rv_zen_asm_3x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8427,7 +8456,8 @@ void bli_sgemmsup_rv_zen_asm_2x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", + "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } @@ -8663,7 +8693,8 @@ void bli_sgemmsup_rv_zen_asm_1x2 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm2", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", + "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) } diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c index 41dbbd699e..31918565b9 100644 --- a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -895,7 +895,9 @@ void bli_sgemmsup_rv_zen_asm_6x16m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1431,7 +1433,8 @@ void bli_sgemmsup_rv_zen_asm_6x8m "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8", "ymm10", + "ymm12", "ymm14", "memory" ) consider_edge_cases: diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c index a7ab770cb2..be8c9b065d 100644 --- a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c +++ b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -855,7 +855,9 @@ void bli_sgemmsup_rv_zen_asm_6x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -1621,7 +1623,8 @@ void bli_sgemmsup_rv_zen_asm_5x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "memory" ) consider_edge_cases: @@ -2230,7 +2233,8 @@ void bli_sgemmsup_rv_zen_asm_4x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "memory" ) consider_edge_cases: @@ -2876,7 +2880,9 @@ void bli_sgemmsup_rv_zen_asm_3x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "memory" ) consider_edge_cases: @@ -3366,7 +3372,8 @@ void bli_sgemmsup_rv_zen_asm_2x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm11", "ymm12", "memory" ) consider_edge_cases: @@ -3818,7 +3825,7 @@ void bli_sgemmsup_rv_zen_asm_1x16n "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "memory" ) consider_edge_cases: From 
8631ca9ee3c8e0dd3f45ececccf32d7c41877c94 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 24 Nov 2023 08:39:59 -0500 Subject: [PATCH 219/226] cblas.h: Correct order of including other header files Include bli_config.h before bli_system.h in ./frame/compat/cblas/src/cblas.h so that BLIS_ENABLE_SYSTEM is defined correctly before it is needed. This copies the change to ./frame/include/blis.h made in 1f527a93b99 (via merge c6f33401253). Also standardize some comments and formatting between blis.h and cblas.h AMD-Internal: [CPUPL-4251] Change-Id: Ie5cab646367f15003c25fa126344b02640d9106e (cherry picked from commit 48444d4316f4191a31155443eb95adc962b422c5) --- frame/compat/cblas/src/cblas.h | 25 ++++++++++++++++++++++++- frame/include/blis.h | 1 + 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h index 1c3b490b44..c815ee18aa 100644 --- a/frame/compat/cblas/src/cblas.h +++ b/frame/compat/cblas/src/cblas.h @@ -35,11 +35,34 @@ // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. + +// NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS +// YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. +// ALSO REMEMBER TO UPDATE ./frame/include/blis.h APPROPRIATELY + +// -- configure definitions -- + +// NOTE: bli_config.h header must be included before any BLIS header. +// It is bootstrapped by ./configure and does not depend on later +// headers. Moreover, these configuration variables are necessary to change +// some default behaviors (e.g. disable OS-detection in bli_system.h in case +// of --disable-system). +#include "bli_config.h" + +// -- System and language-related headers -- + +// NOTE: bli_system.h header must be included before bli_config_macro_defs.h. 
#include "bli_system.h" #include "bli_lang_defs.h" -#include "bli_config.h" + +// -- configure default definitions -- + #include "bli_config_macro_defs.h" + + +// -- Common BLIS definitions -- + #include "bli_type_defs.h" /* diff --git a/frame/include/blis.h b/frame/include/blis.h index 8dbb48e293..28174a4bba 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -47,6 +47,7 @@ extern "C" { // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. +// ALSO REMEMBER TO UPDATE ./frame/compat/cblas/src/cblas.h APPROPRIATELY // -- configure definitions -- From f44c649bb76996c9011175ad65bed9d2bf7627f0 Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Thu, 9 Nov 2023 14:55:31 -0500 Subject: [PATCH 220/226] Code cleanup: AMD copyright notice Standardize format of AMD copyright notice. AMD-Internal: [CPUPL-3519] Change-Id: I98530e58138765e5cd5bc0c97500506801eb0bf0 (cherry picked from commit ed5010d65b7e5094bb652927bbe979ea84a21211) --- LICENSE | 2 +- Makefile | 2 +- addon/CMakeLists.txt | 2 +- addon/aocl_gemm/aocl_gemm.h | 2 +- addon/aocl_gemm/aocl_gemm_bf16_utils.c | 2 +- addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c | 2 +- addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c | 2 +- addon/aocl_gemm/aocl_gemm_f32f32f32of32.c | 2 +- .../aocl_gemm/aocl_gemm_f32f32f32of32_utils.c | 2 +- addon/aocl_gemm/aocl_gemm_interface_apis.h | 2 +- addon/aocl_gemm/aocl_gemm_post_ops.h | 2 +- addon/aocl_gemm/aocl_gemm_u8s8s16os16.c | 2 +- addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c | 2 +- addon/aocl_gemm/aocl_gemm_u8s8s16os8.c | 2 +- addon/aocl_gemm/aocl_gemm_u8s8s32os32.c | 2 +- addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c | 2 +- addon/aocl_gemm/aocl_gemm_u8s8s32os8.c | 2 +- addon/aocl_gemm/config/lpgemm_config.c | 2 +- addon/aocl_gemm/config/lpgemm_config.h | 2 +- .../aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c | 2 +- .../frame/bf16bf16f32/lpgemm_reorder_bf16.c | 2 +- 
.../frame/bf16bf16f32/lpgemm_reorder_bf16.h | 2 +- .../frame/lpgemm_5loop_interface_apis.h | 2 +- addon/aocl_gemm/frame/lpgemm_post_ops.c | 2 +- addon/aocl_gemm/frame/lpgemm_post_ops.h | 2 +- addon/aocl_gemm/frame/lpgemm_types.h | 2 +- .../threading/lpgemm_thread_decor_openmp.c | 2 +- .../threading/lpgemm_thread_decor_openmp.h | 2 +- .../frame/u8s8s16/lpgemm_reorder_s16.c | 2 +- .../frame/u8s8s16/lpgemm_reorder_s16.h | 2 +- .../aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c | 2 +- .../aocl_gemm/frame/u8s8s32/lpgemm_reorder.c | 2 +- .../aocl_gemm/frame/u8s8s32/lpgemm_reorder.h | 2 +- .../aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c | 2 +- .../kernels/bf16bf16f32/lpgemm_pack_bf16.h | 2 +- addon/aocl_gemm/kernels/lpgemm_kernels.h | 2 +- .../kernels/u8s8s16/lpgemm_packb_s16.h | 2 +- .../aocl_gemm/kernels/u8s8s32/lpgemm_packa.h | 2 +- .../aocl_gemm/kernels/u8s8s32/lpgemm_packb.h | 2 +- aocl_dtl/CMakeLists.txt | 2 +- aocl_dtl/aocldtl.c | 2 +- aocl_dtl/aocldtl.h | 2 +- aocl_dtl/aocldtl_blis.c | 2 +- aocl_dtl/aocldtl_blis.h | 2 +- aocl_dtl/aocldtlcf.h | 2 +- aocl_dtl/aoclfal.c | 2 +- aocl_dtl/aoclfal.h | 2 +- aocl_dtl/aoclflist.c | 2 +- aocl_dtl/aoclflist.h | 2 +- aocl_dtl/aoclos.c | 2 +- aocl_dtl/aocltpdef.h | 2 +- aocl_dtl/etrace_decoder.py | 2 +- aocl_dtl/test_dtl.c | 2 +- bench/bench_amaxv.c | 2 +- bench/bench_aocl_gemm/bench_lpgemm.c | 2 +- bench/bench_aocl_gemm/bench_lpgemm_utils.c | 2 +- bench/bench_copyv.c | 2 +- bench/bench_dotv.c | 2 +- bench/bench_gemm.c | 2 +- bench/bench_gemmt.c | 2 +- bench/bench_gemv.c | 2 +- bench/bench_ger.c | 2 +- bench/bench_scalv.c | 2 +- bench/bench_swapv.c | 2 +- bench/bench_syrk.c | 2 +- bench/bench_trsm.c | 4 ++- bench/bench_trsv.c | 2 +- blastest/CMakeLists.txt | 2 +- build/auto_config.py | 2 +- build/bli_config.h.in | 2 +- build/blis_ref_kernel_mirror.py | 2 +- build/cmake/bli_addon.h.in | 2 +- build/cmake/bli_config.h.in | 2 +- build/detect/config/config_detect.c | 2 +- build/detect/config/old/cpuid_x86.c | 2 +- build/irun.py | 2 +- 
build/templates/license.c | 2 +- build/templates/license.h | 2 +- build/templates/license.sh | 2 +- common.mk | 2 +- config/CMakeLists.txt | 2 +- config/amd64_legacy/bli_family_amd64_legacy.h | 2 +- config/amd64_legacy/make_defs.mk | 2 +- config/amdzen/bli_family_amdzen.h | 2 +- config/amdzen/make_defs.cmake | 2 +- config/generic/make_defs.cmake | 2 +- config/haswell/bli_cntx_init_haswell.c | 2 +- config/haswell/bli_family_haswell.h | 2 +- config/old/haswellbb/bli_cntx_init_haswell.c | 2 +- config/old/haswellbb/bli_family_haswell.h | 2 +- config/zen/amd_config.cmake | 2 +- config/zen/bli_cntx_init_zen.c | 2 +- config/zen/make_defs.cmake | 2 +- config/zen/make_defs.mk | 2 +- config/zen/old/bli_kernel.h | 2 +- config/zen2/bli_cntx_init_zen2.c | 2 +- config/zen2/make_defs.cmake | 2 +- config/zen2/make_defs.mk | 2 +- config/zen3/bli_cntx_init_zen3.c | 2 +- config/zen3/make_defs.cmake | 2 +- config/zen3/make_defs.mk | 2 +- config/zen4/bli_cntx_init_zen4.c | 2 +- config/zen4/bli_family_zen4.h | 2 +- config/zen4/make_defs.cmake | 2 +- config/zen4/make_defs.mk | 2 +- docs/styling/footer.html | 4 +-- frame/1m/packm/bli_packm.h | 2 +- frame/1m/packm/bli_packm_cntl.c | 2 +- frame/1m/packm/bli_packm_cntl.h | 2 +- frame/1m/packm/bli_packm_thrinfo.c | 2 +- frame/1m/packm/bli_packm_thrinfo.h | 2 +- frame/1m/packm/bli_packm_var.h | 2 +- frame/1m/unpackm/bli_unpackm_cntl.c | 2 +- frame/1m/unpackm/bli_unpackm_cntl.h | 2 +- frame/2/bli_l2_ker.h | 2 +- frame/2/bli_l2_ker_prot.h | 2 +- frame/2/gemv/bli_gemv_unf_var1.c | 2 +- frame/2/gemv/bli_gemv_unf_var1_amd.c | 2 +- frame/2/gemv/bli_gemv_unf_var2.c | 2 +- frame/2/gemv/bli_gemv_unf_var2_amd.c | 2 +- frame/2/hemv/bli_hemv_unf_var1.c | 2 +- frame/2/hemv/bli_hemv_unf_var1_amd.c | 2 +- frame/2/hemv/bli_hemv_unf_var3_amd.c | 2 +- frame/2/her/bli_her_unb_var1_amd.c | 2 +- frame/2/her/bli_her_unb_var2_amd.c | 2 +- frame/2/her2/bli_her2_unf_var1_amd.c | 2 +- frame/2/her2/bli_her2_unf_var4_amd.c | 2 +- frame/2/trsv/bli_trsv_unf_var1_amd.c | 2 
+- frame/2/trsv/bli_trsv_unf_var2_amd.c | 2 +- frame/3/bli_l3.h | 2 +- frame/3/bli_l3_cntl.c | 2 +- frame/3/bli_l3_cntl.h | 2 +- frame/3/bli_l3_oapi.c | 2 +- frame/3/bli_l3_oapi.h | 2 +- frame/3/bli_l3_oft.h | 2 +- frame/3/bli_l3_packm.c | 2 +- frame/3/bli_l3_packm.h | 2 +- frame/3/bli_l3_smart_threading.c | 2 +- frame/3/bli_l3_sup.c | 2 +- frame/3/bli_l3_sup_ft_ker.h | 2 +- frame/3/bli_l3_sup_int.c | 2 +- frame/3/bli_l3_sup_int.h | 2 +- frame/3/bli_l3_sup_int_amd.c | 2 +- frame/3/bli_l3_sup_ker.h | 2 +- frame/3/bli_l3_sup_ker_prot.h | 2 +- frame/3/bli_l3_sup_oft.h | 2 +- frame/3/bli_l3_sup_packm_a.c | 2 +- frame/3/bli_l3_sup_packm_a.h | 2 +- frame/3/bli_l3_sup_packm_b.c | 2 +- frame/3/bli_l3_sup_packm_b.h | 2 +- frame/3/bli_l3_sup_packm_var.c | 2 +- frame/3/bli_l3_sup_packm_var.h | 2 +- frame/3/bli_l3_sup_ref.c | 2 +- frame/3/bli_l3_sup_ref.h | 2 +- frame/3/bli_l3_sup_var12.c | 2 +- frame/3/bli_l3_sup_var1n2m.c | 2 +- frame/3/bli_l3_sup_vars.h | 2 +- frame/3/bli_l3_tapi.c | 2 +- frame/3/bli_l3_tapi.h | 2 +- frame/3/bli_l3_thrinfo.c | 2 +- frame/3/bli_l3_thrinfo.h | 2 +- frame/3/gemm/bli_gemm_cntl.c | 2 +- frame/3/gemm/bli_gemm_cntl.h | 2 +- frame/3/gemm/bli_gemm_ker_var1.c | 2 +- frame/3/gemm/bli_gemm_packab.c | 2 +- frame/3/gemm/bli_gemm_var.h | 2 +- frame/3/gemm/ind/bli_gemm4mb_ker_var2.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2rr.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2sl.c | 2 +- frame/3/gemmt/bli_gemmt.h | 2 +- frame/3/gemmt/bli_gemmt_front.c | 2 +- frame/3/gemmt/bli_gemmt_front.h | 2 +- frame/3/gemmt/bli_gemmt_ker_var2.c | 2 +- frame/3/gemmt/bli_gemmt_sup_var1n2m.c | 2 +- frame/3/gemmt/bli_gemmt_sup_var1n2m_amd.c | 2 +- frame/3/gemmt/bli_gemmt_var.h | 2 +- frame/3/herk/bli_herk_l_ker_var2.c | 2 +- frame/3/herk/bli_herk_u_ker_var2.c | 2 +- frame/3/herk/bli_herk_var.h | 2 +- frame/3/herk/bli_herk_x_ker_var2.c | 2 +- .../herk/other/bli_herk_l_ker_var2.1looprr.c | 2 +- frame/3/herk/other/bli_herk_l_ker_var2.c 
| 2 +- frame/3/herk/other/bli_herk_l_ker_var2rr.c | 2 +- frame/3/herk/other/bli_herk_l_ker_var2sl.c | 2 +- .../herk/other/bli_herk_u_ker_var2.1looprr.c | 2 +- frame/3/herk/other/bli_herk_u_ker_var2.c | 2 +- frame/3/herk/other/bli_herk_u_ker_var2rr.c | 2 +- frame/3/herk/other/bli_herk_u_ker_var2sl.c | 2 +- frame/3/old/bli_l3_sup_edge.h | 2 +- frame/3/old/bli_l3_sup_var1n2m.c | 2 +- frame/3/syrk/bli_syrk_front.c | 2 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/bli_trmm_front_amd.c | 2 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 2 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 2 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 2 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 2 +- frame/3/trmm/bli_trmm_var.h | 2 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c | 2 +- frame/3/trsm/bli_trsm_cntl.c | 2 +- frame/3/trsm/bli_trsm_cntl.h | 2 +- frame/3/trsm/bli_trsm_front.h | 2 +- frame/3/trsm/bli_trsm_var.h | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c | 2 +- frame/3/trsm/other/bli_trsm_rl_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ru_ker_var2.c | 2 +- frame/CMakeLists.txt | 2 +- frame/base/CMakeLists.txt | 2 +- frame/base/bli_apool.c | 2 +- 
frame/base/bli_apool.h | 2 +- frame/base/bli_arch.c | 2 +- frame/base/bli_array.c | 2 +- frame/base/bli_array.h | 2 +- frame/base/bli_check.c | 2 +- frame/base/bli_check.h | 2 +- frame/base/bli_clock.c | 2 +- frame/base/bli_cntl.c | 2 +- frame/base/bli_cntl.h | 2 +- frame/base/bli_cntx.c | 2 +- frame/base/bli_cpuid.c | 4 +-- frame/base/bli_cpuid.h | 2 +- frame/base/bli_env.c | 2 +- frame/base/bli_env.h | 2 +- frame/base/bli_error.h | 2 +- frame/base/bli_getopt.c | 2 +- frame/base/bli_gks.c | 2 +- frame/base/bli_init.c | 2 +- frame/base/bli_malloc.c | 2 +- frame/base/bli_malloc.h | 2 +- frame/base/bli_mem.h | 2 +- frame/base/bli_memsys.c | 2 +- frame/base/bli_memsys.h | 2 +- frame/base/bli_obj.c | 2 +- frame/base/bli_pack.c | 2 +- frame/base/bli_param_map.c | 2 +- frame/base/bli_param_map.h | 2 +- frame/base/bli_pba.c | 2 +- frame/base/bli_pba.h | 2 +- frame/base/bli_pool.h | 2 +- frame/base/bli_prune.c | 2 +- frame/base/bli_sba.c | 2 +- frame/base/bli_sba.h | 2 +- frame/compat/bla_amax.c | 2 +- frame/compat/bla_amax.h | 2 +- frame/compat/bla_amax_amd.c | 2 +- frame/compat/bla_amin.c | 2 +- frame/compat/bla_amin.h | 2 +- frame/compat/bla_asum.c | 2 +- frame/compat/bla_asum.h | 2 +- frame/compat/bla_axpby.c | 2 +- frame/compat/bla_axpby.h | 2 +- frame/compat/bla_axpy.c | 2 +- frame/compat/bla_axpy.h | 2 +- frame/compat/bla_axpy_amd.c | 2 +- frame/compat/bla_copy.c | 2 +- frame/compat/bla_copy.h | 2 +- frame/compat/bla_copy_amd.c | 2 +- frame/compat/bla_dot.c | 2 +- frame/compat/bla_dot.h | 2 +- frame/compat/bla_dot_amd.c | 2 +- frame/compat/bla_gemm.c | 2 +- frame/compat/bla_gemm.h | 2 +- frame/compat/bla_gemm3m.c | 2 +- frame/compat/bla_gemm3m.h | 2 +- frame/compat/bla_gemm_amd.c | 2 +- frame/compat/bla_gemm_batch.c | 2 +- frame/compat/bla_gemm_batch.h | 2 +- frame/compat/bla_gemmt.c | 2 +- frame/compat/bla_gemmt.h | 2 +- frame/compat/bla_gemv.c | 2 +- frame/compat/bla_gemv.h | 2 +- frame/compat/bla_gemv_amd.c | 2 +- frame/compat/bla_ger.c | 2 +- 
frame/compat/bla_ger.h | 2 +- frame/compat/bla_hemm.c | 2 +- frame/compat/bla_hemm.h | 2 +- frame/compat/bla_hemv.c | 2 +- frame/compat/bla_hemv.h | 2 +- frame/compat/bla_her.c | 2 +- frame/compat/bla_her.h | 2 +- frame/compat/bla_her2.c | 2 +- frame/compat/bla_her2.h | 2 +- frame/compat/bla_her2k.c | 2 +- frame/compat/bla_her2k.h | 2 +- frame/compat/bla_herk.c | 2 +- frame/compat/bla_herk.h | 2 +- frame/compat/bla_imatcopy.c | 2 +- frame/compat/bla_nrm2.c | 2 +- frame/compat/bla_nrm2.h | 2 +- frame/compat/bla_scal.c | 2 +- frame/compat/bla_scal.h | 2 +- frame/compat/bla_scal_amd.c | 2 +- frame/compat/bla_swap.c | 2 +- frame/compat/bla_swap.h | 2 +- frame/compat/bla_swap_amd.c | 2 +- frame/compat/bla_symm.c | 2 +- frame/compat/bla_symm.h | 2 +- frame/compat/bla_symv.c | 2 +- frame/compat/bla_symv.h | 2 +- frame/compat/bla_syr.c | 2 +- frame/compat/bla_syr.h | 2 +- frame/compat/bla_syr2.c | 2 +- frame/compat/bla_syr2.h | 2 +- frame/compat/bla_syr2k.c | 2 +- frame/compat/bla_syr2k.h | 2 +- frame/compat/bla_syrk.c | 2 +- frame/compat/bla_syrk.h | 2 +- frame/compat/bla_trmm.c | 2 +- frame/compat/bla_trmm.h | 2 +- frame/compat/bla_trmv.c | 2 +- frame/compat/bla_trmv.h | 2 +- frame/compat/bla_trsm.c | 2 +- frame/compat/bla_trsm.h | 2 +- frame/compat/bla_trsm_amd.c | 2 +- frame/compat/bla_trsv.c | 2 +- frame/compat/bla_trsv.h | 2 +- frame/compat/bli_blas.h | 2 +- frame/compat/cblas/f77_sub/f77_amin_sub.c | 2 +- frame/compat/cblas/f77_sub/f77_amin_sub.h | 2 +- frame/compat/cblas/src/cblas.h | 2 +- frame/compat/cblas/src/cblas_caxpby.c | 2 +- frame/compat/cblas/src/cblas_cgemmt.c | 2 +- frame/compat/cblas/src/cblas_daxpby.c | 2 +- frame/compat/cblas/src/cblas_dcabs1.c | 2 +- frame/compat/cblas/src/cblas_saxpby.c | 2 +- frame/compat/cblas/src/cblas_scabs1.c | 2 +- frame/compat/cblas/src/cblas_zaxpby.c | 2 +- frame/compat/cblas/src/cblas_zgemmt.c | 2 +- frame/compat/check/bla_gemm3m_check.h | 2 +- frame/compat/check/bla_gemmt_check.h | 2 +- frame/compat/f2c/bla_cabs1.c | 2 +- 
frame/compat/f2c/bla_gbmv.c | 2 +- frame/compat/f2c/bla_gbmv.h | 2 +- frame/compat/f2c/bla_hbmv.c | 2 +- frame/compat/f2c/bla_hbmv.h | 2 +- frame/compat/f2c/bla_hpmv.c | 2 +- frame/compat/f2c/bla_hpmv.h | 2 +- frame/compat/f2c/bla_hpr.c | 2 +- frame/compat/f2c/bla_hpr.h | 2 +- frame/compat/f2c/bla_hpr2.c | 2 +- frame/compat/f2c/bla_hpr2.h | 2 +- frame/compat/f2c/bla_rot.c | 2 +- frame/compat/f2c/bla_rot.h | 2 +- frame/compat/f2c/bla_rotg.c | 2 +- frame/compat/f2c/bla_rotg.h | 2 +- frame/compat/f2c/bla_rotm.c | 2 +- frame/compat/f2c/bla_rotm.h | 2 +- frame/compat/f2c/bla_rotmg.c | 2 +- frame/compat/f2c/bla_rotmg.h | 2 +- frame/compat/f2c/bla_sbmv.c | 2 +- frame/compat/f2c/bla_sbmv.h | 2 +- frame/compat/f2c/bla_spmv.c | 2 +- frame/compat/f2c/bla_spmv.h | 2 +- frame/compat/f2c/bla_spr.c | 2 +- frame/compat/f2c/bla_spr.h | 2 +- frame/compat/f2c/bla_spr2.c | 2 +- frame/compat/f2c/bla_spr2.h | 2 +- frame/compat/f2c/bla_tbmv.c | 2 +- frame/compat/f2c/bla_tbmv.h | 2 +- frame/compat/f2c/bla_tbsv.c | 2 +- frame/compat/f2c/bla_tbsv.h | 2 +- frame/compat/f2c/bla_tpmv.c | 2 +- frame/compat/f2c/bla_tpmv.h | 2 +- frame/compat/f2c/bla_tpsv.c | 2 +- frame/compat/f2c/bla_tpsv.h | 2 +- frame/include/bli_config_macro_defs.h | 2 +- frame/include/bli_genarray_macro_defs.h | 2 +- frame/include/bli_gentprot_macro_defs.h | 2 +- frame/include/bli_lang_defs.h | 2 +- frame/include/bli_macro_defs.h | 2 +- frame/include/bli_obj_macro_defs.h | 2 +- frame/include/bli_param_macro_defs.h | 2 +- frame/include/bli_system.h | 4 +-- frame/include/bli_type_defs.h | 2 +- frame/include/bli_x86_asm_macros.h | 2 +- frame/ind/bli_l3_ind.c | 2 +- frame/ind/bli_l3_ind.h | 2 +- frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 2 +- frame/ind/oapi/bli_l3_ind_oapi.c | 2 +- frame/ind/oapi/bli_l3_ind_oapi.h | 2 +- frame/thread/bli_l3_decor.h | 2 +- frame/thread/bli_l3_decor_openmp.c | 2 +- frame/thread/bli_l3_decor_openmp.h | 2 +- frame/thread/bli_l3_decor_pthreads.c | 2 +- frame/thread/bli_l3_decor_single.c | 2 +- 
frame/thread/bli_l3_sup_decor.h | 2 +- frame/thread/bli_l3_sup_decor_openmp.c | 2 +- frame/thread/bli_l3_sup_decor_pthreads.c | 2 +- frame/thread/bli_l3_sup_decor_single.c | 2 +- frame/thread/bli_pthread.c | 2 +- frame/thread/bli_thrcomm.c | 2 +- frame/thread/bli_thrcomm.h | 2 +- frame/thread/bli_thrcomm_openmp.c | 2 +- frame/thread/bli_thrcomm_openmp.h | 2 +- frame/thread/bli_thrcomm_pthreads.c | 2 +- frame/thread/bli_thrcomm_single.c | 2 +- frame/thread/bli_thread.c | 2 +- frame/thread/bli_thrinfo.c | 2 +- frame/thread/bli_thrinfo.h | 2 +- frame/thread/bli_thrinfo_sup.c | 2 +- frame/thread/bli_thrinfo_sup.h | 2 +- frame/thread/old/bli_mutex.h | 2 +- frame/thread/old/bli_mutex_openmp.h | 2 +- frame/thread/old/bli_mutex_pthreads.h | 2 +- frame/thread/old/bli_mutex_single.h | 2 +- frame/util/bli_util_api_wrap.c | 2 +- frame/util/bli_util_api_wrap.h | 2 +- frame/util/bli_util_progress.c | 2 +- frame/util/bli_util_progress.h | 2 +- frame/util/bli_util_update.h | 2 +- .../testsuite/util/nrm2/nrm2_corner_cases.cpp | 34 +++++++++++++++++++ .../util/nrm2/nrm2_invalid_inputs.cpp | 34 +++++++++++++++++++ .../testsuite/util/nrm2/scnrm2_generic.cpp | 34 +++++++++++++++++++ kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c | 2 +- kernels/haswell/1m/CMakeLists.txt | 2 +- .../haswell/1m/bli_packm_haswell_asm_c3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_c8xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_d6xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_d8xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_s16xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_s6xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_z3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_z4xk.c | 2 +- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 2 +- .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 2 +- .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 2 +- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c | 2 +- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c | 2 +- .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c | 2 +- 
.../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c | 2 +- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 2 +- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c | 2 +- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 2 +- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c | 2 +- .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c | 2 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c | 2 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c | 2 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 2 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c | 2 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 2 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c | 2 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 2 +- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 2 +- .../bli_gemmsup_rd_haswell_asm_d6x8m.c.newji | 2 +- ...bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij | 2 +- .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c | 2 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c | 2 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c | 2 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c | 2 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c | 2 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c | 2 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c | 2 +- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c | 2 +- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c | 2 +- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c | 2 +- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c | 2 +- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c | 2 +- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c | 2 +- kernels/haswell/bli_kernels_haswell.h | 2 +- kernels/skx/3/bli_dgemm_skx_asm_16x14.c | 2 +- kernels/zen/1/bli_amaxv_zen_int.c | 2 +- kernels/zen/1/bli_axpbyv_zen_int.c | 2 +- kernels/zen/1/bli_axpbyv_zen_int10.c | 2 +- kernels/zen/1/bli_axpyv_zen_int.c | 2 +- kernels/zen/1/bli_copyv_zen_int.c | 2 +- kernels/zen/1/bli_dotv_zen_int.c | 2 +- kernels/zen/1/bli_dotv_zen_int10.c | 2 +- kernels/zen/1/bli_dotxv_zen_int.c | 2 +- kernels/zen/1/bli_scalv_zen_int.c | 2 +- kernels/zen/1/bli_setv_zen_int.c | 2 +- 
kernels/zen/1/bli_swapv_zen_int8.c | 2 +- kernels/zen/1f/bli_axpy2v_zen_int.c | 2 +- kernels/zen/1f/bli_axpyf_zen_int_4.c | 2 +- kernels/zen/1f/bli_axpyf_zen_int_8.c | 2 +- kernels/zen/1f/bli_dotxaxpyf_zen_int_8.c | 2 +- kernels/zen/1f/bli_dotxf_zen_int_8.c | 2 +- kernels/zen/2/bli_gemv_zen_ref.c | 2 +- kernels/zen/3/bli_dgemm_avx2_k1.c | 2 +- kernels/zen/3/bli_gemm_small.c | 2 +- kernels/zen/3/bli_trsm_small.c | 2 +- kernels/zen/3/bli_zgemm_avx2_k1.c | 2 +- kernels/zen/3/bli_zgemm_zen_2x6.c | 2 +- kernels/zen/3/bli_zgemmtrsm_l_2x6.c | 2 +- kernels/zen/3/bli_zgemmtrsm_u_2x6.c | 2 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c | 2 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c | 2 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c | 2 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c | 2 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c | 2 +- .../zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8n.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c | 2 +- .../zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4n.c | 2 +- .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c | 2 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c | 2 +- .../u8s8s16/lpgemm_6x32rowmajor_amd256.c | 2 +- .../lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c | 2 +- .../lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c | 2 +- .../lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c | 2 +- .../zen/lpgemm/u8s8s16/lpgemm_packb_amd256.c | 2 +- .../lpgemm/u8s8s16/lpgemm_s16_kern_macros.h | 2 +- kernels/zen2/bli_kernels_zen2.h | 2 +- kernels/zen4/1/bli_amaxv_zen_int_avx512.c | 2 +- kernels/zen4/1/bli_axpyv_zen_int_avx512.c | 2 +- kernels/zen4/1m/bli_packm_zen4_asm_d16xk.c | 2 +- kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c | 2 +- 
kernels/zen4/1m/bli_packm_zen4_asm_d32xk.c | 2 +- kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c | 2 +- kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c | 2 +- kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c | 2 +- kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c | 2 +- kernels/zen4/3/bli_zgemm_zen4_asm_4x12.c | 2 +- kernels/zen4/3/bli_zgemmtrsm_l_4x12.c | 2 +- kernels/zen4/3/bli_zgemmtrsm_u_4x12.c | 2 +- kernels/zen4/bli_kernels_zen4.h | 2 +- .../lpgemm_6x64rowmajor_bf16_amd512vnni.c | 2 +- .../bf16bf16f32/lpgemm_f32_kern_macros.h | 2 +- .../lpgemm_m_fringe_bf16_amd512vnni.c | 2 +- .../lpgemm_mn_fringe_bf16_amd512vnni.c | 2 +- .../lpgemm_n_fringe_bf16_amd512vnni.c | 2 +- .../lpgemm_packa_bf16_amd256vnni.c | 2 +- .../lpgemm_packb_bf16_amd512vnni.c | 2 +- .../u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c | 2 +- .../u8s8s32/lpgemm_m_fringe_amd512vnni.c | 2 +- .../u8s8s32/lpgemm_mn_fringe_amd512vnni.c | 2 +- .../u8s8s32/lpgemm_n_fringe_amd512vnni.c | 2 +- .../lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c | 2 +- .../lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c | 2 +- .../lpgemm/u8s8s32/lpgemm_s32_kern_macros.h | 2 +- ref_kernels/3/bli_gemmsup_ref.c | 2 +- sandbox/ref99/bli_gemmnat.c | 2 +- sandbox/ref99/blx_gemm_ref_var2.c | 2 +- sandbox/ref99/blx_gemm_ref_var2.h | 2 +- sandbox/ref99/old/blx_gemm_front.c | 2 +- sandbox/ref99/old/blx_gemm_int.c | 2 +- sandbox/ref99/old/cntl/blx_gemm_cntl.c | 2 +- sandbox/ref99/old/vars/blx_gemm_blk_var1.c | 2 +- sandbox/ref99/old/vars/blx_gemm_blk_var2.c | 2 +- sandbox/ref99/old/vars/blx_gemm_ker_var2.c | 2 +- sandbox/ref99/old/vars/blx_gemm_var.h | 2 +- .../old/vars/other/blx_gemm_ker_var2rr.c | 2 +- .../old/vars/other/blx_gemm_ker_var2sl.c | 2 +- test/1m4m/Makefile | 2 +- test/3/Makefile | 2 +- test/3/test_herk.c | 2 +- test/3/test_trmm.c | 2 +- test/3/test_trsm.c | 2 +- test/other/test_copyv.c | 2 +- test/other/test_swapv.c | 2 +- test/other/test_trsm.c | 2 +- test/sup/Makefile | 2 +- test/sup/old/supmt/Makefile | 2 +- test/sup/old/supmt/test_gemm.c | 2 +- 
test/sup/old/supst/Makefile | 2 +- test/sup/old/supst/test_gemm.c | 2 +- test/sup/test_gemm.c | 2 +- test/test_axpbyv.c | 2 +- test/test_copyv.c | 2 +- test/test_dotv.c | 2 +- test/test_gemm.c | 2 +- test/test_swapv.c | 2 +- test/test_trsm.c | 2 +- test/thread_ranges/test_ranges.c | 2 +- testsuite/CMakeLists.txt | 2 +- testsuite/src/test_addm.c | 2 +- testsuite/src/test_addm.h | 2 +- testsuite/src/test_addv.c | 2 +- testsuite/src/test_addv.h | 2 +- testsuite/src/test_amaxv.c | 2 +- testsuite/src/test_amaxv.h | 2 +- testsuite/src/test_axpbyv.c | 2 +- testsuite/src/test_axpbyv.h | 2 +- testsuite/src/test_axpy2v.c | 2 +- testsuite/src/test_axpy2v.h | 2 +- testsuite/src/test_axpyf.c | 2 +- testsuite/src/test_axpyf.h | 2 +- testsuite/src/test_axpym.c | 2 +- testsuite/src/test_axpym.h | 2 +- testsuite/src/test_axpyv.c | 2 +- testsuite/src/test_axpyv.h | 2 +- testsuite/src/test_copym.c | 2 +- testsuite/src/test_copym.h | 2 +- testsuite/src/test_copyv.c | 2 +- testsuite/src/test_copyv.h | 2 +- testsuite/src/test_dotaxpyv.c | 2 +- testsuite/src/test_dotaxpyv.h | 2 +- testsuite/src/test_dotv.c | 2 +- testsuite/src/test_dotv.h | 2 +- testsuite/src/test_dotxaxpyf.c | 2 +- testsuite/src/test_dotxaxpyf.h | 2 +- testsuite/src/test_dotxf.c | 2 +- testsuite/src/test_dotxf.h | 2 +- testsuite/src/test_dotxv.c | 2 +- testsuite/src/test_dotxv.h | 2 +- testsuite/src/test_gemm.c | 2 +- testsuite/src/test_gemm.h | 2 +- testsuite/src/test_gemm_ukr.c | 2 +- testsuite/src/test_gemm_ukr.h | 2 +- testsuite/src/test_gemmt.c | 2 +- testsuite/src/test_gemmt.h | 2 +- testsuite/src/test_gemmtrsm_ukr.h | 2 +- testsuite/src/test_gemv.c | 2 +- testsuite/src/test_gemv.h | 2 +- testsuite/src/test_ger.c | 2 +- testsuite/src/test_ger.h | 2 +- testsuite/src/test_hemm.c | 2 +- testsuite/src/test_hemm.h | 2 +- testsuite/src/test_hemv.c | 2 +- testsuite/src/test_hemv.h | 2 +- testsuite/src/test_her.c | 2 +- testsuite/src/test_her.h | 2 +- testsuite/src/test_her2.c | 2 +- testsuite/src/test_her2.h | 2 +- 
testsuite/src/test_her2k.c | 2 +- testsuite/src/test_her2k.h | 2 +- testsuite/src/test_herk.c | 2 +- testsuite/src/test_herk.h | 2 +- testsuite/src/test_libblis.c | 2 +- testsuite/src/test_libblis.h | 2 +- testsuite/src/test_normfm.c | 2 +- testsuite/src/test_normfm.h | 2 +- testsuite/src/test_normfv.c | 2 +- testsuite/src/test_normfv.h | 2 +- testsuite/src/test_randm.c | 2 +- testsuite/src/test_randm.h | 2 +- testsuite/src/test_randv.c | 2 +- testsuite/src/test_randv.h | 2 +- testsuite/src/test_scal2m.c | 2 +- testsuite/src/test_scal2m.h | 2 +- testsuite/src/test_scal2v.c | 2 +- testsuite/src/test_scal2v.h | 2 +- testsuite/src/test_scalm.c | 2 +- testsuite/src/test_scalm.h | 2 +- testsuite/src/test_scalv.c | 2 +- testsuite/src/test_scalv.h | 2 +- testsuite/src/test_setm.c | 2 +- testsuite/src/test_setm.h | 2 +- testsuite/src/test_setv.c | 2 +- testsuite/src/test_setv.h | 2 +- testsuite/src/test_subm.c | 2 +- testsuite/src/test_subm.h | 2 +- testsuite/src/test_subv.c | 2 +- testsuite/src/test_subv.h | 2 +- testsuite/src/test_symm.c | 2 +- testsuite/src/test_symm.h | 2 +- testsuite/src/test_symv.c | 2 +- testsuite/src/test_symv.h | 2 +- testsuite/src/test_syr.c | 2 +- testsuite/src/test_syr.h | 2 +- testsuite/src/test_syr2.c | 2 +- testsuite/src/test_syr2.h | 2 +- testsuite/src/test_syr2k.c | 2 +- testsuite/src/test_syr2k.h | 2 +- testsuite/src/test_syrk.c | 2 +- testsuite/src/test_syrk.h | 2 +- testsuite/src/test_trmm.c | 2 +- testsuite/src/test_trmm.h | 2 +- testsuite/src/test_trmm3.c | 2 +- testsuite/src/test_trmm3.h | 2 +- testsuite/src/test_trmv.c | 2 +- testsuite/src/test_trmv.h | 2 +- testsuite/src/test_trsm.c | 2 +- testsuite/src/test_trsm.h | 2 +- testsuite/src/test_trsm_ukr.c | 2 +- testsuite/src/test_trsm_ukr.h | 2 +- testsuite/src/test_trsv.c | 2 +- testsuite/src/test_trsv.h | 2 +- testsuite/src/test_xpbyv.c | 2 +- testsuite/src/test_xpbyv.h | 2 +- vendor/testcpp/CMakeLists.txt | 2 +- vendor/testcpp/Makefile | 2 +- vendor/testcpp/test_asum.cc | 2 +- 
vendor/testcpp/test_axpy.cc | 2 +- vendor/testcpp/test_copy.cc | 2 +- vendor/testcpp/test_dot.cc | 2 +- vendor/testcpp/test_dotc.cc | 2 +- vendor/testcpp/test_gbmv.cc | 2 +- vendor/testcpp/test_gemm.cc | 2 +- vendor/testcpp/test_gemv.cc | 2 +- vendor/testcpp/test_ger.cc | 2 +- vendor/testcpp/test_gerc.cc | 2 +- vendor/testcpp/test_geru.cc | 2 +- vendor/testcpp/test_hemm.cc | 2 +- vendor/testcpp/test_hemv.cc | 2 +- vendor/testcpp/test_her.cc | 2 +- vendor/testcpp/test_her2.cc | 2 +- vendor/testcpp/test_herk.cc | 2 +- vendor/testcpp/test_hpr.cc | 2 +- vendor/testcpp/test_hpr2.cc | 2 +- vendor/testcpp/test_nrm2.cc | 2 +- vendor/testcpp/test_rot.cc | 2 +- vendor/testcpp/test_rotg.cc | 2 +- vendor/testcpp/test_rotm.cc | 2 +- vendor/testcpp/test_rotmg.cc | 2 +- vendor/testcpp/test_scal.cc | 2 +- vendor/testcpp/test_sdsdot.cc | 2 +- vendor/testcpp/test_spr.cc | 2 +- vendor/testcpp/test_spr2.cc | 2 +- vendor/testcpp/test_swap.cc | 2 +- vendor/testcpp/test_symm.cc | 2 +- vendor/testcpp/test_syr.cc | 2 +- vendor/testcpp/test_syr2.cc | 2 +- vendor/testcpp/test_syr2k.cc | 2 +- vendor/testcpp/test_syrk.cc | 2 +- vendor/testcpp/test_tbmv.cc | 2 +- vendor/testcpp/test_tbsv.cc | 2 +- vendor/testcpp/test_tpmv.cc | 2 +- vendor/testcpp/test_tpsv.cc | 2 +- vendor/testcpp/test_trmm.cc | 2 +- vendor/testcpp/test_trsm.cc | 2 +- vendor/testcpp/test_trsv.cc | 2 +- windows/tests/blis_make.py | 2 +- windows/tests/inputs.yaml | 2 +- 736 files changed, 840 insertions(+), 736 deletions(-) diff --git a/LICENSE b/LICENSE index be24a09734..f05ca1125c 100644 --- a/LICENSE +++ b/LICENSE @@ -15,7 +15,7 @@ copyright info. All parties provide their portions of the code under the Copyright (C) 2018, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP -Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. +Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/Makefile b/Makefile index a2d7b7846d..4c4c01ffd0 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/addon/CMakeLists.txt b/addon/CMakeLists.txt index 667a0daf5a..073a3fb75b 100644 --- a/addon/CMakeLists.txt +++ b/addon/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## # Writing a function that will be used to generate the required object # libraries for the required addons. diff --git a/addon/aocl_gemm/aocl_gemm.h b/addon/aocl_gemm/aocl_gemm.h index 4a5e574b6d..027f895591 100644 --- a/addon/aocl_gemm/aocl_gemm.h +++ b/addon/aocl_gemm/aocl_gemm.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_bf16_utils.c b/addon/aocl_gemm/aocl_gemm_bf16_utils.c index 020065a364..de709e8f90 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16_utils.c +++ b/addon/aocl_gemm/aocl_gemm_bf16_utils.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c index b6462b1645..897facfbda 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c index 0cb20f0060..0ca2602898 100644 --- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c index 7de6b16369..107b651b71 100644 --- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c +++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c b/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c index 2116e418af..3b801ce0db 100644 --- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c +++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_interface_apis.h b/addon/aocl_gemm/aocl_gemm_interface_apis.h index 142f15fae9..7009cf1e2e 100644 --- a/addon/aocl_gemm/aocl_gemm_interface_apis.h +++ b/addon/aocl_gemm/aocl_gemm_interface_apis.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_post_ops.h b/addon/aocl_gemm/aocl_gemm_post_ops.h index 70084e741a..dbf869fae1 100644 --- a/addon/aocl_gemm/aocl_gemm_post_ops.h +++ b/addon/aocl_gemm/aocl_gemm_post_ops.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c index 1c21ff8103..c0614c643b 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c index 2d576d8cd9..fd0c64203f 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c index d159fe5b6d..e8d7b9d146 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c index 194a608e16..d89e6861c3 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c index 86fdf74ef9..b62c294cc6 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c index d7de73363b..6dab94b1fc 100644 --- a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c +++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/config/lpgemm_config.c b/addon/aocl_gemm/config/lpgemm_config.c index 93eb7e9b3e..ca1020e324 100644 --- a/addon/aocl_gemm/config/lpgemm_config.c +++ b/addon/aocl_gemm/config/lpgemm_config.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/config/lpgemm_config.h b/addon/aocl_gemm/config/lpgemm_config.h index 91863e416a..87020d0c3d 100644 --- a/addon/aocl_gemm/config/lpgemm_config.h +++ b/addon/aocl_gemm/config/lpgemm_config.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c index f781e70daf..5a0201443b 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c index 40dfa051bd..99c17b909f 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h index 42c8cb9ef6..d9fddedb6e 100644 --- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h +++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h index 78ccc358a3..a0920edaf3 100644 --- a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h +++ b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. 
All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.c b/addon/aocl_gemm/frame/lpgemm_post_ops.c index 855a880025..92f5849c20 100644 --- a/addon/aocl_gemm/frame/lpgemm_post_ops.c +++ b/addon/aocl_gemm/frame/lpgemm_post_ops.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.h b/addon/aocl_gemm/frame/lpgemm_post_ops.h index 8b17ee4660..ed1d3ed86b 100644 --- a/addon/aocl_gemm/frame/lpgemm_post_ops.h +++ b/addon/aocl_gemm/frame/lpgemm_post_ops.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/lpgemm_types.h b/addon/aocl_gemm/frame/lpgemm_types.h index 02c1813369..28f210a067 100644 --- a/addon/aocl_gemm/frame/lpgemm_types.h +++ b/addon/aocl_gemm/frame/lpgemm_types.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c index 2a362ce154..0debf31d38 100644 --- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c +++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h index a7460bb061..4fd0a12bff 100644 --- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h +++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c index 2786117131..c0c1a29e7b 100644 --- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c +++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h index 65647d9903..7a87bd6d56 100644 --- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h +++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c index c55e4a39af..5e4740a952 100644 --- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c +++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c index 224e0791ff..14dff21af4 100644 --- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c +++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. 
All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h index 232b02238d..58a5255637 100644 --- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h +++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c index b69f5395f0..29239803d6 100644 --- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c +++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h index 92f53f36ab..1ceb833180 100644 --- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h +++ b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_pack_bf16.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/kernels/lpgemm_kernels.h b/addon/aocl_gemm/kernels/lpgemm_kernels.h index add69df94f..83132e8fbf 100644 --- a/addon/aocl_gemm/kernels/lpgemm_kernels.h +++ b/addon/aocl_gemm/kernels/lpgemm_kernels.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h b/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h index a8f64c3fe0..1b3997ca3e 100644 --- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h +++ b/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h index 9b1c55046e..d0d507cbfb 100644 --- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h +++ b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h index 1d69148e3c..2849cc8c33 100644 --- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h +++ b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/aocl_dtl/CMakeLists.txt b/aocl_dtl/CMakeLists.txt index 3757822f2d..5b69f0e116 100644 --- a/aocl_dtl/CMakeLists.txt +++ b/aocl_dtl/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. ## +##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. ## # Collect all subdirectory paths that have at least one file with suffix in AOCLDTL_SRC_SUFS list. get_filepaths_with_suffixes(LOCAL_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR} "${AOCLDTL_SRC_SUFS}") diff --git a/aocl_dtl/aocldtl.c b/aocl_dtl/aocldtl.c index 6faa1e4b51..3624f8c004 100644 --- a/aocl_dtl/aocldtl.c +++ b/aocl_dtl/aocldtl.c @@ -5,7 +5,7 @@ * These functions are invoked though macros by * end user. * - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
* *=======================================================================*/ #include "blis.h" diff --git a/aocl_dtl/aocldtl.h b/aocl_dtl/aocldtl.h index 7f9934ed24..7800bb432d 100644 --- a/aocl_dtl/aocldtl.h +++ b/aocl_dtl/aocldtl.h @@ -5,7 +5,7 @@ * It provides defination for all macros to be * used by user to add debug/trace information. * - * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ diff --git a/aocl_dtl/aocldtl_blis.c b/aocl_dtl/aocldtl_blis.c index b9d74242a8..90be337f26 100755 --- a/aocl_dtl/aocldtl_blis.c +++ b/aocl_dtl/aocldtl_blis.c @@ -3,7 +3,7 @@ * * Description : BLIS library specific debug helpes. * - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ diff --git a/aocl_dtl/aocldtl_blis.h b/aocl_dtl/aocldtl_blis.h index e01d80efd3..275ad0a484 100755 --- a/aocl_dtl/aocldtl_blis.h +++ b/aocl_dtl/aocldtl_blis.h @@ -3,7 +3,7 @@ * * Description : BLIS library specific debug helpes. * - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ diff --git a/aocl_dtl/aocldtlcf.h b/aocl_dtl/aocldtlcf.h index 408f38c516..4aa1293fcf 100644 --- a/aocl_dtl/aocldtlcf.h +++ b/aocl_dtl/aocldtlcf.h @@ -5,7 +5,7 @@ * libaray, all debug features (except auto trace) * can be enabled/disabled in this file. * - * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
* *==================================================================*/ diff --git a/aocl_dtl/aoclfal.c b/aocl_dtl/aoclfal.c index 1eadf99b49..e96a42cf7c 100644 --- a/aocl_dtl/aoclfal.c +++ b/aocl_dtl/aoclfal.c @@ -3,7 +3,7 @@ * * Description : Platform/os independed file handling API's * - * Copyright (C) 2020, Advanced Micro Devices, Inc + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ diff --git a/aocl_dtl/aoclfal.h b/aocl_dtl/aoclfal.h index 401ed4c355..c37b699be9 100644 --- a/aocl_dtl/aoclfal.h +++ b/aocl_dtl/aoclfal.h @@ -4,7 +4,7 @@ * Description : Interfaces for platform/os independed file * handling API's * - * Copyright (C) 2020, Advanced Micro Devices, Inc + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ diff --git a/aocl_dtl/aoclflist.c b/aocl_dtl/aoclflist.c index 15b58c9e80..5265cd97c5 100644 --- a/aocl_dtl/aoclflist.c +++ b/aocl_dtl/aoclflist.c @@ -5,7 +5,7 @@ * each thread. This is used to log the data * to correct file as per the current thread id. * - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ diff --git a/aocl_dtl/aoclflist.h b/aocl_dtl/aoclflist.h index a4e45ca328..caf11057f2 100644 --- a/aocl_dtl/aoclflist.h +++ b/aocl_dtl/aoclflist.h @@ -5,7 +5,7 @@ * each thread. This is used to log the deta * to correct file as per the current thread id. * - * Copyright (C) 2020, Advanced Micro Devices, Inc + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
* *==================================================================*/ diff --git a/aocl_dtl/aoclos.c b/aocl_dtl/aoclos.c index 2e74091f55..92d278cb2a 100644 --- a/aocl_dtl/aoclos.c +++ b/aocl_dtl/aoclos.c @@ -3,7 +3,7 @@ * * Description : Abstraction for os services used by DTL. * - * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ #include "blis.h" diff --git a/aocl_dtl/aocltpdef.h b/aocl_dtl/aocltpdef.h index 0036a6aea2..8551dbe2cd 100644 --- a/aocl_dtl/aocltpdef.h +++ b/aocl_dtl/aocltpdef.h @@ -4,7 +4,7 @@ * * Description : Abstraction for various datatypes used by DTL. * - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. * *==================================================================*/ #ifndef AOCL_TYPEDEF_H_ diff --git a/aocl_dtl/etrace_decoder.py b/aocl_dtl/etrace_decoder.py index 1a24f00cc3..5465076ad8 100755 --- a/aocl_dtl/etrace_decoder.py +++ b/aocl_dtl/etrace_decoder.py @@ -7,7 +7,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/aocl_dtl/test_dtl.c b/aocl_dtl/test_dtl.c index 08ff3296c3..05ab292d8e 100644 --- a/aocl_dtl/test_dtl.c +++ b/aocl_dtl/test_dtl.c @@ -3,7 +3,7 @@ * * Description : Unit test cases for dtl. * - * Copyright (C) 2020, Advanced Micro Devices, Inc + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
* *==================================================================*/ diff --git a/bench/bench_amaxv.c b/bench/bench_amaxv.c index eb37319b6f..c4df0cd4d7 100644 --- a/bench/bench_amaxv.c +++ b/bench/bench_amaxv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c index 09d2de818b..bb70a087b2 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm.c +++ b/bench/bench_aocl_gemm/bench_lpgemm.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_aocl_gemm/bench_lpgemm_utils.c b/bench/bench_aocl_gemm/bench_lpgemm_utils.c index 2f800ad63f..8ce8104df5 100644 --- a/bench/bench_aocl_gemm/bench_lpgemm_utils.c +++ b/bench/bench_aocl_gemm/bench_lpgemm_utils.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_copyv.c b/bench/bench_copyv.c index 7be38907ed..1e7f20e647 100644 --- a/bench/bench_copyv.c +++ b/bench/bench_copyv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_dotv.c b/bench/bench_dotv.c index 0d39594f72..9ca0cd386d 100644 --- a/bench/bench_dotv.c +++ b/bench/bench_dotv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_gemm.c b/bench/bench_gemm.c index d9dc523e92..454b8b0bc0 100755 --- a/bench/bench_gemm.c +++ b/bench/bench_gemm.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_gemmt.c b/bench/bench_gemmt.c index ad24593747..cd2e5bf9b8 100644 --- a/bench/bench_gemmt.c +++ b/bench/bench_gemmt.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. modification, are permitted provided that the following conditions are met: diff --git a/bench/bench_gemv.c b/bench/bench_gemv.c index 9f06bf8efb..dd77a0539c 100755 --- a/bench/bench_gemv.c +++ b/bench/bench_gemv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_ger.c b/bench/bench_ger.c index 2c8981a682..b4ee38a799 100644 --- a/bench/bench_ger.c +++ b/bench/bench_ger.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_scalv.c b/bench/bench_scalv.c index b8cd6241c1..80b3762ea2 100644 --- a/bench/bench_scalv.c +++ b/bench/bench_scalv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_swapv.c b/bench/bench_swapv.c index 6f2c8fd90e..3040d7b582 100644 --- a/bench/bench_swapv.c +++ b/bench/bench_swapv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/bench/bench_syrk.c b/bench/bench_syrk.c index b65db83aa5..5bcc20e060 100644 --- a/bench/bench_syrk.c +++ b/bench/bench_syrk.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. modification, are permitted provided that the following conditions are met: diff --git a/bench/bench_trsm.c b/bench/bench_trsm.c index 7014bd4753..87dd677a4d 100644 --- a/bench/bench_trsm.c +++ b/bench/bench_trsm.c @@ -3,8 +3,10 @@ BLIS An object-based framework for developing high-performance BLAS-like libraries. + Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/bench/bench_trsv.c b/bench/bench_trsv.c index 425f61f1d0..4714f813d4 100644 --- a/bench/bench_trsv.c +++ b/bench/bench_trsv.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/blastest/CMakeLists.txt b/blastest/CMakeLists.txt index e0960152d2..c8a653c2fa 100644 --- a/blastest/CMakeLists.txt +++ b/blastest/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.## +##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.## # Comments: # - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. diff --git a/build/auto_config.py b/build/auto_config.py index 1ce3989e4e..8b39944899 100644 --- a/build/auto_config.py +++ b/build/auto_config.py @@ -1,4 +1,4 @@ -"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved""" +"""Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.""" import subprocess import sys diff --git a/build/bli_config.h.in b/build/bli_config.h.in index ba0c16100b..1e10616246 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/build/blis_ref_kernel_mirror.py b/build/blis_ref_kernel_mirror.py index f49d101ae7..2f28a4c088 100644 --- a/build/blis_ref_kernel_mirror.py +++ b/build/blis_ref_kernel_mirror.py @@ -1,4 +1,4 @@ -"""Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All Rights Reserved""" +"""Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. 
All rights reserved.""" ################################################################################ # This file is used to mirroring the refkernels folder data into to zen, zen2, # diff --git a/build/cmake/bli_addon.h.in b/build/cmake/bli_addon.h.in index 8dc2e6727c..b002b43619 100644 --- a/build/cmake/bli_addon.h.in +++ b/build/cmake/bli_addon.h.in @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. */ #ifndef BLIS_ADDON_H diff --git a/build/cmake/bli_config.h.in b/build/cmake/bli_config.h.in index 9cfbcdcc5f..aed543b868 100644 --- a/build/cmake/bli_config.h.in +++ b/build/cmake/bli_config.h.in @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. */ #ifndef BLIS_CONFIG_H diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 5e29defe15..03dc9ce877 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/build/detect/config/old/cpuid_x86.c b/build/detect/config/old/cpuid_x86.c index f4985e3914..3167b727a2 100644 --- a/build/detect/config/old/cpuid_x86.c +++ b/build/detect/config/old/cpuid_x86.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2015, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/build/irun.py b/build/irun.py index 429981603c..767011f272 100755 --- a/build/irun.py +++ b/build/irun.py @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2018, The University of Texas at Austin -# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. +# Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/build/templates/license.c b/build/templates/license.c index 6505a70ffd..b076cb49e0 100644 --- a/build/templates/license.c +++ b/build/templates/license.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2019, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/build/templates/license.h b/build/templates/license.h index 6505a70ffd..b076cb49e0 100644 --- a/build/templates/license.h +++ b/build/templates/license.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2019, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/build/templates/license.sh b/build/templates/license.sh index b9c51e2892..087da58353 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2019, The University of Texas at Austin -# Copyright (C) 2018, Advanced Micro Devices, Inc. 
+# Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/common.mk b/common.mk index 87b4885980..7f200545ed 100644 --- a/common.mk +++ b/common.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt index cae2ed48ae..b23fb85a4e 100644 --- a/config/CMakeLists.txt +++ b/config/CMakeLists.txt @@ -1,4 +1,4 @@ -##Copyright (C) 2022-2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc ## # Writing a function that will be used to generate the required object # libraries for the required configs. diff --git a/config/amd64_legacy/bli_family_amd64_legacy.h b/config/amd64_legacy/bli_family_amd64_legacy.h index 5629b9a2d3..c13a506346 100644 --- a/config/amd64_legacy/bli_family_amd64_legacy.h +++ b/config/amd64_legacy/bli_family_amd64_legacy.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2021, Advanced Micro Devices, Inc + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/amd64_legacy/make_defs.mk b/config/amd64_legacy/make_defs.mk index 5f0d613cbb..a8344f7072 100644 --- a/config/amd64_legacy/make_defs.mk +++ b/config/amd64_legacy/make_defs.mk @@ -5,7 +5,7 @@ # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2021, Advanced Micro Devices, Inc +# Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/config/amdzen/bli_family_amdzen.h b/config/amdzen/bli_family_amdzen.h index aeacf75647..e22cd18ccf 100644 --- a/config/amdzen/bli_family_amdzen.h +++ b/config/amdzen/bli_family_amdzen.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/amdzen/make_defs.cmake b/config/amdzen/make_defs.cmake index 231c3eecfb..ac7d1b506e 100644 --- a/config/amdzen/make_defs.cmake +++ b/config/amdzen/make_defs.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## # For architecture independent files we still need to define # the required flags. diff --git a/config/generic/make_defs.cmake b/config/generic/make_defs.cmake index 40c9d7934a..d99d08e691 100644 --- a/config/generic/make_defs.cmake +++ b/config/generic/make_defs.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## if(NOT WIN32) if(NOT (DEBUG_TYPE STREQUAL "off")) diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index b4d8ba8b50..19608fa74e 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/haswell/bli_family_haswell.h b/config/haswell/bli_family_haswell.h index 58154692a7..5be492e562 100644 --- a/config/haswell/bli_family_haswell.h +++ b/config/haswell/bli_family_haswell.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c index 9e1d03503a..2de20b96e2 100644 --- a/config/old/haswellbb/bli_cntx_init_haswell.c +++ b/config/old/haswellbb/bli_cntx_init_haswell.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/old/haswellbb/bli_family_haswell.h b/config/old/haswellbb/bli_family_haswell.h index 06dfdfcfcc..ed9c344931 100644 --- a/config/old/haswellbb/bli_family_haswell.h +++ b/config/old/haswellbb/bli_family_haswell.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/zen/amd_config.cmake b/config/zen/amd_config.cmake index 61d56a3392..df3284d8fb 100644 --- a/config/zen/amd_config.cmake +++ b/config/zen/amd_config.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## if(NOT WIN32) if(NOT (DEBUG_TYPE STREQUAL "off")) diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 7f44b499fc..d88ea7577e 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/zen/make_defs.cmake b/config/zen/make_defs.cmake index 33755d5791..682434bf52 100644 --- a/config/zen/make_defs.cmake +++ b/config/zen/make_defs.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## # Include file containing common flags for all AMD architectures include(${CMAKE_SOURCE_DIR}/config/zen/amd_config.cmake) diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index 59fc7b0a67..4e8896bfb2 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/config/zen/old/bli_kernel.h b/config/zen/old/bli_kernel.h index cd324fd9a7..ab2656f5a8 100644 --- a/config/zen/old/bli_kernel.h +++ b/config/zen/old/bli_kernel.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 0538c7defe..c7d8137329 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/zen2/make_defs.cmake b/config/zen2/make_defs.cmake index 781c82b6a8..2296a3d2c2 100644 --- a/config/zen2/make_defs.cmake +++ b/config/zen2/make_defs.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## # Include file containing common flags for all AMD architectures include(${CMAKE_SOURCE_DIR}/config/zen/amd_config.cmake) diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk index 180c201b06..b54ebda881 100644 --- a/config/zen2/make_defs.mk +++ b/config/zen2/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2019-2023, Advanced Micro Devices, Inc. 
All rights reserved. +# Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index cc508c5cca..b5b99eb609 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/zen3/make_defs.cmake b/config/zen3/make_defs.cmake index 706c5bb4b7..077deb68c3 100644 --- a/config/zen3/make_defs.cmake +++ b/config/zen3/make_defs.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## # FLAGS that are specific to the 'zen3' architecture are added here. # FLAGS that are common for all the AMD architectures are present in diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk index 7ec1ee32e9..727be9d603 100644 --- a/config/zen3/make_defs.mk +++ b/config/zen3/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index cc836d6292..8a79ff8a1f 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index 25b0ddd509..bacf8b62a4 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/config/zen4/make_defs.cmake b/config/zen4/make_defs.cmake index 422e5548a9..e5ce4401b7 100644 --- a/config/zen4/make_defs.cmake +++ b/config/zen4/make_defs.cmake @@ -1,4 +1,4 @@ -##Copyright (C) 2023, Advanced Micro Devices, Inc ## +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ## # FLAGS that are specific to the 'zen4' architecture are added here. 
# FLAGS that are common for all the AMD architectures are present in diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk index 5a058e2fbc..bca80fcc9f 100644 --- a/config/zen4/make_defs.mk +++ b/config/zen4/make_defs.mk @@ -4,7 +4,7 @@ # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/docs/styling/footer.html b/docs/styling/footer.html index d68520e1e9..160e30530e 100644 --- a/docs/styling/footer.html +++ b/docs/styling/footer.html @@ -1,5 +1,5 @@

$^81+H@e9wZm*E=m8y8-dlSbu5 zhSNC-?Q&f~xV@|Eqkw*+1|aos`oT>gwYa{!%&r1X=(*$a0S!a|vSpO=wv8-i6M6?= ze1H-u4g|;-D&YNBN)H0^03D_#nNfoSUmMR@7SI4jM4%dE)y-TJ&`E5{`mayxF_W?v zYE-T4F(;R~wJnDJ$&8Ev`i}{nT{DH~O!&z?;^Mh0Z{7K?g%zpf-FGq5vOvuY8)RE%8G38|}Jv(&yf9xRcKiwQcK zeOB}!ywUxdzS~ynqyh96G+0~8)3e2C#&`yMfy8HWRw;_-#FrjAb!vImXAcmxDd#j9 zWbqRl4)|9sof~0@klxz9x2x2oK@B{1Opb+VX4hW=vU3W+`2y!X8jJ8nOp()=OQZxE zL{FTqnrhY?{<8QFJ@#d!4siL-b=g7Y@gp@flJhg~IZKcR0E(hoZOlGujvp_{848Wt z5dn~jJeX~|#UbY6;YZ z$k##qFiw)#_&ROD>vmGBFhZd!gMkS>S+{%EQ|TUk z?fr3-7|#~sjxUD2F{lV*g{{OGix!b*{u4I5JMy)D+ZV!YC>4+WfoYox+D~QKrk(ZJ zqz!At1LZ2<6}vTIgJnC^6>1QB@!0t;eyrLcf=rM=+$7vt#VLStd z$LPOZ>*wm+lDU0m8>ZWtw52XQeJ@Ov2?z_YxNq_(|e4~xD-*H z)++DVF7Pzqxc6lV&XwR5d$MdN#H7PqfAjj|@+1x_$1me+;Y(J&aq_8 zuzgp^+M?oO#A2S}WWKpDUS=*+B671wddAN6Y_%sPr7lle^CCkVbmGGIud%95@xyE$ z{kgD+2IzjT()sbbxHy;HUJ9pPGrP7U&qvj@G)=u?2G9vea3SoG@?NP+6&LSN+^j(? z6gEDFt*aAAs8%tdFBd_F%)z-77-)sx2hFQyJ-ggdagm`Yf&_Eo3oBaPH;wLIx`PS@ z7CP3$>LTy&+xyan7?%Z~4m-8%+6IYxc-REo*8ezLMloJX~viZoSr zc91aQWMq_;HdErjW^oFWR7pxrodM1DSwt;qO?6})A)|TXc zYH}VEQL9nfKmPlo)7FWOpsrA=fXG(W>F}8djP*gl(ZO=>?c@LM9?+F&z7WIhM3bPk z<(GyK>4jJyCqSp(4y5${PYb(nJbx!<4|go%E@uz}=$A?&T`6$q;$}xxb5b zEa=#%2{1Y*+!^YeGyUZei(((SJt^M0Wf=l<&e96%C(+Ijl#0LtK-L}JH;fo&G={Q) z`a6ALH?ez$TeJ&_nAjXdnTxM_Y_k4uspN1R#Yx*^2Tb9{AuyCp=Iwi@s&3&$qth=Mm`s;DD<_l>B$=Eud8k1r+bszyGGH z^%Mx=*P^!U*Qd|Sl`BVpo?i5*p4mNMYIMaAbJ8H>>-u6Hq(}FP;hj)gEF{%Xr#bjd zS;G6+L7Vgya(TJtW0H)Z$pk$~2-HDkXQpbbILv4Wb<&V%<1Nbr4^tBXl&A@Umjz`) zsSK@g!YfTmDvU)zRYm4ofQ!Z3qQDbceStPyH5(y6nlIE!Pw!@N2?**Q_Ta=;NnViv zNTI5adEJ+iR4e^>Ie+Cv9A)>OY*^_MM$H^m89fXn+~26HR>Y4(da zn72~M9~lWTku=h+xs=||YNU=%Mw#*+gOY8g%B2g2$JC`5xR_54-+VcCnnZUg4SC3C zSa`^@e-g`~5CX!&WSBeyFB77jjT>_c3VQHcxIwV0;!dG3;A0XX;iL#x`~3a8(wR-d zMyo7f@;se1JTpR$9}lFDzT)W$TFDQi7=bWzU$-t+;YA3GaVdK{%S_{^q7VQS^nTlU z-@gis5hLWS(Njbg9X31_UU1T8-+U$Y{NK!}tYfab_B)zXrtMjuAZAWzgn(FRo2Vef zG%uqmpq07TwblD|qyQmO9i4e$No1;4@{ zt5cRzREUcPHHIt~m=zt&0G{cO(1-r3hBBF7Sp|Le0Pm3^q*JGKQ 
zay}f~$LsZcI6YYF`951lIt}2xp?JKsEPdxP)q8jDbV6+bY`~;i4U`Lj>A_>i0w{T5 z=l+u<2T#+7s-~pn6x-X|??Mm2)SK|Rcxo{ni4#f$zDUqLRV)+`DbirlH-nc)laVwZ zO0j`w6_dzfxi=CyiN_={^qrTiw4l=e{VP^wZtsR!v$yP8udc4nxuQ4|VT4y;Q(jX3 z{P_u56Fy+apm*2lg6)wH9OQI{{n%zjqRK3N)}NzD%76Z-N3`4s{MxL_79cu&_Q%3&5(R)eBW za;30`=Rylb>-*9kyb|8JePsk@KxfW$=m1DW5$pOa{Nca85a%VHgDeH>T6sope{pVJ zKWpDV8=3FuITOSNL_mSiEVu%(Ee^=*=uczA|0#zmWdRG17~^J7L5{p#!?gfL3&a%) z7xn>dD%8LVf&wJX$9W0ns^Ru-ZoVe^Iltzi!4=U)NcwOS*}nJEcTji6HU1tuHr{Dy z=-oeTa#-K|-V7mnN50%J-WbQ#{;rWy=oN@X)II~Id#7FLU&1#K=t~(;bMv554DdLSB|0(^EUo2Rrndoh}wA3)w*p`4WE|@kcK>6H% zORAW(&!JUGgERs$rck25CD1?|R3ourP!QD|ir!ruvJ=Xh;U>jfs~K)TDby&OW3f{S zp(^cqH)>7|G&AxZ93Y?+s!=MLD=8_gsE)~Y=ISuA5(B19#uJ6Tb>bXEJ_82^W@fg# z$P$>GhL<~yY-&xqw6uZ2AzE_UtMtuZO}P(INmNq~-linNN-7NPBA#t^k8|(F-KgZ(PS4 zZ+GQ$oz|JeJ4BnFLv4@C*q_gTlU)9e*%pc209?j?wdB{@=4OYtdD`YG@ESK*0Qz;Ybpq7SCKYW%|Jq7Lo=aE*e(L|%|?LEY@m4Kx|9+_Y|(mbGMJO7*DP47MXye ziW8$?Fq7@Ui|Nc)Qe(-RdRZ32VO;@^-VrD=nBZ&)2j|WDBgkwYFwyDG)zx8 zM0+#*fQ~M}mivxz@alZvb;N$AIo{zh1Z^rAo$%%IVn)m#^z6W;CzsLMF5neVD%gVK zi$K44H0bL@pu7G@+|OqLhzVXMQqI> z;fiIO?2z9a`?C-2gaAnx^Md`})YQQ~raMhPPFAv11*qy_W# zZu?PBRnUY3K`oh@*vOJe8~RHvg5#|?|K%SBPu&y*Fz|(SFawIaY7Fcp)Q%)Zgj2GF zkas<@`2m#qurnwX9=r|~)vxp{Otu&aF z6EOXLgsPc`ohSx-(P$Bhk|XN5YYz}7WuxUXpt2XD{vQ3uok3!PwD^m7^faLi=eL5` zX}Q<@4YsM)oH?gvShVGkG7^L^2kFvtz-_p-ki3;#eZi$uTRx-`uLoA_UQ7It-TI8*pK^x(Q3>@t#|yrxVSuQ;f;GjWHjKNDU4*#T=?y7d97#&zEtmx_EvFcsqh zoU`F5RS^Aw!{?&S<1LCDBq|g7tv&+>TK+8iO7}H@@@>Yy`H7*>sW@ng_0Obi78I?$T@EU9|r}g;!2Z^(-RTB4V0{?GqPl zI$-;G-}W|P?1sW%AC<9LgT=^?Jt_Q|Ojj@%n@{OGoVGZ&&H7TvtM~5>K$V3QhiJ+k zuy9OJgwoVr`ew!=Z^pEc0>jh1xZK-{b=yqsrKYy@y0;0U)x=jEWTFTi`49P< z8XwV8Z{!s3OXL0LD?v`}2S)DKub+wYU~zOYf%KDkt764u%9kDQo<5U#5;aFserux| zdC(%&bqG}>fmNugz-9MP>P{Kr{ar5B^x%;$^^btapG5bFEd^sf7~1LKi93mzx}i!> zqPiYE@pALBt>4Kti&Is9f9~Ynrd<->^6Bf+r9`-Fxl_}JwEgbaTYh(86s9+K-Ltb5 zlnt~Jdkx&4I9^LT;E&6To}V&?%fB2eC%4#i#CP9om$d62-#Mm!dCRpy!)Keioq7qt zC3Yy1zuAC@iPCMp7h0QBFiUVPTtZG2&-Woss&n!kLa!(v1j0l}<@-$Z{J4w?As>0H 
zAs`1yNVp4t*XAuv$bteF8^?qQ(mq`m3QgE7Fzd=(j~aAGRCU=^2SY)rtXL8G`tplW zfG}cZvLJFPpXt}YDUYeyccI_{aS_A;{T29pB)XHMFB=P94b9K_aiPbt`KzSo1N@zG z-^!^-ws}nDB}lYq*UlH3BWOHtbmlWZFSl{$m%Ua5pS=MALcXepS`&+odJ_Av88co! zb)L@7WmYMR^c)m~k=ebpGs3{>IKwV)Fy@Dv2 z`(-=gx8jRS)dio?(vM1E;YqDwsPL7}650eZQ&gly1S1fPU?(Pkl{)ezh(UrWrKeo5 z89|g>yA%jOrNxR&N==Tw;6h62tNj0=pP@=B|0E{f`SoeWtj(KenO_=L19};??N6VT zSN-OnzIbk%&y$9;NhNz+tGvEInSSLCTtjMte7W1qRZ34Q0_UFB?I^3tfOK7mWwq_C zKa(w%cCY{FKh*Yqp8+%Er+D@^D%{o8uEn+QsUcc@ezy1@QPDrV;jw(TZhd;&ZM(J3 zcdJhj?;V><&wAuT5IuLr`QYa5uF0Ri;WHNNj z!c&0^8&C84W{a+jrLrA?$?l>p*7o`~8d82GrI#SxW2Ogp{R>+wel(-o8fB(tT>)AF z+(fG?7DiL$&E&iZS_nE8F@ZcqT5ZFwojaL7jeoMDVK$|aJE;^9$?bXO4?Bje4+1Ta z`q1-hg_;m{-OoBR92gtlfzQ7XDp8c zLH|cJyg1X>*M89_T0K#R*Jek-&7hy=5?5`AsOwP-A7bp3u{(rv0E}b#+4Fy@7JtyV zNWZ}pRS|!fLoW8jQ{R*HnqxkVSu!v~4mBj*bksV-1v3+6_eU9YwoKU5+~CU~sLQu^ zU#I{0aVg`wr9{V$H}W2*I+R`Pzu|44{Pgv!ynm}DId}cl(&JuZdcD^br_zQb*Q(=u zQ+t)iFMVhIla;yV){Y$$wobliHA?YfaZGnd89n8XY3_;-TN;-ho!I$(x$MIF`yq;l z-ZzdLykz_SA$D!vdk^@(vfUdjQ}}G#pWW9aYBFLL^n29!p}*|I2X~t?G8D#iztwJi zuG@?+_cQNhux5yTOHRgLQ`ef3KX-u!5q^E>gMC6w z$(I*`7$z#k+C8z+-1X@6;u~E&|oq^R;uo&Eff3 zd%YXUGPU#*x9T^=+i-IU0acT8-fasB>l{9Q-`0w=U`0`a6Y%A)>dZ^`nW{Yd%&n zs^2K{i-yTpcAV%wCV!=(S4D<)&jITiyo{ZlSC>jGJlR23wy)f&$-UZ~Wv#51Tv9Eq zuskQ>@0S%a{K?H%t?QD>Pe5x+ycYjx0GT#-EU&uIQeJ91>qiBTe`%C zPCkehOZc9m@N_ROf9bma`Lky%bzSzM-hC9ZwkwReP(9{R^uBwZIv!X;cqIza2zCy? zj=XzBi)D}M;B|dHtL!vQA|7?Zrcdmm1~5Cea{mGJ4Dbnqw+k*h)3^P^DhW`>sPG5y zSKp2A_YUn7l3jZ*OvqwEd)1MZ&(!O0mD<1XalT^s%NKcv^UgRf+FaYK@4Hi4Mq0II z`k_h(4=M~Uocd3qf9<{MyixJ4JzM(Wkb;4m?eFZQKSmOCC-zHo>QaXU~{3bs>Qc)NeFe zRZy1l23U?B^ewNMN~V_x5Gx4E&9R$V=RHsN<`%!ylB4;8Rkqt8H^c0R+@w z$lv2x#Ape7HJy{p;E8#Uc$7aVnH9cyZk)M)GC&vl6qZmo! 
zbrd^hqhPJ+;FTYCnDr|jEy^`Sj}^{i24U| zGXewkdaZUaxTd3{pkX$f+`0uX&u%??>P781Fk{axpPZwgWDTO{KA*Pva%8Kq*{j;& zgZ~s2P2KloU{6KmUyGjE2MZ{Bc2h5URPbieEg_7KAY z3dXY|yvZ)||F_IPl2~X#WgrWf5(j9P9%Lx*VOHwC0lbBPJb>=nV8xeJo!%tM2{yhD^fIk=eQ3*RH7}`#(>& z(cW!NRaM%_leYB>2TZ#E;_3IOBx&7Q`iH-4`}#IY(IxeQi_-e_S00y?OkKa{@yWg` z7DyVbsYSFWx)+{i6v|O#GyVQU>JY|=xGW4o#P}%Ma1`V>2PRy{*$H@a1Xw;FT*%Gn zVt~oDXwhd^hin}-dUQ8mU*CCa>pR}As5njwEoRq&?}c#{Vy75Nv`K4R={pt^1+snhYBrcA zc03^2Y_m?zG!uSPLXpFHK8LtWIAbtbC~|cN=0_g@a#>)^q+*$5OR5rRE$zOE?uBIqb7d+gBP(5QX}x~oj`G&5V*ma+G;(g@SaTlW5M z$2vLp7&a{O&DAqAW1qjN3s`+&_IjCg@4n9EF+u|@1{P18unN} z?y#|u*3H|ZOs#EFEoJW4t#HTjZ|ZsrLxVFuzoyEro~Xf1%)qFWk>}H*VaDT8sHyBv z-3R0m6E6hR$z^S&rgIj=mtJKw_sWF}FflDmlW*CSOBS(-F~0 z&p&gf-K(t3*f|Uy)*ALIMw46v6jI7W)2l;P)smj9#3OdWQO3SBQyy<18nw+Lo^RD~) zH+#*erW*86Ro&~MVU=mc~K((c@k8kPe^XOdH#8cdU1{RL`mrhVJX}A=Jy$$ydhEks% z@z5a#-X%~kje)vdf&Fmqw%1ZIS==D%p(BL#fj3+Z4-YR0-uFqP&^%;w>x6k9jnac! z9;Vr7wig-i4U#rUKXOEIq{gt0)?4)T01ae(3`~smP8`k+9-HDmZ*JP%-#@O20aX>N z?~vyWx5^#-{Ci}b^*DL*ZtKtuCr{}QEqfl_`Fu=Z=UwfGFX{BDyx{lGg5_#VMXuy| zkTsbKjIl%A@5j_SI=VJa@}2nKf%X&+i#$v;}GfcapQntI##{OkbudSIQvx z+-=*Y1E1UTpL6HWi%$2=qf-UwRoRD$^^gAoOAMtRuluQ&AZUA@iqL+*iJ#rt1CnS4{SX{QO_^Hn@8P#hX^zeh$buh}sx!Q8&}{LtmlKAe3AMwLZm!*vim zhKvP8&$Lb_)aC#)G4d(6GpM}cJGTEe(Hm{Exubc)U%u_WMb6_#p0!90ytU?0qjy5@ zV5R0e1%{njVRA3mc-`x6JiqEwBMlXXcRF-Pui9B7PWxk34_8-Rl~l*a1{v;U9ZXJh95OPLv(yJauZdR=fXX7X@v(BP#t={HAQ^ z6ZoKi|HxZfV|zqKx*qy#xuvr1E6w=eg8SPgr_X$u=^NHQ!)Z7tR1D?l&?Vp>f80w; zN=oAhUrq#NCP|o=t6@@y=HLsuu3{>phW=ZriiK&@fGHbBQ)_YJAifs(4CL4ZVVX|G z4MNia5Cg!Fv@N(2LAQ{f+_@M$HHsiMy&rcxy(PEg&XI!$WA$}Rdpqmye0%JIan}8i z_SO?#(^^{7v`1E_X6LvL7A~xTlbNBto)BfB2JE^`pGLVKRPy zn(NcWlYDP3-Qd#V`ZTTYz)5}5OCx-~Ck9>4Qg|JjaIx)S^z=WO{(hPSm%JQVs*L}|qFfnCq-}m?XDeWEBv3}H+avvK(M z#vQrAYI83Hr=EkqMwfxdFiO#06tf!52cdb#@$dzTAduvS21DW{Fs*QHi*XS}|NTdg zVq#;vAwH&_yz3dT`);T0JYVGFj3tMX7fH|y!D^Mm@?)zDL5!*G^_dV8e1`3Wxq*3H zH?#a%wU?cevmKQe#hEGh{bRiC*MhmF70;^sNnER+_2;MMn}@#!?>aT4wmjiZZN(tB 
zncG)aUbow|?zp$N-qFw39@~Cj+gJXg?rV<_sT+<5w$DoUjo;{nLcXIllmp;pHO1Rzhd;g;z;x%kD-l3}(d#ruEqAW3Wm|9g{%-m>%MlV=>L2u-=3iNr<9t?G+X?HS z9H}4m8Ghg|A^1;~9uRVH^HGL9UopoX!yF9x|L1;vdw@RhLqsdUjbLya*R5Z#-uwQR%0v>qIZU0>OU4O4SCh-KCG9{%DwI;bNBAI@vWK( zLORs@bH_4Gi8U8YYo<*(wYl-5s%}KQIp$!B6ny4Tvcv~XOCn@#nDX}=mXN1ZsOa2SA0Ha zy6Un%V{%x*p3%7F>+a!JnbU)Y9{BR=pNdH;!_)kFy?JB1{P@WJU9n})@U5M_`J=Y` z&8jLtd!P4*V@pqIF4DW%>95^)EKko_`xhn|rH8OFxa7S}!PMl`$&*mpVs0fgbQT~P zl!j=Uw@$O-WDAOv(O5Bj%8J{#js}%#=&wL(1~ojJIXb4|`EyDEED(Ts=%rC(-6<;i zvg6{j5{#uYHitmAUBzRLaJ2=kg>tFuLn?2aXAoP7Dg(vLgyuE#Dm$W!6*{4o24@!; z{_omK-^KL)D_{<34~N2RfvEuMAj1hmEBS;q1Sd}0%$(T)@Q#e&{`!7j^dILwg?1I= zJoL`WYsJKm>avDrFJa~zU#fJ?Y{ZECxJ~{W;qHFZX2_&4gy%;^3Ou+M-%BJHZ@AZB znEo_XRVi1#4LKF7+iE9+yGz6I<4UM91aHBp*Ndl1RpzL+r*13C{>$3iNXuMbKRe_E z54hE}o{O%Sf8y|njQ8_GFKwMybv^rhQl4Sog?Pj4D_Pm+(dww#kypie^PpeW{{uX+n1emo93SHS6`H&z2SrM$tUeoE=uU0ic0WNVq%jjuRY0JW4rnx(_*#W$`|cN)x&dwbcamx*QP-A3qAvyi5`HIEPvB_5jk6+vzRGhxHDE*b%;&? z69znogkOx*fB_b3{+S4hMtdzJ4!{Y<6@NN$(%bTA<4o<4BUKH+dVq!j!R@UdZwA42 zY-d^-!CoYS*_O5OM@eCSp^w_WOx2Ou{;*3|%;QW&-4978^dn-vPJ6Os2a<^#A6O=Z zT3gUH@U2WJEnK+2(7b!xuuVtq-kFp9a;g1_6&zf;jO?@oIub^TDZsZd$;YUgJ<@in z6n|K4v1F#W)!g`MDtH*Wj2*L|997FM{8gWO9H%$Y)*@Y3_SJE8-26c!#QArbm2QRS ztC`ieU7sr{p3uBtHD#lLo(P-0?9|n>O3idU+jP8tshA_e@}F$}xmq(jZo@{I-X9yE zDNcD78=}?Lka#C+S9ZY7{#&>9J(dx2HhS^zvGOBBvp!ds_mFdN?4kX#_Yh@?1rCmP zme_Q6yi%E^q29^uQ^SFNgT@XnRkLp!a?jVzd&;S&jz50P(d*Xr`pFY23PKq$P?wRD z^NS)@(G<Fw(sH>Q4p_}pt16p#Tc7UwxJt3cf4I1HSR&xoh zmjX)x+RFDH2ZrMUwKAzI43GGeIW!8)`(DK<5wDOBU6tPBczz!tCBFUtGia|2_1MIK zJtmwo++VtivXb+o+`Cf7iLUa7v%$*u<%u>~<^r=diJ5N)PEX(Jk5Z%XvaxYn%qSFl zP`wcG`EXI6uFr51KvzRTDo`H1&at0KBR+2Di?FR^*8AoCf<`adNUEMB1c9iYMFY#A zwvZ#h?~Zxa>pvQl1C6j`%j=Li!X5I`);7@U4q^@lSOd?BXY7iBT23ED?3CG7y)pV& zd$%sz=lb9=HWGpw9Zt)~png4iggE{BQlOZ;BwFf)hvmSGA2Cnm2mW;XR%qL2O@@kP zw^1MbWOu83HeZOn*Z9=vT&e1E<&(NYhi?96)YP`>pqcS80zZV9ERc3bs2%M3&e4s$9{X*%KFJK% zEEHTw{9#yx-k*^ip?-@mEy4zeVcWb0e$6d#Y=vjb=oub z2FcG*HQtzLR@SS>FU>j)on@QLkvp7Bw!W!XvN7wgzFP*X)z~;LQI&N{Z`n0IQ(D&f 
z>4c`%R(G{W?PV_MX?r84Rxk8At1PV;c-?Ji>(4;x+Ce4X$~FikG4@GJOdzPL!BHK2 zL_m;$p`bkW*VEBo)50>ZVYctiKy0|s^$G);x%2w|@#*x}e4m^V@*@z6qB$bTe-V=KL#oQ@^Eafg;K`%o_gwjX^;$6O3#k%I}Nv3~~SiZckQP!8^ zhPy4D5OPs0y$V}moj$|3(Ib8J6w6zIVMmk(n~WcCRPRL!jP!r|Ixky0IZ3+joMVk| zrp<{zo*S%RTrq5x-d)axL|k!8*kqU8Kfipe(w+HUb%^ zXkQF_Eu@Vf{#?1h1ssJ8T$rP-jz>#l%>pxA1@*{Lv{grItX<;)$zR3%nlzv%P~>qi z+RB#E7Xk0%OCycrf|AcMW6n$iQ|Go5grgo@kTA8q@cL^Et`hJPm=?EWB>D$XGqi!j zJIOZU3Mx*>#bxP?B@d--KtpK93@}i~%O~bYA=2v{jiL(I5wU2jqkF1$9}cL7J;`O= z_dlu@kMU1t?wh%Nv62QJfDw&z47Q+|FovcO#!wJVK+mhbyv{?cQ_O3q{RK=cJe`1Q zg?R(?BH&CuXWw;9_i~rRfMKlf*d9 zTc0*GC@DT#|E6;3g{B1vH@QE&46_e5e{&^R(i#F-&m6Y+`LNXZKMaM({_rM^K|%P* zmX(cdfQ4va#1!yEb^Sieqoo>#*FEd%=cnj%%c04-BYR{9WMnATHiX?XKy&w&GveWuBB`x!CUhUlUxV@pmEe-_#3 z7MoKm`$Vfk)3p52N)F}{W6yS>`6%8y;BreYn~5D&cxCyPrv%dar5u@!x599K6PZg+ zOeOGhktB$92pFp@yXh51b{C_J%JE~v^HzrScRtkT*ltv^0R5&7?)dYFoG(mEP$vo@ zE{a|5NW^DA*q7gYQjgan2@x~51@l8Kgk_{YC}8jE+`3|I<7$&n!T9*#490o<&^<_T#v-2Gk%bfkpYhxTc+)bGHq;P#tQGXP_ZcQj?VC%j^Oux-4}^R zaRA_PrY}-973bx>Yg?K2dlzSfb1iN*)qhgO@D9eGN^jgjR!M$xnij*5K`)mu4d=3h z5fN@li}F9!dOF1mU%$piZ<~A8mX`TQ;d8D%o?gJL3R0<&EHor6AuL;+R17&gbU@#v ziXemNY<}W{2M?U4xl+}v@ma7Z0$DTZUKIGy7yMtBuKjN|lh+6t@^MO!`H`C#G4Us* z68T$_A`G-I@;4mgsK5bx7k>hQWs7-~+M$sk3=D-Jm(Kphfw;@)?Yj-Z6Al_gV^*_* zhrmb*gC8St6vt19s0w5Jccp$v)%((eCwtoUDm~ZO)+1H6akPqZO?-2M7;>Y}5%vLU z%e~ACj?|o2TCqyW)W73oxBg=u^iwi@*>v-!+r>%0JD`jghM|~+=KhJu>B=XEOTdLf z7m8G-(BX`uJK4l9~I(wKJ;3}T;6Hz>#?{Qklmmw%f*vXd*6-@SV$Fme`hnn>Ef z${D90g8D}!VzxTQhagT8R6e(pL95yknMZId6E&+SNZ_exU_{I#_^fbqTSN>2cTmKG zJK1spHw_b`4L4Jg^M)9ws($+CfQXmbU=5?5dE{qS5L;e2~tns!3s>bs8D0|iM2LJxC zY99@+kd)I6Z@1VrdUT&t+sCQs_sn5k^?Kfw8K-(n^&6(q{QX{8 zgMMI>Z`lR@-I{VK$?5b~bgfqrq9gNV2*;L#&m&=b*<2ccXBfwEm<6r|K*J>6{0pa# zW7jXT68u`XjM+o1b!&u(Pz@4OA|;>xT+DsZNeE3gRV$27{7mP6D1XNh~iY%f`N0ouaAkM*tvuE$YLi%Ws zfxd;0w>O2M5NrX@va9zF(k3xr-MM}|1Sr>L-Z3g&`}`Q`%dtD@P&^5{6%-aD82;_psf+0v<=0$) zL9TO!9mK5rhn$>)VIv2ZeF<9Vr~CI$Adb~8H=Fi!yX?Ow`%`9*UKgir)eF3=+i|FG 
z&(wiV27ym{Y;DqS`f=@YpN&_Zrv$wFR^=JfCONNiEOrV)Lm({U87pC_4#}n-XHev( z0}x+=+GJ55GFYl2geue?TE}CTc+iI7(J+jWU)=n$KwMPQ@eGwSrkTfB^CyTCjz=0= zep>CU+}ti$r9sAI(URRI*|H=i3Q?W#6)Krk59!8y!;@AE%cJ3~F%LMWBg0G`tXQf@JSRRiYSRHM5J^UjJOJy z#Vf3O^ZMSnr7G|`Y7;T1fm-$zbvX$^nBZ1cRR#AJV^eFMr>F>Pdaej^oT8(f%;h8` z+6pHkmpPIWCc_=$8|!ej251HC7min`ag)nE+=K!qt3 z1Z9pCN$gmVXXmK^Asp`EF(SSNPB~ezgMrSQ-YuHmH~aCfE&&_&dcg2RJ~?}Kg-_Qr zTRra=^c%8gdi&rvkNgV4^|!uqtzVuwgT7e&4Tam(X)qx!qEw?fdsjPZe9~3HQ00quZ*e8uad3-Rr#{ zjdF5ow6<-Fbosm>sZw6(71=YY0B-ek>Q-L&?CPn^J56tEwP!R?A7=-)9JS*-q4{ic zCh})!Mgr!cpR)`q`wCPZw_hOz1GBNSx1Yr*5{?P`3%3dD&i{87yXMl`unS`75QIkg z&%%({Q!uJctUMA<-@x-YQB37PYR<0t(AX$kKLJXHVGSY%DH6-}?Y`?9TaucJ2`_~a z7jhj3hEOktJJu9;!L^PkCA3bA9@}zXH0S>Vb$Jdsl+4|umvXYRccM|Pyd2{0usE(w z?UwHkgtF%q#sx7|Kaow8#mhf*4dy9YXP$FfIws$*t82+RYo$=}m3YUW3a%_7&J#k$ zmIRzXf9JTy1mCt`!(C>|t#6|OpX=z{J)Sc@^Y-B}eFl$vG1Y$0+p3B)9le|U`nWh% z4w&M(=Z?R=?Jb)bA+s|aB_AB`{GpeyLR`O0lX&e~jz+B2 zlHcrUJ#CZPTWNvRnBwT@$U)|d4_e$&)0FLf&DzH3?TsmEUS>&?Czn>7Fo;&v(rVq~ zW9BeXjt;EnkSQgTcHP)Rx=nF=`3UvIxb6?;no2Ojht0so;-2Fn|3KZbTGeb`94vfs zn3{;Xd}#OGgmqZ*f`t(Peq685cL@U?~v3n0A;5aXryOdgtyi~2EqCCP$l^lQLlKf=trCQNp$#y?m zWR1!%PiT6YZ?M7M>(9pO(Ko*YKAmjeW$iP?!KnfL+GU))z8SqOsmAs zC&9|^uP@;?fmbHE_3kRUGc$@Vb%taY=WOO0Stee6+H%&?qtlCuTTN{RYoVx4hAR%BH`k?&`Sf zn!4=w#`d&5ItPkZc%3Ma==hj$i)=gKGt*DvfZH$tCUVY%+}qz|JDmTjRAUNUtM#wySvv{Ki=#@!}GtU zP6EEM)fnFfBnDVv{vS3ALSWSeh#dbYdklbL!q;L%IuUn<-m z)A6L1`(G$Csp1xYNJzkCzZ2E~3|ret-_>@n**!1kp|C0VC>!=g6%SIFnL3d%22Sb* zgCo{2i7`a`Po>AVoAjeliPyZQ4%FN;)K|Lj(faz`S|zPbnhT}h zz8t!#aeM3LVY;9Wp3xN^+JASE@Sm!)OmFw1)6#MIk9CGS`HbC{rSKhS6(&xgUvnwP;fu8fBx%d)1{#sTem#avuY7>N#z_5={N@imMoY&U8UlqgqL5QE zEA(KRo5(m!YuTG!I=MS`W6*)jsrTZhbn7X-a8-L-q>{8=3Fli+`Ab!$aqa7}K^jf1d^OEq@!)77)zwNvlH$CtkQel2#0ikLSq()xR)_OLyr=N~`Z zTCFTS!Rhenojbi>d)%4UHp)qoS`){6o_@Enj~&Ag#yWuMh^+x=EEP)Sma zH)@?4Rv4%^FaG!JO>McTHYU8ld0Bo~R)>ej-}^-xZktZx!R2F_L0Cv^G6Kr=jKxrR zx>cHADshgj&eBXDTvK!R<-lIv&0&3?*R_H>d!GiLdNS*W~u^n^9Dis-nIErtzAi=BJ!@A3^s@?EnI2bbsO)z)D3sP4B4-)JmNvK9x&D~Zj<5PV9X0>&6^O*@w0 
z@v%}Hp@hNOwT;W4DSL_(FCZGU$Z_r+I|XzN&i`HgraZa{WwYdcuKMb9PjP)CPX~cHo*y_h)ss*F8FStg1V4 zjL-oT-SXKgCR5fb8|^-CzANflPqvVpiQUWGlhB>NcUEaS2-U2GHc%A#1eqKFTHH?9 zetnm-A?QlbIV^KNn{JsAcH^~>P3~36Gm6)Dx-fhA&xjYVhg;Zp>m{Rnv+jd~SF@Dv zLdhw&ZYh>-{$%aGQ1kt<3#EahHy@C%xo7HQ_NhMxX!LABE40C6$ZJ4;dd$=#;eNxr1-hu z2V&TB3)s`T3%xWAx`(pW`TZhDb0TJ7sj;y2uM+rlZ%}8;C_hY>Do=h zO_hbh;rpsK@3eLQhpqRH=lbveMlk^!G$i$aP^MP2jw^zFM;*Pb1v zjX;K{@Eu__y(3uFN#y`~h4oYmdyq`!__=70kkCf3<=H5>I-a6Jeah%J~e+@)A8cq!iDymjbFZeOwKR~2vG9QF|SZc=)%9nWyWk}S<;$~ zlyB^5ES`==ommYKbD0Pfai0}G^)vj5kltWlpY^oKDG_i5XL2+2I-_+h!f>ntvM|Fp zUoKRSZMq%HAKW$p>ZwWKGf0pdzN_}x&vv6MENerP`g6&h!9UYOw_%H=JlW>G?%``N=&z5+Rb>Hm z$mLJ8F^nkhmsnI4Yy&Tam249f@)?8q@esC>igY7N^5$n8tviX+VmnBap*+l+*0q==HK~AhV=%k|WbA!F2<@ z0_B0g)FhL8I}q6gmm^_~azkeD!bvbl)(8%HWSWqm-4@@pXU_^jfs*CQTy_bhDoCQ* zrlva|KCI;BdLPrts8?SIzB+~=uHyB;rp+7`V6R^?$e=aYmTTEw`pm_k}bRVX-fyn!s0Lj4XmPRd|2v5Hh z5vkxRIXWK9u01sL<7;x#;zMoTyK&WP>0fal=m#;tq#ExTPrEN^mHFrDh?w%C&M z*@=Tzl#g!^iNu*#=d$r~<$e#YhZ|HLwMDsCjGWJArR!yyeip!{gd-T)4uZA^yL1=4 z9&B|P^f$WyXT|yXkC~x8^+#NJ_L^*2N?P4%a?7tb|u(r#R>59TjQo6-6XKq+w?s| z)@M#tS-V@sVJ8O%4cG_;4pELl#l{SZ*;VEgujjpGt^H4pC2(mUnKEA9Ju)q<8^9sT zzL?`S)&H*wh&cSl_~++d%s&$#T^__*Oggx5UywhBX;aZ3Ev{9B)qM$ThfD^)@BYp* zNclmrsDa%T^qEloxV^~akdG1Pp&c@}+iHW^6t}*-aN3$G$AsFvBKU>D>T2C5_bU>p zi#c-tbFLx@aRMkx+`H(N(?631_zl#BfWBo;r#&>5j07s+6iYNYHo+2^oK6dLie~do z>sLzS@8hhToV3A9AdsT-paJMe@Kso4&(!!+V6yqIe=dx8m-QA^dhIu1)jfr(sqgkH z9Dk0#eXvi_p}tl@NoU(ZR)&!N{<;g5?d2gb^uW$lIiVb*?pv7EX&)tI4DO!rD_{=| zJ*uDuK+H)La*%G|_tWvx^UBLSe)ScEUm#J@1`#p-swcobTcO7jac)}yW#CnynH70s z+RqTnv+m@{w*PTtNHD-!0=y&-ilpe8rr<2o7o4Z>^6cD+fDwc^z+PuC743tc41T4U z_eu2w=Ts#J21GpXkelKiE_lfTzr;JlbdRrGH~IijxX#b@o3 zW-;jvyLO$vH@v#GK|oDwC;KNdcM$dCswgoLL~nb#lzobd!2A!rpB);@&P`jhq( zykTVNm@6cv;l;yl6d3plyBh_wDH)T!BsP-O~*#S-T`)H&5!GX>t z5Wryj+q`zs@=fu{Gi_-4g5-#|8$%$QSxJIt5Z8 z!+%f<3FZlaPbSXg;f9fQ^#4o1;~|9(ZYA*GMfxN0R$7^-R8ZRNBhbU+;J$$ ziYV##VqoI0f%c9xp&{A^0e4s8a$t^7sWkn#t!eBs;MdeL6AgZ$8*Ord6A8DPxkWxB 
zVQ^^3x`=HiNIn5Qdr%k(E(rq%(w)-vG#%)*HeJS&2MBT--Qv?axzBSRK@v(lhKILNj) z1936Kpp^&+3MJ#?gO1HG_!05PZe8C<^)biLKNUPk%_QD30Ooz-;=8S73JR=Nu0A@L zb0+hV;vROVRkh~K1Ben&QQVe>HVa)O$OCmG@hw6^cd@`dh>s6LmJ-QIm2t~#{xX68 zYiMwAw{rH(7MZx)f1@L4CnTlxmDIa)wmev6+*W_azR*s(d#g^C^1J@A7MIPIBb`LB zj_L4X5bG~wxn$?$knA43OH$Ye9Pbq57N9b*%;Dn;g_|$N>o?2NZEarG%%$(lF+%Si z_sO}LKig7;0rY)&@r6cn{##P`pkc5_xSDW>(Dd{)OxPq1SY%>3pk$g0MvgI_NL!Uk-mq~#8)cE3{=&zF1REq8?I%U-cqSl3CBG&7f10vBcPa*d+CJ@#eYmgw zfu2D7<=G$B?e7M!e&#*bDzCZm@9ecShe)Ffk%LFnWF-8R9#)VC@}=#-GwwPSw|N<8Rk7Z1QGfF z9`=At3b795S^qc?7zu;9=D4OF>)ow3lM+!LpYmV4*w4*fZ}e;#A+Mx75|ziH2w+0S z0KA)vX^S~abr>e_2b_V4wNu38rYM7kveV5+sIWu~6fzm6ZZ{y^Rd3%S>jm*# zG}4uKAX`IxXU%QqeNJW7tN&Ikqv^r08UiX-)zqvrKj4=NVsl8{_hmo0Zi+AwjHn^9~lNtmG4h{kQY({3muf z?9PUa8E(|j!Ke$hANP!?vGH(xTqJ^p>NtYK*o8M$xNX_=Ep()$jiOQhxfN;i_N9JFz7qM#L4@y1Lwn&HH5HzH)fNz$@?gnQCz= zFfUIvR8PRVg}q(0RGER92Lt~Dxinq64NJw{gM$jEX5A4CIv>h3^7Xw!!~`C${?n$@ zz249Dolby82Dc1y^wF~9F3#p=rRpjDEG5T&e~P_~ef?Eurd@6GOE{+AT)q+%Tv=9j zTXS)e$FKXB@-SxA3!A#+=^2LV=?-1hJX@A+DY{(wx)GF!gW=ac)3@EGsXqi*lNjH5 z&)&(t?$mc9JzR5tzw{X@BV+Rfmb`A#4G5K>AYty=*pv}5iW?hb=m^xLc%P)MfoNbc zkRtB)YkX_j2mV?JtBBto-U~6rtC*BHg`yn+azBvpgv8CSB*p03nK3R;a# z+`RanPoym#c<^OR{3;Aul;2Pzp`zc$D+QAE9(Oz zHQJdOQWlN*`@`QV3IvxJ9=&Fv6d}p=drEXD`^ANQ;}=Vu^f~?%75j>HF7ep&@@ms6 zNXOih7SW#HY7C@aSm{nkm6QMyj|R~FvZlE7~C~TCYjv8C=FIP z@mmhT`iyDnd6C|wU6PKZUz_v-K)0#eQlOsLe%pjQlm=!o&VvKKynB_zdqO!XBGhRa zEC0y(`Y2He5|`nMJ5RN~AoH|%?PIqbQu+wMrwGxu29LPabfttZYcq zFnV-W^jnSYSa>Ic@hX4J{%}?&$t?4hM=RRt?BRY|D;q^Znz=O?E2to zQ$C#tz(2F5V=>whjD!_N{!sLFBRwKy0hR8Aq6fsA)BtsjiBbn8A~0)PfG zQ1@5+>o7h;>>mzD*1+5MRF=VD>|@2NQ5%s!(3JQfy%RE+p_!dw)zuj%5N$3?uqPly zeZ?wuujJK(#hflW4lP`;rV6@oTi^4}*>u6XTQ4c4QJgUTpuPbVf`t8 zL&dG)8};wrpTDuG=0mFDu9{ALp}+i*NmoDj?Pv_GxH6Pnu?Om1AeJC@wGj-ANs82- zWgcjLzm)0_0XAXJ9*SZ*J*%rTH&!R|&ajUL=w?K0H@IV>1u!Tz)v4&b_2dH>=&k{k z#QyW2>f?CECC-hCxA%!`iV|Y}Rz5VOD4%i3=D|V!Ijj2^QFSvUUmVYxj)YiiZ!`^HvqA!JI1}R+- z(~|%`$S$_a`EcT=f=@^pnRfPOOp9O zyEuwHySl`#W;2CD8n(P}V3C%GduA7M-9Sj#>Ucmchu(z<>YTzZNC# 
zp+3x^nWQVUha!n)x1b;c`e&fHff1pxVGI0EusmYM@0K;(3Iv^BB#N17P^dFcUGE48 zKS=f=T_-3bw#7sDicN%Z)pKsVITc^Q0pK3fROXMkxqKaqkjGDiHvTI;#Ox_9%`tsW zTzs(SobrGy1HAvZk4v z)%!%H@5B;~;|$Oa=I84VAD#|6oQ*4>wN)j@VBlfS-aT*Lv4whi1a!OLCmZmmdn(ICM{YhNv=4hp2U~KOQ34|sh z09MfZl5j1s>M$0P%tnN@kxmGp733aTFwlJ>O;$w?j+2iMNM2+!XvCOUSkBE<`Vw4jNm^_eBkIF21m4C^N~D!c_iEz&jhjYAXS0@z(hKPkP4qK zb7K?7WuFNdRc&uc+|6GnV9i3fVWfxuw+CRMh2~ZYyXP3z0`&Am;&(3wDHXoxdfbH` z$Q(jwmRl;r7YT9k{QMvL1vN6KAEe8H+(CrRNiu(+LY>O0DV&GgkuR6}y$L>m1@!qN;5G1#k^rou)>MrT~J2hO!_2WhB;$i>-j zpJ0e2Lu@g~TBhKo_K}l04zisUm@- z9l6n@LjgAZtuQV^YDXd*$?A(Iw~LcG{_$-K(Y+bGCxb0&Eq$qN8fk}8N(R_-s)U3x ztER(Ea^Nn7W=!GlKad*-|HQM8R=8CJx2ibo$Q$N#_eV}_H5fE$*UdA2_dTu2o?<%?tW%GdXMh%SG` zeg#}oBb0rrYHHRPD2as(AJ95dSdNmdI=p;%Osk&RSi&HT-)8sZ+nDbcjjAnD{%qP# zTj^CBS|oZ_e6B2y$4@JC7*jlQtcjMEr2TB%n1U~kw9))e_>3%X6;oS;?vv*S08D@3 zD=(yM{&jfpeFfB_kpBT2rBM3>Io@K|w7 zw-5_+BHj>F!hTfn;miY|A{E8B4b^iEXWo`x5_8XGs;Njg_!(4LEubV4DAm#N5bs0+ zA0CMtx0xI?cb{dq9`jDrjm`&lgd)@(uXcDft(j9hBgGzNO31R^)IKooEnVCa!uDY|qN6zQVo;Re^aHS0hQ!yu0eM=;<-v$+nMP$Wa(2_P8Y zI%Fau-33#<{)Jmom?F0DnzZM}edIPzTFx&inLv5D*bB*y^OArL!G8oo_DEvX@$ofw z-@17dz=Q$MrYW2=q>U~^R3S3CuK9@h1VQ%jhhq?40e-w{iY~SlKr~K%cCDr@4Z)e(ky3b3(Q$xZ4 z(0zOxXAxjF9!W`70J0e3GDMr)&bm&i^g9+@aXtZE4&rS|S|cu>yMq2+l{?`SQhZuE zomm1V5W@s;!=E_Bo+AVaUj>ovB9jr2F&LodzfT-cM1u49cq7hLxSf!iRDgC&(m(!P zRu(czG0g<0zlmx`eIT9z-P8hI#aAeii+5Jw0`fPiCBe?5tOx%Yso{hXg$+c^2xCK6 z7X&<}7@aS-_>zoN677z-ClU>&l1vIIULJK&bNRL>6>-?d*8Y-iBBI`Rd<}ixjhi=7 zJVP?-uKk_cFwvGUks57L2MGiO63I_D!xbBhKZBQdB5DP?Sw1{V0M@2%y7_~#MES}v zI;4F${>a&g(LPV<%)%1+NK$QKNKkr)#|p5Vdt9em2@==W5-R7z?j`-E{Jk{u!P{A$ zg_8T@{dIJ7Pt`V1C^Jzgbw9DX^ld1>M%Rkfy;)))XWvTETd zW?^Egfy@AT;Dxq9mig;e|7^Iqx!S{H#BFLf@2&AlZ*ulxzR3F)Zg@7IYvav%nXIPH ztb8W)_C+dXV+H@t8R{COFBzKgqibz1B` zCNutU?x&shsL3I7ACyxcpm?$HnNvopNsvmiLX_=UW8{DCQDX_`?tFAHW7;i|adp*c zuVQ|pHquJ;^z}*5*$($!f>?j;%;G>0Ar-WnXju@t9nfYV+2FjWD z+uMp?T_aMpla5Y(jgc{Ew)z%_h0bzOqTk*YaX8Bmbck;NC5~bsQ(=^rz+8dN%uT^u zIch96&AI>l&v_tN=le|VNV!xlh+&5TBt!~IfdHZ93U>>c2FM=(mN;`o%~UO5#}Tj~ 
zfL?iiT28d8zZtRTxU3xyd@w?(dH?!f@Aot$R%(Cz#`O~r@gwjaf7WobkT7J}#j%uP z=mg7)NXB)1yF!6Tvs~uUfPPiMUH}hwX0%^#fN3oBT*RaUcj-pH!f4<+uzWp?jn%*+ z56Og|>2DE76-?cZ2S1{S9x|2~AQ*rvFO#9)v)W`3=$Qy(lLX$MG2 zssFnPq5*hCt~QMPzD7=8l=YSJyGAd#BI}FC{Ix$tu;5wkO)r0ODV-R2^dvlB>Nrjt zm{re0Ycp_Z?*Ky;Lt_?6BP4t;6tyu(mn9K3P|O3B8g1mt>giGccueQ=?4wVG8$j{~ z1h%Q%qu+ScAv>Vpg8P?)JPM_E6ara8R%K2-J06Ui4~(TKDdWP|0BjXA`}vMEMwkLz zzrMeY&QkC`8JU@IP6uM9`hc(Kvt>#;E!Mv0&zS|Y17uz|Gz8(=orAPM2YwrTz*w*M zBN76{5%KQ&_#DM;gAWW>v??$q%xslOQ*Xpa$E1=G-69$tH8_qe?;vuFzyShIlV4ca z7GyjIPU26(^9HXzS{@z+2h$&P>V=>7wUqKnVL`#!xH>$;E%|1y; zSdNeSSQml2?Z$xy@dvU)t+aWGf=94Je-(`pq<$T~AQO|n^Y?A}1Ot6nSo8ezs&s#u zyML#ht$gmtIE!5z>C~k*@5b6?AF@%`G z%CHWHH9=TDyvy5iSH-sd$82@vliI_F)dkWv?9ywTJz&$SQdMPg);JAqu=vPGXNuhN z*L{R-g#Ez-gxFUwoQKoe7*o?}SAVZZ)RL|)>#IdX*uP;O8(aDr;64!a^mycxqa1bp z7IYV?{VEIgGQ7|E;`i>U)VTdfEF-aW;V~l` zPKemiQkeOR1KBrbffS)C`vOGhCW$;`r{8aDT#70%nq#IM_v;0Fk(rrk1Q7wDub^Nf zp##0XOMSqs&Dz~uW#Fp+{$oCCO=2#KeoR)OONY? zKT}vFpXTcr??L=_r8jyD8dRmGk5w&MG*Ewiwb>yd%x8Dz;Q#(h-6cmJGeSgByL!M(PyLvFQB&q)!GPJ>KWA9{!pcrl0hwD1mR+o-p zq9-ark`Rf{*f1@n=)&>Z=Qee17FGxFah<^HB&uK7{ZUmHg<51RR3K4_&D0ut8j$9M zvq#Q)-fyn(6#Sgf^ZY<2-kqg)9KiyQA2#he>cQlXZ@BAwF4DB>$#)IW2=aN4{uzDP z{lu^UQE4bT_j1T@F8CRG^-{(kUlGw;!SwI#+3sZoF+|?^dUaZfb#VDEgcDo#Yg*xV zgr2DFyu7@4>1ZvKm=ZWCyT}U02}*8a+k-~?n$|xFVc~aA1Tw31jMB6BJN8gkdhuU- z5+~dzP`ii`>oznB#DYeW*ANH=i@o>>4-}}NbEKbe%>c zGu`_$Ms;M}HLKZeq!2y;ARbyz1sq#AQU}S?hB5NQiPuxxz8OZbO)V!}v+5ME_59S% zJpS&qg}t=&qeILA0f`zMV{K)p*3O)Xi0sN1O3&F=nwOUjjW{I}@?G;N} z%ti9MOHH_&HW@pGn3x}9^SD0f6%MV(yR0JVfWJPrEv?7aYOb`^en<`G{g`ikZr`*1 zYsZfMdNl0D#^mDY9Zyw4^pYG;)@3?0w+wY#MkW_7w1Qv9~h2(#Po?J;;I=4BKmjy`Se{nplz z=)=*u8(^9M;h%%HjGycSTOQ8gg>UMKF&8=_*goE8YFL>Zx)nLCfL2z6o@%W#k+iwt zm@Ul2PfUfXL`BkenU>Vm28!7R4{zdU>J3(9d2)Dt^}n>U7d(F`+zze0YNH>{P4-(v z5(5l|VC4psn!s^$Ek_J&yO3+{PlV}x8c3KJ_pE#~c+mn|6ma z=?NQ=W`N0`a`aSg>h_s;D=L^B9IT(yJ2?WJ#pF|#b~Lv;us~c@`)0=J$iZ9pj_OJ< z_LZ5vHv1OvWu00$E~>nFs|N)eu~SYY;Jf3E*Ey3*9 z8HRAHVf1;l=!1NaB%R 
zjv~1>t`%W&ThCN3S)fDa6@j6ExEU;6zbDD{V}UHxUEK$abL4df_=L6U+fLFb9pmWz z49o}FYvPM1hd0a>d(a-KCRX>Hy+E8;Ll9! zWOrAkfnxp+?_m3hNGp$SE~RV?M2K_?g>q(SbhI9i0(7+H%QgNuFGQ0gBMZDTJL9D+ zv~K3rCSELc{P?x*dbrc2+&5OjA|iH>fkR{z3k-g6W=8wesRw1fD1QO2PEZy?BOn(n zR4AWrFE8&2Dsz_J52NEWOq_gw8L3O;fOoQo#AkN!-x)diL?=L?_+my%Dl=^^FVb(x zmF{*~SWP!#iAN(M;_ki^iu2>a?E*}__7PD6&u?dD4u(oc9QbRp;$>^gW9c(T9T^!( z9I#~PZ601ZU##x*arn&nJ}!ECR-5AE8@_g&$aok<>bjTfQ=NrVCuW zjE@UG_P5}V+}gQ+V6`D$Y6K?V!PWcZ94$g4Ufna5w~jH3u)hnwekI5%z+2anLLp(=#S$C^l5gaODKrer;7!K8QI zZN~2FHl<6?!@muGI>32|sWGflc+o?1i}8E{{4X%t@-%q%Ml-jIIlbm|k*q9fkWnqrC}zh?rT4!me=^ zBP}4FK|q}~z!BBL)3dWj(dd_k?*k|u@}r5;8*x8R5eSPjkQlvjrrCEk{``3v9GsA^ zC3VRG0q`GgU`51x+{QIoK>cBzHsG|3Blf7;Ma z?oyTGw6 zyr+v>wq=3G7i^u`&gZRRQlw*{xP{~K;qobw2K`S;2JF3;UOR4kHyfOC z_3~xK9D~y?+N>7XKLF6$f0SUHS@V5_A3EvCL#AFdsqIw@CwcD(mLhHGI5rP>|GL;1 zwxb1b5D7pC;;_Apy?dP}r%!;;2cQd< zb!p=1t(|q}C5bys9M2Sy(u0&k;w|x2QqL}keJw3i}@>TFHFdt3M2?feo6m;qoGT5fj(wT1sl45jkmLhisf3{5vl%))B{ znS}o3M6!^jWIcL=KAYOwp_^~4CF{oDH7CrC>^Wq_nMFyY(QrI80+)o|FAc#7Q0@EU ze?*aweGuCb2L0D4m;i||^zB;{U?`!!g^=+0O`=t_85zM9E0InN(Y7&A>;f!#Lo90` zcN=4!tC~oMGETEnL|bf?cIr(H)vvUetBkg#-E5e&@bBEI+2HhaonN_J8=Dz?w_Q;- zh{JSCb|9p)A&Bd+o`FFZUo`g}NT74Bgm zXE1KoTvfvZrz|Y|UWp4_?BAH+n|AS`LmyZD(W)AfTSS73-`ZpewLo_mvlvY9N{mrq zl>oAXqN6$RPZZ*eRP~bCel&fQdFL^Un2RqzJX()Ts}W+_gkzGV!1zE_2bW~h)n9N6 zXQ%niGit`Bab|aQHPd`{p!0KC;CmaBoZTg2cD~*F;)UE#%wFW-jEWM+50-dN;bp+& zfVJBN95He_NiimL(e|OWJ8O_WfaPr+t_S?khw-fs!7U}V@*W+m9xi#1Y-B_XLydCybC9LFA3!edOY>1$H0@$61v(^#kQ1#ks zNO5s-&m;UB=&KhuWB&#HpUz0qFunblld`90--!YCkD{1IjH`SOqb?KmZ7*7Lakx8F z#kZe7i;v0&O~<%mZo$PVASmd-t>NzKs+8A_$Uv{A`5Bx9*lTeToLi;ATOAx7b?q-1 z^(u7Pb8%o(^_s=Gl2bYw$bCC`^=hBfHKguIo_u0xVDQ*2lGK9%?ZThgeqIv4v^NEB z@gu>=M8a&xM{rD6&HVdv{)TzQ;J`KO<_L!?A*{Vdq_^nUwW_k}PPW?G+VN`>6BC85 z-*o-f{<#w`oIKr=B};P@yNv(8sUx?K-!{_ljMp5Q&5Gd4Cx z0_COl+laoY4Fm$_^0zd6i(Ab)+s66#d+_Vvx?E{@%g2ZHIFd0A_&fjS;XxED>ZTW@ z;wFL_4e$}{^!mC7KDsxbiycT#6ToC#5hP9Jjj5V(+wpg#ZKlUsW`4pQYX_ 
zn*kNS9EG!LYXYjX!3z(AY6JX=L8Z>B@*T;E-0-t}@}-gC9ibL*6vXUBa1Y=n&XtQ@ zJWIZ2xz#k()C31w_kFs9j$Vd-A@_6{)glA5Y%^6WPI?0Rao!Pc3N#@#!zN+}fCab8 zkGB$SgFX+w1_?w?7Wf_|oYGoP@)e|V6E(@JQ3)S2;L0kIz0P}bz61L8aIX5*rt~DM?~`1-rKLKL&<;r=hEqu=GPSd{Y|`NV^qln zOZv@$?rjP{%7}IcYOZeA_$+!UTOfeYr9DW~sxMXw=LS{Oh_pO;aCyZ2g)vJq`pBU} zR+3zh9}!?#5_TNnoCm+gFXL0hB^0-fpYr-0?AQbqhTa>sf3LsVG-m~|lxBnFPzwi9 zg;dNU6UWe=)|N|uOF4LW{G!-TyTpRtffi^8t%{g*LvRoI;}^WH`C@T$vP~!_sSCw@ zXx1t40n@zmd6C&-hsZlWzjfqk_AvK;Rt>Mm<3x;O1kpxpqU8DaHD)tFgq)maq`nF4 z$4%yGG1#1ynnESFO#8X`bbwH&cv-;UK@=POz>H9MwuuEJhTKf~R4$jzLXaz4oVI(v z!7&ZIL>9BAT8e&OL*q}`jQjHvksLAw|8{()o=EJS`1J@ckg&G{hKz!58W}Y z;yLWA%Nj97Rqqzp)m}Sis8;>0-=oA?7y4Fz~Jh; zQ-(ZhbdvK89L@ERNYojznLUO`fD~23R6|`T{kg0N?*I%R4t3Bn*xkukC%e>Z(|)x@ zy#BS|9ApSwkEQd>qd1JCh@%^dC!nVPsj%>?mXHe&5-gNcN&ZRM^Qopevf1v(%1q!D z_D_?nM~?8Y3_5DZab;sA;z6=5rUqQ~#>p2k1;gMl5BT#rUWiKA_9jeOhk&x5G_Mt} z;8sI^Zb+gAWk}-xyoInG2wA)=RWU}^7he21_)4I2iNn#p3O^jS1N*oaYT=1Vdv~08 zt*8+qP^#6`(n7Kxd)>Ot-^Lv&Q85OifD&Y0TrNjIG?7R^h)+J0UK+{L9ly8hn8iN- zj?QAkp4Wy3G$JC+^brOO1K09ODQ?{|T@It4+=htN{fK+SG&k=W30d`r=Qby*UJDa5-Y~Ee`??iC_rdS8xGknpid?I8}JwO9LNs{i(#>NJ-QHlzc zI{<0y7aDk0Ns>MBzGJq8OpX--X?VtjKL5%wwQk$#@ynR!Sv=`?v8QLI` zH#;Qmu1=5E3#o zhzjU*f}G~6wu2vplU$!On=%Cc2TedzB*PC|j8rogR$^Qvk>J5-x$3vfx({+Jy z!RwG`LByv|yZ`&Vfph}ew1W`h5i|nbd?=#c0dwwt_{0DuffTvq?tbp??>Q;T1D!W9 z&K8+WE065?NE4^sWB0L$|K+j(H+RjwV*(Gj8R&P5z5n2TT4xs|nM6ei^e4{nAiB;l zw0T18Mkq-{0!U8@SWk>=W>e#@20>x&_Bn0%OoQ-;R&Bb2SE!F@P?a12ACH}Q2r(cQ z0{a1vlaCq?h3DY)4ZrcFt+Q?EM~a=-D_t@3O@RtMbx=;In{<2@I>Z&K0%-aVxFdcT zkul`%m6TiV%K+xW!bs26-%9n zF%jDAJ$!iU|5^_6)SK`o%hDpBy}D&RqQWVu8pB;u7?{q-!})HqQ$0Uy+jbu#mxly> z#L0KHNd|`Ez3|qd-wP(j+~zS^Ufx^yZ!GlhzxjFDb~e7^$jcuAZ?+1h%Yx7(ohjsB z966wvUajAJR35h*-goI4^D(b8Vl}OC`Q9br1!*$OuXsS!lD6%0Aewv8_eeOpaY;Rn zR725_H@xXP-0bta*kK^eGSJf#=#+Z*hJnhB#D-t!EuCXqYqdf*u@YN$uR(&7N|6 zcls>mVvFb74%GCPsXw{vTy^-^2w)OY7|0%MU1i0VY7ZPl{+oelHf3tiaq zZp*uzwi4qTDdVnn$TOF_|EOtNT2!=c`@in*eW#6Rft8Qec&_=5QBxkeqM~MmKA6-7 
zHnW?ityaYZ`Q3W*dZOHEJ*omT=P-68-^Y6b`<^F@8BmuToS%3VwgaKU(f#Ts78b3DJ~l?JAh|QpwZID? zM#_A;#PPI&L9FB{#l~30SI+$#WxazpH)_=gpUby9ch0n&j69&8uIN~h z6pyS$x#?`7Px%UTMcm7!g9hV2tQ)93W_m-G7GWDeNrW^hG6kGdhAp*?ot>5M)M6Nj z6Z*C254vM}cN|f?|EAQX(52-mJ82m!NL-t8j;ur$j3p_2Q(?xpf?uoc0z>#nUK10OrB%hR&$#!>hX ztdXcp7B2EwO4+Bds)rZoni$hDa@DM@y3cWp7q+n)4sO`VRJABgSJKPn;pCL7GfrJ1 zq>>DO)F3whlOCcF|$zO z?oLKFHjb*>i56#`vvnmU?Zq}kJ~@ZddW=7XuwoI#1vFvU;cMF3W-qEvYM{{;mV+%A zlNKq}!i^B>gTouLraB)XxSOHICE9wNz=&~N8JiE(-`66(bLY-Obkk-!o5i?7iHsQ& z_ZGcr6I(G(VIg93$STxLoRdbbNHc2NgRY2mpf zS39#TqFd>gHGYSy@Se;e^}CDq@8#A`S}qQFpTiA z^%MBCNk2G8^7Zk|y@28n5JJFN_#)1~8 z5@b#`z$Rh&<+3G8ALIhR22p}Jgj7xgkhb@o0bu>nevxM}vOpZ|@m>o$?4dVm1xiWj zwBMgG4h%o~BOciJ$zO}WI`UFc`3wfr@Azkz@w)|`Mc39s^0J+voHn%Z3?eA<`{SFf zO^wo7J-q5dL}tEy6|jeb&?OxlheYicAvuDw(gwddBKcuvuDwCG zg+zLxbF%zSx02-!gFR3J=G%4%~LzsDdsC<_H|yzz@QOf zCZvx<^yLrA9hB?a@dXWA+P@b2ZhOXpw|8ne;9o#UbAC>cZDk*lI`}j(sr~ zmSE^2+VJ1lyE!4WV%u=;ZYDE2lpGKp5CHShqe}~L1wg%p@F5`VTe!H?F`yoX{xPRD z4sNfwxVYOaLiv0El#nAD0l&_fD`6Uu+-l?%qkM$E$qWiRiVIELK@%0F>$ zmTU_~=W0edxn`wh$la>;GWGI7v0aA^R2BDno8S?sI+3mf>glOL7QDBJ?JwvmrTirLv!fJtoFyz`|~ zcV1?IbJeL2_o->(6O56h;ZL%dfx?O7hd{E1>^@7{x-FDV2>Gs1xoO62%f?Cjr0$MTDW<6yN%Qw5psdy7u=88&nFmCfJN_9V<7>?z{KLhMCT=~2&;2)ic(L) z4$kyZR`w%LoE%~O0`{8(8CPrG4m1@4mY#blFPVpTg%j>=c`(%T0E!Sq^gU@pf-4lS zsC&{kCo=#UT5X$K->I$-UL_CTPVesrAQt$IgR-saR$k@)r0;!aVflb1^Xj{DkIK)q z=XA?8CBh&xj#PK+?|-nC)bR~xQTlbDrs%82p*Yd}ckDB@VgG9_nk+zc*W{^?7o6OF zhGuH)kb9>p3O*&dM32bf*ZVL&-pS7$p-X>$9Ov2JxWk?A`k@<7y;iLG0~dA8-W?KY z5l4!HU0`uy;^51i9hguzC>XH2&IlDa_HF{GW&GBvlGU)|HIG_mac#NNj_+SKoIGaU z5hx$dO@96;Ca5DJo%q*veY^dcDcne(EqT;X*2Krpf8eBwC4+R(>CUkaA4y01wr$?+ zx5ERzY=(;zHzZE-a{d4p^W1wYu$@C7ry+e>=N>dvsX!TF?(16FZN?QkUa&)F`pAQY&|#)xK~cBQ^GvpommhmY#q~(Rl(XQShAmTNY1iBK*CV z&cZ@{#$niCH}2;nqV>J z0fb#57Mzvu5|2yEAb=4&7Z!eVztGzc!wk6rDxqBM*pB~GNFQBPZ^4D|6jUJSV);v+ zEpOf&5qgA!2txq$Orffwu}?~>^JM`yQ}0^{cnFS)oA}-9 zU$X=EAi+xV{}=$mGB`N<{D=C$G%ex(UOnRb@WZ_S_%XdaHgr&^ImNk`@e_XnXX2lt 
zrRl&0SLFxQblU@{N+@9bf;+tJC_l)$t+)=4>*;-zid*WotnKD#_0=a%Fp?4La(TG# z*Yuoasm`6FU5ONTcQ819p4_WL;ANf|I~+Qf)P0^$}YPhYq|4ZUc4QV@%* z?(TGFIM3P9T~?U;6MqP>e#lHUa{Llt*|{34si{X9;x@a<$tfyIbGOb{1b9On4k1Z< zb`}Vgj{^VtgYMhfo=Ll8gKDyl13an&6|+?uU=2OrHq);}3R-{FVa}I#*~$Lp)#qIE zBjA1BO59ES-B`WYWeR8~8uU{X4lc+>m8`51Q)lsPHK2G#9NZfOhVNcu!mh`=e}`v? zkh>rpCgQzkm@n89nU9*jr*O0j5eQ>}Yj4T;tK-Zr8n0_k%4E0^E)KSqb zw5x#)MWYZq*NKWLZlIwNhbtCH9ZbNVdU}i(Xy~X6Ll9gC&?WDZW{v^slTGoqfdIo< z6OvvX5F1rfQ!~Gx6~YCe0!cVMRKsr|9*T`xPEfcw*|?B(`;H(Q(fp$r_)_t&i5;8i z>4%}}Rmti+7`j|pOwiVj(Gwa@BEG&u$_!5IC?G>0!>}qPc60 zEVGd3UZb>s7o%B5yNm9xKzuQ5Y}^#s)MVt?d;Y=!YH5na4_0inRFt*TYRbx68M%I& zuiT*u@KV3EX-7{%s_g3OxxRhW0jA73?BnkOp`bg%7a0-p1GVs0`bKZAL`E7hZll?E z!lXPVAa>pJOWM1cXsH%&-o8zXJKOo29mHcBoo6jZN9Eh%&m_DU>{5C<~rD{wxS@QT}CK)lkL^%b>yBu$7)(rqkVO-8bSsR*M z#QMT1Xu~e5%G?`evp!1r0em~`!2V+A&b=F74hEMlZZwe9p?<@+sZHNsI5A74d!2xM z=4#0%v2`Nybc3|#DQw)z^xbxBNK9JZ{-U$GB(o~)#d4{s+Yax z5WS-9SPZ9rVpQZrrlCSARa0lhW-gEejEN7Q>QEaLTn?jT``zqPUe1`?Io+L6qjs}* z?M=H8gx9Y2`41!A51pXC&XQta*=BlEob6Nd?k1fiIq$H2@>{lQQ_!%Lsy5Ta3K*A( zdUkftODJBaIm*Kq{#2JyM8HM)+ZSz?l+1`1Udg?pCyg^qE&2``MqDjbzkjOl=I%r$ z+g`qdiqBO%?d13hXjEO)snct;a%V;a@*nIrWE9w{&vz(ICMn$Mshr>8jm^Remj6nw z7uw)ju%R*Y$d(G7_G%3)({=01ys~=GsM5U9TOgyyUplM9#aM!sC9!pggmfoz+4Hbu?2bw^hDw#|%H z2+`1G-fb?_Ye~%Q7tNC9vlXZ@S$|$$SV1{b*NNK5Sn!T{*2ZR!tngZc^(vtsCYnQ= z3k_PV9G7djBCf3e^v0^laMFOQW_O@E<%VtdJDV4qGp9_p*0fUGS2(A$OrerGoB6#~ zJwW&P;^M}^uSPlUoKHWbe=OF&9=T&TfA>Rfoo%(PTptg1x@B2H&s=Bs>FISBhu1c4%9d<~AgAnRAfsxRx}|VO;6NKGbf0nem%5J@wBmY8wGHOabN?tfQS0`D#-^9j4pUr0_ zr{KhqLq=`YyZp2(s#o{^oi1sY+;ljLA^GUu`O?*a=UtK(30Wnvehwd|wU(3A45j}4 zGOO&XB-vNgk#SL!MM$cr_v{}S-C#$p3I#d?2gzFmJrIwvxyUh1$q z`tz}_-b zl%LkS{$I5HX;_bK8~+POk)g;K86qJ?A}J(AMP-(fltM~TNXVEnCuC?4Dk)NFAPH$w zs7RX3MG|E!q)6*?xbFX2YkOWi&#UKpb8p=FRp)sO`~IEw_B$m1?21x3lW^`v%Y}J| z0)Bfb?mAn#My<`_2$jeCysl-;J|Q0_d-U+R4S|*w+LsEyNj|#S9IVs!)7%s1w+)gp z+ErwpvS!*3t%qS7zUP*%F`YJV#Wl;6r}8?BwqMp%8NJW(hx^2Z+eiNlzjbK9U^@?q z8;MccExq+S&y?&EIJIfj^(!O#Uy-ax)3852zh>t;xjwSaqyGFDuG}y4Rhu0*djvc+ 
zC>Zg}wTHR$APtEdKfU@-e*1A(b#UGU+s~a+UsaxvP5-iY_2h9LyK1(mwOrdEUodMy zbK(AXl_i}vp2$2Gw>GK2f==cWoqKEBhE#ps?!Erix>*@Rrh3Zv{v#`WDZ@&(tC7K$ zX-+3)zpP$Zo2%t@_)~b6!t5_zz53>Ti(hVUBe6k!pLXwQdxnfl@)6?9M)!_ic}leX!P5lU-dKy)7gMya?agRppVvXZt4$TO#{?l+D%X z@Nm!0p9YsC$F1|)8gXLB#$`PU?pYo>FmF_5_=ItzcLmo~hDjMb)EM78_RQc(-ENy& zJp65_ay@xobE|ZE))?sy-*)X=te+Z~B%`U-S6SB3Q@>H8y5eSIc9iL(K6AG}>3*~S zP3vO2xT#~#y_u)CrJc%hbM;5EPo#F&zP4+3`m%YhSJ{rEtE38x6EvpHJN!j@lA_ZsYFPpH3tuz3#i{tc+FlS37H!j=RIU2PO9EQmrpJS$FgB6~BCbcl@Pn zyQ-pPR%Tk<*uII|N5oEQbN1)gCW&8@@|$4k1KBcv}b_Awj(=SIK!OJRGYWzr?b?#kI`mpLY3 ze&+GG47v6OIbVNR%qX2IIq7=Jke;5qn*03OHe*%x#dnpDO#Av;#_em_Q)w^jb|+)J zqRJ#m#WnML{@#20QM6j0HI+|#FA9k`U3Er%?IY=_`SxY)?>#a&I(dBI#+(rk4p<*h zSa(lVRsV~G(ZU2x>GfJ+No}-8y;qPhO1LU@DJk5f-MCxR=IJz^&)lW^Rn_O!E<2ev z6>Wd5m+s@>47Y3&3&Jo%#bWP`l-VMHM?i)9@T|=bh-1QFZMFO^xa`X z*BeiExt%yy7pvFhfK}y4gV2?B9om=A4?5H4t-26%Pu`7WcjSijx(Yu>k9AK7?+c^o#j*GE(PMWdfMbWKbCYSXq) ze%SHe=O+y=kg(RQ{AH4Hahut(>~P0+f7Z%vIQeMCzUjrT*PCuy-LST`{P}qgB-%&G ziM_Ws49`(Bd*Apt-YF^jwc5#ZQzk~dUz?NB=ERs=+6VP?It2NQeI>1S#AjyDG><6< zU0?KiviIDacj51gKC3kUQ0+N+gMo+7;;{E;uc=I2Y5L=vtI8JFttY-1^ga0SgY;>M ztX&?Z9@di9F-K0G+je||zRFRrbAgJ}Ka@|NFzGEsN}@y-Z6dcsOqzeK4jjz*5m&@512+?VhdNVkmz@{zm?i zFUuOUt~$r)O~2i_wB<`)-UQ7hA=;~R@8=)5_tQW5{dNyz=45^bP0K;y5_aQ`aTFbf z2wOmGccDOe$~X1Xrn?5t`}%!I)M12@ znwiPl_IaLXPM@fYG~em5dUe>mAx5|VY`Yg@@oxLJ6D|re2m7Bal|;^z=i%#n(p^$# z(lmXq_34-ajJjkzn3Za~@Z@^eh;OSyUax*`uKM+-#pMg-*}+ZJX| zi|%@`|CY4JSiY+Nu1a>SUc15kS@`@BYhvaeT9p=~wli*t2zmb25Hn@hL1fy@h`5(@ z2yZ0f$iR??gk^~tsNUHZ?HDa&jb9NX(eugq7QFFx{)(cIv-AoplV&lx(GUbOyPGe|>< zEJ1s<6TQ77+E4V5mUp|SBm2Bj)z9@+&&B)Hs$6o{si@CCyKY8@i#Ga8I}JZP!h;9Z zSom!5hsLHR$KT;0A?rUyOh2jT=+b%AVwIY_dtNV=M0QP09r4A-HvU$5b*Yd0inmX1 zpY{*GH9G2ctX4+wOcnKZs)NrbKK`PrdScUKyScC27dW5uZd<)Y^<@3$>!~AJ^hV7} zy_}r9-geTY>XyRGXT!gL*R88K6%cjxL7r)}i(CAkDWhhM*tB-;VV7~HTW2kw9bs(x zH1*5jxKZsMJ{rBk zy%nZ=m2_NoY0zM!;1kXrzWlUvooA&WVm3nFVmFR%%%?I3{OhsCL^T?-npMQV8wjt$m z=E+)BKt3Qj0f3$8_^5RDx%+yLfW_#oLFif4H4@zwFq&{E|z(@#GZQVWa1as!5BH 
z?y7kH(cBp+lWy2}?2O&i-UVuq-2cvqe}6SKean#SDknhNqt9yk#UIZjO$Q z@Qj;24Zqn1`^2@?PvIP{0Fy_+6X|}__0q8}}0Fxz!;g(4xzESHg*(a#a-V-nL=ytakZtd2X zI}YE_E@c3Ex0Ns`2*)M|xE|~an}9ekSj(k1k$9+{j|puP5)|LR@0R#1{5$K&-^g|! zJI@X)iZ6UT==!cmqwb520#PkMCWL4yzPbMo)wo$itcZj!czI!az(g=?&RLP7I!XYE zHyA~B2ldf(bc&!)?J6+=hbJWkT<)&mcdTG$pBV2g`$y)-E>1oOtuc1v$3?a~TReo= z^j(M>f~QL_O=2|%ukUwDOK#xygytZvt1i-KCaI|fp=4Wh6xN*?ko$=lKpq-_NK?dQ zF*@wr#^LXKbV%QmAzdbiOvIq23aPi4po)Qh55erq#wiF^4~0>3^Y+gl;;p_k_S&)Z zAZGHgdo?S|P3a6P5J)h?mZM9t?1E(d7WM|`Ek>NRV`927WXeq1Alo?m!zb?JxRh8K$c?Vm01^cC4( zB3BiycG&r^P9+zM*0?2{1!CWFZPM!-^KEV6eGW&4tjoq%0sPCRcG4@0n$>T^M4L~t zW!5Lmi(h@TK7p1G)PCg4Lla+%@7>f?lwdzi0;4x3gzIr0OnlJavgL;O7%cioBNoSN zy!t2%Bw(2zW74dcu%r+3#=()?+;ZlK?$p&^UIo4=-+l%4qrtUR6@N$VXH*9Q)>pO5 zeyEzRCFur_(>wV)Lm7vbh`mJeD=NG_0<4>a5HeyE8Oe)erlJOzugiTo%;`)KGM-dgXWrVz_?xD@8M2lXmpw%>mS z$AI4P7E7G_2P${OCqje^hW`tZAzNNj+k$&r-#GHSsR_~cjKmSkT0Fv_p}v3rUZ+>& z9&z1Vag)0{86Srrcu%F>a~(>d#MqOR)r#0#V07ySYF{Ks1O7&7Nc<@4>1j!yQA{R{ z9Fm{ZR3V~x;##q@?c)q*262d|wVJOho7_voOk^BB-GLA@w%Y8gczJqg4>3dc9tXqHT^8;LO=+9NVY;3uzEVeIV;d7B8 z>er2;E&p!f7srhjY;%|MP^e^~?EgE7oSr5PW#GuW{m~GW@X81D)UOZa<%qz3JQb&o z3>`jvIoz(z%lA@k&ALN&Gplva9@BF(PziI?-_cRO?jSw{Bi#ERrptcw^syj2y||tT z7k7PDv}Jo2l9M?@h7$$*U*;%(z8Q6Ea$AG#N>E{B#vVn~gG?GsUBtK`?pP#oxe3U)(}~n_44J*$Fq8LxEzekiyL7Erv(p9 zY<`&Wj+SFGM~MwAvO)L}*hdL`v$LM>`cl+7{X4Y%j!gL*#Qqn<)u@b}o+N~)R&T^`h%c;d>b2<57RLm1jtP#11BDU_vl1EYnK?Pt0Ww*GueWgO zv38J~&B3B8Tw_>V17B1ON4mwSg^(y?7g1{(vLAKeHN7mKRrU1 zdGA%dt=m(0VrXVc0j#+&%puv?b6F{&&yR=y$c^c7OQQDaK#TwL2l00^JNPX8 z@BjMu6+gP$Cv@WL{`chwyBt2We{q=&sBAKEv>Zka^gM)3MHIo|IZLfOs(k;nd_Ko* z^5Qk>>E>28gY(B4M|AVa^dee}#hkp@CgP8CwdPPq?K98K)h~up1L=Dnu30~-@YjpK zkws3V%qGoHEU)_k<+M01_t~@N8$Vt*4+;(Y$v0y3hJs&I?lPo5CWTyV=7}b?c{Ra$abFZ+_kspNB z*_nF-Dc(TFU=Kcbq-+{z5GGLNL4&-g9E!Isi78DYBZ}luDSdP@LcWOs(CgSwN8I=# z=@C;Kphwv^j$%5QBp*4oieW=;TwdaujZvdAT|F$)@;YUUi@L1v)tlRt+R%%A^DL=@s4g7{*N z`Ci2P;}jk^+`V@rjA1Z{_d*C!n2!OSH_>;@GAWc=E1S6W9w2NIhHpX~`KUhV5+qQ) z-e)?L7MRDBZTrIr z5%_n07*=VzmLit 
zVHz!gu3f+wH0})aAf#>w7g--Ziwb5X0i!w@aLzK87=Bfr&qU9oBERA=})g9YAK{eSQoR923`Nt$4Z{`>Tj^x z{zs3BL0;RNzuQx*;F<`j#=$iZDGrMD6WGOz&KY&)+Je^>fxw;(@8&IB=nzoIWGXL2 zeZr%TaUO3_7|uGO=GI=K_W*J3u+pakdd{6ce=2Si?vdQQyn_fcL-MK1g;9ZrWykju zQwTBM7G0R5D9i+Z8oIK0SMY!A-nUN-R3wUu1-SZ2!Y-0?G@Tv|BK76he=HXX5eim+@Rxs9&H^INcav_35ONkikKk@c^`7Jg$VNg{{52KH@q-!pk|cRo5yra(cIBFZ1;iu4+__-YtGo!8jq zDJvf+h|zVpcGSgNrTtN%(dRcqSxI1IHcqR}cm`06iI>CzI*Q{JB4&AtH-sJcq3FgY_ z!gph1Ji#C$jv$OyL?+0|$?0>03f(>KbA(00*UAmOD8WvMLzGgMPo_~>U~YbgP-`*6 zlZdG(-h0N=1E=txi>dQ}T3UqW6j>a43+g{avz8dcs&Q^cvh!FI{1|?sN@Xvex*9ZP zryd8^WDd^dOz5Ig4P|3K7SanF8__6=KsRv`@vCqqbR~{UOiHwkcf>bK_V+i8Uh`2X z@(bg7lO!khG`W<-3Y?_y7yeGxgfLQbo%J|A_D2VX1ECxx;n@Mm>U&*zp>BIt>zH+K z47oiv|9ZWTQjX5Bd$(@8kXBHq2%|b%X&t6Fk)Lwjm~1E|{7ww1MvEk740Y(((f&VYS#henqxehw zs|-tWAOT05=VX}f!y~nftVfok(8qnKNS}g7N8DPXF6h_KcVs^Ec0*8zYLbDDM)lq; z&juE)#vId^Dt9`$3}|4hT*3K+skbmhtCd^qD}8|2c-(hXthPIv9NcF=OaI&K%Wik) z!jmL=PHZqS2a1&JVZ@W-mv3`$-BO-wObcZ6{6$zxkTfELywh}~uIZ6VQ1#DP2su|_i zRYHUhlL0PP0~1$C0rX@7BcmoFIRb?GmCG@ta8ra;+7>O&8{Hoz3q;s9Vj2X{{{Zp* z`|U8j@b5K%DD=lv?i3TOyY$~rug^=+MU_p6g#28K2*lPR$9~eL&mta2NW2pgG>FvA z=W@0JIKVv_7O5B)b_zAnSCJ#HsCcBvDRv%Vz_}vP98+c=T%QuG^mjXuf#N_Ok6daD zVs!EN+22VlPwu2GEF(@052XrX9^vJZqybunsfmd-&jE z>@aepJ;W~1#UQd?oCa>UzP0B$N`_$~6G3Ng|p&H1P>4qP@;4u1tINGj#OX`M^p_(f)bhWY?v|?YZACA8^xO=w?MaJlUgwg(4y~oUb1Z7CLp-<2#2R85#P7O`st+j456J=1xbh(t;lm5fM~o zqEx~7P@>}U{MsnQbtNSw6OTV2XAO-OKbC~b=tLhMpQl~*fCYFI%iP?$>T@MhI?>c+ z{_)qb+SJw zQkZK>$(3fi)1ZlqoIAIMvW!!9?xIDDvcJ+zy{6K{kxWXwH{pX;ddx`S3v12wGOIgw z>GIA``&A{1L)|$|8h(~FKX)OfKCC!pHgF`EEioBE6|$5PT2XZxCx!!>)B!I%Y@$nR=>XVGi+HvK8NfmK_HjH6QQ8v&S z0u70bd0E-Ci`!^A*_7=hnDE04FA?-<9Fb`IewlZ^=H{<O}g2TM?xe?#o9NGJvVQz z{P=OEv$HC_1bzFJBf6q8p)(zyKBy*uhlTTaIvVdY_si0E?i51qE6>ef3PfGVairDS znqJRHITTJnR628RQ$v}Wnj&OBpO7FIXJNfukt&#sk?5MZ^&Lj;GmgjvEt)bVy}=@l zq$PCoT4&eU*$u)h0Kz7Gs)+0S`nF*H<7Y%+FOrVj+>ZG8NQU+CW!+W2y?@+eSOt~h zY%FpQTt{;xE9>c<<#7!*dF=Vco;`pm2<>2q)OTs%_W z;BX&{v8QQijKFBe(*}AEu6FnE(E74z15(rc;rZ+Tos)Mt?OEhO=_xhLuQ-e_)AQRs 
z95J{!6}JC*;n-hJ&P&-uWPoox6x$hS#uNAPg9?AD7cmW8;l_1hS*-L3Z2;nXP|qSN zbz-khIi~D?=FBYOdcjZRPsArB)$h$4y2RV9t-8a};B&G|JD40BxvZd9@804z>pgOj zK9>(a{`;YIOP%+aA3N`xw<&e&kn0mPeiZwDvcLI4^V*$Gzo`R%QE6E*f&$l>$o0pz zWR|ySOl`zA?>gTiCp;0p0ZTuVEUtJ|`2aP;n4N#OvHfyHI1QB^M|1knw>_5Zyj$U!Ir_d6uEQCq?m<67bt0i-6 zW=So%kvfK7N_Hb4Y~Z%ssM6CnsOv?jKQX6!jU!$sm)j0eRqX^&>Z=eNxV0-2uSWMh z6ro<-<3%4%jmyn_m%1Lh^$N5_OpT|)FaxXp#miXrG^u*1j}nEEP12wB+kZ`s>F*ul z>*AT~5f=mEa=YcHwaDOLH;9N@a&}W4-tw91KMuvk#;U)cuQH^hz{)L#NPPv~XmvO?b|lAVhfbZc%E|)5!on0T+jCuTijo>+0vY3S zg1I1+AZxlzvyl%vC%J{xBHA%xm29&Y$#(8ML{qa^cR?^~gz6u>g#Zblm?hKpa@W6p z^TwNfKv~Z6=lZi(pEF|y0jx6&U)?kTR2qA8U5J1|d4MRr-+cPy7ZTEs6zF&LWYmE{ z@1`Zj`5pm$3mm8)JeW~Qmy7j#3lkalg#c`}QQX1><=j%76H)Ytj#r@`j7|o0fgb*A zeLa9;TCQ!J_`LC}_w#wwi6Vato=3mqd_=;^Z9QHX(r^<3W|N(xHF)qYENHwPpwlsI zZP*e0F2lIaFQQ1`nT5v- zuLi0DdQy5#JUM6W8mac}l}zT#%9nH0C=~u$(s-AMXl#n~3#nq;3qJ;(+&V}=;m1JQ zo~n@R%sJ`xRO{E&dIv{G8Uo;3kbX|e5@!#w>&uOKkH93{brK>~+u8Y2YUu90d&@TG zQYe$3QXud+Z%}*&_;?X%y-#t6-HYq%v|hzi%u*?ES4*F;3x-S_pU+TFi)pvH(UeWH zII%g#I0;6pgqq0TnAc%e`alLH<$%oaPo^JJCi2gKKo<}a9clt%m)Tm*vz^i?%6V~y zR8V*kH3Ne!c5%KsOBrAH$1M!UijhZ`xfSnv4Ue;MvGt_bSjI92SSA;r_L=rucuqck z`c!z_jpBshXgR{Y`8aN50Gqp~-+OWllR4qrAyd0!)OX_=y((_WDi$l34rJb)=g&i^ zt$6}enRkgf;M8F0WkPmvl5hrq=jem0)Sho4I+?MKln8w_?7Su@hFHWH#g~J!8*u~4 zHqpfSupHT4VqPc30F~Zhg?ebbE>IL8BLH!ThQ?6}fgqbZ zu*g$6sR%(8AQrGQCCy=F6XLOwK$;sgZ571fb#Z+o#6Y^h+YubI4D{)(V zZ8eu!6~>2m>sDh!(mMN%;#);kl>kX@Sz#5!6LUi>Jbu{(ut`yO)BjIp4W}M4J7&mH z1Z#Wa{5BFZ#4zfAVWq3}Z`g{c9Ggx4l4fE6k5DZp=J2t|Or_ zg1!fkqa|L60Ylg-MNT)~EHQ~?ARU6)=lj>sQjUW5-=n&tHpP$0t``_TA1FrrkToqr zAYzxKChgq@iu0PtJ@GAw;}LP&qQMr?W8c585J6pdZg?a90RhGsp*Z)n3jcPl)yCQ( zfD1u7h$KL6R(FMd#UC5^eoZvDqKDv@=kb5#$QP;Lyr%=?sN>3tbSyk8jKz;0J({Tb z9uNWhsTjLXqR3U_r=TPvJzPv2qW65uNr>Ab823uOXwR#vBvrO=BlMhMokz(ArO^9w ztD%dSCa`Z`F`G*aN!Zf(`2jvL2&!PKdEHBP(voZjya>|u982NWvv==fko}ydSmrN~ z7%9w=^kJdm4FHd_!&l5M^+@HtiX=R?RuCz{Toy6&#xq`=e*G@Px(cY+n6yMosO zSpd>k0tf(2`#0x`Z%@ud+Frmhx>2o~6YVzB@ryUYMNP_p$VbK;wIRED!F`Xk&Wj{e 
z&H(3L%H6j(z~O&(cZL}WT?E37hS)=^hB^MIEtL>|L03q9T3|31Sc`B4&DLv|f6k)p zaKBgAiZ{q?#ftM2ykEg=+`o5k5aUK^rcX7D4_>*R8M%JSCM)~SY#%-Q!g7#AL3I$o zLL+0u>)vCwcJiIK4{NKc7F10eWzwCa;vJA9r)WrM;;PW35%P6>)LQ6S)-x?jaOgUM z7hK9>;XQXDS1FE|-jzG1ZiV6%4lDMB{Gr`7oaD4QoYm}VF~jT1HZGZWjgOQofRjjz zj3k3d#5nEU#$&3cs^N`47Hfsp&VlCy4L=ni+%MRjn^<6!48TJOA;OuE@#OtEL}@JW zoBu@slP%m2-xx6`P#FDY85!jmEhr#M;q{ji%?AjG~UoNH#U6mhE8wl8_J*A&HOt9e7$5a zk4+y!f#9Y*tnefyPv?NUckT?zA4@;>hw2>3fiQ*fl`i)Jj~yeW!%XPrclvQ?)4--t^kng2J>Sz?9(_#XrMkJ6KgN{(*Jl+gET{ekNyuSsz2yr;QtTS^#8x#+)2#8xc}HsPLy4& z(mx#X9A6R;g}S=BQ2Wreg2be#VE|1oRSUVkRUbbdBGHt_oC8nfI0Eym{2Rx#KXRKz zIGL&{K)T|SYis%;aiA3tNht7YdqKEa8iV*MZUid9JFtNFIN$-EBS|c4YN%=8{3hjK z4hxBI9&p>i+j=*3A=0NwM93qHO5ETs(LUEfg3tx|JnQ@|1yHFfnm96vKi9J%DaZWMqk$8o+l-My~Zc z9J>=hlRzi!xH#XnGNFsOhv_1sk6ZX@ikqUCT}uEaB~Stgx0qx^Xm$<_qsa5%hhGLI zBr?P3#oWOZlh}nop7%i)?{^-33#rmXf-h<=-~9ZB2PNhaiF*_P4fWcJ57xu3?*XWI z3o|PI7lA?DYDn+G!WP9&tOXN1c8>o zHOQv6^{noNKl~xfL}c=j%Ox;lmOxP5UgR0g*E+h6g}ftvo%#?&mGEVwh@uje(f>uM zqJO2Pd~l_q=5HA~04n|lw;x+3qm41?PL)Zq17hg?Pq9rJA<17NP!@V#Ke*SZ5hF5l zbKApPgCf!)BJ<%4L)aB?+s4~k>OYAsX|2}Q&lOspncD*MpfCbxV<=Pk`$lpKXm~;! 
z9|koiMOU=S7pn%+n_1+&m7F6X;U!^WXkc#fX2Iy|e`g#s1O@~v!UkIfP%<)~;v~tV z&VqRU1BeMC=QGw2&Il+e=&l=sb=e90=v;0zUL^37M%;lw;44=&`Q9^*t+>pT2xN)^ zCEU5y)h01-==E;wL`tkN3Vu+H0(nBi2^31v4LG&>=fT)ZmrzNZf@WP|B}U|cjv>Aw zHX0&CfK;0;Nr!!t?=pn`E~gb&+(-qs+@B}u*OW;rBS)|eNDF)%h(a(>%?IM-bNGqj{IbC zd@m)p9kf8q+7&zg^qDhFjH!ic(45lpy$1w%<<8dMisCyG%aLKH@d)Nod?9uE9ptJ{ zDOmsOb-Cm>BdjIWTKrrfrlJC4rS0|e^NV!bW7Lf{OXvx>p`2rD1q+D39Tb0mT-+Mj z>Zi}1J=68wTw5{%@H=R%!@;IPl5tj*UOM?S;p(+(mp!JZBq#TZG4rl_NLJux;)_KJ zGfcbguSy?s`%lshl`nB5c2bC&smU4W@Q?r;Q&tFFhb}3Y?D8v4MM}&RF%bjRcogk2 zgc^h@nD4h8r@Zf2C z83ZkZuyD1w1lkEoqfoa&253s?VaRx?C+azoNMnirF_GfrPjC6z5B(>n_yW{Ff*s>` z5^Iy5eHlTcGtcSJQ&k^YM+!L$0LWkLA~r$R*d0GW=JpXryN>rnlTHZ`mU6*wWRkTt~Y?@&nijq<2Ov?&dSdp#KeD*%z_*~cuTyKzE=SGsu3hx=zKOf% zb`|3x)ztF)pO+W}s84c%2xj6XwD-weVli)?{di-7J*KkN)GRNL3k*DQLJ0Nf8b*ye zr7s~w@8B@Y_(0LlLC(-{7#kSuN*xVmcMS1W4=*(9fwZXP%ZT}G%=m-3l_2jaD=SN9 z#~&RLpHJXGdy&|AXy9Z8t~61;zzjYVksRN@-yzZI<0 zDl&TGF0WHeUZg$Z8H(3|^h?%@zY)YSO^hhZfsE*U`!a@%MLvZl>Gmhb>UQOd5apx; z3V?`0@-7oYIo*%L5OY@%#U#*OiYazn7q7Im$-I#HeqnZ>KY!N!`D8i3Ko7XsAxCtx zh*jo3iJ)SDgll-E&n*FDvJN~6S;Z`I$cxm?{TvsZdO&Ljw(s_?o6XfO_6plmhx*Mh zqUdXGKR1#7j?xe>1BFD7o;_D=Y-lHX7_k^2(*kX`9N_JVa2lAd|IJr`2A>k_B@Rk%?d}4?mB!nnPsDw&+8@2pNw!OZrdfPC63@=~}oRG-UiX4Md}k z0f`W~;*7ms9M{`RfJ)4s0cFnzVDY21qhk>r7|pSn?=U%uqu^JcYlev?pUWIXv^~>V zpDkIrxEet-uLH?`2L1U;^*OU<4b|3`k0|OO*w=>(3TCN)VnJyA%xf<#UH&V5aol*d zh16$Cmm(`~P&Z9rk-7W3k#uVzINqGgXiblHDzS6CyUF*oBJocybgcCxH>VhUQE9Yd|}Dzcf8yR zh@rtnEV_KuiomPGhYics)g!?O=5R?>Bjli0{VX}#Bb~DB;-rNxM8wfDtiAy$3j0hj zz^sJ~79m}LCcl&j(t!wk`EmeBxuCl+@vk|~YRJTU%`3^a(>NL$T8L_@K(YI-jK?#M zZFqI|QEQi%$Viq9h+6s8&ZJd!_@1G|d^@;r< z0p|EQy>*8e7)2q@Dn#F*K%WPW&e&km5(VIkeyxBP`wKVB*KgkpB8o(b>{Kh&pmItY$HHEnD{^w;ln3BI zJTJp*p^yh#6LPtCDxc-EDD@G)Zv6Np6G_UtjT?>e(QwU>wY4Mj{(!mH{zVrT7AWLJlqxG%eREaaT|I9OT`c+!O(tl7 zbqb`OxGBy+&H*v$UdUpp6h_+K3gVCnd8xKgRcQ8vB!$0*m|Au=4jnJ2ad2z={Ag#X zHS?p*`4cI0@fR=auN?~H*^81|fXzagCgg*(fA60q*E#q-9gw676`Lsi!D?(a8gHlX6)+~pX(DCAj=xQgu%!|C#s&+c 
z&3Q{ARXhP?AE?gtr=eVj%hCcz<^cnhHe^aGCQze9%?zH3)R_wl(F*qrfX%xnZOAp< zk2jQRqe{>`C|t!&FR0}HQ;g#*u%94&)p9GHBoBW9O7;s3$_lJo%!c^|rIip6!;W9CrQvqHz$UpDcgU7&kZoJ_`n3Ty2SOiN>gq|HnD3DB zyg~6|9N+*g@}VTUhfq0ppIi#4kDLsUSttpa;Jt!JD#UOUf2ZjC1?NJS$4$Z6joqMN zYa_}M36-q^&Ib}7FFjp&8%Q#Q9^tY;COf9Q)HxYEH8FRG^{z^1wp6&l>uu9c0YILi zq7o223uF1iQGGenY}@eVFjaDlza+tq>jd%mUUZpjsb)o^BE;MD-E^{2($d9A4rb=6 zCsQjB(+)UkY6KC(d5$Ksh+5S5a$YAX%W0gnuuUsUFD<0#5?Spc?FtwgafX21$gT82 zazGNLH`$33-piunsh6Wse*IdiNhZs0468 zO_8|q(_&Jj#0)=vE~qD@B!=`(^yzfS+>Szf$oHkorsLomU`JUuTP*5%a+~A@TX*EC ztdD7XWhS_#pBkC3JZxAJ^XG(ewc#6p0*9S3WhN7FzJGjLeELx1MnO5#s6d`S=y%nv zaqF^GE3Afr_3}RGX*Xp~B&Q5v@BxO>bmPg;%=hfc_}f;3sl-_Ys;=vn-4eIoBpQ^4 z0NjCd<`k-(vLA^!Y04Bup<`+5(ZN-QW(KM6m2Vpc(>hzBpcTbB+l1y?07Z$*+EKOB zxybeH>-pR`=A=U~KP@dj?Z}2Gc z3Cm!}5m9Y|HSke_(c~(~c=@vBTC+O*7Z!lDX zfNeg*Txx0qq-TV4-dK?_EOSq)eO(b%Bdo=7RFl#uuWUULgNP~-b?4=Ee~3DN!;mTF zq=*3l5DqF?6C|lVnJ44Tk}4Cvmo8LI0w$zzAHB$RoI;Z*?-{ZN;^5Ui#~m;U`ip5w zcVQJD^!wuGL0JTv0g(+;zdh%(pk&!BotGzfVI#@ljTA~9`H|XvWVZeOltLwz-9Pmw zc$H}ESUbGr2i9(BIk;U%jyjmb`3>G8*R$vDz`*ieM_zJNjB|a#Xd`V2s{r8}5)KS) z?M818-kg{^LG^CpR|R^(6?)r%-;8k*f=aQs505muv;ee%H4burrsf+GUqkSQJ&Dft zlm!qD_Gmt7ua_n~q}3Cm+ox$;Q<@f0l2a@To&?jgpxueObzQY?BXP6{m_0)xF=5wYU!ic*4~n z5OajR4V6A!lU5ZE6O)2JY#k~W)I^h3@O#OV#~8qa20*$ka9AvM?VDR;3LJ;6+u-U4_)zb7Vdt$v!6 zb8G5QxkW;*DpGV=D~_{|*Dn=7GzJy)cG2s$aRv%mC2gi?6h&etYN-sk=g}9-abBJ3 zmz4DklCOQ&f!i`?tYh>Xxcksd zEJ?UUQ^Hwsmdm$P2l#3Z%@}*E4c8Pk2vCuv_~};pOSRREQc30-%)WcK#@mBDS0ROG zFr9|tapeOzMW}1{a8Yt#Z20o(4C-{>%a1!r1!OQUhmpWz5bFv3fg1Kfnt1rmzYV$H zXmp9-FWCAV75;}u;XaQRO8w|W6N$Wh&XIwW)fZhZuWRB7aHI$XNh=SHBm|2PQv4SL zN_eA~UL4TdY;DN;X%>sDlbwxdVgMrUc^=%%=4Uf&3OI~=^zyuoC`0=T@u4MWsjv|O zoirN7J1}?RhKCv&_LPqUM{pS$N2EF&mkw$B@

}1Lby)h7n?K8beH6O2q08z$gX5J<3aRVm^&K#vVK_d4@dr18Xj^P{TJx{A$XV}AT6uFsn>WCw2kGtMjRl&Fxhl{>-r zePxO{Er&#YMZ5oOeS=@7PKvD_`_ZWaulZaW{!)%`xWZ))ZfLbE~PWlw~#%vOsLibf&n;Qr{!z{R|Y_ z2x=C5@dYFX$Pyx~BK{XwSBX45#75PF+aj7-jA3i_J+G<|T~F9pJj&vbqd?@d@Wtr* zwB+NYDF}+c;zf$E7cViLDn#$Pw|HG@BLqxhC!}XeEh!{rt#lmR$Xzi z)B}&;!GFWo{D}5&uG(C8;jUeBV?9ZI%4k#2!XSrGIbFV zZP3JV-5*pLBvp|Zd~~4vmY+38YYOuvN!kN8wW7R*cnlk41S$U|gWU>az@f4LmeWu}XM^~1jYPDyw4g^$LXlvF)D^dbML zX2nwVmKW#!?D+{WPkZI9wPQ*ydH3g~ zTpqtXt3B2}Rk5}`DNI|Cn%8?Ku)of4Pwmj?*%w?nX=zgnLND_e<3I5)A1%*h*3TdL z@k8HU?V1?z_y)6z4#2MG?U1U1GjI?vBVbkLh2fymJ;2i(9e+Q4F0p^Vr)@&m-MU#g zn*O10#$`|GcY}nqSTgUuv5T(X(NwxCl}HfP&TL+Lo5*ADZ|OlQj|)xMC?} zz`DAsKGIUv7YQqWZ-E8?8zOxr`rUsLAHu;_pf@G9M0A&T>KRe-@QRvy62stlru+v))2Ep=ZTQX5pIt>&qAAR1!Q6By`oQs=W6O zw<(B=+j{j|so_z69i5b`udcgSF8O?|vy!8?->mg+B{w{m!^YOu*5hq}zdw+ZjB~Fi z?W8UfzGeS;1!uGJyLVN#rRoaNH-tkjfWzKt_Qx3lxD$;g?ic!pbX*~*ueinT;@Uut zi!|jR;1PigWp#EV=v6t7I^Dp(LF@p<{<$GD)~Mh5U&(*b|Ho-A3(A~U}QJ1j(j){(3_3~oPgG)j9+Q9vLGl}k0GIovE$JWWpkN^>SbN z8E`A0@vOncfZRmH<*c!u^tNr`Ypj`>nGi&kG@N6z0hE$VREBtcmb)PKWKI&h-h@~sDBRu65>YCg@qVR07WH! 
z2=Kk&AO0GsEdV-2hD1*W!zYNEkr)=dGGexMZ-5@8a*$u(MIrDZRMG^iMwk^=w7a4 zoBj4|qn>;G{@&&sgNVo!joq>qY`1;Vw6y$AGExhcElmVPdaZj>;_kovdF-RVoLTtm zM9{&I#r+0>T!jM^y$#y$;<`r(`{=^$&W!^!G1mDOt}>0^->!%;y;?DEA%SpbFq%*XYcb$PZgY(^0uyHwbNW>f(s0a zYc*){2;tDx)rEmjG2}VI?*iZn!qZLI7n0kKDT$ha;UYv8Bm~RcsQ?SEFdV{|0iHpQ zFS0Bu6j~dzQ?ydy-SU8)pqY}AzeBe_WL=VP`&(0mAv0i|br|{+-?ShU3sw5V)ElKx z4m8(;nY|1?avv`BGwl=ttc{40*tSAq1OL7T;NV(T!(t>J^wr|tyV6Nd)z1kr)7(0J z9Y8IPyFmy$z%IH1fC9;Z;UCoN;NakFt)kgM4zBxgYv+z#`x9LDTgi&2eY|r_#+46_ zCa%erokk|5cOM%{aOqy3eWJO5+?CRWM5nFR);66q_ja*#IL`joJD-|7`d33YF(@cn z)c@{ucJu)@!EiWM*26IrideFgE$(>dUnhZV1HE304vRn)AZt3Z^d>TwSr`L{3c8Ae zqd4)&h+z=$O{%^RGMJ&Zv>WmVaj?S<=2P$w@O1a%>4u+!ddrru7_~JO>`6gsBQ9-V z(f~T3q`vt*0H6@)tZ?QpLUG8tJ^(%jYZD-F_c!llW25~lg z17v>+8h=b9WWa@n3xEo^hicqyL|-IB0|R7Xb*{d)Vh=_Ha4Te;#C}D^pd!dg;qJJuyU$9>4~xJt87>&_=)mghdVITxH_88bpKuh#z{JQ)qcii*vMz?Tv`- zpauer3{%Drh-AQWV%wWT$0Plr<^fQP1Wba!0SLMR{xv|#2cXBpXG3jo#@8f$I`QTp znS{mv6|$zSk3EA16A&B6Ox__5dT@a}2rY#LiHKHD^Igvj$cMfLH9KDBdNjUwctk`M z5PpoiBV`NUsxgqD@4){^l7hg&Uw~4*DN*L{1EITj0_6YBx5^mG9zf?s&KDl>#Lph`iC2=QEZ<4@t zBI1M;1n>_R)LE$8Wc?LMAwm0y^l_`p<%jh`6SN-wl515B)mESWB>%?3(K1zFf5E^7I@l- z+y<^XfXm|ldTc95#^D{FuVh}}Uwg~jUfRi67x-^(!I z1oeR6;8Xv*&FY|7a3ZJUKWMym-@ubI3n6qn6jD@H#^~LpJY$pbNwk z`61t018^pG>FF-{4~jRcl2_n-e!sOMHa?zqV5D3Ojh<`7D2zK0Z5XJ zkFazBb7BRF0`7!}Lm?;_V0%Q`8HS@U{S{s5Srz<6>gIe z29#cpmCPtRF#7;V5&t+YT+-ikdbFo=G!@pv2B%5r&5tjgrcnSF*__?;>QqPQX zSfMU}zzswp6o$dIS#_>xef|qF>^X62=apj#T!*HB5-D+$^cDI465!e_M2SLG0qg|U z5*)|y_ZFZ$-Y23^!pa7+_90_(8IU+1cy;hmiMH8**mgkBc`FCZl{zqC)65)gV-nVj z^%Xa9vSDov$y5yb8>L{n0goTlGzgDInt*;bKG8%I%hbp7mB}r0*|Kd|3E_=jprD>d>_UqfjtV(B$+_U=YEr!{x@ehM}aX z>1U4cCV@-rlFu;p5^xNC0@+noJ%c`r?6JZ0f#CV~KpG)sB8&#i^M9*OxP1HejR>CL zZrMWKeaJwFdvA52V$I^h1w~}XK{17~ox<4FMR{&dap#fYO+FG|{*Jx#k_dvm5sk*i0;v(8!+I(|nfbFo|6O8!SlFM%poc zj9$0qWi4P4A+lbCsro)q&&z;Vh}cq5f00U34>vn_CLmt>J!gg(b6QB813q#1u8+8y zID_+{&cn2#dgctt>c@H|b0lk`dV#|d@3??m3oI=Jm!q8Wx4Ef_2TGcEU9RtIYyY<( zY;dp|wTBLZ@!Tzlxd8*i5&A?W>>PMgVljmsi5MavoWL@h1c>b^w!(0T7LU%7F!N{> 
zP&Q_pyvMH`L#>7IO4vEl3~Vbf2f)QDILU}g3svI#f4ZVDTy%nmKpKSzLlWMNsz}0r zP$ft-j7?v;44!plYOIGvfV}p&20-!Cfbasym$$E+V}|q#4DCucLs?rw^V$H}(0Jzp zG)Mq~wXqx^Eg%M@637;@Io`&k%%+A0(msGF-cFZOMYpfO$Yj(9i$_caqWR4rG8tyoc}7fk>#+7OO@F&>uX0@??G0r?sbrL8wl6T99)8 zpe|z=5W*4=ax^*gz!cdMI=Q)FM}Z%DJ9nHSLU^!JkOU9oxhlq@JR1LjxYj8znjM1( zl7<8mFbxaRiG%)JqX16*7UNiGwBgyauvy`^SYuE|&^o;MOMPpr=y%1y^jyRlLzp{^ zQbg>8FM&}b1=>t-2O;+|+_Ir$3LdC>rs zAf&X=TAsi4%_tz2c0*Lkii+F7@j_?6Hm7O^5*C0fc`L}qL!~t0Cou=@;qOz|nw>!W z59s$=Lmy{$zPKuiTuLTdyDsM25146ER#ntm#zGt^UYoMvyAVR_7w9|v~;QE6%cHWQnWxOG%+2G+0&PGUlT{or+AMpEW&k}G> z6B7;kDMXvf03;tmzjtR2DuumROi5Y<%n`UvaxjD8xjH=h8z9zxuSF*cqOXJs7hUIF zW(>6f_(&0niL1HumR9wT;^MTxr;{B%XbK_8!g`i1C>Uv@=g~Bw$0SXOSm%0TL{;6;dZHQP!Imh|}gh61M35EidEGP2tfUDFtBN#ObXTh*p zt8|$N>Zf>|++mi?T!}>EO*Jcm z;`AZ$wYcqjj`pC%CbbQi9%2SSPb#%GK@pEYFz8q*Hts+CRddfOb#P|)*5cP@lO+TOp`GmC_=(!$aKcwPX{oT@V62gKI zByhxS6LYw|;C=4~1*PKh5%>-XA-8bE!|in!mm5(WB;|k16I|aT__PCM3=~b_ux7&u z4aeZY(^g7YF$gCM=zMrX2%-(VZ=Ih_5-5E@qr#IqzIFsV1roTiED+};8nrmArv^9+ z@6H+K(rcSRdg4!rnP=m(;N9I!M`wiw1)6H?1|b6&VB6ehk7Tt-mMq>WB|MjSx;iTQ zQd9z!vB4hhS0sF(Gs823JS38z1KfxN{s79l%8+LJh=Sq+&#{zCq3g1~xHN<*13VH{gZfN<6dpC;0oY`AOJa-P$tPg+ zQh1UdccoM`qa8#I!j>pf1JJu+@k2VK9dJ2Nf>?q@Efy9O+#OWUaYTpypBw&v3zVRw z!2j{HuObOY0YV`I!1BXqO%4PcYNSU*BZh&XO}H4h&%f|+R~L1TU9{*ePG!|gGixtcs+@!jVN)@xMO^|iHH{*Ef8}O!Wjc8efuKbtyB_ZF# z7(avqVxw#}9uq>z_YV(Wh`pNyeIxV{DL8=7n@pY3MC?Nr(8Ra;$a~lXH3Bv>CqRuL zj#XUn6V!p8#8QG}jO!TW#nArkLbeI4o1_l~c5fD!z^grj7d{0ixEB^8g|P4ULwkTC z)e1 z*+UViP=z=rlFWoOBEZ`a)ZJy?LKXBEH3AVBx0X?I}lhKE0k#HPBR3bRDfAuhd z3CM9qk|*U2;WSv&nb1y%k0-fB&Pg*n0`OUoT5|#I9^e-$f4RyR7S&|MY~`+o-Gi{< z5~VBp*z*0}&sw2Z$F(NgqtT;~{ieL8UZ;jS5r)x-VV%srxJ@-!?gvy-tH4(8WJVkE zTe$JMTf({c3Mlit6YsmR0U9U?t;iaB=8)xl4DC@EGKh5w*QOPW5aiBpzHjfrB!3My z;H8;i(tQA*9K_gViTSg}J8MBx}EHWvce7sp(_u(H_Min`gy_?6%tdE@gvGD$Wr%J%V z{_@sH{`!`mO=bD+)-A;g@vWZU;+LIEyjVBhx(OcPA;c<=3~oEd6w=xv`f4j!z zZiKRQa8h+Zl8OzVq?tr@@FG$|P$-Wv@lZhZ_q0bXKPN{NHHY;WeL)kiP$h(1Kq7Y} 
zs4Jt3B8ea*-Z|;S+8y~DV(%;H7qWlpd^8Jrr%+d4%+5gP2oe8_>cB=!{c-v7z)lVC&q4*wz;Q4Gr2r3nZPH71sBtc2`c7%8kw2y{7$eJm_&Ao$u~J|V zBM2+OCL$sj(V}eMfBv?iN@l~lNo4+_#G_E?2Pb+Ah91e<*$vVLG{cK>vFe=~hXOjy z#fhg19*VQ&UXqE4iOkH*Q=crupr4E8a6UT`e4BE#ok~||=#zOtj|dwlE9=F3c4C5p zSiP-&TWj;A3JzL-{cM)v&+BD35T^~v8OX~!fPkyIrl!Zq$u~Cb^0;AO>ZwSbzWMS) z{69X}wo2Ci{;IHTrloXe0$hc17mk4&pfl0*Vzxz}uHW1G@#Ada=GlrJ89!cdges}2 zslg5WXi47X>eVOcCu5ig%M!#nQt}v=7nGmd&2}1yn^XK5B?SsnmZ+GRe1NUE$jEjy zDlIwb#D702sTE@!q*7`LY=UgmlmJpVOrKcHbio3@Wcg8&|^G$;FiFlD^t<}2dG zPm@|@eA4L%vLu?IoN7QBTf$CB1Y8XvuZ)>#>Cj2jT?)##a>cM*{m@dKw%KIO z8tcZZ2%&`kygax1Oxnl#rEHUu_4V~LonF(5Cs|(*s~MX2p`6#>s&H2TQc=rtRPKBa z_1~p^Qrugq$9H3+7%WG)!4MvX!UmX%dR%+xP}m ztXURnYW-yk$*FP-<_Rav7%t`r+D>WG$NafcG_q@Ie&apsw=XQr9W~I?;B`3$?SI3MSO9HVcx@*hQ4F! zgZ!3l%*;Hg*C`L(<54B@fuJB7u(bmE<5cS_MJ>cn+Kc>~V++|ufA#8AqgekqjaUU; z07K>SocP5M&hHN+&ifmA+tDV)xIrwhgE0K0r8P+3V8G5U+xP+RgB|Ob1GRl0!{y6t zGAp}ehIS8ml>_@~!@&b?h8%S`8X&{&c8{d;GSgoHK!r?XR@gvR}4U$X2z@PQ94w}ZsO{g1a?P3hMgU#Aj# zMC0nys%nd0M(hU^7);%+-ce65AJ>%>@u=SDbVc{WljfW=Q6>W~j3AK1u3#6iJ+pR@ zN~N{cfsHb7=z}vE{`F7G<(Ip8q&ZDJsXj~$Us`8pho=u`^LIetIfVl}@p_dBN^nqP z#DQTrqx=zfhQ*(20-}1ZFNJO(4b4dTuvf6p%G=g0M}3U%ZP_Jp2R1y!&w-v|`M%%J z=sRjJz!xUyQPG$;51!5KCJF5D>i~dALBo@wn`LD89tW8-RB2bdoEk z5ks5S)qOv@<8X$+zIluOw>>*f7|3A-B?flg}KM5K#$U`F??^HOk2_PT*XN1i3=vlph;pc zZninN_KA1x;;826r)^$d{T+gVsj27e91jKPrE5bvMVqf3jXfp+65HFgPqLcL%+Hsn zKXQRH%&~NOBYIvEI*r$tggs#(4baZCf}Z3bq$MZAPRo~O#ZJ*l4}J;MTa5qBmQPU9xss@h+N-Bpq!Aaqg^^qB6Gkakw^*R zv&K7Yn#R)o%WyhXcXm`ZE1ce>-0i;ME@-Hub#?|Or`VZ z_a*Be$@+NsqP4XIfGLzv+$WN|F_8H*aRBcN!GIyUV@qqhi@1NVmLmn-rk3b;=DaW2 zh2jZ&iS`?k972~N3}sHSfQ)^!v#lp0A3Z`Q3N3CZ1`_3?&*UFV=`f{u!8%)pt>hX5~_M3Jb`yu-0n>nYc396;%%rSWJmKcdhe)8HsFF(J|> z;0IZGMzTN1iwWAMyj(0Z`6h`pKEC$P6}~IHkJccj&`=}+s-vN$C2Qm{6V?KP!Dtx` z9X05`sO8Ir$s6k$Z?zuBiZDZgF{6F@vNw&*c~9)V*m8TKxcSn-ynG`a|95WCz6>hX8z&sKl1p6ZKnoPut-6ec2~9BgvSJ3q6g82e0ru zJi)}_1=Iw#RpJmMrVE5Mptl^tFhE4&1rd$dcuB&FAqi*QICF+u)#cK&D!CW0qH3tJ z_NBK8=lgu4J8#sp3pLD1#oMvzbj^eNwe{xo){&ynR(<(8J_)X>oITAibe 
z<{_>ovMLI9_GW(mEN`C#=4>=X$KK5wD?_y|WCNm{t=-{BbWG?ah`|UX z6&Kz&awV{p_y7nQ=mp~tHV90SbeL}KN_f99$HZaA0O(1mL?k>vK+`jgcLkBV=PH^I2U>=t8gJ(_2YCe*8(dMbGWo$M8H_)cinYOM<2mjl@Oc-r^k5ea>e?mG%;G zHMNoEO< zU$i7(<37Unf85=vC?P#w=C@8@wP23kLmN%u%EiCb8MW_)a%xvJDKt1IC5tP{g)3D7xc;` ztcMh|LWD>E}m&Z8A8eG5gm2XOlO z*4C$}I7lO0pT`#%8_Q-)Q0rvu8v)A)5TObB&7;*z(sez{ zt2zKQnQVR`0K?P7Ya~}{R&5-IByJ2HK^O209>Syhs+j%!bs56fKyr?78Z<)0`V0Ri zF$6;22R+YQKw|NBPw{2Z`l7Cuk2~WjL@j3ztv$s#8bl-cj^Td(t#DVHxoc(O9Fyie zs;T(SDF4TDGp}PU^R8>7TsX+dZLsNjwjBYJ?_fH`3k*@IB}1#5vvVa(${;hdcj@GI z>lhn{!yInRQb)xX1!3d34YWT$StWt_P5)3B~3 z+Hs)7z0sn0uX5FfEhn?}KV-9cEoc5Ls;#Zc&TTld9L4> zUExSdLE|&lnau_)7pD;RsyF=wkj;+46RUmo!p>=F$)bb7xlM0-NC0X{;uu0`Tr6T| zZF@oSs-gpjq`6Notc~PwiJ6ALn<-X0CFkuvR+u{OX%YkVQ8-RK9Q1%BP?xePtg?jm@CNO z091)j3^&Lea5`wO_mEiW3O)F|r`zVB&B2Fu_ogA)C5f?MIiB~j3+~&;2L=lQuTu5D zX1jHD>+U7j>v~h^MGWnj{+ypzJ^q7F>QOLT_21u(DWVaS*XQ4EIZ-!Py?#TA)?Sb0 zOG{5y9IySkzq6xCuK2XR-fg?iQj@;&8QaWkVG*-^FBsUg$uc#^lw<~CB3>hc;rI1u74PUOj>?2e-QyX zMBJK)YigIC$ABXTt=r_X&BNYM^tdDnu`~3`KRXJH%8)W32s9OtGcg21-a*b55+JJn zR6a-FAYTuB0Ulff=_i=*MMOd%+YO(0qK%5G6HZlB%v&+H=2s3dp;z^Lb@1$K_0@`^ zv-*=JFDSBpFub{PGhlZ#e^d57YX{L_{+-7~dUtQ$JQHMFay9QF_L{kfc1|RyRB&S6 zAW_iZD89D4WnN^_)zyXOt_jHQb0H+PpcSz}M;LSSh8128F zhQ@r+Niu%8L(p^M^y|K%2hCM-pSRCV-@MRhrTix~8u5{JM}BzDqEoT5cK&*-*yq!n zuJ;FOFB^LtT7L z>oW9(M*wBjK(c|XJX6o%2f}ycg_wivYL1_i810V zp{2dtqYrfl3Brg@(7OR`4#5^WJG*QSR;d=zlL%2vfbrNs+|ugc=-3KC=QJiA6b^Dt z(LBy%J&=2TxE2~m%rECTzk!-`DVkj9YgE6ea##Ah&rkN#4|z16n6=ESs7ELCYMgk$ z->G~rivOe7LdMG%MIB4OJ}R@CFAr|{tnWD?>N&8|D0U%WNk5+x`Lt>kJ}b|Q#a%_S z3}^IG3QS`@@~>`zcSBcS|Jw(~T%|N(u*M+K{ZnLqwC#TfKvk6-W)&EhBm$V&LzQS~ zXf6v@N*!jU(V3%bs=F_$IOI2pI>aRW%+H8yUPH@BYO!3(o}))eeCz)OYCipFaUc$! 
zoFCUbaTw)HUnNg2GMl?p77e{E#Ou^eE@c`&82@Ql^!qq9y?-qG(#F0l7U|WZpYaSk zg??ZEBXHZlzHjYwzSslf-#vO)y^dUta{P&8eELt}olef-kbxHeySi!9=x3eP;si|r z<{y7OI(aOhB1FmrpN!1T>(~FuDdL8WJ_Kd z-HK1h3tKZ@4iooZljjmlSUYw?IR!cdvDaG7`i5_=2r4QnV(@`A-K1F`0P!=X{T$d% ztnb4XIqCO^Jaw?|lMO7`Ed~A(jmb2yARR5Oo8Y~Sata^jXydq@J!fdkWE|e0=IQda zUHcxz--YGg#pxNfvE9DdJ+nG{O`kRXgqTLr!cRfk!{1JSXR@}ILZHG9Y|AfU_-skgH2nd zWmzro201vsx#i>Y;Y0_e<@<&#!8-q5JNarQAT&T7r zpSdqOd8X%`Vq+Jlm@G3;*Dw$my5CnDI-L7=ZIyqM-ajdW0xUG}{QKmL%T$fbLY;7$ zpPePNVz94+ul~`EYToqyIVPX)Xbyik0C26Ws#@>syS99JSr1bp(RE_@Jm5ZQ{LOP@ zCuyfysVRx~ii9l!=7Im!R@m52Y7>YIkPPHmuF$`b4eper-vI#v7rSX_^c6;(b9s8t zV}NTziO~G^oT?C0#}1<$5i6ZOte*&JXi&?qi{Md3P72Z7W495coRHt(%CD=lNC3Y} z-}9tsXMO$6inVBm1fP}u;*l(=>^)L1#mab;@2yM+aIzG-37<|7Qch@&7gWCV)&7Bc zfstukj=x(ZNBYC<*4Q)dUW{w zLqS)8VIfnh&&o9bWN@;aP*GPexAmwhIC_P;y^(YH?XO^cJ7?#Yb+cPLILOcda)zk- z5J#c^?5r%TEC}U-sDQ-a=7BsSF{W6G38DdPZMV&inb}z+!r9!ezj zDHI(fcjxli^A*X|HAEQH=|7L!*)2pvdnoEbr$g3BqDDs-2gGR)TvruqtB>$$#Gz&a z{7Y=<_#ZI|+<)+(2A~@XjoiDJcLW5(C$e+Ta%sdbjP2E^GOHiE;&eZu=PtFR*AZSt zhOY_IhWCcD0-hxwd)mWf2n*50CU!{ln}A7rBG?!z-i5*Q zh;DEv$_GgwgaQ8m$YTn^9^*^miz74>9nkJ6cv=ozYUcrM>^YJ1+D7Yv*HtE~3lCoi zHmomQXLy>6k5B2{CtBN3(6O5O`eZSxfS_O{NPvZJ*TXQZ0ddbQo97_rIWnW8w9Z}{3tomhwpiBIEp1Ek5u)8C^Y2A` zZg|ob4H@1MyN;{NFH8P*&T3J`fJ7t91@S0h|3?(wA$0o0J&lx5q9035cjaiz?(&-hv0fnTn&hDg8R7JF8w{1A^eJ z3iS|Y*o3(_01zD14rH;BeKs)_WBdm?Wt{!1p|y1cYXQGkZ#iMF31oLX_H?UMVPMC&oknd*Nh1emTwLDU!JICi zimydO_ePKtea^A01}!lOvBUM~|QM;Kq>+)@`ENmWqozzyGX+F}!cLGyirG~*2* ziXa1n?unKyZx|gm1Wg++Z2z*xwWqOD>ew+Fd3jAch<$c>^sgfsAhNRSz_WEm$wKx- z!YT!YgsNIwb88zXRU#BnD-zB$s6=pP8_}Y}4tm4J zSsueI#e8Q?6XTP{C$7(?i2A*$MB)*I4g^vHnFB@o{t>lvMn(sm2NaOmb-U91ZC_u7 z>7VsHss{q{SB`|0j7t&wD^&BR0G&Bsy;^=T>n>UZ0fD;!$AOdwSbFPevUMB-jX^2OD>XUGOb(388Ue4!>!g9e4S=^O2MQueN~}4;xBjaAnoSN-`~2n#3ZEJ z!JR|)P(#s6w672mQ2<>U`&RZ>lr8mnmf=<2_=E(z)5&{(sBo=_td9KvT~ITgm8?z{ z4-B6F_Q!X6e!jN1w?x0i?jQ$J;Kf^`x_t)y*oR&5Db278&gSwpjg0$O+$J*zP_I-%Veq)aVLX=lA z@dH#pr%wIY{JfdPQVsAEN@t2j=aKOfUO=;8N+KSePWRouFI@EEwkd 
zsGD$U<6k7J03j`OX_Z;{GWGJ7pC3teHhCu!oXh|`bX^9M1^-RXuI%ncrYr!vw;g(| z3c9lNbSuBgD*79%i;2)Z(TK5ov0HL|lGpm*rF)f=czNr&RdXx$Nxh;Mm-raNr`aP=R z7F7koMzoY}{%tl^0K;*rk#_=P83+Yy^eD*gxC?*}VNYuyk%UpXM}rUoHtlS zpX$q>Nm1n(@09}S!+h#d26|MlYj-9Tm>JUrX0o0wZWy?4BYP!agG; zQFO=Q3m5WkWpy^>H%qxU6hDoNa~XB)Ep++1FJ*gJ^i{0z1-KoPsxJ3-c^It5 z`x|d-ab|RMo@I&o&40*V)igC>`m&_LXk}F=R3?9?uFYz#nPIHYEb$#ItcEAGohH7t zJ%P@zKebgM-F^K7mjBrncS*NnXj z4fkX?LeupgQ*zcNmQTp$fFN@D^23u()yPC4dVxf*8Pyaz<6EP1P6M~4w>^s7kdm?% zWe^=RhWx)v?jGwa3M$lNeg1Hxw7=d9>W;5_d{2ZO_tg z)KVJw>|veO-IzVTv@~<{sPp<`ZhlvPjh9?-X9Eq3<{rW{Ag0}3X{N9_7Mexgt#GMx z-=)AtE6H;dQx$}%FRO%#EYVX}TODWvDYxUA*@bjz%c4t6rAt3T8*^wW>2E@;f^EK6 zs|U^Y*xZeZib5KLzUj3wUXgu0`~7xQTSYX6{+kK@`FP`|#zU3-XCfXLuQd-Vw3d|( zzc5n;zA;{M#P1E?5rch-me%~ar6a@jgWqY^FZmfgepqd!tz88O0@CM_A3L(gd%udH z(?EkpHvSO9p^pqU1fZ=61qy&8+G8{&06|HmnNzj3>lhs#uB7eU2?qoQp&eH1u2&|g zf_^2%6TQko68>&;5a?wqP$MER0n>n0MR~PWgIF+~Yjj||A}YG!d&TF^FRmYHird?F z*FRsV$P`P72pt3uAk@H&pkrVUBaXiP9HAtQV%uIN^R?dtB?UL2--nVt8Km8uw<@Lk zR(x=vNzOp6Aa&d@I{_I40Um)*d?<3ZL|lyP$HImO0s;b9>Ajql?l!5E^R0QKqnsPM2Wl`kQmtj@A37v`r@}CzJI+}!8H;EO@XHj)p`lA;Uqkr zKx_fKu2J7Wy(6d`j$abxf>h}dXdnredB~*&WyyAqFZXsm<)b5wbJgkze$#`)#+>0v zLPgu4WdOg;0#v`&>cBe05|cE9lh_)B#QFh1q6As)noy38in^be*pkoftfF}flseg= zHM{tn=unthS!Y1GI8F7-;xTE4f$jO(>whoe>Yj|?*8XhS|v?sYH4JuZm-Mn zF@M^b*@@5A7pw<{R6SXqnVsAn@B@^t%=e-{iS=1oo(8~x zB(6q;wM~>17u@5id}--&>7k&`$)=l!>2O_X{MUo=!((Zyr~3L8h{<5K$e0)dhc0hO zEl5@!xUx0?nuOOTf(TfT;SsC~O;C8ccQqvNd-ZVUR)p#xU`zZWrCTi|1WO=Zv� zH^8_81NU*nJP^+co2kc52)Hpkcr=bkBR`47j^6;?`l;x;youdD)PuF13lA8-+yfFs zDEo&K@_y2)`Gjl7f?Kp?>mcPACWcCoj7MHA`=(*;MI??{8pEyr$tO6R$VUfQ2tH@m*lHFfi6{4Pmu+5u{F{ zc~wc0JRS`f(BcNYSa=fl^yyQ>uWcZ>aJ5~a#oKYBb&02xiDB#3J?mUD9z#=!j@s?2 zGnKp8($W%KyUP)QNIzjmi)FX(HFk5tIk0vnbh;7n^>g;n!8hx7g zdxYgF5vU}3I(>)YPDAtEo~FEI)(#y(T3%iRl!6^b=du!o;lXAs_VMGWv^qf8d=U5B z;TJA&fJ_)-hgb&|pi$sIG%XjVg`o1y$~5Uaqx;|y-d{GeDA!FVR?jAyul&9il#^rp zcP5bEiYq?s1Z&Iq95(JPRjmCz0jG&_@0!O&=t_W9GGGY;1=ihQF*H72Uil;MMs~WH 
z+wV&~U6xKQJ~eLpx0KvgXQ^^Dn?BfXxJpnoE^6^W7Y<>vB-YC6g&Cq&LuHuVy!F9jwltB3kpcHmdAs@naV-@^Tgi}oqAFSEf@$09&%k%bI zUzWH?okL_7O;}Xbi7hmQ-jF1mXqtEg@r7z<6PTpj99S2U{de?dI#zKP-9G#{+(@zWfm`th8rSLeP4Pdr5}z^~CV(!-v*&o3=EK|lf;pJif= z=OP~;AK<;qg?hLgL|-*N@BjpZwDiA+3(L#ztRiZ7eOL6J^H(^fh%^+q371Z-e`dH> zSeDkI>miZq;5(gToygJIKbck936|FMz$4c~&pTCdpI|G@8unzL3HorXLHLZGlJ1$! zz0@4XQKX zhlHRufsv<)$EZF&skbc`Jy-SfbKa*jPEKyJ=~57}(c&o|KX^bPf=63h`z$0CIO+e; z>$Md1up0u#iX8GxG-~D#YW)OWl`mXl|PZvvfLS-u!xXLenIxlF}Jhb zxn>%WzyK^hluHNhg_w!}k3U~LK4GWgVrXP!dcMiv>V*6wRTB^b7cX?7FMjj$dGfBz zH_q2QJUTLL2MdTE7l)w?7`#E8d{`VydEdGLaW=U{*RJI3zieqK4-3}wWzFDw_nzR1 zh&fsi7>MjG&Tpyuv{e04RIu=3V;73N+xUw_Y0n4<;o1?(N2(#ToD`sRcmdT0Ig~}q z{y>;`SXe8)uUQxtSHKm5pO=$ww z^{81enK(WPWdG5lUbg~ZZR?O^b?oFifY_}1)tVEvs3))Gfws6?$3=Q4K!U56M@=HfUHm8azvH4WjKy~J)(kM75ZLi ziPznSixU5Y7R&-|6I5eF@e8F37Vex_ZKh7ucv+%z=N?Cvl z$$%%Agj3)U+o2lann7-}v-5t0a14!%z$OFP7XAjuv@*ch%S&JIY9d5d@4D#zZL7<) z7h9NXnyC)S!%Ghs3jHwr))qe0q6jWThq4u_NkFSje{t%=PLkWJSwVI>k^Ctln08`i z?Dpf}hEFP8`%6j~di(Ctrn{6W0vK#91&Gdjg7)8dttNE{wws;~oXx zEQ}$TJMlMAWOv*>YGsnzUHvqvLC#p`X_fpI6QgLQp{#G55j+5GWi2eY&`tj6cr2vs z2JRchn84_0QG6POf5&7{W_Q{WAk%lXbZHK7AHHu^*~XcWMPFB`=e2n4-=nTyTJ^<& zN7nPf<`7$L1+rfQ5hHj2m9jfs2HtJngOK~@Tp6dZRH(3<=|(%v+#$Gz+O&rqh2h-4;Hp&}F^Au2>FDvF&UQ8HFY zMCKupP-Kcy35iIO5RwcfB_a(fgfeC5elL69|LcBTPybKOeQrDT>o~r{TAyhN>`(eq z#<7+E`thFd0yP$;gHdDAx_Y!#%XP2pjcR{YQ4Bx&RV%V=Dm)p6%cTT4CK0aY=8f4L zavX3iw2PhVp$N3erG3e#?X`Ccy=~HGb1Gl2Heb9XpmXcLMeeY+fR!qf2oZ@~!()J3`fDN@4P!n)F|A z?ro*6Mus&b-Y0C;2g9X4cPmnIb1xr!@)qm7eaxWJeas{gKX~gt)2^eF621&XjxO}2 zN1D-}>9>-;C2cl&J(jo0A4AALeK@v<%JjpvB>}d>RQNoge{<1t%FhH1lcmEO6{^}L z_k3~RLzhlF79?kO=+r#PE@MLH4 zOG#U1t6v}d!nxlv9hIzBwedF4mQlG{paC)-O)QmMDZ3=SpuBu#M&Lg=7T(^5Wa7Nq zJ+@iWv@l>nMsjta(d4cP!&w4I0q`{<`X0{;f!7Kh z>PfVPC2WHLR2)pGdpQyq4>CI3>cEpdS+2`>oO)xkbQ-to{y}aUw`_)F=De~^wE+o_ z@_&dQTVQeaO(tlD%>BDudN}c)13RNyc-mT;jOetoTTbZ$R9>zv7Ickp#?l9HJfO)u zE;xipm9%LZD?a4kMN=E=kWk{vfijaD_?q3@FX!`SgbebiH&Uf;N>iDOf?jYF_VJ^W 
z7&5+!953Oy+1y5`gJ$p8{;8IU3}ny5W*>ZAw~jGI(&;Uo$VaVGM_ML$5J~3Yx9^qM zSX-kWTa^AQ`}o+`&lkPkJ+|oT^|fo$PaSVKHh$Kpn=LJ6Zor=cLJ%dZ1*vP?($a|t zFs!t{eaf4esx&Rx@xita4@VkA<#Z)=2pK6n|D6X9IA-_pFSsa)eiFAcpLYOtF)h38 zqUA@Uygdj&m^dnm@Qr?VFIQDp%V?{TfO{domqcuAcFcKby~h5E@M$Oq&OWH=u3}9X zlsc1;e)tk}G-;9w-M4H$v~znjGoa$}i^%e=ejC??ggC_Ooxyj8 zVZ?A*^vT=bedSni$vz!4?u$y6OluC_JZL3zsF93zGqc*Ui`d@J&u`2ccwqL?6L-e_ zIIt%`q51HJi>`UUSJn61c*{36Q?0Qgf9-13gP|q06;}C`bH`2a@9R*s*kIMg(3UT* zhfLRzZk7aN5t?z={ELaJJ2^RVCCHdQNSqs##lJ2O-jMlctHsV9dm5o$ChD8x*gkrd z8kgMEzHy7=vTokmGJl9)>mi1lba6ND8`V$I@8F4(^Lmq7kkO*Kw>S2N^N&8pHkz{T z$b*RY4MH(pfOKPl(%;-5^40AC&D`9{z1$b+CFE(8LIkiwou({L7_O&2v(oYVr?FmN z?JoZ*yS{Eoqk~GlRJ9Y#+&8J&nE6#NvrDKrx@l(NXv0Nc+)v)yG(cm=-k=~(-H~h$ zDrUCFP86cTC?JG>F*9A|cX0ZUXu^*eG35_pFFoIIB+l2snO%SKW2;I+7QLL*Y{8+U z*@3DJx^%IHrghfW7}L}iSdecj@dVN1Q$D=VU3tMMv6o8z79areAB)JkU_y(01*P?U zF2=38G=1)koT(EX9+u6Xl@Q&>uKOU((SO|LDE|Cv7qEAkXXIyG1jz7vS~t}-GK~tF zYCYI3Fn!(Cs-0I4YN=$kIc1q1Z+&W_|JU~wi37h#6#|UIM+q6VYTMSWG_WAU3X}oT z7_<8@iqyD8mMz`#7s$fQCu5A`H`Jn>K(nGI*?9l8%^L3e-#0KE@>>{RedEH}Q`HKX7WV zSGz_WaT>9CyY-Qr!M|({A2bQnd=NImX~LGoE@*-qG{4u~b!W#Q|27oxs7=)0r60Ib7D zX1V5`%g78?bXzv$@`_hSM}1zk-PGH`tdVku#(H{7V$w%#D)Q;j{>b7tzFBrrUzd9Z zcNpEX=d*M5wr-o#^TJt1an7ETv$^4q{1;b#zlmsX(0S#V7H z;$@!4htOYkQ)6tyPns+_AL{=3{s17O_Z{x_P5Aw&Y-Ucs%gYL?%Rc**xr;RfGn3@E zkP{<91KI`2`ttl$IDO!z_~_{E6#se%EfKsf&3(2UNuyvnXmD?lFwwAaV^^>8L|W_5 zA&ZkWOTT8D{vNUJ$Wad_g|KJoj+@%rdW2+u;VJKzwi7$;9_ygb4=E_@R_Kl#bXrU6 zbgW6)vh%6mepa4uIoRv$!e8x_&o5p1{cQV}t<`(Qj@uWMvS07BR(M#atDm=C{l|5I z_R!60YPAiw^$I$_^jX}gU#s3u$~EkxO|a+eHxFj}?QP}l6#hBv&CX6brB{a<+|fSQ zy7iI~rrQHDk6PdR*1CD~gxWQ^cl&AYQJlElR9)A2g@=vSVMp7f+|6ZBF-dC-(Lk|__UmN7<5W4o&nXafL z7<6QAQ9LQ-_Ku3gF?C12r?0s|ZJj&Qb{ci=h}sQr(u&n?&q^&CatC%rHzVoWy;!?x z*H7D=^<r78>=VQg?=qj!UkUoZWx&q)Ebi zvhyLJ^3R&vZG5!8+T=$is!eij%BwV*d@$iVaEVh$hb(P)y1qU|8UIS9&Sbvrd-gn; zp#2HuwltQ2mF6aXeb#ib?O%WSNXkTDU4T*1wQbv$o?Na_fnsQEpeEI9RCdE_HfZkY z@bzQn*ytUf+`6GwuYf~?oSeMp*4}UH?cZOgx&5VmQ#&D8TN?2p*~u~Ju}hKKgW*0! 
zHg|QB_m0xC4BIzKtCN@UgQ~0f-5=t-Yn6{SjAXgak~^V)RR-?Y8XM{ZR^&g}^gX96L5`h)vT6nZbvi z;AB+Gj&Ugd>AtDi$M0`0CC9nyYsg?duxaUb|IJ^U>ge@a5u}G6L9SUmnN7p3%#bH~ zBdN*IKI7Sr1DD)iqJSile3#MZ&ECb<;Z`3+K-ZUEL1|-o$;G+Ond|0&#dR~Bz^T5;KqO@j32j(3<`ewrIPn0!Q_V(r^lWN=E@pfuD z`;nPPgFU-092`7qvCj0hySo)v=Dtk&Jihi_?&tDecYdy2ID6f@xBaGdPL3VZ zZ$ zI}+Q{ZppN`xVQtTXW5Vv*SLRt4_^OG%9(W1fAtQGmrAy}B4-yWZz1M}*Szh9P-4Ds zl6`jf?g?S}DGwK0sw?Vsz$hSpZ$fmNe-nEv>k)BkXvzQu1FbUEd*N+e?Z20)|5kma}VzLVA| zLr29am@;!F=!;A}rK1y)xnSqfX!dsMy8Di8y!T6VhXv=1Xt};=hxIAO_;-N|L_QuCW(k9i6a#CG`0Q}rlo6+aCj@(}wrLkd zW&)GfSuo?Y=U@8DPaHR-Nk}Em4{pS8(_XzA9-5ejXXZO}5iy)|3A0`H?%Ah}H?A@Z z0u|v*-sRkZQQ}MjWt_rPRDSk&>el+H{kUKsAwE^5Ey<7Mo(I631#7oqb!Ahv*vaK)227@5c z2VcIdvD)X>y&XrsqPgx27sK&#*tSjg$Wvrw@q^9G&Hus9BqOW)wRvOTY1KPDS<7G* z$n*dxFz&X47zdVhuTc-I-W9M(^KRXdVLm-JeZAV~u!GqM;){!9F)4XA#9q#>0eA0QJQa9*4>}D{ zR~{F*0fiTQ&fvj=DR$}@j=l`Ljcj#Btyy%;x%Ur4Z?7|dy2xm(pe8@LpZm*{KO|}y zUA6%<+SzTv*Oe7HC42OJt{a;B?r`qqFJ8^AKYHTK1XG5(LXR= zU8mpR!-h*w+BdDl7eOE@uk*^BO#_*~>g!t@Vl^oydhgUAz?rE*^dL(~tEUIGQ?n`Z zZK5!J`gD1;QN?Dp&ut;2ZlC^3=eM?XVWhgaXX>6&N~|0_2_YK800Dny2w3> z_3O)#Z9H)h+>?me=VD2wF`8s;y#b`Irw=eMB?{mFYWZRgM$!&Cv20D8OK&uq{?nTZ;(_QC&nAK^XC|<&n9KhFMJJZ1 zXWqX*8`V5>q+;w1CZbo~(XPSbk4MNz*^s(&Fwb9xqtn1))Qo%Z0M0UuhcBZ9Rol=J*BgrFO6GWK6KVpjL2b<6Q}`Ub%ARp~G=x_RkG}spU|N3t5IQ01`~0 zjxas0SGp%MyL>^OR-k4JQXmnv8}@%)z@>$m*89WdQuoWhZI*jqa@$i|nLDN5fTBXH zQ^AuxI<#ujMiLiZeysa2=J6LNe+vPY$!o#!l8^6y?eylziSpiE&&$mvI?2M;_Nm&g zNb+w%NSUcLINEN~^gdBB1-{AAw;lyd8Hn4F;!A>3dAFds8yFS<|G*$z@%BFbe)Gxu z@t~-jUz-Ph6co@!pvw z7iyCZYA3A1bAgprMMY&&MPZ+*|DY=nPq4(NkwxPafAul09V`hD6wxPNHSqYlP;2+@ zZVBIBCZ6)RXe@?K-VRJzdH456j~(;lC4AzzWn?X3T+Q!mPplmicw)iM#SAlMQ^r0E zrrxGtJpdXy@cD`lq3vtlzHiD4cTO{zAK7QK<_)h!Iv^iouy7`Y4kdgF#Q!{4q{tHy zLuY7D41VM4Je`O$y6-WghSQ;P;5qCxxs$nRc|}npK=zJ5FF8yrez5^87NDJ9k!ScW7Y!O^(h)UyXTF z213Z?%OHT;qnZyJJ#*&FLzV{l6UE-ts@1aei(ZGW)Y`(w#Ro^r2JUjtho3qt;ptI*@><2{Lu>1|1BS`1T{hu zR8x|hPoat-zUxW*6}Sm>4~#E)*dK`Qk;_d=&;stmVD%xnxk?ict=xZf)cS@^)n0G+ 
zYzL33qoadgynNYqi$GClh53AKi<|<>`g(}+hL9K>?0mBJ9hE=7OY-->HPC+ugviI} zz-LLy^!MQuR8TffD-vzBhlW4Dcy|Qm1k1t*>(7l(M>JWvir}hMQ@Xsmv9B$?2{@B{W6}T42y5MYMmyzrG96a&lZWmDHb(=QzQMa|oo8YH$ z6O#rM;Q~{;Y<-#l5c7`=czTWK8Nv2w3eWp{3PI_WmFk%u|8%;=KJtkyEi>EwxpsI+ z_uz=O!>&x!Ntr`jAG?kk;Wh`=v#Z_%*hny0*ssJf@Ex&>7~O&PBA@ghaY-8(9gNm- zz`b(&tjN^=#thOJ>E*Po8H~QX=a3->!Fy!pSadWc@a;a`e24XNexOU6&P68 z$)|hiQNM*38YphjbxO$qkjap8w5p1+hVv0V+dO|s+6L)`_|eibGMN0Hh8w{Bj_nT} z2)V(1daU@+esxW6sM4J8ZaK|v?+PBZw%w9(;Ya>Riabdj%AJOXVy7+-+!A=&rhqGo zpvJpk=iDij(Ah3SRaIxcHu0*OkZ^Zlss?dOJJAa5Y;S0SADC-h(AB37i)4g41(Zab zNRSLNp_43%FFDT=_YN7jlmlBbVlc6>ZuW$RE`0fvF>maZdlaW>ygqe(7ZqB1tZ#wr zfEnKmw@$ctQ>)$a@xwQSg@t)cQ(V54s`z)!7plb;>r<{1b#nhXT>?er9DWC07h#z# z>Ca7}n6axQAFbNE_xjiObC$h}X!!t9wUDLM4=7L${0r*ce|}JZz6p8!n7#f%>`flY zc~}Jifqo&EJd;|fsmbh5d?j9YFTPu7ICl9n-OER=GPcTE@scU@IdHVm=R4hS;K3iy ze)*L21T0+-#K1y_NM=Q6P#6>{dSGWE!%I0(+xi@`7~f-0pJUapH=(6h(P+G0>28#D z*T<$FeZdP^xz^z-0K7Kjp z{CZ`Rb@{Hz>G$r@AWFm|xk1EVJiENKC)!XE(~fX5+? zV|D7JI|2ntd*LjD7e2>VMC4*V??c)GF7u`Q`lJdj|J~h9Va^-V3*~byfBl$tWku4t z^F!;V#L&D7kemKi{ku{Xf9TjpxVeN#*H688yc@9YgY(9Gc^l~bPJPyOP3U>c1T(^ z>XQ4C2Z#9{NVK>Ym)SYY0l(!I((94}ccVl7#j!!_u%5-MN9RcGCg?E*Pm*Td ztGT_k*Uw2mV%f=X`SFY07EI{Tk`%~p(tv}ga9tdk)C>LSYzke5-u+PB5s=;`G`r^b z#12mp#N3EJQsaa*FU-57tyHr#HkL-8>SG(f+$B7wRb-2@ISfTNId1wbXU@0!btQ>{ zvMoTfEYQv7bRaiOA=M?()}J0*=)A71^i3_zjZYu9dUeP5d0(O``%xo{`v;^ik_djSv&#hZ$PY#cpSG+RPIjyl?cNp(kH|lp>_3m7{ zdz8j{B)KwW5s2{<2;SA^Iq{-$H&)JZ(%VmH6{>Q9R84eS2?ncg_(_}Ec#bHDh6 z*T2*awpa`raCEhSZnd_Gc%cRXK!~ykDOB4LBa$^101)3jot?BY`(;uhjV#Iq$mVBE z93X3uznu1NnRbwzK>i0S-AoRxF=viqPUs;dec0$#R8^NuDIGWL(5|~DP9Eu8doV7j z?#KzG7sa0TYI^=n9<7GM5EQlc_Lpo!#u(T4yTXm@G{uT~Z0?CAJ9y0L3uMZ66K$a0ILA38TNf*juU@Zm#?t~WHzc6ObXq@gj; z@9v(F`JWXp;b0}Yzx?1&wp^B%ow1qUopH}DG}_lE^71OrpoaD~KO$H$GHV6RkHkZe zM%$SqLFGZ2+x-sT?hefR-dnL=qJ))g_fF|xHb$|i)Twi?hzQ$z6C2fUx&DgLw_XdCxA7?OhVa%DwdSkLmrbU%&42TO=xIkqFi@KC_` z%!dxzleAUztsflrfBbm$kYyhe&V)8u_iDv07k$S*^Pa4Cdw%^zhkZK1=_?x6_pA$l 
z#ieSS7+hMKembXZ+mH9OYa^VR)W0+!P*d9hl&VjF=KzXlp2K&jWhwYcvnSR-6$>BD z1PR({&!0O)pLn@0-!9C>y$iR@F3cH3W)jn#_qM`dLku-4|XVjiLmAbF*`NGPvol*w&A5Uo7#-iDV%3hxe zLslhofgnkdSf!HFU2{(R^hsEn5ZB(;)3fXNS4FeErT zgBkA@rQBMuBFDhcj;} zUG6&c(!MqY4&fV1JQjCqaJ{n7@Q_Z&QNLr~f4I}^pWIu{^8-3edUMMsYG;eh?-u@a zdKXq+X!+AOzTjt}p^3taTPub-8HJS;{%JgW-nr9Fciwpr^ZeHDzWuJ9Nqp8kc$<%B z!m|)c3^4hD8UoNHJT9aE_x;)Kd*R1>c z#|j_pR_WV&9fgYS9P{Hf?puu~MFGdfEj=|%Cpmxft5-GdJB%mAvEyXOJcQh+Sx4I~ zN>ZfF>iT4qN6#*1=Z~*^tAakdc)^@DMEd}Lb88!{`DK7bnKM>*;WJ&Zw8cx75aaV% zRn0Bs#hCR=IUl;=3jWJ~)zm!ZFs(Q|;auGMO-d&pJOAvQRWIvoiPFUBElvKh8TjbT z*v?(vj!3Sxc$NIeeT~15U+m)T7i()0UBfCOTW9?mQF%FUaYdoMe&YRWIo`@ME;nnU z@zgpV!vhv>>YIm@vw+nb%ZJ3j=+J(9bwwM~DBDZ6t;cqbm>}mjHyZUZ9WK5{&(zzu zH#Ys0a3xRAJv6+RLpK)>xgnP0HBTuzH@4SgYwO#ZtTwB#$ z<=1+CD3>aH(PPS2X;PotMjnf3$e8+I$1D1+o?)AoJYZa$|0QuVDe2##S(|D;TsyMv z(W~}P1@+>t;M0ud=kWhZgQhxOK9jOwj^g@%el={{sqVLIPs{p#7KE&n0Ol1@zSCq0 z(yvPd^01VFK;&xp1ejr4b=ZiF)xwFb1*B}1J zt54S&i-#(9I7y`VU<#8wH|T7MvZ&~H>XL8m$Uc4Sk!QGH{XOWy!bbnpF?%=(?4*#N zuQ_PY(}ht2N{qi!r#LZC)v(pGJdK03^ImtSwwDA{;-2WYp-yh%Q7Miu^jGxk+thV7 zMYf2UkoGo`5tCr7X~+)Ox@hd*!Nld4>|C`LS@vzazdTRic zg}f6mo`p3Et(}A<5%YZc%(sd!D>o+Vw)gb?R`X%2v4`D%aXM&w)EJzBp&Fbb9qc)_ z;y1cwAnn$ko_eOEb_a`Mj-OX@|M0Wi zMw@INel#Fh9L5(qOofktACt=@ZTWMpcydaimUqB z(n~78znh970fteN;(rl%>JcAE+-@qQNMD6^UK-&VX!6j5|+#Jrh4YRdtIp% zn+7a^iCy!C4UMVZckf=D{Y^tziH~JA z@nTnlXX|WZT3z~Wuxs#3>rnEPAfZob26s&jkYU7t3hi8TTSC>hrV#FO@K$pq(X zuG>$A+mRyowEErrBkPvG-x!&GZqn4O;<(82;qLtplqWvV<}eAT9-cORZCNm5C zC+h1M#w_Jjz?0ric|CRReS^*LPaxsU>O2Tl;5=*AVuu&cD{tyK250uzbC+>M$#aH= zk2H+Nk%)wMV!5qW5DRU}q)8sx-{#Dn`xzS;f|hnw!4)ge0j;G?g%hbr>^<-v@!xPG z*B_r>;e3&%19XKpy6Wxmrl%YuZKD{d6*9^Rq#GSYf@{JFz!^==vtFP60b`h6k#pvZ z=->HL2>x2LW;jj3nM(H%#xrE!oJX&yr>>0PYR0EwHrjF2 zjNg9@EKX?=R~F1>pA-uj=*!XJTK|Y4+enVR?G+)T)AglqhQYk<^#fzRx3RG z2lUGI_AOQrFhJevP%tBL)`!1OQM_fKuneGl*0;kcp5)X**VD5}EW<9gnR9cfmeMs> z|Bn;PEY%RkM#T-EA8$Xt#}l`6R~E-FC`2F&A-0W~5tQyb+qU#9S4Y28(Nw$lOXWb# 
zN21!&vRU7}1>;Yq{thLR%)2Z+m<)$$1tS%wCN8fa8YlsaX^uqNc}N{GAk(;-U|??ParxP$mWm_~ZWa8SKM%p7qzZw~u zL7VM*(R}jcN2HsqFqTFw=IPo+$=uG8CH%I?uQ!t@rL-)3b1K^KN`!6;%HI*@} z1-{!pHmZNA_rH8aTIl)X5C9CVKWkIpQ<>wV%a9>?MnCrN+h@9Ri{pv=j~sp_07LAZ(Um9pLCm=Phm%DbdI*+1 zd-vYN)-?ZVbgs=j#BtBJF!kZclQK*KElHXF#pVOb(Sv0Zzwe}OXqLsD$IR)Bi(9cY zfsTH8p`S8itBGr`&w*`Z*n^0n;Xb&gKXZT&AV|cuYsb1N??I|w(F4!JK?xj59}oLe z7eFOeKGr}Jvg(NikX8f>5DNR)tDhT#T=L!59l0&@awWzPE|rTxkNE_gf?5l4FH4*_ z=)M5-d{++k{eYXG6fu0)4^|qXsg2JcvqsxZuFs{=6LfH+iJF#ldPX{sZ&c&H(|FQx z?lATVSP_6p{-9%qbyIfWZXn`Msbj~EAktZ$OY8e-jD4o%x6x$%vT2HS*BN(KX;@$W z;X+!^|MRaZ)?M1${?C6^(I2-VRqMYm@Z!=V#lVVU>10X`Bf0Nykz$2T9TN4BvIty>w-A`zp2sZzm|4k&69ppvI4EA z&4OTNXyL2}WSeYdl%N>J}m0YT@1El6kfl5n~0{0nwn?z2VVyp zVk!;q&34sqSQbe%4+`>6O7a=lq9g1rdN+Iu?$U%8yeEEzUfIJjGn z_FLJGa24>8#G*Cxbk&it^dxy7(<=}pLsGLZ@>(RpTGI^L&bpm86`|x}K?MSjFa|zm z#faz$+z_%9A?2YWZGL_LE?ELRAi-Ma&tK_#c!HlpPpUeM4~t8GI^RmM%JY z^o+ntjBfF}jief;2t#bwwoJFdntwy8wT<(@qbo z=h~8V$?@a2=e}Iw>6u)V$c(Fr0IMtru9v0B8lU6Jk|QdLqdmKTVo1b_Oe7^a%XrPJ zO`A4VNn{p8Fi?z(zU0z|gr`~_8gh#)b%h^I9)Sis`u<)X*Y!2{;!Ti7lpn-$v%!tQ zbf-l%J523Q=*2%^%Iq`($Ou}0#VOF!z(Ae2bUv&^rJ@;Px5?ZGDnv3U!?{N@hyUEp znnQ)QgQE%DSw`HfWvvP)fJGd#0>>n$?HCBv@}JJFrmxd_vmB?6(YJi5{Of(r@ZiW2 zF(u$Rkro~+R+I=}4kxMbpR`4Zt`=h=xMBD_#)DeRR9IN)oor$< z2qd-xBXJ!_Chnu}98;(?MsQ1Vz4s^5h%#&5ijz^rgTzq<2Z zUiE℘5~d)SkUZjBr{>WOX=d9dpcIk0Bw#amiTLKtO9IIqG!p+0%mHmfcLo>`kFH zpNC?_r54V?nB^Kq<)+=V)9~r`+)&nzj{U+XC?dk5*##v6g9z|PY}CDB9S>EbHT0qP4C4h1s|(5Yp%(6`(5c<%e={hn&GET zS#bEV3HO|!*Br|i$5!r|;?Wj=TsXtr#-XLzn}vzGj3DU}H(vc?{!chMfXTEBd$=OQ zVeK7PV(<>9V(L9`ppAx+*I8K|<}9Aah>*g8vCRn+*r#b0h%Kh~m$D~gnOYt}f^Imy z|8xxlk0X28Cl$XB>N#)J$CQAS&f;`LM1YLmoe#D2m=%A5p(rD-lpbDn0`XrY3AFY} zE90|JNA(^u#GXXi$4oIk!2w`~j4wbwJWs+5YV{y1zDIFMi8&?E9#%|Wwbrf8*@};8 z_rq&{CfDxSZtCpg8|s&i54jgvINu)EYi`tehq|805L|ZN&w>8KyNud64pKRoUpXB0 z<$TEP$9(iYW`UX=3_Og>UnE|-FfX&CSyZ+;+P3vsB=hzjFkni5b>+v(L>cFuO zC@*ptvIc~h_UnDvy^_5;A7bt?PKz$_ZqIG)X2=tNmj#zjIDe}2n@!!GpsB|I zG2v7c=2V$`329sSauAa}s*+-_UY%aU@f^;?1zC?)^XErab0=Bwl&|jHy}Ju%*8w-j 
z7+dW*vu6_=u_I9COGRF|U_FF#H7jbW&V2M4@h?wN(3yc+1R+-F)>c`mch{~avc7)g zb0!3!Lg;bp_HFZ^prFUxYMNgv3!M1-3we+;8nsBAge_l*lU3ogf5)}$yY2}HxK1vF zADIMHUgP)cu49%7O%^mOAskJNzCLrNjg|Be0zMoW=e)}{O2W+n=ljpTqe77(>OS!5 zHz*>{WDhScEqzl`a+}q)mXj6CNq^DHP;NbHZb(3K2LynE0)|RYD<}w~7f|4vW#j23Mc3QhSv(3 zC6!lzB^0g9FZUoZl?me_!@}4sd@}L&btEis*vjf7JWOhs{@fLaB0y25#eCC7R>CgB z)3pwbNQWWI6pF4a14ZCG+{%rL*y3$gWJ_pm@#FD9*#D9w4Dpppm&f01fhmYjC)FAb zWzs~{IlMs*aoDY*{oUoQX!7vv9im~iwJQRbHG`^VNziq)MLLIK2Gz*Th{?x=pjBWz z-3cxYsv1RZd!^7}7!J_D$|rc!2p zg!EHb1Ss9O^u}^yc)+Qoq^!ddGuhE`A|iml8?7T+&LvCE8=j7&QEx2XA>J2=*bk6W1uEovV<*~^5R7Y zWMEP%n1^=-G2ll_y^7|3YjAKIa5!|J`*>yqi3@*9`^(dHMo%T_SNzTaFsuCnD;jmtW8+-n@0IMf>&(2ff7f@dG*)m_;gT?C0Q$bqoMM_^bocQhwId zuun(*J3@n7oke3ElgY6n2JAOIdnc%ZftH|c7N$A4=nPNXvA-lO;Qk%M@M3aGP-Yu$ znD*}^mMZn(95T7R>r1d4Y1;eu@3*ha;l6pxl!MNjEKRYJSViJ`Kf)+49A+vVI!quD zNCt}#HG$cTNjd0FDU6@3jz&kytP5s43RA;ETI#BB$N3kK_KjI?N0yamZQ2EMSAZlD;G8MmvnqgTf|;~>v1P%-!{ihNrS_#qjo@oLyB(X zkIUSBw>j~}yh$RZm}wUk3>Rx>db%^l1NstZ(%hM{kYvTP1$R>SfThe|^Md!r*h;c~ z^9M1uoKOKS&9z3iYlm$N(Vy=}y7Rz+L6kgL9{YKp?Ag8hWS(M_4^e9^(Sb+I6@3BO zklPt(EC?ts2#h)q5P?Auf{f?oT^Ttdr2gQslp$UKMSsPi6Yf!%D=UqPUzNc81&(Rj zKPz0l_V#(*oehxrQmR&s&P*PaPvtqI=eI;wZxxGe|MSDZ9;T)nb6A8Wo8=q;Q-e+%~)m=Ev^_*ne8nTr-#&Y2T#=y_q#^ABZZX4K>Y?Xb5w ziK{4&3kwTxBrJX-DGXlk&ZW^I0S*W?6ME=SyLWMBld8ZHq)}nG=mZX6@e3kOZg1oj zK+U0Wep*Iynu*Od7cdaeJ~%9k!)2VBZ@<*oyR8a}31pla>66dm7qsDI64f>&y&jej z<`n<4Q#sq$NA7=I#nZYM9;i!}NU5UmA)I)1=hysVoj?*MK-ooPVga_pM3f2zs z%kjGG3d*~gl2RvNsy&D7M1T<*-T{^uo8y7xEM~kWEddwcd&YWT3biv{bLd5y=%R-# z>jeu^M71gc-`9*|n8~0c5H4LBP6g2jk#P~_8ntNf6P=XzBKU;aTJOgI0|3Y{;8}LiJ~_PO0p5>*(O2Dej|7q2afd^S0?fy4x(-6X-7yKLEzL*Dz2M?_#_Y980#6A(gaeI459 zM=s(vqW@AK=(^YL z?)X4Wej#dXB>RbpiH2Ek3!);~Hl%Ags=8~re5b;aV&oCc8&!f!RzwSq4KA14Mj`Y1 zAGq)Q0{@+r)ju6m@riJr|MNHL9ZZE4tJ5epc0MQv!|-rb9`KboF)%ZUDU+CM3{b)| z^=j3a0i19!b10EPL+(azx0kEFD_pG+^D@`Em;L=VlJXzrPMu2 zz5}>IdT7|U0=?-2yH!?J?gBF6_|BUKwLzx+>8IJr)(KG>_a#oesh+HTI{tNEe^#{W@yqlRl{QL^^3WpBj1{a%kfr&`_76*^VYZ( zRIAIGLt+|K9bc@Bh1TxIc3f 
z7I(wn2pBHYp-4d40oX&?PPIQhu;pTyq@^!MBcQy0`Ap|+F4U*i+YDoy%3K`aj;P}2 zk=d@`dAW^}^6&4}G(!$eDw#1x01^k*78T8=sif?<4qJg;mEtVwOKxiF@t4;${X%-; zdSHb6Kh$>I1@oEG15$dNdKH-+Ixxhik%m1co z*m6#|$DfTq{i&y!DwkyT64GG%7o@f;=hhf1VF~Db<=1g>iLmoM;=0}Q?Zcya>N~e> z6Uyh!oApS;SrEZSPk#UU)vv{|Xd3M*OT2?`EEWc7MMp=U&JO+cO?^=Wm&5zBG|6ni zw+y#ntqv@GGuSP!l?)9V?7sT0I)LtN9yH|~7&?Wm+qP%0k}H_cxjR ze)t)phYa+x&N-(k{aAQK^~S82v6^Ou96!vV4FS7=|B-Q@(i>8(9;YReK9E-x$b$#I z6+r?Q3w`QF=7>(Rw!TT-1_N0H?!p4`AlfU-H4#{CJQ5!M0qf$=x5bE`!Vdvr?XIuC zm(R}!Xhx}LfI)%_BAk^9)XXr?JJd`Pg#Pbn&_-}qoK+iXa%stDb3%w9m(CZirwq;| zCTC&GEF*4Re#5wBWmYl)i@JjHYYxB+!1}{*i#F6MXqL%MSW9aGrk#R#q0ZZa+1l04-_>PE?F)qh8e+?3LwdkOyjeNzz~L3fZYGmFv#{Y=Cb66 zcE^r~yon)i6OhwLFg<1ml=4+5J&~EoG%4Xw*`g71$2W3s&f&Tv$B_)(QcB>la+&e(> z4G2qPVmXKO0q738FS5#Bvko~w8AHbJg^Il`H$9c4BwbS|j5_`ZF5QdT{Vnp8&Y7QO zvtqzC-kE{U=i8#*p5~K&Qth529P6+oH?7%#@aABwdVzw%S_QMwBdtMi`^>n~idG4- zlfip)>nH?FeV2?On_r0i18cnr&4@CmBp--+p@cFbMSQu{Pb{71&o_f4V#jtfG;Hf{vTptQQSn{haC)w0 ztTsp%T?{t4e@Nu%y7kC9#<_MBD=r#g+Cjylv(akWwjXJt#UG=tY(z@Xc6GhF04Wp} zc1bG-vfB_wAP)AXtP+Ek6i2r2gKIV<5G zBFW|;rJeY~Opb!wSD}XPPvQmfOaC$t*l=YPa9m`xRHp0^>oH4|v^1JYqpJG@WN*QP zQ?1ZIcI(qenVO8|L+*KW-*t+z(=NSSN%7H`Gns{2Or5G5H7=($g&L+@DUz!as^}&d zeQ*N|BN-znCEJaw$>2Vs(k3djSJ1t`oP!1^mSi9z01uhckxZ3Lu3(8Gh_o(z{W^s` zz0k2c!tg@Ue2)_|V;1-kC5&(F&oH|^qSg{ZgBs5R|H!FT zl`%cf>HIh##zb0!F!k9U`aG1L70^>aue7w%Ks|c03`#%o2U4j8Umd}rz?V?t*qxd< zBikU0qN_oJsZpu(b=5H%&^U%cBuwvrl&mT-Pa_3^LUe7sUI7*w5a11yLTE$B7<_ai zkCGx~J^KpJs!m86!LH+=N(o~ENVR@`GN%HTIAx|Twzw+$=0U#(gZXeB9jJ`2jY(+> z2@a~lotapan4==68v2evz6Iwg?>=dEAJ}Eh6(|ZP))r^N(&8 z_5W!CR2ph%-k7EKuRuV^|B>rQ8~nd20hfS=n{58Cf`Bh*s)~o|ubkkoRN%kt&~wgfOt8c1M2$?5i2zJd(K z6L|w9tAbgEcW0ljJlU{*{B6wo8B`|aiEZnSJoK4p#w9ST8pBGsgoN^}NL=vu<7PDm z4qMB+m%dTc?BBm^N;o2z(3cm=>;C$YB0ep40-$B_|Ig2yQP5O{5EKk&dpQ+=&2$!9 zH&v?l7cE#xyQhFMfI5h4fUkf$;jjYzAQi68{vBP9KN`nLLzzikvws&AC!Pwvn6rf+ z%m2Y%2suJi{x}kv zcFVuATd44*)o&+@;DFkm9FtZ+j>Qy#paBCQbHI~ts<+8%$lrg57lWFRdf+z4*Me)w zA>(_n=|#N5y)OkQdmU2@FHoQ66vr{yB!{Oq!}Wi3(oCQxb{^kGYBK4vvmeawe_->r 
zZ4z)IM>JXGHP@8=64!$BJvwcpO*t`^{0{|M%r7tctUGdBMz*sjx^?cnmH|*uBe#4m zHdAS~px!HP!6L+y5_}_VBg|RVAmjrgI=C04jVG^^0zi;-q{suHRsQ_>xrkV@COkd> z$=B!7Iyytu^BUOV8l0VoNM_RT$RqInJe#k|g(_fMvqL`tO^f{x{6R1`+%7$c+NFRQ zu(H}kFgNZs3HyU64W|bC;&ZqDrMBEVZvZW3-Q3(lEXsI?xF_luBNU>&MisLWC;U10 z7ei#mEd|G2O1TFVTXJdE5nMtDb3liGDBI+A!D!KfiAiJ%LG#3JiCDdNk*j6g#G=1hWu`vcAtPXmsTcyvzj#NI-_Qd zfyz3U9-6pcw+?j!%a2{V_o^r{tZ8YVUI`s%hrSM=f5l9AgoVRlyP#%#~H z4_@#BneB%x+YAoT|2sZ93{c{K>Fgr7W5R{8*_Nw6Ck-2^|{taItYO6EjR zhe<*Y9h5}=2Wm38Erqptx%SV`a~Cg`fSAp}|Hgw=kRt#fN7y^$d9$)2CQFSVbFCQ~ z+++NHR+=ZOgLZ=!9$~wd zRGI|E;V8Gpy`~3XN__AfA19@h4qMTj{!Qbsb}Iy~-cDUc{V7MDy**U5(0^Q@T&Nr~ z0wp83u(C2CCW^5c;IAO-<0(%$Ou;%7J`X@pE%}@r;eZSB-?=4G&O&QKVIDZ%0`JO+ zhzQ}zIJ7e!Iidgw3(KS!V@`FGzcWbuaR>w0;>JRv*dl-Wn(~7hgXaq3h9FJr_8?mFsT9Y)e}44*H}0kr#g#-Bu`J*e>(3-< z9ZAv#j*MCJ=ijAb7A1}X9mowZ3`%m5p8-~wqsf-VL1wps0&key&5lgZ^W!L83j=xki#0M8zS2bAV7eNZ_XEolll5#g30g3i$w zD2ZAfoMZU!>M00P1t6EP@znb$H~M3-fO4h~E(I+X0aJ|+O)yWgvC!BGTxfKj99xK| zBLe>Eq^f$X)A}u28ZiaoQM)nx@9y&3-8-t2NG<3IP)2!&G-4TbnQWneq)uN+tL#Wl9wD6%nltRZxQTDU`ya`?@(Zm5I!?Xna9xAPP8aKtDb{}IitPX;jW>>^ z7^pKj))V=6c}{HXU@!EA5jIh+0obUiDcvI`KfWVN z77CFzM0j)5jt2n`aGlH{V_SFf5fvr=DAk zGir!Z%9|^Ha#XhDiBa{0a`@J{u?CD!AMLRGVEO()b)Q%)Qc=iu#A9)Th?rDy`NO2# z=c17qA$Ci5N*5}^L5p7AKQkkdF0&5XR>p0N4;y}n6KOxY;Xh(b*{8Q_eX9-GYwyhN zU$n$qag*x9b_Vt_tq)D~oKi=fC`CroBH1%JWCd>X<;M8yGhzpsN9|E{~+=Fv~uIO@10q4D_Uu0+iZQG^nW-bNShB5!VE+ZL; zIrbOm7@&WYlhT&7?c4#xm!`8d-vXm9xg07sDL~UYCN7by_41!SwJlhHl;P!7N~$}> zOMLV%Iwh~+r8i#7j|zk;20A@?@+7%Ad#J&W+D6r3l+--bdxo74_eK=Ko6sPmSFUyj zqnxl^tJ-`o7%SM?w5wiv$l~gcUilHh)w=OLQaL+W`CMAqdK9j6{7ap|M((obZo!98VvrAjm7X+ z>3|Q}zq)MPxcnBP1pyP}EM3fBhfS4q{pw~FTohiMdxACeKcLJ63RsIZj~=ok+0sn; z36m&t%BCoz!`uzLQi+h2;64}_#A3=mZ8bX%xL3MB&WNSiIXP`SJ-=xs=TNJeHz}+= z^n{mHWBs-E0NFk2% ztzAQCR~>coo^_I{Rl!A1aj~zApLu!u18QYElKnuC=7JTAD4S)ZM|#buu!9>mbUZNr zHlq`WZs@~4H)3uHEG%osZ};vNeeExF2A`)mX26g5Dmf#u>nN<{z+f`PC>*$dD(hZ< z_Dp@slqtX0lMGHZ6Y1MGIGvj^_0FA!2gYx_eA(My-+aoHCykHkBYNO$wCL-m*|8zD 
zmh2P?lwI6pU4zrXaC+$Ks<6~V*#%ad0p5x(tUGciZerU>A@5!$4P`LYDFDlwy)~b` zeY*|5XroICJGfRSFv+=KqbvR08f|GX(!1^L9bf(oIrQP;m)gMtu2z;fYQG5Xemdmy zpgn^hynjCC(~j-it5*=xB4Qy)pQ5r5>RFCpx_hk?Kg^f@d=Svyw%!v`k0jt;+9jX_ z;K(@~ztgXJwKy=|lwL`I06X>>l06~CxwH_p@Vm@K5WE_*pi5rc9tN8i>W%Cd`~+zt?5pU!Bt{(2(quvf-5=3bG9H}=V-@R| z)Oec~j3mFz6WEmB7eT^LJU&NvTCe=3+hAy}~ zGO^#X*Q*F0<53>_U>{552|O0_=L}IZBgh0{HZ7Bdt7W&z&aB;Hgo1A<&91W-Xl>UJ zO!Yg-uE!p#e_=3&4|hDR9pp9p)3s(km`;eM2xPKu`&^-x@_2zUOg-i8|6M+@AbI1<D<_p zFYUfC4h>-W@z{v*6EJhfJ!#{^P2W!QfH7}dtx`l_n#l4MchG97QgNpMk}GAY@% zVZ+hhnXTlq!xuG&Glasi0n3deQg7hEIV{8JmzNN8{@i))TvP5<%onH63eNhrqF%P^ zKnfrs4%cp;Xy@P{DyzuI-W1|1$BvA#eR5=Ey(ATy3Nf)sLD*JVRrTBY0|(mjj)Vo{ z`jYfaRFq_`p!@DYYq$yZBM8+l!pfv*Lyd!@A*pXA^S0O^dqTaLU8u#b%f(pz$6S1z4NR`Q4c4SojwmD{Ab-EQnaZYAO+OAmaD4q zpNzw;pzEhc?%1?p)sna_5Vo_XYgvFZjV(81(^dx23J zH!dlrpg@a>v61J&UI0+EybvQQHGVtpDcWP({T0+OK&0gP?YMmT$AskAY*$1Yyd7Mv zUFOfp@0MMDmZKZ@#`chFgUOD;uy0B~v8|0x378sixOD35N=;{*!riaqa~zpV9QF7s z+rA2ckO$|gc)gLNz?Ctd!z%f7pDvT1P@5G21SOZPf)+o)c9JYRh!EzI30J!4>L*jh z@G+ET_u05a%08V?5P}O1Db#Dc-F@uNZ{P!1McMYHEoE0n%#Ipoq$+F$x8Kw}cQ?0; zy}hU4^AWN~{#c$<%o68L%NlJdKVd(`kEO|Pl@;SKwYFPux(u>_=!8J|jJT&cn1)2t z7A-Mk0!jx_In$;+%rQ49+=;WE*4ko38s%>WP!M5F*Yop31+x>q6G(@cZqW}(eqy+F zk1b>C9-6qx9OYUk&m~JlJtDYT%a*B#t<(q1RmN5^i(La}Ks; zKQ6l?Tq>VYP%05>Q-8n_Qx3@-6SZaZYOY$mJ9a>M=_Tq{z-88o%s_+_kvKF_`-qK2 za+@G&IZMqhZrw04!nQ(&esk88JCQc|@ovvBUC>JQGk~Omq~USIkgzXKD%@ku(dPHB zjt>k-=;r=T?KCz@ph7S{0kH&RB@;%qW5=B|*i^N}oXp~dv)Xa;X18_z{$u=qdP%x6 z(Mb?|BJ&kpb?eqAr(X1V?cYYgvtS*k+XV#$4^x8!68f?=)k?g;t0lHaqy>CXz~C;y z>0d6IgTE1~)Xag2glN-6YR<-#6l_kTn!2ke+F z05|K$=#Y+qOf6o7ZopYc2y3b90zn{;0 zyk5`sil4~4)Sr1%C}SCOvuTaOu=eGH#-GY)I4~=ZpJR>@7x96Z9b&OzLkcSo5zJ-Q z(!M@cAi-fd1~k?jCI*|T6#1KSw(Hj)h>eY%cg)P#IFrsmj3F_N{R6d(G;h77W#_rb z{q47J56>|lr=l`){P^XWS5Jhv8-(W-)RicAKCcgoCuS&f(tT3=9mC#^tCN{(E-))f-0)Tt$C*Q* zl#w4AngyRDq8#e2WCB6Qj43(Ho+@^jBk)7`X_$M4e_KpW=d1+ob^2;OxT2zhvLW=0 z1(Rhs&x9;57I)Ln?RnO*LIxqo01Q}=Vl;D7J;#F-J)G~Xj4e(8${-8*78}2kt7UF$ 
z+xDh0^45zPrpUXI&{1nlewy$RXED$`lSpg2-#yG-(B_yB;XqZkrgjt+5ZAICNrvgx z=GSLdX*jM7e0lx*r6YL7D4ie7d(p(j2p%JbIk&bpS>8PiJn45=2W1ejDWG0uboZgN zk8OW2^WsuCUvb*2Zd*ptTDN;n=a`eQe`=l+^WYp@l8cE4mN#1osGWReLx&oCxYCLTdRMsSa$r_u_*w7EEA!5 z^8K+p8;!Xgl>tYsW%XsX6+#z56W^MB9H&eQXLbT%>V7|}%ta~#j#Jjza=*?b9){nR z4>nP`5f#;k6%O?F{mt|UW+~pFOFLeiWZ{@)zfY)(P^KIV{6aDw2D=Br3ton`^^UlC zQ=}5mM?mc6tgHHQDfC6cinV%GF=NObBzn74FFNj?(Vb0_hvaa`T`aIFTKux|GQ<%xk8z0pwc_G z#VyhH2J@zzcau#Wj9PE)xU>-H5Ld2o+EtrEej=j?Tc;sMSa##}jRbscs2$)c z^lP6Xq@xH=8(LK#Wk>X?(Yey(tI#ZY4zk;~ZwK1)AUjq&<$6?_;xDCdKW11J6Us@?gyGgRTbI_Iwst%DdB;p<>rwG8yLN6il^YW2hH>*i zlUQ5Fi$5gx6Wc5_#nQ&+kCBYsU!)W<^oTm0ov5st+MB=7<#kSu_M$~6J~mspU;cb8 z{5<;RlLm3+V$L>AtO0gyIh`>ILhhH*HMcWA6rS6|4}z-Uk|`)?^jnqa z>cWtfhPyhHNSt5GKe+s{ztH3O7wo^3C}N)a#fxUw5(<$+bRT3bCgxnfZeX;}P1i}b zsyVpXCT!=+dEt4&8o;nz;EAwrrl?A&B@|}PlpzL>c~OlKed#b1S@IqR;SpB{zNN8- zEhGO~@Tl~`kck?Ar9EeF=;nRD98Q>)*B~TNe{_Gek0N(K9QY_1h7a#RFR*AfRgFgQ zt0;b_2-(Om0L&bQwG~k4l5Qx_nz-*yMhvgl7CulEYvfc$d^qjl>FMyN+5VR4L-AdO zKbjMSJ4(j+P2c6C^Md-`BoYDfGtW4@AqeXMtHp{$bYaQ>d_u)Vo9FpxS>L|97W{|a z`nlUt&x$FaASa?gN5LjC*P&AW`z{n0-QC>&cK1AHqGCm%F5q6!IB9B0;j96Iih`zX z@Ksc(6#+L8;2uAxRU*fIMf3Bht!0bAP#_BU#k;O%3>&s`6$8Bl+sTw6ojT5G0gvLS z784k-8leE1v$L_e@A2MwwNWyI21RekIJxHI6Y$k9?7nk6=^QX5Ysu;pzE0U9H5x2k z?_3pk#a_IoV%P-oI8=VxgvIonZNkg2Bo_dUi%#fv`Wyi}^GsGTD;8+d+uz^+@*(A6 z_J%BX!_DDGWu_j6li)2c0FLY0yZ0FzDer~d5w~%W?tbmii{nm&(NnX}X9oidT>;AW zl)IhMZ%OQ|o09@=X8!C!3Cxl7J-XjWU#Hvge(hk01Z@M(InddQP9!x#a}mp%jidJa zYmBGd759|c+eh8ifnE{U>E%O8!|a#yc}gd(H?|E|v;7?3_qfXO#1>3yNT;rO#l7koi3AM?&nm!DYj3)j@`QT>EFrL zc30IBOUtAK|Jb#uHB`6%^yz&?$Ep|m6K3?3?fUa)d5lc2KW($V7Il+%*-$;%pL64X z|HR8H9MUfj%Kqd&BJ9;Z9Ys^+{*1x3(_XAIRH4&p9j{{p=18kNemN((aNGO{Id1Bt z12u)IR+)kMIl3oYfB&jlVbXKRtOIdz+I10Yq~FC!pU@jTShBIxro8S~uDpLYwSKm= zbaXMFH##V2!BM%uL;eN&rh6g=sHnW4=K@bi0~cubkH1Rei0K2@TP%-Zv)_5tS7bFq;=Xxfv;M{%88)Tw&q?WWgtTD-XIX$MZ=oSaEsq+eXq z|W^SuK#ACSpDBt#Lq2@dbiv`BFjx4SmZF6H;)TR^Kza6*t`;_!@&c%aT7M+aR_Sp0i 
z>QV~nJKIX-z;*Zp;<})gK_iH78ocZSdOYDOy}xrS;P(VoRe|7U?w;|)Rb(#~XDvT)S{bf9#+OekE|!E;)> z6{aXF_kc2>H{x@bql}>Wd|z8DI1qXgp}!G!Gh$Z2p!hdzh7OGK;+qMf9jp?8R59F! zC3}A?r7B%W)Pj%~?7YAJTbHBwiuvwhpeyaUn85)tu^(Ipqykogb^!)Q_%z6NUinvb z?6CUL8faz9O;0!zk?c+F$tad9kR!Rb*8cX6)>n{of#2o%ZUEw8p9x(O zCKQ$oNk#y55SGKZWx<|=)EmmS*K8Z_QKzkU86B6O^8!)|mB!YPwfCn94-=8{NE(qH zoSo^Y1au>qOvDa|D#Vyg5KzxY1`>hKmozt9j2^wuwN-QJ^~AqfThwq}89kUDSn4(& z3Fpz>ZM(;v8tL-FcUVU(m+wI^t$u!8C-}+IF{Arj(NA#*zcJ1Y{ zEpc3O)>d_80~66{i;EI$;~Sq%P)0&61S}N7v~O-Tjms;Tl_}yGx$G&QS5iIk(v*ih zdnfbmPIT4hJj1iI3~g40YOejdC31|t<3q=apAN`C8b6OQy8YqjGFL}mG(=$6i@46c`NtDjGfR>+cYW)F)GeK*P?AU7bX^BHC$k9H#y(g96Q< zEq;Sd1nCTC1v9WfcwtQ?2M&BnOr#NlX7(ix_v7%y7a2i97|_dWi~T1|oG9>8@h8DO zsE}x1=mUa9j{(R&+pAXOc#|sq{Ks5$znzc5OsV%rNRPf(kEtH4K6eIsi_x|Vw z79*52o^7%4JQJxz1dK^BKp}J%@fJrV)rLIW2ZMszocERV%C(m@gzjL0y$`!3C z<0aHvLcHQi4*2>WKH&{DW|`Fz*!ZR6z)1@Gv(6X9I6DrbI1I)O(N4I_mITi9gHi88 ze~Ypx>dutmY9=ASOP+UAIo^|N0pGQYYpebp-}(lGu7CT&h1P;BD)G2nb&Z)hn4c#BgN4Wzolv_c3&^>k}I_a%905)4EOpH!Y`!w}W#Y zim(=Lw;z?H$X}(=#qL?oM(0kN#9M>E5YxBN>s4@YO|dO{lRZCjC1{l0*EbWXw?&oA z^c|seLrj6HCkwGrGlByEP1j8pPct%xk(5NAPqzb%f&ErjPfwFGfo>ng*(q!{{14RS zJZxZ5D(p%UZZJ4YaaRy+qvCI2Z5?LzL}lDKx)*?G&+nz(inx*pjxaKp1?s&_?*x`} z@J2ktQ0VVf`N#omeDF2+FHba5cAs(-#s7hri5!+mFtt#0Leq)YgAAfW^U_s@*^h!t zuE(>6-utESW>6i%hzr96*&ExLAb}d=7o3o2|KRmD!c_v;%psWO>3;y1;Vlz$bQ?zp zs~)3vMF8Ll^aPS7!C6nL*nSv&L8fzdi4^#Q66)`H*ffy;;Z{Jgrhx726%mPK(Xj26 z7`pO52$FBO5{sFxte)%{3t`~A{nz)6JdlPHXSm$azix(VSb`!; zi06L1m<1(Le*J|t3MTOtsQi_|%%2Cf4jDGganbTF(*7NDLtpRX&c+axhT$c|FjCIo zuhLlZ0C7S&ye!aAk?x+f>UvlE8(RwAf4Wh3%W%t5F?ETd-0fKo4=;X9s#V=AGEqOmWS^&WV@3RM_!sye;HQ%20=l41p1 z1`ujVte_#`==7r2p%rfV+JSGzZ$=QPNKTA{k)m8To}4F_C!ja{oP~|ea}Y;TF|jux4JxgNe3-D-X%+V!m|y>2@n4~Vcp|) zM~@z*kxMTslI5qgJL=;T%FE#_zV-bLtsZ^|8NM%UQ7@OJbza-6+>LZ2jE7HZ7J1_# ziW4V#bBWkNtfw7oJus*38y2fm3JNLxNFMvwN4LCPP2ut3Q*t{6^rSvgUa)~*=Y&0_ zF%s%Hfc?dHy3EaPa&Lz>R$`d{qda65^9U$sZ0l{xpC?_swf4rz(^o5=p75b#T!?my zyN!J!@yGdTWw~+gJ(tekhLD5B*QkPt0M}H{Ev2P4n;#S 
zPu_WcX+}mjR1Qd~)>Nu9v5x6vK+u1qS9&g8x^yE;M75HdzRMgw_Vrl@JMwsIZ7GQ^&|pQ1Uy> zZ6@G1LOElH#9h7Wj!p|8^DHiZ>G7rt$dkB8sq#fC3O0uVw-S<#xs8Dlj4NFuO%lX= zgh!d0Hp72{8u;6&$&+J@j|cFA31-E^CN68H48O($DzbQZ>Y{NXro!2N4mBT}{TWXq zyl5MyFp&3KE5b(9Zvgt!1Y=+$0certcC7`{{yF1?Q?w!Q+6C}GL zY7Iy~6*un5frOMj0x28kKU6h2U>)85j!RCtifv7`wW(!u^N;0I)6zyZ zHy8aHNRUYYD%2j*DjOI%$t`u39mpVE(ll&PiQyoVNibQ)Wi32K)25B0w%nJir~0`y zZsL|+3rC(Z=f0#+ocA%6vR7#L#5f!XM^W`vA*Oy*S;cgYu%e5Yj-Nljnf9XoK46st z_X>5BNHq`=3jjtGD|Wzm1mR?WK5eFR@%!`${l1Xgit5|U^_K_Ia}jI=t=5pZ3;6#p zYAmr=s8HFHgQnXJsaI5&tZj&$M|K+xXAlY5cNY`mVwhi4B@V=jfP^sn5YyKDhW%gUukG} z3yX?F#z{l#t07_#D!g7eE&LRZ@f%-g0wyW&busjjV*@Sfhcl(2L|V9w98_MXf92!) z4I3Kf2GumFg%dCgZdl2K8aaA&*I47FEJ4}=CVSrA{CcG2iwoYwGJ4dF>EB(br(Ies zASebQIYU?XdE9Zw`yru}pr%`6u3vuTwR1vr^c5EpJ63$IDUi_67>c!rv-SrjvDd_V zUtfOl7$%su-y;@k=$_oPaD@MS%j*Xkzep{)=(=l649Gep1`J`Z66ak_CyFjaY-oeO=A{4vn6J|yU5Q(slj<9)Kpv%qLvktV1&>H5j0&7B{d4d zSA$OxXt04Y0c;Klz6RF}RqkHGiT1@Olq?x~^Ewg&Q6_WPbPlNCaM}P#(T>~r0PCS- z(yB`)UJ*(VBOWh{Zze5LlO%lXBE+HLZ+nz4XHoB}x4t4B1}?7HKSD%bmi$`M-)fPLc024PFf#LPZEIa>s)B~( z!?B9_oG5n$Gn4q|dk+q^6au1z>JS4S8YnSDmSheDLI7>7voCRZx7G@U8K7$@%16RvY_V$Zj*Zz6X z+Dtl)Uq9EB_2(4v7fnS@re3n=1Xr~?n})r6bwO>+PqX_!lV{%mkLUPXdJmn&h z=*iPDtJ4H^DXXnr#xcrPB%Bm|?!-xxL|YGVqcf)Xct6&+KrInW;NKN_O>ET4ju#Dq zzpYU2ivEJij9hz7Zdx?PDC|1G7ogM-rwi2xOC4=h5$1!YorRt8it`}87tysInXohH`NPbaxErfMLcMI6H_?==-R9gn2^>ladAJ`p;03;Aak@3=~pmKmvT& zbLm#bjAQEQt6-7poLvyAb8GFv)iFa=r5`*r|6;&csi>HP@zckhIU~sk$)m=x#?jl% z=GfSrZ2nnzjnAHw=;lUyvGh0S%m73mp@b;em^pv_p35i+ZPhn-*g#KJ4ypNtP z-tuEG88(&=rh?~r*)jPI$W5HG!YzR5k*bEer=e!99B|EyBq&hc!bXIKi0$U&ZhDxI zFaxxKvr+$7f*5E-8*az`MWwm~b|5NV|Kcd`?&v=dJR#p za<1;%S%E$m4IHWrYOr3M#2<*G(r(K8w8I1?Yjd+xJ5iwn_@HFTo0-p^i9v@-<_c&G zDPwj-z>51zRrLn1K^U8%wCR!H@w&5rniQkP46~n86qBsv|BO{fH)W_}&mlv!cc^YI z=}JcsM2f~g2hlPF7!o@x+}@}P)>~N}LTS$TsCa92QJA#^)5JwP-|U8x_u;-_cbA^) z+GYC-;?{Z|IIu45=#ryH4Stj#aiJcwYHcx`IC)hcId#C4cSUz66hGCzi|lsFR1am6FmXgYCxz5kgr zpXF^3{G&RDdt3nE$k?~bG+R?MhwZSpFGImu1*56RjlenlDto^32KJ(mQVWGU{y#Bj 
z6=}mKawM=$4jeS7p7cPWWJc!rlD{G}t=FngrHnX8pCMQoz&M()E>Rxi}vbsdVrKjAzl#bVV$9UqY&0#w@Sn!7|r%3{t;P(^XwgjarFNeiP zEIuTQpjA03cx^E9Oa{VpJZQ~z%4Y$^x%}R# zbzH93T;j(-lsas-42m%Sg{0J=yM`pi1 zeI9d??8l zqCAEoliR&!LHVl%*YBsR2^ku_Hoa;k4@yM7=2=I_#feCFL(|>YG$djMgD<^hkq>udTqq6#1iKvf{K4B#~Azyw8Rjr)v@wxuIVB+7lN~fT`P=@4|CKs)!Ye* z`7L?Y;qGWW;7(+;e<0EL2rooLUy)ue-3JqNyEzE(g#W6j{zkD zOY#!0{C!wjx88)3luAq3`SV=Ya;frxT5JdZt+ciA!)%SI7B}X`*o%iWvjpqQZxHiw z_LQX0UTE~=#}AQYBm~$rBpMN-v&h)S&1Vr<>9ByzVyV=s*#Ko zAwc?3HGW0nqcJR##BGE2B$!;4S*&KPmzQ*Dq>^zdZn9&guD^YdJBdEUnhjgAs`(kP zYtF$&_0ijgIEFXL`31e{+c&PLDnI$aZhUp_@ zT1>(MZ;p+3T6rg^y|C>8sEbss2Ti6HE@rE`d8Z2>v5=bHxw8R5N;`x(S=q@aeSMvd zq6Inad`728iuTfimK^q7bpNFGv0gt$#FZo!77iw!CRF+=10DXtbmf}VzBP9w_zi_M zicn_Q&aFA$usY$CUev;F5d-dSla8?boGTe&la&1A$-Jv0|DsC6r1JHI_k=~-MZI{2 z=(|baz6jVaWVtXN7#A;PN*Tjkjv5Sv2q!_x-Q0^>tC>O$(#Wy^{n6ewp3#u?AZX!( zDKmC+3yEff1LAO|12#y<%(_f zYf^Wew2NYPG0;Veo!J;kx`&XZ3$18X!wROF%|GrgC4u)2FZRE+ZsW#O zlo)b_+x?wxQ!Wd?GB?IIK;y;3NAyLUhyqY{hJ!nA>+>4@wIET#CJfUF__Q1;3IDek z;|elad|M?O8C|J*$4lH1;Cw6s7&>5T%-i#h`RX9&%HZmu3do+31fr%gK%Ls3S#anK>svHq>NPQo=l9H~ILwM1hMn9N^RnhmNRMAbBRDUq@MW?e--% z-j7HEf+iGL0A#^H)Wd+uEiXYrsbd;?$Wq1ba*?IUxu6hB9;oumKP_y!7MOuScV!C&HnZqcf-!;X=ZrMu@Qx{_A^6heu- zip@T^^%a6-YZog{=e2Rl88|(qHzGTrYxnNMRa9;rNH)eoL1iu!3p5!f*IeXgICR>_ zJoSK+c&$6)qWvjD(-~~ z7g%Oa*A6vsnW3Ad&XGmF#!AT)88qx02+yev*^MG|ioGg0RM4MZvt_+y+_i$scx9X+ zlWsrg)|kk#DabC4Vgw-V-wXDcYm19=weH@jVmvUHEA@}G>E8MUbb5riSOD8VU0wbUn0<*eWdJ%3(@M=m*pD&wv-JZmR-yk=+owc0|4ZC*R|4)xg zi~4CVQOlxGrc8N0U;ao+a28;l_!wZg0`&R(S%+4d#$SX4!pBxi??!F_MI(kvPE9f= zh;{=HTGWe-7g)#Zs~)x}cLMhk!dVgC8#rs%$@AxrAn^uq<38j_6UoPo7yl>yKbdU3 zPWby!Xi~6>a5{b^ZgF-@Ij3+srHO~euxodQAN|FCgqad?yF3FSx1dzGg2T?GWeU(` zi`*qDHxl*qk3{I6;^BWFQvC|sgDRtstn71^ln=!e=H0MQ5l`4*z3_nYWq~2mpFZ^> zg^n-g`>ay1 z0`cj^_}?5?Ly(XQwFkWcik}m-ab!fRry^ncpN(K`r4iS>=vjDc_p0>%@zc2qKzGD7 zJh8QNe!g!|Z^oI18`g=`g_b#{-sj*#1o7M1+k~n0+tDS7Fhi;gZYx@`-Xc(rs+Am| z%hHENNB+LIHggR-l{G1Zcl15#ccRl%Q|(C{5ZFz|>WDHSQ^2sSV*0=+qO&FivM3;l 
z>m)seVhWh%0eRPAtQglXj*7M)fDo)(;(69qy0RQazzf`if6A)bQw|evet&vq*$oTW zW>J3$pFDi42r9*5fRrcfm1Go4!C;S$7@PE;$3kaYINMg((?!@Qaw=@NE&ZDu)(Ffk zuIL6QDzXyp2=xIgp}n3_EeZD<>+$4(nYAarPgBpUDY|j(8uIoh>}}>OqsErwh62QJ zPfYx~bSqWuGo`-+V5da0#@^PoJmVh37i7g54Qu>k8ecW*C^6%Dxbqd@XT&4hglNeRQ4h&N`&1H*Kp$nUg# zes=%Gi=|5wnZQFSqF-A8XNgf;rbaDq2l)oxK0ZRHCbZ8aces=i=yP1ehtT>8KwR$g z5HaJG!6mVBuPK*!g?dr?P^-xtNwsecXu#v+@@1p!@S9MQ^B$a21@45cFn)6qVDE@=kmt0piDVh_htQJJ$Z)&lWnqk zaqZRCr+9UbBr;!v9}IN;I&OaUryX=i)T$VmxYCuHHrv{|K09~2%K|N}&Rn2k&h^RH z-@2~butAgeDMpp$m-@4kXe09ykxEc7`*EiF(&3w^M83a(jqB4DHtIVm{RB;Ue$_wM3c12HX^Ccydk)j#U4H|}Qo9F9LO!-55rfS;R-fD~(i5RJsS z>|CVhLs7$?fHpV$Vvp-ruzU@XSy@@6%BljVE(D}PJo#2>X5g*N(m2@Nyu8NCmirf} zIg3m+mhyiL*dvYQ{0>%9UwAH_zY}d{MrP)EJU&=tujN#YH+zJ4N7G{~8=Q)Q#YQE} zpa(kefA-<~oHlKMy!^#O&J%-!FIy~(kN$Gt;zh4p*Ke4W1BUW)Bz4c7OW7*Gd#1;c z1plvje0TC9#GuFA*LDQI!k<9rQD|RJ;)G#d4TR~~$N=2Gdp+tgQ`y2-j1lIt& zb%lUV7=bUPvzQ;GzVZ8_5vU?*6Z#O0OQ9eJ5DZeam8IPYhPoAVWhfedfTWO;@$3FO z5k4+5Yq7+0vVJEbqn=+enK*j@X>rLQNa*|?k}_|&NprjqSW&&MGB5PGt3ww~5dG^y z;b5d!$@BW%oO}s6@I}`aZ*buA?Cl9Vs#gd_z{4vyQ-T-l&d4QU>0i4EU@Q0sp%t6n zL=EG7c3z_how8eh|@g4}F z6-0m_Ie2dzIieG0i3!mT-2ur8g6$9v4q7>SGp7(~^VcI$7k%G}>H+^ILp7>K+*eXs z`H;6M5^b^FiZ)d6EW{s^Btn=0$Y~jOt_VRPyGqbCIF~jV%9W`0;rRbw4*kHEzB;*f zTsPdA!b5qT|86Hc-L4((B)lQmp@}~@kAVU~C@xK$?`_c`h4%-O5Glko5O*|FjF;UP z5i#zf%TFOnYdJbpUS1wb;TM3t6Pzed3eUM58onUcmd!LX0s;`aEtUpug=|uC37eK+ zTLsM8W$RHj)@zdhq(yQ)6pRo&gZ{nD_Wy9A3~QedEAVC)ZVRWu>W1uE=<@hu;8dbS zg4#H7;lgv#^G(@&^75g!#~dYeG(x2$LfNQaMbI9t)ycCtBaST@If6D1{8|0?*U3nx zfl<~$JP}{niA+^6F9esu0w;7%0NPZQo0x2N`Ox@!cLxAUAUiQ<6W&UyWod63D^~t@ z6pj?A?S%}cbr&>dHR)Vj!pZQ_9H;yUk$OSa22Fp?!#%qIxfXCRAjP8o=-0RJ_iUB9 z^V*x->lCb|?^;JbqRB&N!I&>n-Q+zBX|8DIglUGl!3!=CFO}fmVV4=O!s+$@GwB?> zp@DMmA4<50Ae|y%24x7)b5{B@c9iHkal0c|l|<1^SuV0Y&|au;`*Bxz!d{ScZ&iKS za4=A!b7e6a+R{6Tn`Xe32|#Qgtb(8^jXApjC(ZWO|3eHJHwYYrpt(f5%ZB5Neg_jk z;pX(Zb!T<HG#B0J`s6w*Es?N$}iN8 zh=hoWz;-jOAzQvHx+WaH2SFLS0EB>BE@0jCoITdNi#wJ<2oP5CJ|3Nz_CZOD$&KKd 
z9{cFJ#6)ja3o_4PIHT!_7A;wnv<|kCt5*khnwB6!q0=g`ZwMgn!jP!q{QNUQ7EG%o z^oV>kG3EysXwEEmVRjZEGWFpOG#l(~!(UN}X5|&;DP5aib`Yk}y=c&cp?dl0%Zm1Y z#h;>&jA8IQT!J8fh$~12qoP?Brvu47i+P}Qk~bBW85C2c3Ca;z;Hu~QzHx%mr{)A; zyIvPKn;8uczXj4*@<=UFIZ###H#pZ8D6ZjP`e68x3JgjGzF20_nK+)9!ch z-W*Y25Qs_K`#v0R@_WCvFDWU}-Q}L{i?;QLh+||=fd$PK zR;pgT@_LLUtKkPks#qO}dziwT?GpY#N4rP@m4K3;4nP7UjBY-XUOj_Vf?BTZ`qasE z9YPGiQ!m|-_&;mab)Td1Q21aAUIYY+(Td))u9);{v(%Ue{D;~s$mQqi`4TbjK-(>WjD>KXcCGY8j4^Jv z$UDtjkWno9psC_t8lAmVTe~Z*pTd+WdYOIiEk=OI$&|3s(;tit8T=Kh21>k+BC1Rb z_(ZAxjo7=zpA(PIAl(S<>3&i0IY$;52&A~z>?Pg0y;qkt;8U=+Q^^(ZI(%4Ua}&g1 zk7NJX&Ip1tEESp0MB!vCw%8UG-3>#J-BAlBjnUuu|H|Eq_jdHm$ z|A{3;*g0S2b)wr}nX;kCECe#QNPB7dU9RRIM2d>Qd2AHJW`wg-ryl-&II6eIl&&lv zY_>+o*o9^QCW?}dwQRVkYf^Ugj!SumH-#+M^5fn;yO6UWq|X~ z{tpdVLT7tlICCZq2Q5(m$8yuO%A+y0MuQg7QUaST=-Ney zC*QpODdgoM09WSC2qir4X}-iF4#x5`MfxQt>Gg4Hu16gVEHH6fiJCZ4M4Av}1&GDu zhF^LaZ)3id9r{*&;2`@YFgUuApm!Om1Ac)w|QWH@BF|CqH!i?r0oyX5p44b4t>wsg6EefYR$E#}Fc z9n7%k{;hNtLQU0l;Jx~V0hrpN#=|V{%RDx-FdHP|i{d|2x>WgCHX!8W> zj~16(0%_WF;$TEnpa^^+7?r6A!I>@sX~fXcy%6e_F=k?jHdP#>(&e9eo5r_eW=CzD zvFmg8KhmFir>W^BQSH~SuXUT_;qmQDB%Z%<#*f{*z5mcjW@%pO>rW}W=4nqtFtc}# za*t_S4_uI)Zx(vUldw_BZ`69N(_U`RhySg?$TRU0Nj+e6`wz6QVWQ9MA$3-ZLqUb-@L~WHhw@K z(Xeocis~QR_#MjN57SchB%sOs`-cpol^4GV>(q|#TLm~Sb(cXbf<5g%8mHg#Rj=yfN zNU(m}_xtzvua}!YZdaYZw12@A#*PsTsmlx=o!*^LUO8XhI= zkhqfyMEB%S@T7$N48;u`CJyM115I4FhT|;ky@K+H4!Wna3nnu9SwTM2!-x_a;8O@# zsYand$xiBokDMVuYsNi{m7hYhHP-kJE*<9{$l*(zzl;;03Mw;diIW}b`pr!^P)me? 
z@*5?=jv}5yHOtLViKHkZdRhwnn{bl}p)hULX-!4N26k~h2TK_k0iv$<(S3-}HxUA} zyP3c}yV=i}p>eI0sP1)vKp-qlBf>CAz%u;f*SQ{-fDrxw2DWVF zuoY=H>=DvKQ@Y6SeT)-?d6;ip0ynG<=sKk>u2iY3#{b4cvxAd!(%FeomOk2JI-_DfJFDs5k2MA((^gBR zrmo0i4y59RO3z;MIpe>+K$)-}j}l$*B`Px*jJNk3l`GPZTZu5O+T!y(K?ASH>NbcP zm$P+xx$IVu>nNntV%7;;YN+fR0X;GX23^tFId1A)b5+QkL+WL^{8i)@1{gZP*(PMc zu*PsB^z0M|vVhSF<66|(i@b7rKZ7WJZdcLq61Ol11V9W=ut0al0W#P*UR4ol9a>1_ zuZZL`f!#l*1*8%P^Hmi1A^R}AcJ`VEeq2DEV5+Xx}mu9X&Q?P7M(eh zv}*bOM7Pu$x>wORQ^#b}3EVH-ysY=1);PPdbsi7Yw6){qPF$ESnJK?BARwvR1+~@F z-Bm}7=!mQOXIHNpt-K3(rkz;VZnhzlmS@^+vprrpdyi`y|vu zoJ=cAAFiJ5&{Ug!?TDS)^1!~Y@_qZ9aBDTYRylI1dvehg52xZgpbkXGciHY4}L>f(<^`HU<@dBj3io8M*s?0v!$eNL=t)Q$Ce+lU#^h{KA zw4hEQ!i8UTzQnI%^qg34zB8w!ys9dl9;;-2}Jp9&)7i4YM{)5yaJp^CU4w_QPQ3%8C4 zv_?gNO4HDZd_5OYr_xpdw|s);f@bCyV+A{dATRvJRzNf{LrI7XK}JONjD!h-ei8Bu za3o8%waBoix+}?@3l}aZRXWU9F=-YhvHw-zLVX~0k2*?p8NFW z_(Z$mE_W|fK3iaI`{QFl+}5J53y*cZ@hau2LuRCj%7yv;`bjQbd$*@+OSh7}t4w4A zmi*aKXZq^xox7cPHD&(ph?enHZe*h8J;!i$D`EfHR@^`^=6gfZ~ez7|E@Dj zOp7q?R^{~7D7t(5@>#0Ktcs@FZ{7Fjp5vIw6BF*2)^rr`RFdlfl=+fbo#h>%?ntuzKjN1_j~1qdq74 zqRxw!<~uXs`{7yTB=vmTD^}s((3L)-Tm1A(vEB*Un}~D-Oukg+Hfbm0RgTj(wW%XY zI}jOJC)U9M1d36FVQ3wb%>b@`%nE*nl2w4ahd#Oa4E659(6R-SuU=3#)GRl#R$^x_ zId|l^v}KW7rdGHg(p3R$neMKXGI>BqC39BP%5UVsG%`x zij4!Hu%c*kU5wwkO`LqtJXy(hRd}9=-^CzTy2jW)c*K%Gr5u}>+wKQrH=PJ+9lJch z%g3YfOKRY!#4+)En&2K=@^@bZD0eUamrh7+lU z!~Kuw6;0hTbb9=i$zgeiEn_!sET_FF81i(D`R3KT7reKPjyd>p&Y{5`OU{fxRA72g zZuYHh+jkBalUDipLga(tt`e118@C+k%#Ag$i>~g9;?j`Xb13h`$b%A>reYn5{m7dI zhcwCJUpG#-)p48qoo4hEc);j*`&Z>Et5L_&-m>xTos0BJDb_VsYP)${&LbowGmfy) zvZp8dia=iOm4>$}dHXS}V_5r0)gWL2zoYLl;xep8*rOgMfkKzNY8HaE0=N;_2 z2ivo_k`E0`C`=Pn0&H`CFP5qRl(1HC>pHQC_iNti`)2 zBIZA}wIxe+!Fz?-YZZc98%|9K7%)yL*v+k%w8~k-FK0=i44~Mc{2(>qYR+%f7=6y2 z@H_gD6w{r)y&n-Xeay<)_#_1^mYL~tc+ZDhK}@gppz)*HYb>PaO4wN=leO(=L-7F? 
zmzZWCv|m8X*0B`m3xN>*T$XRUvB8=akG(Li2~n_P&1DDfZK28l4BmC?w6U?$nT)%( z9)@O8D<2$J?<@J#<+r<2c7d7M^3-oADvdwM3f0xz;uG`!#aOPUK+197MG7T0K z-i@+2)Lc8~?_=di)Gxw0&lR6{?yezP2E;wU8L8>%3(cCYibyHjnwh1xy^Sdk8Qt3b z*yBK=7vuN-nk3|Vw2>!2`^)t)9%u4UDL?4K1?GKtyf8d9a(K3zYW0pQU-womn^;^B z(Jvy7B6!gNi!`s+14fHm%6n877kc#`vNr*JbZppcv->-Jgo8_>GvRZ>fmTYc{E;}T;~K{BP(lCHfWT&5e_v3HqaR83EXQ6;{-Otwf|?X z4VErX22RJ?=7+s+FUAcp3q_##Mt*hb)T&U)MxxCZpkRYt?4mhS7#2x+>`XS_6HKTC5^<0b z<-os?RzSd@{;_8i%pWqMn{{<7u35M8kX%H zd|=kfL$w9VbK|wNKL2n)J}#8oGiKI1OEPoe!r(;y?Eg>HDlL2%2AGrk3vySNcp>4sTFf~=cKt6EYZY8elGZT}OQ=2;X>U+#)$CTX2@dh6% zDpKF+Szec%wKKl#{kl~V+xMc=6KyWWbTJ*HHqmW6J5_iAnB$?>CxkIH0yxb1@jek@ z?`%DbHUoCN$Tr~oT#MRCzb4&3L4J%Q#TE5`@5}juz^8$jroq63-mAMYvI=vx^#Y{sA~3Uw>MDo6GH@+P@mFZ(igBg z9K%Pqo_tm*q65>EHZCrjV;tf8cE)xsy}1WM?C#v|dt1%S#qpubrX6>T9&FA@>5dxo zVT4^|kzev>Knv7tq7ef)7ur?eOAJLYk7DEp9U+dArLgzca+8rCUDr_9$+ZeLLk}-; z;ok@YF?*U%+-gcepHpX6_*;8r6h!Z3CRvtF$@LV2CzVi{W#md?nJN}VJu!%DX)UdgKp(@uXz@k zIKVP3>q366Ui5Xh8{Q@e3tMc2l;&UaL-#$bCU`F({6QoI(B-CqMsQ+rm|TFVAOaxWH_r*`Ik8!#Gbb$6!sUs>rGTnR(2M<1RHo@+)SGuAC;ojYw%xjFRUoSl7 zm)Gi3c1zX(v66_lA?!?>{C)0hff0X=c`0NzsN>M&j}z=SQD=TvW&kONMI8v_Q0Dy& zYTg!joeNX^5#WF-AUF{|!hlZn1tELXpZxig>A4=sIK4X?wAi(aVPw+hyzJQUDA~Ha z>dXvDgW7%VujJFnW!_Gm>K!yYbx8Bd6;H-W2RPRzeA&9UrzXEoX{BQHKJ|2tmEFT45N zj8&SGe*csT_$lL4efo}5y`xX`%3#}D6D`e$U3eikJLOz%;kF~6_ge)?8e6JbJR5Lg zC6zz{>*JuWOPyp;AQLd<;vq+KV}uVD||gJ{|fjqF}#WR$TCo!49g%@x6x zA|-=DS8~uGG04~ntnNcO#*OTC8XMM~5?Qm$>y&%g4z zG-Tx!i&D3@amxpO=^U~l-ine>mGP8TU=! zx#W%Z(sO!4<|r%o^YeN+XZ(d%f2Xw@L%xgVk7KRBJ+c};p3lA&*ymAerL^I_U5RsC z4&T4)0Ba=@{aGtijxtj=?L*!*)VlJ|pFd(kDC3q=w|=Q1>_hpgG-le#91B9u1TYFz zCstR}5cAeZxkW5>Bm=vypY=bjP`e@X_tEud&w7=vuX#6o{j43;tu=M#78?{)Ozi6& z|0z3nbji-D1FQ08jva(l)!AwygUpbVF&|`3yYeUXB2UBh{>N7SlQyj^vG~!unqY-%dQVaRNn&Hc zc7feD_mQ8n+o4TU{0?3be(Ait*emU?z3w*EWOB^m6U#@9TIpn$*qLzdKBg_TR4vZu zoo~rJIWum(WnyD~jmL2X+rzq{8`y3w1uPPgj6{wllYbXM(L|3ioaNe;`miHiuuImj zin*IZwF^09!b=kplA0%|CYpu^omaNj7_9oJZB&=?vK0OTx~G_2XPt2V_s$Qe@3evG>Pi(L&O9wq! 
zA)=^6EBNZ@%M)6*INUA@oaFkaxqBoH`yh}tP=0JWvv**rD|4tk+{g2Q-MkHZXa9Zhv~Ldy z>D3()|Elzz)3Jk+>+*1^Imw3%Yi=HhI%TppwLCF)>ZU0~fc~r}ab*Q5J&AuG|DJj1 z{!SyJ_QfO!$}Hq?pY_w==N>BM{iy8>LGP0L({<4zN#%8_A_JVg9~qu*9XouQx?`A; z*8LM*-foFB9B*&6N3VO1zEYW#cFkb9>9nd&rCY}}tkorv3nW+QM|!#3lQ_DiKy$6; zx=SIYJ$v-?JmlkJ*Lkr{O7~`u9vWvuwv`^F{go_v~TOm!@sX4%}F>$EheVDt;K6pRnkp@$`ISP2zn~|LA{?#2>7x zroE=pRP=uE5jc4Vc|MYDTDm%*k3&X=WK-JX$9qZ*mmOQNb{Cy`;;ma()Pi?zmAqJ8 zvUYlq>L(NZZmU*F#_>_6Y#AMCGt0gMJ*ZSck~~AlHl*E?K`La`FZ|nhx<|-12{eY=btsIMqaNV zFpjh|Pq;Iz(kAnc)|h&=9N0tMXUehA3Z8(GUSGCx8o=G=O)BF^+`A-tL-f#OqVh~*EN(1 zgO_X$HCq1t=YUVj;}gf;l^8UvYc&Wss zfax#lE0P9=uU(Uyzfdb_@HTBX)4WByCEjAWr?VP$t;1rq?}P|QBYl_6+-N0ler!m; zwe1}5ed*!TRDLH&ZTG{f&B0npuWlMfmLK>~JTt)3QseKsGL3qh=$L-JfBFpRtf&xn z>h`4Qb%`W5s?FC(c`)|4!`;mz8m5dLr+j#R$UoDr;|9h5Dgp znEyYA7xgU=BY=R=P~rzh14`WWR_qV-?_xA|-j^tBU*dGeB!o2o{-ek@1o1&Y==tn0 zDlSrL@d5>)|H;mNwbE-0-zjN;ncFzm`7%DEZQ7Y{9yzjmz{^pR%F4BC@)swq6J}qy ztT{QuWvyeq_65yUw4R=vyv?*n_kuNDiWXT09qI9>&TaVC=Fl6&sL0nUC7d`BFX1zI zl-x*L+x4+OCT8ZkDols{5Ii6B$JydVn-*wX>5SG%FdJl9ly`h)H@TBcuf0Efp85t% zTUs=}U%%8V*r8pWx^o$GD4w$CP;+oR+R|C>K=;G2GH zk*wNi*Ugo$PQeCXHuH@z8$Wg(FzV^ZJA(&xJJMWr(Ze!kf8kY_+8)^VgeaDupL1!r z?6qsmDlaV^SrgX(rnOD=&o@!rp+`Z?>rN0NYoqRLgor@p243F6I z?foIQaWftJYAmm>Uq3ZBtEMfsNTF81%n;FYot)z5wLIhzXk+9R8fhy1DK>Auo#?YB z=Tpk^U$aeuCEtz=H$0Mh|3=U~6?MZK$0nrayjGrhQ^v<@?57>&u_sPA9ow++$(+9? 
zyLQNH&4^qwa!FE=$>qRMy~Fnm8MUBsqhEmW`DxyXC)#(sRVdkOaEV#p8%K^9Rj*Fa zUAp(d(R+@LveQ?PMPG=o{FG2Z0c?;gR;Tx^0@lNcteE?@ZVB0cJi^89?g4jq)H zZC58ksLj?kNjGrHr;qvD&QDw4+2)$=P-f_Z*oLxhyb6 z=ESm5vTvLl-RtXf7ONUQ_3@%s`SEeKZ2GcL_lXv@X3J}qop|P9KVs1IyyK+pL*L*$ z|4zv-DX({X?U4@AHI<9HGXI^`oqS3O?eEZm!&UP~m3EK5_^o6{^F(#*V>P?#4 z_N?61MWcCBsrt%DtMwfgEYSEB=jsw&`Yv|ChEL|}H*{+KePmN^$;0EjTEkL`w|_i( z_E22T_h--hf1l91_b#*ES6_R})o-urp5@b}GKjzcLbb#cOvDumqtg2gdP|C^s>#~e z)2FXe^JdbQZ_7UVkgEv;HRcRDbA#@z8bCb?*v7(4K;Q|@K0E%pjy78|e$1FM%tQ~S z?+A%6)VsE>uk6_MUys|(yq0px8zdB3x23<&A)jTVj{f6O?mTB^;If8ZTi>i0IHLI8 z+5(x^pEN%giwkjN^!{tf z+Ow9aslzQ4C$2nwM9*qa9L5lNqdPz2Qc$K76*nckeC%+ioj$F8KO z)^qHZfFRf6_c%WoEH1y$uwc!uN8=W%M4D_Q*0d|x;NsW)(TJ5x>^d$c>IPnFEH zCEDdzZyZTHcxmX$j~gsUPU>wCIbcXH+TfN)me2dNR&?x`T6N1*st0*m8jbf1$DfGU z8F_6b!=PYmf1oWBQYeDrtiu)oos@0%1-F9S$OXSyL5Fzam zPDf0%c{J>fZksE+3&j}6qv^Ie+a-*Q59EEmczNSioxxKc>xHIF8mVQJd3>&nPfGA2 zKtf`m=%(;|pnJ=`@xp)cK0-kZoa^R^sfMh1`^4x^Rr4?1(d*w9Y1#``!Jw1$_yMgN z1-GOm^9tYU4j6zeYr@9EMoV{X@26Vs-f7^B@2@r){<$iM7CLjvcX&10!-gN&E&j8P zhBufx#xNS;zj(pe%i{x5Mx+$KQ9^t5=FOWT{|E=wio-{bin*{5bc0$Z;eGS&bKqC? 
zh|6p2#{WNTeRnk0ecZo_5*67|Lgs}ANkxSuD1CDqmJNK5$T3ug%H01v-AY{L-g@Dc$&P zOw4f{i})zWZGu$^2M~_Ewq(j!*V2+_G;Yig1oD)lqoWrRv?fLa!5JMwf5!hGV;J=T z>gias@MW+yy;&)kvXR^!$jp@HKGL@}&}GKiuP+dc_*`{FP({T+%y6~i7u8J}ua5GH zirxV18-=kIUK|puEHTd|*@0Ncn7@*EXndT178fO52i~kp&#l0e3X5p@z5T{$ab|Z; z0BliUQ5>^()DBl(bJ$HelY0}luT9;t)A2tvOI=f*GSD$#+8^j#6?HS>KJkG1gJ~}r zAE6f_wpWl?GGn#|iM%0P-7w;XB^!~P_rMApb!58VLojy?Pw%(o9;pZ!&?Q}Nlk&EjX*ZY=dE>M<%(+?H{-CHT}* z6yp*|)C9Ny>m2i1q=BVkhK)45Q+j$&igoh~3SfZ@6TyV5rj7#x*J|0;t-JeVv0Xpv zQEi(JFGt7osoA*!SLBcVaMLOJ;bc#_%-`#IwC&8|;*E2cTvz#6TIR)tan<%>FkEc@ zYS+%4-+|D9JqY`IL$Df1t%oKXbLhi=zOF$-&8?z4<>MXXP-FzQ_dITpd069k!Pt!B zh`%hT_XA*d7mpRespc*I1lbaq(Ub6U;(rCI?;5zbf$j&R0$6wKBeFzvE)qBu5wkcx zPkrG+bzvw>PZjkPsaNXjIf|Uk^LS$}zgj~)%?Q~JqLnz3rh@6g6OBizK-%#kkU+rZ zM=~?WG>71yxT1jH(X!^iMW{$gZG!8UVX$kTBU9QgXu>Qk6sF6|Hx+7ZbD8m*)y+HT zDXow>J9@)Eebp+yjn5s8RUSTlf(`u&3jpr(|A=J-E`Zmp1x{4xAc*T6al0cxA2D7g z#CO;OLXgmYuo*gT!Z*Z#SLNrIs6bR=BqbJ;yH_A%0Oli54FCuO2p4ILv4tPw{}HtS zoPTTZ$L$LrK`Ia6d3hIK!^Jom@aSr}ff^oA*AhL2{MDz79JdY^hI` zeli4SClOO5o}6MDiB1CZL-Inv6|aMBCCU!03MGuA!C_hTf96V+jz*|ytR4Cw%JJ8a zabdQ_eA3aso`_RT96S#L@tHo6t`Eiz;BUOqk)s zP6g`+NS*Vw3K!iEu=|B>PQ_2JM@Fe*gl^fv%qC6=S zasDtL>pGwRW^$qTZ+7Q#up^WKaQR7R_WOqwm}i19ryQ3Kj|g?_Y?N7$)j=B@j45E{@8HR#K>Lnnt5&yvO%5%b2*8N?VDd%&+s9#RJ|m7oSJIYsQb zF!029sY3p{JH|kYRr5;8@){3?$LG(Ug)4uJMf?@9l)y6hmFo9zR#*#MnV&g%ad`DynytTn zaX2^*@OMvq7kqI+v-vSJLqs_Ni9G>@u<^h71z)grbUXozLo^cP?jq_Vh<6G15q8Dk zK0@%M{`W+u6mAwAh#)ZOAIQe%1cU!Z65Lvnpn1$yA~wUdx+5Bd4V)NL8Ty(H;$Ev^ zrb5KAM4*qH6`j$%(Z6NX9#~&MyLVxg90eRm)Z2Jvte+3zH(3Bl#4sei%Le957#EOG z`kHLI*%45+zPVfWe_NDGWfouGk;9&58s|^37vr~IJ2+fS%!Q3A zq>34{vAFh7_mP<*8EWIRhqoKicVJQBQve^UZopivs7p^oGqs$oVU!V2zqr3%TZK_7 zdRnk{i4Pr0AUk9#KU=Gn)YyFGQ$eld=`jP_wjbW>uIObc-u~JDI;5i1w}er})h+Y4 z#bH-O!xi8e{2blfPx?E*ZD{BtH5v@#;C5J1;U;NZ3tJRb<6|k>Br6`J2KV~*Ph2g- zmzfs^W8N2}+MXcxa(oTB~I%QBlsYU0&P6(DsPFJ-d z&E4yLvhkbt=JP@Ag1PX0R!WSgEmj=#bC(qob{kYyYNx$IMi<;RaXfFqP3tuO=}VIf zRA|T(^88{6^8qA&-tghR;3_fpX@?PGoUBM>;JZi6Vd)!VFA2I3CyXB;@k!i!%d?zq 
zEwKgTGlU)z6HD*_p_!3h&V)HxM!Q}~70er<`f%^H z4$KUgS3$`}q7~2!^%i8oGzbAf?WV_4@oc2(z+5r`0w{7_tY;R~S#(#{WT%gTf1*I! z-!pb1EIKyt*ht2R`5t%ez7EUSEV0t0uL01JlqFt zSTA>97`XOhqpv7)Iws808p%IyaSG`8Z7&>$krf-RU81TYj5h=*oCE_}PIv`lN1Sbi zKF(W=wD~raJ&;Pc9mZPwBvIu&{=+q?%^ zH9E6yj2zBsGCh!qlKpA>$QLtVaHqi#5Y4)d)_N~zSkHTC^we1@KEjy() z+6VSj-yZ+g?a_2!aHHMCCVkG_{BE^{<%PxHdzmyhfuV%BJScoXv`z*ol;p!m_jm@Q z3?WJKLpoV{Njj80Rre^hYo36)33&q1XC2P3VPd*6q+~1rcQIg3#F2(r)n-vOZ)^!e@O{n{D;gfzkd3RPPAugxXu~Fq%B{#{CSarzkA%9rD=+7AcWvKrcY_gJ6dPy0rvT03o40 zhTrdK2zgJlu!ffVIyg+N^ZT8tb^X3f@$_~()$8Xac9}AXFm02$e#8Cy&HY|Gc6`fS zm=6VWW|noa@@|)j-~D6v!oocLwu4PUC|(K)(Bw`&W3MbMEL2#gyg)s--}{#^>i)d_ z(4XTr2Fn5gE+oPjK1aAAFw4YYi)nXHLDKN>diO^c65bLgh2Z2CWm!WUUy_z}8<4oE zfKwT#0Hp6Q#5xB3CO!^+Yd;p_0TNs|R z==JR(I2Pfytb8 zL|DE#$9%Q4qV9jK-^eI1&J4W(nBc!!Od0R*fuI`}-VA|6j}3sXSQpm?6hbLK!$Kb2 z=AfZT>{ytz;GMR0@G<@Mb~%k3}Qfm#V9Wy zRG%J{j55YIQAC)hi5tHf?KESi)85E+-6-c&s-h9~wI7f3@^-H{>1eEJvHd;Qd#pL& z%QHTjKW878-e{ct_0`p=m~CV1c1G|?z44CQ+c%)mviq3(P24BY6oC?`A0$B6p!)*< zdlIoq&V3wcfDnltEwL_VtclV~fc_wq}*YS#|1)Bs=LU4R>;04!HXGl>T>S`od$1w=N8hMq{PL17~v zD%d*5J5bBLO7vB|t~})Kw%2#G96JT=rXfV?SgN2u-Hmq@nl{J2sKMR>C$+YvWvWA9 z{lQ1JTOFKsZjh0byvx**^+0*e&SLr#>aTuhicJ}1gdld*ec^IgnDAIr1JlXR=`{_A z8~Qlt$PMfGOJ+G`rS!0sZh=|>7G1>J8w25t^E=vy-2e?pf)PdzUAX^<`c`2iOU_H8 zyoUh{d}owZRfzzCIHsYrKtuu3Mehma$K!ThuGwrTc=hqS88;@nGc zILjvJE=fb~_YI&d{jTyCs1ixA4(cZ~e#E;A1W95k2>^#cq9iB{_dn*9r&0_)27iSt zod_YHyI3A2ANfNtVc^fX4)q>N*+8>Gm!&^b(3+k>X^Sd*9)wzAI1K0k^7pR%D~o1e zjCUO!O}#Y4!7kf5Q`1F^F_SotL=GxVK+mFd5OAlAfQLR#-6;HnUE5Gd1!m>-f>eKbfo4@;UKfYsZ z&*q0lPJ@26CK)W^Cf z3K3n=KA)Tb-JwG!F-?%X9!1fMD)`5&H?Fc*xEbKAwG$Mv(qnQ2Fe0WyC|R)5Q0E?p zOEc^gpB$;dP!W=uK2*TO83ZDjd#Ztv>mF@VI4Xa!iS1FAI3^f10tQ+Fzed`L2``Bc z#|1!+MQq3y4GnalW*z>YV8{QY*?Vc)jtJvJwAL`P+^Z^zVWnEi`pfwgVNsZy601bu z{TI+3Q4uf)H3oPd=;ler6v?=S)YG|QRxiXlyOJ5}3@nrPIp$T`TdvatEfV1ks$FoAn8@c7Y@#RWZM`7Xmj4UP?KVOmEy=l^`hkAD{~o^PW$ z$2N`~r-7`W5POJj3Ddzzq?&=H{bV8)uA0Pe2k@34P9IpsH0v~sSscx|+W%T$#~IyD znhv-oeE;qW?J&_7p%D4;|v$B)1MGUOpFc6sdFIX_$ 
zoGN}vm%l4t`*+kPL2MAyO}PIax%aSg1r&)>^6$@4VbQ4Y?UncLB}cAIt$Q1%i9Q%Io!0@c z-N%HN%e-7VS_{VZB&}Z`{$X)2hE;!cqQ?6y1Hu3+DW`! zFd$MldS@9kP{(d6gTg9iEZ80aZ}aqG+4a=AwOu8X zT^FA$_Tjx#Qyd!e1&yhBl;`8g9nt;80&+bSGcN_n^xE*G8KhVbH=o`OFCO>K)i0w? z*i%f5D;Ny9j-NRdE_!H_BF|F|T-h*muFR<6WZtn}NK=+KQdj<`Tu$ch_R#^j|uGBj$A1b1qxC=ngcQ&a$ut?OUPyGbix#Pu^8fm*uqBNR6wd z_1=+HFKq3_@kq9zD-1D*2NIu_#qUE*pQgI{TXBIl>Y%GzfS6zqTgf_*`XEL~V9le0 ztm#i5X2@w-bA)~1sB%LbtBSTxXLd-mI08K>#|@>x8U1JPN?haBqQ$ClBoIvl8=GML z8P1Jw<8~QtH7*M0JY%uS+?Us`L*}BgO6d6aWv`{@EGFDcd|7bhGF8fo)~l;|DZu`X zF1dq}+1&hozG&?mPU*g}P8+YmDE;&@mDTboFc7H62UXPs<**n^I%+AN?K1;e*{y0eKepP!p-`iSoY`CrWeFz=uFYo0wqM7sa z+J5aCkK%piM#iJ$w5Du}cH7)49LK=6lFoi-K4$&1qF!>OrDgeuw+S9uknV4V4jWXOVIfFc{%m%qa(WnH0T*(Mhf(=ls%~C32FcMsr!I0k0f(n zAHRO9;E80d%m*#Qr@Qvow@hePlkwZxaxW(|*2D=DZ<4L8ZQ6dj3bS|6j+a1y1ivPD9VORDEI>Jeu$E=H zP`-%pwv2P`r&I(K8)w~hDN(=|VLj~r&Jwba+|v5wq|VV#(3_L_51Jp;JOKYE z%4as+KJ=RdRujaN7!C`>5gJi-s1yEOKiO7{k$1Z@F8ql9tT;7;u%*Y;K@9`MND83J zleZN>TZs??!JiySEpL$J03pC}q#Ym{;||v9456{VUj}$$nJ#&5w=%t}lM|S(QTgu$ zJTGG*6Nv#FlQuyaiNQTwFQFUd-LQ549rT=pCJ);oRD)<}(JaioC@OMRgi#2&sygYesO6if`mwo$v4Yc7}!e4tLHG<{FFSp^PHl5+vPX!hqch~a` zkJ4)%UjO=5b$x_(-^2X6WQU^g^QrBs(WZ|y=+Zun3JmT(WJ;G&>aBbI??E5E1+Vdk zI*qx8l4)&b7oWAo)hML3^&mA~c6KDAb2P4z;8pTN&EYq#tp&P|?~KVzJ9{?#gZ*Ar zIxiKks8aWofvDM0wIP$r%(J5%HI(Hq!)^yj9s*clXTo(+!jiWihOdYd+6~bwP>3X!TvdbHb3*gTf$+ch|lfT#++n7hW-IAKA;9<}C2% z&kvW2*)KlOTl3(v_`6v3w;$4sp(uSbdDJ?156RfL1$IO5pFwPS6$d=BN!AM5U}E%& zSt8>2%GOl|kTi`sL;l4ER)bQV{QZYsHLnX&CDRYmTYxe22vnhH5ab2bV_(2nlZ5xK zW8k)e@f#V&!g7_+iimSMvC$iC`s9!iJ2RA0ehl8CwV8UkGq(yqOIofK9qAafRo~F7 zc`xWoUD$`Kf74R=Zy*9F=I~IxP)gL{q>~Hrl`|T(B~?FWwI6U#ss9=Dz7L|k73ie@ z*zzGEtHLKqb{prubDrFXNd8!aV@tA0)0%|7+P zT2qrla_!18`=)K-OzG~Uf`-=YVs|2I-yM#K{8FQ!^1|Vh_Oy0EZXnaq(0iW(-07YD zAL{8@7ntnduUzyqA|hqQ{5g-|&Bp$^46Ux7tmg-xgKLg<8$5)P$joW*L15BDjC6q# z5MOryN3fZ-YfopznA`hwh%OT8BO?`GN}N?z)re->@$gZ~&|#0AH@At}XV8r|;}(U( z7s(R^sz*Gbm^EcBBvgqI4G#3^E5U(`2jhlNaEO9*QN!9GJ@Dp+G^^psZy)9o6(V(` 
zRHq;53-btGxw*&kk6O;q!<=2cFjCCMNYPI?RlQ_J%BU#3%hXDu{!lXC9_gz`Ry+s9 z83(lu5c&WwG%@&~Jt6KZB-s};@%v|fjVq#WgrX=O1g0Yu}QQiUDb~%6kE6{62 zC8aZ7hdyHRx&`V-oZKG0bPler5>44^-g$Z9JfaPo=qdET?Z^)rDZ78~UMN~>DCbZ; z%)q7d-fx`j2u?*-KvwQPGPXjqs^e)Qwxqw7BBh&uXVW$)qi)^$z^7I3@Hx9XfbC7G z`^0$km$Zn;fTUcla2tr&%uksLii$t19}IU-$kob!+GB8`GG0#mj?KI1Ggj=oqCd}% z6-4RZ*4i1v;$Yr;^+>fQ4q@P{ucxkfUzKj40Qkg7unr0zDwy~YM`6sXyuSLFm%6RT zG*&{qkeyY9?e9kek>bYBMcTdvT6#2diJ({Pb?A6IP~(JhN=;4Ds~TnJEuc^L2fQ(s z2XO($wHftQ`dVUx^wvxehrdXiN1DSGV21yt!IE7Yo8T|>0)7F`H)N`ylsftD27vMG}|l8 zG;ypV+=QP;oaOh3Zb~<+4l^vAi0}bHT0O;!Uzbl~eo`7|mNSGq658G{4BFq9dtK0> zMA^x&<)_ilXouMJeCH^ccLNh3w7TojchRin_iybtt)=cCYRZGxsK;t~HP~Ad0|S_SXZ(5$A+=zMe2Vrx zt0>Ih)?loQ`j9Rp(Io$Ri|qg&39Q?$_NYU!HCD1#_DI)L)0??QJ1q2Kolq@ke$+Hr|#gb!hnHL>o5~#=H_k&1J6Nq z$z4+7-q>Tya+Ex>vu&s5%I8P|EN0DXtRVvskZu?|SAYM0j8IA2?@=L<_#*2)i+Qj4 zQ;OUj>K*4Ttl)eDo&U3j35VSr>iMb&6JE1+1Ifml6VHHAE^Hho(!uRlf*GV%L%t6H z%gd20B8&N7zop<5LyY+~fZe+D6==~2B@)Fnmdqr`j~`aJKY#76gPQ~mN_fu@kqp@T zYu@Lt{am$>&Z6YqJ20psbwb9Y&UPx3QN4g^QVWdqjMLdXz*z>SXU$YI z5d!KwF%Thw%}*CacFz8?6X_qnC;1`h4+hK+J6`)(Snd!oXO3hDu6}_)i9bjTA*sEn zI$uHjTG$IuQ?$T`ax+~Ccm!Gl0;j6VxIKQ}-+K`vL)FUyjFIdx$ospDlxnc89{Y?R zKqQCp2$!FXVG5f11=ekGUhIiUDd&D9DXH)#GN}4r-1m7s^EK`R;bIk!iFbOTzu$tG z>2*v(YX5KqOv*nTc8Z$(0vIg5iSkt>gx9*7$UWFPQPe9xGu|)krLIwVgo3T2G@)bc)#!*>_RP# z04GcVfFlFF6+k(R8R6nBbM%!QVM?&T7SdGBTaotMKu^s|^3+LJiO02p&?1G6deHR% zrlDkZh#>khe^K0m5sK5Y&aR1xrsPU8jS?{#B>y(dZK8|dt-)qkwrID9+m(7#1 z$0On^FX=Fe&keho=Os|(y7@BTOziUO$4Q)3IPeni-VvNZ*g*_>JfcnJxt5zsm z#I_q4Hr31TipV?ta;@);8+oC(uDsVWL*NEbssOe18uCraCUhD9ObK!kzykc7S`KVl z+{vUDF19*Hs$VtM&xbw6x5Cv=2tdez3*RFT{svye59>kVE&@iB1d^9;aUW{<9~E?C zyU0BaHCuM}oi=9??xr)&nZ3Gt8Jmq5ssS8;rC4HHC5++Vh;Xift_v3esFhc>)5JX3 z#+Wd-!{8N$rg*6^DTL z@$TL|c`3vz5Lg#lBh}hBJ)wzwe@1tyypZPL;g#3cs+#|PSM!p2_Q}e#C>C{ zC~z-7JUPLI#;F;89>m5Hrd7L%X(P~n1~>D6oTQR0=^!Qaf2vWEW@ zZiOsM-XkpQM%pds=4Y1vbbEMu`}~c$2f1Y0mGtwvPp8n3x}QHE5Bw@6pIYYN!RP~N zVX7&_Egcidp=duGnlIqL1cXA;=l3}f{zKp@2Ep7Qo11olc#HQR*Qvgy 
zqD?|%GIBE>K&Bq^8N9~VxCMzmm6(4~VN&jk=|XYl*O(#y4?#B%?AfEeh5k~!1T?0p z_?2+=ipC3${@j^(j2^U2;YMGf@YJ`cue&(Hq zI@y)yEE?clxI9!aWbL8{XB#jPMopF0y+~VtO6n}kqY1?gquE$17;WBJSLzJN=7so{ zpeR+9z<2eRHt$gM)2$9Bhb1DLTUzdte&_Sh*w|ah-7IIxVp%l|XkT57O9}pSr4F5$Z%U)SX8U+s!)VE1EQe(PaCF)+3r2x zciC!ktb1|QG@D2iPej6o4OVcWA~*cXnzipCNf5a!uMiYjp^_4Amdhn8Pa_q>dGBmZ zQ`1J!K^G4L#o!BDrrRGKsUt8>zgpgFJw(%F@8v{BHn6FOF_2&9lSXm(LF#qZAanqI0 z6|1X*nJ`-tczoSRe5{w@?~15j)r8FYa@CN9q}w0AetnUi%#S*oTw*H`#lO5`W%Ma* zKl@e(ua&3q{7XxTThudn7kA_~?qygp5sD6ssf}-y9Jo@HzCZe#+My~na=G~anWR)z z$!Y1Lg3!W3wvrfL6s2~PcO<5bFrSf$R)14}$*%Ql((QP=))A*HIW2z@AywgR`rMn9 zP3%s7rLocBa)D6lp#1oD)qc~|h;rI-t!+P>2Zk_K#XagEs_*LC#JOp9;+@Py`B8{D z*rZDZV(+O;WdOOu)3Y~Hu-A<(n%=$p2MwVYBYvW|q4NQmV#I}f9}=aKqFo=jY7K?| zY+Csb9Zs?7>GO8hn!l&VjPncWsEa~zGa$F9snBls=|C+5nxYsil@>Jm#^FDx`spWO}`?q&fF2zsd@?bil7NeT|;#6vML4&** z_uE+c?+sC7baZEIx=Z03+Tz%})!TjCcW9v(Cbi?;+AlJTimq{aDGSHcfV62?>3e`n z+GzljzIZ&(6F)?FoPm8HNi)L~xBuHWm)CsLQK232@s8~Dzpk7EBW>I;(~QT{0O+Ze z)j+OKG1vV3WZ?W^>LAV=Ew-RBU`&R)EwMa0T#LtTCSC3F8vGLhpL7kaCXnB3ynli2pV zF(!py4I&BaHEgupdH^VM3x5r~-NJi>-d@@$D>=S$Hd;sX`CF^AH4hIkvb_)ixB z*In&Fx{MODCgu|#-rKoO82B1v(z%D@>o2=)hY!=0UaYts%W&x{uL6<@_j0OML!CmwtDt%tqmotP>5?2{ucAKM01Yc~5XSA+m-+lTHMq;MW2UZb>WeFmE$ z!cBp<7`z|g4fM(C07cYZ3J1>QpMC`+cAf1w7*I{UN8$s>M5{1D!Oeh~73VMSLz_2m zE_CSJG`g65ii&-w$&1EDdIU8@?~Llx6Vd?och^>3tAtRnQ2?PZHfeAWg~JD!94<&A zK*EG4uJJ~Jj4pNWQt^HJW}jDc^DycV^B0oBhXKZeXInO`1-xtk+b!Th061G=)9>0y zkp^Tvd<}-mGm0f!OFe8}h70^~D_ytCFilHV?&nr{cdhJQ4mjd5CdxFcxDS2G7j6&> zNc<)mWa94g>@~;*gnf^i4lrX1yi|V{gU-(gx(dwfUja2j6bsK-_95s!R-ivLUZ>T1r8bI{9T?17_59|=FBfCdBuBHvD!g? 
z_B2{DM0UXiQ!J{H1ibu<-Lkf}ID57Zpbjh;+t+lDf(1@uK<@AeNwF%0tu{Bu0;7;N z2KP4c%m&p5&xu3{+z}jjPFVcDgR3{3+bbu0wjD~hdO`K=vG7ySf5p+F6LE6NGaJnO zk@M;bwFF|yVxiYM0#?MD-d((_&>=8rb}jktd{sv%lVJN8Hng6awAI>4pHWmKMHKoV zLv1Gipv8QNJqwj55LTkG9c+jq*dQv$-e(^t0Vfe$dq2qK;_@E=OMSzO@?Du9BIqqn z*C{Z%^9i|7)bew=g3yut0WAXqPks6vIW0qyjR&GQp%$UVQ2Y4>lQQ7N>X@VBB?8nW zfjY3jO2F{Iy7Kk>%)zZ&v-`eH1!mk_UG2xn%={Ak7~(N?R{FWl@T@m-39rF!nq+!F zYl;FV-*?^>qn*Ipx8M17PY?c}|@#1NIvU)SEzk#_!H3ud04w7e!-?uc|HWLa0J21~KT?R!CR2Xzw$4S$K< z===9tB)ouJ!Z>{<79S4}?>_(a(56yv*0He{pE;8hqT z9|@{3fKLO%3{Z&$;KoHvxJ|Eee;V1uyQDXkB^hQz#0}{O7HSYNW9eJVi)^LDIr6?T1k< zo`9SPZ=N}(?RESd8A!D!mqzzE^mbmbvT`mn@W;u4*p^EOjdU$}2(1CW8@lnfHQh6D z0ppvK9zEK)Jz#@=_KR{a6T{L-XC$8_CLQ-0yt@4dY({BkeH2BCYwU4nLJ>f$r;jX6 zt^vKj8izk`YEY zI|6{krbH@kmuZe6wA279Im6ZlUcPMN>zmzm@uc#kE%e{Yn2HGrtxIY#NSV8M;X)Ex z3cNKewHvKhsg9)d*s5d!3x=#5I6;{iSi2D@5RH5OBNOxlmRxYO>o&T)?T|e(wFYnT zDo3dRW=97@YT;-TDgI#nbjq`Bivsb~|YMadHzo7g!31DDHtpos zOYfw$_}rp0ALBN(lK3Q^w?q0AbaHSOK)br*+xdf`8Y0-_hqb~Ps$Yho3 zTRJvn@k^HFiWcrz(P@RJ+bL3;H(%ScLW^;H#tDlGcIoWE#z9wHV@6rOvxlH2M*x$r z+V~|5mRwzfre;GZ+2MjuSLwvmcuS8D$pAnnhCg_BQe>SaB*V7ue;=7}G^)K@TwV&^ zpU)w74=dp8dB-8mu|Y7Ty}5b($*4+QYjIBy*F)FjZQd1;)G7%t_MfI2_V-u+J0iLl z@Wp=5-Pu1sR|i{Bv+0OpB7%30dF$3CxlS+PXbMpNiP9>JNedmnuyI9Q7vo zBvVURU3AuikDvcCsL9$B8P%u1LS6A|K}r2Se`dyQKAdy@woJl3KV^18k<<2c^G0V# zAW9PXkNjm&Q1G2__mUTX1@SIMD7}ea+jYkG)gkr`0b9?1F5Zt4Xm6^l^}iEKYwo-1 z*fCocx{J1Z%O$<{LiV6OKP+Stu*`b!VkBKL9k(i2&ES%pNb_Z%I~o@5ie2#DcCT>M z2h}8mHwp+0Mh~caysEp>99Q(Z_Xq;aJ4eQE$(^UivfaEV=4nI2iA^# z(O`tG)9HCv8gA}H*?aM4T?#UhI;!UH?_XjS#Z})Jj9y-6r%o2z$`W%@fg~mAE`rha z0K*{BB+xAA^=CE*jie}vWD0!x4OQc6<-9c9ez#p)dutYE8MntBlarHk_c+cY9D)QW3~fN%e^)4e@E|AdTzsYe z_^5y&<=XAyC+0_V-91k5>_+V+50VD@Z5p{0Z5N1u@*63gE%p}Or8nAg;>^2EnP^ov zv$EFWA~|tG0?j^XDZ~nPv9zbF?BgSo5PBdCxQ|1vXa5X)*t3OJhcDLV}H9s6L7C zHsaNW8nMBxvYP7f;nQa?sy?rvx8I5r8z%kNLB=dNUA>wRGQf*M;MUQSEX}oR-ag8x z{d53lqJ%gJund&bxkCno#u7Iz? 
z?0Aq;rLW*!8hKuXGV)|wM+g7Lt#AHp_m4-DgdwAb=W)96H^B#3j!=mk6~3shPKfNS zgboc8HXDF-ie~ihvkb9mhl#&6I)D5>>eOp4q@N}5P+V+ zJRH8C;eec?J{6m1U2z|6!zn@+qO0|j{HSvucf+B2RlUG-0D|ZdqIa3!Qc6kyZ35Jg z+^)QVW5*b|9vmw;JGK8{PsVw4y9bET0|tEOoF`R^LE8t@SkO>l_wGBWeWDg0l>5oN zv7-4iAL?8Zzwo2GY);$@MhVOJkmDV&Jj;$6G!;Ce!EjwCev$Ra3{o9Vw6ptxr05A; z`fQvHClm!>nL=$V7y_`_|3yG=k>{?IV zehb&^8t3#o;xUZ?uaZD#Ax0uuS&dS9?gy1s8+iB?t*Y5{j$pqOiwRR$=m-i#x;>5! zxF5Up`wH50a*K}~EjY^%ZK%cL$*3AW*L@Qp7CQc8S$dXsc3$sZc}XXA2CLFEH-qO* zC*E1ITK?Dv5c7?n)epL&{zD-F3koFDUoda5N1uOh&31p!mvxt^k*@ek-+U0u2V~yc z?aQ>(46CFX1fEl47!G8$!PHVhl6mZ>ZtMe5?(1)JgO0QR=l*~dK=~DCH zK~0{gkZ;6&`tqfBuJ}*afsaL_Tf-E^x#TydDC&viYSb@$ z8C}j-^oxvKY4+*5HZ)&xu9Sx~QnE8U1*-TnOY{wjVLApA>fc^w7TUR)Nt?mOCXY(w>7z^o-oYKa z4_vm}i!N;JS&@K@bPN#rgt{1O=C%#??ZS5(c@Hg9usaz{CXVww4Si#!^r1CEaVb5% zV`i>bRYAdMy@isNXnpfOCa$uOi$D1k=Qqh`y|Tsc;gQ{H#Zhhh>c{x4{NoH~o9`ck z6fQ(MIsR?iK^0<83)_o5t7z>7K!$@B$5nt22*Exb%Ez)f+3wO15HP8ORC#&jRbF1P zfv$c`mk})3&}9De7lO|=1-?==D7^NnF@~69{Af*Oz=4oCE=b4CXa8u6#I9WvRh*gL zavR@JHO;Qc%>LWEVK-Ij=wefjb20UGXU~ImqkjiFZ`0(iij+2KJnvVv`hAbv#+lKd z&%Nwk^j&daF|Rsazx=#MuzTZc`<#i+?(>;T3q`D*0h~_z>y{P{)Ny@&QdVj1^S~yd zYDM$n(Ffu=6T3>TBy4?`IKXj(>v6^W>ZbJJ~2Ay)#v@CO>TF5X@X=+pv?Q}h1sCM2)XLn!zEP1fzs!H zFBBP5rO6eHF7H{lnu_7;$gIy;x&PRU(bt94JE_=L9p@QaxG+q8PElKU`I0)n_4|$c z{VT$DT+)4E`ND{2bYc17rB*qc)2*|XU(B4Hzs?@sTDf9!g-R{ZtTQK}%>Uf#aEV=I zJwn^#_nfD`R^HRI^~H?$-!|oj$)3@GD%#1eiO}z}OdEfHJN}P_dMf=@ket?+Sg8yS zP0d{(W@MC!td1@{ID!j?%Lt9&3Xy zp^Z9~?ukHQiLzW=ke0hS&Yx!t`A#Fn4s;E%k4j0KWn@%@ca3s%j`B)MM&H`jbZl5q zKCISuFL}$L{)P~UkVG0OshUW*qdX2ctp3WD-d>D>FJoX(UgK7zaQ^#_V3B9fK0Ytm zgVlT2Qt7;T+w+qb&~Fhxcd)VGHT)Ih3qJ4y)Yp{9#|L)pEHSuQd?)gO%*mlmU$#x; z{ZXEg5l#%W#seN*RiQ<({Y;`WNBXEKL^xpLfvDpCi$tyRnnwEcc)IEpn&ir9^Y z{miR+nK$>L{o%@DPImUD;J0!ocekMB<=sfUfaF68W=1r zSlbLhlE~1{Ig2PcAUXG?otuF`+VW_@zP_?3 zAW1)uJaq}fTW8L^IKBTw>yur29@5l7UyyC^vA~7_B@jvV#b8Vl$%Ya!TOdyzNLLt0 zKU4X4Ml;jb58Ywm9aHML6JbW5MG<4)_bL=4gekZ_Fe3(-*ylj})Zpe1ej~@9;TL71 z8LQ=`TvwL7Dwn3!Ab5*C=uV-E3s^ZoiAq-=Dl09WhITz_*x*up`X0_OV)p~QjfD9g 
zo*!T)7BNWYdvDwD{eE2R)1=(p^Z@~X|5d}rBv}x5Kjx5R07^;`l8*pwYV@$LOj*C% zxXDfA2;95HPMdxofY212i(4S?BeETEU2g2El*`hK8D4t1=>z&5Gc@zrDeQ~U2Gt410I1vv^XK-5Lg1^SprRvtY`I3d?cx-ge1RF zM4slsm75Whf24MStqb7&W08aYz2*7Ft|34rx zfW?ti`rza@NT}o6?YA!}Fo+N$3$EuUQA!9PMYGZ_DCFrl;q*2ls3W&xF{)BK@i( zMBLDF-?NT_MC}8lFeBqiAzECo6&xQuXm|;b6FD4GgPSo1t!r$&fr2w~c8*h0h$BlRGf6p-sIoJEDuVoL(lv+K_ME_@lQfAf@*;8qCk!{BWlIgs2D$8_+k$J<+}G!IA@qy--h}DWf4zOIK&GE4g)ezu&@M z)s+0wGE;cVZ9vT7DOEk-_Cn|=Zw;Yzg`O}J&}NCa(fjUqx3Ns2HO(lKVNi73+OK;+ z8adwxvbYoz)49fUDB+IptoQP^IaHx&Pto2Zhc~l>b_OIGj8d>KagDKFzkM6BFA?kJ zy&hGdXJFWWMI+TmxbHGLV;CJ0qztoDqS|+IGP@<(X#%(b(0-BAO=+|fstd`TqfomO zOS?LoRw#owHh5_GpLKA!5fv3x@2dN$_i8$EorNXLp1SRU|LH@_x=nX7do14K+OYQA zxpRa|+1zYhfA-ILttiWc^rsCaDoHKKk2m@L>PCALC`QN>ASyzrO<*79>gtO6K5bm^ z$nGMSoBD%Yk#TXF$PYn=cR>7ToA6H!!3c}JoH&S{{N>tkMLT0w8XQbC8y@b`raZy` z-OAet!od+%SC;)8WSp4~YP-O3rwNicIsJ1dxCWL^PPf9snzp?|Tzx;h$+E_u&{JFY z`CG99^g@=earX&!+J;xlZzzPef{A*2x0}3pic)PZoR8P6VT^$OrGpz58uNpDYs5GwTFgoWJt41hdV8~4I~K}h|82|@s?-jM`*1K6v* z@-WeIz!-fG^fyl)?f{^Rks>1phe7uVX4%9x;gBMra%kH&GcyMR$J?m-ckcajRl-opvctPS0%At#!8-^#zr4NJ{ zb!Y(&3*sl@h`>3jqNyo#@SxEfN|4b}d>5>zl2cP3SVci#1b|=v-}Ah=DZSv-PF4AL z#fQY2Asvl;zPPpEv~HICimi@T)7k{O?}^v?03x{nfw!lw!hw^MAiQUg?Zp_eRf1 zjI5|)??w5Ktqrc$8Et?uSc!>~YL}Mw9h1~nphfD7YIp(^5pW-&3t0`wh@!G^zTb~H zf`3VA9ezJL3)&k(k=2SC!NzO1$L1*hHD71rT6UP;ey33u8_eq!5`>E?ti-zBaa6iw z!oQ1!?pTz-zLKf#^7$TtpAsA!P$G1H$eu3ml&;zybYG>ucgerf1F7S|)#tzL@h#5N z{`6yZLA&=eS8w-K7bhm!{l!40mNvz_otM5n6~53{%=JHT2TzFSIt%h7?`L<74Qt4)@q&-)WWB-#GP% zD;l~Kk)8cD%z+>*t_2*hE8p{E0nA+_(-^8eyITFCoz!Cf+QV6SG4(_o5Oy1H~@Q-v~Lq`oulIs6ubgrpUYI=F<8EM zkuZzaCma9MIOEQ#Caze3^ML%3u?mEFptWjcv=bLDoKHBBQH^Ybzn(%=#sk7_I+yZzG-;4cPi;tUV!6_-$ zDRVUeH3#$dJF+o|` zt%11R_BnOEL2+_oW*Kg1le4o@Kg|&-2no__sADmxBu0C!92&?z1y%x@%GESz}O+BXB(qwzV5Lh%XheZzRwo%e5NWNT@G)#Kdlb{fp{vhVzhg_oiA5 zPX%=wA8p@^p}3cq*F#yaaKaVD@T?`%P*Tq5+dT5b2ptgf&mH(?n99T;ix71WsIW_r zz!E6{**?JRh#eayX0)`la!D8$@!(DY(; z?}3(%gfx<81S#lag^%98b3*HajB-YDdsL*4KQF1YtCTcXSY|JrE 
zqn*Trp%n{8wMYT=Ad&jP?uMjHz=wbrez@dagAYfZRXtNQjx5-&$idkITq1rPhmfxE z#WXq$bn!nAP_>GJ_YZMQy}!><6+OZ#QumV2o@hKVKqcf(RKp*j+ykZxRm*mA%Hgr% zaSIa`KM8F?P#|a6+XvFOu|B3r1Ton0aMH)TEdgmkQ419iB=nPBE~hglTo)HVR^V4S z{DOZn1G9jeu&2Qax{T5WTdl&(+!7mb`6OQkk(5F3_dd_^cxz2RzLXqD7h{RvU6R z0T4{|&AiBAboOnmMR@TX$O@!4#@dw!;7An4*t}Q^+~^EC-;@k%(9$~yM8w3r26uq` zEe`pV->fVwxfrZf&VT2M$p4P|AG{3k^l+5~VGKG2N*y_isK~;^M}`@z70B1fMxI2b z+z0Fh5_1LU9xHGiY<9F$W)T;53uSa32oJ5Q1Gv4&UqKsYfw=~L@l_=MAi3`ZRFcTh z*bh^wc-%PnEY<=iCl<(9SQvAkfuRm)rfj%n2!{l(w+Z%h>y$GJK=DXEm8uc7GELt= z#L6Hjn-e2W?~2McGVI3n`U1L52P4Or#z9bwgU~X#5>LL?R^lj&1xtqi_#B8IVNh_e zAt);l4VS|J7cO_u2)`g1)o{Rt)=M|#w*&4R0%)K(TXh;{)SRiveE|LG?1n88)S{xI zxR?KcI&dvCGy(gQaADw}ID=(GrG>ml7^4rvjRoBBt%rSMQGk6wEh(43n%sOC5K;{9 zvPRxUb2g1j<2)+uh&g)YaUJZPTDw^`I%!Q8a@LUQzK{V42}k(%fQ))Y5rxj z+%St_`+vNhc~sB;+U_%xIf+d9rBWeL zF_nZuWL7do8P4nW+xt0Zoqx`MXRr0FXKkDMem|f0`yQ_Ay04p;9EJ8uzAoYbP(4Lc z|4gEWj4kLEIt0U!-Gig31{U29ohRm|Ls|CH#*ne!;_2e&|%h29i{6c_kg`G16cn6R}j3+q|viK4VOfgVpO7(|F z*H8x2buJ4yxW9h2AWTSrdUT&~C~JoP-qY`v4JNg!Ni1h>T2(<7mcEYMDQ?>sY9&lP zvjYd49ZYN21HrgO{u584P(7zIF8E*h34LvbXu28{CIZ9%+nXDIblSi_mGxC;?KN-<0bBCKnbaS|b- zyX{k;LPNR0fy058&K~bc&h>1E)Lv}o%gCq`xal+X=6<_>-WA4gL8wUiAzrv#mR72& z^U|l2tRX1r!fYQsv05|-Y1nrrdl-njV~y+I!M#n3BlEUe$Bxzug=->kG-Z;7Z;2iY)l~Kj{TXxS zY=J{8!vi3D5n&v~CW0h{02Tz;SihcFDDU07cdGK*k+;J6HK>w8h!DXKlNbiHu*k~H z%v5es=U)5aY%b^;@C<$M-f!M3RO0Fmt1X#L-bm&3F$?yek98+S~#Nimk@^Kvy zH6M*ru{OE_a3m*R&dKp3+yiQ&?&8?2*s#v@SWe&0TQ zDD4KM4Ieysa9L%gM~CMKq;w27(=Pf$>7!sFrUvcX=is&uBPk95u7t_fwKiDyfhh)A zrfq+)%QQ z!Xi<{KI+(QOE2S|<&*o+b5VgbYna)qY95^~jTi-=ONZCoA`PU?AOf5&CibH2zHHpE z!9wi;EqMNxc9&@O+LbWME0~j5;7qRv|gkTVLZIOTmf2ofq=# z{d)_X0_)@yJ>fLGP<-+Btv_&!S>PQ6e^F0aX16OO!~{aw^Ut6LXRv2c#3vOMqWj`D zRi8i0TofMw*^z?$9-aVhE<|q}|5(h{BCxa_nSu||(9~c#oBWPlZBF4tTE^kdEvh=6 z;He$h%ccJIE0 zoci{D;{1nd%F0NFjAL!5kGzF|sRaPP^}4=+7t^=_(l~I8@_%#PF*a&q>O4?i|L&Jx z_#d_)pEzDzt%|J%3nyYvl)Zbm4KJuK4F)5+>9X$UVjj15b%iUJlo9St^lQ&wy}AZ@ z90qU=6q!Zv=cj(e4o?0Dya)cRcSwzZkMvq0fVLk#0RMEuyaR(u3QHa|H#hG~3Bf_n 
zaPx)p=c%9pVi*mRA);POSD`> zkBU`%a!Xsxp2RY8G+HLrLY~ow&Q%e6K4r^c&Co{m)Ut^0K!De*8Z%CIqY*#AfQD zcg_{HPl_{Wac^|su3aM$q5eDHwQJu#a~5)l^ILXyL+EaBE7pfxdhUtWjnlQ{WRcS=f;h0J$f|7sk8h>D5IIQblq5w zo{~~Vmh!Ug9aIwtajdCf%i zc)`IZP8rdV|RHuhNeJ1)r5C$3Z!YYev@f3kGhHj-_ zjQ~oD3>Pd5xrKoru?ql7ZdJclAI#JmUG4eT>bg?drD`3u-YEW3;re> z7aDMzQSf|6AByN;xa_@f_wEItv}ESl&b0ID8{dwi74J*yVt{kTbLx}&4I6&KI9Fu2 zrG|6=xI0hL3JU8vz;-14AYgnq{$CkCYBn8$Xlkj}KVlyNj5?k>L2#HwPjc+$z5$cU z+*WpvOPR32AG@$XV#Oy`LM&l4kM<&$wi#cJXHIu|EPj^WnXhZRdt_UU_)#ElLc=pf z{S0RY`G^IQ*ERcO*=P#^)$<{{q@_9BMAyOMH$(3@Zfz2y_n5eJUQdE z#_+2m@Qh1&FBj}Dae|*-v_+||I z`Ee9CT9y4PeC1)34dw$jNK5qnzg~K4O{B5 z$1FGHJ4QU8*&<58Ew(BYZX5{43>kE4cr@e!Jwbl^7EPM`V`x}-$~x@Oq4Be4zwA}@ zw5=tRTGQ8jZHq#)zQ)L-;6GeKTy{r|8PkwTyN{(Er=6*+z4Yhd6SXM_dK+#=ff4}T z&tkt7a1PGSqgXCJZyi-28oY?okE2YPo1}-^-oenHw;V<-L%}y|NugqsCOYN6!6@;T zRV4L+V2bL@ErEO)R(7;U5JR6I;UV{(uS=MJ&FWVGXn~PI7nl?3+DrR>{N$G04T`n- z?}-haxW0`MCPR*jI%gSXsgjP`dw@{H8aRF`S?1*xji~r+S{g3mGajk|zeh*UiP{s_ zw?lx;=7SOLJ6)n3FX6ONqehbk?!jD{#s+0AKiciTze7aYqT!YG!3=Ez)Btfn+>lFVjTN(-H zOi75@*YUpr-gIo?R(6~La)Y-h<6n-yAdwv1sZ#s)TdBxdO4@1gvgTJyLF^?((wmoL>INftTjICvpdX!$H2|GxlV+)o)diYRe z`^*`5bl3vn#tLQ8UX$x6uTZoRKVVg4WQzKP8b>A(iI#m2o?G{4Eq9Im9;nw^F%v?2 zXZQAkd;YP2>#csal7?5vaHi&cEVM@PINpqg7BL>kVssF4-Fj&aJPr_)eN^Z`U0vN- zI6T9DCKlA%B3rIw&@k87jlU~$1Bnd6VIqu z1kQzWzYD#?(Hgz6VRKz4ZGlWUlw$iTzNDLWQE1Hc@VEo_&Vus`SMi_4p!XF>fs6zZ z0qWk3o}ea^Ctyq%B(~xrl>yB6OSEnd4$&JaDGDE$6l`^7bNM6IJ{h%@dJeVr^PmaC zj({$qLxn^UGhqBWgzI6iN1jc7Kg6yu`jf(^q@;F_yII&95)&ddaiF}481p$OXZRA8qBwIRgG zVdH0S0gk2`{Zjeluz4Wh^2kxZ0>UlRKyXL-dhtFxWHA(A{>D<;$`v3G!GswSKf_K{F-2itw~O=(8By{+*92Z;denb2igrD^5b4QvJ!zmYrmyM{hE-llc;v zP{0vLjskr@LqrIfXA-OQm8BGykCLFov+I7c6YCJc{QFIV<)jnM$`#D3vFSc z8E2l*Z0hL+bp&E2tvqxY$-|L#ZZOG^D#l1hsD_mEG7JwWg~oy@h59c z2n;VK#7bOd%6E(_X8_qsW;x@|wC~Y3CGoL<9vNp3CRoo}_yly8z%4XoOiYaR(r-Yj z_PELMo=GXl+2`jEu`bg|CEw6Ty067u0#wxEiQT*R z@xKhz|HW6U@A=%|f7tF36T7um{6Fb#Hx1+elMWvlFj_m|fB)^SF}waBY`D6z-G7;M z=<)(ntk?e1>VLWO4ITU##s1$v?*I2Mm)IN>cdqr$!Ms|E?<#_#EXd@R&~+GV1s%Z! 
z=Zt%2XyKN+At>Ku{Tvt_V`Gz81TFv%JJ;}q%4XRXB_9_C+)F%BDX?MKM)6vZOUoH1 zL&&rtMWHkaVi+=f_+gHw0qi&NN316x2_Tk|NfeYCTOj9<{-mpgh3{EJ<-+e8gYzo= zU|~t8_2K3M?^7urBZfei8-4r=*~$j2ZulDw95l$*tGfP9eaI~0l(M34VB7ksDqxa! z@g~vvjDOmd+*+v-h=Y=|=YTRREl>c|ahnKHc^B7sQ;cn*)a2u@@b;EHf4C2<6TOEN zPAK+S(gi3f%m3gw7$fUMl)*4{2XR8UZjzIMfU_=xDyPQe8i9~IOvpA zx3)N#GrZLT+7bNW2fsshf&iyrrPiu*=K$Qx#i#hzt;BS-{o#9(p}-}cQ)5(p`V?33 zvT=ReD;RF=HE?byuZLo|?_ELKBfQ#H>za1r3{bNCxNs{qyq# z2|2_Nv`7O#kt5N#_?$a8gaMXWL^Rnb;(9|xdQd<>E{*m8B0EJ6VV7TvBr^D+ z2WDtDZ;s}TdU)`31Lk>_QuBv~f-O_UwctGBc7{kdV9Z9;2#IzaS>zPW$&Um+XkL*a6>sm_soxmzjVO92 z9nZsu=>=!+ca)0g(|LJ=c%9c z!uXW@yo?!LS?`N3nP=A{YtqQBXFH<;qLC&RGN_sH_;IIzF_{2tE~+ zQ?~Em!2mi7t`%50-(D7{@qmtx^pS~On|<0)I}@zt z=aWCb)YyD@gi@MDJGbAmF0mS`}#KLe&(m`aHgIIFR%pdGE2+}c4@sI)OJ4tkZ&Ld6dzic1stcHo8(Y6AA;tZ_(nYhY@8mu`Ma+7iuwGpiW4Vj+Nw zr;v6;vyHrDhH+MYNG+1W2j$mK7-0HS;nhgJVf`06#Ck_ZNB6}93xV8aihcW%oyHSy zAh0cyX(={b@nR*c$zBN#Hd#><2A{I35H=YZE9I-y5@=#u z9KZ0b18Peg+4gXrgi_1isdfjBmtIkw2S~~M6k!N0&q$a!%tG<$7uP4*;!0AFugz=h zrpgZxxisiI%5a_Pi0oBxqGrhz5$bYm})pz2%8#N@ww!w8GZnfQ1|06z#;VN5>W7=l37Gb2o4L za0tro-MZDAI`xrtt-0N7U_}wBAPT?;Z0~QAv14Tl8>DT7&bY!0XMap%FiQ6L!0X*` z-Z?azkUJMif)K2HCk0yi^%QZRk->i=2%e-Q`u)4sLxCkeSAfZ71Gf0^0`6?@GISp} zJ}iDQDqtZjY1`E05Nc}rvH>Af4kDT+X3KsMtHRg&RU<;92GceRy^P+e4^}3SLw@IGOmIy8Icq z!T=r@?AXisvwzE~2Y^~Kn{1jp|M~V1 zI%aX|twK?r%zPsDBv%k{!MuBL3TA@>NEXHp^oWhF2!m9OR_Jq+40^r5ip_+`NkyoM z21Ec>I^tw35~WjR3Eybi#~4zoJZ9h=8sJa7rIba_jHN*Yuwta3A8Au$RKn(?|6la+ z=|}~UE=j85F6H2<3%DvDlcGW*5ClxN4|(Q*rv2$e6%xwAeg;$>a%Yj@Eac;UvW)q#dSF6mqd*S ze%5(T9GSV#?3?X{&bW1}!DSo{;8&>Q&1r$bstf{ZIcNv@$zpRwt`++S5e4!Pj+>1P z0OVkn5>PY$J2_)Ej%hS0NPD!d45;6Bhjr`=3Zs`;0m-gR<0BKrpz#x64yH3zU%q6K z0)uu<gfCN4f_8`2 zEmage*8)1Fk0txdM6p4Se;wW0?DW~F7+~{{C2vQ1S5l_cpUGaT7J2~nrBUJ5STnG$ zQU*ctzt1l{KZs&4-TC}7jGcOId(B@uvwEqMqoaaOb#}mWeoOV9&j$}4tfNyMmG0h2 zLBU7CZ1hN*&5J^nZ5m3?f#X@Q@nAB|vcV;tuN?rjCAN=6Nsl)?a*4r^H(;x~ zlveqbXPQ2H{V=r$Z+teTmBg&=Y-(xzd4nS3W^D2_Uo)<)qDkzDHGR*tIe(+u)}}|C 
zCU$P$Z1dEMhdBq-vS{HwymwFu&08!&BGwS>ARS4!CE8OlHV3X1VzF!X@|CUbO*LeL z!wuByqJ<)n%&2&rh6|h;-r}4ndPPxw3S1-z8R~yhg+-4*(wPgObZSA zlcg0PP+XWdOWXtjevWw;vC)IujA2$sQrR7e6Le$xe%p zt)#qXdRYYjRIZm#X`e(L=6$Fmgb>xPfUiJ$kpPI&X9D3LS*_^SlN(|M`wD|)W+Dvol_hL7^I2*gu7=zO$DSf`ENbN zObHtg+H~%`@Y9lF6qelaUYgSxejY-T2S~g3(?af=$PN#!dK=p(EgU87E|Oh4;0RE* zYm`e;yP)Q?^7Q&}Wb~9N^A&i{tkc*8cNYBc(B8cw^338%+$Qmufgx>a(Rmgz^VotF zOVWOxepUBOLbIImPu9l21Kj7953=j}cXQZz_O?p$B9tEDtR#b~5Qv3J;q-eAUB`Y2 zs$tPmh+-MF9=qVfKB~7Wm`k=139`_!w55XeP?I!;^r0v7x}(uS(xwD-8fqCESoiyf zI52)BO+Wrz3YZCU_ydv&XS<{FWnf{;BLH))ck!bf5F44So#SxEhs&3R8?sws3O2=N7+ zmoF#MTdZ3zS1Pq{IBlDM1**(Y*x{U*|j0i+K=V^vU$0P$TZL0B|hr=mx#=1NuuwH7kP1AE zJ}Vp1E<6L01jdk-I5H0}oE4}6t+bVCP{9rbAxGxGiOL^Ji$(EsY_vDySqMDrTI=kk;7R)8P(hIntGzTJK zPt|lWNT3@fVhP#6N3VC!o0W@mbzNig-)QUU9U};xDM(tUyi2vmiR9V98Bi+EQ8cHy z`WDUYw*+Y#HzJEn;P!AD!fNUiVh_7gcOpVzrtZu2nw3m*fmvwx?3qPpj~*(>r(4W4 z5*D*E%q89hV0zRA=bM{+lQ--Pri|ChSVBgiXl+9s1PzmX=FIVm3L9O}38+>M5B5*) zX~Qp)1?5S;f8TT1W81BHb6o&bIDe%Eon&2w+$I3{Yc@th;k#)c;U;8{fDD9bw5tOr z1=C9*PRo<~>rix56hIt{Pm;bq+4>fZid=I+Zbyu`+^crn#m~YSpb0+qL1u$+4IvDR z8jgNt3=vXIRdkvR2kaOhjYHNgjprfTS7O13M`^bor9~otBdJAkqs};;zxVE|0MjUa z=!ol?J|BifGGTl-f)|>ne@&Z`($7sQhj8FIZpJ^8;!dmOSfn;2e`4C-Z)u^^yZ27a zcKHvano6mEwOLey^(=%Doexr6??+Ex&`ru^Wd)@9!aw9w*{ngSE2hCSuhd7Y5MMjU zw&RktPH4M23~PB;QPpC3%cLaOqTY7Yyz!glklb9zWD$JsPc{^*eMYcIGK{l`1J1H7RZ5~tf}tI z9oJ3$h3Frz*|k(YDFn>FJXPUWZPWu6sx}_|OoO-oc;x!6t<6x76h_y?6^^^j4_UCF zC)a&p?Drb>M{yf6!9Y3pFA?O*rkOa7T~omeYii0EZ?(1?src&iw{QKgdOq=^{bAZ} zE9C4%J2fBkl}k9Ov(PSg8ou_i%iD{)fJJ3Cn}8A9ehX!QEU*-Y2|C!6##4ADf#|qQ zu@!SPTkRD&1$<~r#^<~`?UxlQjKV~nUWizXDpArN!0rR~6jF%~k|F{zR7{0?llndy zNADn`Cvz-%cEuMzz@m2&nNhm+N?pj$AP#*5VudKpVD``+n1ov`rn>3pv|1@x-})rD zF(5^O+t-CVb#~dJ_keKMU$~P&J4kc}^bUalFs5-`xKkKpOBI|-(?!Y-ktBo8Udux5 zaEBv!qkIc~h|lS4&TTdca|aBmgbcdE*)&RLLf+w~e$2XO0@4!ZN26_Y{FR6WqT=CK zeE}RTxX1x{`s~?AnzDZdlrr-}*5af#sUIcYO_(m9jY^u+1Vje_=lPzeH9TIC)6l9A zLs<*ajJ}vTXa-7^8e4!}ektVqLd6Sga+M<;AoH2`T|N; 
zTlCXtWur+ABY@VhH8qb)TZsHo6aPH9YyYEipWXX6Mk^_(8|A<8R^gQcO3e?-DDjAVr1#968JE-Rk z`}aR`x-o&U3VCvvZu3jp_wLyct&BgHl~t^`L<75&>dv9M^YH-Z^Kq}V&=A8%Q!b=c&bN8fGtN3zNCC1w zHlYe46Aw@jFTu^Xw8zn+x7-}(qL^9}HR z=IeeC6>F<2U(p2!r*alcHi2ija6dp+J>Ep)*+Q$Mv@|%lrFE>0*ZD*2phAiQI{%R= z7TcIv)(1(WthbU(`FCoa8yc^%iM<;#X~E!Wla?)yrz}1~Q~;*s&XA;+{{H#Hy0&@4 zQLznl(4c%-bs@IPn-4suIdpQcid#8mCDyaw%`8W5z;P{1|J23SFuM*fcao0cyAthP zwqT~R_SPh;V@tO2ZEQRCQUjDST|Rk6$FpQVgr6bll{ zHnu@;#+}5{ltFFuY$o8U3J4009y_*AG4aXGl0#c)M5$0^-h|0nhrZeM>ea)D{kW&| zV|qIVIP7ZHm}h^dw;iXsA!Y$Y_mt!rb%b$3h6@5PB4u!i&UkaZ&?lanMewt5-g#g_ z%+DsOKG!T>Gk^cpBXQXSFB3FwahWA#!6X1m=XKKJ9jE~yeeIoguXz= znP{nP!`eC7;>NJT_O_+vzJ3&Hl{?HX<(h}AHH?gm)alt1dx>Fnj%IbC$PqAAbla$X zIJ<7C9pJSm<<5oSJvZ&l(Qj4@TQ3>me>`@faf?OQ_TKS7=3-%!I%5%Ipxkjmon`(_ z(>4+sECX&E9w#~`rtyVoJ9(@CCEQAHEd%He+qhtx2GAj?+1<*<#JSDtq59mmS9|JH zoK&xqBsW-S^-Vl0TX!)RYsC$P*tWbj2Zgn}``SAW;W((-HTg6maq4!RBf7o=pu=_5 zo|8k-K|Zmw`@t;hBC>5rC*57P=+Y{5;tR43rah{^GUj1g?BEwgv>7y@Sl)I}!w$Bq zeaJ1A24XtQAPIT%v3fj<=7hoZ{ub3=Cw}6PzmKl%*OK^qyZVy;47qww!*Z%vL6hEHH_Zk;o|zKo~w`uN>q z?GL0hlT`w^2&@mc5RDD(M}8$`vmnBv*u{1L%e;k5B`d6U=0h+0a!sE{#=;lp%gXp({djcl-jc79LEXW~#7x zPc9Dqmp*_Kx`R56&6O{pI+H41Htk{+XEW2&%j+6G7+=mYrK@9KX9NW-15vJk3|t%$ z@kQ9^GW_H0IlMdH0?sNpMLjg@1I;KZgppJ^_9tdF5OS5WcP5Pqm93JBirrr;g-voM zi<`}e3+k+_ya~KeW-giQoXO)NSe^i8Ci{9$9?*AvxYlClCLO_ZqnQEW36s2Kc#l^9 z5DpJwtVydMckWHyjG?q#Oj<@4^oP6z$Fw*R-Ffh`8E9OfeTIu55hMdcj`3{f^c*n# zBSSWd^C>d%>^tx5$hzHAiHV6aB=|!lR2RROHAOP>o&O$xGr*>+c1vcBx#=Z_#Q~0b zbGUU{lTVO@w8kH}PVvR7Xi5P9tQ2uStya;k5ufwRyQ0b@y>~b|BT#z$;y%Ki0YaY< zteKcWyM|-VU58hSIx(Fo5I!+FR2p%#)Rd|w*5ec$C(KQhnVpa`eE#lTyN0Kq>(=VA zF?fwkmsP%f`IsL_k6>{L;Gr*7zs#Tk5sUhbH>r~1AO0{Scqk=hki7^!lAfLxx9ucib)=376ek@9RSo;8 zRu(xgU9w~f-8!Eb3!0{G?P*XrY57bvpHqBni;cMqIUwqcjC?*cg#YFE67-+Irp0gF z)5ACKV#16dtU7oZ@2gye_L&+EvH-_W?;_{VpFfMbf)KgdbHLPznXHzTv0WZI*JMf4 z@?EwJsgDGjO1$y2_lia6yOlHNaFsdB=)_wDciLrk#l6jN*sm=e1Z|tUMXg#xSaLdTn zOO`%+Z$EIYW6C1kkXBtnDy@crv12a$*1=u1U6`}=)3-c$Boz2Zb(ZzxMub-C-ZLRIvDlS^S`S``%w<}AXtkutX 
z1|`HP`YzjH*Ta6q76W9XKUrZ|Q^RsBuxX=YRFDU*EK-Z{Ll#4{G`r-&g74 z-7F&VS>fcUVah7TwcqX@3n;jD!Tb2kc{=qK|Et+~IupH*;r1)WIc4@|l8!bmz2Un2 z+k%{2?a*;5H;-9rmSgFy)aKdGl>-+WTR*i++#PJxvZdJ|o7kft*kY)dXwmS)Lo@%% zEWdldrk>fn@X?GHrf(kfUwHI$!)Ad!kB*6SF}!s>FxVW}pjTI&>;ca6bbg&YxzX!M zNbt*f`5I?Ry`KE4JiG3X?|Gxx&I2ZAo;A>Tc%|;c(;>mnns{ zhPU52{f{k2a2=uM9eN#&9_MdA^T-S1cV%^JznLYS{`xkf$MK)hJ|)-FmhSewSJbXy z%RS1r7MJ;E@-%(Vq=_=d~wvQ?K#*DYVJzp7u>)r`P@o_^@{wfepNyxO7W1L|6 z%e*u#_5OIQ^Xuc?n~jDkZ_69hR*#x7qg-c0pD>pu&0B8qxa3=4le3T+)+0uq=f(wl z{P(qunCRU~aemt)g?mn0+z#t@b@|%gKMvlRS7`1(q~zpS>)GRyK0orfcgMh4_sp%* z(%tVoY`cbzQ(WQaI^};JVdG8pTpL*J{yw)*_v{L*n$8+UrG4jUlz+-<5_^8!+KVp~ zABSg@9G^E^%k?SkuTGL}ml~fzqbar?cVZ_xDlO%l8{@^tiaIqFc;bcnJ|qA3b{djwe}OmKqHi z(h)G4+Qy>N2ftr9Q%p%FjdsJ2u)2$m)KPU~0N(5A6l2mS%Os!YtxJeO}CwdLr*Lh_`JR;p!3ofG$7DjXys%z&Atr^TZ6TAVJ%b{J z6yCxNs~->q<)u)Rbh?`T`khKwJ2GOTVq~5v5Gaa(=OoVlItlzLq|Kk8*27%dQXZGC zsdfT!2MOyr_4IaTM(f@?R~$dN_vU7*){(AlGa|R}iz(4%0*G>Je|9u9G3tW_kI%N| zY4ZrE0z^0rgiUk*7ZC#Rfn4HgPh0V<0j+=&n1;XxS5}EE6H*W$eq5sIv17-OFI-~k zI0t3BxOnhiDH46b-omY8MV{84DI*9WY3Wvf$r~$E)=;B%)hE+?<>Xk;3#+^onw#!( zG|>H_MqxJ=F6(UUg<($c=-BhDb@@}@a0}I(5^PLLX zHs1cMxpPhVncPgh&oN$%*vYgSZ5HM;KCpFf?w6OBS8)@Z7c97muL}CCGS1ImvBZiM zix4hRMvGGLL~3d|=;KUEJ89ge1dlPZG>+&bbuyjn`^RZPsF4cXQr(@v8ad&jZWgfv z*zTU5_E!KcZ^*d>&6H^$ZmkefGL}j238fjdq-hs3)8IWg#(+pP0qNKP4HB96`o7bp zwFg19Wm>f_#vt^c*z(>HDHhVLGG3DS1Hj@vW6>mn1Np4{$N-mkQyazdvg9#i$F^zP z);BhG-mY>H(zCr0J;+=2u;b@HDGGK)i7vx5u=ZF8Y;)ARhDQ=*BXfCqx^I%*`bEq^ zG9P0b|2kdIY4neZHNVrd+A#jWauH1Kngh`|Z96-kU^5Y|4_voj1efKYcLhTx2R0)~{EUKVDP zqbT;#w7Vg{x(5a-bfxv@b>q|TAM%NbIhr3K|N8&AZRd?)24`fOo)3?DGzb|{0>t7e zBLMhQXl${K$VF5ljw}4Tbho|6Krp`1(cNXj@cQ+wRJfuf5mDTT`sGtm^05nqUV|ce z$bq zYuy$btMjY28g=GAf+Xs}^<^Dly$ox3C&GvwRI!wrla{yqSd5igm#>^(Fs+!H1(G_X1 z#QVMZfQM!a79>;d4rVUc}=NPCr1ra8v+kUfW*5yQ`gEhhCLKw zZ9uS_Njx&SUzlJYMDK1hXPZy5Eo3lUE&Mz`9IC33?Blz%b`9ggj>TIv|2ZODr8t7g zWu+liu`j~Dks~BpkFFbHk+|_)>O@$l0Lu@q&B@DaO`}Hsmos6#;%Mf+prBDW*7Jms zQi$jZr3#v=sEWcx|BeCpY`u3FhbI|U#gmIqft1LEI;8vf$m7R%Lc^ko)m-Zi3?jNT 
zAO&7~$fKNxDJI+Z=H<63GwTz!SFNViRO^UfqLTaLvl-CokTOM1&N6<%qE%^&*{amv zqtpU98146SWMW0GjPZX7yiwl1Lx%|KSh1CnDe&11iYB$&Gl5lP0*RpzqN_wslnh|K zef#!hw-ZjBFyJljJfB_pI4zUk^rrN}IlZ#pR;RhC>=^+m8k=a>MoH-k`5CuRk;yO$ z$C6QzwoIR{2JFGp8wXe}=J3>U{NZ74W-G!OsBhBEyVtKb2AKy#7Nd=eZL^`;#$LUY=4VH`^@ zbcCcp)!XCQ0&^P^^ex3y1~_P~h`Xbcrz^W{QFG3q-r)z!_y(k;2>e7yNzU|+ICPZh zp0%|f2FM~HyqzvX$xNH2lAzzFN}neM>+YF0IKG|U*0qIqhPP9_KYd+;rcH^$WyOWB ze6REj!fKiO$j{)iiFATEQV?gm4bcV47}-lR@NpGRohASy_&N=N<5O| zf}gMGf`oJi$QTS0=|LM68GldooXF9T>Bd?`Hsm8A^qe_&Zp@7&1!%YfGI!?YW`20s z8gjvpQJru7UE|Yk`s&t0hh8wZa=(dWj>9ZNr1)&fa}REBv9e%|5h%)(;b%8x3E{lz!riCcY8iZ7g@3%VF-qe16hm_E!rDG$>xlB>BXNXHrU%YeQ0gf%}l+je(;W zf;*zu{)YxfFpqjL$38@R1A+|^!gTBQJa$<0=q+ERx*$Qq<~I(r$R>XaY|HSQhzo%t zMH}i=|D_eRuqb#5B;zSWMXnj^IsJ3ApI;NuBG8*=&6;5&K#qB8os*iXqhnlB{d}iy zUH!{v7l)*eYwM3G3}$s zuz6I{SgVMi&h<<&G|f07L3~YzuRDauW&`J> z3JKt|ii)JE96y|fwoLa614TtlP)=Fj!ClDnaCDNcXD?Hhb57SCY7V0cCMdetj*ipx(Et9 zcTf*_9*#YsN7(94E?CW4HC41gMHy*wg5HI z!{AY!D{20Ie$S|;yECSS$_#pn<=SEzC2yU_E^spaheGSBJ-JD0tY6gQa$YSi4wKqJ z$2BO&MbuQBYtm5t5zGoTY}l}HSS!t{pCaJo)T`>xP(an$Q!A+mNgaD*i~+I@-3msq z)~8dap00Z|;zT7x!z0SAP^uS|`I*=viica2HN+0;$|h*NOl|d*LdN*Dh&Mj6Vy?|& zyb>Y5$L?8KatEOV#9%{E-8m4;CigF(&XVjZbEf>E$YlAoSAYqryey&)bn>nH@>ad+ z(^phoopb36UpkapE@;o5F|@3nQK!UJ475*d$k2W-g|nlHE3xhvjvugN{lGXSPP`-v z;=^Jz3hHb8N=r*+(1H*p_yStrD>woYoMhW3r=@U%;r#m|k09)$6e?$CK}UU2gNTrS zIP;8baYn>mM+y{dkJ9I_0(rJLFmRX1e6br74@dQ(LzP-$?d;2wlw~WF9FtrKqaS2I zfiE2iwNJ@#f?wiR7-vFw@8MmfsnKN{UVS`e3nuR$FhID=liKkpk@)Aw%$72x$RMvw zo(VON8TzE+b9(<&nbI;b(X2(^jmm&rc6{m_H@95OzgU6M^zfQ*s%WJnkm?C4AXA_i z2=|C|~yu2elgUp%W99UW1-fJ;o!nKTy_1rKO z?}hYds8wEK`-Bep_fzl&x-$;E-is&Yyvx>jgNF?#sW%2dSD)$I1MolN}q&(M|&zzv2nsOevDQSTB|EG z#k#c<Gb0JKp<&IU}tZn8aI6yPw7+ZO43HhYXd3!fn@ z0b4+03PZ!MK*;(;1~Pb*qNtIVi98iUEwPj(bQ|>VpMCXe1L`=Ho>N9(YT^H7Ws8pe zn_=UG%bkyPtP6a3$h}4mh19H!kf{6(tVKhwt!(WtxCX_WIqs zzAHbym@{mOrqxiY0>Voq|AxNi;K7o53qv@940l}?K@FlboOHS>`fgxqJcMq_=G!w< zQ{_wm9N04_6+?!YxN2Cz5*19Q&4<_FZ3{L8DKv|QVRxdsmFC_2J%Q@YTecKFAF{O! 
zLWJs@g!&RsiS`{k_GP!*k{okfG>Anb`AKTUk0z5dKopS7dB(1C+NrjsLDLx(j#MDu zYd}(8TYdjFHAe5r9Xd4V5FAHZ5FTYb^6CM$m0F^XBbKs6JP^^6s4;Mm;paxu&5`r} z?l1cFE6uxF4>6K6J#xsJ6%(>pw2ZIr-LDU9+oE@1@aoE=N9P`B2mRT-XHPbtWd;14 zuJ@T9+sQU0@@!~ZeY%81yZjidZpU?wj~a0@zO%n7r2ORwqwXwKHXXkKGRX*4Fy zza6V>?fJKfJJXGWA5DRoX9EADZG5KT@-CHnlEQ(=Sf*-Wv>&N#A1K{=8VC?vGk~&czBSM zedNO)-UD>o!m@*b&|$Axv&Lz{QbGm;+w#I?`kw&%Ci~CNP%?{H?uZeCygvq-5bYH@ zyN+}1qe_YNyWX`NQ=`sY36qzn%uS89Q5xB#YaiV-dZpN6#x5BVVLmY;vj)5i#TM=~ ztL0KmF0OT$9bY8sa^oVFo@$dYsY`ejfKq+l}V z&)6;VAA+HY11!@JRD2R;_`vitG6@qra0i8VOKz+9c;XDcZF^;4pmq<&TeEN));BQ7 zxO*3&TSp2VdNN6*pr;ilk}h1S(xWb!6q{bQ9!skaf~r$|O2UdKf>>h@2S zk#$6>9c(NsEN`KrKBN;}e)pg2xA7y)J6!Ymj(Vos%sOX0ww2!hXOymYZ#5jDXTYH< zz;5lt)EFuCEd)-UJLVqA-$#>;TMkD@hCO@}8G!6Woq9RNYqD{K`J{-< z4yYp0`20XGh^t(W`z?#vR!=9TTif3JR@&0L@u97z#!XkN`R4oQ*Mxxy)28V!o>lmI zYA+jGy-t~VR1M<#B(Bl3XKRVIGr1p)snN$@W5$f3eik8ZW+TJ5=B74vfvE7U3Fe00 zL*;Jv-c(dm5f1qJ%jh!gT;ZT-PKqEB3>G*fVsV5m`dmif)mDF8Ki(@GS0Uh$*p!wd z|NZQ%xX=uU(y>w#w4GX6R%ZR7)BT$kcZ;^vdS4l`eABlNSO0wd(j}pTW1bi`qG0<> zu$Tpqgp3xR-<4W8{o;i`w>Jj`#>7ecL$wiU9V>k-Do?b=X1C+;6WcU8M9aW{vDwI_ z`uX?nA!IQc;Qkn^yoNaM(_N9Z%gHfpw-Kjq4j$7^+FAII6E`oFLk_Z~r>WSo7K>eN)nu~JIuoI^2 z%^SbNho`6RkB`o~9MJJ;<@93jPm3rVP>E`x_xM!LyY{H+6s1Bxx+8J372hkOZVzv- zzlTwSkJVL2N2KaVpk=QO)sQBLBRVGhlYk?zf<8o*P9Jz3DqqCd6k61+_3dr7uLT9J z7_Xf-lhHkA&^Q1m^XM7JaK6{q4`))|Vz2&@te$3^YV_|e2o}?KDRUSof)_I%&lIQV zA;9R)~$^NKk-SuY%3V04oJt6ay6Z z&Iai2fN|g-qVX2v?=@Wi>DOb+S997^n}G z^fJb`HX@SdKhmEA-jp$Uf+)S_vi78BJ+D&9LyDcVm9giLW(C*tzv&Vwca*tneP=WCUS>cDMVs6Lir%%j+lc}k_lx)21He5&CK_Q!r z{LpD)R3Ogvlp&Us@u#abI&{f5_*ZGm-Y8KP=J&#*z?JgzJj;{NuXPA9zlw@G zmokb6ra=QnSeKb2=D0_Y4MCd*i_n^jKEX5m$B#ud{my^h(SGKnV)EQlTufzxk?$xb zLO?sa%q`-Y{HcBNN1#qJsowI1Ol$=~@$S)c^2hdo`tB86=~1MNfZjVcZK^M;F>zdw z2Psp=ka7jbb2qLVN88~85&d@-lXeS}LxwZ#cQuL}{T22kK<3gJd2WLe%Ed`;3Ozs1 zNanGroz{MT*2wPoG)tVHWZV()95(7LVZ226&uJ!tH>c;0c|R%$N&z7Y~05Wg)Atj+rzKvPx!D z^2Vy=AxA=$Dht0Q$Rd#v+!Mj1%nWh+&Han=*$cW z&C8fbozoYIUkry{df*t#gC>E92#!exh*X(<2LOPbFXwSkq2`0QfBE{A@eZx7U7xO3 
zyc#t!-nZ|ZaLbMcr;nw;sY{BGFawiQ1XHNG)@r8|%kz?wObKck!p1km5q$Dmw z#PQ=3ST#*?=*ZV%vIu%=v8N}H`(cvR)3W6>28{2tcJkLo=e0{-4v5AuGB@MvTf=%_}OR zemsOS?RAp)_cE49M?VH+9xRG+hqFx1xT@{RT59g6Y6sBEUz9oVMN5}v0?hjwXV9Op z_^chpZb3**sn`Td!vWK0(yWSvM;*<)XMV5wK-Vk~8xw28nBordA@W9_z#3*uyC6yI zS=E)+3+eggyu7OQiVM*;ebV?+;k2^51AHLE!f02;0*0R+!4tV-QEX-_tvtvNwohqa z9L0Z+_(1XKpV+i{^C&_?dfX$8c={BeTAp&&RhoOh>EF4YQLU`*WjaqSWJpGm}m!hlZV+2 z!9)0uZ3&ZfLnzRqudg`|L<4@0p|ymN|8|cdLtQvn0lbbkJDq&A;9p)ufYh@7`T6-m z@m!b?@kQzs49|gMOnF-3{Qwsy*F611pSHiYukJvU_;+}-&z}Vx6(b{~>nz#;XfU~N zGG+{WAv`l&F^>e%<~`Tl)HxrFyCiKH6dv2>VKuk7PvFVpDeCT?)Gx=_cr3oYe3uEo zF7(WsDd3B=B^z4~la0n;3xGzMR22;C&7vKS+WqnV+{+OQugj z?9{WDnuOcM=kdb^CHS5k!*K5jPux)ecL}zBhr{R!`5Y7-uD-*!PqMa_1PfXq8wRCN zq8=5iYhDk%;>BRA#QHoadO7|)I4!eAyUQ}Q$+0%fyf#Xj6ko_w4*ecR_+^GZ#BbXA z(O(60BYT4xjpa!CHp(GW-<}Pn=1oMBaOy542t*BbDplce(uvF-@XMs&?O%e4xYVw)qPNc(qKwW z+L5d8bPgsfU9f~GL%1~uVZjyB1p5>+?%mrh*#V5-N$blzszz-qm)BNrcXR#V0c(y- z4@{kZWX=TbN*yELFm1*`2@?wc#C>62VG_J4lt%=c3$>@yNP*wrwj6JMzyrcXNzHU; zdN$MM+guj*@TbWZYz%Iueg6e}^}1qKo{KUCXEp2o3A?3?2aG8wDA27MaJQ&v-D@vP zt)QWO&c1=#;*e=%q`le-ro*$xpXeyp=3f$$(OXqW3@PiVCvC#4A4Q+ArTdlu18^I% zroC+knpuc`!Sksj1kywhz^z8CAbY}g4y(Zxw<$HlJB@DBoN${6 z=Q+~&ok;O&u;NrN>)oB5XUrJE*r8@(T|aec1aN2~2poR9v`bi*R1gtL3(5MFUjVIG zI`i^EA13P?8eT0Z2xfGJo}!Lbg_t10T2nSNVMO~XY$O!|d5w5#{`2t_?Yo)Zx)1B} zyY06g1NcG2|L4$sB#Z4u)4ZEAya(^L1;17t45p>-pE~v6$^t{HLj;Q;HYhS6G-1L{ z`ba2k@K5>Epdc1w6S4*Cplj+fE-PI9fKs9zh8yvPKOvO}JEB^;`79>kbB}+t6UM~8 zzOY8CFS6ZF-g+$!vAcsV3>!l|$!@ zFe;)>q!**65XS~|$#Zc*tUUi&Ou_H|YW{m0bP$#gT4=+LqQDOMW|Vuop>JJ9&j7`y z(-gOvD|+3y`?2&-)7YL+@}}xeIRp7Rwrj$0@PTF1;12?py1iO zWHd^`eWMFQYKI30LwXGWyS`>Mim_qs=f|zDu20$(7M6oV`6j9Q2aSS|GGK2~63dW0 za_3Kv&W;Wd*LP6{fhanoUP5Y5g~4bdP8Fyv8p+}Z@B$PXN~pljRbHH0e6IP{%Ae3n z&_j{JKJfI`J3lYA z0Jftu63=gZRX|{TY4=4ZM0D(qEgMV=yu(dOt{+eh+qUnGoeMD`e+fkQ2-z2v1PBnP zK8nU%%+7$11R3NFFwzyMyg@C9vre_W%{)p5>G+@mD$vuRIXgzzyz`empM@s==*?<_ zAyXgh-1&Cm^_!bo4&SuuYxt-}RTR4DR_AI@aoDh{b*JGihW}o1CpK|`Oia?wA@va6 
zig+RKhs#rE<~8XQ$r}L8TUkj!kA(`y?pxA0qiE?ZnctuusDjvgyS+Kq9D?uBIo{-& ztpxm6JNF_5Vp5O>8ZM-QI1a zFY{q&4#uNm0I4g|aRWL&43_C;AE92gOb906+s|~HaQU}9F8vwRrBxE9ix=hzd0uUFybLYHWE7M0eAUA@D&Op=w%#Z(y=xaGLsG`C^X=k{(IcStk z0}%jy&ZH-W$~{Wo8nUVxdl;xn$IOYB4Eg)#N~Y3>BI=n3$3Vd*D^31-N zwUMaL%$Rr+p>Pm$Wt@LuVend-K$8qj_uAytzCF zk;73pqKA?|f)G({-+0O#kin-fUw#{LwPbs<#m?G0E7=$-8=R14?ExH1**e8^W)nx- z*N0ahf{1GYN%{DI9=JMQ2L3KPp&Z5eksw>nhm?_i4mBxqBe9myB1`TNlI%yVV6gV* z_gQ`KRg8s~#p44d!HutuCA5DWflqAkVzK2Kz7HkX?p*IUec$TgUS8EptTOd_&c`Ll zPp>_OXwf|`Jwl-;eG!PTm`~AyqR`lg;z))p(Rmqr1P1c?HsDF%AJ>7`2-HkdawYj>yZ{KG(dLOax zJ9EJ8z5i%g7+v2v-qJU6?6a>$CYw_yb>F(x=i7&wA7@zAZFa?eVV?1W$VM&wU7xJ$ zUD_iwv*mD~086D_4#5*dg(0RwgozTDomb|6he#E>8B`npQKM0lW-|E8hY^2OfdFx0 zZa=`?(eJc5x6m!s!Mwm!nQ^}^QPw|fLi%dXp3=Novn~%jNWWdiZKxd?WDc+2I4-`@N9zB{)nv@^i|BAKs=u-(s6oXYL+PgWJBVXNf<6q+* z=w-$puVrVC>28DahAnK+g$*HXfnRldMI7KI;>Dj7S}r*ftEm+(^nRZ=0*3&vtU6K7 zZAFnd=u^ANf`p<%&(fAEn`Brkf3 zKs$QnXYci~2|cgp)alsq+4*^?i(WQcU7M(Vqw~=8_upPsI6X+iiT~@njTQ~PtFNf7 z!yLI)yY8#3x?8kr5o6JCn~LAfc{{t@J^kU=wT4qqZ(Z`lxZ~qo#V#|HUcWAAvSFjI zQ_a0`v($R_ynMg^wyj$aEU6trVjh<4e*;G^dXHguOZVisoSK_wTy$u1*lGXK$_388HqqwEC9C?0yhHW#-wbN1F9jq-G zkN)ZfE@&%O^kZe;gsP8ovrahIe6R6;@4USK&HdeT=Qe8Qv-a1B#LbufY!02S5_Q_u z#9gn`5Um6Ea)L}$3x77hyJ$#xi8Asnpbx!16}MAb;ON3JiAnE=2Ch&+^*oCp z&Ol`_niiZLS%Q8x$ylRz?+KGD_Otr}^#=O;iLRBWN4M=t$wW9to3q0_N*$*%UU})d zKi=xy4xaTcVid$m@!S$kW~qLD4ZW+nvfAnFYWsmcIy!eRoYUVB+Ou_e>iF?dPt;N! 
zTsA5Gc;2$AvLIGH9@&NCxrQfa4_)K-Y;M-NPRO5#yvEU|W{&#!McH`P@Sj~*ojQ5a_p{q9^|}MY!pfGq-B-4M z@MK*(b!X4FK4T``*3AG^oPkM(BqM@0aJE184&rPyH!tFUk@X(nT=#$5cw4lP5s9qG zNTelHvPVX?%!tsi%9fB>vQtJ@Hf4{DM2d`*qC!YUvJx`#obRstIG+Fie(vM=-S_XR z`{MikeBR^rI$!5F@A|lj*~7ywy!jT0M8tFiSCAf%qqC~^j8XS8){$oeHUgrC?s$8j zPUEvzjJ$hmo#|9(uEF2I3a*45a3dj6k(dd;Z8@2HF1>ZV@k78@xaZ_mRERvByVWAn zS&{^nqhxA?+aHQ!6fr@lWXZtc*Y??;TdqE159xx4YQ?pyQ0`dNxyO*WeR)P1%ouF@ zLrEwwNA*hHH8VB~>y_UzBS$Z{pIT0c;tsLk=MM;>(_^y)sa!6Ag=zPhOSg}X{5aEi zS8B&lVcR3a6Jd7t4%y>}gxJ+x?HwANmY-A9>GhOsJ95RB(U?wa!+gX8zNUfU){Qzk z>ea@jdDL8$HS3Zu$R*vaGN=4fEy%|(_FQy>yHQ6zr3wF!6GTZasTqCfQK~+2>}yqVqD4h6W!M#r!HArX`VAM9Otte{qn=j!Oj~VP~7ELtDM?KcMD5J?k=(2A7UJ^mNLb?>tI-Um7C6NhNMb)azetfyA@6iq2}up#kZB0b2uL|mFnIo zDavg3Q%)|}ex5;4+RI5Za&zzbiVdY#0{L!F5;3dY9; zb!)2`hOweojod%ON3e2bD5lQS&b}h%=}JFESDEnc^5xORL&3V)cq>~ss)Q2KeMY|490!Xu$^Go|6JuPLu~7O zpOBHgjSYXnk@*WIdlhXbg@!J7Qb|d%KHH%$7_t`^c+9ZAYU-$|C zo!r$Ua@x3NT>lMMg0a1WjDoArCJr@2hV3?{v6HKRdB)fKt<%n>yF; ziV*O9lCBhy@e1Lyz<&kD&`@X+kzq86#XMgf@#9_O+Vztkh?>aozyUSNf zKv3qTN&V*g$JCEOR(AcWvQEnb6U&%j@6$t@ughyn05QSIUJqakrSDoZqXjfnF98sf zs8Nf_Wa+~MY=P9~n4Q_qwQGm}jAiSMcMrb$`FZ}m{pk%~si+iV4r2bkSee3^qszWH zP+uZ*UH$;$kpI%$b4KF;4VjyNe#ua`x8}`6DuajD@FCYM^=-3I@XP{(prWUTB2}CO ze_X5orDxpxg0_wx?b~dSYIs8U3T1sjnjCvfaP{=YEz-w5q}DCZ?tBx=$Y^XeN5KHxAWd%!|ZR1A5YMAXDQ7{som8+7qXb zPhrlIi!4UIf;@3}WP?9p574IMN>5N#Rh6S)41l69N}7D8q81l}szW*Dq-#2hZ8-Ru zY1l=Q4;)CoQEj8M@W+y7WwLKIfhH=7TbgR@y|?fMMTQC95Kn!*=YM_;NN9Q2EtLV8 zLV%@-$)n1OqT5W3ANkm=FJKgVnsun?>*@(XSh ze^Pr}qNpX6O&{A@cQ>rg@>PPwaCgz`kZ<35LU#4nj=U`-K?xpire8{~3W=al3b5}1 zizd(UL$8s!-UTyXn!m$|96p~aipv=9?poX1J8xv%mB=^kOlR~0qVL?#Qv6t!F%7{=%X~Sc)l*|j)y*GdT(gLd@13~SHLElsi^8&TJGSPMT^}u zrYHW@mo>u6@-sWF345y}?WDu9vkrKvwX1)J{bReU1|4rO&KJK+Z*UH72<}e_4bq8< zWbp6^6P9q%aG&U{;uy3xWnbqUq;29-UnI zYup;Yv*hK%^b_0I`<%8G@Xg>{chs+S>gL*k;Y$aFY3R2Ylm;2i6pSzj2L_rVHieJ^ zfecD^T8?@F!!TaHflP%1aNdB;4FW+H|hsEiO>;Rcej$vTcB%B+{ z8=`8($VV5RE_hLRWnlC#Ang~hye0TzgvkhK32@Y(_E=Ad5YWTnH9m1Z^}uY}O#QCT 
zr>A*vTWCZ~9&zKvfK7gr=iwo+VsPA8N6K*ix?qnf3Wfo-L~eoE3ZdMB>t#fzgC-iBBoZ%-b5rr-%=&h#`ZI&Wi4E%; z-zbZ_$HmLkqu8Au9OluSTwak8ow1W;K6WfX`LuNw@AS&b4I0XyAAU|RN-6eUwOe(# zaU=g==Um-P2R6q+Fp3G>h5;M-jReDlx`gPhz)m2Z@EEC)R8IWGj%|w;=H}r1W!uPw zIg4WQg71C^7QIlH5ln;V-T+}^x(69}3!o-Kv&K^=dEuP}S0ykyY;SMpsaTcRPu>8? zBbbbnHvnMe1=cd)c_Q3}=ndc)Ts(&X8}MM>2i%5)#@Mok52*1)5pUaXrqP$^`xCvS z+nB%E&Dn~qE+4|S-|=RBRQ1d7c|*S*Er#k!DfXs) zheUjnpWJCFMDGGqLh#Xg=hK6mW3grO9v!^=ypr_9M|cjx5Dr{>o*S{r|S3=XNu<`*zNz*m4C z@Off_M1w<3JB`3H0u^QhAk@8o14Xv;o+b(9?-r>0{k@@5a5D8e-f{I5PUn21Opjt$V;zcmZa4 z8K~wUNxg|Xz7z9qz+4$i;gE8azNd&u6gqIXV|!w2bMqkh1vq(>nz9rlA3&S=6}nt- zg`LvO@Z8vORHM%)COhadNf^YJFL5|$?Jx$g9~Hzv>H^wW@Rg6YIjzDtDD}h8%)mN| zjT@g_@TZot`l1^-(#zIo)jcy>a@khES8Sg$^(xx4ww%1zd9a8iCY{(=@)%95O89Ri zuMLLvxX=}$72YN3er)giOFB^VNCj8hmqpyn9};W60k0obvs>dHw#|%&p$bYl^0(J-@S?%OZ=4P#&;6C9_cxrJ8gTzX|A#PK{g^YbHVF8%kpA2_34>825*5_W%D(*w$ih|W-qZ18}arSeR!X>F~6EG`ANjS zsOo_fK2sBun^jd3#CmIJC@DRiq;F%q1_amc*LM|kxT$VcxSgS=0XIT^`;T)ZksbNy zVAYWh3%sK#Ops%`Vi3iUKLhgG)F+#K_);5p`>S0h>{ARo5#I(DpgLa^ND^cWrM!<5 z8wXd)h=>Ay7lGWK=4#Lf1ilq3`E}O{~#DYi=@Rosjs+HfPtCJOtPVWd4P+>9yRH3 zooP$!?ZHMr$%Z<=Cl*`Imvh9uUNTG=x3&A}wQTqHFLoi@)f2$?Tus_oP5qT0IObo~ zOmi(uF?fqfFbFQ)fNL;x0S#;o1opt~0Zl&!d6u}WV)TmN+6eSCC_i5axG!I!2k#O8 z-)SwI8o~?;RI#XIoYS7D)QHJ6F8J*B!F=U{LYdb;N8e4=ZIN2@y7ttpr{`=~fac@0 z!2VhP?STorM~}W0Q`2Vh_J3Pbv#>w=GvM=mkW++&Z0!jd530qG6=-1uss`e~2BH%s zqOXDZreQ<}Q5WN-erSNIZrAEL*QW;0 z*^1R@quwL0Q%Fh!RzH3P-fI_tIJlA%4k|Vk!#in`Nl9{+QIBD6_sRc~L*Ynl~S%Qh-Y zO;uHCWU|Y6fq8Mznd+z0?%mVp@>xSz_J7c+_eshbG)A=xrK#bjnq*#AZ(=+><2=cXPxbgPa0klCy**o-|5i`q-M~r2nLZId)wQ{|S8CAgLyp@dxVqE7 z1z2TEuOy!=zSxp}uG3Qkbroi6q<=6qCHd^&S7C6m8-ld;coiu87l`mM@rC9r_>WTY1#&yo$4;2k4pL-p-I`$3P#}W2BO!z0F=$)KiU5lx@=elfgZYNu z9-25}pZVQyXXC0*Qc^!>N;cR>JH{K_dr^}R19tLB&`LpD zA{<9O>GLzpzo2X&5C~>pIEzUn0`aO2r03`EY5e_<)1;m@OUp<3U9LXdF;tV7^@ z**ygApip_}UtP*2_a0CKfB>zGR~CBV_x8rUeR`q4@5677*-0VIEP5(auMAC&&tpTs z9i>d7qvsbj`F)_lBw3`ymI_87P#xehn0dMTDs+-FU2}eT**H`Yy0s4FP5e(iAfBM@ 
zpDU5b!NpC8SKv7NgH(qcO)|Cv1C&2`JTfls4W6nMTG>pEXP|!9kul_>R1k2n=+Fyc z*{Fka7Vl$=DL0(7czH(xH^i)|tCN18EJ~IUD4fN0&GOs{+i-EgK!RVS_YtlIPy$sM zO<}WmCQ}pqMZ(R*EY-_vRiq~rH9jhe)Zl9pTT)X4C|*zXC$nkh-p=4vpEHi%(>5mS z=vdUcunEfvevD?9FRP$G0tX50fyS^q!$lHm0s&w)3wl2sA5hg`v3$oxLee@AzNndD zmLaW9++9G0Yn|9%qy^UCrAwE{GXN~}!HppUi8NKWB8ut3$1(9hMMT%3yw+BT4)hT_k@`=$&17uR!~xU0T~R4ui#87hFC>IzV-q? zS4qGy`+!w3hr@%YjY`p*N9XG{_*Z+5^7F#h28a+&8jL~ekZ=Lv3DJV!{y`8uK9>XN zUYmBXhA8hFM6wiqd*qLt{;}IALiqTCu5tF1lNdg9t>_>_?Bovd+A*MrP=Hi20WSv9^C z5f7rU`A=3|3*|7P3=k=XJ(tb!Jj;yxIoKOR5ZDkoEUL>xfI&!<#Q6g}P#XXjF#%GG zb{BpB{?3<(@n|K6BVeQjBVz^&5))KE*wYdtJV>4iqZVSO$2gS9vcV#HH)l6gGI1v9 z=C4&xNtGHCKw<{rx@T~z|lhb zLx@-rGZZSKmD>8B=>)n&ow-qG>jUSht1=Z7+S7umRkL3HbZ*-KvI#LLB3uK&k2}iG zx1ydTNozQ@@Uwq`6eA2nIz+mVaXNU1cO8va2v$p+`H^%s3T-|iR{&1~M*XWpNoe%t zI(bc2a2yF~RLnpIRt~nrB@Qo#EENpYlTaHF{b5sd5r0&&*ic%Gv{zhc=>1=1mmdb@ z*GMY2C49uqHQlg5lPkdxUyaze0YZE>-|Gf#RR|hUh`fjy?OFmO0U0F@Da5~<Ya zfLOr{HG#|nXV{vZA<}CsCRlduz}j`?@V*rn*T|@WIpi?hC}1FLn0{{``*bRtZr37F z9^AjPsPG86bM4x-P+`G>o+RENCV=P_DR=IRbU99JouMUWWQ=K&q*ui-j&+mQs-U7{ zPR^qTy%!RXW0FC#kWfmIkQA`5$t!$|Tr3EQpawDr=LdgI0X+i#4=7oeV40D2ZkPNb zteP=4Fo#7t+7(h*py&s$FK^s1a)A+aXTnOundn&3gdL!t8siLK2$mDqLxA6OV2MKH zma&^sLks9*$Sc?1(v;UAC&XyO*}3-HYmr`0gH{NVg^p*pu|ony06uiSL`aT-&V6y0 zg0t6NgZP628b|9YH8vr7jb*hvx3ccaa_R>Z6zn~y=FYe{NB^VO{T#2!jd5Mgl~S&6 z_6y&3s{hpLwm#J#sarX7C%{8E=SsQ<1i{CRTHzCe5 zA?YrY66}IP>c!}vlF0QC`uQE_4=V1C#mm*HVWh) zJ~3caPF{!z=Xj9CMy%$EGGXT1>rRc^22ul_FsdhnVYKX-g(tz=mvgC)Ppzwo1=7+gpF{=|7`}E${cpqagdZu?$NdrnTCQH}V7i}(_Td`bZ#qvcz6#+yB z^8s)?5jeR2EK3-Wb7IEUS;F>Pfp-Ajh0HkO zFz4{gV}vb69OyT|*F9Dls66KUgj!XD*JN>aAnYs~o9;7RO}SU*ohNDerrp71Z^KR@;b`cb zG4IrwJD?!`LOJn3q~fYUd}0LWMCtlX_23%YAlyLlsorl|+BT}vt5CGQ%1b!Dp66`9 zUKi(M&t?=YCVK`EQgj@7*l1}6&~`UJC=p7D#Q8BuKD*vDa7RdA!U&c$SBi&y+n zT;rn5Di)JAnewWPN5_+!JM^A83*+Pgfe(Zmy(cE#T*vlKzoABfh*Fu~?dTfz2aZ#Z zu4z_1`)c>;g^IGYYM`&Q6Uj}$^rZvg9q97;V0Ja6cy>F0FMXZ|N9dz0{LYuJYkA18 zTRi55qm)xCOhLRbqw|SZjoP2$BD%DiU*AtBN+!2th`ri~Q2@y-B{O&g+CIn9B7P+x 
z3kVE6;iC)Up-uK+$k^odk|+JNPjXFqYN{G~1w?CQoYTn7@~P(LO86fKBF*l~)6QkQ z4NNEIjD!Mx>kESrc!?iJjJk-$CeX5>*h!x74EK%>m_l#f&M+uOwMV;{d2XTw8d8^{ zE0-|S^yMi;MN)Km#vEjylGejTs60Cm0o=XuF~^QSO~;74#OVe`v{}* z=a8bvq6*`-)Jm2rvpe(UYUTARotETuil3oSQso#9i^ac6JWkWt!3T2 z+#A5;;ejAScgC!N+%h1J)E-%Q+TU=o)B`$0nR9Nidb1afT+px~6JP|dnbg0i&9kJN zbu850m=?qR6c|40wzRI`7(oN=Z}YgfYnsGzgoUS!E0HF+VaFxe+x3TVpWWblMtM@4 zWuGcT%B2sv2mu~xKOGVke$P2N5GE5uI)yz&+-XrrYm%)XcmMvWh<#?7jrD=;nGyHr zmnJHo&W;W~=SeJju1mZAp7ZVF9?T`S-ckI)cMCFu;ys~yKm>FE)WIn5r7(EHjfIpV z){`e|D6B`{=NUdk!G$RW$(=yTOj5z)fEU*0=JjN?L21kSqXmM#H1vTf{h0j_f&tX! zkhMz@+>*uzk{Eo#229joK}JH}!IX>^Gc!y-JMCQ!A2`P<<7iD-yb@-_w{O=Vtc4Y9 z5a2`XpBK=r=V~jVoFcUfJNsH_2QWF*+jsklqGHgM>-sWD8qe$e4qki2tAg2=6a{_R z7TTzr=yNc7!Wf%y0#PF)4B_qQ9>9);VE=->0@Wt!z46$0R2c4&;A+6>uxr;~_@PJG zuJGN%kW>M& zvo=hx%7J3t`u-O?lCm18I;{_*14O|E(PKF-OMIp@qSc+~*^TPgu-H;zxCrGuA%O#J zfM=QR(7sceTr#(xzcv?%i;stnPZIRVbo@JLV~OK13Tl*!>A)i@eyesYxUDQY;qYrj zMM-psKzfIO9=r_{!xa_^mbKu% zpLz^yGPHh7Z-B9x5Jn$F8v$Gh*|ZBVUtH*0{Y!=lXissSIN)OgS|uz}R8JY>hMxcK zC!863S>}M{qemxYAuo{zprjmjLuihCGeVxmjR z*>Q&h(;>tg8Kjw7wErZm7$Xq`sU`w3M9`KB+%w?9f{QU8{op~H4+2J>e=NpM4n2B; z+5Dj4U8?oyylZg6PA}bDYi8ck6xDggPc_*V#RQSUVOm16z`-d*y-ZHcZ+?tMCO|*X z7Lpzm?kt3#gM&X6(j1vZhpa6q!m$cSeg>L0{L)k~RSDzAgg`S#L}z-uOym@eLtq>Dn0MAljlY=V`hl<2r|f z>+I(N$Sb956T3VpF`pI|a>CsS_4EiwBN|l#Z@|N+QeP~GMAVQ#`I9TNJ5lSB-3K?= zw*a$;E5v{3?E^OemkzNI9kGo<6^!XVVP^o$0QwFJy$`gLpidKfDc~Gh_g{d&Pp7tJ zx;R^Ir5D&-Sn!;HFd~q0+5rHT0YSbZ=r$Bu|GmOb@3QT=57=*|Q za7lT`o)4*9Gy=v?ldbx!wFv14>lIX|srTIP(U_ALj|IgS)t|RD=>7xD@Cfw_OMyfY z5*uvHlN^uMkUVB|h+t@tDL)xT;>CnkJ%uW-rwv~M0O=!aMkKrKIdkR=cBt^-!<|g0 zZ{NO6#MQX;qZi6yf&%aWyVA)+gHcH8RBhEq%tk~-4mxB>CzFQ9X9U-|tJk9=@^{0;n-kD(BR(uz;3 zR(s$z05<vZYFy{$Z;e7=#fYQP6dW%?(gbSfg?Y2yDfz1JGz~(rW+c=xc}rFdam> zfjx0;F~9cZ8B`Itt1(PbA3tF?pT|oX{hH_lfD8!(+2ot<33GPG%mvQ?!E#hRS8&kc zCE`4DnEHAe4Hm&vfHC1re};=g%%I`<^R;jaHZ-XB0izQo#Nf}LhIySK6x%nOHd3($ zOtoLAhN>KN6=E0)d>wPd}{BP2yMS>3_c;PI(otlBsEr-v(5? 
z9@-y3je@k95Y4{;p%z-yK^*P4DI90sg!f(mbf=n?x!r{9DC+}) z(DlDwYEN}V^xa>*V0-PpRPD1K3IAd=~C6R(&loUmEZf=AqgKQM2bvaI#fDl1?3F?9bGs*c{VulD|N4#_KMe%*4mw|Jc|0jt zA*B_hHLt@b!YUwHdmwWF9V57;Vf&E6zaIyev~o{ldMe=(9KTI$dIZM>WEcw?0aR6k zB&q<&)>c;WGG%@-4|jK{?{)pCtfRRRq^6wkQzqy0vuEP)Rw9veO5>`Bk(PxxU!47D zRFo{Ze!z$lY_@fORlHo?+(<+REDZm)&x